Change done field to integers for handling truncation vs termination #2608

Closed · wants to merge 9 commits
11 changes: 8 additions & 3 deletions gym/core.py
@@ -49,6 +49,11 @@ class Env(Generic[ObsType, ActType]):
# Created
_np_random: RandomNumberGenerator | None = None

# Done type enumeration
NOT_DONE = 0  # the episode is not done yet
TERMINATED = DONE = 1  # the episode has terminated; no further step() calls should be made
TRUNCATED = 2  # the episode was truncated by a wrapper or step counting; no further step() calls should be made

@property
def np_random(self) -> RandomNumberGenerator:
"""Initializes the np_random field if not done already."""
@@ -61,7 +66,7 @@ def np_random(self, value: RandomNumberGenerator):
self._np_random = value

@abstractmethod
def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(self, action: ActType) -> Tuple[ObsType, float, int, dict]:
"""Run one timestep of the environment's dynamics. When end of
episode is reached, you are responsible for calling `reset()`
to reset this environment's state.
@@ -74,7 +79,7 @@ def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
Returns:
observation (object): agent's observation of the current environment
reward (float) : amount of reward returned after previous action
done (bool): whether the episode has ended, in which case further step() calls will return undefined results
done (int): NOT_DONE while the episode is ongoing, or TERMINATED/TRUNCATED once it has ended, in which case further step() calls will return undefined results
info (dict): contains auxiliary diagnostic information (helpful for debugging, logging, and sometimes learning)
"""
raise NotImplementedError
@@ -276,7 +281,7 @@ def metadata(self) -> dict:
def metadata(self, value):
self._metadata = value

def step(self, action: ActType) -> Tuple[ObsType, float, bool, dict]:
def step(self, action: ActType) -> Tuple[ObsType, float, int, dict]:
return self.env.step(action)

def reset(self, **kwargs) -> Union[ObsType, tuple[ObsType, dict]]:
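For downstream code, the practical payoff of a three-valued `done` is correct bootstrapping: a value target should stop at a true terminal state but bootstrap past a truncation, because the episode could have continued. Below is a minimal training-loop sketch against the proposed constants; `gamma` and the lambda placeholders stand in for a real agent and are not part of this PR:

import gym

env = gym.make("CartPole-v1")
gamma = 0.99
policy = lambda obs: env.action_space.sample()  # placeholder action selection
value = lambda obs: 0.0                         # placeholder value estimate

obs = env.reset()
for _ in range(1000):
    action = policy(obs)
    next_obs, reward, done, info = env.step(action)
    if done == env.TERMINATED:
        target = reward  # true terminal state: do not bootstrap
    else:
        # NOT_DONE or TRUNCATED: the episode could have continued,
        # so bootstrap from the next observation's value
        target = reward + gamma * value(next_obs)
    # ... feed (obs, action, target) to a learner here ...
    obs = env.reset() if done != env.NOT_DONE else next_obs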
6 changes: 3 additions & 3 deletions gym/envs/box2d/bipedal_walker.py
@@ -527,12 +527,12 @@ def step(self, action):
reward -= 0.00035 * MOTORS_TORQUE * np.clip(np.abs(a), 0, 1)
# normalized to about -50.0 using heuristic, more optimal agent should spend less

done = False
done = self.NOT_DONE
if self.game_over or pos[0] < 0:
reward = -100
done = True
done = self.TERMINATED
if pos[0] > (TERRAIN_LENGTH - TERRAIN_GRASS) * TERRAIN_STEP:
done = True
done = self.TERMINATED
return np.array(state, dtype=np.float32), reward, done, {}

def render(self, mode="human"):
6 changes: 3 additions & 3 deletions gym/envs/box2d/car_racing.py
@@ -421,7 +421,7 @@ def step(self, action):
self.state = self.render("state_pixels")

step_reward = 0
done = False
done = self.NOT_DONE
if action is not None: # First step without action, called from reset()
self.reward -= 0.1
# We actually don't want to count fuel spent, we want the car to be faster.
@@ -430,10 +430,10 @@ def step(self, action):
step_reward = self.reward - self.prev_reward
self.prev_reward = self.reward
if self.tile_visited_count == len(self.track) or self.new_lap:
done = True
done = self.TERMINATED
x, y = self.car.hull.position
if abs(x) > PLAYFIELD or abs(y) > PLAYFIELD:
done = True
done = self.TERMINATED
step_reward = -100

return self.state, step_reward, done, {}
6 changes: 3 additions & 3 deletions gym/envs/box2d/lunar_lander.py
@@ -435,12 +435,12 @@ def step(self, action):
) # less fuel spent is better, about -30 for heuristic landing
reward -= s_power * 0.03

done = False
done = self.NOT_DONE
if self.game_over or abs(state[0]) >= 1.0:
done = True
done = self.TERMINATED
reward = -100
if not self.lander.awake:
done = True
done = self.TERMINATED
reward = +100
return np.array(state, dtype=np.float32), reward, done, {}

7 changes: 4 additions & 3 deletions gym/envs/classic_control/acrobot.py
@@ -208,9 +208,10 @@ def step(self, a):
ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
self.state = ns
terminal = self._terminal()
reward = -1.0 if not terminal else 0.0
return (self._get_ob(), reward, terminal, {})
terminated = self._terminal()
done = self.TERMINATED if terminated else self.NOT_DONE
reward = -1.0 if not terminated else 0.0
return (self._get_ob(), reward, done, {})

def _get_ob(self):
s = self.state
5 changes: 3 additions & 2 deletions gym/envs/classic_control/cartpole.py
@@ -145,14 +145,15 @@ def step(self, action):

self.state = (x, x_dot, theta, theta_dot)

done = bool(
terminated = bool(
x < -self.x_threshold
or x > self.x_threshold
or theta < -self.theta_threshold_radians
or theta > self.theta_threshold_radians
)
done = self.TERMINATED if terminated else self.NOT_DONE

if not done:
if not terminated:
reward = 1.0
elif self.steps_beyond_done is None:
# Pole just fell!
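A detail worth noting in this and the other classic-control changes: because NOT_DONE is 0 (falsy), both terminal codes are nonzero (truthy), and TERMINATED aliases the old DONE, legacy truthiness checks keep their behavior. A quick illustration using the values proposed in gym/core.py:

for done in (0, 1, 2):  # NOT_DONE, TERMINATED, TRUNCATED
    print(done, bool(done))
# 0 False, 1 True, 2 True -- so old-style `if done: obs = env.reset()` logic is unaffected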
7 changes: 5 additions & 2 deletions gym/envs/classic_control/continuous_mountain_car.py
@@ -147,10 +147,13 @@ def step(self, action):
velocity = 0

# Convert a possible numpy bool to a Python bool.
done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)
done = self.TERMINATED if terminated else self.NOT_DONE

reward = 0
if done:
if terminated:
reward = 100.0
reward -= math.pow(action[0], 2) * 0.1

5 changes: 4 additions & 1 deletion gym/envs/classic_control/mountain_car.py
@@ -127,7 +127,10 @@ def step(self, action):
if position == self.min_position and velocity < 0:
velocity = 0

done = bool(position >= self.goal_position and velocity >= self.goal_velocity)
terminated = bool(
position >= self.goal_position and velocity >= self.goal_velocity
)
done = self.TERMINATED if terminated else self.NOT_DONE
reward = -1.0

self.state = (position, velocity)
2 changes: 1 addition & 1 deletion gym/envs/classic_control/pendulum.py
@@ -120,7 +120,7 @@ def step(self, u):
newth = th + newthdot * dt

self.state = np.array([newth, newthdot])
return self._get_obs(), -costs, False, {}
return self._get_obs(), -costs, self.NOT_DONE, {}

def reset(
self,
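Pendulum illustrates why TRUNCATED exists at all: its step() now always returns NOT_DONE, so every episode end must come from step counting outside the environment. The TimeLimit wrapper is not shown in this diff, so the following is only a hypothetical sketch of how a step-counting wrapper could report truncation under the new constants:

import gym

class TimeLimitTruncation(gym.Wrapper):
    """Hypothetical wrapper: reports a step-count limit as TRUNCATED."""

    def __init__(self, env, max_episode_steps):
        super().__init__(env)
        self._max_episode_steps = max_episode_steps
        self._elapsed_steps = 0

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self._elapsed_steps += 1
        # Only truncate if the wrapped env did not already terminate on its own.
        if done == self.NOT_DONE and self._elapsed_steps >= self._max_episode_steps:
            done = self.TRUNCATED
        return obs, reward, done, info

    def reset(self, **kwargs):
        self._elapsed_steps = 0
        return self.env.reset(**kwargs)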
2 changes: 1 addition & 1 deletion gym/envs/mujoco/ant.py
@@ -21,7 +21,7 @@ def step(self, a):
reward = forward_reward - ctrl_cost - contact_cost + survive_reward
state = self.state_vector()
notdone = np.isfinite(state).all() and state[2] >= 0.2 and state[2] <= 1.0
done = not notdone
done = self.NOT_DONE if notdone else self.TERMINATED
ob = self._get_obs()
return (
ob,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/ant_v3.py
@@ -256,7 +256,7 @@ def step(self, action):
costs = ctrl_cost + contact_cost

reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
observation = self._get_obs()
info = {
"reward_forward": forward_reward,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/half_cheetah.py
@@ -16,7 +16,7 @@ def step(self, action):
reward_ctrl = -0.1 * np.square(action).sum()
reward_run = (xposafter - xposbefore) / self.dt
reward = reward_ctrl + reward_run
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl)

def _get_obs(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/half_cheetah_v3.py
@@ -167,7 +167,7 @@ def step(self, action):

observation = self._get_obs()
reward = forward_reward - ctrl_cost
done = False
done = self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/hopper.py
@@ -17,12 +17,13 @@ def step(self, a):
reward += alive_bonus
reward -= 1e-3 * np.square(a).sum()
s = self.state_vector()
done = not (
terminated = not (
np.isfinite(s).all()
and (np.abs(s[2:]) < 100).all()
and (height > 0.7)
and (abs(ang) < 0.2)
)
done = self.TERMINATED if terminated else self.NOT_DONE
ob = self._get_obs()
return ob, reward, done, {}

2 changes: 1 addition & 1 deletion gym/envs/mujoco/hopper_v3.py
@@ -231,7 +231,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/humanoid.py
@@ -39,7 +39,8 @@ def step(self, a):
quad_impact_cost = min(quad_impact_cost, 10)
reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
qpos = self.sim.data.qpos
done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
terminated = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
done = self.TERMINATED if terminated else self.NOT_DONE
return (
self._get_obs(),
reward,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/humanoid_v3.py
@@ -315,7 +315,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"reward_linvel": forward_reward,
"reward_quadctrl": -ctrl_cost,
2 changes: 1 addition & 1 deletion gym/envs/mujoco/humanoidstandup.py
@@ -202,7 +202,7 @@ def step(self, a):
quad_impact_cost = min(quad_impact_cost, 10)
reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1

done = bool(False)
done = self.NOT_DONE
return (
self._get_obs(),
reward,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/inverted_double_pendulum.py
@@ -122,7 +122,8 @@ def step(self, action):
vel_penalty = 1e-3 * v1 ** 2 + 5e-3 * v2 ** 2
alive_bonus = 10
r = alive_bonus - dist_penalty - vel_penalty
done = bool(y <= 1)
terminated = bool(y <= 1)
done = self.TERMINATED if terminated else self.NOT_DONE
return ob, r, done, {}

def _get_obs(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/inverted_pendulum.py
@@ -89,7 +89,7 @@ def step(self, a):
self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= 0.2)
done = not notdone
done = self.NOT_DONE if notdone else self.TERMINATED
return ob, reward, done, {}

def reset_model(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/pusher.py
@@ -143,7 +143,7 @@ def step(self, a):

self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

def viewer_setup(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/reacher.py
@@ -127,7 +127,7 @@ def step(self, a):
reward = reward_dist + reward_ctrl
self.do_simulation(a, self.frame_skip)
ob = self._get_obs()
done = False
done = self.NOT_DONE
return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)

def viewer_setup(self):
2 changes: 1 addition & 1 deletion gym/envs/mujoco/swimmer_v3.py
@@ -161,7 +161,7 @@ def step(self, action):

observation = self._get_obs()
reward = forward_reward - ctrl_cost
done = False
done = self.NOT_DONE
info = {
"reward_fwd": forward_reward,
"reward_ctrl": -ctrl_cost,
3 changes: 2 additions & 1 deletion gym/envs/mujoco/walker2d.py
@@ -16,7 +16,8 @@ def step(self, a):
reward = (posafter - posbefore) / self.dt
reward += alive_bonus
reward -= 1e-3 * np.square(a).sum()
done = not (height > 0.8 and height < 2.0 and ang > -1.0 and ang < 1.0)
terminated = not (height > 0.8 and height < 2.0 and ang > -1.0 and ang < 1.0)
done = self.TERMINATED if terminated else self.NOT_DONE
ob = self._get_obs()
return ob, reward, done, {}

2 changes: 1 addition & 1 deletion gym/envs/mujoco/walker2d_v3.py
@@ -229,7 +229,7 @@ def step(self, action):

observation = self._get_obs()
reward = rewards - costs
done = self.done
done = self.TERMINATED if self.done else self.NOT_DONE
info = {
"x_position": x_position_after,
"x_velocity": x_velocity,
7 changes: 4 additions & 3 deletions gym/envs/toy_text/blackjack.py
@@ -125,13 +125,13 @@ def step(self, action):
if action: # hit: add a card to the player's hand and return
self.player.append(draw_card(self.np_random))
if is_bust(self.player):
done = True
done = self.TERMINATED
reward = -1.0
else:
done = False
done = self.NOT_DONE
reward = 0.0
else: # stick: play out the dealer's hand, and score
done = True
done = self.TERMINATED
while sum_hand(self.dealer) < 17:
self.dealer.append(draw_card(self.np_random))
reward = cmp(score(self.player), score(self.dealer))
@@ -275,4 +275,5 @@ def scale_card_img(card_img):
np.array(pygame.surfarray.pixels3d(self.screen)), axes=(1, 0, 2)
)


# Pixel art from Mariia Khmelnytska (https://www.123rf.com/photo_104453049_stock-vector-pixel-art-playing-cards-standart-deck-vector-set.html)
3 changes: 2 additions & 1 deletion gym/envs/toy_text/cliffwalking.py
@@ -123,7 +123,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
3 changes: 2 additions & 1 deletion gym/envs/toy_text/frozen_lake.py
@@ -216,7 +216,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
3 changes: 2 additions & 1 deletion gym/envs/toy_text/taxi.py
@@ -197,7 +197,8 @@ def step(self, a):
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
done = self.TERMINATED if d else self.NOT_DONE
return (int(s), r, done, {"prob": p})

def reset(
self,
9 changes: 8 additions & 1 deletion gym/utils/env_checker.py
@@ -214,7 +214,14 @@ def _check_returned_values(
assert isinstance(
reward, (float, int, np.float32)
), "The reward returned by `step()` must be a float"
assert isinstance(done, bool), "The `done` signal must be a boolean"
assert isinstance(done, int) and not isinstance(
done, bool
), "The `done` signal must be an integer"
assert done in (
env.NOT_DONE,
env.TERMINATED,
env.TRUNCATED,
), f"The `done` signal must be in {(env.NOT_DONE, env.TERMINATED, env.TRUNCATED)}"
assert isinstance(
info, dict
), "The `info` returned by `step()` must be a python dictionary"
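With these assertions in place, a plain boolean `done` is rejected outright, which should flush out environments that have not migrated. A sketch of how the stricter check surfaces, assuming `check_env` is the public entry point in this module that calls `_check_returned_values`:

import gym
from gym.utils.env_checker import check_env

env = gym.make("CartPole-v1")
check_env(env)  # passes: step() returns NOT_DONE / TERMINATED / TRUNCATED integers

# An unmigrated env whose step() still returns `done=True` would fail with:
# AssertionError: The `done` signal must be an integer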