
Commit 29bc824

[RLlib] Fix failing env step in MultiAgentEnvRunner. (#55567)
## Why are these changes needed?

Fix failing release test: `learning_tests_multi_agent_cartpole_appo_multi_gpu`.

## Related issue number

<!-- For example: "Closes #1234" -->

## Checks

- [x] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [x] Unit tests
  - [x] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: Kamil Kaczmarek <[email protected]>
Signed-off-by: Kamil Kaczmarek <[email protected]>
1 parent 9539786 commit 29bc824


6 files changed: +115, -99 lines


rllib/env/env_runner.py

Lines changed: 6 additions & 2 deletions
@@ -228,17 +228,21 @@ def _try_env_reset(
            raise e

    def _try_env_step(self, actions):
-       """Tries stepping the env and - if an error orrurs - handles it gracefully."""
+       """Tries stepping the env and - if an error occurs - handles it gracefully."""
        try:
            with self.metrics.log_time(ENV_STEP_TIMER):
                results = self.env.step(actions)
            return results
        except Exception as e:
            self.metrics.log_value(NUM_ENV_STEP_FAILURES_LIFETIME, 1, reduce="sum")

+           # @OldAPIStack (config.restart_failed_sub_environments)
            if self.config.restart_failed_sub_environments:
                if not isinstance(e, StepFailedRecreateEnvError):
-                   logger.exception("Stepping the env resulted in an error!")
+                   logger.exception(
+                       "Stepping the env resulted in an error! The original error "
+                       f"is: {e}"
+                   )
                # Recreate the env.
                self.make_env()
                # And return that the stepping failed. The caller will then handle
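The hunk above adds the original exception to the log message and, under the old-API-stack `restart_failed_sub_environments` option, recreates the env before signaling the failure to the caller. Below is a minimal, self-contained sketch of that flow; the `ToyRunner` class, its `env_factory` argument, and the string sentinel are illustrative assumptions, not RLlib code (the real `EnvRunner` also records the `NUM_ENV_STEP_FAILURES_LIFETIME` metric and special-cases `StepFailedRecreateEnvError`).

import logging

logger = logging.getLogger(__name__)

# Sentinel returned to the caller when stepping fails; the name mirrors the diff.
ENV_STEP_FAILURE = "env_step_failure"


class ToyRunner:
    """Stand-in runner illustrating the try-step / recreate-on-error pattern."""

    def __init__(self, env_factory, restart_failed_sub_environments=True):
        self._env_factory = env_factory
        self.restart_failed_sub_environments = restart_failed_sub_environments
        self.env = env_factory()

    def make_env(self):
        # Rebuild the (possibly crashed) environment from scratch.
        self.env = self._env_factory()

    def _try_env_step(self, actions):
        """Tries stepping the env and, on error, handles the failure gracefully."""
        try:
            return self.env.step(actions)
        except Exception as e:
            if self.restart_failed_sub_environments:
                # Include the original error in the log, as the patch does.
                logger.exception(
                    f"Stepping the env resulted in an error! The original error is: {e}"
                )
                # Recreate the env and signal the failure; the caller can then
                # force-reset and retry sampling.
                self.make_env()
                return ENV_STEP_FAILURE
            raise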

rllib/env/multi_agent_env.py

Lines changed: 1 addition & 1 deletion
@@ -444,7 +444,7 @@ def step(self, action_dict):
        # an additional episode_done bool that covers cases where all agents are
        # either terminated or truncated, but not all are truncated and not all are
        # terminated. We can then get rid of the aweful `__all__` special keys!
-       terminated["__all__"] = len(self.terminateds) + len(self.truncateds) == len(
+       terminated["__all__"] = len(self.terminateds | self.truncateds) == len(
            self.envs
        )
        truncated["__all__"] = len(self.truncateds) == len(self.envs)

rllib/env/multi_agent_env_runner.py

Lines changed: 17 additions & 12 deletions
@@ -158,9 +158,9 @@ def sample(

        Args:
            num_timesteps: The number of timesteps to sample during this call.
-               Note that only one of `num_timetseps` or `num_episodes` may be provided.
+               Note that only one of `num_timesteps` or `num_episodes` may be provided.
            num_episodes: The number of episodes to sample during this call.
-               Note that only one of `num_timetseps` or `num_episodes` may be provided.
+               Note that only one of `num_timesteps` or `num_episodes` may be provided.
            explore: If True, will use the RLModule's `forward_exploration()`
                method to compute actions. If False, will use the RLModule's
                `forward_inference()` method. If None (default), will use the `explore`
@@ -183,8 +183,11 @@ def sample(
                f"{self} doesn't have an env! Can't call `sample()` on it."
            )

-       assert not (num_timesteps is not None and num_episodes is not None)
-
+       assert not (num_timesteps is not None and num_episodes is not None), (
+           "Provide "
+           "either `num_timesteps` or `num_episodes`. Both provided here:"
+           f"{num_timesteps=}, {num_episodes=}"
+       )
        # Log time between `sample()` requests.
        if self._time_after_sampling is not None:
            self.metrics.log_value(
@@ -214,23 +217,22 @@ def sample(
                * self.num_envs
            )

-       # Sample n timesteps.
+       # Sample "num_timesteps" timesteps.
        if num_timesteps is not None:
            samples = self._sample(
                num_timesteps=num_timesteps,
                explore=explore,
                random_actions=random_actions,
                force_reset=force_reset,
            )
-       # Sample m episodes.
+       # Sample "num_episodes" episodes.
        elif num_episodes is not None:
            samples = self._sample(
                num_episodes=num_episodes,
                explore=explore,
                random_actions=random_actions,
            )
-       # For complete episodes mode, sample as long as the number of timesteps
-       # done is smaller than the `train_batch_size`.
+       # For batch_mode="complete_episodes" (env_runners configuration), continue sampling as long as the number of timesteps done is smaller than the `train_batch_size`.
        else:
            samples = self._sample(
                num_episodes=self.num_envs,
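The improved assertion message and the renamed comments make the three sampling modes explicit: at most one of `num_timesteps` or `num_episodes` may be set, and if neither is set the runner samples complete episodes until the train batch size is reached. A hedged sketch of that dispatch logic follows; the function body is a toy stand-in, not the actual `_sample()` implementation.

def sample(num_timesteps=None, num_episodes=None):
    # Both arguments set at once is a caller error; mirror the diff's message.
    assert not (num_timesteps is not None and num_episodes is not None), (
        "Provide either `num_timesteps` or `num_episodes`. Both provided here: "
        f"{num_timesteps=}, {num_episodes=}"
    )
    if num_timesteps is not None:
        return f"sample {num_timesteps} timesteps"       # num_timesteps mode
    elif num_episodes is not None:
        return f"sample {num_episodes} episodes"         # num_episodes mode
    else:
        return "sample complete episodes until train_batch_size is reached"


print(sample(num_timesteps=200))
print(sample(num_episodes=4))
print(sample())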
@@ -346,21 +348,24 @@ def _sample(
                    metrics_prefix_key=(MODULE_TO_ENV_CONNECTOR,),
                )
            # In case all environments had been terminated `to_module` will be
-           # empty and no actions are needed b/c we reset all environemnts.
+           # empty and no actions are needed b/c we reset all environments.
            else:
                to_env = {}
                shared_data["vector_env_episodes_map"] = {}

            # Extract the (vectorized) actions (to be sent to the env) from the
            # module/connector output. Note that these actions are fully ready (e.g.
-           # already unsquashed/clipped) to be sent to the environment) and might not
+           # already unsquashed/clipped) to be sent to the environment and might not
            # be identical to the actions produced by the RLModule/distribution, which
            # are the ones stored permanently in the episode objects.
            actions = to_env.pop(Columns.ACTIONS, [{} for _ in episodes])
            actions_for_env = to_env.pop(Columns.ACTIONS_FOR_ENV, actions)
            # Try stepping the environment.
            results = self._try_env_step(actions_for_env)
            if results == ENV_STEP_FAILURE:
+               logging.warning(
+                   f"RLlib {self.__class__.__name__}: Environment step failed. Will force reset env(s) in this EnvRunner."
+               )
                return self._sample(
                    num_timesteps=num_timesteps,
                    num_episodes=num_episodes,
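The new `logging.warning(...)` makes the previously silent retry visible: when `_try_env_step()` returns the `ENV_STEP_FAILURE` sentinel, the runner logs a warning and re-enters `_sample()`, force-resetting the env(s) per the warning text. A rough sketch of that control flow; `toy_sample` and `try_step` are illustrative stand-ins, and the exact retry arguments continue past the hunk boundary, so they are assumed here.

import logging
import random

ENV_STEP_FAILURE = "env_step_failure"  # sentinel, as in the diff


def try_step():
    # Toy step that fails half of the time.
    return ENV_STEP_FAILURE if random.random() < 0.5 else {"obs": 0}


def toy_sample(force_reset=False, _depth=0):
    results = try_step()
    if results == ENV_STEP_FAILURE and _depth < 3:
        logging.warning(
            "Environment step failed. Will force reset env(s) in this EnvRunner."
        )
        # Retry sampling after recreating/resetting the env(s). The _depth guard
        # only exists in this toy to avoid unbounded recursion.
        return toy_sample(force_reset=True, _depth=_depth + 1)
    return results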
@@ -372,7 +377,7 @@ def _sample(

            call_on_episode_start = set()
            # Store the data from the last environment step into the
-           # episodes for all sub-envrironments.
+           # episodes for all sub-environments.
            for env_index in range(self.num_envs):
                extra_model_outputs = defaultdict(dict)
                # `to_env` returns a dictionary with column keys and
@@ -710,7 +715,7 @@ def set_state(self, state: StateDict) -> None:
        # update.
        weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0)

-       # Only update the weigths, if this is the first synchronization or
+       # Only update the weights, if this is the first synchronization or
        # if the weights of this `EnvRunner` lacks behind the actual ones.
        if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no:
            rl_module_state = state[COMPONENT_RL_MODULE]

rllib/env/single_agent_env_runner.py

Lines changed: 13 additions & 13 deletions
@@ -157,9 +157,9 @@ def sample(

        Args:
            num_timesteps: The number of timesteps to sample during this call.
-               Note that only one of `num_timetseps` or `num_episodes` may be provided.
+               Note that only one of `num_timesteps` or `num_episodes` may be provided.
            num_episodes: The number of episodes to sample during this call.
-               Note that only one of `num_timetseps` or `num_episodes` may be provided.
+               Note that only one of `num_timesteps` or `num_episodes` may be provided.
            explore: If True, will use the RLModule's `forward_exploration()`
                method to compute actions. If False, will use the RLModule's
                `forward_inference()` method. If None (default), will use the `explore`
@@ -328,7 +328,7 @@ def _sample(

            # Extract the (vectorized) actions (to be sent to the env) from the
            # module/connector output. Note that these actions are fully ready (e.g.
-           # already unsquashed/clipped) to be sent to the environment) and might not
+           # already unsquashed/clipped) to be sent to the environment and might not
            # be identical to the actions produced by the RLModule/distribution, which
            # are the ones stored permanently in the episode objects.
            actions = to_env.pop(Columns.ACTIONS)
@@ -362,7 +362,7 @@ def _sample(

                # Call `add_env_step()` method on episode.
                else:
-                   # Only increase ts when we actually stepped (not reset'd as a reset
+                   # Only increase ts when we actually stepped (not reset as a reset
                    # does not count as a timestep).
                    ts += 1
                    episodes[env_index].add_env_step(
@@ -375,7 +375,7 @@ def _sample(
                        extra_model_outputs=extra_model_output,
                    )

-           # Env-to-module connector pass (cache results as we will do the RLModule
+           # Env-to-module connector pass cache results as we will do the RLModule
            # forward pass only in the next `while`-iteration.
            if self.module is not None:
                self._cached_to_module = self._env_to_module(
@@ -442,7 +442,7 @@ def _sample(
        ]

        for eps in self._episodes:
-           # Just started Episodes do not have to be returned. There is no data
+           # Just started episodes do not have to be returned. There is no data
            # in them anyway.
            if eps.t == 0:
                continue
@@ -554,8 +554,8 @@ def set_state(self, state: StateDict) -> None:
        # update.
        weights_seq_no = state.get(WEIGHTS_SEQ_NO, 0)

-       # Only update the weigths, if this is the first synchronization or
-       # if the weights of this `EnvRunner` lacks behind the actual ones.
+       # Only update the weights, if this is the first synchronization or
+       # if the weights of this `EnvRunner` lag behind the actual ones.
        if weights_seq_no == 0 or self._weights_seq_no < weights_seq_no:
            rl_module_state = state[COMPONENT_RL_MODULE]
            if isinstance(rl_module_state, ray.ObjectRef):
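The corrected comment in `set_state()` describes a simple monotonic gate: incoming weights are applied only on the first synchronization (a sequence number of 0 in the incoming state) or when this runner's local sequence number lags behind the incoming one, so stale broadcasts are ignored. A tiny sketch of that check; the helper name is made up for illustration, while the real code compares `self._weights_seq_no` against `state[WEIGHTS_SEQ_NO]`.

def should_update_weights(local_seq_no: int, incoming_seq_no: int) -> bool:
    """Mirror of the `weights_seq_no` gate: update on the first sync (0) or
    whenever the local weights lag behind the incoming ones."""
    return incoming_seq_no == 0 or local_seq_no < incoming_seq_no


assert should_update_weights(local_seq_no=0, incoming_seq_no=0)      # first sync
assert should_update_weights(local_seq_no=3, incoming_seq_no=5)      # lagging
assert not should_update_weights(local_seq_no=5, incoming_seq_no=4)  # stale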
@@ -609,13 +609,13 @@ def get_checkpointable_components(self):
    def assert_healthy(self):
        """Checks that self.__init__() has been completed properly.

-       Ensures that the instances has a `MultiRLModule` and an
+       Ensures that the instance has a `MultiRLModule` and an
        environment defined.

        Raises:
            AssertionError: If the EnvRunner Actor has NOT been properly initialized.
        """
-       # Make sure, we have built our gym.vector.Env and RLModule properly.
+       # Make sure we have built our gym.vector.Env and RLModule properly.
        assert self.env and hasattr(self, "module")

    @override(EnvRunner)
@@ -626,8 +626,8 @@ def make_env(self) -> None:
        `self.config.env_config`) and then call this method to create new environments
        with the updated configuration.
        """
-       # If an env already exists, try closing it first (to allow it to properly
-       # cleanup).
+       # If an env already exists, try closing it first
+       # to allow it to properly clean up.
        if self.env is not None:
            try:
                self.env.close()
@@ -854,7 +854,7 @@ def _log_episode_metrics(self, length, ret, sec):
        # Log general episode metrics.
        # Use the configured window, but factor in the parallelism of the EnvRunners.
        # As a result, we only log the last `window / num_env_runners` steps here,
-       # b/c everything gets parallel-merged in the Algorithm process.
+       # because everything gets parallel-merged in the Algorithm process.
        win = max(
            1,
            int(
