From 0f51a94ac079b0e18f67da94b95f3c32abb6c996 Mon Sep 17 00:00:00 2001 From: Mark Towers Date: Fri, 21 Nov 2025 11:26:05 +0000 Subject: [PATCH 1/3] [rllib] Log or raise error if not fully handled Signed-off-by: Mark Towers --- rllib/core/learner/training_data.py | 6 ++++-- rllib/core/rl_module/rl_module.py | 2 ++ rllib/env/env_runner_group.py | 14 ++++++-------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/rllib/core/learner/training_data.py b/rllib/core/learner/training_data.py index dc2bb1bd3271..8179b866d374 100644 --- a/rllib/core/learner/training_data.py +++ b/rllib/core/learner/training_data.py @@ -132,8 +132,10 @@ def solve_refs(self): for ref in episode_refs: try: episodes.extend(ray.get(ref)) - except ray.exceptions.OwnerDiedError: - pass + except ray.exceptions.OwnerDiedError as e: + print( + f"episode-ref {ref} died and can't be collected with error: {e}" + ) self.episodes = episodes self.episodes_refs = None diff --git a/rllib/core/rl_module/rl_module.py b/rllib/core/rl_module/rl_module.py index 7e63c9cd9662..dce204155f01 100644 --- a/rllib/core/rl_module/rl_module.py +++ b/rllib/core/rl_module/rl_module.py @@ -485,6 +485,8 @@ def __init__( except AttributeError as e: if "'NoneType' object has no attribute " in e.args[0]: raise (self._catalog_ctor_error or e) + raise e + self._is_setup = True # Cache value for returning from `is_stateful` so we don't have to call # the module's `get_initial_state()` method all the time (might be expensive). diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 83203907b09f..837eb848542b 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -814,8 +814,8 @@ def stop(self) -> None: self.foreach_env_runner( lambda w: w.stop(), healthy_only=False, local_env_runner=True ) - except Exception: - logger.exception("Failed to stop workers!") + except Exception as e: + logger.exception(f"Failed to stop workers with {e}") finally: self._worker_manager.clear() @@ -1282,9 +1282,8 @@ def _make_worker( .remote(**kwargs) ) - @classmethod - def _valid_module(cls, class_path): - del cls + @staticmethod + def _valid_module(class_path): if ( isinstance(class_path, str) and not os.path.isfile(class_path) @@ -1295,9 +1294,8 @@ def _valid_module(cls, class_path): spec = importlib.util.find_spec(module_path) if spec is not None: return True - except (ModuleNotFoundError, ValueError): + except (ModuleNotFoundError, ValueError) as e: print( - f"module {module_path} not found while trying to get " - f"input {class_path}" + f"module {module_path} not found using input {class_path} with error: {e}" ) return False From 0f6dad6ad5a5f9bd9a8b251f8de133d411b21d14 Mon Sep 17 00:00:00 2001 From: Mark Towers Date: Tue, 25 Nov 2025 14:38:05 +0000 Subject: [PATCH 2/3] Gemini code review Signed-off-by: Mark Towers --- rllib/core/learner/training_data.py | 2 +- rllib/env/env_runner_group.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rllib/core/learner/training_data.py b/rllib/core/learner/training_data.py index 8179b866d374..acf31c40732c 100644 --- a/rllib/core/learner/training_data.py +++ b/rllib/core/learner/training_data.py @@ -133,7 +133,7 @@ def solve_refs(self): try: episodes.extend(ray.get(ref)) except ray.exceptions.OwnerDiedError as e: - print( + ray.logger.warning( f"episode-ref {ref} died and can't be collected with error: {e}" ) self.episodes = episodes diff --git a/rllib/env/env_runner_group.py b/rllib/env/env_runner_group.py index 837eb848542b..0bbe989b3ece 100644 --- a/rllib/env/env_runner_group.py +++ b/rllib/env/env_runner_group.py @@ -814,8 +814,8 @@ def stop(self) -> None: self.foreach_env_runner( lambda w: w.stop(), healthy_only=False, local_env_runner=True ) - except Exception as e: - logger.exception(f"Failed to stop workers with {e}") + except Exception: + logger.exception("Failed to stop workers!") finally: self._worker_manager.clear() @@ -1295,7 +1295,7 @@ def _valid_module(class_path): if spec is not None: return True except (ModuleNotFoundError, ValueError) as e: - print( + logger.warning( f"module {module_path} not found using input {class_path} with error: {e}" ) return False From 29ed39d3b25717e4734d712f8e7f23d70351c1bf Mon Sep 17 00:00:00 2001 From: Mark Towers Date: Wed, 26 Nov 2025 15:09:21 +0000 Subject: [PATCH 3/3] Improve episode_ref warning Signed-off-by: Mark Towers --- rllib/core/learner/training_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/core/learner/training_data.py b/rllib/core/learner/training_data.py index acf31c40732c..6d178539894a 100644 --- a/rllib/core/learner/training_data.py +++ b/rllib/core/learner/training_data.py @@ -134,7 +134,7 @@ def solve_refs(self): episodes.extend(ray.get(ref)) except ray.exceptions.OwnerDiedError as e: ray.logger.warning( - f"episode-ref {ref} died and can't be collected with error: {e}" + f"episode-ref {ref} died and can't be collected with error: {e}. This can happen if an EnvRunner is lost (for example because of a node failure) and is not critical in such cases." ) self.episodes = episodes self.episodes_refs = None