BonsaiAI · Kiko-Aumond · Jan 10, 2023 · Jan 10, 2023 · Jan 10, 2023 · Jan 10, 2023
@@ -1,6 +1,6 @@
 language: generic
-# Use Ubuntu 18.04
-dist: bionic
+# Use Ubuntu 20.04
+dist: focal
 
 git:
   clone: false  # Clone manually to work around Travis issues like https://github.com/travis-ci/travis-ci/issues/6337

@@ -5,7 +5,7 @@ steps:
     # the hash was computed in Mac
     if [[ $AGENT_OS == "Darwin" ]]; then 
       pushd $BUILD_SOURCESDIRECTORY
-      EXPECTED_HASH_TRAVIS='7e5de1d2a8ccd0947747164f845b3c195dab93d12e3692f82de969a9b849d937'
+      EXPECTED_HASH_TRAVIS='ee20821c018bbc7aa7d4ca551d5542f9ae9399b255701a70ccdd808d814adacc'
       CURRENT_HASH_TRAVIS=$(shasum -a 256 ./.travis.yml | awk '{print $1}')
       if [[ $EXPECTED_HASH_TRAVIS != $CURRENT_HASH_TRAVIS ]]; then 
         echo "The original Travis file of the project has changed"

@@ -9,7 +9,7 @@
 
 import ray
 from ray.util.iter import from_items, from_iterators, from_range, \
-    from_actors, ParallelIteratorWorker, LocalIterator, was_cause_by_stop_iteration
+    from_actors, NoSamplesAvailable, ParallelIteratorWorker, LocalIterator, was_cause_by_stop_iteration
 from ray.test_utils import Semaphore
 
 
@@ -551,7 +551,7 @@ def test_batch_across_shards(ray_start_regular_shared):
             for x in it.batch_across_shards():
                 collected.append(x)
                 attempts_collect_counts[attempts] += 1
-        except (StopIteration, RuntimeError) as ex:
+        except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
             if was_cause_by_stop_iteration(ex):
                 continue
             else:
@@ -582,7 +582,7 @@ def test_batch_across_unbalanced_shards(ray_start_regular_shared):
             for x in it.batch_across_shards():
                 collected.append(x)
                 attempts_collect_counts[attempts] += 1
-        except (StopIteration, RuntimeError) as ex:
+        except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
             if was_cause_by_stop_iteration(ex):
                 continue
             else:

@@ -8,6 +8,7 @@
 from typing import TypeVar, Generic, Iterable, List, Callable, Any, Iterator
 
 import ray
+from ray._private.memory_monitor import RayOutOfMemoryError
 from ray.util.iter_metrics import MetricsContext, SharedMetrics
 
 logger = logging.getLogger(__name__)
@@ -18,7 +19,7 @@
 
 
 def was_cause_by_stop_iteration(ex) -> bool:
-    if isinstance(ex, StopIteration):
+    if isinstance(ex, (StopIteration, NoSamplesAvailable)):
         return True
     elif ex.__cause__ is not None:
         return was_cause_by_stop_iteration(ex.__cause__)
@@ -457,7 +458,7 @@ def base_iterator(num_partitions, partition_index, timeout=None):
                             batch_ms=batch_ms)] = actor
                         for item in batch:
                             yield item
-                    except (StopIteration, RuntimeError) as ex:
+                    except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                         if was_cause_by_stop_iteration(ex):
                             pass
                         else:
@@ -525,7 +526,10 @@ def base_iterator(timeout=None):
                         yield _NextValueNotReady()
                 except TimeoutError:
                     yield _NextValueNotReady()
-                except (StopIteration, RuntimeError) as ex:
+                # Propagate OOM exceptions up the stack
+                except RayOutOfMemoryError:
+                    raise
+                except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                     if was_cause_by_stop_iteration(ex):
                         # If we are streaming (infinite sequence) then
                         # we want to try again as long as at least one
@@ -536,21 +540,20 @@ def base_iterator(timeout=None):
                         for a, f in zip(list(active), futures):
                             try:
                                 results.append(ray.get(f))
-                            except (StopIteration, RuntimeError) as ex_i:
+                            except (StopIteration, RuntimeError, NoSamplesAvailable) as ex_i:
                                 if was_cause_by_stop_iteration(ex_i):
                                     if self.is_infinite_sequence:
                                         stoped_actors.append(a)
                                     else:
                                         active.remove(a)
                                 else:
                                     # BONSAI changes begin - added sampling error logging
-                                    exc_info = ex_i
-                                    if isinstance(ex_i, StopIteration) and ex_i.__cause__ is not None:
-                                        exc_info = ex_i.__cause__
+                                    # ex_i is never a StopIteration here: the was_cause_by_stop_iteration
+                                    # check in the `if` branch above already handles every StopIteration
                                     logger.exception(
                                         "Encountered an exception while extracting "
                                         "valid data from `futures`.",
-                                        exc_info=exc_info
+                                        exc_info=ex_i
                                     )
                                     # BONSAI changes end
                                     raise ex_i
@@ -564,20 +567,19 @@ def base_iterator(timeout=None):
                             # BONSAI changes end
                             yield results
                         elif self.is_infinite_sequence and len(stoped_actors) == len(active):
-                            raise ex
+                            raise NoSamplesAvailable(ex)
                         # BONSAI changes begin - added logging
                         logger.info(f"Kicking off {len(active)} new sampling tasks.")
                         # BONSAI changes end
                         futures = [a.par_iter_next.remote() for a in active]
                     else:
                         # BONSAI changes begin - added sampling error logging
-                        exc_info = ex
-                        if isinstance(ex, StopIteration) and ex.__cause__ is not None:
-                            exc_info = ex.__cause__
+                        # ex is never a StopIteration here: the was_cause_by_stop_iteration
+                        # check in the `if` branch above already handles every StopIteration
                         logger.exception(
                             "Encountered an exception while extracting "
                             "valid data from `futures`.",
-                            exc_info=exc_info
+                            exc_info=ex
                         )
                         # BONSAI changes end
                         raise ex
@@ -651,7 +653,7 @@ def base_iterator(timeout=None):
                         active_actors.add(actor)
                         for item in batch:
                             yield item
-                    except (StopIteration, RuntimeError) as ex:
+                    except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                         if was_cause_by_stop_iteration(ex):
                             # If we are streaming (infinite sequence) then
                             # we want to try again as long as at least one
@@ -791,7 +793,7 @@ def base_iterator(timeout=None):
                         yield _NextValueNotReady()
                 except TimeoutError:
                     yield _NextValueNotReady()
-                except (StopIteration, RuntimeError) as ex:
+                except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                     if was_cause_by_stop_iteration(ex):
                         break
                     else:
@@ -899,7 +901,7 @@ def __next__(self):
         self._build_once()
         try:
             return next(self.built_iterator)
-        except (StopIteration, RuntimeError) as ex:
+        except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
             if was_cause_by_stop_iteration(ex):
                 # Force the regeneration of the base iterator
                 if self.is_infinite_sequence:
@@ -932,7 +934,7 @@ def for_each(self, fn: Callable[[T], U], max_concurrency=1,
 
             def apply_foreach(it):
                 for item in it:
-                    if isinstance(item, _NextValueNotReady):
+                    if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                         if hasattr(fn, LocalIterator.HANDLE_NEXT_VALUE_NOT_READY_HOOK_NAME):
                             with self._metrics_context():
                                 result = fn._handle_next_value_not_ready(item)
@@ -957,7 +959,7 @@ def apply_foreach(it):
                 remote = ray.remote(fn).options(**resources)
                 remote_fn = remote.remote
                 for item in it:
-                    if isinstance(item, _NextValueNotReady):
+                    if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                         yield item
                     else:
                         if max_concurrency and len(cur) >= max_concurrency:
@@ -982,7 +984,7 @@ def add_wait_hooks(it):
                             fn._on_fetch_start()
                         new_item = False
                     item = next(it)
-                    if not isinstance(item, _NextValueNotReady):
+                    if not isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                         new_item = True
                     yield item
 
@@ -999,7 +1001,7 @@ def filter(self, fn: Callable[[T], bool]) -> "LocalIterator[T]":
         def apply_filter(it):
             for item in it:
                 with self._metrics_context():
-                    if isinstance(item, _NextValueNotReady) or fn(item):
+                    if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)) or fn(item):
                         yield item
 
         return LocalIterator(
@@ -1013,7 +1015,7 @@ def batch(self, n: int) -> "LocalIterator[List[T]]":
         def apply_batch(it):
             batch = []
             for item in it:
-                if isinstance(item, _NextValueNotReady):
+                if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                     yield item
                 else:
                     batch.append(item)
@@ -1034,12 +1036,12 @@ def flatten(self) -> "LocalIterator[T[0]]":
         def apply_flatten(it):
             try:
                 for item in it:
-                    if isinstance(item, _NextValueNotReady):
+                    if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                         yield item
                     else:
                         for subitem in item:
                             yield subitem
-            except (StopIteration, RuntimeError) as ex:
+            except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                 if not was_cause_by_stop_iteration(ex):
                     raise ex
 
@@ -1071,7 +1073,7 @@ def shuffle(self, shuffle_buffer_size: int,
         def apply_shuffle(it):
             buffer = []
             for item in it:
-                if isinstance(item, _NextValueNotReady):
+                if isinstance(item, (_NextValueNotReady, NoSamplesAvailable)):
                     yield item
                 else:
                     buffer.append(item)
@@ -1164,7 +1166,7 @@ def gen(timeout):
                         if len(queues[i]) == 0:
                             try:
                                 fill_next(timeout)
-                            except (StopIteration, RuntimeError) as ex:
+                            except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                                 if was_cause_by_stop_iteration(ex):
                                     return
                                 else:
@@ -1267,7 +1269,7 @@ def build_union(timeout=None):
                                 yield_counts[i] += 1
                                 pull_counts[i] = 0
                                 yield item
-                    except (StopIteration, RuntimeError) as ex:
+                    except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                         if was_cause_by_stop_iteration(ex):
                             fix_weights = [
                                 w != "*" for w in round_robin_weights
@@ -1357,7 +1359,7 @@ def __iter__(self) -> Iterator[Any]:
                 def __next__(self) -> Any:
                     try:
                         return next(self.inner_iterator)
-                    except (StopIteration, RuntimeError) as ex:
+                    except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                         if was_cause_by_stop_iteration(ex):
                             self._make_inner_iterator()
                             # If we have an infinite sequence means that we have an stream
@@ -1406,7 +1408,7 @@ def par_iter_next_batch(self, batch_ms: int):
         while time.time() < t_end:
             try:
                 batch.append(self.par_iter_next())
-            except (StopIteration, RuntimeError) as ex:
+            except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                 if was_cause_by_stop_iteration(ex):
                     if len(batch) == 0:
                         raise StopIteration
@@ -1431,7 +1433,7 @@ def par_iter_slice(self, step: int, start: int):
                 try:
                     val = next(self.local_it)
                     self.next_ith_buffer[j].append(val)
-                except (StopIteration, RuntimeError) as ex:
+                except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                     if was_cause_by_stop_iteration(ex):
                         pass
                     else:
@@ -1452,7 +1454,7 @@ def par_iter_slice_batch(self, step: int, start: int, batch_ms: int):
         while time.time() < t_end:
             try:
                 batch.append(self.par_iter_slice(step, start))
-            except (StopIteration, RuntimeError) as ex:
+            except (StopIteration, RuntimeError, NoSamplesAvailable) as ex:
                 if was_cause_by_stop_iteration(ex):
                     if len(batch) == 0:
                         raise StopIteration
@@ -1479,6 +1481,16 @@ class _NextValueNotReady(Exception):
     pass
 
 
+class NoSamplesAvailable(Exception):
+    """Signal that a ParallelIterator has no samples available right now.
+
+    It is the ParallelIterator counterpart of LocalIterator's
+    _NextValueNotReady.
+    The cause may be, for instance, a slow sim; the situation is one in
+    which a retry is warranted.
+    """
+
+
 class _ActorSet(object):
     """Helper class that represents a set of actors and transforms."""
 

@@ -22,6 +22,7 @@
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
 from ray.rllib.evaluation.metrics import collect_metrics
 from ray.rllib.evaluation.worker_set import WorkerSet
+from ray.util.iter import NoSamplesAvailable
 from ray.rllib.utils import FilterManager, deep_update, merge_dicts
 from ray.rllib.utils.spaces import space_utils
 from ray.rllib.utils.framework import try_import_tf, TensorStructType
@@ -461,6 +462,26 @@
 # yapf: enable
 
 
+def is_memory_error(e: Exception) -> bool:
+    """Tell whether `e` looks like a Ray out-of-memory error.
+
+    True if the type name of `e` is an OOM name, or if that type name
+    starts with "RayTaskError" and a message line contains an OOM name.
+    """
+    oom_names = (
+        "ray.memory_monitor.RayOutOfMemoryError",
+        "RayOutOfMemoryError",
+    )
+    exc_type_name = type(e).__name__
+    if exc_type_name in oom_names:
+        return True
+
+    text_lines = [line for line in str(e).split("\n") if line]
+    if not exc_type_name.startswith("RayTaskError"):
+        return False
+    return any(name in line for name in oom_names for line in text_lines)
+
+
 @DeveloperAPI
 def with_common_config(
         extra_config: PartialTrainerConfigDict) -> TrainerConfigDict:
@@ -601,20 +622,40 @@ def train(self) -> ResultDict:
         for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
             try:
                 result = Trainable.train(self)
-            except RayError as e:
-                if self.config["ignore_worker_failures"]:
-                    logger.exception(
-                        "Error in train call, attempting to recover")
-                    self._try_recover()
-                else:
-                    logger.info(
-                        "Worker crashed during call to train(). To attempt to "
-                        "continue training without the failed worker, set "
-                        "`'ignore_worker_failures': True`.")
-                    raise e
             except Exception as e:
-                time.sleep(0.5)  # allow logs messages to propagate
-                raise e
+                # `e` is an instance: issubclass() would raise TypeError here
+                if isinstance(e, RayError):
+                    if is_memory_error(e):
+                        # do not retry in case of OOM errors
+                        logger.exception("Not attempting to recover from error in train call "
+                                         "since it was caused by an OOM error",
+                                         exc_info=e)
+                        time.sleep(0.5)  # allow log messages to propagate
+                        raise e
+                    elif isinstance(e, NoSamplesAvailable):
+                        # NoSamplesAvailable is by definition retryable: always retry
+                        logger.info("No samples available yet, retrying.")
+                        self._try_recover()
+                    elif self.config["ignore_worker_failures"]:
+                        logger.exception("Error in train call and ignore_worker_failures==True, "
+                                         "attempting to recover",
+                                         exc_info=e)
+                        self._try_recover()
+                    else:
+                        logger.info(
+                            "Worker crashed during call to train(). To attempt to "
+                            "continue training without the failed worker, set "
+                            "`'ignore_worker_failures': True`.")
+                        raise e
+                elif isinstance(e, StopIteration):
+                    # StopIteration is swallowed; the retry loop continues
+                    pass
+                else:
+                    logger.exception("Not attempting to recover from error in train call",
+                                     exc_info=e)
+                    time.sleep(0.5)  # allow log messages to propagate
+                    raise e
             else:
                 break
         if result is None: