Handle trials with corrupted status

Why: Database corruption occurs when there are timeouts in PickledDB. The objective is saved but the status is not set to completed. How: We catch non-completed trials that have an objective and log a warning with a pointer to the documentation on how to manually fix corrupted trials.
Epistimio · Apr 1, 2020 · e28a934 · e28a934
1 parent 5bf1dc6
commit e28a934
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 12 deletions.
diff --git a/src/orion/core/worker/strategy.py b/src/orion/core/worker/strategy.py
@@ -17,6 +17,17 @@
 log = logging.getLogger(__name__)
 
 
+CORRUPTED_DB_WARNING = """\
+Trial `%s` has an objective but status is not completed.
+This is likely due to a corrupted database, possibly because of
+database timeouts. Try manually setting the status to `completed`.
+You can find documentation on how to do this at
+https://orion.readthedocs.io/en/latest/user/storage.html#storage-backend.
+
+If you encounter this issue often, please consider reporting it to
+https://github.com/Epistimio/orion/issues."""
+
+
 def get_objective(trial):
     """Get the value for the objective, if it exists, for this trial
 
@@ -57,15 +68,33 @@ def observe(self, points, results):
         # converted to expect trials instead of lists and dictionaries.
         pass
 
-    @abstractmethod
+    # pylint: disable=no-self-use
     def lie(self, trial):
         """Construct a fake result for an incomplete trial
 
-        :param trial: `orion.core.worker.trial.Trial`
-        :return: Float or None
-            The fake objective result corresponding to the trial given
+        Parameters
+        ----------
+        trial: `orion.core.worker.trial.Trial`
+            A trial object which is not supposed to be completed.
+
+        Returns
+        -------
+        ``orion.core.worker.trial.Trial.Result``
+            The fake objective result corresponding to the trial given.
+
+        Notes
+        -----
+        If the trial has an objective even if not completed, a warning is printed to user
+        with a pointer to documentation to resolve the database corruption. The result returned is
+        the corresponding objective instead of the lie.
+
         """
-        pass
+        objective = get_objective(trial)
+        if objective is not None:  # an objective of 0 is still a real objective
+            log.warning(CORRUPTED_DB_WARNING, trial.id)
+            return Trial.Result(name='lie', type='lie', value=objective)
+
+        return None
 
     @property
     def configuration(self):
@@ -83,7 +112,11 @@ def observe(self, points, results):
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""
-        pass
+        result = super(NoParallelStrategy, self).lie(trial)
+        if result:
+            return result
+
+        return None
 
 
 class MaxParallelStrategy(BaseParallelStrategy):
@@ -101,8 +134,9 @@ def observe(self, points, results):
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""
-        if get_objective(trial):
-            raise RuntimeError("Trial {} is completed but should not be.".format(trial.id))
+        result = super(MaxParallelStrategy, self).lie(trial)
+        if result:
+            return result
 
         return Trial.Result(name='lie', type='lie', value=self.max_result)
 
@@ -123,8 +157,9 @@ def observe(self, points, results):
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""
-        if get_objective(trial):
-            raise RuntimeError("Trial {} is completed but should not be.".format(trial.id))
+        result = super(MeanParallelStrategy, self).lie(trial)
+        if result:
+            return result
 
         return Trial.Result(name='lie', type='lie', value=self.mean_result)
 
@@ -142,8 +177,9 @@ def observe(self, points, results):
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""
-        if get_objective(trial):
-            raise RuntimeError("Trial {} is completed but should not be.".format(trial.id))
+        result = super(StubParallelStrategy, self).lie(trial)
+        if result:
+            return result
 
         return Trial.Result(name='lie', type='lie', value=self.stub_value)
 

diff --git a/tests/unittests/core/test_strategy.py b/tests/unittests/core/test_strategy.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """Collection of tests for :mod:`orion.core.worker.strategies`."""
+import logging
+
 import pytest
 
 from orion.core.worker.strategy import (
@@ -23,6 +25,40 @@ def incomplete_trial():
     return Trial(params=[{'name': 'a', 'type': 'integer', 'value': 6}])
 
 
+@pytest.fixture
+def corrupted_trial():
+    """Return a corrupted trial with results but status reserved"""
+    return Trial(params=[{'name': 'a', 'type': 'integer', 'value': 6}],
+                 results=[{'name': 'objective', 'type': 'objective', 'value': 1}],
+                 status='reserved')
+
+
+strategies = [
+    'MaxParallelStrategy', 'MeanParallelStrategy', 'NoParallelStrategy', 'StubParallelStrategy']
+
+
+@pytest.mark.parametrize('strategy', strategies)
+def test_handle_corrupted_trials(caplog, strategy, corrupted_trial):
+    """Verify that corrupted trials are handled properly"""
+    with caplog.at_level(logging.WARNING, logger="orion.core.worker.strategy"):
+        lie = Strategy(strategy).lie(corrupted_trial)
+
+    match = "Trial `{}` has an objective but status is not completed".format(corrupted_trial.id)
+    assert match in caplog.text
+
+    assert lie is not None
+    assert lie.value == corrupted_trial.objective.value
+
+
+@pytest.mark.parametrize('strategy', strategies)
+def test_handle_uncorrupted_trials(caplog, strategy, incomplete_trial):
+    """Verify that no corrupted-database warning is logged if trial is valid"""
+    with caplog.at_level(logging.WARNING, logger="orion.core.worker.strategy"):
+        Strategy(strategy).lie(incomplete_trial)
+
+    assert "has an objective but status is not completed" not in caplog.text
+
+
 class TestStrategyFactory:
     """Test creating a parallel strategy with the Strategy class"""