Skip to content

Commit

Permalink
Handle trials with corrupted status
Browse files Browse the repository at this point in the history
Why:

Database corruption can occur when there are timeouts in PickledDB: the
objective is saved but the status is not set to completed.

How:

We catch
non-completed trials that have an objective and log a warning with a pointer to
the documentation on how to manually fix corrupted trials.
  • Loading branch information
bouthilx committed Apr 1, 2020
1 parent 5bf1dc6 commit e28a934
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 12 deletions.
60 changes: 48 additions & 12 deletions src/orion/core/worker/strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# Warning logged when a trial carries an objective although its status is not
# `completed`. The single `%s` placeholder is filled lazily by the logging call
# with the id of the offending trial.
CORRUPTED_DB_WARNING = """\
Trial `%s` has an objective but status is not completed.
This is likely due to a corrupted database, possibly because of
database timeouts. Try setting manually status to `completed`.
You can find documentation to do this at
https://orion.readthedocs.io/en/latest/user/storage.html#storage-backend.
If you encounter this issue often, please consider reporting it to
https://github.com/Epistimio/orion/issues."""


def get_objective(trial):
"""Get the value for the objective, if it exists, for this trial
Expand Down Expand Up @@ -57,15 +68,33 @@ def observe(self, points, results):
# converted to expect trials instead of lists and dictionaries.
pass

@abstractmethod
# pylint: disable=no-self-use
def lie(self, trial):
    """Construct a fake result for an incomplete trial

    Parameters
    ----------
    trial: `orion.core.worker.trial.Trial`
        A trial object which is not supposed to be completed.

    Returns
    -------
    ``orion.core.worker.trial.Trial.Result`` or None
        A result of type ``'lie'`` carrying the trial's own objective when the
        trial already has one, otherwise None.

    Notes
    -----
    If the trial has an objective even if not completed, a warning is printed to user
    with a pointer to documentation to resolve the database corruption. The result returned is
    the corresponding objective instead of the lie.

    """
    objective = get_objective(trial)
    # Compare against None instead of relying on truthiness: an objective equal
    # to 0 is a legitimate value and must still be reported as a corrupted trial.
    # NOTE(review): assumes `get_objective` returns None when the trial has no
    # objective -- confirm against its definition.
    if objective is not None:
        log.warning(CORRUPTED_DB_WARNING, trial.id)
        return Trial.Result(name='lie', type='lie', value=objective)

    return None

@property
def configuration(self):
Expand All @@ -83,7 +112,11 @@ def observe(self, points, results):

def lie(self, trial):
    """Return the parent's result when it is truthy, otherwise None

    See BaseParallelStrategy.lie
    """
    inherited = super(NoParallelStrategy, self).lie(trial)
    # This strategy adds no lie of its own: any falsy parent result becomes None.
    return inherited if inherited else None


class MaxParallelStrategy(BaseParallelStrategy):
Expand All @@ -101,8 +134,9 @@ def observe(self, points, results):

def lie(self, trial):
    """Return the parent's result when truthy, else a lie holding `self.max_result`

    See BaseParallelStrategy.lie
    """
    # Defer to the parent first; a truthy result from it takes precedence over
    # this strategy's own lie. No exception is raised for a trial that already
    # has an objective.
    result = super(MaxParallelStrategy, self).lie(trial)
    if result:
        return result

    return Trial.Result(name='lie', type='lie', value=self.max_result)

Expand All @@ -123,8 +157,9 @@ def observe(self, points, results):

def lie(self, trial):
    """Return the parent's result when truthy, else a lie holding `self.mean_result`

    See BaseParallelStrategy.lie
    """
    # Defer to the parent first; a truthy result from it takes precedence over
    # this strategy's own lie. No exception is raised for a trial that already
    # has an objective.
    result = super(MeanParallelStrategy, self).lie(trial)
    if result:
        return result

    return Trial.Result(name='lie', type='lie', value=self.mean_result)

Expand All @@ -142,8 +177,9 @@ def observe(self, points, results):

def lie(self, trial):
    """Return the parent's result when truthy, else a lie holding `self.stub_value`

    See BaseParallelStrategy.lie
    """
    # Defer to the parent first; a truthy result from it takes precedence over
    # this strategy's own lie. No exception is raised for a trial that already
    # has an objective.
    result = super(StubParallelStrategy, self).lie(trial)
    if result:
        return result

    return Trial.Result(name='lie', type='lie', value=self.stub_value)

Expand Down
36 changes: 36 additions & 0 deletions tests/unittests/core/test_strategy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Collection of tests for :mod:`orion.core.worker.strategy`."""
import logging

import pytest

from orion.core.worker.strategy import (
Expand All @@ -23,6 +25,40 @@ def incomplete_trial():
return Trial(params=[{'name': 'a', 'type': 'integer', 'value': 6}])


@pytest.fixture
def corrupted_trial():
    """Build a trial that holds an objective result while its status is `reserved`"""
    param = {'name': 'a', 'type': 'integer', 'value': 6}
    objective = {'name': 'objective', 'type': 'objective', 'value': 1}
    return Trial(params=[param], results=[objective], status='reserved')


# Names of the parallel strategies used to parametrize the tests in this module.
strategies = [
    'MaxParallelStrategy',
    'MeanParallelStrategy',
    'NoParallelStrategy',
    'StubParallelStrategy',
]


@pytest.mark.parametrize('strategy', strategies)
def test_handle_corrupted_trials(caplog, strategy, corrupted_trial):
    """Check that a corrupted trial logs the warning and yields its own objective as lie"""
    expected_warning = "Trial `{}` has an objective but status is not completed".format(
        corrupted_trial.id)

    with caplog.at_level(logging.WARNING, logger="orion.core.worker.strategy"):
        lie = Strategy(strategy).lie(corrupted_trial)

    # The warning must name the offending trial.
    assert expected_warning in caplog.text

    # The returned result carries the trial's real objective, not a made-up value.
    assert lie is not None
    assert lie.value == corrupted_trial.objective.value


@pytest.mark.parametrize('strategy', strategies)
def test_handle_uncorrupted_trials(caplog, strategy, incomplete_trial):
    """Verify that no warning is logged if trial is valid"""
    with caplog.at_level(logging.WARNING, logger="orion.core.worker.strategy"):
        Strategy(strategy).lie(incomplete_trial)

    # Fill in the trial id before matching: with a literal `{}` left in the
    # string it could never appear in the log, making the assertion vacuous.
    match = "Trial `{}` has an objective but status is not completed".format(incomplete_trial.id)
    assert match not in caplog.text


class TestStrategyFactory:
"""Test creating a parallel strategy with the Strategy class"""

Expand Down

0 comments on commit e28a934

Please sign in to comment.