Merge pull request #631 from bouthilx/hotfix/reserve_parent_trials

Duplicate pending trials from parent/child for exc
Epistimio · Aug 20, 2021 · 23a4127 · 23a4127
2 parents 0921b7f + 8e0701f
commit 23a4127
Show file tree

Hide file tree

Showing 10 changed files with 349 additions and 78 deletions.
diff --git a/src/orion/client/experiment.py b/src/orion/client/experiment.py
@@ -294,6 +294,15 @@ def fetch_trials_by_status(self, status, with_evc_tree=False):
             status, with_evc_tree=with_evc_tree
         )
 
+    def fetch_pending_trials(self, with_evc_tree=False):
+        """Fetch all trials with status new, interrupted or suspended.
+
+        Sorted by ``Trial.submit_time``; ``with_evc_tree`` is forwarded unchanged.
+
+        :return: list of :class:`orion.core.worker.trial.Trial` objects
+        """
+        return self._experiment.fetch_pending_trials(with_evc_tree=with_evc_tree)
+
     def fetch_noncompleted_trials(self, with_evc_tree=False):
         """Fetch non-completed trials of this `Experiment` instance.
 

diff --git a/src/orion/core/worker/experiment.py b/src/orion/core/worker/experiment.py
@@ -16,6 +16,7 @@
 
 from orion.core.evc.adapters import BaseAdapter
 from orion.core.evc.experiment import ExperimentNode
+from orion.core.io.database import DuplicateKeyError
 from orion.core.utils.exceptions import UnsupportedOperation
 from orion.core.utils.flatten import flatten
 from orion.core.utils.singleton import update_singletons
@@ -240,6 +241,8 @@ def reserve_trial(self, score_handle=None):
 
         self.fix_lost_trials()
 
+        self.duplicate_pending_trials()
+
         selected_trial = self._storage.reserve_trial(self)
         log.debug("reserved trial (trial: %s)", selected_trial)
         return selected_trial
@@ -265,6 +268,43 @@ def fix_lost_trials(self):
             except FailedUpdate:
                 log.debug("failed")
 
+    def duplicate_pending_trials(self):
+        """Copy pending trials found elsewhere in the EVC tree into this experiment.
+
+        Trials of a parent or child experiment must not be executed directly by this
+        experiment: they would then run in the environment of a different experiment
+        than the one they belong to. Instead, every pending trial of the EVC tree that
+        has no pending counterpart here is registered again under this experiment's
+        id so that it can be reserved and executed locally. Each experiment only sees
+        its own copy of the trial.
+        """
+        self._check_if_writable()
+        tree_pending = self._select_evc_call(
+            with_evc_tree=True, function="fetch_pending_trials"
+        )
+        own_pending = self._select_evc_call(
+            with_evc_tree=False, function="fetch_pending_trials"
+        )
+
+        # Hashes ignore the experiment id so that copies match their originals.
+        own_hashes = {
+            own.compute_trial_hash(own, ignore_experiment=True) for own in own_pending
+        }
+
+        missing = (
+            other
+            for other in tree_pending
+            if other.compute_trial_hash(other, ignore_experiment=True) not in own_hashes
+        )
+
+        for other in missing:
+            other.experiment = self.id
+            try:
+                self._storage.register_trial(other)
+            except DuplicateKeyError:
+                # Handled as a race: the copy was presumably registered concurrently.
+                log.debug("Race condition while trying to duplicate trial %s", other.id)
+
     # pylint:disable=unused-argument
     def update_completed_trial(self, trial, results_file=None):
         """Inform database about an evaluated `trial` with results.
@@ -354,6 +394,15 @@ def fetch_trials_by_status(self, status, with_evc_tree=False):
         """
         return self._select_evc_call(with_evc_tree, "fetch_trials_by_status", status)
 
+    def fetch_pending_trials(self, with_evc_tree=False):
+        """Fetch all trials with status new, interrupted or suspended.
+
+        Sorted by `Trial.submit_time`; `with_evc_tree` is passed to `_select_evc_call`.
+
+        :return: list of `Trial` objects
+        """
+        return self._select_evc_call(with_evc_tree, "fetch_pending_trials")
+
     def fetch_noncompleted_trials(self, with_evc_tree=False):
         """Fetch non-completed trials of this `Experiment` instance.
 

diff --git a/src/orion/core/worker/strategy.py b/src/orion/core/worker/strategy.py
@@ -142,9 +142,11 @@ def configuration(self):
     def observe(self, points, results):
         """See BaseParallelStrategy.observe"""
         super(MaxParallelStrategy, self).observe(points, results)
-        self.max_result = max(
+        results = [
             result["objective"] for result in results if result["objective"] is not None
-        )
+        ]
+        if results:
+            self.max_result = max(results)
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""
@@ -175,9 +177,10 @@ def observe(self, points, results):
         objective_values = [
             result["objective"] for result in results if result["objective"] is not None
         ]
-        self.mean_result = sum(value for value in objective_values) / float(
-            len(objective_values)
-        )
+        if objective_values:
+            self.mean_result = sum(value for value in objective_values) / float(
+                len(objective_values)
+            )
 
     def lie(self, trial):
         """See BaseParallelStrategy.lie"""

diff --git a/src/orion/testing/evc.py b/src/orion/testing/evc.py
@@ -0,0 +1,85 @@
+import contextlib
+import copy
+
+from orion.client import build_experiment, get_experiment
+
+
+@contextlib.contextmanager
+def disable_duplication(monkeypatch):
+    """Turn ``Experiment.duplicate_pending_trials`` into a no-op within the block."""
+    target = "orion.core.worker.experiment.Experiment.duplicate_pending_trials"
+
+    def _noop(self):
+        return None
+
+    with monkeypatch.context() as patcher:
+        patcher.setattr(target, _noop)
+        yield
+
+
+def generate_trials(exp, trials):
+    """Generate one trial in ``exp`` for each item of ``trials``.
+
+    Each item is either ``None`` or a dictionary of hyperparameter values (valid
+    for ``exp.space``) that may also carry a ``"status"`` entry.
+
+    Items that are ``None``, or that hold nothing besides a status, are suggested
+    with ``exp.suggest()`` and released right away; other items are inserted with
+    ``exp.insert()``. The status, when given, is then written to the storage;
+    otherwise the trial keeps the status it got from insert/suggest.
+    """
+    for trial_config in trials:
+        # Work on a copy so that popping the status never mutates the caller's data.
+        params = copy.deepcopy(trial_config)
+        status = params.pop("status", None) if params else None
+        if params:
+            trial = exp.insert(params=params)
+        else:
+            with exp.suggest() as trial:
+                # Releases suggested trial when leaving with-clause.
+                pass
+
+        if status is not None:
+            # No public client API is used here; the status is forced in storage.
+            exp._experiment._storage.set_trial_status(trial, status)
+
+
+def build_root_experiment(space=None, trials=None):
+    """Build experiment ``root`` (default: 3 uniform dims, 4 trials) and fill it."""
+    if trials is None:
+        trials = [{"x": i, "y": i * 2, "z": i ** 2} for i in range(4)]
+    if space is None:
+        space = dict.fromkeys(("x", "y", "z"), "uniform(0, 100)")
+
+    n_trials = len(trials)
+    experiment = build_experiment(name="root", max_trials=n_trials, space=space)
+    generate_trials(experiment, trials)
+
+
+def build_child_experiment(space=None, trials=None, name="child", parent="root"):
+    """Branch experiment ``name`` from ``parent`` and generate ``trials`` in it.
+
+    ``trials`` defaults to six suggested trials; ``max_trials`` is the parent's
+    ``max_trials`` plus ``len(trials)``.
+    """
+    trials = [None] * 6 if trials is None else trials
+    budget = get_experiment(parent).max_trials + len(trials)
+    branching = {"branch_from": parent, "enable": True}
+
+    experiment = build_experiment(
+        name=name, space=space, max_trials=budget, branching=branching
+    )
+    assert experiment.name == name
+    assert experiment.version == 1
+
+    generate_trials(experiment, trials)
+
+
+def build_grand_child_experiment(space=None, trials=None):
+    """Branch ``grand-child`` from ``child`` and generate trials in it."""
+    # Without explicit trials, fall back to five ``None`` items.
+    trials = [None] * 5 if trials is None else trials
+
+    build_child_experiment(
+        space=space, trials=trials, name="grand-child", parent="child"
+    )
diff --git a/tests/functional/branching/test_branching.py b/tests/functional/branching/test_branching.py
@@ -831,15 +831,15 @@ def test_run_entire_full_x_full_y(init_entire):
 
     orion.core.cli.main(
         (
-            "-vv hunt --max-trials 20 --pool-size 1 -n full_x_full_y "
+            "-vv hunt --max-trials 30 --pool-size 1 -n full_x_full_y "
             "./black_box_with_y.py "
             "-x~uniform(-10,10) "
             "-y~uniform(-10,10,default_value=1)"
         ).split(" ")
     )
 
-    assert len(experiment.fetch_trials(with_evc_tree=True)) == 39
-    assert len(experiment.fetch_trials()) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=True)) == 30
+    assert len(experiment.fetch_trials(with_evc_tree=False)) == 30
 
 
 def test_run_entire_full_x_full_y_no_args(init_entire):
@@ -850,11 +850,11 @@ def test_run_entire_full_x_full_y_no_args(init_entire):
     assert len(experiment.fetch_trials()) == 4
 
     orion.core.cli.main(
-        ("-vv hunt --max-trials 20 --pool-size 1 -n full_x_full_y").split(" ")
+        ("-vv hunt --max-trials 30 --pool-size 1 -n full_x_full_y").split(" ")
     )
 
-    assert len(experiment.fetch_trials(with_evc_tree=True)) == 39
-    assert len(experiment.fetch_trials()) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=True)) == 30
+    assert len(experiment.fetch_trials(with_evc_tree=False)) == 30
 
 
 def test_new_algo(init_full_x_new_algo):
@@ -872,8 +872,8 @@ def test_new_algo(init_full_x_new_algo):
         ("-vv hunt --max-trials 20 --pool-size 1 -n full_x_new_algo").split(" ")
     )
 
-    assert len(experiment.fetch_trials(with_evc_tree=True)) == 21
-    assert len(experiment.fetch_trials()) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=True)) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=False)) == 20
 
 
 def test_new_algo_not_resolved(init_full_x, capsys):
@@ -1002,8 +1002,8 @@ def test_new_cli(init_full_x_new_cli):
         ("-vv hunt --max-trials 20 --pool-size 1 -n full_x_new_cli").split(" ")
     )
 
-    assert len(experiment.fetch_trials(with_evc_tree=True)) == 21
-    assert len(experiment.fetch_trials()) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=True)) == 20
+    assert len(experiment.fetch_trials(with_evc_tree=False)) == 20
 
 
 @pytest.mark.usefixtures("init_full_x")