richardliaw · richardliaw · Nov 5, 2018 · Nov 5, 2018 · Nov 5, 2018 · Nov 5, 2018
diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh
@@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
   bash miniconda.sh -b -p $HOME/miniconda
   export PATH="$HOME/miniconda/bin:$PATH"
   pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
-    feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout
+    feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
 elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
   sudo apt-get update
   sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
@@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
   bash miniconda.sh -b -p $HOME/miniconda
   export PATH="$HOME/miniconda/bin:$PATH"
   pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
-    feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout
+    feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
 elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
   # check that brew is installed
   which -s brew

diff --git a/python/ray/test/cluster_utils.py b/python/ray/test/cluster_utils.py
@@ -43,6 +43,7 @@ def __init__(self,
             if connect:
                 redis_password = head_node_args.get("redis_password")
                 output_info = ray.init(
+                    ignore_reinit_error=True,
                     redis_address=self.redis_address,
                     redis_password=redis_password)
                 logger.info(output_info)

diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py
@@ -71,19 +71,19 @@ def _init(self):
         self._log_syncer = get_syncer(self.logdir, self.uri)
 
     def on_result(self, result):
-        for logger in self._loggers:
-            logger.on_result(result)
+        for _logger in self._loggers:
+            _logger.on_result(result)
         self._log_syncer.set_worker_ip(result.get(NODE_IP))
         self._log_syncer.sync_if_needed()
 
     def close(self):
-        for logger in self._loggers:
-            logger.close()
+        for _logger in self._loggers:
+            _logger.close()
         self._log_syncer.sync_now(force=True)
 
     def flush(self):
-        for logger in self._loggers:
-            logger.flush()
+        for _logger in self._loggers:
+            _logger.flush()
         self._log_syncer.sync_now(force=True)
         self._log_syncer.wait()
 
@@ -99,7 +99,7 @@ def _init(self):
         with open(config_out, "w") as f:
             json.dump(self.config, f, sort_keys=True, cls=_SafeFallbackEncoder)
         local_file = os.path.join(self.logdir, "result.json")
-        self.local_out = open(local_file, "w")
+        self.local_out = open(local_file, "a")
 
     def on_result(self, result):
         json.dump(result, self, cls=_SafeFallbackEncoder)
@@ -109,6 +109,9 @@ def write(self, b):
         self.local_out.write(b)
         self.local_out.flush()
 
+    def flush(self):
+        self.local_out.flush()
+
     def close(self):
         self.local_out.close()
 
@@ -128,14 +131,16 @@ def to_tf_values(result, path):
 
 class _TFLogger(Logger):
     def _init(self):
+        # TODO(rliaw): Implement proper resume functionality for this.
         self._file_writer = tf.summary.FileWriter(self.logdir)
 
     def on_result(self, result):
         tmp = result.copy()
         for k in [
                 "config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION
         ]:
-            del tmp[k]  # not useful to tf log these
+            if k in tmp:
+                del tmp[k]  # not useful to tf log these
         values = to_tf_values(tmp, ["ray", "tune"])
         train_stats = tf.Summary(value=values)
         t = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]
@@ -158,15 +163,21 @@ class _VisKitLogger(Logger):
     def _init(self):
         """CSV outputted with Headers as first set of results."""
         # Note that we assume params.json was already created by JsonLogger
-        self._file = open(os.path.join(self.logdir, "progress.csv"), "w")
+        progress_file = os.path.join(self.logdir, "progress.csv")
+        self._continuing = os.path.exists(progress_file)
+        self._file = open(progress_file, "a")
         self._csv_out = None
 
     def on_result(self, result):
         if self._csv_out is None:
             self._csv_out = csv.DictWriter(self._file, result.keys())
-            self._csv_out.writeheader()
+            if not self._continuing:
+                self._csv_out.writeheader()
         self._csv_out.writerow(result.copy())
 
+    def flush(self):
+        self._file.flush()
+
     def close(self):
         self._file.close()
 

diff --git a/python/ray/tune/ray_trial_executor.py b/python/ray/tune/ray_trial_executor.py
@@ -19,8 +19,8 @@
 class RayTrialExecutor(TrialExecutor):
     """An implemention of TrialExecutor based on Ray."""
 
-    def __init__(self, queue_trials=False):
-        super(RayTrialExecutor, self).__init__(queue_trials)
+    def __init__(self, queue_trials=False, track_checkpoints=False):
+        super(RayTrialExecutor, self).__init__(queue_trials, track_checkpoints)
         self._running = {}
         # Since trial resume after paused should not run
         # trial.train.remote(), thus no more new remote object id generated.
@@ -60,7 +60,7 @@ def _train(self, trial):
 
     def _start_trial(self, trial, checkpoint=None):
         prior_status = trial.status
-        trial.status = Trial.RUNNING
+        self.set_status(trial, Trial.RUNNING)
         trial.runner = self._setup_runner(trial)
         if not self.restore(trial, checkpoint):
             return
@@ -88,9 +88,9 @@ def _stop_trial(self, trial, error=False, error_msg=None,
         """
 
         if error:
-            trial.status = Trial.ERROR
+            self.set_status(trial, Trial.ERROR)
         else:
-            trial.status = Trial.TERMINATED
+            self.set_status(trial, Trial.TERMINATED)
 
         try:
             trial.write_error_log(error_msg)
@@ -103,32 +103,46 @@ def _stop_trial(self, trial, error=False, error_msg=None,
                     stop_tasks, num_returns=2, timeout=250)
         except Exception:
             logger.exception("Error stopping runner.")
-            trial.status = Trial.ERROR
+            self.set_status(trial, Trial.ERROR)
         finally:
             trial.runner = None
 
         if stop_logger:
             trial.close_logger()
 
-    def start_trial(self, trial, checkpoint_obj=None):
-        """Starts the trial."""
+    def start_trial(self, trial, checkpoint=None, raise_on_failure=False):
+        """Starts the trial.
+
+        Will not return resources if trial repeatedly fails on start.
+
+        Args:
+            trial (Trial): Trial to be started.
+            checkpoint (Checkpoint): A Python object or path storing the state
+                of the trial.
+            raise_on_failure (bool): Re-raise if the retry also fails.
+
+        Raises:
+            Exception: If the retry also fails and `raise_on_failure` is True.
+        """
 
         self._commit_resources(trial.resources)
         try:
-            self._start_trial(trial, checkpoint_obj)
+            self._start_trial(trial, checkpoint)
         except Exception:
             logger.exception("Error stopping runner - retrying...")
             error_msg = traceback.format_exc()
             time.sleep(2)
             self._stop_trial(trial, error=True, error_msg=error_msg)
             try:
-                self._start_trial(trial)
-            except Exception:
+                self._start_trial(trial, checkpoint)
+            except Exception as exc:
                 logger.exception("Error starting runner, aborting!")
                 error_msg = traceback.format_exc()
                 self._stop_trial(trial, error=True, error_msg=error_msg)
                 # note that we don't return the resources, since they may
                 # have been lost
+                if raise_on_failure:
+                    raise exc
 
     def _find_item(self, dictionary, item):
         out = [rid for rid, t in dictionary.items() if t is item]
@@ -140,6 +154,7 @@ def stop_trial(self, trial, error=False, error_msg=None, stop_logger=True):
         self._stop_trial(
             trial, error=error, error_msg=error_msg, stop_logger=stop_logger)
         if prior_status == Trial.RUNNING:
+            logger.debug("Returning resources for this trial.")
             self._return_resources(trial.resources)
             out = self._find_item(self._running, trial)
             for result_id in out:
@@ -293,7 +308,7 @@ def restore(self, trial, checkpoint=None):
             return True
         if trial.runner is None:
             logger.error("Unable to restore - no runner.")
-            trial.status = Trial.ERROR
+            self.set_status(trial, Trial.ERROR)
             return False
         try:
             value = checkpoint.value
@@ -307,5 +322,5 @@ def restore(self, trial, checkpoint=None):
             return True
         except Exception:
             logger.exception("Error restoring runner.")
-            trial.status = Trial.ERROR
+            self.set_status(trial, Trial.ERROR)
             return False