Merged

Changes from all commits
48 commits
6527abf
Change the log syncing behavior
hartikainen Mar 21, 2019
e760991
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw Apr 4, 2019
2ada6db
fix up abstractions for syncer
richardliaw Apr 4, 2019
26fe09b
Finished checkpoint syncing
richardliaw Apr 4, 2019
f45110d
Code
richardliaw Apr 4, 2019
ee5c61d
Set of changes to get things running
richardliaw Apr 30, 2019
045bfa4
Fixes for log syncing
hartikainen May 1, 2019
c5b1731
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw May 16, 2019
7d7ced1
Fix parts
richardliaw May 16, 2019
5ce47d7
Merge branch 'tune-submit-fix' into bunch-of-log-sync-fixes
richardliaw May 16, 2019
979a04c
Lint and other fixes
richardliaw May 16, 2019
91dad93
fix some test
richardliaw May 16, 2019
e3ecc72
Remove extra parsing functionality
richardliaw May 16, 2019
26a538f
Merge branch 'tune-relax-configs' into bunch-of-log-sync-fixes
richardliaw May 16, 2019
b0f6218
some test fixes
richardliaw May 17, 2019
5ca8eca
Fix up cloud syncing
richardliaw May 17, 2019
b7fd1e9
Another thing to do
richardliaw May 17, 2019
1ad642a
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw Jun 12, 2019
5bc70af
Fix up tests and local sync
richardliaw Jun 14, 2019
2b6d21f
Fix up tests, start on local migration
richardliaw Jun 14, 2019
f11800b
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw Jun 25, 2019
ed015f6
fix distributed migrations
richardliaw Jun 26, 2019
a39279e
comments
richardliaw Jun 26, 2019
368f90b
formatting
richardliaw Jun 26, 2019
2e38543
Better checkpoint directory handling
richardliaw Jun 26, 2019
099edb9
fix tests
richardliaw Jun 26, 2019
9d38c10
fix tests
richardliaw Jun 27, 2019
10302af
fix click
richardliaw Jun 27, 2019
2fcdfa8
comments
richardliaw Jun 27, 2019
7324426
formatting comments
richardliaw Jun 27, 2019
77735c5
formatting and comments
richardliaw Jun 27, 2019
b4b5937
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw Jun 27, 2019
8a4d0c2
sync function deprecations
richardliaw Jun 29, 2019
907d250
syncfunction
richardliaw Jun 29, 2019
167c269
Add documentation for Syncing and Uploading
richardliaw Jun 30, 2019
bd47848
nit
richardliaw Jun 30, 2019
95f59ac
BaseSyncer as base for Mixin in edge case
richardliaw Jun 30, 2019
5a2ef5e
more docs
richardliaw Jul 1, 2019
7246c57
clean up assertions
richardliaw Jul 1, 2019
4fabe08
validate
richardliaw Jul 1, 2019
bb4a950
nit
richardliaw Jul 1, 2019
fdebff3
Update test_cluster.py
richardliaw Jul 1, 2019
cd135d7
betterdoc
richardliaw Jul 1, 2019
652050f
Update tune-usage.rst
richardliaw Jul 2, 2019
94e3cac
cleanup
richardliaw Jul 2, 2019
6243211
Merge branch 'bunch-of-log-sync-fixes' of github.com:hartikainen/ray …
richardliaw Jul 2, 2019
d3cefa8
Merge branch 'master' into bunch-of-log-sync-fixes
richardliaw Jul 2, 2019
f825ec0
nit
richardliaw Jul 2, 2019
36 changes: 22 additions & 14 deletions doc/source/tune-usage.rst
@@ -259,7 +259,7 @@ of a trial, you can additionally set the checkpoint_at_end to True. An example i
Recovering From Failures (Experimental)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed with ``resume=True``. The default setting of ``resume=False`` creates a new experiment, and ``resume="prompt"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name.
Tune automatically persists the progress of your experiments, so if an experiment crashes or is otherwise cancelled, it can be resumed by passing one of ``True``, ``False``, ``"LOCAL"``, ``"REMOTE"``, or ``"PROMPT"`` to ``tune.run(resume=...)``. The default setting of ``resume=False`` creates a new experiment. ``resume="LOCAL"`` and ``resume=True`` restore the experiment from ``local_dir/[experiment_name]``. ``resume="REMOTE"`` syncs the upload dir down to the local dir and then restores the experiment from ``local_dir/[experiment_name]``. ``resume="PROMPT"`` will cause Tune to prompt you for whether you want to resume. You can always force a new experiment to be created by changing the experiment name.

Note that trials will be restored to their last checkpoint. If trial checkpointing is not enabled, unfinished trials will be restarted from scratch.
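
For example, a resumable run could look like the following (a minimal sketch; ``MyTrainableClass`` and the experiment name are placeholders):

.. code-block:: python

    tune.run(
        MyTrainableClass,
        name="experiment_name",
        local_dir="~/ray_results",
        checkpoint_freq=10,  # checkpointing lets unfinished trials restore
        resume=True,  # restores from ~/ray_results/experiment_name if present
    )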

@@ -399,31 +399,39 @@ An example can be found in `logging_example.py <https://github.com/ray-project/r
Custom Sync/Upload Commands
~~~~~~~~~~~~~~~~~~~~~~~~~~~

If an upload directory is provided, Tune will automatically sync results to the given
directory with standard S3/gsutil commands. You can customize the upload command by
providing either a function or a string.
Tune automatically syncs the trial folder on remote nodes back to the head node. This requires the Ray cluster to be started with the `autoscaler <autoscaling.html>`__.
By default, this syncing requires ``rsync`` to be installed. You can customize the sync command with the ``sync_to_driver`` argument in ``tune.run`` by providing either a function or a string.

If a string is provided, then it must include replacement fields ``{local_dir}`` and
``{remote_dir}``, like ``"aws s3 sync {local_dir} {remote_dir}"``.

Alternatively, a function can be provided with the following signature (and must
be wrapped with ``tune.function``):
If a string is provided, then it must include replacement fields ``{source}`` and ``{target}``, like ``rsync -savz -e "ssh -i ssh_key.pem" {source} {target}``. Alternatively, a function can be provided with the following signature (and must be wrapped with ``tune.function``):

.. code-block:: python

    def custom_sync_func(local_dir, remote_dir):
        sync_cmd = "aws s3 sync {local_dir} {remote_dir}".format(
            local_dir=local_dir,
            remote_dir=remote_dir)
    def custom_sync_func(source, target):
        sync_cmd = "rsync {source} {target}".format(
            source=source,
            target=target)
        sync_process = subprocess.Popen(sync_cmd, shell=True)
        sync_process.wait()

    tune.run(
        MyTrainableClass,
        name="experiment_name",
        sync_function=tune.function(custom_sync_func)
        sync_to_driver=tune.function(custom_sync_func),
    )

When syncing results back to the driver, the source would be a path similar to ``ubuntu@192.0.0.1:/home/ubuntu/ray_results/trial1``, and the target would be a local path.
This custom sync command would also be used on node failures, where the source argument would be the path to the trial directory and the target would be a remote path. The ``sync_to_driver`` function would be invoked to push a checkpoint to a new node so that a queued trial can resume.
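
For instance, the string form could be passed directly (a sketch assuming ``rsync`` is installed; the SSH key path is a placeholder):

.. code-block:: python

    tune.run(
        MyTrainableClass,
        name="experiment_name",
        sync_to_driver='rsync -savz -e "ssh -i ssh_key.pem" {source} {target}',
    )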

If an upload directory is provided, Tune will automatically sync results to the given directory, natively supporting standard S3/gsutil commands.
You can customize this to specify arbitrary storages with the ``sync_to_cloud`` argument. This argument is similar to ``sync_to_driver`` in that it supports strings with the same replacement fields and arbitrary functions. See `syncer.py <https://github.com/ray-project/ray/blob/master/python/ray/tune/syncer.py>`__ for implementation details.

.. code-block:: python

    tune.run(
        MyTrainableClass,
        name="experiment_name",
        sync_to_cloud=tune.function(custom_sync_func),
    )
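
A string command works here as well, for example (a sketch assuming the AWS CLI is available; the bucket path is a placeholder):

.. code-block:: python

    tune.run(
        MyTrainableClass,
        name="experiment_name",
        upload_dir="s3://your_bucket/path",
        sync_to_cloud="aws s3 sync {source} {target}",
    )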

Tune Client API
---------------
12 changes: 12 additions & 0 deletions python/ray/rllib/train.py
@@ -10,6 +10,7 @@
import ray
from ray.tests.cluster_utils import Cluster
from ray.tune.config_parser import make_parser
from ray.tune.result import DEFAULT_RESULTS_DIR
from ray.tune.trial import resources_to_json
from ray.tune.tune import _make_scheduler, run_experiments

@@ -71,6 +72,17 @@ def create_parser(parser_creator=None):
default="default",
type=str,
help="Name of the subdirectory under `local_dir` to put results in.")
parser.add_argument(
"--local-dir",
default=DEFAULT_RESULTS_DIR,
type=str,
help="Local dir to save training results to. Defaults to '{}'.".format(
DEFAULT_RESULTS_DIR))
parser.add_argument(
"--upload-dir",
default="",
type=str,
help="Optional URI to sync training results to (e.g. s3://bucket).")
parser.add_argument(
"--resume",
action="store_true",
17 changes: 2 additions & 15 deletions python/ray/tune/config_parser.py
@@ -10,7 +10,6 @@
from six import string_types

from ray.tune import TuneError
from ray.tune.result import DEFAULT_RESULTS_DIR
from ray.tune.trial import Trial, json_to_resources
from ray.tune.logger import _SafeFallbackEncoder

@@ -65,17 +64,6 @@ def make_parser(parser_creator=None, **kwargs):
        default=1,
        type=int,
        help="Number of times to repeat each trial.")
    parser.add_argument(
        "--local-dir",
        default=DEFAULT_RESULTS_DIR,
        type=str,
        help="Local dir to save training results to. Defaults to '{}'.".format(
            DEFAULT_RESULTS_DIR))
    parser.add_argument(
        "--upload-dir",
        default="",
        type=str,
        help="Optional URI to sync training results to (e.g. s3://bucket).")
    parser.add_argument(
        "--checkpoint-freq",
        default=0,
@@ -183,7 +171,7 @@ def create_trial_from_spec(spec, output_path, parser, **trial_kwargs):
        trainable_name=spec["run"],
        # json.load leads to str -> unicode in py2.7
        config=spec.get("config", {}),
        local_dir=os.path.join(args.local_dir, output_path),
        local_dir=os.path.join(spec["local_dir"], output_path),
        # json.load leads to str -> unicode in py2.7
        stopping_criterion=spec.get("stop", {}),
        checkpoint_freq=args.checkpoint_freq,
@@ -193,10 +181,9 @@ def create_trial_from_spec(spec, output_path, parser, **trial_kwargs):
        export_formats=spec.get("export_formats", []),
        # str(None) doesn't create None
        restore_path=spec.get("restore"),
        upload_dir=args.upload_dir,
        trial_name_creator=spec.get("trial_name_creator"),
        loggers=spec.get("loggers"),
        # str(None) doesn't create None
        sync_function=spec.get("sync_function"),
        sync_to_driver_fn=spec.get("sync_to_driver"),
        max_failures=args.max_failures,
        **trial_kwargs)
13 changes: 5 additions & 8 deletions python/ray/tune/examples/logging_example.py
@@ -11,9 +11,8 @@

import numpy as np

import ray
from ray import tune
from ray.tune import Trainable, run, Experiment
from ray.tune import Trainable, run


class TestLogger(tune.logger.Logger):
@@ -60,11 +59,11 @@ def _restore(self, checkpoint_path):
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()
    exp = Experiment(

    trials = run(
        MyTrainableClass,
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=1,
        num_samples=5,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
@@ -73,5 +72,3 @@ def _restore(self, checkpoint_path):
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run(exp)
27 changes: 15 additions & 12 deletions python/ray/tune/experiment.py
@@ -52,7 +52,6 @@ class Experiment(object):
        >>>     },
        >>>     num_samples=10,
        >>>     local_dir="~/ray_results",
        >>>     upload_dir="s3://your_bucket/path",
        >>>     checkpoint_freq=10,
        >>>     max_failures=2)
    """
@@ -68,7 +67,7 @@ def __init__(self,
                 upload_dir=None,
                 trial_name_creator=None,
                 loggers=None,
                 sync_function=None,
                 sync_to_driver=None,
                 checkpoint_freq=0,
                 checkpoint_at_end=False,
                 keep_checkpoints_num=None,
@@ -78,18 +77,16 @@
                 restore=None,
                 repeat=None,
                 trial_resources=None,
                 custom_loggers=None):
        if sync_function:
            assert upload_dir, "Need `upload_dir` if sync_function given."

                 custom_loggers=None,
                 sync_function=None):
        if repeat:
            _raise_deprecation_note("repeat", "num_samples", soft=False)
        if trial_resources:
            _raise_deprecation_note(
                "trial_resources", "resources_per_trial", soft=False)
        if custom_loggers:
            _raise_deprecation_note("custom_loggers", "loggers", soft=False)

        if sync_function:
            _raise_deprecation_note(
                "sync_function", "sync_to_driver", soft=False)
        run_identifier = Experiment._register_if_needed(run)
        spec = {
            "run": run_identifier,
@@ -98,10 +95,10 @@
"resources_per_trial": resources_per_trial,
"num_samples": num_samples,
"local_dir": os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR),
"upload_dir": upload_dir or "", # argparse converts None to "null"
"upload_dir": upload_dir,
"trial_name_creator": trial_name_creator,
"loggers": loggers,
"sync_function": sync_function,
"sync_to_driver": sync_to_driver,
"checkpoint_freq": checkpoint_freq,
"checkpoint_at_end": checkpoint_at_end,
"keep_checkpoints_num": keep_checkpoints_num,
@@ -182,7 +179,13 @@ def local_dir(self):

    @property
    def checkpoint_dir(self):
        return os.path.join(self.spec["local_dir"], self.name)
        if self.local_dir:
            return os.path.join(self.local_dir, self.name)

    @property
    def remote_checkpoint_dir(self):
        if self.spec["upload_dir"]:
            return os.path.join(self.spec["upload_dir"], self.name)


def convert_to_experiment_list(experiments):