Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
99 commits
Select commit Hold shift + click to select a range
e557b6a
[tune] Throw on overstepping
richardliaw Nov 5, 2018
b755785
Add Tune Multi-Node Tests
richardliaw Nov 5, 2018
32d1242
Add cluster bookkeeping code
richardliaw Nov 5, 2018
9ec3a60
add test for adding node
richardliaw Nov 5, 2018
44fe1e2
multinode test fixes
richardliaw Nov 5, 2018
d9c9e3b
First pass at allowing updatable values
richardliaw Nov 6, 2018
d6cade1
Fix compilation issues
richardliaw Nov 6, 2018
ac74520
Merge branch 'config_updating' into global_state_multinode
richardliaw Nov 6, 2018
a95c718
Add config file parsing
richardliaw Nov 6, 2018
5814655
Full initialization
richardliaw Nov 6, 2018
f63df3f
Merge branch 'config_updating' into global_state_multinode
richardliaw Nov 6, 2018
2824836
Wrote a good test
richardliaw Nov 6, 2018
6e7bd6a
Merge branch 'config_updating' into tune_cluster
richardliaw Nov 6, 2018
4842481
configuration parsing and stuff
richardliaw Nov 7, 2018
8e52103
docs
richardliaw Nov 7, 2018
83d6947
write some tests, make it good
richardliaw Nov 7, 2018
4349adf
Merge branch 'master' into config_updating
richardliaw Nov 7, 2018
8078967
fixed init
richardliaw Nov 7, 2018
2db9f18
Add all config options and bring back stress tests.
Nov 7, 2018
cc8fca2
Merge branch 'config_updating' into tune_cluster
richardliaw Nov 7, 2018
59480dc
Update python/ray/worker.py
richardliaw Nov 7, 2018
6fa9d7c
Update python/ray/worker.py
richardliaw Nov 7, 2018
856547c
TEMP
richardliaw Nov 7, 2018
25e45cd
Fix internalization
richardliaw Nov 7, 2018
2e2b8b0
Merge branch 'config_updating' of github.com:richardliaw/ray into con…
richardliaw Nov 7, 2018
d3fa8f0
some last changes
richardliaw Nov 7, 2018
233f3ee
Merge branch 'config_updating' into tune_cluster
richardliaw Nov 7, 2018
c3c1c9c
skip for now
richardliaw Nov 7, 2018
3e96ec9
Linting and Java fix
Nov 7, 2018
4081c60
add docstring
richardliaw Nov 7, 2018
a916257
Merge branch 'config_updating' into global_state_multinode
richardliaw Nov 7, 2018
5646982
Merge branch 'config_updating' into tune_cluster
richardliaw Nov 8, 2018
38eda57
Merge branch 'master' into tune_cluster
richardliaw Nov 8, 2018
7d19c9f
Merge branch 'master' into global_state_multinode
richardliaw Nov 8, 2018
2f5861c
Fix test, add assertions
richardliaw Nov 8, 2018
90bc3fb
Merge branch 'master' into tune_cluster
richardliaw Nov 8, 2018
d2dccae
Merge branch 'global_state_multinode' into tune_cluster
richardliaw Nov 8, 2018
7f675f7
fix up tests
richardliaw Nov 8, 2018
af0fe9c
pytest ext
richardliaw Nov 8, 2018
8f942fc
Merge branch 'global_state_multinode' into tune_cluster
richardliaw Nov 8, 2018
6194744
Merge branch 'master' into tune_cluster
richardliaw Nov 11, 2018
d01d80c
code to make requeueing work
richardliaw Nov 12, 2018
1e32227
yapf
richardliaw Nov 12, 2018
9db9d16
lint
richardliaw Nov 12, 2018
2639a98
comments
richardliaw Nov 12, 2018
d45c74b
lint
richardliaw Nov 12, 2018
ee6a800
Update multi_node_test_2.py
richardliaw Nov 12, 2018
3f13bfa
lit
richardliaw Nov 12, 2018
5f0d75e
re-enable
richardliaw Nov 12, 2018
b1793bd
lint
richardliaw Nov 12, 2018
0763004
initial nuke test
richardliaw Nov 12, 2018
a1a05f0
Track last result
richardliaw Nov 12, 2018
5abf9d1
Merge branch 'global_state_multinode' into tune_cluster
richardliaw Nov 12, 2018
6ecc2bb
note
richardliaw Nov 12, 2018
eabb28b
Merge branch 'tune_cluster' into tune_cluster-2
richardliaw Nov 12, 2018
09995fa
add checkpointing to trial_runner
richardliaw Nov 12, 2018
24d6e12
trialrunners
richardliaw Nov 12, 2018
dbd1bbc
logging
richardliaw Nov 12, 2018
e2b8380
Redo checkpointing from trial runner
richardliaw Nov 12, 2018
1239c1a
Merge branch 'master' into tune_cluster
richardliaw Nov 14, 2018
7aab84f
fix up tests and checkpointing
richardliaw Nov 14, 2018
6c3d53e
Merge branch 'tune_cluster' into tune_cluster-2
richardliaw Nov 14, 2018
1e8a33d
import error
richardliaw Nov 14, 2018
026192b
Merge branch 'tune_cluster' into tune_cluster-2
richardliaw Nov 14, 2018
637e707
timeout?
richardliaw Nov 15, 2018
162b308
lint
richardliaw Nov 15, 2018
a65fc45
Merge branch 'master' into tune_cluster
richardliaw Nov 16, 2018
b0d5997
Merge branch 'tune_cluster' into tune_cluster-2
richardliaw Nov 17, 2018
e683224
Checkpoint and tests
richardliaw Nov 19, 2018
ef67acf
one full cluster failure
richardliaw Nov 19, 2018
0541f92
lint
richardliaw Nov 21, 2018
267150d
Merge branch 'tune_cluster' into tune_cluster-2
richardliaw Nov 21, 2018
b884fed
Add better test
richardliaw Nov 21, 2018
81ff30f
Merge branch 'master' into tune_cluster-2
richardliaw Nov 21, 2018
8617f2d
error test
richardliaw Nov 22, 2018
f7e31bd
some docs
richardliaw Nov 24, 2018
a2355f8
Tests and better recovery handling
richardliaw Nov 25, 2018
5513099
Add unit test for restoring (but currently failing
richardliaw Nov 25, 2018
782f194
pickle if needed when you set status
richardliaw Nov 25, 2018
d1d5a56
yapf
richardliaw Nov 25, 2018
be445e8
docs and small test for nosaving
richardliaw Nov 25, 2018
c13270b
doc
richardliaw Nov 25, 2018
8a6ed91
more docs
richardliaw Nov 25, 2018
b1e3bf0
test docs
richardliaw Nov 25, 2018
40248aa
py2mock
richardliaw Nov 26, 2018
22930c8
dirpath from tmpdir
richardliaw Nov 26, 2018
82ff45e
fix tsts?
richardliaw Nov 26, 2018
bde644c
yapf
richardliaw Nov 26, 2018
79197b8
Fix up tests
richardliaw Nov 26, 2018
66be742
nits
richardliaw Nov 26, 2018
25be843
nit
richardliaw Nov 26, 2018
ff7b114
test fixup
richardliaw Nov 26, 2018
defe524
yapf
richardliaw Nov 26, 2018
02a9cf8
no skip
richardliaw Nov 26, 2018
f80e318
cluster tests
richardliaw Nov 26, 2018
07df20b
nit
richardliaw Nov 26, 2018
1ff31cb
Fix counting resources test
richardliaw Nov 27, 2018
6998a01
better test and error msg
richardliaw Nov 27, 2018
8ea0f70
Merge branch 'master' into tune_cluster-2
richardliaw Nov 27, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
Expand All @@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.27.3 cmake tensorflow gym opencv-python pyyaml pandas==0.22 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
Expand Down
1 change: 1 addition & 0 deletions python/ray/test/cluster_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(self,
if connect:
redis_password = head_node_args.get("redis_password")
output_info = ray.init(
ignore_reinit_error=True,
redis_address=self.redis_address,
redis_password=redis_password)
logger.info(output_info)
Expand Down
31 changes: 21 additions & 10 deletions python/ray/tune/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,19 @@ def _init(self):
self._log_syncer = get_syncer(self.logdir, self.uri)

def on_result(self, result):
for logger in self._loggers:
logger.on_result(result)
for _logger in self._loggers:
_logger.on_result(result)
self._log_syncer.set_worker_ip(result.get(NODE_IP))
self._log_syncer.sync_if_needed()

def close(self):
for logger in self._loggers:
logger.close()
for _logger in self._loggers:
_logger.close()
self._log_syncer.sync_now(force=True)

def flush(self):
for logger in self._loggers:
logger.flush()
for _logger in self._loggers:
_logger.flush()
self._log_syncer.sync_now(force=True)
self._log_syncer.wait()

Expand All @@ -99,7 +99,7 @@ def _init(self):
with open(config_out, "w") as f:
json.dump(self.config, f, sort_keys=True, cls=_SafeFallbackEncoder)
local_file = os.path.join(self.logdir, "result.json")
self.local_out = open(local_file, "w")
self.local_out = open(local_file, "a")

def on_result(self, result):
json.dump(result, self, cls=_SafeFallbackEncoder)
Expand All @@ -109,6 +109,9 @@ def write(self, b):
self.local_out.write(b)
self.local_out.flush()

def flush(self):
self.local_out.flush()

def close(self):
self.local_out.close()

Expand All @@ -128,14 +131,16 @@ def to_tf_values(result, path):

class _TFLogger(Logger):
def _init(self):
# TODO(rliaw): Implement a proper resume functionality for this.
self._file_writer = tf.summary.FileWriter(self.logdir)

def on_result(self, result):
tmp = result.copy()
for k in [
"config", "pid", "timestamp", TIME_TOTAL_S, TRAINING_ITERATION
]:
del tmp[k] # not useful to tf log these
if k in tmp:
del tmp[k] # not useful to tf log these
values = to_tf_values(tmp, ["ray", "tune"])
train_stats = tf.Summary(value=values)
t = result.get(TIMESTEPS_TOTAL) or result[TRAINING_ITERATION]
Expand All @@ -158,15 +163,21 @@ class _VisKitLogger(Logger):
def _init(self):
"""CSV outputted with Headers as first set of results."""
# Note that we assume params.json was already created by JsonLogger
self._file = open(os.path.join(self.logdir, "progress.csv"), "w")
progress_file = os.path.join(self.logdir, "progress.csv")
self._continuing = os.path.exists(progress_file)
self._file = open(progress_file, "a")
self._csv_out = None

def on_result(self, result):
if self._csv_out is None:
self._csv_out = csv.DictWriter(self._file, result.keys())
self._csv_out.writeheader()
if not self._continuing:
self._csv_out.writeheader()
self._csv_out.writerow(result.copy())

def flush(self):
self._file.flush()

def close(self):
self._file.close()

Expand Down
41 changes: 28 additions & 13 deletions python/ray/tune/ray_trial_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
class RayTrialExecutor(TrialExecutor):
"""An implemention of TrialExecutor based on Ray."""

def __init__(self, queue_trials=False):
super(RayTrialExecutor, self).__init__(queue_trials)
def __init__(self, queue_trials=False, track_checkpoints=False):
super(RayTrialExecutor, self).__init__(queue_trials, track_checkpoints)
self._running = {}
# Since trial resume after paused should not run
# trial.train.remote(), thus no more new remote object id generated.
Expand Down Expand Up @@ -60,7 +60,7 @@ def _train(self, trial):

def _start_trial(self, trial, checkpoint=None):
prior_status = trial.status
trial.status = Trial.RUNNING
self.set_status(trial, Trial.RUNNING)
trial.runner = self._setup_runner(trial)
if not self.restore(trial, checkpoint):
return
Expand Down Expand Up @@ -88,9 +88,9 @@ def _stop_trial(self, trial, error=False, error_msg=None,
"""

if error:
trial.status = Trial.ERROR
self.set_status(trial, Trial.ERROR)
else:
trial.status = Trial.TERMINATED
self.set_status(trial, Trial.TERMINATED)

try:
trial.write_error_log(error_msg)
Expand All @@ -103,32 +103,46 @@ def _stop_trial(self, trial, error=False, error_msg=None,
stop_tasks, num_returns=2, timeout=250)
except Exception:
logger.exception("Error stopping runner.")
trial.status = Trial.ERROR
self.set_status(trial, Trial.ERROR)
finally:
trial.runner = None

if stop_logger:
trial.close_logger()

def start_trial(self, trial, checkpoint_obj=None):
"""Starts the trial."""
def start_trial(self, trial, checkpoint=None, raise_on_failure=False):
"""Starts the trial.

Will not return resources if trial repeatedly fails on start.

Args:
trial (Trial): Trial to be started.
checkpoint (Checkpoint): A Python object or path storing the state
of trial.
raise_on_failure (bool): To raise exception on failure in starting.

Raises:
Exception after 1 retries if `raise_on_failure` is True.
"""

self._commit_resources(trial.resources)
try:
self._start_trial(trial, checkpoint_obj)
self._start_trial(trial, checkpoint)
except Exception:
logger.exception("Error stopping runner - retrying...")
error_msg = traceback.format_exc()
time.sleep(2)
self._stop_trial(trial, error=True, error_msg=error_msg)
try:
self._start_trial(trial)
except Exception:
self._start_trial(trial, checkpoint)
except Exception as exc:
logger.exception("Error starting runner, aborting!")
error_msg = traceback.format_exc()
self._stop_trial(trial, error=True, error_msg=error_msg)
# note that we don't return the resources, since they may
# have been lost
if raise_on_failure:
raise exc

def _find_item(self, dictionary, item):
out = [rid for rid, t in dictionary.items() if t is item]
Expand All @@ -140,6 +154,7 @@ def stop_trial(self, trial, error=False, error_msg=None, stop_logger=True):
self._stop_trial(
trial, error=error, error_msg=error_msg, stop_logger=stop_logger)
if prior_status == Trial.RUNNING:
logger.debug("Returning resources for this trial.")
self._return_resources(trial.resources)
out = self._find_item(self._running, trial)
for result_id in out:
Expand Down Expand Up @@ -293,7 +308,7 @@ def restore(self, trial, checkpoint=None):
return True
if trial.runner is None:
logger.error("Unable to restore - no runner.")
trial.status = Trial.ERROR
self.set_status(trial, Trial.ERROR)
return False
try:
value = checkpoint.value
Expand All @@ -307,5 +322,5 @@ def restore(self, trial, checkpoint=None):
return True
except Exception:
logger.exception("Error restoring runner.")
trial.status = Trial.ERROR
self.set_status(trial, Trial.ERROR)
return False
Loading