12 changes: 10 additions & 2 deletions python/ray/autoscaler/autoscaler.py
@@ -53,6 +53,10 @@
     # The number of workers to launch initially, in addition to the head node.
     "initial_workers": (int, OPTIONAL),

+    # Whether or not to scale aggressively, e.g. to jump back to at least
+    # initial_workers if we're ever below it and are scaling up.
+    "aggressive_autoscaling": (bool, OPTIONAL),
+
     # The autoscaler will scale up the cluster to this target fraction of
     # resource usage. For example, if a cluster of 8 nodes is 100% busy
     # and target_utilization was 0.8, it would resize the cluster to 10.
@@ -512,9 +516,13 @@ def target_num_workers(self):
         ideal_num_nodes = int(np.ceil(cur_used / float(target_frac)))
         ideal_num_workers = ideal_num_nodes - 1  # subtract 1 for head node

+        initial_workers = self.config["initial_workers"]
+        aggressive = self.config["aggressive_autoscaling"]
         if self.bringup:
-            ideal_num_workers = max(ideal_num_workers,
-                                    self.config["initial_workers"])
+            ideal_num_workers = max(ideal_num_workers, initial_workers)
+        elif aggressive and ideal_num_workers >= 0:
+            # If we want any workers, we want at least initial_workers
+            ideal_num_workers = max(ideal_num_workers, initial_workers)

         return min(self.config["max_workers"],
                    max(self.config["min_workers"], ideal_num_workers))
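As a sanity check on the new branch, here is a minimal standalone sketch of the clamping behavior this hunk produces (the free function and the config values are illustrative, not part of the diff):

    def target_num_workers(ideal_num_workers, config, bringup, aggressive):
        # Mirrors the hunk above: during bringup, or in aggressive mode
        # whenever any workers are wanted, jump to at least initial_workers.
        initial_workers = config["initial_workers"]
        if bringup or (aggressive and ideal_num_workers >= 0):
            ideal_num_workers = max(ideal_num_workers, initial_workers)
        return min(config["max_workers"],
                   max(config["min_workers"], ideal_num_workers))

    cfg = {"min_workers": 0, "max_workers": 20, "initial_workers": 10}
    # Demand for 3 workers outside bringup: aggressive mode jumps to 10.
    print(target_num_workers(3, cfg, bringup=False, aggressive=True))   # 10
    print(target_num_workers(3, cfg, bringup=False, aggressive=False))  # 3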
5 changes: 5 additions & 0 deletions python/ray/autoscaler/aws/example-full.yaml
@@ -14,6 +14,11 @@ max_workers: 2
 # subsequent `ray up`) this number of nodes will be started.
 initial_workers: 0

+# Whether or not to autoscale aggressively. If this is enabled, whenever the
+# autoscaler would start any workers, it starts at least enough of them to
+# bring the cluster up to initial_workers.
+aggressive_autoscaling: false
+
 # This executes all commands on all nodes in the docker container,
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
5 changes: 5 additions & 0 deletions python/ray/autoscaler/aws/example-gpu-docker.yaml
@@ -14,6 +14,11 @@ max_workers: 2
 # subsequent `ray up`) this number of nodes will be started.
 initial_workers: 0

+# Whether or not to autoscale aggressively. If this is enabled, whenever the
+# autoscaler would start any workers, it starts at least enough of them to
+# bring the cluster up to initial_workers.
+aggressive_autoscaling: false
+
 # This executes all commands on all nodes in the docker container,
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
5 changes: 5 additions & 0 deletions python/ray/autoscaler/gcp/example-full.yaml
@@ -14,6 +14,11 @@ max_workers: 2
 # subsequent `ray up`) this number of nodes will be started.
 initial_workers: 0

+# Whether or not to autoscale aggressively. If this is enabled, whenever the
+# autoscaler would start any workers, it starts at least enough of them to
+# bring the cluster up to initial_workers.
+aggressive_autoscaling: false
+
 # This executes all commands on all nodes in the docker container,
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
5 changes: 5 additions & 0 deletions python/ray/autoscaler/gcp/example-gpu-docker.yaml
@@ -14,6 +14,11 @@ max_workers: 2
 # subsequent `ray up`) this number of nodes will be started.
 initial_workers: 0

+# Whether or not to autoscale aggressively. If this is enabled, whenever the
+# autoscaler would start any workers, it starts at least enough of them to
+# bring the cluster up to initial_workers.
+aggressive_autoscaling: false
+
 # This executes all commands on all nodes in the docker container,
 # and opens all the necessary ports to support the Ray cluster.
 # Empty string means disabled.
1 change: 1 addition & 0 deletions python/ray/autoscaler/local/example-full.yaml
@@ -2,6 +2,7 @@ cluster_name: default
 min_workers: 0
 max_workers: 0
 initial_workers: 0
+aggressive_autoscaling: false
 docker:
     image: ""
     container_name: ""
3 changes: 2 additions & 1 deletion python/ray/dashboard/dashboard.py
@@ -141,6 +141,7 @@ async def ray_config(_) -> aiohttp.web.Response:
             "min_workers": cfg["min_workers"],
             "max_workers": cfg["max_workers"],
             "initial_workers": cfg["initial_workers"],
+            "aggressive_autoscaling": cfg["aggressive_autoscaling"],
             "idle_timeout_minutes": cfg["idle_timeout_minutes"],
         }

@@ -181,7 +182,7 @@ def log_dashboard_url(self):
     def run(self):
         self.log_dashboard_url()
         self.node_stats.start()
-        aiohttp.web.run_app(self.app, host=self.ip, port=self.port)
+        aiohttp.web.run_app(self.app, host="0.0.0.0", port=self.port)


 class NodeStats(threading.Thread):
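For context on the host change: binding to "0.0.0.0" makes the dashboard listen on every interface of the head node rather than only on the single address Ray recorded, so localhost port-forwards and external clients can reach it. A minimal aiohttp sketch of the new binding (the bare app and the port value are illustrative, not the dashboard's actual setup):

    import aiohttp.web

    app = aiohttp.web.Application()
    # host="0.0.0.0" answers on every interface; a specific address such
    # as host="10.0.0.5" would answer only on that one interface.
    aiohttp.web.run_app(app, host="0.0.0.0", port=8265)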
6 changes: 2 additions & 4 deletions python/ray/scripts/scripts.py
@@ -151,7 +151,7 @@ def cli(logging_level, logging_format):
     "--include-webui",
     is_flag=True,
     default=False,
-    help="provide this argument if the UI should not be started")
+    help="provide this argument if the UI should be started")
 @click.option(
     "--block",
     is_flag=True,
@@ -256,6 +256,7 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
         raylet_socket_name=raylet_socket_name,
         temp_dir=temp_dir,
         include_java=include_java,
+        include_webui=include_webui,
         java_worker_options=java_worker_options,
         load_code_from_local=load_code_from_local,
         _internal_config=internal_config)
@@ -334,9 +335,6 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards,
         if redis_max_clients is not None:
             raise Exception("If --head is not passed in, --redis-max-clients "
                             "must not be provided.")
-        if include_webui:
-            raise Exception("If --head is not passed in, the --include-webui "
-                            "flag is not relevant.")
         if include_java is not None:
             raise ValueError("--include-java should only be set for the head "
                              "node.")
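The old help string said the opposite of what the flag does: with is_flag=True and default=False, passing --include-webui turns the UI on. A small self-contained click sketch of the corrected semantics (a toy command, not the actual ray CLI):

    import click

    @click.command()
    @click.option(
        "--include-webui",
        is_flag=True,
        default=False,
        help="provide this argument if the UI should be started")
    def start(include_webui):
        # Passing --include-webui flips the value from False to True.
        click.echo("webui enabled: {}".format(include_webui))

    if __name__ == "__main__":
        start()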
6 changes: 5 additions & 1 deletion python/ray/tune/examples/skopt_example.py
@@ -48,9 +48,13 @@ def easy_objective(config, reporter):
         }
     }
     optimizer = Optimizer([(0, 20), (-100, 100)])
+    previously_run_params = [[10, 0], [15, -20]]
+    known_rewards = [-189, -1144]
     algo = SkOptSearch(
         optimizer, ["width", "height"],
         max_concurrent=4,
-        reward_attr="neg_mean_loss")
+        reward_attr="neg_mean_loss",
+        points_to_evaluate=previously_run_params,
+        evaluated_rewards=known_rewards)
     scheduler = AsyncHyperBandScheduler(reward_attr="neg_mean_loss")
     run_experiments(config, search_alg=algo, scheduler=scheduler)
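In plain scikit-optimize terms, the two new arguments amount to handing the known (point, reward) pairs to the optimizer before the search starts, so its first ask() is already informed by that evidence. A sketch independent of Ray, reusing the example's values:

    from skopt import Optimizer

    opt = Optimizer([(0, 20), (-100, 100)])
    # Seed the optimizer with the previously observed results, as
    # SkOptSearch.__init__ does via optimizer.tell().
    opt.tell([[10, 0], [15, -20]], [-189, -1144])
    print(opt.ask())  # next suggested [width, height]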
51 changes: 49 additions & 2 deletions python/ray/tune/suggest/skopt.py
@@ -1,6 +1,8 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+import numbers
+from collections import Iterable

 try:
     import skopt
@@ -24,10 +26,23 @@ class SkOptSearch(SuggestionAlgorithm):
             to 10.
         reward_attr (str): The training result objective value attribute.
             This refers to an increasing value.
+        points_to_evaluate (list of lists): A list of trials you'd like to run
+            first, before sampling from the optimizer, e.g. parameter
+            configurations you already know work well, to help the optimizer
+            select good values. Each trial is a list of the parameters of
+            that trial, in the order given to the optimizer (see the
+            example below).
+        evaluated_rewards (list): If you have previously evaluated the
+            parameters passed in as points_to_evaluate, you can avoid
+            re-running those trials by passing in the reward attributes
+            as a list, so the optimizer can be told the results without
+            needing to re-compute the trial. Must be the same length as
+            points_to_evaluate. (See skopt_example.py.)

     Example:
         >>> from skopt import Optimizer
         >>> optimizer = Optimizer([(0,20),(-100,100)])
+        >>> current_best_params = [[10, 0], [15, -20]]
         >>> config = {
         >>>     "my_exp": {
         >>>         "run": "exp",
@@ -39,19 +54,47 @@ class SkOptSearch(SuggestionAlgorithm):
         >>> }
         >>> algo = SkOptSearch(optimizer,
         >>>     ["width", "height"], max_concurrent=4,
-        >>>     reward_attr="neg_mean_loss")
+        >>>     reward_attr="neg_mean_loss", points_to_evaluate=current_best_params)
     """

     def __init__(self,
                  optimizer,
                  parameter_names,
                  max_concurrent=10,
                  reward_attr="episode_reward_mean",
+                 points_to_evaluate=None,
+                 evaluated_rewards=None,
                  **kwargs):
         assert skopt is not None, """skopt must be installed!
             You can install Skopt with the command:
             `pip install scikit-optimize`."""
         assert type(max_concurrent) is int and max_concurrent > 0
+        if points_to_evaluate is None:
+            points_to_evaluate = []
+        elif not isinstance(points_to_evaluate[0], (list, tuple)):
+            points_to_evaluate = [points_to_evaluate]
+        if not isinstance(points_to_evaluate, list):
+            raise ValueError(
+                "`points_to_evaluate` should be a list, but got %s" %
+                type(points_to_evaluate))
+        if isinstance(evaluated_rewards, Iterable):
+            evaluated_rewards = list(evaluated_rewards)
+        elif isinstance(evaluated_rewards, numbers.Number):
+            evaluated_rewards = [evaluated_rewards]
+        self._initial_points = []
+        if points_to_evaluate and evaluated_rewards:
+            if not (isinstance(evaluated_rewards, Iterable)
+                    or isinstance(evaluated_rewards, numbers.Number)):
+                raise ValueError(
+                    "`evaluated_rewards` should be an iterable or a scalar, got %s"
+                    % type(evaluated_rewards))
+            if len(points_to_evaluate) != len(evaluated_rewards):
+                raise ValueError(
+                    "`points_to_evaluate` and `evaluated_rewards` should have the same length"
+                )
+            optimizer.tell(points_to_evaluate, evaluated_rewards)
+        elif points_to_evaluate:
+            self._initial_points = points_to_evaluate
         self._max_concurrent = max_concurrent
         self._parameters = parameter_names
         self._reward_attr = reward_attr
@@ -62,7 +105,11 @@ def __init__(self,
     def _suggest(self, trial_id):
         if self._num_live_trials() >= self._max_concurrent:
             return None
-        suggested_config = self._skopt_opt.ask()
+        if self._initial_points:
+            suggested_config = self._initial_points[0]
+            del self._initial_points[0]
+        else:
+            suggested_config = self._skopt_opt.ask()
         self._live_trial_mapping[trial_id] = suggested_config
         return dict(zip(self._parameters, suggested_config))
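The net effect of the _suggest change, as a toy sketch: unevaluated seed points are served first, in order, and only afterwards does the searcher defer to the optimizer (the names below are illustrative):

    parameters = ["width", "height"]
    initial_points = [[10, 0], [15, -20]]

    def next_config(opt):
        # Seed points are consumed first, in order; then opt.ask() takes over.
        point = initial_points.pop(0) if initial_points else opt.ask()
        return dict(zip(parameters, point))

So the first two suggested trials run exactly the configurations passed as points_to_evaluate, e.g. {"width": 10, "height": 0}, before the optimizer proposes anything of its own.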
