6 changes: 3 additions & 3 deletions python/ray/autoscaler/autoscaler.py
@@ -142,6 +142,7 @@ def prune_active_ips(self, active_ips):
         def prune(mapping):
             unwanted = set(mapping) - active_ips
             for unwanted_key in unwanted:
+                print("Removed mapping", unwanted_key, mapping[unwanted_key])
                 del mapping[unwanted_key]
             if unwanted:
                 print("Removed {} stale ip mappings: {} not in {}".format(
@@ -454,9 +455,8 @@ def launch_new_node(self, count):
             TAG_RAY_NODE_STATUS: "Uninitialized",
             TAG_RAY_LAUNCH_CONFIG: self.launch_hash,
         }, count)
-        # TODO(ekl) be less conservative in this check
-        assert len(self.workers()) > num_before, \
-            "Num nodes failed to increase after creating a new node"
+        if len(self.workers()) <= num_before:
+            print("Warning: Num nodes failed to increase after node creation")

     def workers(self):
         return self.provider.nodes(tag_filters={
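For context, here is a minimal self-contained sketch (not code from this PR) of the relaxed post-launch check above: the hard assert is replaced by a warning, so a create_node call that does not immediately add workers no longer aborts the update. The names launch_and_check, provider_nodes, and create_node are hypothetical stand-ins for illustration only.

def launch_and_check(provider_nodes, create_node, count):
    """Hypothetical helper mirroring the change in launch_new_node above.

    provider_nodes: callable returning the current list of worker nodes.
    create_node: callable that asks the cloud provider for `count` new nodes.
    """
    num_before = len(provider_nodes())
    create_node(count)
    if len(provider_nodes()) <= num_before:
        # Previously this was a hard assert that stopped the autoscaler;
        # with this PR the failure only produces a warning.
        print("Warning: Num nodes failed to increase after node creation")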
20 changes: 15 additions & 5 deletions python/ray/ray_constants.py
@@ -3,19 +3,29 @@
 from __future__ import print_function
 """Ray constants used in the Python code."""
 
+import os
+
+def env_integer(key, default):
+    if key in os.environ:
+        return int(os.environ[key])
+    return default
+
+
 # Abort autoscaling if more than this number of errors are encountered. This
 # is a safety feature to prevent e.g. runaway node launches.
-AUTOSCALER_MAX_NUM_FAILURES = 5
+AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
 
 # Max number of nodes to launch at a time.
-AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10
+AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
+    "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10)
 
 # Interval at which to perform autoscaling updates.
-AUTOSCALER_UPDATE_INTERVAL_S = 5
+AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
 
 # The autoscaler will attempt to restart Ray on nodes it hasn't heard from
 # in more than this interval.
-AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30
+AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer(
+    "AUTOSCALER_HEARTBEAT_TIMEOUT_S", 30)
 
 # Max number of retries to AWS (default is 5, time increases exponentially)
-BOTO_MAX_RETRIES = 12
+BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
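A quick usage sketch (an assumption about intended use, not part of the PR) of what the env_integer hook above enables: each constant can now be overridden through an environment variable of the same name, provided the variable is set before ray.ray_constants is first imported, since env_integer reads os.environ at import time. The override values below are arbitrary examples.

import os

# Hypothetical example overrides; the variable names match the constant
# names defined in python/ray/ray_constants.py above.
os.environ["AUTOSCALER_UPDATE_INTERVAL_S"] = "30"  # default is 5
os.environ["BOTO_MAX_RETRIES"] = "3"               # default is 12

import ray.ray_constants as ray_constants

print(ray_constants.AUTOSCALER_UPDATE_INTERVAL_S)  # -> 30
print(ray_constants.BOTO_MAX_RETRIES)              # -> 3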
16 changes: 9 additions & 7 deletions test/autoscaler_test.py
@@ -365,13 +365,15 @@ def testMaxFailures(self):
         autoscaler.update()
         self.assertRaises(Exception, autoscaler.update)
 
-    def testAbortOnCreationFailures(self):
-        config_path = self.write_config(SMALL_CLUSTER)
-        self.provider = MockProvider()
-        self.provider.fail_creates = True
-        autoscaler = StandardAutoscaler(
-            config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
-        self.assertRaises(AssertionError, autoscaler.update)
+    # TODO(ekl) consider adding this check back if it proves useful. Currently it
+    # poses a race condition with downscaling.
+    # def testAbortOnCreationFailures(self):
+    #     config_path = self.write_config(SMALL_CLUSTER)
+    #     self.provider = MockProvider()
+    #     self.provider.fail_creates = True
+    #     autoscaler = StandardAutoscaler(
+    #         config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
+    #     self.assertRaises(AssertionError, autoscaler.update)
 
     def testLaunchNewNodeOnOutOfBandTerminate(self):
         config_path = self.write_config(SMALL_CLUSTER)