diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py
index 2e2f9593c892..b2f9a2adbb08 100644
--- a/python/ray/autoscaler/autoscaler.py
+++ b/python/ray/autoscaler/autoscaler.py
@@ -142,6 +142,7 @@ def prune_active_ips(self, active_ips):
         def prune(mapping):
             unwanted = set(mapping) - active_ips
             for unwanted_key in unwanted:
+                print("Removed mapping", unwanted_key, mapping[unwanted_key])
                 del mapping[unwanted_key]
             if unwanted:
                 print("Removed {} stale ip mappings: {} not in {}".format(
@@ -454,9 +455,8 @@ def launch_new_node(self, count):
            TAG_RAY_NODE_STATUS: "Uninitialized",
            TAG_RAY_LAUNCH_CONFIG: self.launch_hash,
        }, count)
-        # TODO(ekl) be less conservative in this check
-        assert len(self.workers()) > num_before, \
-            "Num nodes failed to increase after creating a new node"
+        if len(self.workers()) <= num_before:
+            print("Warning: Num nodes failed to increase after node creation")
 
     def workers(self):
         return self.provider.nodes(tag_filters={
diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py
index 4f337ad41e60..7e5df96500e5 100644
--- a/python/ray/ray_constants.py
+++ b/python/ray/ray_constants.py
@@ -3,19 +3,30 @@
 from __future__ import print_function
 
 """Ray constants used in the Python code."""
+import os
+
+
+def env_integer(key, default):
+    if key in os.environ:
+        return int(os.environ[key])
+    return default
+
+
 # Abort autoscaling if more than this number of errors are encountered. This
 # is a safety feature to prevent e.g. runaway node launches.
-AUTOSCALER_MAX_NUM_FAILURES = 5
+AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5)
 
 # Max number of nodes to launch at a time.
-AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10
+AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer(
+    "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10)
 
 # Interval at which to perform autoscaling updates.
-AUTOSCALER_UPDATE_INTERVAL_S = 5
+AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5)
 
 # The autoscaler will attempt to restart Ray on nodes it hasn't heard from
 # in more than this interval.
-AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30
+AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S",
+                                             30)
 
 # Max number of retries to AWS (default is 5, time increases exponentially)
-BOTO_MAX_RETRIES = 12
+BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12)
diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py
index c3b84ca968f1..c56bd3ce55aa 100644
--- a/test/autoscaler_test.py
+++ b/test/autoscaler_test.py
@@ -365,14 +365,6 @@ def testMaxFailures(self):
         autoscaler.update()
         self.assertRaises(Exception, autoscaler.update)
 
-    def testAbortOnCreationFailures(self):
-        config_path = self.write_config(SMALL_CLUSTER)
-        self.provider = MockProvider()
-        self.provider.fail_creates = True
-        autoscaler = StandardAutoscaler(
-            config_path, LoadMetrics(), max_failures=0, update_interval_s=0)
-        self.assertRaises(AssertionError, autoscaler.update)
-
     def testLaunchNewNodeOnOutOfBandTerminate(self):
         config_path = self.write_config(SMALL_CLUSTER)
         self.provider = MockProvider()
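
Usage sketch (not part of the patch itself): with the env_integer hook in ray_constants.py, each constant can be overridden from the environment before Ray is imported. The override value 20 and setting the variable in-process (rather than exporting it in the shell) are illustrative assumptions only.

import os

# Example override: must be set before ray.ray_constants is imported, since
# env_integer() reads the environment once at module load time.
os.environ["AUTOSCALER_MAX_NUM_FAILURES"] = "20"

import ray.ray_constants as ray_constants

# Resolves to 20 here instead of the built-in default of 5.
print(ray_constants.AUTOSCALER_MAX_NUM_FAILURES)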