diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py
index 576b0aeccfd5..2f99fc7861ba 100644
--- a/python/ray/_private/ray_constants.py
+++ b/python/ray/_private/ray_constants.py
@@ -163,6 +163,9 @@ def env_set_by_user(key):
 # instantiate a Job SubmissionClient.
 RAY_JOB_HEADERS = "RAY_JOB_HEADERS"
 
+# Seconds to wait for the dashboard to come alive during node startup.
+RAY_DASHBOARD_STARTUP_TIMEOUT_S = env_integer("RAY_DASHBOARD_STARTUP_TIMEOUT_S", 60)
+
 DEFAULT_DASHBOARD_IP = "127.0.0.1"
 DEFAULT_DASHBOARD_PORT = 8265
 DASHBOARD_ADDRESS = "dashboard"
diff --git a/python/ray/_private/services.py b/python/ray/_private/services.py
index cb18634a68dc..d00a7bd30cad 100644
--- a/python/ray/_private/services.py
+++ b/python/ray/_private/services.py
@@ -1313,7 +1313,10 @@ def start_api_server(
         ray.experimental.internal_kv._initialize_internal_kv(gcs_client)
         dashboard_url = None
         dashboard_returncode = None
-        for _ in range(200):
+        # Use a monotonic clock so wall-clock adjustments (e.g. NTP steps)
+        # cannot shorten or extend the configured startup timeout.
+        deadline_s = time.monotonic() + ray_constants.RAY_DASHBOARD_STARTUP_TIMEOUT_S
+        while time.monotonic() < deadline_s:
             dashboard_url = ray.experimental.internal_kv._internal_kv_get(
                 ray_constants.DASHBOARD_ADDRESS,
                 namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
@@ -1324,6 +1327,7 @@ def start_api_server(
             dashboard_returncode = process_info.process.poll()
             if dashboard_returncode is not None:
                 break
+
             # This is often on the critical path of ray.init() and ray start,
             # so we need to poll often.
             time.sleep(0.1)