Closed
Changes from all commits
43 commits
a16c807
Added basic functionality and tests
Aug 26, 2020
6750060
Feature parity with old tune search space config
Aug 27, 2020
47c8f23
Merge branch 'master' into tune-search-space
Aug 27, 2020
aaaab73
Convert Optuna search spaces
Aug 27, 2020
546e9ba
Introduced quantized values
Aug 27, 2020
4d75d91
Merge branch 'master' of https://github.com/ray-project/ray into tune…
Aug 28, 2020
18535a6
Updated Optuna resolving
Aug 28, 2020
3ab491c
Added HyperOpt search space conversion
Aug 28, 2020
cba10d8
Convert search spaces to AxSearch
Aug 28, 2020
f83ed33
Convert search spaces to BayesOpt
Aug 28, 2020
94a069e
Fix docs - atexit is not called when you ray.kill() an actor (#10367)
edoakes Aug 28, 2020
9c25ca6
[hotfix] Fix test_cli.py (#10403)
edoakes Aug 28, 2020
519354a
[api] Initial API deprecations for Ray 1.0 (#10325)
ericl Aug 28, 2020
2afb54c
Validate non-integral args to ray.remote (#10221)
kishansagathiya Aug 28, 2020
b1f3c9e
[Autoscaler] Fix resource passing bug fix (#10397)
wuisawesome Aug 28, 2020
2a20426
[api] Second round of 1.0 API changes: exceptions, num_return_vals (#…
ericl Aug 29, 2020
f6a1698
[autoscaler] Add documentation for multi node type autoscaling (#10405)
ericl Aug 29, 2020
c14b44a
[api] Remove legacy memory management docs (#10406)
ericl Aug 29, 2020
d6f2b0d
[docker] Run profiling without sudo (#10388)
ijrsvt Aug 29, 2020
bd92cef
[Autoscaler] Move Resource Demand Scheduler Test to Small (#10399)
wuisawesome Aug 29, 2020
910d5d2
[hotfix] Bad merge with num_return_vals (#10418)
ericl Aug 29, 2020
9a31166
Option to disable profiling and task timeline (#10414)
stephanie-wang Aug 29, 2020
cb438be
[core] Move log_to_driver back to public (#10422)
richardliaw Aug 29, 2020
e9b0463
[Dashboard] Dashboard basic modules (#10303)
fyrestone Aug 30, 2020
f0c3910
[Serialization] Update CloudPickle to 1.6.0 (#9694)
suquark Aug 30, 2020
8c75381
update-scripts (#10425)
richardliaw Aug 30, 2020
63ad2e3
[Dashboard] Fix Issue #10319 - Dashboard autoscaler crash (#10323)
mfitton Aug 30, 2020
c8b14fd
[Tests] Enable large test (#10391)
rkooo567 Aug 30, 2020
3e5cac8
[Tests] Fix Broken GCS restart test. (#10417)
rkooo567 Aug 30, 2020
05fe6dc
Keeping pipelines full (#10225)
Aug 31, 2020
afde3db
[Tune] Synchronous Mode for PBT (#10283)
amogkam Aug 31, 2020
c93fb76
Added basic functionality and tests
Aug 26, 2020
2ffbdca
Feature parity with old tune search space config
Aug 27, 2020
13a153c
Convert Optuna search spaces
Aug 27, 2020
6abffaa
Introduced quantized values
Aug 27, 2020
2cbd77b
Updated Optuna resolving
Aug 28, 2020
c68b9c4
Added HyperOpt search space conversion
Aug 28, 2020
1167c75
Convert search spaces to AxSearch
Aug 28, 2020
a4f2d2d
Convert search spaces to BayesOpt
Aug 28, 2020
bd7ed77
Re-factored samplers into domain classes
Aug 31, 2020
e1ba45c
Re-added base classes
Aug 31, 2020
9d32499
Re-factored into list comprehensions
Aug 31, 2020
24ff642
Merge remote-tracking branch 'origin/tune-search-space' into tune-sea…
Aug 31, 2020
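
The commits above describe a native Tune search space API (quantized values, sampler domains re-factored into domain classes) plus converters to Optuna, HyperOpt, Ax, and BayesOpt. As a rough, minimal sketch of what such a config might look like (the helper names below, tune.loguniform, tune.quniform, and tune.choice, and the conversion behavior are assumptions inferred from the commit messages, not taken from this diff):

# Hypothetical sketch only; names are inferred from the commit messages above.
from ray import tune

config = {
    "lr": tune.loguniform(1e-4, 1e-1),           # float sampled on a log scale
    "momentum": tune.quniform(0.8, 0.99, 0.01),  # quantized float (step 0.01)
    "batch_size": tune.choice([32, 64, 128]),    # categorical choice
}

# A searcher (e.g. an Optuna-backed one) would resolve each domain object into
# its own primitives, roughly like trial.suggest_float("lr", 1e-4, 1e-1, log=True).
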
5 changes: 0 additions & 5 deletions BUILD.bazel
@@ -1783,11 +1783,6 @@ filegroup(
"python/ray/experimental/*.py",
"python/ray/util/*.py",
"python/ray/internal/*.py",
"python/ray/projects/*.py",
"python/ray/projects/schema.json",
"python/ray/projects/templates/cluster_template.yaml",
"python/ray/projects/templates/project_template.yaml",
"python/ray/projects/templates/requirements.txt",
"python/ray/workers/default_worker.py",
]),
)
30 changes: 20 additions & 10 deletions bazel/python.bzl
@@ -1,12 +1,22 @@
# py_test_module_list creates a py_test target for each
# py_test_module_list creates a py_test target for each
# Python file in `files`
def py_test_module_list(files, size, deps, extra_srcs, **kwargs):
for file in files:
# remove .py
name = file[:-3]
native.py_test(
name=name,
size=size,
srcs=extra_srcs+[file],
**kwargs
)
for file in files:
# remove .py
name = file[:-3]
native.py_test(
name = name,
size = size,
srcs = extra_srcs + [file],
**kwargs
)

def py_test_run_all_subdirectory(include, exclude, extra_srcs, **kwargs):
for file in native.glob(include = include, exclude = exclude):
print(file)
basename = file.rpartition("/")[-1]
native.py_test(
name = basename[:-3],
srcs = extra_srcs + [file],
**kwargs
)
@@ -93,7 +93,7 @@ def optimizer_creator(model, config):
momentum=config.get("momentum", 0.9))


ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
ray.init(address="auto" if not args.smoke_test else None, _log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

executor = FailureInjectorExecutor(queue_trials=True)
3 changes: 2 additions & 1 deletion ci/long_running_tests/workloads/serve.py
@@ -38,7 +38,8 @@
@serve.accept_batch
def echo(_):
time.sleep(0.01) # Sleep for 10ms
ray.show_in_webui(str(serve.context.batch_size), key="Current batch size")
ray.show_in_dashboard(
str(serve.context.batch_size), key="Current batch size")
return ["hi {}".format(i) for i in range(serve.context.batch_size)]


2 changes: 1 addition & 1 deletion ci/long_running_tests/workloads/serve_failure.py
@@ -26,7 +26,7 @@
dashboard_host="0.0.0.0")

ray.init(
address=cluster.address, dashboard_host="0.0.0.0", log_to_driver=False)
address=cluster.address, dashboard_host="0.0.0.0", _log_to_driver=False)
serve.init()


13 changes: 6 additions & 7 deletions ci/travis/ci.sh
@@ -153,7 +153,6 @@ test_python() {
-python/ray/tests:test_multiprocessing # test_connect_to_ray() fails to connect to raylet
-python/ray/tests:test_node_manager
-python/ray/tests:test_object_manager
-python/ray/tests:test_projects
-python/ray/tests:test_ray_init # test_redis_port() seems to fail here, but pass in isolation
-python/ray/tests:test_resource_demand_scheduler
-python/ray/tests:test_stress # timeout
@@ -279,12 +278,12 @@ build_wheels() {
# caused timeouts in the past. See the "cache: false" line below.
local MOUNT_BAZEL_CACHE=(
-v "${HOME}/ray-bazel-cache":/root/ray-bazel-cache
-e TRAVIS=true
-e TRAVIS_PULL_REQUEST="${TRAVIS_PULL_REQUEST:-false}"
-e encrypted_1c30b31fe1ee_key="${encrypted_1c30b31fe1ee_key-}"
-e encrypted_1c30b31fe1ee_iv="${encrypted_1c30b31fe1ee_iv-}"
-e TRAVIS_COMMIT="${TRAVIS_COMMIT}"
-e CI="${CI}"
-e "TRAVIS=true"
-e "TRAVIS_PULL_REQUEST=${TRAVIS_PULL_REQUEST:-false}"
-e "encrypted_1c30b31fe1ee_key=${encrypted_1c30b31fe1ee_key-}"
-e "encrypted_1c30b31fe1ee_iv=${encrypted_1c30b31fe1ee_iv-}"
-e "TRAVIS_COMMIT=${TRAVIS_COMMIT}"
-e "CI=${CI}"
)

# This command should be kept in sync with ray/python/README-building-wheels.md,
16 changes: 11 additions & 5 deletions dashboard/BUILD
@@ -1,13 +1,19 @@
load("//bazel:python.bzl", "py_test_run_all_subdirectory")

# This is a dummy test dependency that causes the above tests to be
# re-run if any of these files changes.
py_library(
name = "dashboard_lib",
srcs = glob(["**/*.py"],exclude=["tests/*"]),
srcs = glob(
["**/*.py"],
exclude = ["tests/*"],
),
)

py_test(
name = "test_dashboard",
py_test_run_all_subdirectory(
size = "small",
srcs = glob(["tests/*.py"]),
deps = [":dashboard_lib"]
include = ["**/test*.py"],
exclude = ["modules/test/**"],
extra_srcs = [],
tags = ["exclusive"],
)
17 changes: 15 additions & 2 deletions dashboard/agent.py
@@ -24,6 +24,11 @@
from ray.core.generated import agent_manager_pb2_grpc
import psutil

try:
create_task = asyncio.create_task
except AttributeError:
create_task = asyncio.ensure_future

logger = logging.getLogger(__name__)
routes = dashboard_utils.ClassMethodRouteTable

@@ -36,6 +41,7 @@ def __init__(self,
redis_password=None,
temp_dir=None,
log_dir=None,
metrics_export_port=None,
node_manager_port=None,
object_store_name=None,
raylet_name=None):
@@ -45,6 +51,7 @@ def __init__(self,
self.redis_password = redis_password
self.temp_dir = temp_dir
self.log_dir = log_dir
self.metrics_export_port = metrics_export_port
self.node_manager_port = node_manager_port
self.object_store_name = object_store_name
self.raylet_name = raylet_name
@@ -69,7 +76,7 @@ def _load_modules(self):
c = cls(self)
dashboard_utils.ClassMethodRouteTable.bind(c)
modules.append(c)
logger.info("Loaded {} modules.".format(len(modules)))
logger.info("Loaded %d modules.", len(modules))
return modules

async def run(self):
@@ -86,7 +93,7 @@ async def _check_parent():
dashboard_consts.
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)

check_parent_task = asyncio.create_task(_check_parent())
check_parent_task = create_task(_check_parent())

# Create an aioredis client for all modules.
try:
@@ -172,6 +179,11 @@ async def _check_parent():
required=True,
type=str,
help="The address to use for Redis.")
parser.add_argument(
"--metrics-export-port",
required=True,
type=int,
help="The port to expose metrics through Prometheus.")
parser.add_argument(
"--node-manager-port",
required=True,
@@ -277,6 +289,7 @@ async def _check_parent():
redis_password=args.redis_password,
temp_dir=temp_dir,
log_dir=log_dir,
metrics_export_port=args.metrics_export_port,
node_manager_port=args.node_manager_port,
object_store_name=args.object_store_name,
raylet_name=args.raylet_name)
18 changes: 13 additions & 5 deletions dashboard/dashboard.py
@@ -37,10 +37,10 @@ def setup_static_dir():
errno.ENOENT, "Dashboard build directory not found. If installing "
"from source, please follow the additional steps "
"required to build the dashboard"
"(cd python/ray/{}/client "
f"(cd python/ray/{module_name}/client "
"&& npm install "
"&& npm ci "
"&& npm run build)".format(module_name), build_dir)
"&& npm run build)", build_dir)

static_dir = os.path.join(build_dir, "static")
routes.static("/static", static_dir, follow_symlinks=True)
@@ -59,14 +59,21 @@ class Dashboard:
port(int): Port number of dashboard aiohttp server.
redis_address(str): GCS address of a Ray cluster
redis_password(str): Redis password to access GCS
log_dir(str): Log directory of dashboard.
"""

def __init__(self, host, port, redis_address, redis_password=None):
def __init__(self,
host,
port,
redis_address,
redis_password=None,
log_dir=None):
self.dashboard_head = dashboard_head.DashboardHead(
http_host=host,
http_port=port,
redis_address=redis_address,
redis_password=redis_password)
redis_password=redis_password,
log_dir=log_dir)

# Setup Dashboard Routes
build_dir = setup_static_dir()
@@ -197,7 +204,8 @@ async def run(self):
args.host,
args.port,
args.redis_address,
redis_password=args.redis_password)
redis_password=args.redis_password,
log_dir=log_dir)
loop = asyncio.get_event_loop()
loop.run_until_complete(dashboard.run())
except Exception as e:
5 changes: 3 additions & 2 deletions dashboard/datacenter.py
@@ -19,7 +19,7 @@ class DataSource:
# {actor id hex(str): actor table data(dict of ActorTableData
# in gcs.proto)}
actors = Dict()
# {ip address(str): dashboard agent grpc server port(int)}
# {ip address(str): dashboard agent [http port(int), grpc port(int)]}
agents = Dict()
# {ip address(str): gcs node info(dict of GcsNodeInfo in gcs.proto)}
nodes = Dict()
@@ -56,7 +56,7 @@ async def get_node_actors(cls, hostname):
node_worker_id_set.add(worker_stats["workerId"])
node_actors = {}
for actor_id, actor_table_data in DataSource.actors.items():
if actor_table_data["workerId"] in node_worker_id_set:
if actor_table_data["address"]["workerId"] in node_worker_id_set:
node_actors[actor_id] = actor_table_data
return node_actors

@@ -102,6 +102,7 @@ async def get_all_node_summary(cls):
for hostname in DataSource.hostname_to_ip.keys():
node_info = await cls.get_node_info(hostname)
node_info.pop("workers", None)
node_info.pop("actors", None)
node_info["raylet"].pop("workersStats", None)
node_info["raylet"].pop("viewData", None)
all_nodes_summary.append(node_info)
50 changes: 33 additions & 17 deletions dashboard/head.py
@@ -28,7 +28,8 @@ def gcs_node_info_to_dict(message):


class DashboardHead:
def __init__(self, http_host, http_port, redis_address, redis_password):
def __init__(self, http_host, http_port, redis_address, redis_password,
log_dir):
# NodeInfoGcsService
self._gcs_node_info_stub = None
self._gcs_rpc_error_counter = 0
@@ -37,6 +38,7 @@ def __init__(self, http_host, http_port, redis_address, redis_password):
self.http_port = http_port
self.redis_address = dashboard_utils.address_tuple(redis_address)
self.redis_password = redis_password
self.log_dir = log_dir
self.aioredis_client = None
self.aiogrpc_gcs_channel = None
self.http_session = None
@@ -74,6 +76,22 @@ async def _update_nodes(self):
while True:
try:
nodes = await self._get_nodes()

# Get correct node info by state,
# 1. The node is ALIVE if any ALIVE node info
# of the hostname exists.
# 2. The node is DEAD if all node info of the
# hostname are DEAD.
hostname_to_node_info = {}
for node in nodes:
hostname = node["nodeManagerAddress"]
assert node["state"] in ["ALIVE", "DEAD"]
choose = hostname_to_node_info.get(hostname)
if choose is not None and choose["state"] == "ALIVE":
continue
hostname_to_node_info[hostname] = node
nodes = hostname_to_node_info.values()

self._gcs_rpc_error_counter = 0
node_ips = [node["nodeManagerAddress"] for node in nodes]
node_hostnames = [
@@ -83,13 +101,11 @@
agents = dict(DataSource.agents)
for node in nodes:
node_ip = node["nodeManagerAddress"]
if node_ip not in agents:
key = "{}{}".format(
dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX,
node_ip)
agent_port = await self.aioredis_client.get(key)
if agent_port:
agents[node_ip] = json.loads(agent_port)
key = "{}{}".format(
dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX, node_ip)
agent_port = await self.aioredis_client.get(key)
if agent_port:
agents[node_ip] = json.loads(agent_port)
for ip in agents.keys() - set(node_ips):
agents.pop(ip, None)

@@ -99,8 +115,8 @@
dict(zip(node_hostnames, node_ips)))
DataSource.ip_to_hostname.reset(
dict(zip(node_ips, node_hostnames)))
except aiogrpc.AioRpcError as ex:
logger.exception(ex)
except aiogrpc.AioRpcError:
logger.exception("Got AioRpcError when updating nodes.")
self._gcs_rpc_error_counter += 1
if self._gcs_rpc_error_counter > \
dashboard_consts.MAX_COUNT_OF_GCS_RPC_ERROR:
@@ -109,8 +125,8 @@
self._gcs_rpc_error_counter,
dashboard_consts.MAX_COUNT_OF_GCS_RPC_ERROR)
sys.exit(-1)
except Exception as ex:
logger.exception(ex)
except Exception:
logger.exception("Error updating nodes.")
finally:
await asyncio.sleep(
dashboard_consts.UPDATE_NODES_INTERVAL_SECONDS)
@@ -126,7 +142,7 @@ def _load_modules(self):
c = cls(self)
dashboard_utils.ClassMethodRouteTable.bind(c)
modules.append(c)
logger.info("Loaded {} modules.".format(len(modules)))
logger.info("Loaded %d modules.", len(modules))
return modules

async def run(self):
@@ -183,8 +199,8 @@ async def _async_notify():
co = await dashboard_utils.NotifyQueue.get()
try:
await co
except Exception as e:
logger.exception(e)
except Exception:
logger.exception(f"Error notifying coroutine {co}")

async def _purge_data():
"""Purge data in datacenter."""
@@ -193,8 +209,8 @@ async def _purge_data():
dashboard_consts.PURGE_DATA_INTERVAL_SECONDS)
try:
await DataOrganizer.purge()
except Exception as e:
logger.exception(e)
except Exception:
logger.exception("Error purging data.")

modules = self._load_modules()

Empty file.
19 changes: 19 additions & 0 deletions dashboard/modules/log/log_agent.py
@@ -0,0 +1,19 @@
import logging

import mimetypes

import ray.new_dashboard.utils as dashboard_utils

logger = logging.getLogger(__name__)
routes = dashboard_utils.ClassMethodRouteTable


class LogAgent(dashboard_utils.DashboardAgentModule):
def __init__(self, dashboard_agent):
super().__init__(dashboard_agent)
mimetypes.add_type("text/plain", ".err")
mimetypes.add_type("text/plain", ".out")
routes.static("/logs", self._dashboard_agent.log_dir, show_index=True)

async def run(self, server):
pass