diff --git a/.gitignore b/.gitignore index 8c0cad212c3d..da464a65266a 100644 --- a/.gitignore +++ b/.gitignore @@ -135,6 +135,7 @@ build # Pytest Cache **/.pytest_cache +.benchmarks # Vscode .vscode/ diff --git a/.travis.yml b/.travis.yml index 6e48d406ce28..158657eb93f6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,15 +26,17 @@ matrix: - JDK='Oracle JDK 8' - PYTHON=3.5 PYTHONWARNINGS=ignore - RAY_USE_CMAKE=1 + - RAY_INSTALL_JAVA=1 install: - ./ci/travis/install-dependencies.sh - export PATH="$HOME/miniconda/bin:$PATH" + - ./ci/travis/install-ray.sh script: - ./java/test.sh # Test Bazel build - rm -rf build - - ./ci/travis/install-bazel.sh + - ./ci/suppress_output ./ci/travis/install-bazel.sh - bazel build ... - os: linux @@ -82,10 +84,10 @@ matrix: - sudo apt-get update -qq - sudo apt-get install -qq valgrind install: - - ./ci/travis/install-bazel.sh - - ./ci/travis/install-dependencies.sh + - ./ci/suppress_output ./ci/travis/install-bazel.sh + - ./ci/suppress_output ./ci/travis/install-dependencies.sh - export PATH="$HOME/miniconda/bin:$PATH" - - ./ci/travis/install-ray.sh + - ./ci/suppress_output ./ci/travis/install-ray.sh script: - bash src/ray/test/run_object_manager_valgrind.sh @@ -96,11 +98,11 @@ matrix: # - export RAY_REDIS_SERVER_VALGRIND=1 # # Python3.5+ only. Otherwise we will get `SyntaxError` regardless of how we set the tester. 
- - python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=10 python/ray/experimental/test/async_test.py - - python -m pytest -v --durations=10 python/ray/tests/test_mini.py - - python -m pytest -v --durations=10 python/ray/tests/test_array.py - - python -m pytest -v --durations=10 python/ray/tests/test_multi_node_2.py - - python -m pytest -v --durations=10 python/ray/tests/test_node_manager.py + - python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest --durations=5 python/ray/experimental/test/async_test.py + - python -m pytest --durations=5 python/ray/tests/test_mini.py + - python -m pytest --durations=5 python/ray/tests/test_array.py + - python -m pytest --durations=5 python/ray/tests/test_multi_node_2.py + - python -m pytest --durations=5 python/ray/tests/test_node_manager.py # Build Linux wheels. @@ -136,16 +138,16 @@ matrix: install: - - ./ci/travis/install-bazel.sh - - ./ci/travis/install-dependencies.sh + - ./ci/suppress_output ./ci/travis/install-bazel.sh + - ./ci/suppress_output ./ci/travis/install-dependencies.sh - export PATH="$HOME/miniconda/bin:$PATH" - - ./ci/travis/install-ray.sh - - ./ci/travis/install-cython-examples.sh + - ./ci/suppress_output ./ci/travis/install-ray.sh + - ./ci/suppress_output ./ci/travis/install-cython-examples.sh - - bash src/ray/test/run_gcs_tests.sh + - ./ci/suppress_output bash src/ray/test/run_gcs_tests.sh # Raylet tests. - - bash src/ray/test/run_object_manager_tests.sh - - bazel test --build_tests_only --test_lang_filters=cc ... -c opt + - ./ci/suppress_output bash src/ray/test/run_object_manager_tests.sh + - ./ci/suppress_output bazel test --build_tests_only --test_lang_filters=cc ... -c opt script: @@ -157,20 +159,20 @@ script: # - export PYTHONPATH="$PYTHONPATH:./ci/" # ray tune tests - - python python/ray/tune/tests/test_dependency.py + - ./ci/suppress_output python python/ray/tune/tests/test_dependency.py # `cluster_tests.py` runs on Jenkins, not Travis. 
- - python -m pytest -v --durations=30 --ignore=python/ray/tune/tests/test_cluster.py python/ray/tune/tests + - python -m pytest --durations=10 --ignore=python/ray/tune/tests/test_cluster.py python/ray/tune/tests # ray rllib tests - - python python/ray/rllib/tests/test_catalog.py - - python python/ray/rllib/tests/test_filters.py - - python python/ray/rllib/tests/test_optimizers.py - - python python/ray/rllib/tests/test_evaluators.py + - python/ray/rllib/tests/run_silent.sh tests/test_catalog.py + - python/ray/rllib/tests/run_silent.sh tests/test_filters.py + - python/ray/rllib/tests/run_silent.sh tests/test_optimizers.py + - python/ray/rllib/tests/run_silent.sh tests/test_evaluators.py # ray tests # Python3.5+ only. Otherwise we will get `SyntaxError` regardless of how we set the tester. - - python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=10 python/ray/experimental/test/async_test.py - - python -m pytest -v --durations=30 python/ray/tests + - python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest --durations=5 python/ray/experimental/test/async_test.py + - python -m pytest --durations=10 python/ray/tests deploy: - provider: s3 access_key_id: AKIAJ2L7XDUSZVTXI5QA diff --git a/BUILD.bazel b/BUILD.bazel index 61651b2f0a18..b597939896b9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -507,7 +507,6 @@ filegroup( "python/ray/dashboard/res/main.js", "python/ray/experimental/*.py", "python/ray/internal/*.py", - "python/ray/WebUI.ipynb", "python/ray/workers/default_worker.py", ]), ) diff --git a/WORKSPACE b/WORKSPACE index 00f77852e2c1..450bc6a65567 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -28,11 +28,10 @@ git_repository( tag = "v2.2.2", ) -new_git_repository( +git_repository( name = "com_github_google_glog", - build_file = "//bazel:BUILD.glog", - remote = "https://github.com/google/glog.git", - tag = "v0.3.5", + commit = "5c576f78c49b28d89b23fbb1fc80f54c879ec02e", + remote = "https://github.com/google/glog", ) 
new_git_repository( diff --git a/bazel/BUILD.glog b/bazel/BUILD.glog deleted file mode 100644 index 80c71f35cca5..000000000000 --- a/bazel/BUILD.glog +++ /dev/null @@ -1,84 +0,0 @@ -# This file is from https://github.com/google/or-tools/blob/master/bazel/glog.BUILD - -cc_library( - name = "glog", - srcs = [ - "config.h", - "src/base/commandlineflags.h", - "src/base/googleinit.h", - "src/base/mutex.h", - "src/demangle.cc", - "src/demangle.h", - "src/logging.cc", - "src/raw_logging.cc", - "src/signalhandler.cc", - "src/symbolize.cc", - "src/symbolize.h", - "src/utilities.cc", - "src/utilities.h", - "src/vlog_is_on.cc", - ] + glob(["src/stacktrace*.h"]), - hdrs = [ - "src/glog/log_severity.h", - "src/glog/logging.h", - "src/glog/raw_logging.h", - "src/glog/stl_logging.h", - "src/glog/vlog_is_on.h", - ], - copts = [ - "-Wno-sign-compare", - "-U_XOPEN_SOURCE", - ], - includes = ["./src"], - linkopts = ["-lpthread"] + select({ - ":libunwind": ["-lunwind"], - "//conditions:default": [], - }), - visibility = ["//visibility:public"], - deps = [ - "@com_github_gflags_gflags//:gflags", - ], -) - -config_setting( - name = "libunwind", - values = { - "define": "libunwind=true", - }, -) - -genrule( - name = "run_configure", - srcs = [ - "README", - "Makefile.in", - "config.guess", - "config.sub", - "install-sh", - "ltmain.sh", - "missing", - "libglog.pc.in", - "src/config.h.in", - "src/glog/logging.h.in", - "src/glog/raw_logging.h.in", - "src/glog/stl_logging.h.in", - "src/glog/vlog_is_on.h.in", - ], - outs = [ - "config.h", - "src/glog/logging.h", - "src/glog/raw_logging.h", - "src/glog/stl_logging.h", - "src/glog/vlog_is_on.h", - ], - tools = [ - "configure", - ], - cmd = "$(location :configure)" + - "&& cp -v src/config.h $(location config.h) " + - "&& cp -v src/glog/logging.h $(location src/glog/logging.h) " + - "&& cp -v src/glog/raw_logging.h $(location src/glog/raw_logging.h) " + - "&& cp -v src/glog/stl_logging.h $(location src/glog/stl_logging.h) " + - "&& cp -v 
src/glog/vlog_is_on.h $(location src/glog/vlog_is_on.h) " - , -) diff --git a/build.sh b/build.sh index cf137c959161..76400aa7a188 100755 --- a/build.sh +++ b/build.sh @@ -121,7 +121,7 @@ else $PYTHON_EXECUTABLE -m pip install \ --target=$ROOT_DIR/python/ray/pyarrow_files pyarrow==0.12.0.RAY \ --find-links https://s3-us-west-2.amazonaws.com/arrow-wheels/9357dc130789ee42f8181d8724bee1d5d1509060/index.html - bazel build //:ray_pkg -c opt + bazel build //:ray_pkg -c opt --verbose_failures # Copy files and keep them writeable. This is a workaround, as Bazel # marks all generated files non-writeable. If we would just copy them # over without adding write permission, the copy would fail the next time. diff --git a/ci/jenkins_tests/multi_node_tests/large_memory_test.py b/ci/jenkins_tests/miscellaneous/large_memory_test.py similarity index 91% rename from ci/jenkins_tests/multi_node_tests/large_memory_test.py rename to ci/jenkins_tests/miscellaneous/large_memory_test.py index 4ac34d2cfdf2..9421c15b06b6 100644 --- a/ci/jenkins_tests/multi_node_tests/large_memory_test.py +++ b/ci/jenkins_tests/miscellaneous/large_memory_test.py @@ -32,6 +32,9 @@ del c print("Successfully put C.") + # The below code runs successfully, but when commented in, the whole test + # takes about 10 minutes. 
+ # D = (2 ** 30 + 1) * ["h"] # d = ray.put(D) # assert ray.get(d) == D diff --git a/ci/jenkins_tests/multi_node_tests/test_wait_hanging.py b/ci/jenkins_tests/miscellaneous/test_wait_hanging.py similarity index 100% rename from ci/jenkins_tests/multi_node_tests/test_wait_hanging.py rename to ci/jenkins_tests/miscellaneous/test_wait_hanging.py diff --git a/ci/jenkins_tests/multi_node_docker_test.py b/ci/jenkins_tests/multi_node_docker_test.py deleted file mode 100644 index 4b22dafc7418..000000000000 --- a/ci/jenkins_tests/multi_node_docker_test.py +++ /dev/null @@ -1,441 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import datetime -import os -import random -import re -import signal -import subprocess -import sys - - -# This is duplicated from ray.utils so that we do not have to introduce a -# dependency on Ray to run this file. -def decode(byte_str): - """Make this unicode in Python 3, otherwise leave it as bytes.""" - if not isinstance(byte_str, bytes): - raise ValueError("The argument must be a bytes object.") - if sys.version_info >= (3, 0): - return byte_str.decode("ascii") - else: - return byte_str - - -def wait_for_output(proc): - """This is a convenience method to parse a process's stdout and stderr. - - Args: - proc: A process started by subprocess.Popen. - - Returns: - A tuple of the stdout and stderr of the process as strings. - """ - try: - # NOTE: This test must be run with Python 3. - stdout_data, stderr_data = proc.communicate(timeout=200) - except subprocess.TimeoutExpired: - # Timeout: kill the process. - # Get the remaining message from PIPE for debugging purpose. 
- print("Killing process because it timed out.") - proc.kill() - stdout_data, stderr_data = proc.communicate() - - if stdout_data is not None: - try: - # NOTE(rkn): This try/except block is here because I once saw an - # exception raised here and want to print more information if that - # happens again. - stdout_data = decode(stdout_data) - except UnicodeDecodeError: - raise Exception("Failed to decode stdout_data:", stdout_data) - - if stderr_data is not None: - try: - # NOTE(rkn): This try/except block is here because I once saw an - # exception raised here and want to print more information if that - # happens again. - stderr_data = decode(stderr_data) - except UnicodeDecodeError: - raise Exception("Failed to decode stderr_data:", stderr_data) - - return stdout_data, stderr_data - - -class DockerRunner(object): - """This class manages the logistics of running multiple nodes in Docker. - - This class is used for starting multiple Ray nodes within Docker, stopping - Ray, running a workload, and determining the success or failure of the - workload. - - Attributes: - head_container_id: The ID of the docker container that runs the head - node. - worker_container_ids: A list of the docker container IDs of the Ray - worker nodes. - head_container_ip: The IP address of the docker container that runs the - head node. - """ - - def __init__(self): - """Initialize the DockerRunner.""" - self.head_container_id = None - self.worker_container_ids = [] - self.head_container_ip = None - - def _get_container_id(self, stdout_data): - """Parse the docker container ID from stdout_data. - - Args: - stdout_data: This should be a string with the standard output of a - call to a docker command. - - Returns: - The container ID of the docker container. - """ - p = re.compile("([0-9a-f]{64})\n") - m = p.match(stdout_data) - if m is None: - return None - else: - return m.group(1) - - def _get_container_ip(self, container_id): - """Get the IP address of a specific docker container. 
- - Args: - container_id: The docker container ID of the relevant docker - container. - - Returns: - The IP address of the container. - """ - proc = subprocess.Popen( - [ - "docker", "inspect", - "--format={{.NetworkSettings.Networks.bridge" - ".IPAddress}}", container_id - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout_data, _ = wait_for_output(proc) - p = re.compile("([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})") - m = p.match(stdout_data) - if m is None: - raise RuntimeError("Container IP not found.") - else: - return m.group(1) - - def _start_head_node(self, docker_image, mem_size, shm_size, - num_redis_shards, num_cpus, num_gpus, - development_mode): - """Start the Ray head node inside a docker container.""" - mem_arg = ["--memory=" + mem_size] if mem_size else [] - shm_arg = ["--shm-size=" + shm_size] if shm_size else [] - volume_arg = ([ - "-v", "{}:{}".format( - os.path.dirname(os.path.realpath(__file__)), - "/ray/test/jenkins_tests") - ] if development_mode else []) - - command = (["docker", "run", "-d"] + mem_arg + shm_arg + volume_arg + [ - docker_image, "ray", "start", "--head", "--block", - "--redis-port=6379", - "--num-redis-shards={}".format(num_redis_shards), - "--num-cpus={}".format(num_cpus), "--num-gpus={}".format(num_gpus), - "--no-ui" - ]) - print("Starting head node with command:{}".format(command)) - - proc = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout_data, _ = wait_for_output(proc) - container_id = self._get_container_id(stdout_data) - if container_id is None: - raise RuntimeError("Failed to find container ID.") - self.head_container_id = container_id - self.head_container_ip = self._get_container_ip(container_id) - - def _start_worker_node(self, docker_image, mem_size, shm_size, num_cpus, - num_gpus, development_mode): - """Start a Ray worker node inside a docker container.""" - mem_arg = ["--memory=" + mem_size] if mem_size else [] - shm_arg = ["--shm-size=" + shm_size] if 
shm_size else [] - volume_arg = ([ - "-v", "{}:{}".format( - os.path.dirname(os.path.realpath(__file__)), - "/ray/test/jenkins_tests") - ] if development_mode else []) - command = (["docker", "run", "-d"] + mem_arg + shm_arg + volume_arg + [ - "--shm-size=" + shm_size, docker_image, "ray", "start", "--block", - "--redis-address={:s}:6379".format(self.head_container_ip), - "--num-cpus={}".format(num_cpus), "--num-gpus={}".format(num_gpus) - ]) - print("Starting worker node with command:{}".format(command)) - proc = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout_data, _ = wait_for_output(proc) - container_id = self._get_container_id(stdout_data) - if container_id is None: - raise RuntimeError("Failed to find container id") - self.worker_container_ids.append(container_id) - - def start_ray(self, - docker_image=None, - mem_size=None, - shm_size=None, - num_nodes=None, - num_redis_shards=1, - num_cpus=None, - num_gpus=None, - development_mode=None): - """Start a Ray cluster within docker. - - This starts one docker container running the head node and - num_nodes - 1 docker containers running the Ray worker nodes. - - Args: - docker_image: The docker image to use for all of the nodes. - mem_size: The amount of memory to start each docker container with. - This will be passed into `docker run` as the --memory flag. If - this is None, then no --memory flag will be used. - shm_size: The amount of shared memory to start each docker - container with. This will be passed into `docker run` as the - `--shm-size` flag. - num_nodes: The number of nodes to use in the cluster (this counts - the head node as well). - num_redis_shards: The number of Redis shards to use on the head - node. - num_cpus: A list of the number of CPUs to start each node with. - num_gpus: A list of the number of GPUs to start each node with. 
- development_mode: True if you want to mount the local copy of - test/jenkins_test on the head node so we can avoid rebuilding - docker images during development. - """ - assert len(num_cpus) == num_nodes - assert len(num_gpus) == num_nodes - - # Launch the head node. - self._start_head_node(docker_image, mem_size, shm_size, - num_redis_shards, num_cpus[0], num_gpus[0], - development_mode) - # Start the worker nodes. - for i in range(num_nodes - 1): - self._start_worker_node(docker_image, mem_size, shm_size, - num_cpus[1 + i], num_gpus[1 + i], - development_mode) - - def _stop_node(self, container_id): - """Stop a node in the Ray cluster.""" - proc = subprocess.Popen( - ["docker", "kill", container_id], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout_data, _ = wait_for_output(proc) - stopped_container_id = self._get_container_id(stdout_data) - if not container_id == stopped_container_id: - raise Exception("Failed to stop container {}." - .format(container_id)) - - proc = subprocess.Popen( - ["docker", "rm", "-f", container_id], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout_data, _ = wait_for_output(proc) - removed_container_id = self._get_container_id(stdout_data) - if not container_id == removed_container_id: - raise Exception("Failed to remove container {}." - .format(container_id)) - - print( - "stop_node", { - "container_id": container_id, - "is_head": container_id == self.head_container_id - }) - - def stop_ray(self): - """Stop the Ray cluster.""" - success = True - - try: - self._stop_node(self.head_container_id) - except Exception: - success = False - - for container_id in self.worker_container_ids: - try: - self._stop_node(container_id) - except Exception: - success = False - - return success - - def run_test(self, - test_script, - num_drivers, - driver_locations=None, - timeout_seconds=600): - """Run a test script. - - Run a test using the Ray cluster. - - Args: - test_script: The test script to run. 
- num_drivers: The number of copies of the test script to run. - driver_locations: A list of the indices of the containers that the - different copies of the test script should be run on. If this - is None, then the containers will be chosen randomly. - timeout_seconds: The amount of time in seconds to wait before - considering the test to have failed. When the timeout expires, - this will cause this function to raise an exception. - - Returns: - A dictionary with information about the test script run. - - Raises: - Exception: An exception is raised if the timeout expires. - """ - print("Multi-node docker test started at: {}".format( - datetime.datetime.now())) - all_container_ids = ( - [self.head_container_id] + self.worker_container_ids) - if driver_locations is None: - driver_locations = [ - random.randrange(0, len(all_container_ids)) - for i in range(num_drivers) - ] - print("driver_locations: {}".format(driver_locations)) - - # Define a signal handler and set an alarm to go off in - # timeout_seconds. - def handler(signum, frame): - raise RuntimeError("This test timed out after {} seconds." - .format(timeout_seconds)) - - signal.signal(signal.SIGALRM, handler) - signal.alarm(timeout_seconds) - - # Start the different drivers. - driver_processes = [] - for i in range(len(driver_locations)): - # Get the container ID to run the ith driver in. - container_id = all_container_ids[driver_locations[i]] - command = [ - "docker", "exec", container_id, "/bin/bash", "-c", - ("RAY_REDIS_ADDRESS={}:6379 RAY_DRIVER_INDEX={} " - "python {}".format(self.head_container_ip, i, test_script)) - ] - print("Starting driver with command {}.".format(test_script)) - # Start the driver. 
- p = subprocess.Popen( - command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - driver_processes.append(p) - - results = [] - for p in driver_processes: - stdout_data, stderr_data = wait_for_output(p) - print("STDOUT:") - print(stdout_data) - print("STDERR:") - print(stderr_data) - results.append({ - "success": p.returncode == 0, - "return_code": p.returncode - }) - - # Disable the alarm. - signal.alarm(0) - print("Multi-node docker test ended at: {}".format( - datetime.datetime.now())) - return results - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Run multinode tests in Docker.") - parser.add_argument( - "--docker-image", default="ray-project/deploy", help="docker image") - parser.add_argument("--mem-size", help="memory size") - parser.add_argument("--shm-size", default="1G", help="shared memory size") - parser.add_argument( - "--num-nodes", - default=1, - type=int, - help="number of nodes to use in the cluster") - parser.add_argument( - "--num-redis-shards", - default=1, - type=int, - help=("the number of Redis shards to start on the " - "head node")) - parser.add_argument( - "--num-cpus", - type=str, - help=("a comma separated list of values representing " - "the number of CPUs to start each node with")) - parser.add_argument( - "--num-gpus", - type=str, - help=("a comma separated list of values representing " - "the number of GPUs to start each node with")) - parser.add_argument( - "--num-drivers", default=1, type=int, help="number of drivers to run") - parser.add_argument( - "--driver-locations", - type=str, - help=("a comma separated list of indices of the " - "containers to run the drivers in")) - parser.add_argument("--test-script", required=True, help="test script") - parser.add_argument( - "--development-mode", - action="store_true", - help="use local copies of the test scripts") - args = parser.parse_args() - - # Parse the number of CPUs and GPUs to use for each worker. 
- num_nodes = args.num_nodes - num_cpus = ([int(i) for i in args.num_cpus.split(",")] - if args.num_cpus is not None else num_nodes * [10]) - num_gpus = ([int(i) for i in args.num_gpus.split(",")] - if args.num_gpus is not None else num_nodes * [0]) - - # Parse the driver locations. - driver_locations = (None if args.driver_locations is None else - [int(i) for i in args.driver_locations.split(",")]) - - d = DockerRunner() - d.start_ray( - docker_image=args.docker_image, - mem_size=args.mem_size, - shm_size=args.shm_size, - num_nodes=num_nodes, - num_redis_shards=args.num_redis_shards, - num_cpus=num_cpus, - num_gpus=num_gpus, - development_mode=args.development_mode) - try: - run_results = d.run_test( - args.test_script, - args.num_drivers, - driver_locations=driver_locations) - finally: - successfully_stopped = d.stop_ray() - - any_failed = False - for run_result in run_results: - if "success" in run_result and run_result["success"]: - print("RESULT: Test {} succeeded.".format(args.test_script)) - else: - print("RESULT: Test {} failed.".format(args.test_script)) - any_failed = True - - if any_failed: - sys.exit(1) - elif not successfully_stopped: - print("There was a failure when attempting to stop the containers.") - sys.exit(1) - else: - sys.exit(0) diff --git a/ci/jenkins_tests/multi_node_tests/many_drivers_test.py b/ci/jenkins_tests/multi_node_tests/many_drivers_test.py deleted file mode 100644 index 585c3806103a..000000000000 --- a/ci/jenkins_tests/multi_node_tests/many_drivers_test.py +++ /dev/null @@ -1,79 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time - -import ray -from ray.tests.utils import (_wait_for_nodes_to_join, _broadcast_event, - _wait_for_event) - -# This test should be run with 5 nodes, which have 0, 0, 5, 6, and 50 GPUs for -# a total of 61 GPUs. It should be run with a large number of drivers (e.g., -# 100). 
At most 10 drivers will run at a time, and each driver will use at most -# 5 GPUs (this is ceil(61 / 15), which guarantees that we will always be able -# to make progress). -total_num_nodes = 5 -max_concurrent_drivers = 15 -num_gpus_per_driver = 5 - - -@ray.remote(num_cpus=0, num_gpus=1) -class Actor1(object): - def __init__(self): - assert len(ray.get_gpu_ids()) == 1 - - def check_ids(self): - assert len(ray.get_gpu_ids()) == 1 - - -def driver(redis_address, driver_index): - """The script for all drivers. - - This driver should create five actors that each use one GPU. After a while, - it should exit. - """ - ray.init(redis_address=redis_address) - - # Wait for all the nodes to join the cluster. - _wait_for_nodes_to_join(total_num_nodes) - - # Limit the number of drivers running concurrently. - for i in range(driver_index - max_concurrent_drivers + 1): - _wait_for_event("DRIVER_{}_DONE".format(i), redis_address) - - def try_to_create_actor(actor_class, timeout=500): - # Try to create an actor, but allow failures while we wait for the - # monitor to release the resources for the removed drivers. - start_time = time.time() - while time.time() - start_time < timeout: - try: - actor = actor_class.remote() - except Exception: - time.sleep(0.1) - else: - return actor - # If we are here, then we timed out while looping. - raise Exception("Timed out while trying to create actor.") - - # Create some actors that require one GPU. - actors_one_gpu = [] - for _ in range(num_gpus_per_driver): - actors_one_gpu.append(try_to_create_actor(Actor1)) - - for _ in range(100): - ray.get([actor.check_ids.remote() for actor in actors_one_gpu]) - - _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address) - - -if __name__ == "__main__": - driver_index = int(os.environ["RAY_DRIVER_INDEX"]) - redis_address = os.environ["RAY_REDIS_ADDRESS"] - print("Driver {} started at {}.".format(driver_index, time.time())) - - # In this test, all drivers will run the same script. 
- driver(redis_address, driver_index) - - print("Driver {} finished at {}.".format(driver_index, time.time())) diff --git a/ci/jenkins_tests/multi_node_tests/remove_driver_test.py b/ci/jenkins_tests/multi_node_tests/remove_driver_test.py deleted file mode 100644 index 1cd10195b607..000000000000 --- a/ci/jenkins_tests/multi_node_tests/remove_driver_test.py +++ /dev/null @@ -1,274 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time - -import ray -from ray.tests.utils import (_wait_for_nodes_to_join, _broadcast_event, - _wait_for_event, wait_for_pid_to_exit) - -# This test should be run with 5 nodes, which have 0, 1, 2, 3, and 4 GPUs for a -# total of 10 GPUs. It should be run with 7 drivers. Drivers 2 through 6 must -# run on different nodes so they can check if all the relevant workers on all -# the nodes have been killed. -total_num_nodes = 5 - - -def actor_event_name(driver_index, actor_index): - return "DRIVER_{}_ACTOR_{}_RUNNING".format(driver_index, actor_index) - - -def remote_function_event_name(driver_index, task_index): - return "DRIVER_{}_TASK_{}_RUNNING".format(driver_index, task_index) - - -@ray.remote -def long_running_task(driver_index, task_index, redis_address): - _broadcast_event( - remote_function_event_name(driver_index, task_index), - redis_address, - data=(ray.services.get_node_ip_address(), os.getpid())) - # Loop forever. - while True: - time.sleep(100) - - -num_long_running_tasks_per_driver = 2 - - -@ray.remote -class Actor0(object): - def __init__(self, driver_index, actor_index, redis_address): - _broadcast_event( - actor_event_name(driver_index, actor_index), - redis_address, - data=(ray.services.get_node_ip_address(), os.getpid())) - assert len(ray.get_gpu_ids()) == 0 - - def check_ids(self): - assert len(ray.get_gpu_ids()) == 0 - - def long_running_method(self): - # Loop forever. 
- while True: - time.sleep(100) - - -@ray.remote(num_gpus=1) -class Actor1(object): - def __init__(self, driver_index, actor_index, redis_address): - _broadcast_event( - actor_event_name(driver_index, actor_index), - redis_address, - data=(ray.services.get_node_ip_address(), os.getpid())) - assert len(ray.get_gpu_ids()) == 1 - - def check_ids(self): - assert len(ray.get_gpu_ids()) == 1 - - def long_running_method(self): - # Loop forever. - while True: - time.sleep(100) - - -@ray.remote(num_gpus=2) -class Actor2(object): - def __init__(self, driver_index, actor_index, redis_address): - _broadcast_event( - actor_event_name(driver_index, actor_index), - redis_address, - data=(ray.services.get_node_ip_address(), os.getpid())) - assert len(ray.get_gpu_ids()) == 2 - - def check_ids(self): - assert len(ray.get_gpu_ids()) == 2 - - def long_running_method(self): - # Loop forever. - while True: - time.sleep(100) - - -def driver_0(redis_address, driver_index): - """The script for driver 0. - - This driver should create five actors that each use one GPU and some actors - that use no GPUs. After a while, it should exit. - """ - ray.init(redis_address=redis_address) - - # Wait for all the nodes to join the cluster. - _wait_for_nodes_to_join(total_num_nodes) - - # Start some long running task. Driver 2 will make sure the worker running - # this task has been killed. - for i in range(num_long_running_tasks_per_driver): - long_running_task.remote(driver_index, i, redis_address) - - # Create some actors that require one GPU. - actors_one_gpu = [ - Actor1.remote(driver_index, i, redis_address) for i in range(5) - ] - # Create some actors that don't require any GPUs. 
- actors_no_gpus = [ - Actor0.remote(driver_index, 5 + i, redis_address) for i in range(5) - ] - - for _ in range(1000): - ray.get([actor.check_ids.remote() for actor in actors_one_gpu]) - ray.get([actor.check_ids.remote() for actor in actors_no_gpus]) - - # Start a long-running method on one actor and make sure this doesn't - # affect anything. - actors_no_gpus[0].long_running_method.remote() - - _broadcast_event("DRIVER_0_DONE", redis_address) - - -def driver_1(redis_address, driver_index): - """The script for driver 1. - - This driver should create one actor that uses two GPUs, three actors that - each use one GPU (the one requiring two must be created first), and some - actors that don't use any GPUs. After a while, it should exit. - """ - ray.init(redis_address=redis_address) - - # Wait for all the nodes to join the cluster. - _wait_for_nodes_to_join(total_num_nodes) - - # Start some long running task. Driver 2 will make sure the worker running - # this task has been killed. - for i in range(num_long_running_tasks_per_driver): - long_running_task.remote(driver_index, i, redis_address) - - # Create an actor that requires two GPUs. - actors_two_gpus = [ - Actor2.remote(driver_index, i, redis_address) for i in range(1) - ] - # Create some actors that require one GPU. - actors_one_gpu = [ - Actor1.remote(driver_index, 1 + i, redis_address) for i in range(3) - ] - # Create some actors that don't require any GPUs. - actors_no_gpus = [ - Actor0.remote(driver_index, 1 + 3 + i, redis_address) for i in range(5) - ] - - for _ in range(1000): - ray.get([actor.check_ids.remote() for actor in actors_two_gpus]) - ray.get([actor.check_ids.remote() for actor in actors_one_gpu]) - ray.get([actor.check_ids.remote() for actor in actors_no_gpus]) - - # Start a long-running method on one actor and make sure this doesn't - # affect anything. 
- actors_one_gpu[0].long_running_method.remote() - - _broadcast_event("DRIVER_1_DONE", redis_address) - - -def cleanup_driver(redis_address, driver_index): - """The script for drivers 2 through 6. - - This driver should wait for the first two drivers to finish. Then it should - create some actors that use a total of ten GPUs. - """ - ray.init(redis_address=redis_address) - - # Only one of the cleanup drivers should create more actors. - if driver_index == 2: - # We go ahead and create some actors that don't require any GPUs. We - # don't need to wait for the other drivers to finish. We call methods - # on these actors later to make sure they haven't been killed. - actors_no_gpus = [ - Actor0.remote(driver_index, i, redis_address) for i in range(10) - ] - - _wait_for_event("DRIVER_0_DONE", redis_address) - _wait_for_event("DRIVER_1_DONE", redis_address) - - def try_to_create_actor(actor_class, driver_index, actor_index, - timeout=20): - # Try to create an actor, but allow failures while we wait for the - # monitor to release the resources for the removed drivers. - start_time = time.time() - while time.time() - start_time < timeout: - try: - actor = actor_class.remote(driver_index, actor_index, - redis_address) - except Exception: - time.sleep(0.1) - else: - return actor - # If we are here, then we timed out while looping. - raise Exception("Timed out while trying to create actor.") - - # Only one of the cleanup drivers should create more actors. - if driver_index == 2: - # Create some actors that require one GPU. - actors_one_gpu = [] - for i in range(10): - actors_one_gpu.append( - try_to_create_actor(Actor1, driver_index, 10 + 3 + i)) - - removed_workers = 0 - - # Make sure that the PIDs for the long-running tasks from driver 0 and - # driver 1 have been killed. 
- for i in range(num_long_running_tasks_per_driver): - node_ip_address, pid = _wait_for_event( - remote_function_event_name(0, i), redis_address) - if node_ip_address == ray.services.get_node_ip_address(): - wait_for_pid_to_exit(pid) - removed_workers += 1 - for i in range(num_long_running_tasks_per_driver): - node_ip_address, pid = _wait_for_event( - remote_function_event_name(1, i), redis_address) - if node_ip_address == ray.services.get_node_ip_address(): - wait_for_pid_to_exit(pid) - removed_workers += 1 - # Make sure that the PIDs for the actors from driver 0 and driver 1 have - # been killed. - for i in range(10): - node_ip_address, pid = _wait_for_event( - actor_event_name(0, i), redis_address) - if node_ip_address == ray.services.get_node_ip_address(): - wait_for_pid_to_exit(pid) - removed_workers += 1 - for i in range(9): - node_ip_address, pid = _wait_for_event( - actor_event_name(1, i), redis_address) - if node_ip_address == ray.services.get_node_ip_address(): - wait_for_pid_to_exit(pid) - removed_workers += 1 - - print("{} workers/actors were removed on this node." - .format(removed_workers)) - - # Only one of the cleanup drivers should create and use more actors. 
- if driver_index == 2: - for _ in range(1000): - ray.get([actor.check_ids.remote() for actor in actors_one_gpu]) - ray.get([actor.check_ids.remote() for actor in actors_no_gpus]) - - _broadcast_event("DRIVER_{}_DONE".format(driver_index), redis_address) - - -if __name__ == "__main__": - driver_index = int(os.environ["RAY_DRIVER_INDEX"]) - redis_address = os.environ["RAY_REDIS_ADDRESS"] - print("Driver {} started at {}.".format(driver_index, time.time())) - - if driver_index == 0: - driver_0(redis_address, driver_index) - elif driver_index == 1: - driver_1(redis_address, driver_index) - elif driver_index in [2, 3, 4, 5, 6]: - cleanup_driver(redis_address, driver_index) - else: - raise Exception("This code should be unreachable.") - - print("Driver {} finished at {}.".format(driver_index, time.time())) diff --git a/ci/jenkins_tests/multi_node_tests/test_0.py b/ci/jenkins_tests/multi_node_tests/test_0.py deleted file mode 100644 index 7d8240568ba6..000000000000 --- a/ci/jenkins_tests/multi_node_tests/test_0.py +++ /dev/null @@ -1,36 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time - -import ray - - -@ray.remote -def f(): - time.sleep(0.1) - return ray.services.get_node_ip_address() - - -if __name__ == "__main__": - driver_index = int(os.environ["RAY_DRIVER_INDEX"]) - redis_address = os.environ["RAY_REDIS_ADDRESS"] - print("Driver {} started at {}.".format(driver_index, time.time())) - - ray.init(redis_address=redis_address) - # Check that tasks are scheduled on all nodes. 
- num_attempts = 30 - for i in range(num_attempts): - ip_addresses = ray.get([f.remote() for i in range(1000)]) - distinct_addresses = set(ip_addresses) - counts = [ - ip_addresses.count(address) for address in distinct_addresses - ] - print("Counts are {}".format(counts)) - if len(counts) == 5: - break - assert len(counts) == 5 - - print("Driver {} finished at {}.".format(driver_index, time.time())) diff --git a/ci/jenkins_tests/run_multi_node_tests.sh b/ci/jenkins_tests/run_multi_node_tests.sh index 10da4f03bfc6..28f805984673 100755 --- a/ci/jenkins_tests/run_multi_node_tests.sh +++ b/ci/jenkins_tests/run_multi_node_tests.sh @@ -12,6 +12,7 @@ SHM_SIZE="20G" ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) DOCKER_SHA=$($ROOT_DIR/../../build-docker.sh --output-sha --no-cache) +SUPPRESS_OUTPUT=$ROOT_DIR/../suppress_output echo "Using Docker image" $DOCKER_SHA ######################## RLLIB TESTS ################################# @@ -24,58 +25,31 @@ bash $ROOT_DIR/run_tune_tests.sh ${MEMORY_SIZE} ${SHM_SIZE} $DOCKER_SHA ######################## SGD TESTS ################################# -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/test_sgd.py --num-iters=2 \ --batch-size=1 --strategy=simple -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/test_sgd.py --num-iters=2 \ --batch-size=1 --strategy=ps -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/test_save_and_restore.py --num-iters=2 \ --batch-size=1 --strategy=simple -docker run --rm --shm-size=${SHM_SIZE} 
--memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/test_save_and_restore.py --num-iters=2 \ --batch-size=1 --strategy=ps -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/mnist_example.py --num-iters=1 \ --num-workers=1 --devices-per-worker=1 --strategy=ps -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/experimental/sgd/mnist_example.py --num-iters=1 \ --num-workers=1 --devices-per-worker=1 --strategy=ps --tune ######################## RAY BACKEND TESTS ################################# -python3 $ROOT_DIR/multi_node_docker_test.py \ - --docker-image=$DOCKER_SHA \ - --num-nodes=5 \ - --num-redis-shards=10 \ - --test-script=/ray/ci/jenkins_tests/multi_node_tests/test_0.py - -python3 $ROOT_DIR/multi_node_docker_test.py \ - --docker-image=$DOCKER_SHA \ - --num-nodes=5 \ - --num-redis-shards=5 \ - --num-gpus=0,1,2,3,4 \ - --num-drivers=7 \ - --driver-locations=0,1,0,1,2,3,4 \ - --test-script=/ray/ci/jenkins_tests/multi_node_tests/remove_driver_test.py - -python3 $ROOT_DIR/multi_node_docker_test.py \ - --docker-image=$DOCKER_SHA \ - --num-nodes=5 \ - --num-redis-shards=2 \ - --num-gpus=0,0,5,6,50 \ - --num-drivers=100 \ - --test-script=/ray/ci/jenkins_tests/multi_node_tests/many_drivers_test.py - -python3 $ROOT_DIR/multi_node_docker_test.py \ - --docker-image=$DOCKER_SHA \ - --num-nodes=1 \ - --mem-size=60G \ - --shm-size=60G \ - --test-script=/ray/ci/jenkins_tests/multi_node_tests/large_memory_test.py +$SUPPRESS_OUTPUT docker run --rm --shm-size=60G --memory=60G $DOCKER_SHA \ + python /ray/ci/jenkins_tests/miscellaneous/large_memory_test.py diff 
--git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 36590210250d..31f16c64ba86 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -2,49 +2,49 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env PongDeterministic-v0 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env Pong-ram-v4 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env PongDeterministic-v0 \ --run A2C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"free_log_std": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"simple_optimizer": false, "num_sgd_iter": 2, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop 
'{"training_iteration": 1}' \ --config '{"simple_optimizer": true, "num_sgd_iter": 2, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0.1}' \ --ray-num-gpus 1 @@ -52,187 +52,201 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "use_gae": false, "batch_mode": "complete_episodes"}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"remote_worker_envs": true, "num_envs_per_worker": 2, "num_workers": 1, "train_batch_size": 100, "sgd_minibatch_size": 50}' +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/python/ray/rllib/tests/run_silent.sh train.py \ + --env CartPole-v1 \ + --run PPO \ + --stop '{"training_iteration": 2}' \ + --config '{"async_remote_worker_envs": true, "num_envs_per_worker": 2, "num_workers": 1, "train_batch_size": 100, "sgd_minibatch_size": 50}' + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/python/ray/rllib/tests/run_silent.sh train.py \ + --env Pendulum-v0 \ + --run APPO \ + --stop '{"training_iteration": 1}' \ + --config '{"num_workers": 2, "num_gpus": 0}' + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env Pendulum-v0 \ --run ES \ - 
--stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env Pong-v0 \ --run ES \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run DQN \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"lr": 1e-3, "schedule_max_timesteps": 100000, "exploration_fraction": 0.1, "exploration_final_eps": 0.02, "dueling": false, "hiddens": [], "model": {"fcnet_hiddens": [64], "fcnet_activation": "relu"}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run DQN \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run APEX \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "timesteps_per_iteration": 1000, "num_gpus": 0, "min_iter_time_s": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env FrozenLake-v0 \ --run DQN \ - --stop 
'{"training_iteration": 2}' + --stop '{"training_iteration": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env FrozenLake-v0 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_sgd_iter": 10, "sgd_minibatch_size": 64, "train_batch_size": 1000, "num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env PongDeterministic-v4 \ --run DQN \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"lr": 1e-4, "schedule_max_timesteps": 2000000, "buffer_size": 10000, "exploration_fraction": 0.1, "exploration_final_eps": 0.01, "sample_batch_size": 4, "learning_starts": 10000, "target_network_update_freq": 1000, "gamma": 0.99, "prioritized_replay": true}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env MontezumaRevenge-v0 \ --run PPO \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"dim": 40, "conv_filters": [[16, [8, 8], 4], [32, [4, 4], 2], [512, [5, 5], 1]]}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run DQN \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' docker run --rm 
--shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "use_pytorch": true}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "num_workers": 1, "model": {"use_lstm": true, "max_seq_len": 100}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "num_workers": 1, "num_envs_per_worker": 10}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env Pong-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env FrozenLake-v0 \ --run PG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"sample_batch_size": 500, "num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env Pendulum-v0 \ --run DDPG \ - 
--stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run IMPALA \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run IMPALA \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run IMPALA \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1, "num_data_loader_buffers": 2, "replay_buffer_num_slots": 100, "replay_proportion": 1.0}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v0 \ --run IMPALA \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_gpus": 0, "num_workers": 2, "min_iter_time_s": 1, "num_data_loader_buffers": 2, "replay_buffer_num_slots": 100, "replay_proportion": 1.0, "model": {"use_lstm": true}}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env MountainCarContinuous-v0 \ --run DDPG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env 
MountainCarContinuous-v0 \ --run DDPG \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -240,7 +254,7 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --env Pendulum-v0 \ --run APEX_DDPG \ --ray-num-cpus 8 \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -248,22 +262,24 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --env Pendulum-v0 \ --run APEX_DDPG \ --ray-num-cpus 8 \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1, "batch_mode": "complete_episodes", "parameter_noise": true}' -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/python/ray/rllib/tests/run_silent.sh train.py \ - --env CartPole-v0 \ - --run MARWIL \ - --stop '{"training_iteration": 2}' \ - --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "shuffle_buffer_size": 10}' +# TODO(ericl): reenable the test after fix the arrow serialization error. 
+# https://github.com/ray-project/ray/pull/4127#issuecomment-468903577 +#docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +# /ray/python/ray/rllib/tests/run_silent.sh train.py \ +# --env CartPole-v0 \ +# --run MARWIL \ +# --stop '{"training_iteration": 1}' \ +# --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "shuffle_buffer_size": 10}' -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/python/ray/rllib/tests/run_silent.sh train.py \ - --env CartPole-v0 \ - --run DQN \ - --stop '{"training_iteration": 2}' \ - --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "soft_q": true}' +#docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +# /ray/python/ray/rllib/tests/run_silent.sh train.py \ +# --env CartPole-v0 \ +# --run DQN \ +# --stop '{"training_iteration": 1}' \ +# --config '{"input": "/ray/python/ray/rllib/tests/data/cartpole_small", "learning_starts": 0, "input_evaluation": ["wis", "is"], "soft_q": true}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh tests/test_local.py @@ -353,8 +369,10 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh examples/cartpole_lstm.py --stop=200 --use-prev-action-reward -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/python/ray/rllib/tests/run_silent.sh examples/custom_loss.py --iters=2 +# TODO(ericl): reenable the test after fix the arrow serialization error. 
+# https://github.com/ray-project/ray/pull/4127#issuecomment-468903577 +#docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +# /ray/python/ray/rllib/tests/run_silent.sh examples/custom_loss.py --iters=2 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh examples/custom_metrics_and_callbacks.py --num-iters=2 @@ -375,14 +393,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env PongDeterministic-v4 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "use_pytorch": true, "sample_async": false, "model": {"use_lstm": false, "grayscale": true, "zero_mean": false, "dim": 84}, "preprocessor_pref": "rllib"}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/python/ray/rllib/tests/run_silent.sh train.py \ --env CartPole-v1 \ --run A3C \ - --stop '{"training_iteration": 2}' \ + --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2, "use_pytorch": true, "sample_async": false}' docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ @@ -392,6 +410,3 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --stop='{"timesteps_total": 40000}' \ --ray-object-store-memory=500000000 \ --config '{"num_workers": 1, "num_gpus": 0, "num_envs_per_worker": 64, "sample_batch_size": 50, "train_batch_size": 50, "learner_queue_size": 1}' - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - python /ray/python/ray/rllib/agents/impala/vtrace_test.py diff --git a/ci/jenkins_tests/run_tune_tests.sh b/ci/jenkins_tests/run_tune_tests.sh index e70cd60d5636..6e8e63aa607b 100755 --- a/ci/jenkins_tests/run_tune_tests.sh +++ b/ci/jenkins_tests/run_tune_tests.sh @@ -11,6 +11,7 @@ SHM_SIZE=$2 DOCKER_SHA=$3 ROOT_DIR=$(cd "$(dirname 
"${BASH_SOURCE:-$0}")"; pwd) +SUPPRESS_OUTPUT=$ROOT_DIR/../suppress_output if [ "$MEMORY_SIZE" == "" ]; then MEMORY_SIZE="20G" @@ -30,46 +31,46 @@ fi echo "Using Docker image" $DOCKER_SHA -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ pytest /ray/python/ray/tune/tests/test_cluster.py -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_ray.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/pbt_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/hyperband_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/async_hyperband_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_ray_hyperband.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_async_hyperband.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT 
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/logging_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/bayesopt_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/hyperopt_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} -e SIGOPT_KEY $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} -e SIGOPT_KEY $DOCKER_SHA \ python /ray/python/ray/tune/examples/sigopt_example.py \ --smoke-test @@ -78,21 +79,21 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} -e SIGOPT_KEY $DO # python3 /ray/python/ray/tune/examples/nevergrad_example.py \ # --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_keras.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/mnist_pytorch.py --smoke-test --no-cuda -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/mnist_pytorch_trainable.py \ --smoke-test --no-cuda -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} 
--memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/genetic_example.py \ --smoke-test -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ +$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/tune/examples/skopt_example.py \ --smoke-test diff --git a/ci/long_running_tests/config.yaml b/ci/long_running_tests/config.yaml index 76254385f124..8112bd522e87 100644 --- a/ci/long_running_tests/config.yaml +++ b/ci/long_running_tests/config.yaml @@ -13,7 +13,7 @@ auth: ssh_user: ubuntu head_node: - InstanceType: m5.2xlarge + InstanceType: m5.xlarge ImageId: ami-0def3275 # Default Ubuntu 16.04 AMI. # Set primary volume to 25 GiB @@ -51,7 +51,7 @@ setup_commands: # - git clone https://github.com/ray-project/ray || true # - cd ray/python; git checkout master; git pull; pip install -e . --verbose # Install nightly Ray wheels. - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl - pip install ray[rllib] ray[debug] tensorflow - pip install -U dask # fix error importing lz4 diff --git a/ci/long_running_tests/start_workloads.sh b/ci/long_running_tests/start_workloads.sh index f68a774a20b4..21fdf2bb4093 100755 --- a/ci/long_running_tests/start_workloads.sh +++ b/ci/long_running_tests/start_workloads.sh @@ -60,5 +60,12 @@ done echo "" echo "" +echo "To shut down all instances, run the following." +echo " $ROOT_DIR/shut_down_workloads.sh" + +echo "" +echo "" + echo "To check up on the scripts, run the following." 
-echo " $ROOT_DIR/check_workloads.sh" +echo " $ROOT_DIR/check_workloads.sh --load" +echo " $ROOT_DIR/check_workloads.sh --logs" diff --git a/ci/long_running_tests/workloads/actor_deaths.py b/ci/long_running_tests/workloads/actor_deaths.py new file mode 100644 index 000000000000..55f05c34c5d0 --- /dev/null +++ b/ci/long_running_tests/workloads/actor_deaths.py @@ -0,0 +1,107 @@ +# This workload tests repeatedly killing actors and submitting tasks to them. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import sys +import time + +import ray +from ray.tests.cluster_utils import Cluster + +num_redis_shards = 1 +redis_max_memory = 10**8 +object_store_memory = 10**8 +num_nodes = 2 + +message = ("Make sure there is enough memory on this machine to run this " + "workload. We divide the system memory by 2 to provide a buffer.") +assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory < + ray.utils.get_system_memory() / 2) + +# Simulate a cluster on one machine. + +cluster = Cluster() +for i in range(num_nodes): + cluster.add_node( + redis_port=6379 if i == 0 else None, + num_redis_shards=num_redis_shards if i == 0 else None, + num_cpus=8, + num_gpus=0, + resources={str(i): 2}, + object_store_memory=object_store_memory, + redis_max_memory=redis_max_memory) +ray.init(redis_address=cluster.redis_address) + +# Run the workload. + +num_parents = 5 +num_children = 5 +death_probability = 0.95 + + +@ray.remote +class Child(object): + def __init__(self, death_probability): + self.death_probability = death_probability + + def ping(self): + # Exit process with some probability. 
+ exit_chance = np.random.rand() + if exit_chance > self.death_probability: + sys.exit(-1) + + +@ray.remote +class Parent(object): + def __init__(self, num_children, death_probability): + self.death_probability = death_probability + self.children = [ + Child.remote(death_probability) for _ in range(num_children) + ] + + def ping(self, num_pings): + children_outputs = [] + for _ in range(num_pings): + children_outputs += [ + child.ping.remote() for child in self.children + ] + try: + ray.get(children_outputs) + except Exception: + # Replace the children if one of them died. + self.__init__(len(self.children), self.death_probability) + + def kill(self): + # Clean up children. + ray.get([child.__ray_terminate__.remote() for child in self.children]) + + +parents = [ + Parent.remote(num_children, death_probability) for _ in range(num_parents) +] + +iteration = 0 +start_time = time.time() +previous_time = start_time +while True: + ray.get([parent.ping.remote(10) for parent in parents]) + + # Kill a parent actor with some probability. 
+ exit_chance = np.random.rand() + if exit_chance > death_probability: + parent_index = np.random.randint(len(parents)) + parents[parent_index].kill.remote() + parents[parent_index] = Parent.remote(num_children, death_probability) + + new_time = time.time() + print("Iteration {}:\n" + " - Iteration time: {}.\n" + " - Absolute time: {}.\n" + " - Total elapsed time: {}.".format( + iteration, new_time - previous_time, new_time, + new_time - start_time)) + previous_time = new_time + iteration += 1 diff --git a/ci/long_running_tests/workloads/workload_apex.py b/ci/long_running_tests/workloads/apex.py similarity index 100% rename from ci/long_running_tests/workloads/workload_apex.py rename to ci/long_running_tests/workloads/apex.py diff --git a/ci/long_running_tests/workloads/workload_impala.py b/ci/long_running_tests/workloads/impala.py similarity index 100% rename from ci/long_running_tests/workloads/workload_impala.py rename to ci/long_running_tests/workloads/impala.py diff --git a/ci/long_running_tests/workloads/workload2.py b/ci/long_running_tests/workloads/many_actor_tasks.py similarity index 82% rename from ci/long_running_tests/workloads/workload2.py rename to ci/long_running_tests/workloads/many_actor_tasks.py index 75fefb05873c..c37b22ccc3d1 100644 --- a/ci/long_running_tests/workloads/workload2.py +++ b/ci/long_running_tests/workloads/many_actor_tasks.py @@ -26,7 +26,7 @@ cluster.add_node( redis_port=6379 if i == 0 else None, num_redis_shards=num_redis_shards if i == 0 else None, - num_cpus=2, + num_cpus=5, num_gpus=0, resources={str(i): 2}, object_store_memory=object_store_memory, @@ -36,7 +36,9 @@ # Run the workload. 
-@ray.remote +# TODO (williamma12): Remove the num_cpus argument once +# https://github.com/ray-project/ray/issues/4312 gets resolved +@ray.remote(num_cpus=0.1) class Actor(object): def __init__(self): self.value = 0 @@ -45,8 +47,10 @@ def method(self): self.value += 1 +# TODO (williamma12): Update the actors to each have only 0.1 of a cpu once +# https://github.com/ray-project/ray/issues/4312 gets resolved. actors = [ - Actor._remote([], {}, num_cpus=0.1, resources={str(i % num_nodes): 0.1}) + Actor._remote([], {}, resources={str(i % num_nodes): 0.1}) for i in range(num_nodes * 5) ] diff --git a/ci/long_running_tests/workloads/many_drivers.py b/ci/long_running_tests/workloads/many_drivers.py new file mode 100644 index 000000000000..b3fb4d1ee296 --- /dev/null +++ b/ci/long_running_tests/workloads/many_drivers.py @@ -0,0 +1,105 @@ +# This workload tests many drivers using the same cluster. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import ray +from ray.tests.cluster_utils import Cluster +from ray.tests.utils import run_string_as_driver + +num_redis_shards = 5 +redis_max_memory = 10**8 +object_store_memory = 10**8 +num_nodes = 4 + +message = ("Make sure there is enough memory on this machine to run this " + "workload. We divide the system memory by 2 to provide a buffer.") +assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory < + ray.utils.get_system_memory() / 2) + +# Simulate a cluster on one machine. + +cluster = Cluster() +for i in range(num_nodes): + cluster.add_node( + redis_port=6379 if i == 0 else None, + num_redis_shards=num_redis_shards if i == 0 else None, + num_cpus=4, + num_gpus=0, + resources={str(i): 5}, + object_store_memory=object_store_memory, + redis_max_memory=redis_max_memory) +ray.init(redis_address=cluster.redis_address) + +# Run the workload. 
+ +# Define a driver script that runs a few tasks and actors on each node in the +# cluster. +driver_script = """ +import ray + +ray.init(redis_address="{}") + +num_nodes = {} + + +@ray.remote +def f(): + return 1 + + +@ray.remote +class Actor(object): + def method(self): + return 1 + + +for _ in range(5): + for i in range(num_nodes): + assert (ray.get( + f._remote(args=[], kwargs={{}}, resources={{str(i): 1}})) == 1) + actor = Actor._remote(args=[], kwargs={{}}, resources={{str(i): 1}}) + assert ray.get(actor.method.remote()) == 1 + +print("success") +""".format(cluster.redis_address, num_nodes) + + +@ray.remote +def run_driver(): + output = run_string_as_driver(driver_script) + assert "success" in output + + +iteration = 0 +running_ids = [ + run_driver._remote( + args=[], kwargs={}, num_cpus=0, resources={str(i): 0.01}) + for i in range(num_nodes) +] +start_time = time.time() +previous_time = start_time +while True: + # Wait for a driver to finish and start a new driver. + [ready_id], running_ids = ray.wait(running_ids, num_returns=1) + ray.get(ready_id) + + running_ids.append( + run_driver._remote( + args=[], + kwargs={}, + num_cpus=0, + resources={str(iteration % num_nodes): 0.01})) + + new_time = time.time() + print("Iteration {}:\n" + " - Iteration time: {}.\n" + " - Absolute time: {}.\n" + " - Total elapsed time: {}.".format( + iteration, new_time - previous_time, new_time, + new_time - start_time)) + previous_time = new_time + iteration += 1 diff --git a/ci/long_running_tests/workloads/workload1.py b/ci/long_running_tests/workloads/many_tasks.py similarity index 100% rename from ci/long_running_tests/workloads/workload1.py rename to ci/long_running_tests/workloads/many_tasks.py diff --git a/ci/long_running_tests/workloads/workload3.py b/ci/long_running_tests/workloads/node_failures.py similarity index 100% rename from ci/long_running_tests/workloads/workload3.py rename to ci/long_running_tests/workloads/node_failures.py diff --git 
a/ci/long_running_tests/workloads/workload_pbt.py b/ci/long_running_tests/workloads/pbt.py similarity index 100% rename from ci/long_running_tests/workloads/workload_pbt.py rename to ci/long_running_tests/workloads/pbt.py diff --git a/ci/stress_tests/application_cluster_template.yaml b/ci/stress_tests/application_cluster_template.yaml index c8a3f817c11e..0d7baedb78fb 100644 --- a/ci/stress_tests/application_cluster_template.yaml +++ b/ci/stress_tests/application_cluster_template.yaml @@ -90,8 +90,8 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<>>/bin:$PATH"' >> ~/.bashrc - - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-<<>>-manylinux1_x86_64.whl - - rllib || pip install -U ray-0.7.0.dev0-<<>>-manylinux1_x86_64.whl[rllib] + - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-<<>>-manylinux1_x86_64.whl + - rllib || pip install -U ray-0.7.0.dev1-<<>>-manylinux1_x86_64.whl[rllib] - pip install tensorflow-gpu==1.12.0 - echo "sudo halt" | at now + 60 minutes # Consider uncommenting these if you also want to run apt-get commands during setup diff --git a/ci/stress_tests/stress_testing_config.yaml b/ci/stress_tests/stress_testing_config.yaml index 70b61d3c107a..1bee8248eacc 100644 --- a/ci/stress_tests/stress_testing_config.yaml +++ b/ci/stress_tests/stress_testing_config.yaml @@ -100,7 +100,7 @@ setup_commands: # - git clone https://github.com/ray-project/ray || true - pip install boto3==1.4.8 cython==0.29.0 # - cd ray/python; git checkout master; git pull; pip install -e . 
--verbose - - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl - echo "sudo halt" | at now + 60 minutes # Custom commands that will be run on the head node after common setup. diff --git a/ci/suppress_output b/ci/suppress_output new file mode 100755 index 000000000000..18652d1ec86c --- /dev/null +++ b/ci/suppress_output @@ -0,0 +1,34 @@ +#!/bin/bash +# Run a command, suppressing output unless it hangs or crashes. + +TMPFILE=`mktemp` +COMMAND="$@" +PID=$$ + +# Print output to avoid travis killing us +watchdog() { + for i in `seq 5 5 120`; do + sleep 300 + echo "This command has been running for more than $i minutes..." + done + echo "Command timed out after 2h, dumping logs:" + cat $TMPFILE + echo "TIMED OUT" + kill -SIGKILL $PID +} + +watchdog 2>/dev/null & +WATCHDOG_PID=$! + +time $COMMAND >$TMPFILE 2>&1 + +CODE=$?
+if [ $CODE != 0 ]; then + cat $TMPFILE + echo "FAILED $CODE" + kill $WATCHDOG_PID + exit $CODE +fi + +kill $WATCHDOG_PID +exit 0 diff --git a/ci/travis/install-dependencies.sh b/ci/travis/install-dependencies.sh index 287fa62a427a..5f92eb43ab11 100755 --- a/ci/travis/install-dependencies.sh +++ b/ci/travis/install-dependencies.sh @@ -24,8 +24,8 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-4.5.4-Linux-x86_64.sh -O miniconda.sh -nv bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" - pip install -q scipy tensorflow cython==0.29.0 gym==0.10.11 opencv-python-headless pyyaml pandas==0.23.4 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky networkx + pip install -q scipy tensorflow cython==0.29.0 gym opencv-python-headless pyyaml pandas==0.23.4 requests \ + feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky networkx tabulate elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then sudo apt-get update sudo apt-get install -y python-dev python-numpy build-essential curl unzip tmux gdb @@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q scipy tensorflow cython==0.29.0 gym opencv-python-headless pyyaml pandas==0.23.4 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky networkx + feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky networkx tabulate elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then # check that brew is installed which -s brew @@ -49,8 +49,8 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-4.5.4-MacOSX-x86_64.sh -O miniconda.sh -nv bash miniconda.sh -b -p $HOME/miniconda 
export PATH="$HOME/miniconda/bin:$PATH" - pip install -q cython==0.29.0 tensorflow gym==0.10.11 opencv-python-headless pyyaml pandas==0.23.4 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky networkx + pip install -q cython==0.29.0 tensorflow gym opencv-python-headless pyyaml pandas==0.23.4 requests \ + feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky networkx tabulate elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then # check that brew is installed which -s brew @@ -66,7 +66,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" pip install -q cython==0.29.0 tensorflow gym opencv-python-headless pyyaml pandas==0.23.4 requests \ - feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky networkx + feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky networkx tabulate elif [[ "$LINT" == "1" ]]; then sudo apt-get update sudo apt-get install -y build-essential curl unzip diff --git a/cmake/Modules/ArrowExternalProject.cmake b/cmake/Modules/ArrowExternalProject.cmake index b6cba2e0072c..31dccb1add16 100644 --- a/cmake/Modules/ArrowExternalProject.cmake +++ b/cmake/Modules/ArrowExternalProject.cmake @@ -20,12 +20,12 @@ set(arrow_URL https://github.com/ray-project/arrow.git) # Arrow often rewrites git history and invalidates certain commits. 
# It has been patched to fix an upstream symbol clash with TensorFlow, # the patch is available at -# https://github.com/ray-project/arrow/commit/007e1ca289e979bac80231fa9ee7510be744b60b +# https://github.com/ray-project/arrow/commit/68299c5f48289c4f39a948cbd0426b9199a9df1e # See the discussion in https://github.com/apache/arrow/pull/3177 # WARNING: If the arrow version is updated, you need to also update the # SETUPTOOLS_SCM_PRETEND_VERSION version string in the ThirdpartyToolchain.cmake # file -set(arrow_TAG 007e1ca289e979bac80231fa9ee7510be744b60b) +set(arrow_TAG 68299c5f48289c4f39a948cbd0426b9199a9df1e) set(ARROW_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/external/arrow-install) set(ARROW_HOME ${ARROW_INSTALL_PREFIX}) diff --git a/doc/source/installation.rst b/doc/source/installation.rst index a54c5405c236..dab5e16a4283 100644 --- a/doc/source/installation.rst +++ b/doc/source/installation.rst @@ -34,16 +34,16 @@ Here are links to the latest wheels (which are built off of master). To install =================== =================== -.. _`Linux Python 3.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp37-cp37m-manylinux1_x86_64.whl -.. _`Linux Python 3.6`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl -.. _`Linux Python 3.5`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-manylinux1_x86_64.whl -.. _`Linux Python 3.4`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp34-cp34m-manylinux1_x86_64.whl -.. _`Linux Python 2.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl -.. _`MacOS Python 3.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp37-cp37m-macosx_10_6_intel.whl -.. _`MacOS Python 3.6`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-macosx_10_6_intel.whl -.. 
_`MacOS Python 3.5`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-macosx_10_6_intel.whl -.. _`MacOS Python 3.4`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp34-cp34m-macosx_10_6_intel.whl -.. _`MacOS Python 2.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27m-macosx_10_6_intel.whl +.. _`Linux Python 3.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp37-cp37m-manylinux1_x86_64.whl +.. _`Linux Python 3.6`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl +.. _`Linux Python 3.5`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-manylinux1_x86_64.whl +.. _`Linux Python 3.4`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp34-cp34m-manylinux1_x86_64.whl +.. _`Linux Python 2.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl +.. _`MacOS Python 3.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp37-cp37m-macosx_10_6_intel.whl +.. _`MacOS Python 3.6`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-macosx_10_6_intel.whl +.. _`MacOS Python 3.5`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-macosx_10_6_intel.whl +.. _`MacOS Python 3.4`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp34-cp34m-macosx_10_6_intel.whl +.. _`MacOS Python 2.7`: https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27m-macosx_10_6_intel.whl Building Ray from source diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index a03077c7ed63..7180099fca04 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -66,7 +66,7 @@ For a full runnable code example using the custom environment API, see `custom_e .. 
warning:: - Please do **not** try to use gym registration to register custom environments. The gym registry is not compatible with Ray. Instead, always use the registration flows documented above. + The gym registry is not compatible with Ray. Instead, always use the registration flows documented above to ensure Ray workers can access the environment. Configuring Environments ------------------------ @@ -119,7 +119,7 @@ Vectorized RLlib will auto-vectorize Gym envs for batch evaluation if the ``num_envs_per_worker`` config is set, or you can define a custom environment class that subclasses `VectorEnv `__ to implement ``vector_step()`` and ``vector_reset()``. -Note that auto-vectorization only applies to policy inference by default. This means that policy inference will be batched, but your envs will still be stepped one at a time. If you would like your envs to be stepped in parallel, you can set ``"remote_worker_envs": True``. This will create env instances in Ray actors and step them in parallel. These remote processes introduce communication overheads, so this only helps if your env is very expensive to step. +Note that auto-vectorization only applies to policy inference by default. This means that policy inference will be batched, but your envs will still be stepped one at a time. If you would like your envs to be stepped in parallel, you can set ``"remote_worker_envs": True`` or ``"async_remote_worker_envs": True``. This will create env instances in Ray actors and step them in parallel. These remote processes introduce communication overheads, so this only helps if your env is very expensive to step. Multi-Agent and Hierarchical ---------------------------- @@ -319,11 +319,9 @@ Note that envs can read from different partitions of the logs based on the ``wor .. seealso:: - `RLlib I/O `__ provides higher-level interfaces for working with offline experience datasets. 
+ `Offline Datasets `__ provide higher-level interfaces for working with offline experience datasets. -Batch Asynchronous ------------------- +Advanced Integrations +--------------------- -The lowest-level "catch-all" environment supported by RLlib is `BaseEnv `__. BaseEnv models multiple agents executing asynchronously in multiple environments. A call to ``poll()`` returns observations from ready agents keyed by their environment and agent ids, and actions for those agents can be sent back via ``send_actions()``. This interface can be subclassed directly to support batched simulators such as `ELF `__. - -Under the hood, all other envs are converted to BaseEnv by RLlib so that there is a common internal path for policy evaluation. +For more complex / high-performance environment integrations, you can instead extend the low-level `BaseEnv `__ class. This low-level API models multiple agents executing asynchronously in multiple environments. A call to ``BaseEnv:poll()`` returns observations from ready agents keyed by their environment and agent ids, and actions for those agents are sent back via ``BaseEnv:send_actions()``. BaseEnv is used to implement all the other env types in RLlib, so it offers a superset of their functionality. For example, ``BaseEnv`` is used to implement dynamic batching of observations for inference over `multiple simulator actors `__. diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 3ab98ce8d20b..7553de5d9319 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -40,7 +40,7 @@ Environments * `Vectorized `__ * `Multi-Agent and Hierarchical `__ * `Interfacing with External Agents `__ -* `Batch Asynchronous `__ +* `Advanced Integrations `__ Algorithms ---------- diff --git a/doc/source/tune-usage.rst b/doc/source/tune-usage.rst index c69640a64509..82ae5a33b530 100644 --- a/doc/source/tune-usage.rst +++ b/doc/source/tune-usage.rst @@ -499,6 +499,50 @@ And stopping a trial (``PUT /trials/:id``): curl -X PUT http://
:/trials/ +Tune CLI (Experimental) +----------------------- + +``tune`` has an easy-to-use command line interface (CLI) to manage and monitor your experiments on Ray. To do this, verify that you have the ``tabulate`` library installed: + +.. code-block:: bash + + $ pip install tabulate + +Here are a few examples of command line calls. + +- ``tune list-trials``: List tabular information about trials within an experiment. Add the ``--sort`` flag to sort the output by specific columns. + +.. code-block:: bash + + $ tune list-trials [EXPERIMENT_DIR] + + +------------------+-----------------------+------------+ + | trainable_name | experiment_tag | trial_id | + |------------------+-----------------------+------------| + | MyTrainableClass | 0_height=40,width=37 | 87b54a1d | + | MyTrainableClass | 1_height=21,width=70 | 23b89036 | + | MyTrainableClass | 2_height=99,width=90 | 518dbe95 | + | MyTrainableClass | 3_height=54,width=21 | 7b99a28a | + | MyTrainableClass | 4_height=90,width=69 | ae4e02fb | + +------------------+-----------------------+------------+ + Dropped columns: ['status', 'last_update_time'] + +- ``tune list-experiments``: List tabular information about experiments within a project. Add the ``--sort`` flag to sort the output by specific columns. + +.. code-block:: bash + + $ tune list-experiments [PROJECT_DIR] + + +----------------------+----------------+------------------+---------------------+ + | name | total_trials | running_trials | terminated_trials | + |----------------------+----------------+------------------+---------------------| + | pbt_test | 10 | 0 | 0 | + | test | 1 | 0 | 0 | + | hyperband_test | 1 | 0 | 1 | + +----------------------+----------------+------------------+---------------------+ + Dropped columns: ['error_trials', 'last_updated'] + + Further Questions or Issues? 
---------------------------- diff --git a/docker/examples/Dockerfile b/docker/examples/Dockerfile index df205c9a3d52..6883c5a64a0e 100644 --- a/docker/examples/Dockerfile +++ b/docker/examples/Dockerfile @@ -5,7 +5,7 @@ FROM ray-project/deploy # This updates numpy to 1.14 and mutes errors from other libraries RUN conda install -y numpy RUN apt-get install -y zlib1g-dev -RUN pip install gym[atari]==0.10.11 opencv-python-headless tensorflow lz4 keras pytest-timeout smart_open +RUN pip install gym[atari] opencv-python-headless tensorflow lz4 keras pytest-timeout smart_open RUN pip install -U h5py # Mutes FutureWarnings RUN pip install --upgrade bayesian-optimization RUN pip install --upgrade git+git://github.com/hyperopt/hyperopt.git diff --git a/docker/stress_test/Dockerfile b/docker/stress_test/Dockerfile index 94c1f2f0a401..2716670e7705 100644 --- a/docker/stress_test/Dockerfile +++ b/docker/stress_test/Dockerfile @@ -4,7 +4,7 @@ FROM ray-project/base-deps # We install ray and boto3 to enable the ray autoscaler as # a test runner. -RUN pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl boto3 +RUN pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl boto3 RUN mkdir -p /root/.ssh/ # We port the source code in so that we run the most up-to-date stress tests. diff --git a/docker/tune_test/Dockerfile b/docker/tune_test/Dockerfile index 8755af698071..42deadec8799 100644 --- a/docker/tune_test/Dockerfile +++ b/docker/tune_test/Dockerfile @@ -4,7 +4,7 @@ FROM ray-project/base-deps # We install ray and boto3 to enable the ray autoscaler as # a test runner. 
-RUN pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl boto3 +RUN pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl boto3 # We install this after the latest wheels -- this should not override the latest wheels. RUN apt-get install -y zlib1g-dev RUN pip install gym[atari]==0.10.11 opencv-python-headless tensorflow lz4 keras pytest-timeout smart_open diff --git a/java/runtime/src/main/java/org/ray/runtime/RayDevRuntime.java b/java/runtime/src/main/java/org/ray/runtime/RayDevRuntime.java index 7dffd3fd54c5..e5d7b20b1d64 100644 --- a/java/runtime/src/main/java/org/ray/runtime/RayDevRuntime.java +++ b/java/runtime/src/main/java/org/ray/runtime/RayDevRuntime.java @@ -28,4 +28,9 @@ public void shutdown() { public MockObjectStore getObjectStore() { return store; } + + @Override + public Worker getWorker() { + return ((MockRayletClient) rayletClient).getCurrentWorker(); + } } diff --git a/java/runtime/src/main/java/org/ray/runtime/RuntimeContextImpl.java b/java/runtime/src/main/java/org/ray/runtime/RuntimeContextImpl.java index f0780cc2d8cd..b0ba67a4c3f2 100644 --- a/java/runtime/src/main/java/org/ray/runtime/RuntimeContextImpl.java +++ b/java/runtime/src/main/java/org/ray/runtime/RuntimeContextImpl.java @@ -4,7 +4,6 @@ import org.ray.api.RuntimeContext; import org.ray.api.id.UniqueId; import org.ray.runtime.config.RunMode; -import org.ray.runtime.config.WorkerMode; import org.ray.runtime.task.TaskSpec; public class RuntimeContextImpl implements RuntimeContext { @@ -22,8 +21,10 @@ public UniqueId getCurrentDriverId() { @Override public UniqueId getCurrentActorId() { - Preconditions.checkState(runtime.rayConfig.workerMode == WorkerMode.WORKER); - return runtime.getWorker().getCurrentActorId(); + Worker worker = runtime.getWorker(); + Preconditions.checkState(worker != null && !worker.getCurrentActorId().isNil(), + "This method should only be 
called from an actor."); + return worker.getCurrentActorId(); } @Override diff --git a/java/runtime/src/main/java/org/ray/runtime/Worker.java b/java/runtime/src/main/java/org/ray/runtime/Worker.java index e6a069efce76..ef319ea20233 100644 --- a/java/runtime/src/main/java/org/ray/runtime/Worker.java +++ b/java/runtime/src/main/java/org/ray/runtime/Worker.java @@ -79,7 +79,6 @@ public void loop() { * Execute a task. */ public void execute(TaskSpec spec) { - LOGGER.info("Executing task {}", spec.taskId); LOGGER.debug("Executing task {}", spec); UniqueId returnId = spec.returnIds[0]; ClassLoader oldLoader = Thread.currentThread().getContextClassLoader(); @@ -123,7 +122,7 @@ public void execute(TaskSpec spec) { maybeLoadCheckpoint(result, returnId); currentActor = result; } - LOGGER.info("Finished executing task {}", spec.taskId); + LOGGER.debug("Finished executing task {}", spec.taskId); } catch (Exception e) { LOGGER.error("Error executing task " + spec, e); if (!spec.isActorCreationTask()) { diff --git a/java/runtime/src/main/java/org/ray/runtime/objectstore/MockObjectStore.java b/java/runtime/src/main/java/org/ray/runtime/objectstore/MockObjectStore.java index 3470840826d2..4b80d3e4c276 100644 --- a/java/runtime/src/main/java/org/ray/runtime/objectstore/MockObjectStore.java +++ b/java/runtime/src/main/java/org/ray/runtime/objectstore/MockObjectStore.java @@ -1,6 +1,5 @@ package org.ray.runtime.objectstore; -import java.lang.reflect.Constructor; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -65,7 +64,7 @@ public byte[] get(byte[] objectId, int timeoutMs, boolean isMetadata) { public List get(byte[][] objectIds, int timeoutMs, boolean isMetadata) { return get(objectIds, timeoutMs) .stream() - .map(data -> isMetadata ? data.data : data.metadata) + .map(data -> isMetadata ? 
data.metadata : data.data) .collect(Collectors.toList()); } @@ -93,16 +92,9 @@ public List get(byte[][] objectIds, int timeoutMs) { firstCheck = false; } ArrayList rets = new ArrayList<>(); - for (byte[] id : objectIds) { - try { - Constructor constructor = ObjectStoreData.class.getConstructor( - byte[].class, byte[].class); - constructor.setAccessible(true); - rets.add(constructor.newInstance(metadata.get(new UniqueId(id)), - data.get(new UniqueId(id)))); - } catch (Exception e) { - throw new RuntimeException(e); - } + for (byte[] objId : objectIds) { + UniqueId uniqueId = new UniqueId(objId); + rets.add(new ObjectStoreData(metadata.get(uniqueId), data.get(uniqueId))); } return rets; } diff --git a/java/runtime/src/main/java/org/ray/runtime/objectstore/ObjectStoreProxy.java b/java/runtime/src/main/java/org/ray/runtime/objectstore/ObjectStoreProxy.java index e3d8f2e586a6..d1d9102f798b 100644 --- a/java/runtime/src/main/java/org/ray/runtime/objectstore/ObjectStoreProxy.java +++ b/java/runtime/src/main/java/org/ray/runtime/objectstore/ObjectStoreProxy.java @@ -78,10 +78,8 @@ public List> get(List ids, int timeoutMs) { List> results = new ArrayList<>(); for (int i = 0; i < dataAndMetaList.size(); i++) { - // TODO(hchen): Plasma API returns data and metadata in wrong order, this should be fixed - // from the arrow side first. 
- byte[] meta = dataAndMetaList.get(i).data; - byte[] data = dataAndMetaList.get(i).metadata; + byte[] meta = dataAndMetaList.get(i).metadata; + byte[] data = dataAndMetaList.get(i).data; GetResult result; if (meta != null) { diff --git a/java/runtime/src/main/java/org/ray/runtime/raylet/MockRayletClient.java b/java/runtime/src/main/java/org/ray/runtime/raylet/MockRayletClient.java index f16c8f9f8cfc..e44fd1014a63 100644 --- a/java/runtime/src/main/java/org/ray/runtime/raylet/MockRayletClient.java +++ b/java/runtime/src/main/java/org/ray/runtime/raylet/MockRayletClient.java @@ -39,6 +39,7 @@ public class MockRayletClient implements RayletClient { private final ExecutorService exec; private final Deque idleWorkers; private final Map actorWorkers; + private final ThreadLocal currentWorker; public MockRayletClient(RayDevRuntime runtime, int numberThreads) { this.runtime = runtime; @@ -48,6 +49,7 @@ public MockRayletClient(RayDevRuntime runtime, int numberThreads) { exec = Executors.newFixedThreadPool(numberThreads); idleWorkers = new LinkedList<>(); actorWorkers = new HashMap<>(); + currentWorker = new ThreadLocal<>(); } public synchronized void onObjectPut(UniqueId id) { @@ -60,22 +62,28 @@ public synchronized void onObjectPut(UniqueId id) { } } + public Worker getCurrentWorker() { + return currentWorker.get(); + } + /** * Get a worker from the worker pool to run the given task. 
*/ private Worker getWorker(TaskSpec task) { - if (task.isActorTask()) { - return actorWorkers.get(task.actorId); - } Worker worker; - if (idleWorkers.size() > 0) { - worker = idleWorkers.pop(); + if (task.isActorTask()) { + worker = actorWorkers.get(task.actorId); } else { - worker = new Worker(runtime); - } - if (task.isActorCreationTask()) { - actorWorkers.put(task.actorCreationId, worker); + if (idleWorkers.size() > 0) { + worker = idleWorkers.pop(); + } else { + worker = new Worker(runtime); + } + if (task.isActorCreationTask()) { + actorWorkers.put(task.actorCreationId, worker); + } } + currentWorker.set(worker); return worker; } @@ -83,6 +91,7 @@ private Worker getWorker(TaskSpec task) { * Return the worker to the worker pool. */ private void returnWorker(Worker worker) { + currentWorker.remove(); idleWorkers.push(worker); } @@ -105,9 +114,7 @@ public synchronized void submitTask(TaskSpec task) { new byte[]{}, new byte[]{}); } } finally { - if (!task.isActorCreationTask() && !task.isActorTask()) { - returnWorker(worker); - } + returnWorker(worker); } }); } else { diff --git a/java/runtime/src/main/resources/ray.default.conf b/java/runtime/src/main/resources/ray.default.conf index 81dab4d3d017..5faeda7cfedf 100644 --- a/java/runtime/src/main/resources/ray.default.conf +++ b/java/runtime/src/main/resources/ray.default.conf @@ -100,7 +100,7 @@ ray { // ---------------------------- dev-runtime { // Number of threads that you process tasks - execution-parallelism: 5 + execution-parallelism: 10 } } diff --git a/java/test.sh b/java/test.sh index 1c6370d1fe8e..b3e889371bcd 100755 --- a/java/test.sh +++ b/java/test.sh @@ -2,36 +2,25 @@ # Cause the script to exit if a single command fails. set -e - # Show explicitly which commands are currently running. set -x ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -$ROOT_DIR/../build.sh -l java pushd $ROOT_DIR/../java +echo "Compiling Java code." 
mvn clean install -Dmaven.test.skip -check_style=$(mvn checkstyle:check) -echo "${check_style}" -[[ ${check_style} =~ "BUILD FAILURE" ]] && exit 1 - -# test raylet -mvn test | tee mvn_test -if [ `grep -c "BUILD FAILURE" mvn_test` -eq '0' ]; then - rm mvn_test - echo "Tests passed under CLUSTER mode!" -else - rm mvn_test - exit 1 -fi -# test raylet under SINGLE_PROCESS mode -mvn test -Dray.run-mode=SINGLE_PROCESS | tee dev_mvn_test -if [ `grep -c "BUILD FAILURE" dev_mvn_test` -eq '0' ]; then - rm dev_mvn_test - echo "Tests passed under SINGLE_PROCESS mode!" -else - rm dev_mvn_test - exit 1 -fi + +echo "Checking code format." +mvn checkstyle:check + +echo "Running tests under cluster mode." +ENABLE_MULTI_LANGUAGE_TESTS=1 mvn test + +echo "Running tests under single-process mode." +mvn test -Dray.run-mode=SINGLE_PROCESS + +set +x +set +e popd diff --git a/java/test/src/main/java/org/ray/api/TestUtils.java b/java/test/src/main/java/org/ray/api/TestUtils.java index 18b7230eec79..9b1ea915b409 100644 --- a/java/test/src/main/java/org/ray/api/TestUtils.java +++ b/java/test/src/main/java/org/ray/api/TestUtils.java @@ -9,7 +9,7 @@ public class TestUtils { public static void skipTestUnderSingleProcess() { AbstractRayRuntime runtime = (AbstractRayRuntime)Ray.internal(); if (runtime.getRayConfig().runMode == RunMode.SINGLE_PROCESS) { - throw new SkipException("Skip case."); + throw new SkipException("This test doesn't work under single-process mode."); } } } diff --git a/java/test/src/main/java/org/ray/api/test/ActorReconstructionTest.java b/java/test/src/main/java/org/ray/api/test/ActorReconstructionTest.java index 12d7d1a8a931..e575daa84f13 100644 --- a/java/test/src/main/java/org/ray/api/test/ActorReconstructionTest.java +++ b/java/test/src/main/java/org/ray/api/test/ActorReconstructionTest.java @@ -44,13 +44,9 @@ public int getPid() { } } - @Override - public void beforeEachCase() { - TestUtils.skipTestUnderSingleProcess(); - } - @Test public void testActorReconstruction() 
throws InterruptedException, IOException { + TestUtils.skipTestUnderSingleProcess(); ActorCreationOptions options = new ActorCreationOptions(new HashMap<>(), 1); RayActor actor = Ray.createActor(Counter::new, options); // Call increase 3 times. @@ -130,6 +126,8 @@ public void checkpointExpired(UniqueId actorId, UniqueId checkpointId) { @Test public void testActorCheckpointing() throws IOException, InterruptedException { + TestUtils.skipTestUnderSingleProcess(); + ActorCreationOptions options = new ActorCreationOptions(new HashMap<>(), 1); RayActor actor = Ray.createActor(CheckpointableCounter::new, options); // Call increase 3 times. @@ -138,8 +136,6 @@ public void testActorCheckpointing() throws IOException, InterruptedException { } // Assert that the actor wasn't resumed from a checkpoint. Assert.assertFalse(Ray.call(CheckpointableCounter::wasResumedFromCheckpoint, actor).get()); - - // Kill the actor process. int pid = Ray.call(CheckpointableCounter::getPid, actor).get(); Runtime.getRuntime().exec("kill -9 " + pid); // Wait for the actor to be killed. diff --git a/java/test/src/main/java/org/ray/api/test/ActorTest.java b/java/test/src/main/java/org/ray/api/test/ActorTest.java index 96be700b9002..876ab322d66d 100644 --- a/java/test/src/main/java/org/ray/api/test/ActorTest.java +++ b/java/test/src/main/java/org/ray/api/test/ActorTest.java @@ -5,6 +5,7 @@ import org.ray.api.Ray; import org.ray.api.RayActor; import org.ray.api.RayObject; +import org.ray.api.TestUtils; import org.ray.api.annotation.RayRemote; import org.ray.api.exception.UnreconstructableException; import org.ray.api.id.UniqueId; @@ -90,6 +91,7 @@ public void testForkingActorHandle() { @Test public void testUnreconstructableActorObject() throws InterruptedException { + TestUtils.skipTestUnderSingleProcess(); RayActor counter = Ray.createActor(Counter::new, 100); // Call an actor method. 
RayObject value = Ray.call(Counter::getValue, counter); diff --git a/java/test/src/main/java/org/ray/api/test/BaseTest.java b/java/test/src/main/java/org/ray/api/test/BaseTest.java index e84e8fadf8ea..b67a8f64c7ce 100644 --- a/java/test/src/main/java/org/ray/api/test/BaseTest.java +++ b/java/test/src/main/java/org/ray/api/test/BaseTest.java @@ -1,27 +1,31 @@ package org.ray.api.test; +import java.lang.reflect.Method; import org.ray.api.Ray; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; public class BaseTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseTest.class); + @BeforeMethod - public void setUp() { + public void setUpBase(Method method) { + LOGGER.info("===== Running test: " + + method.getDeclaringClass().getName() + "." + method.getName()); System.setProperty("ray.home", "../.."); System.setProperty("ray.resources", "CPU:4,RES-A:4"); - beforeInitRay(); Ray.init(); - beforeEachCase(); } @AfterMethod - public void tearDown() { + public void tearDownBase() { // TODO(qwang): This is double check to check that the socket file is removed actually. // We could not enable this until `systemInfo` enabled. 
//File rayletSocketFIle = new File(Ray.systemInfo().rayletSocketName()); Ray.shutdown(); - afterShutdownRay(); //remove raylet socket file //rayletSocketFIle.delete(); @@ -31,15 +35,4 @@ public void tearDown() { System.clearProperty("ray.resources"); } - protected void beforeInitRay() { - - } - - protected void afterShutdownRay() { - - } - - protected void beforeEachCase() { - - } } diff --git a/java/test/src/main/java/org/ray/api/test/ClientExceptionTest.java b/java/test/src/main/java/org/ray/api/test/ClientExceptionTest.java index 0c0433299386..e9f53dddd794 100644 --- a/java/test/src/main/java/org/ray/api/test/ClientExceptionTest.java +++ b/java/test/src/main/java/org/ray/api/test/ClientExceptionTest.java @@ -17,13 +17,9 @@ public class ClientExceptionTest extends BaseTest { private static final Logger LOGGER = LoggerFactory.getLogger(ClientExceptionTest.class); - @Override - public void beforeEachCase() { - TestUtils.skipTestUnderSingleProcess(); - } - @Test public void testWaitAndCrash() { + TestUtils.skipTestUnderSingleProcess(); UniqueId randomId = UniqueId.randomId(); RayObject notExisting = new RayObjectImpl(randomId); diff --git a/java/test/src/main/java/org/ray/api/test/FailureTest.java b/java/test/src/main/java/org/ray/api/test/FailureTest.java index f74860177909..6d47a2fc99fa 100644 --- a/java/test/src/main/java/org/ray/api/test/FailureTest.java +++ b/java/test/src/main/java/org/ray/api/test/FailureTest.java @@ -55,30 +55,29 @@ private static void assertTaskFailedWithRayTaskException(RayObject rayObject) } } - @Override - public void beforeEachCase() { - TestUtils.skipTestUnderSingleProcess(); - } - @Test public void testNormalTaskFailure() { + TestUtils.skipTestUnderSingleProcess(); assertTaskFailedWithRayTaskException(Ray.call(FailureTest::badFunc)); } @Test public void testActorCreationFailure() { + TestUtils.skipTestUnderSingleProcess(); RayActor actor = Ray.createActor(BadActor::new, true); 
assertTaskFailedWithRayTaskException(Ray.call(BadActor::badMethod, actor)); } @Test public void testActorTaskFailure() { + TestUtils.skipTestUnderSingleProcess(); RayActor actor = Ray.createActor(BadActor::new, false); assertTaskFailedWithRayTaskException(Ray.call(BadActor::badMethod, actor)); } @Test public void testWorkerProcessDying() { + TestUtils.skipTestUnderSingleProcess(); try { Ray.call(FailureTest::badFunc2).get(); Assert.fail("This line shouldn't be reached."); @@ -90,6 +89,7 @@ public void testWorkerProcessDying() { @Test public void testActorProcessDying() { + TestUtils.skipTestUnderSingleProcess(); RayActor actor = Ray.createActor(BadActor::new, false); try { Ray.call(BadActor::badMethod2, actor).get(); diff --git a/java/test/src/main/java/org/ray/api/test/MultiLanguageClusterTest.java b/java/test/src/main/java/org/ray/api/test/MultiLanguageClusterTest.java index b3a8e87b7326..c81a148980f9 100644 --- a/java/test/src/main/java/org/ray/api/test/MultiLanguageClusterTest.java +++ b/java/test/src/main/java/org/ray/api/test/MultiLanguageClusterTest.java @@ -2,7 +2,7 @@ import com.google.common.collect.ImmutableList; import java.io.File; -import java.lang.ProcessBuilder.Redirect; +import java.lang.reflect.Method; import java.util.List; import java.util.concurrent.TimeUnit; import org.ray.api.Ray; @@ -33,13 +33,13 @@ public static String echo(String word) { /** * Execute an external command. + * * @return Whether the command succeeded. 
*/ private boolean executeCommand(List command, int waitTimeoutSeconds) { try { LOGGER.info("Executing command: {}", String.join(" ", command)); - Process process = new ProcessBuilder(command).redirectOutput(Redirect.INHERIT) - .redirectError(Redirect.INHERIT).start(); + Process process = new ProcessBuilder(command).inheritIO().start(); process.waitFor(waitTimeoutSeconds, TimeUnit.SECONDS); return process.exitValue() == 0; } catch (Exception e) { @@ -48,11 +48,12 @@ private boolean executeCommand(List command, int waitTimeoutSeconds) { } @BeforeMethod - public void setUp() { - // Check whether 'ray' command is installed. - boolean rayCommandExists = executeCommand(ImmutableList.of("which", "ray"), 5); - if (!rayCommandExists) { - throw new SkipException("Skipping test, because ray command doesn't exist."); + public void setUp(Method method) { + String testName = method.getName(); + if (!"1".equals(System.getenv("ENABLE_MULTI_LANGUAGE_TESTS"))) { + LOGGER.info("Skip " + testName + + " because env variable ENABLE_MULTI_LANGUAGE_TESTS isn't set"); + throw new SkipException("Skip test."); } // Delete existing socket files. @@ -64,15 +65,20 @@ public void setUp() { } // Start ray cluster. 
+ String testDir = System.getProperty("user.dir"); + String workerOptions = String.format("-Dray.home=%s/../../", testDir); + workerOptions += + " -classpath " + String.format("%s/../../build/java/*:%s/target/*", testDir, testDir); final List startCommand = ImmutableList.of( "ray", "start", "--head", "--redis-port=6379", - "--include-java", String.format("--plasma-store-socket-name=%s", PLASMA_STORE_SOCKET_NAME), String.format("--raylet-socket-name=%s", RAYLET_SOCKET_NAME), - "--java-worker-options=-classpath ../../build/java/*:../../java/test/target/*" + "--load-code-from-local", + "--include-java", + "--java-worker-options=" + workerOptions ); if (!executeCommand(startCommand, 10)) { throw new RuntimeException("Couldn't start ray cluster."); diff --git a/java/test/src/main/java/org/ray/api/test/MultiThreadingTest.java b/java/test/src/main/java/org/ray/api/test/MultiThreadingTest.java index 6bbd39ffa20b..6289d1cd7170 100644 --- a/java/test/src/main/java/org/ray/api/test/MultiThreadingTest.java +++ b/java/test/src/main/java/org/ray/api/test/MultiThreadingTest.java @@ -12,6 +12,7 @@ import org.ray.api.Ray; import org.ray.api.RayActor; import org.ray.api.RayObject; +import org.ray.api.TestUtils; import org.ray.api.WaitResult; import org.ray.api.annotation.RayRemote; import org.testng.Assert; @@ -73,11 +74,15 @@ public static String testMultiThreading() { @Test public void testInDriver() { + // TODO(hchen): Fix this test under single-process mode. + TestUtils.skipTestUnderSingleProcess(); testMultiThreading(); } @Test public void testInWorker() { + // Single-process mode doesn't have real workers. 
+ TestUtils.skipTestUnderSingleProcess(); RayObject obj = Ray.call(MultiThreadingTest::testMultiThreading); Assert.assertEquals("ok", obj.get()); } diff --git a/java/test/src/main/java/org/ray/api/test/PlasmaStoreTest.java b/java/test/src/main/java/org/ray/api/test/PlasmaStoreTest.java index 726bad3da97c..7abc3f421f97 100644 --- a/java/test/src/main/java/org/ray/api/test/PlasmaStoreTest.java +++ b/java/test/src/main/java/org/ray/api/test/PlasmaStoreTest.java @@ -4,6 +4,7 @@ import org.apache.arrow.plasma.exceptions.DuplicateObjectException; import org.ray.api.Ray; +import org.ray.api.TestUtils; import org.ray.api.id.UniqueId; import org.ray.runtime.AbstractRayRuntime; import org.testng.Assert; @@ -13,6 +14,7 @@ public class PlasmaStoreTest extends BaseTest { @Test public void testPutWithDuplicateId() { + TestUtils.skipTestUnderSingleProcess(); UniqueId objectId = UniqueId.randomId(); AbstractRayRuntime runtime = (AbstractRayRuntime) Ray.internal(); PlasmaClient store = new PlasmaClient(runtime.getRayConfig().objectStoreSocketName, "", 0); diff --git a/java/test/src/main/java/org/ray/api/test/RedisPasswordTest.java b/java/test/src/main/java/org/ray/api/test/RedisPasswordTest.java index 210a4a045540..114ef7498a77 100644 --- a/java/test/src/main/java/org/ray/api/test/RedisPasswordTest.java +++ b/java/test/src/main/java/org/ray/api/test/RedisPasswordTest.java @@ -4,18 +4,20 @@ import org.ray.api.RayObject; import org.ray.api.annotation.RayRemote; import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; public class RedisPasswordTest extends BaseTest { - @Override - public void beforeInitRay() { + @BeforeClass + public void setUp() { System.setProperty("ray.redis.head-password", "12345678"); System.setProperty("ray.redis.password", "12345678"); } - @Override - public void afterShutdownRay() { + @AfterClass + public void tearDown() { 
System.clearProperty("ray.redis.head-password"); System.clearProperty("ray.redis.password"); } diff --git a/java/test/src/main/java/org/ray/api/test/ResourcesManagementTest.java b/java/test/src/main/java/org/ray/api/test/ResourcesManagementTest.java index 5d021d0cb1ad..114dfd3960ce 100644 --- a/java/test/src/main/java/org/ray/api/test/ResourcesManagementTest.java +++ b/java/test/src/main/java/org/ray/api/test/ResourcesManagementTest.java @@ -25,18 +25,15 @@ public static Integer echo(Integer number) { @RayRemote public static class Echo { + public Integer echo(Integer number) { return number; } } - @Override - public void beforeEachCase() { - TestUtils.skipTestUnderSingleProcess(); - } - @Test public void testMethods() { + TestUtils.skipTestUnderSingleProcess(); CallOptions callOptions1 = new CallOptions(ImmutableMap.of("CPU", 4.0, "GPU", 0.0)); // This is a case that can satisfy required resources. @@ -57,6 +54,7 @@ public void testMethods() { @Test public void testActors() { + TestUtils.skipTestUnderSingleProcess(); ActorCreationOptions actorCreationOptions1 = new ActorCreationOptions(ImmutableMap.of("CPU", 2.0, "GPU", 0.0)); diff --git a/java/test/src/main/java/org/ray/api/test/RuntimeContextTest.java b/java/test/src/main/java/org/ray/api/test/RuntimeContextTest.java index b6fdca32f170..512519bce02a 100644 --- a/java/test/src/main/java/org/ray/api/test/RuntimeContextTest.java +++ b/java/test/src/main/java/org/ray/api/test/RuntimeContextTest.java @@ -5,6 +5,8 @@ import org.ray.api.annotation.RayRemote; import org.ray.api.id.UniqueId; import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; public class RuntimeContextTest extends BaseTest { @@ -14,13 +16,20 @@ public class RuntimeContextTest extends BaseTest { private static String RAYLET_SOCKET_NAME = "/tmp/ray/test/raylet_socket"; private static String OBJECT_STORE_SOCKET_NAME = "/tmp/ray/test/object_store_socket"; - 
@Override - public void beforeInitRay() { + @BeforeClass + public void setUp() { System.setProperty("ray.driver.id", DRIVER_ID.toString()); System.setProperty("ray.raylet.socket-name", RAYLET_SOCKET_NAME); System.setProperty("ray.object-store.socket-name", OBJECT_STORE_SOCKET_NAME); } + @AfterClass + public void tearDown() { + System.clearProperty("ray.driver.id"); + System.clearProperty("ray.raylet.socket-name"); + System.clearProperty("ray.object-store.socket-name"); + } + @Test public void testRuntimeContextInDriver() { Assert.assertEquals(DRIVER_ID, Ray.getRuntimeContext().getCurrentDriverId()); diff --git a/java/test/src/main/java/org/ray/api/test/StressTest.java b/java/test/src/main/java/org/ray/api/test/StressTest.java index 24bc467db0ff..b5bf1356ea4f 100644 --- a/java/test/src/main/java/org/ray/api/test/StressTest.java +++ b/java/test/src/main/java/org/ray/api/test/StressTest.java @@ -17,13 +17,9 @@ public static int echo(int x) { return x; } - @Override - public void beforeEachCase() { - TestUtils.skipTestUnderSingleProcess(); - } - @Test public void testSubmittingTasks() { + TestUtils.skipTestUnderSingleProcess(); for (int numIterations : ImmutableList.of(1, 10, 100, 1000)) { int numTasks = 1000 / numIterations; for (int i = 0; i < numIterations; i++) { @@ -40,6 +36,7 @@ public void testSubmittingTasks() { @Test public void testDependency() { + TestUtils.skipTestUnderSingleProcess(); RayObject x = Ray.call(StressTest::echo, 1); for (int i = 0; i < 1000; i++) { x = Ray.call(StressTest::echo, x); @@ -77,6 +74,7 @@ public int ping(int n) { @Test public void testSubmittingManyTasksToOneActor() { + TestUtils.skipTestUnderSingleProcess(); RayActor actor = Ray.createActor(Actor::new); List objectIds = new ArrayList<>(); for (int i = 0; i < 10; i++) { @@ -90,6 +88,7 @@ public void testSubmittingManyTasksToOneActor() { @Test public void testPuttingAndGettingManyObjects() { + TestUtils.skipTestUnderSingleProcess(); Integer objectToPut = 1; List> objects = new 
ArrayList<>(); for (int i = 0; i < 100_000; i++) { diff --git a/kubernetes/head.yaml b/kubernetes/head.yaml index f347ec0e841c..fdbe117c93a4 100644 --- a/kubernetes/head.yaml +++ b/kubernetes/head.yaml @@ -31,11 +31,20 @@ spec: selector: matchLabels: component: ray-head + type: ray template: metadata: labels: component: ray-head + type: ray spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + type: ray + topologyKey: kubernetes.io/hostname containers: - name: ray-head image: rayproject/examples diff --git a/kubernetes/submit.yaml b/kubernetes/submit.yaml index e0ed3446b761..78470591c6dd 100644 --- a/kubernetes/submit.yaml +++ b/kubernetes/submit.yaml @@ -31,11 +31,20 @@ spec: selector: matchLabels: component: ray-head + type: ray template: metadata: labels: component: ray-head + type: ray spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + type: ray + topologyKey: kubernetes.io/hostname containers: - name: ray-head image: rayproject/examples @@ -68,11 +77,20 @@ spec: selector: matchLabels: component: ray-worker + type: ray template: metadata: labels: component: ray-worker + type: ray spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + type: ray + topologyKey: kubernetes.io/hostname containers: - name: ray-worker image: rayproject/examples diff --git a/kubernetes/worker.yaml b/kubernetes/worker.yaml index 09035a9e36e5..12a21aed4b5f 100644 --- a/kubernetes/worker.yaml +++ b/kubernetes/worker.yaml @@ -7,11 +7,20 @@ spec: selector: matchLabels: component: ray-worker + type: ray template: metadata: labels: component: ray-worker + type: ray spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + type: ray + topologyKey: kubernetes.io/hostname containers: - name: ray-worker image: rayproject/examples 
diff --git a/python/build-wheel-macos.sh b/python/build-wheel-macos.sh index 3b21aacf16df..c635666232dd 100755 --- a/python/build-wheel-macos.sh +++ b/python/build-wheel-macos.sh @@ -75,7 +75,7 @@ for ((i=0; i<${#PY_VERSIONS[@]}; ++i)); do $PIP_CMD install -q wheel # Add the correct Python to the path and build the wheel. This is only # needed so that the installation finds the cython executable. - INCLUDE_UI=1 PATH=$MACPYTHON_PY_PREFIX/$PY_MM/bin:$PATH $PYTHON_EXE setup.py bdist_wheel + PATH=$MACPYTHON_PY_PREFIX/$PY_MM/bin:$PATH $PYTHON_EXE setup.py bdist_wheel mv dist/*.whl ../.whl/ popd done diff --git a/python/build-wheel-manylinux1.sh b/python/build-wheel-manylinux1.sh index 82db1f5065fe..a96c3fcb0b91 100755 --- a/python/build-wheel-manylinux1.sh +++ b/python/build-wheel-manylinux1.sh @@ -39,7 +39,7 @@ for ((i=0; i<${#PYTHONS[@]}; ++i)); do # Fix the numpy version because this will be the oldest numpy version we can # support. /opt/python/${PYTHON}/bin/pip install -q numpy==${NUMPY_VERSION} cython==0.29.0 - INCLUDE_UI=1 PATH=/opt/python/${PYTHON}/bin:$PATH /opt/python/${PYTHON}/bin/python setup.py bdist_wheel + PATH=/opt/python/${PYTHON}/bin:$PATH /opt/python/${PYTHON}/bin/python setup.py bdist_wheel # In the future, run auditwheel here. mv dist/*.whl ../.whl/ popd diff --git a/python/ray/WebUI.ipynb b/python/ray/WebUI.ipynb deleted file mode 100644 index 229366eba10b..000000000000 --- a/python/ray/WebUI.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Ray UI\n", "\n", - "Start the UI with **Kernel -> Restart and Run All**." 
- ] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", "import ray\n", - "import ray.experimental.ui as ui\n", "\n", - "ray.init(redis_address=os.environ[\"REDIS_ADDRESS\"])" - ] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": ["#### Task trace timeline."] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To view arrows, go to View Options and select Flow Events." - ] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": ["ui.task_timeline()"] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": ["#### Object transfer timeline."] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": ["ui.object_transfer_timeline()"] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": ["#### Task durations."] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": ["ui.task_completion_time_distribution()"] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": ["#### CPU usage."] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": ["ui.cpu_usage()"] - }, { - "cell_type": "markdown", - "metadata": {}, - "source": ["#### Cluster usage."] - }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": ["ui.cluster_usage()"] - }], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/ray/_raylet.pyx b/python/ray/_raylet.pyx index 
8b12a04fd979..453f66bb4009 100644 --- a/python/ray/_raylet.pyx +++ b/python/ray/_raylet.pyx @@ -72,7 +72,7 @@ cdef c_vector[CObjectID] ObjectIDsToVector(object_ids): ObjectID object_id c_vector[CObjectID] result for object_id in object_ids: - result.push_back(object_id.data) + result.push_back(object_id.native()) return result @@ -87,11 +87,11 @@ def compute_put_id(TaskID task_id, int64_t put_index): if put_index < 1 or put_index > kMaxTaskPuts: raise ValueError("The range of 'put_index' should be [1, %d]" % kMaxTaskPuts) - return ObjectID(ComputePutId(task_id.data, put_index).binary()) + return ObjectID(ComputePutId(task_id.native(), put_index).binary()) def compute_task_id(ObjectID object_id): - return TaskID(ComputeTaskId(object_id.data).binary()) + return TaskID(ComputeTaskId(object_id.native()).binary()) cdef c_bool is_simple_value(value, int *num_elements_contained): @@ -225,8 +225,8 @@ cdef class RayletClient: # parameter. # TODO(suquark): Should we allow unicode chars in "raylet_socket"? 
self.client.reset(new CRayletClient( - raylet_socket.encode("ascii"), client_id.data, is_worker, - driver_id.data, LANGUAGE_PYTHON)) + raylet_socket.encode("ascii"), client_id.native(), is_worker, + driver_id.native(), LANGUAGE_PYTHON)) def disconnect(self): check_status(self.client.get().Disconnect()) @@ -252,22 +252,23 @@ cdef class RayletClient: TaskID current_task_id=TaskID.nil()): cdef c_vector[CObjectID] fetch_ids = ObjectIDsToVector(object_ids) check_status(self.client.get().FetchOrReconstruct( - fetch_ids, fetch_only, current_task_id.data)) + fetch_ids, fetch_only, current_task_id.native())) def notify_unblocked(self, TaskID current_task_id): - check_status(self.client.get().NotifyUnblocked(current_task_id.data)) + check_status(self.client.get().NotifyUnblocked(current_task_id.native())) def wait(self, object_ids, int num_returns, int64_t timeout_milliseconds, c_bool wait_local, TaskID current_task_id): cdef: WaitResultPair result c_vector[CObjectID] wait_ids + CTaskID c_task_id = current_task_id.native() wait_ids = ObjectIDsToVector(object_ids) with nogil: check_status(self.client.get().Wait(wait_ids, num_returns, timeout_milliseconds, wait_local, - current_task_id.data, &result)) + c_task_id, &result)) return (VectorToObjectIDs(result.first), VectorToObjectIDs(result.second)) @@ -291,9 +292,9 @@ cdef class RayletClient: postincrement(iterator) return resources_dict - def push_error(self, DriverID job_id, error_type, error_message, + def push_error(self, DriverID driver_id, error_type, error_message, double timestamp): - check_status(self.client.get().PushError(job_id.data, + check_status(self.client.get().PushError(driver_id.native(), error_type.encode("ascii"), error_message.encode("ascii"), timestamp)) @@ -354,7 +355,7 @@ cdef class RayletClient: def prepare_actor_checkpoint(self, ActorID actor_id): cdef CActorCheckpointID checkpoint_id - cdef CActorID c_actor_id = actor_id.data + cdef CActorID c_actor_id = actor_id.native() # PrepareActorCheckpoint 
will wait for raylet's reply, release # the GIL so other Python threads can run. with nogil: @@ -365,7 +366,7 @@ cdef class RayletClient: def notify_actor_resumed_from_checkpoint(self, ActorID actor_id, ActorCheckpointID checkpoint_id): check_status(self.client.get().NotifyActorResumedFromCheckpoint( - actor_id.data, checkpoint_id.data)) + actor_id.native(), checkpoint_id.native())) @property def language(self): diff --git a/python/ray/actor.py b/python/ray/actor.py index 26ff93399a88..f5c2a665898e 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -125,7 +125,11 @@ def __call__(self, *args, **kwargs): def remote(self, *args, **kwargs): return self._remote(args, kwargs) - def _remote(self, args, kwargs, num_return_vals=None): + def _remote(self, args=None, kwargs=None, num_return_vals=None): + if args is None: + args = [] + if kwargs is None: + kwargs = {} if num_return_vals is None: num_return_vals = self._num_return_vals @@ -233,8 +237,8 @@ def remote(self, *args, **kwargs): return self._remote(args=args, kwargs=kwargs) def _remote(self, - args, - kwargs, + args=None, + kwargs=None, num_cpus=None, num_gpus=None, resources=None): @@ -255,6 +259,11 @@ def _remote(self, Returns: A handle to the newly created actor. 
""" + if args is None: + args = [] + if kwargs is None: + kwargs = {} + worker = ray.worker.get_global_worker() if worker.mode is None: raise Exception("Actors cannot be created before ray.init() " @@ -293,10 +302,6 @@ def _remote(self, actor_placement_resources = resources.copy() actor_placement_resources["CPU"] += 1 - if args is None: - args = [] - if kwargs is None: - kwargs = {} function_name = "__init__" function_signature = self._method_signatures[function_name] creation_args = signature.extend_args(function_signature, args, diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index f7c412e01e90..539a6dbc2245 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -100,9 +100,9 @@ setup_commands: # has your Ray repo pre-cloned. Then, you can replace the pip installs # below with a git checkout (and possibly a recompile). - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-manylinux1_x86_64.whl - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl # Consider uncommenting these if you also want to run apt-get commands during setup # - sudo pkill -9 apt-get || true # - sudo pkill -9 dpkg || true diff --git a/python/ray/autoscaler/aws/example-gpu-docker.yaml 
b/python/ray/autoscaler/aws/example-gpu-docker.yaml index 8540ef584d6c..fa889fd49190 100644 --- a/python/ray/autoscaler/aws/example-gpu-docker.yaml +++ b/python/ray/autoscaler/aws/example-gpu-docker.yaml @@ -92,9 +92,9 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-manylinux1_x86_64.whl - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. 
head_setup_commands: diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index 480a9827f8e2..5339690cc7ab 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -122,9 +122,9 @@ setup_commands: && echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.profile # Install ray - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-manylinux1_x86_64.whl - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. 
diff --git a/python/ray/autoscaler/gcp/example-gpu-docker.yaml b/python/ray/autoscaler/gcp/example-gpu-docker.yaml index b29a9f8f169c..d7080b94cf8f 100644 --- a/python/ray/autoscaler/gcp/example-gpu-docker.yaml +++ b/python/ray/autoscaler/gcp/example-gpu-docker.yaml @@ -127,9 +127,9 @@ setup_commands: # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc # Install ray - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp27-cp27mu-manylinux1_x86_64.whl - - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp35-cp35m-manylinux1_x86_64.whl - # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp27-cp27mu-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp35-cp35m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.7.0.dev1-cp36-cp36m-manylinux1_x86_64.whl # Custom commands that will be run on the head node after common setup. head_setup_commands: diff --git a/python/ray/experimental/serve/README.rst b/python/ray/experimental/serve/README.rst new file mode 100644 index 000000000000..b59672c46912 --- /dev/null +++ b/python/ray/experimental/serve/README.rst @@ -0,0 +1,76 @@ +Ray Serve Module +================ + +``ray.experimental.serve`` is a module for publishing your actors to +interact with the outside world. + +Use Case +-------- + +Serve machine learning model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Scalable analytics query +~~~~~~~~~~~~~~~~~~~~~~~~ + +Composable pipelines +~~~~~~~~~~~~~~~~~~~~ + +Architecture +------------ + +``ray.experimental.serve`` is implemented in a three-tiered system. Each +tier can scale horizontally. + +In the following illustration, call chain goes from top to bottom. 
Each +box is one or more replicated ray actors. + +:: + + +-------------------+ +-----------------+ +------------+ + Frontend | HTTP Frontend | | Arrow RPC | | ... | + Tier | | | | | | + +-------------------+ +-----------------+ +------------+ + + +------------------------------------------------------------+ + + +--------------------+ +-------------------+ + Router | Default Router | | Deadline Aware | + Tier | | | Router | + +--------------------+ +-------------------+ + + +------------------------------------------------------------+ + + +----------------+ +--------------+ +-------------+ + Managed | Managed Actor | | ... | | ... | + Actor | Replica | | | | | + Tier +----------------+ +--------------+ +-------------+ + +Frontend Tier +~~~~~~~~~~~~~ + +The frontend tier is responsible for interfacing with the world. Currently +``ray.experimental.serve`` provides implementation for - HTTP Frontend + +And we are planning to add support for - Arrow RPC - zeromq + +Router Tier +~~~~~~~~~~~ + +The router tier receives calls from the frontend and routes them to the +managed actors. Routers both *route* and *queue* incoming queries. +``ray.experimental.serve`` has native support for (micro-)batching +queries. + +In addition, we implemented a deadline-aware router that will put high +priority queries in the front of the queue so they will be delivered +first. + +Managed Actor Tier +~~~~~~~~~~~~~~~~~~ + +Managed actors will be managed by routers. These actors can contain +arbitrary methods. Methods in the actor class are assumed to be able to +take a single input. To fully utilize the vectorized instructions, like +``np.sum``, you can use the ``@batched_input`` decorator; it will run your method +on a micro-batch. 
diff --git a/python/ray/experimental/serve/__init__.py b/python/ray/experimental/serve/__init__.py new file mode 100644 index 000000000000..15757a9d5f54 --- /dev/null +++ b/python/ray/experimental/serve/__init__.py @@ -0,0 +1,28 @@ +"""A module for serving from actors. + +The ray.experimental.serve module is a module for publishing your actors to +interact with the outside world. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +assert sys.version_info >= (3, ), ( + "ray.experimental.serve is a python3 only library") + +from ray.experimental.serve.router import (DeadlineAwareRouter, + SingleQuery) # noqa: E402 +from ray.experimental.serve.frontend import HTTPFrontendActor # noqa: E402 +from ray.experimental.serve.mixin import (RayServeMixin, + batched_input) # noqa: E402 + +__all__ = [ + "DeadlineAwareRouter", + "SingleQuery", + "HTTPFrontendActor", + "RayServeMixin", + "batched_input", +] diff --git a/python/ray/experimental/serve/examples/adder.py b/python/ray/experimental/serve/examples/adder.py new file mode 100644 index 000000000000..862e61c7150b --- /dev/null +++ b/python/ray/experimental/serve/examples/adder.py @@ -0,0 +1,47 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import ray +from ray.experimental.serve import RayServeMixin, batched_input + + +@ray.remote +class VectorizedAdder(RayServeMixin): + """Actor that adds scaler_increment to input batch. 
+ + result = np.array(input_batch) + scaler_increment + """ + + def __init__(self, scaler_increment): + self.inc = scaler_increment + + @batched_input + def __call__(self, input_batch): + arr = np.array(input_batch) + arr += self.inc + return arr.tolist() + + +@ray.remote +class ScalerAdder(RayServeMixin): + """Actor that adds a scaler_increment to a single input.""" + + def __init__(self, scaler_increment): + self.inc = scaler_increment + + def __call__(self, input_scaler): + return input_scaler + self.inc + + +@ray.remote +class VectorDouble(RayServeMixin): + """Actor that doubles the batched input.""" + + @batched_input + def __call__(self, batched_vectors): + matrix = np.array(batched_vectors) + matrix *= 2 + return [v.tolist() for v in matrix] diff --git a/python/ray/experimental/serve/examples/counter.py b/python/ray/experimental/serve/examples/counter.py new file mode 100644 index 000000000000..369d53fb5a8e --- /dev/null +++ b/python/ray/experimental/serve/examples/counter.py @@ -0,0 +1,29 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ray +from ray.experimental.serve import RayServeMixin, batched_input + + +@ray.remote +class Counter(RayServeMixin): + """Return the query id. Used for testing router.""" + + def __init__(self): + self.counter = 0 + + def __call__(self, batched_input): + self.counter += 1 + return self.counter + + +@ray.remote +class CustomCounter(RayServeMixin): + """Return the query id. 
Used for testing `serve_method` signature.""" + + serve_method = "count" + + @batched_input + def count(self, input_batch): + return [1 for _ in range(len(input_batch))] diff --git a/python/ray/experimental/serve/examples/halt.py b/python/ray/experimental/serve/examples/halt.py new file mode 100644 index 000000000000..eceb94d8653e --- /dev/null +++ b/python/ray/experimental/serve/examples/halt.py @@ -0,0 +1,41 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import ray +from ray.experimental.serve import RayServeMixin, batched_input + + +@ray.remote +class SleepOnFirst(RayServeMixin): + """Sleep on the first request, return batch size. + + Used for testing the DeadlineAwareRouter. + """ + + def __init__(self, sleep_time): + self.nap_time = sleep_time + + @batched_input + def __call__(self, input_batch): + time.sleep(self.nap_time) + return [len(input_batch) for _ in range(len(input_batch))] + + +@ray.remote +class SleepCounter(RayServeMixin): + """Sleep on input argument seconds, return the query id. + + Used to test the DeadlineAwareRouter. 
+ """ + + def __init__(self): + self.counter = 0 + + def __call__(self, inp): + time.sleep(inp) + + self.counter += 1 + return self.counter diff --git a/python/ray/experimental/serve/frontend/__init__.py b/python/ray/experimental/serve/frontend/__init__.py new file mode 100644 index 000000000000..b1cb44636bde --- /dev/null +++ b/python/ray/experimental/serve/frontend/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.experimental.serve.frontend.http_frontend import HTTPFrontendActor + +__all__ = ["HTTPFrontendActor"] diff --git a/python/ray/experimental/serve/frontend/http_frontend.py b/python/ray/experimental/serve/frontend/http_frontend.py new file mode 100644 index 000000000000..66973caca838 --- /dev/null +++ b/python/ray/experimental/serve/frontend/http_frontend.py @@ -0,0 +1,72 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import uvicorn +from starlette.applications import Starlette +from starlette.responses import JSONResponse + +import ray + + +def unwrap(future): + """Unwrap the result from ray.experimental.server router. + Router returns a list of object ids when you call them. + """ + + return ray.get(future)[0] + + +@ray.remote +class HTTPFrontendActor: + """HTTP API for an Actor. This exposes /{actor_name} endpoint for query. 
+ + Request: + GET /{actor_name} or POST /{actor_name} + Content-type: application/json + { + "slo_ms": float, + "input": any + } + Response: + Content-type: application/json + { + "success": bool, + "actor": str, + "result": any + } + """ + + def __init__(self, ip="0.0.0.0", port=8080, router="DefaultRouter"): + self.ip = ip + self.port = port + self.router = ray.experimental.named_actors.get_actor(router) + + def start(self): + default_app = Starlette() + + @default_app.route("/{actor}", methods=["GET", "POST"]) + async def dispatch_remote_function(request): + data = await request.json() + actor_name = request.path_params["actor"] + + slo_seconds = data.pop("slo_ms") / 1000 + deadline = time.perf_counter() + slo_seconds + + inp = data.pop("input") + + result_future = unwrap( + self.router.call.remote(actor_name, inp, deadline)) + + # TODO(simon): change to asyncio ray.get + result = ray.get(result_future) + + return JSONResponse({ + "success": True, + "actor": actor_name, + "result": result + }) + + uvicorn.run(default_app, host=self.ip, port=self.port) diff --git a/python/ray/experimental/serve/mixin.py b/python/ray/experimental/serve/mixin.py new file mode 100644 index 000000000000..858572634a04 --- /dev/null +++ b/python/ray/experimental/serve/mixin.py @@ -0,0 +1,63 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import traceback +from typing import List + +import ray +from ray.experimental.serve import SingleQuery + + +def batched_input(func): + """Decorator to mark an actor method as accepting a batch of inputs. + + By default methods accept a single input. + """ + func.ray_serve_batched_input = True + return func + + +def _execute_and_seal_error(method, arg, method_name): + """Execute method with arg and return the result. + + If the method fails, return a RayTaskError so it can be sealed in the + resultOID and retried by user.
+ """ + try: + return method(arg) + except Exception: + return ray.worker.RayTaskError(method_name, traceback.format_exc()) + + +class RayServeMixin: + """Enable a ray actor to interact with ray.serve + + Usage: + ``` + @ray.remote + class MyActor(RayServeMixin): + # This is optional, by default it is "__call__" + serve_method = 'my_method' + + def my_method(self, arg): + ... + ``` + """ + + serve_method = "__call__" + + def _dispatch(self, input_batch: List[SingleQuery]): + """Helper method to dispatch a batch of input to self.serve_method.""" + method = getattr(self, self.serve_method) + if hasattr(method, "ray_serve_batched_input"): + batch = [inp.data for inp in input_batch] + result = _execute_and_seal_error(method, batch, self.serve_method) + for res, inp in zip(result, input_batch): + ray.worker.global_worker.put_object(inp.result_object_id, res) + else: + for inp in input_batch: + result = _execute_and_seal_error(method, inp.data, + self.serve_method) + ray.worker.global_worker.put_object(inp.result_object_id, + result) diff --git a/python/ray/experimental/serve/object_id.py b/python/ray/experimental/serve/object_id.py new file mode 100644 index 000000000000..cdde52532a58 --- /dev/null +++ b/python/ray/experimental/serve/object_id.py @@ -0,0 +1,21 @@ +""" +Helper methods for dealing with ray.ObjectID +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ray + + +def unwrap(future): + return ray.get(future)[0] + + +def get_new_oid(): + worker = ray.worker.global_worker + oid = ray._raylet.compute_put_id(worker.current_task_id, + worker.task_context.put_index) + worker.task_context.put_index += 1 + return oid diff --git a/python/ray/experimental/serve/router/__init__.py b/python/ray/experimental/serve/router/__init__.py new file mode 100644 index 000000000000..dae5fcb7ce01 --- /dev/null +++ b/python/ray/experimental/serve/router/__init__.py @@ -0,0 +1,26 @@ +from __future__ import 
absolute_import +from __future__ import division +from __future__ import print_function + +from ray.experimental.serve.router.routers import (DeadlineAwareRouter, + SingleQuery) +import ray + + +def start_router(router_class, router_name): + """Wrapper for starting a router and register it. + + Args: + router_class: The router class to instantiate. + router_name: The name to give to the router. + + Returns: + A handle to newly started router actor. + """ + handle = router_class.remote(router_name) + ray.experimental.register_actor(router_name, handle) + handle.start.remote() + return handle + + +__all__ = ["DeadlineAwareRouter", "SingleQuery"] diff --git a/python/ray/experimental/serve/router/routers.py b/python/ray/experimental/serve/router/routers.py new file mode 100644 index 000000000000..28fd91a8d0f8 --- /dev/null +++ b/python/ray/experimental/serve/router/routers.py @@ -0,0 +1,203 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict +from functools import total_ordering +from typing import Callable, Dict, List, Set, Tuple + +import ray +from ray.experimental.serve.object_id import get_new_oid +from ray.experimental.serve.utils.priority_queue import PriorityQueue + +ACTOR_NOT_REGISTERED_MSG: Callable = ( + lambda name: ("Actor {} is not registered with this router. Please use " + "'router.register_actor.remote(...)' " + "to register it.").format(name)) + + +# Use @total_ordering so we can sort SingleQuery +@total_ordering +class SingleQuery: + """A data container for a query. + + Attributes: + data: The request data. + result_object_id: The result object ID. + deadline: The deadline in seconds. 
+ """ + + def __init__(self, data, result_object_id: ray.ObjectID, + deadline_s: float): + self.data = data + self.result_object_id = result_object_id + self.deadline = deadline_s + + def __lt__(self, other): + return self.deadline < other.deadline + + def __eq__(self, other): + return self.deadline == other.deadline + + +@ray.remote +class DeadlineAwareRouter: + """DeadlineAwareRouter is a router that is aware of deadlines. + + It takes into consideration the deadline attached to each query. It will + reorder incoming query based on their deadlines. + """ + + def __init__(self, router_name): + # Runtime Data + self.query_queues: Dict[str, PriorityQueue] = defaultdict( + PriorityQueue) + self.running_queries: Dict[ray.ObjectID, ray.actor.ActorHandle] = {} + self.actor_handles: Dict[str, List[ray.actor.ActorHandle]] = ( + defaultdict(list)) + + # Actor Metadata + self.managed_actors: Dict[str, ray.actor.ActorClass] = {} + self.actor_init_arguments: Dict[str, Tuple[List, Dict]] = {} + self.max_batch_size: Dict[str, int] = {} + + # Router Metadata + self.name = router_name + + def start(self): + """Kick off the router loop""" + + # Note: This is meant for hiding the complexity for a user + # facing method. + # Because the `loop` api can be hard to understand. + ray.experimental.get_actor(self.name).loop.remote() + + def register_actor( + self, + actor_name: str, + actor_class: ray.actor.ActorClass, + init_args: List = [], + init_kwargs: dict = {}, + num_replicas: int = 1, + max_batch_size: int = -1, # Unbounded batch size + ): + """Register a new managed actor. 
+ """ + self.managed_actors[actor_name] = actor_class + self.actor_init_arguments[actor_name] = (init_args, init_kwargs) + self.max_batch_size[actor_name] = max_batch_size + + ray.experimental.get_actor(self.name).set_replica.remote( + actor_name, num_replicas) + + def set_replica(self, actor_name, new_replica_count): + """Scale a managed actor according to new_replica_count.""" + assert actor_name in self.managed_actors, ( + ACTOR_NOT_REGISTERED_MSG(actor_name)) + + current_replicas = len(self.actor_handles[actor_name]) + + # Increase the number of replicas + if new_replica_count > current_replicas: + for _ in range(new_replica_count - current_replicas): + args = self.actor_init_arguments[actor_name][0] + kwargs = self.actor_init_arguments[actor_name][1] + new_actor_handle = self.managed_actors[actor_name].remote( + *args, **kwargs) + self.actor_handles[actor_name].append(new_actor_handle) + + # Decrease the number of replicas + if new_replica_count < current_replicas: + for _ in range(current_replicas - new_replica_count): + # Note actor destructor will be called after all remaining + # calls finish. Therefore it's safe to call del here. + del self.actor_handles[actor_name][-1] + + def call(self, actor_name, data, deadline_s): + """Enqueue a request to one of the actor managed by this router. + + Returns: + List[ray.ObjectID] with length 1, the object ID wrapped inside is + the result object ID when the query is executed. + """ + assert actor_name in self.managed_actors, ( + ACTOR_NOT_REGISTERED_MSG(actor_name)) + + result_object_id = get_new_oid() + self.query_queues[actor_name].push( + SingleQuery(data, result_object_id, deadline_s)) + + return [result_object_id] + + def loop(self): + """Main loop for router. It will does the following things: + + 1. Check which running actors finished. + 2. Iterate over free actors and request queues, dispatch requests batch + to free actors. + 3. Tail recursively schedule itself. + """ + + # 1. 
Check which running actors finished. + ready_oids, _ = ray.wait( + object_ids=list(self.running_queries.keys()), + num_returns=len(self.running_queries), + timeout=0, + ) + + for ready_oid in ready_oids: + self.running_queries.pop(ready_oid) + busy_actors: Set[ray.actor.ActorHandle] = set( + self.running_queries.values()) + + # 2. Iterate over free actors and request queues, dispatch requests + # batch to free actors. + for actor_name, queue in self.query_queues.items(): + # try to drain the queue + for actor_handle in self.actor_handles[actor_name]: + if len(queue) == 0: + break + + if actor_handle in busy_actors: + continue + + # A free actor found. Dispatch queries. + batch = self._get_next_batch(actor_name) + assert len(batch) + + batch_result_object_id = actor_handle._dispatch.remote(batch) + self._mark_running(batch_result_object_id, actor_handle) + + # 3. Tail recursively schedule itself. + ray.experimental.get_actor(self.name).loop.remote() + + def _get_next_batch(self, actor_name: str) -> List[SingleQuery]: + """Get next batch of request for the actor whose name is provided.""" + assert actor_name in self.query_queues, ( + ACTOR_NOT_REGISTERED_MSG(actor_name)) + + inputs = [] + batch_size = self.max_batch_size[actor_name] + if batch_size == -1: + inp = self.query_queues[actor_name].try_pop() + while inp: + inputs.append(inp) + inp = self.query_queues[actor_name].try_pop() + else: + for _ in range(batch_size): + inp = self.query_queues[actor_name].try_pop() + if inp: + inputs.append(inp) + else: + break + + return inputs + + def _mark_running(self, batch_oid: ray.ObjectID, + actor_handle: ray.actor.ActorHandle): + """Mark actor_handle as running identified by batch_oid. + + This means that if batch_oid is fullfilled, then actor_handle must be + free. 
+ """ + self.running_queries[batch_oid] = actor_handle diff --git a/python/ray/experimental/serve/tests/test_actors.py b/python/ray/experimental/serve/tests/test_actors.py new file mode 100644 index 000000000000..3b2748b73bf3 --- /dev/null +++ b/python/ray/experimental/serve/tests/test_actors.py @@ -0,0 +1,68 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import pytest + +import ray +from ray.experimental.serve import SingleQuery +from ray.experimental.serve.examples.adder import ScalerAdder, VectorizedAdder +from ray.experimental.serve.examples.counter import Counter, CustomCounter +from ray.experimental.serve.object_id import get_new_oid + +INCREMENT = 3 + + +@pytest.fixture(scope="module") +def ray_start(): + ray.init(num_cpus=4) + yield + ray.shutdown() + + +@pytest.fixture +def generated_inputs(): + deadline = 11111.11 + inputs = [] + input_arr = np.arange(10) + for i in input_arr: + oid = get_new_oid() + inputs.append( + SingleQuery(data=i, result_object_id=oid, deadline_s=deadline)) + return inputs + + +def test_vadd(ray_start, generated_inputs): + adder = VectorizedAdder.remote(INCREMENT) + inputs = generated_inputs + oids = [inp.result_object_id for inp in inputs] + input_data = [inp.data for inp in inputs] + + adder._dispatch.remote(inputs) + result_arr = np.array(ray.get(oids)) + assert np.array_equal(result_arr, np.array(input_data) + INCREMENT) + + +def test_batched_input(ray_start, generated_inputs): + counter = Counter.remote() + counter._dispatch.remote(generated_inputs) + oids = [inp.result_object_id for inp in generated_inputs] + returned_query_ids = np.array(ray.get(oids)) + assert np.array_equal(returned_query_ids, np.arange(1, 11)) + + +def test_custom_method(ray_start, generated_inputs): + dummy = CustomCounter.remote() + dummy._dispatch.remote(generated_inputs) + oids = [inp.result_object_id for inp in generated_inputs] + returned_query_ids = 
np.array(ray.get(oids)) + assert np.array_equal(returned_query_ids, np.ones(10)) + + +def test_exception(ray_start): + adder = ScalerAdder.remote(INCREMENT) + query = SingleQuery("this can't be added with int", get_new_oid(), 10) + adder._dispatch.remote([query]) + with pytest.raises(ray.worker.RayTaskError): + ray.get(query.result_object_id) diff --git a/python/ray/experimental/serve/tests/test_deadline_router.py b/python/ray/experimental/serve/tests/test_deadline_router.py new file mode 100644 index 000000000000..d1d4d6769794 --- /dev/null +++ b/python/ray/experimental/serve/tests/test_deadline_router.py @@ -0,0 +1,91 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np +import pytest + +import ray +from ray.experimental.serve.examples.adder import ScalerAdder, VectorizedAdder +from ray.experimental.serve.examples.halt import SleepCounter, SleepOnFirst +from ray.experimental.serve.object_id import unwrap +from ray.experimental.serve.router import DeadlineAwareRouter, start_router + + +@pytest.fixture(scope="module") +def router(): + # We need at least 5 workers so resource won't be oversubscribed + ray.init(num_cpus=5) + + # The following two blobs are equivalent + # + # handle = DeadlineAwareRouter.remote("DefaultTestRouter") + # ray.experimental.register_actor("DefaultTestRouter", handle) + # handle.start.remote() + # + # handle = start_router(DeadlineAwareRouter, "DefaultRouter") + handle = start_router(DeadlineAwareRouter, "DefaultRouter") + + handle.register_actor.remote( + "VAdder", VectorizedAdder, + init_kwargs={"scaler_increment": 1}) # init args + handle.register_actor.remote( + "SAdder", ScalerAdder, init_kwargs={"scaler_increment": 2}) + handle.register_actor.remote( + "SleepFirst", SleepOnFirst, init_kwargs={"sleep_time": 1}) + handle.register_actor.remote( + "SleepCounter", SleepCounter, max_batch_size=1) + + yield handle + + ray.shutdown() + + 
+@pytest.fixture +def now(): + return time.perf_counter() + + +def test_throw_assert(router: DeadlineAwareRouter, now: float): + try: + ray.get(router.call.remote("Action-Not-Exist", "input", now + 1)) + except ray.worker.RayTaskError as e: + assert "AssertionError" in e.traceback_str + + +def test_vector_adder(router: DeadlineAwareRouter, now: float): + result = unwrap(router.call.remote("VAdder", 42, now + 1)) + assert isinstance(result, ray.ObjectID) + assert ray.get(result) == 43 + + +def test_scaler_adder(router: DeadlineAwareRouter, now: float): + result = unwrap(router.call.remote("SAdder", 42, now + 1)) + assert isinstance(result, ray.ObjectID) + assert ray.get(result) == 44 + + +def test_batching_ability(router: DeadlineAwareRouter, now: float): + first = unwrap(router.call.remote("SleepFirst", 1, now + 1)) + rest = [ + unwrap(router.call.remote("SleepFirst", 1, now + 1)) for _ in range(10) + ] + assert ray.get(first) == 1 + assert np.alltrue(np.array(ray.get(rest)) == 10) + + +def test_deadline_priority(router: DeadlineAwareRouter, now: float): + # first sleep 2 seconds + first = unwrap(router.call.remote("SleepCounter", 2, now + 1)) + + # then send a request to with deadline farther away + second = unwrap(router.call.remote("SleepCounter", 0, now + 10)) + + # and a request with sooner deadline + third = unwrap(router.call.remote("SleepCounter", 0, now + 1)) + + id_1, id_2, id_3 = ray.get([first, second, third]) + + assert id_1 < id_3 < id_2 diff --git a/python/ray/experimental/serve/tests/test_default_app.py b/python/ray/experimental/serve/tests/test_default_app.py new file mode 100644 index 000000000000..5eb758c5f7ed --- /dev/null +++ b/python/ray/experimental/serve/tests/test_default_app.py @@ -0,0 +1,46 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import pytest +import requests + +import ray +from ray.experimental.serve import DeadlineAwareRouter +from 
ray.experimental.serve.examples.adder import VectorizedAdder +from ray.experimental.serve.frontend import HTTPFrontendActor +from ray.experimental.serve.router import start_router + +ROUTER_NAME = "DefaultRouter" +NUMBER_OF_TRIES = 5 + + +@pytest.fixture +def get_router(): + # We need this many workers so resource are not oversubscribed + ray.init(num_cpus=4) + router = start_router(DeadlineAwareRouter, ROUTER_NAME) + yield router + ray.shutdown() + + +def test_http_basic(get_router): + router = get_router + a = HTTPFrontendActor.remote(router=ROUTER_NAME) + a.start.remote() + + router.register_actor.remote( + "VAdder", VectorizedAdder, init_kwargs={"scaler_increment": 1}) + + for _ in range(NUMBER_OF_TRIES): + try: + url = "http://0.0.0.0:8080/VAdder" + payload = {"input": 10, "slo_ms": 1000} + resp = requests.request("POST", url, json=payload) + except Exception: + # it is possible that the actor is not yet instantiated + time.sleep(1) + + assert resp.json() == {"success": True, "actor": "VAdder", "result": 11} diff --git a/python/ray/experimental/serve/utils/priority_queue.py b/python/ray/experimental/serve/utils/priority_queue.py new file mode 100644 index 000000000000..05b7045b43a5 --- /dev/null +++ b/python/ray/experimental/serve/utils/priority_queue.py @@ -0,0 +1,27 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import heapq + + +class PriorityQueue: + """A min-heap class wrapping heapq module.""" + + def __init__(self): + self.q = [] + + def push(self, item): + heapq.heappush(self.q, item) + + def pop(self): + return heapq.heappop(self.q) + + def try_pop(self): + if len(self.q) == 0: + return None + else: + return self.pop() + + def __len__(self): + return len(self.q) diff --git a/python/ray/experimental/state.py b/python/ray/experimental/state.py index c20da64064ab..eea005874bd1 100644 --- a/python/ray/experimental/state.py +++ b/python/ray/experimental/state.py @@ -243,14 +243,8 @@ def 
_object_table(self, object_id): object_info = { "DataSize": entry.ObjectSize(), "Manager": entry.Manager(), - "IsEviction": [entry.IsEviction()], } - for i in range(1, gcs_entry.EntriesLength()): - entry = ray.gcs_utils.ObjectTableData.GetRootAsObjectTableData( - gcs_entry.Entries(i), 0) - object_info["IsEviction"].append(entry.IsEviction()) - return object_info def object_table(self, object_id=None): diff --git a/python/ray/experimental/ui.py b/python/ray/experimental/ui.py deleted file mode 100644 index 967336953b7e..000000000000 --- a/python/ray/experimental/ui.py +++ /dev/null @@ -1,702 +0,0 @@ -import logging -import numpy as np -import os -import pprint -import shutil -import tempfile -import time - -import ipywidgets as widgets -from IPython.display import display, IFrame, clear_output - -import ray - -logger = logging.getLogger(__name__) - - -# Instances of this class maintains keep track of whether or not a -# callback is currently executing. Since the execution of the callback -# may trigger more calls to the callback, this is used to prevent infinite -# recursions. -class _EventRecursionContextManager(object): - def __init__(self): - self.should_recurse = True - - def __enter__(self): - self.should_recurse = False - - def __exit__(self, *args): - self.should_recurse = True - - -total_time_value = "% total time" -total_tasks_value = "% total tasks" - -# Function that returns instances of sliders and handles associated events. - - -def get_sliders(update): - # Start_box value indicates the desired start point of queried window. - start_box = widgets.FloatText( - description="Start Time:", - disabled=True, - ) - - # End_box value indicates the desired end point of queried window. - end_box = widgets.FloatText( - description="End Time:", - disabled=True, - ) - - # Percentage slider. Indicates either % of total time or total tasks - # depending on what breakdown_opt is set to. 
- range_slider = widgets.IntRangeSlider( - value=[0, 100], - min=0, - max=100, - step=1, - description="%:", - continuous_update=False, - orientation="horizontal", - readout=True, - ) - - # Indicates the number of tasks that the user wants to be returned. Is - # disabled when the breakdown_opt value is set to total_time_value. - num_tasks_box = widgets.IntText(description="Num Tasks:", disabled=False) - - # Dropdown bar that lets the user choose between modifying % of total - # time or total number of tasks. - breakdown_opt = widgets.Dropdown( - options=[total_time_value, total_tasks_value], - value=total_tasks_value, - description="Selection Options:") - - # Display box for layout. - total_time_box = widgets.VBox([start_box, end_box]) - - # This sets the CSS style display to hide the box. - total_time_box.layout.display = 'none' - - # Initially passed in to the update_wrapper function. - INIT_EVENT = "INIT" - - # Create instance of context manager to determine whether callback is - # currently executing - out_recursion = _EventRecursionContextManager() - - def update_wrapper(event): - # Feature received a callback, but it shouldn't be executed - # because the callback was the result of a different feature - # executing its callback based on user input. - if not out_recursion.should_recurse: - return - - # Feature received a callback and it should be executed because - # the callback was the result of user input. - with out_recursion: - smallest, largest, num_tasks = ray.global_state._job_length() - diff = largest - smallest - if num_tasks != 0: - - # Describes the initial values that the slider/text box - # values should be set to. - if event == INIT_EVENT: - if breakdown_opt.value == total_tasks_value: - num_tasks_box.value = -min(10000, num_tasks) - range_slider.value = (int( - 100 - (100. 
* -num_tasks_box.value) / num_tasks), - 100) - else: - low, high = map(lambda x: x / 100., range_slider.value) - start_box.value = round(diff * low, 2) - end_box.value = round(diff * high, 2) - - # Event was triggered by a change in the start_box value. - elif event["owner"] == start_box: - if start_box.value > end_box.value: - start_box.value = end_box.value - elif start_box.value < 0: - start_box.value = 0 - low, high = range_slider.value - range_slider.value = (int((start_box.value * 100.) / diff), - high) - - # Event was triggered by a change in the end_box value. - elif event["owner"] == end_box: - if start_box.value > end_box.value: - end_box.value = start_box.value - elif end_box.value > diff: - end_box.value = diff - low, high = range_slider.value - range_slider.value = (low, - int((end_box.value * 100.) / diff)) - - # Event was triggered by a change in the breakdown options - # toggle. - elif event["owner"] == breakdown_opt: - if breakdown_opt.value == total_tasks_value: - start_box.disabled = True - end_box.disabled = True - num_tasks_box.disabled = False - total_time_box.layout.display = 'none' - - # Make CSS display go back to the default settings. - num_tasks_box.layout.display = None - num_tasks_box.value = min(10000, num_tasks) - range_slider.value = (int( - 100 - (100. * num_tasks_box.value) / num_tasks), - 100) - else: - start_box.disabled = False - end_box.disabled = False - num_tasks_box.disabled = True - - # Make CSS display go back to the default settings. - total_time_box.layout.display = None - num_tasks_box.layout.display = 'none' - range_slider.value = ( - int((start_box.value * 100.) / diff), - int((end_box.value * 100.) / diff)) - - # Event was triggered by a change in the range_slider - # value. 
- elif event["owner"] == range_slider: - low, high = map(lambda x: x / 100., range_slider.value) - if breakdown_opt.value == total_tasks_value: - old_low, old_high = event["old"] - new_low, new_high = event["new"] - if old_low != new_low: - range_slider.value = (new_low, 100) - num_tasks_box.value = ( - -(100. - new_low) / 100. * num_tasks) - else: - range_slider.value = (0, new_high) - num_tasks_box.value = new_high / 100. * num_tasks - else: - start_box.value = round(diff * low, 2) - end_box.value = round(diff * high, 2) - - # Event was triggered by a change in the num_tasks_box - # value. - elif event["owner"] == num_tasks_box: - if num_tasks_box.value > 0: - range_slider.value = ( - 0, int( - 100 * float(num_tasks_box.value) / num_tasks)) - elif num_tasks_box.value < 0: - range_slider.value = (100 + int( - 100 * float(num_tasks_box.value) / num_tasks), 100) - - # Get updated values from a slider or text box, and update the rest of - # them accordingly. - range_slider.observe(update_wrapper, names="value") - breakdown_opt.observe(update_wrapper, names="value") - start_box.observe(update_wrapper, names="value") - end_box.observe(update_wrapper, names="value") - num_tasks_box.observe(update_wrapper, names="value") - - # Initializes the sliders - update_wrapper(INIT_EVENT) - - # Display sliders and search boxes - display(breakdown_opt, - widgets.HBox([range_slider, total_time_box, num_tasks_box])) - - # Return the sliders and text boxes - return start_box, end_box, range_slider, breakdown_opt - - -def object_search_bar(): - object_search = widgets.Text( - value="", - placeholder="Object ID", - description="Search for an object:", - disabled=False) - display(object_search) - - def handle_submit(sender): - pp = pprint.PrettyPrinter() - pp.pprint(ray.global_state.object_table(object_search.value)) - - object_search.on_submit(handle_submit) - - -def task_search_bar(): - task_search = widgets.Text( - value="", - placeholder="Task ID", - description="Search for a task:", 
- disabled=False) - display(task_search) - - def handle_submit(sender): - pp = pprint.PrettyPrinter() - pp.pprint(ray.global_state.task_table(task_search.value)) - - task_search.on_submit(handle_submit) - - -# Hard limit on the number of tasks to return to the UI client at once -MAX_TASKS_TO_VISUALIZE = 10000 - - -# Helper function that guarantees unique and writeable temp files. -# Prevents clashes in task trace files when multiple notebooks are running. -def _get_temp_file_path(**kwargs): - temp_file = tempfile.NamedTemporaryFile( - delete=False, dir=os.getcwd(), **kwargs) - temp_file_path = temp_file.name - temp_file.close() - return os.path.relpath(temp_file_path) - - -def task_timeline(): - # Check that the trace viewer renderer file is present, and copy it to the - # current working directory if it is not present. - if not os.path.exists("trace_viewer_full.html"): - shutil.copy( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../core/src/catapult_files/trace_viewer_full.html"), - "trace_viewer_full.html") - - trace_viewer_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../core/src/catapult_files/index.html") - - html_file_path = _get_temp_file_path(suffix=".html") - json_file_path = _get_temp_file_path(suffix=".json") - - ray.global_state.chrome_tracing_dump(filename=json_file_path) - - with open(trace_viewer_path) as f: - data = f.read() - - # Replace the demo data path with our own - # https://github.com/catapult-project/catapult/blob/ - # 33a9271eb3cf5caf925293ec6a4b47c94f1ac968/tracing/bin/index.html#L107 - data = data.replace("../test_data/big_trace.json", json_file_path) - - with open(html_file_path, "w+") as f: - f.write(data) - - # Display the task trace within the Jupyter notebook - clear_output(wait=True) - logger.info("To view fullscreen, open chrome://tracing in Google Chrome " - "and load `{}`".format(os.path.abspath(json_file_path))) - display(IFrame(html_file_path, 900, 800)) - - -def 
object_transfer_timeline(): - # Check that the trace viewer renderer file is present, and copy it to the - # current working directory if it is not present. - if not os.path.exists("trace_viewer_full.html"): - shutil.copy( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../core/src/catapult_files/trace_viewer_full.html"), - "trace_viewer_full.html") - - trace_viewer_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../core/src/catapult_files/index.html") - - html_file_path = _get_temp_file_path(suffix=".html") - json_file_path = _get_temp_file_path(suffix=".json") - - ray.global_state.chrome_tracing_object_transfer_dump( - filename=json_file_path) - - with open(trace_viewer_path) as f: - data = f.read() - - # Replace the demo data path with our own - # https://github.com/catapult-project/catapult/blob/ - # 33a9271eb3cf5caf925293ec6a4b47c94f1ac968/tracing/bin/index.html#L107 - data = data.replace("../test_data/big_trace.json", json_file_path) - - with open(html_file_path, "w+") as f: - f.write(data) - - # Display the task trace within the Jupyter notebook - clear_output(wait=True) - logger.info("To view fullscreen, open chrome://tracing in Google Chrome " - "and load `{}`".format(os.path.abspath(json_file_path))) - display(IFrame(html_file_path, 900, 800)) - - -def task_completion_time_distribution(): - from bokeh.models import ColumnDataSource - from bokeh.layouts import gridplot - from bokeh.plotting import figure, show, helpers - from bokeh.io import output_notebook, push_notebook - from bokeh.resources import CDN - output_notebook(resources=CDN) - - # Create the Bokeh plot - p = figure( - title="Task Completion Time Distribution", - tools=["save", "hover", "wheel_zoom", "box_zoom", "pan"], - background_fill_color="#FFFFFF", - x_range=(0, 1), - y_range=(0, 1)) - - # Create the data source that the plot pulls from - source = ColumnDataSource(data={"top": [], "left": [], "right": []}) - - # Plot the histogram rectangles - 
p.quad( - top="top", - bottom=0, - left="left", - right="right", - source=source, - fill_color="#B3B3B3", - line_color="#033649") - - # Label the plot axes - p.xaxis.axis_label = "Duration in seconds" - p.yaxis.axis_label = "Number of tasks" - - handle = show( - gridplot( - p, - ncols=1, - plot_width=500, - plot_height=500, - toolbar_location="below"), - notebook_handle=True) - - # Function to update the plot - def task_completion_time_update(abs_earliest, abs_latest, abs_num_tasks, - tasks): - if len(tasks) == 0: - return - - # Create the distribution to plot - distr = [] - for task_id, data in tasks.items(): - distr.append(data["store_outputs_end"] - - data["get_arguments_start"]) - - # Create a histogram from the distribution - top, bin_edges = np.histogram(distr, bins="auto") - left = bin_edges[:-1] - right = bin_edges[1:] - - source.data = {"top": top, "left": left, "right": right} - - # Set the x and y ranges - x_range = (min(left) if len(left) else 0, max(right) - if len(right) else 1) - y_range = (0, max(top) + 1 if len(top) else 1) - - x_range = helpers._get_range(x_range) - p.x_range.start = x_range.start - p.x_range.end = x_range.end - - y_range = helpers._get_range(y_range) - p.y_range.start = y_range.start - p.y_range.end = y_range.end - - # Push updates to the plot - push_notebook(handle=handle) - - get_sliders(task_completion_time_update) - - -def compute_utilizations(abs_earliest, - abs_latest, - num_tasks, - tasks, - num_buckets, - use_abs_times=False): - if len(tasks) == 0: - return [], [], [] - - if use_abs_times: - earliest_time = abs_earliest - latest_time = abs_latest - else: - # Determine what the earliest and latest tasks are out of the ones - # that are passed in - earliest_time = time.time() - latest_time = 0 - for task_id, data in tasks.items(): - latest_time = max((latest_time, data["store_outputs_end"])) - earliest_time = min((earliest_time, data["get_arguments_start"])) - - # Add some epsilon to latest_time to ensure that the end time 
of the - # last task falls __within__ a bucket, and not on the edge - latest_time += 1e-6 - - # Compute average CPU utilization per time bucket by summing - # cpu-time per bucket - bucket_time_length = (latest_time - earliest_time) / float(num_buckets) - cpu_time = [0 for _ in range(num_buckets)] - - for data in tasks.values(): - task_start_time = data["get_arguments_start"] - task_end_time = data["store_outputs_end"] - - start_bucket = int( - (task_start_time - earliest_time) / bucket_time_length) - end_bucket = int((task_end_time - earliest_time) / bucket_time_length) - # Walk over each time bucket that this task intersects, adding the - # amount of time that the task intersects within each bucket - for bucket_idx in range(start_bucket, end_bucket + 1): - bucket_start_time = ( - (earliest_time + bucket_idx) * bucket_time_length) - bucket_end_time = ( - (earliest_time + (bucket_idx + 1)) * bucket_time_length) - - task_start_time_within_bucket = max(task_start_time, - bucket_start_time) - task_end_time_within_bucket = min(task_end_time, bucket_end_time) - task_cpu_time_within_bucket = ( - task_end_time_within_bucket - task_start_time_within_bucket) - - if bucket_idx > -1 and bucket_idx < num_buckets: - cpu_time[bucket_idx] += task_cpu_time_within_bucket - - # Cpu_utilization is the average cpu utilization of the bucket, which - # is just cpu_time divided by bucket_time_length. - cpu_utilization = list( - map(lambda x: x / float(bucket_time_length), cpu_time)) - - # Generate histogram bucket edges. Subtract out abs_earliest to get - # relative time. - all_edges = [ - earliest_time - abs_earliest + i * bucket_time_length - for i in range(num_buckets + 1) - ] - # Left edges are all but the rightmost edge, right edges are all but - # the leftmost edge. 
- left_edges = all_edges[:-1] - right_edges = all_edges[1:] - - return left_edges, right_edges, cpu_utilization - - -def cpu_usage(): - from bokeh.layouts import gridplot - from bokeh.plotting import figure, show, helpers - from bokeh.resources import CDN - from bokeh.io import output_notebook, push_notebook - from bokeh.models import ColumnDataSource - output_notebook(resources=CDN) - - # Parse the client table to determine how many CPUs are available - num_cpus = ray.global_state.cluster_resources()["CPU"] - - # Update the plot based on the sliders - def plot_utilization(): - # Create the Bokeh plot - time_series_fig = figure( - title="CPU Utilization", - tools=["save", "hover", "wheel_zoom", "box_zoom", "pan"], - background_fill_color="#FFFFFF", - x_range=[0, 1], - y_range=[0, 1]) - - # Create the data source that the plot will pull from - time_series_source = ColumnDataSource(data={ - 'left': [], - 'right': [], - 'top': [] - }) - - # Plot the rectangles representing the distribution - time_series_fig.quad( - left="left", - right="right", - top="top", - bottom=0, - source=time_series_source, - fill_color="#B3B3B3", - line_color="#033649") - - # Label the plot axes - time_series_fig.xaxis.axis_label = "Time in seconds" - time_series_fig.yaxis.axis_label = "Number of CPUs used" - - handle = show( - gridplot( - time_series_fig, - ncols=1, - plot_width=500, - plot_height=500, - toolbar_location="below"), - notebook_handle=True) - - def update_plot(abs_earliest, abs_latest, abs_num_tasks, tasks): - num_buckets = 100 - left, right, top = compute_utilizations( - abs_earliest, abs_latest, abs_num_tasks, tasks, num_buckets) - - time_series_source.data = { - "left": left, - "right": right, - "top": top - } - - x_range = (max(0, min(left)) if len(left) else 0, max(right) - if len(right) else 1) - y_range = (0, max(top) + 1 if len(top) else 1) - - # Define the axis ranges - x_range = helpers._get_range(x_range) - time_series_fig.x_range.start = x_range.start - 
time_series_fig.x_range.end = x_range.end - - y_range = helpers._get_range(y_range) - time_series_fig.y_range.start = y_range.start - time_series_fig.y_range.end = num_cpus - - # Push the updated data to the notebook - push_notebook(handle=handle) - - get_sliders(update_plot) - - plot_utilization() - - -# Function to create the cluster usage "heat map" -def cluster_usage(): - from bokeh.io import show, output_notebook, push_notebook - from bokeh.resources import CDN - from bokeh.plotting import figure - from bokeh.models import ( - ColumnDataSource, - HoverTool, - LinearColorMapper, - BasicTicker, - ColorBar, - ) - output_notebook(resources=CDN) - - # Initial values - source = ColumnDataSource( - data={ - "node_ip_address": ['127.0.0.1'], - "time": ['0.5'], - "num_tasks": ['1'], - "length": [1] - }) - - # Define the color schema - colors = [ - "#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce", "#ddb7b1", - "#cc7878", "#933b41", "#550b1d" - ] - mapper = LinearColorMapper(palette=colors, low=0, high=2) - - TOOLS = "hover, save, xpan, box_zoom, reset, xwheel_zoom" - - # Create the plot - p = figure( - title="Cluster Usage", - y_range=list(set(source.data['node_ip_address'])), - x_axis_location="above", - plot_width=900, - plot_height=500, - tools=TOOLS, - toolbar_location='below') - - # Format the plot axes - p.grid.grid_line_color = None - p.axis.axis_line_color = None - p.axis.major_tick_line_color = None - p.axis.major_label_text_font_size = "10pt" - p.axis.major_label_standoff = 0 - p.xaxis.major_label_orientation = np.pi / 3 - - # Plot rectangles - p.rect( - x="time", - y="node_ip_address", - width="length", - height=1, - source=source, - fill_color={ - "field": "num_tasks", - "transform": mapper - }, - line_color=None) - - # Add legend to the side of the plot - color_bar = ColorBar( - color_mapper=mapper, - major_label_text_font_size="8pt", - ticker=BasicTicker(desired_num_ticks=len(colors)), - label_standoff=6, - border_line_color=None, - location=(0, 0)) - 
p.add_layout(color_bar, "right") - - # Define hover tool - p.select_one(HoverTool).tooltips = [("Node IP Address", - "@node_ip_address"), - ("Number of tasks running", - "@num_tasks"), ("Time", "@time")] - - # Define the axis labels - p.xaxis.axis_label = "Time in seconds" - p.yaxis.axis_label = "Node IP Address" - handle = show(p, notebook_handle=True) - workers = ray.global_state.workers() - - # Function to update the heat map - def heat_map_update(abs_earliest, abs_latest, abs_num_tasks, tasks): - if len(tasks) == 0: - return - - earliest = time.time() - latest = 0 - - node_to_tasks = {} - # Determine which task has the earlest start time out of the ones - # passed into the update function - for task_id, data in tasks.items(): - if data["score"] > latest: - latest = data["score"] - if data["score"] < earliest: - earliest = data["score"] - worker_id = data["worker_id"] - node_ip = workers[worker_id]["node_ip_address"] - if node_ip not in node_to_tasks: - node_to_tasks[node_ip] = {} - node_to_tasks[node_ip][task_id] = data - - nodes = [] - times = [] - lengths = [] - num_tasks = [] - - for node_ip, task_dict in node_to_tasks.items(): - left, right, top = compute_utilizations( - earliest, latest, abs_num_tasks, task_dict, 100, True) - for (l, r, t) in zip(left, right, top): - nodes.append(node_ip) - times.append((l + r) / 2) - lengths.append(r - l) - num_tasks.append(t) - - # Set the y range of the plot to be the node IP addresses - p.y_range.factors = list(set(nodes)) - - mapper.low = min(min(num_tasks), 0) - mapper.high = max(max(num_tasks), 1) - - # Update plot with new data based on slider and text box values - source.data = { - "node_ip_address": nodes, - "time": times, - "num_tasks": num_tasks, - "length": lengths - } - - push_notebook(handle=handle) - - get_sliders(heat_map_update) diff --git a/python/ray/includes/libraylet.pxd b/python/ray/includes/libraylet.pxd index 1a4ffb250235..a496c5b83783 100644 --- a/python/ray/includes/libraylet.pxd +++ 
b/python/ray/includes/libraylet.pxd @@ -62,7 +62,7 @@ cdef extern from "ray/raylet/raylet_client.h" nogil: int num_returns, int64_t timeout_milliseconds, c_bool wait_local, const CTaskID ¤t_task_id, WaitResultPair *result) - CRayStatus PushError(const CDriverID &job_id, const c_string &type, + CRayStatus PushError(const CDriverID &driver_id, const c_string &type, const c_string &error_message, double timestamp) CRayStatus PushProfileEvents( const GCSProfileTableDataT &profile_events) diff --git a/python/ray/includes/task.pxi b/python/ray/includes/task.pxi index a7cfc684b9d0..872b93d22269 100644 --- a/python/ray/includes/task.pxi +++ b/python/ray/includes/task.pxi @@ -54,7 +54,7 @@ cdef class Task: for arg in arguments: if isinstance(arg, ObjectID): references = c_vector[CObjectID]() - references.push_back((arg).data) + references.push_back((arg).native()) task_args.push_back( static_pointer_cast[CTaskArgument, CTaskArgumentByReference]( @@ -71,23 +71,21 @@ cdef class Task: for new_actor_handle in new_actor_handles: task_new_actor_handles.push_back( - (new_actor_handle).data) + (new_actor_handle).native()) self.task_spec.reset(new CTaskSpecification( - CUniqueID(driver_id.data), parent_task_id.data, parent_counter, - actor_creation_id.data, actor_creation_dummy_object_id.data, - max_actor_reconstructions, CUniqueID(actor_id.data), - CUniqueID(actor_handle_id.data), actor_counter, - task_new_actor_handles, task_args, num_returns, - required_resources, required_placement_resources, - LANGUAGE_PYTHON, c_function_descriptor)) + driver_id.native(), parent_task_id.native(), parent_counter, actor_creation_id.native(), + actor_creation_dummy_object_id.native(), max_actor_reconstructions, actor_id.native(), + actor_handle_id.native(), actor_counter, task_new_actor_handles, task_args, num_returns, + required_resources, required_placement_resources, LANGUAGE_PYTHON, + c_function_descriptor)) # Set the task's execution dependencies. 
self.execution_dependencies.reset(new c_vector[CObjectID]()) if execution_arguments is not None: for execution_arg in execution_arguments: self.execution_dependencies.get().push_back( - (execution_arg).data) + (execution_arg).native()) @staticmethod cdef make(unique_ptr[CTaskSpecification]& task_spec): diff --git a/python/ray/includes/unique_ids.pxd b/python/ray/includes/unique_ids.pxd index fc36f97766c1..cadbdfea2827 100644 --- a/python/ray/includes/unique_ids.pxd +++ b/python/ray/includes/unique_ids.pxd @@ -5,13 +5,14 @@ from libc.stdint cimport uint8_t cdef extern from "ray/id.h" namespace "ray" nogil: cdef cppclass CUniqueID "ray::UniqueID": CUniqueID() + CUniqueID(const c_string &binary) CUniqueID(const CUniqueID &from_id) @staticmethod CUniqueID from_random() @staticmethod - CUniqueID from_binary(const c_string & binary) + CUniqueID from_binary(const c_string &binary) @staticmethod const CUniqueID nil() @@ -26,14 +27,73 @@ cdef extern from "ray/id.h" namespace "ray" nogil: c_string binary() const c_string hex() const -ctypedef CUniqueID CActorCheckpointID -ctypedef CUniqueID CActorClassID -ctypedef CUniqueID CActorHandleID -ctypedef CUniqueID CActorID -ctypedef CUniqueID CClientID -ctypedef CUniqueID CConfigID -ctypedef CUniqueID CDriverID -ctypedef CUniqueID CFunctionID -ctypedef CUniqueID CObjectID -ctypedef CUniqueID CTaskID -ctypedef CUniqueID CWorkerID + cdef cppclass CActorCheckpointID "ray::ActorCheckpointID"(CUniqueID): + + @staticmethod + CActorCheckpointID from_binary(const c_string &binary) + + + cdef cppclass CActorClassID "ray::ActorClassID"(CUniqueID): + + @staticmethod + CActorClassID from_binary(const c_string &binary) + + + cdef cppclass CActorID "ray::ActorID"(CUniqueID): + + @staticmethod + CActorID from_binary(const c_string &binary) + + + cdef cppclass CActorHandleID "ray::ActorHandleID"(CUniqueID): + + @staticmethod + CActorHandleID from_binary(const c_string &binary) + + + cdef cppclass CClientID "ray::ClientID"(CUniqueID): + + 
@staticmethod + CClientID from_binary(const c_string &binary) + + + cdef cppclass CConfigID "ray::ConfigID"(CUniqueID): + + @staticmethod + CConfigID from_binary(const c_string &binary) + + + cdef cppclass CFunctionID "ray::FunctionID"(CUniqueID): + + @staticmethod + CFunctionID from_binary(const c_string &binary) + + + cdef cppclass CDriverID "ray::DriverID"(CUniqueID): + + @staticmethod + CDriverID from_binary(const c_string &binary) + + + cdef cppclass CJobID "ray::JobID"(CUniqueID): + + @staticmethod + CJobID from_binary(const c_string &binary) + + + cdef cppclass CTaskID "ray::TaskID"(CUniqueID): + + @staticmethod + CTaskID from_binary(const c_string &binary) + + + cdef cppclass CObjectID" ray::ObjectID"(CUniqueID): + + @staticmethod + CObjectID from_binary(const c_string &binary) + + + cdef cppclass CWorkerID "ray::WorkerID"(CUniqueID): + + @staticmethod + CWorkerID from_binary(const c_string &binary) diff --git a/python/ray/includes/unique_ids.pxi b/python/ray/includes/unique_ids.pxi index 670579737d7c..0086f76b51b0 100644 --- a/python/ray/includes/unique_ids.pxi +++ b/python/ray/includes/unique_ids.pxi @@ -19,6 +19,7 @@ from ray.includes.unique_ids cimport ( CConfigID, CDriverID, CFunctionID, + CJobID, CObjectID, CTaskID, CUniqueID, @@ -45,11 +46,8 @@ cdef class UniqueID: cdef CUniqueID data def __init__(self, id): - if not id: - self.data = CUniqueID() - else: - check_id(id) - self.data = CUniqueID.from_binary(id) + check_id(id) + self.data = CUniqueID.from_binary(id) @classmethod def from_binary(cls, id_bytes): @@ -59,7 +57,7 @@ cdef class UniqueID: @classmethod def nil(cls): - return cls(b"") + return cls(CUniqueID.nil().binary()) def __hash__(self): return self.data.hash() @@ -106,40 +104,93 @@ cdef class UniqueID: cdef class ObjectID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CObjectID.from_binary(id) + + cdef CObjectID native(self): + return self.data cdef class TaskID(UniqueID): - pass + + def __init__(self, id): + 
check_id(id) + self.data = CTaskID.from_binary(id) + + cdef CTaskID native(self): + return self.data cdef class ClientID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CClientID.from_binary(id) + + cdef CClientID native(self): + return self.data cdef class DriverID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CDriverID.from_binary(id) + + cdef CDriverID native(self): + return self.data cdef class ActorID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CActorID.from_binary(id) + + cdef CActorID native(self): + return self.data cdef class ActorHandleID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CActorHandleID.from_binary(id) + + cdef CActorHandleID native(self): + return self.data cdef class ActorCheckpointID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CActorCheckpointID.from_binary(id) + + cdef CActorCheckpointID native(self): + return self.data cdef class FunctionID(UniqueID): - pass + + def __init__(self, id): + check_id(id) + self.data = CFunctionID.from_binary(id) + + cdef CFunctionID native(self): + return self.data cdef class ActorClassID(UniqueID): - pass + def __init__(self, id): + check_id(id) + self.data = CActorClassID.from_binary(id) + + cdef CActorClassID native(self): + return self.data _ID_TYPES = [ ActorCheckpointID, diff --git a/python/ray/node.py b/python/ray/node.py index 5f3d89be6b09..c3d0e0866872 100644 --- a/python/ray/node.py +++ b/python/ray/node.py @@ -319,22 +319,6 @@ def start_dashboard(self): redis_client = self.create_redis_client() redis_client.hmset("webui", {"url": self._webui_url}) - def start_ui(self): - """Start the web UI.""" - stdout_file, stderr_file = self.new_log_files("webui") - notebook_name = self._make_inc_temp( - suffix=".ipynb", prefix="ray_ui", directory_name=self._temp_dir) - _, process_info = ray.services.start_ui( - self._redis_address, - notebook_name, - 
stdout_file=stdout_file, - stderr_file=stderr_file) - assert ray_constants.PROCESS_TYPE_WEB_UI not in self.all_processes - if process_info is not None: - self.all_processes[ray_constants.PROCESS_TYPE_WEB_UI] = [ - process_info - ] - def start_plasma_store(self): """Start the plasma store.""" assert self._plasma_store_socket_name is None diff --git a/python/ray/parameter.py b/python/ray/parameter.py index 016ddcef4c5c..33c1322a2da3 100644 --- a/python/ray/parameter.py +++ b/python/ray/parameter.py @@ -56,7 +56,7 @@ class RayParams(object): huge_pages: Boolean flag indicating whether to start the Object Store with hugetlbfs support. Requires plasma_directory. include_webui: Boolean flag indicating whether to start the web - UI, which is a Jupyter notebook. + UI, which displays the status of the Ray cluster. logging_level: Logging level, default will be logging.INFO. logging_format: Logging format, default contains a timestamp, filename, line number, and message. See ray_constants.py. diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py index d07b97e7ea47..dff3c7801b3f 100644 --- a/python/ray/remote_function.py +++ b/python/ray/remote_function.py @@ -107,6 +107,7 @@ def _remote(self, worker.function_actor_manager.export(self) kwargs = {} if kwargs is None else kwargs + args = [] if args is None else args args = ray.signature.extend_args(self._function_signature, args, kwargs) diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index 882c64031388..a5a91abb5962 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -50,14 +50,15 @@ def __init__(self, observation_space, action_space, config): tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) - prev_actions = ModelCatalog.get_action_placeholder(action_space) - prev_rewards = 
tf.placeholder(tf.float32, [None], name="prev_reward") + self.prev_actions = ModelCatalog.get_action_placeholder(action_space) + self.prev_rewards = tf.placeholder( + tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model({ "obs": self.observations, - "prev_actions": prev_actions, - "prev_rewards": prev_rewards, + "prev_actions": self.prev_actions, + "prev_rewards": self.prev_rewards, "is_training": self._get_is_training_placeholder(), - }, observation_space, logit_dim, self.config["model"]) + }, observation_space, action_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, @@ -83,8 +84,8 @@ def __init__(self, observation_space, action_space, config): loss_in = [ ("obs", self.observations), ("actions", actions), - ("prev_actions", prev_actions), - ("prev_rewards", prev_rewards), + ("prev_actions", self.prev_actions), + ("prev_rewards", self.prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] @@ -103,8 +104,8 @@ def __init__(self, observation_space, action_space, config): loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, - prev_action_input=prev_actions, - prev_reward_input=prev_rewards, + prev_action_input=self.prev_actions, + prev_reward_input=self.prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) @@ -138,7 +139,9 @@ def postprocess_trajectory(self, next_state = [] for i in range(len(self.model.state_in)): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = self._value(sample_batch["new_obs"][-1], *next_state) + last_r = self._value(sample_batch["new_obs"][-1], + sample_batch["actions"][-1], + sample_batch["rewards"][-1], *next_state) return compute_advantages(sample_batch, last_r, self.config["gamma"], self.config["lambda"]) @@ -159,8 +162,13 @@ def 
extra_compute_action_fetches(self): TFPolicyGraph.extra_compute_action_fetches(self), **{"vf_preds": self.vf}) - def _value(self, ob, *args): - feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + def _value(self, ob, prev_action, prev_reward, *args): + feed_dict = { + self.observations: [ob], + self.prev_actions: [prev_action], + self.prev_rewards: [prev_reward], + self.model.seq_lens: [1] + } assert len(args) == len(self.model.state_in), \ (args, self.model.state_in) for k, v in zip(self.model.state_in, args): diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 829c020f5ec6..235d0f704b1f 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -134,6 +134,9 @@ # remote processes instead of in the same worker. This adds overheads, but # can make sense if your envs are very CPU intensive (e.g., for StarCraft). "remote_worker_envs": False, + # Similar to remote_worker_envs, but runs the envs asynchronously in the + # background for greater efficiency. Conflicts with remote_worker_envs. + "async_remote_worker_envs": False, # === Offline Datasets === # __sphinx_doc_input_begin__ @@ -473,9 +476,7 @@ def make_local_evaluator(self, "tf_session_args": self. 
config["local_evaluator_tf_session_args"] }), - extra_config or {}), - remote_worker_envs=False, - ) + extra_config or {})) @DeveloperAPI def make_remote_evaluators(self, env_creator, policy_graph, count): @@ -490,14 +491,8 @@ def make_remote_evaluators(self, env_creator, policy_graph, count): cls = PolicyEvaluator.as_remote(**remote_args).remote return [ - self._make_evaluator( - cls, - env_creator, - policy_graph, - i + 1, - self.config, - remote_worker_envs=self.config["remote_worker_envs"]) - for i in range(count) + self._make_evaluator(cls, env_creator, policy_graph, i + 1, + self.config) for i in range(count) ] @DeveloperAPI @@ -563,13 +558,8 @@ def _validate_config(config): "`input_evaluation` must be a list of strings, got {}".format( config["input_evaluation"])) - def _make_evaluator(self, - cls, - env_creator, - policy_graph, - worker_index, - config, - remote_worker_envs=False): + def _make_evaluator(self, cls, env_creator, policy_graph, worker_index, + config): def session_creator(): logger.debug("Creating TF session {}".format( config["tf_session_args"])) @@ -639,7 +629,8 @@ def session_creator(): input_creator=input_creator, input_evaluation=input_evaluation, output_creator=output_creator, - remote_worker_envs=remote_worker_envs) + remote_worker_envs=config["remote_worker_envs"], + async_remote_worker_envs=config["async_remote_worker_envs"]) @override(Trainable) def _export_model(self, export_formats, export_dir): diff --git a/python/ray/rllib/agents/ars/policies.py b/python/ray/rllib/agents/ars/policies.py index 7c4defd6908c..fe82be5b65dd 100644 --- a/python/ray/rllib/agents/ars/policies.py +++ b/python/ray/rllib/agents/ars/policies.py @@ -78,7 +78,7 @@ def __init__(self, model = ModelCatalog.get_model({ "obs": self.inputs - }, obs_space, dist_dim, model_config) + }, obs_space, action_space, dist_dim, model_config) dist = dist_class(model.outputs) self.sampler = dist.sample() diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py 
b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 439671e93291..c329a8b64ee3 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -217,7 +217,7 @@ def __init__(self, observation_space, action_space, config): # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: p_values, self.p_model = self._build_p_network( - self.cur_observations, observation_space) + self.cur_observations, observation_space, action_space) self.p_func_vars = _scope_vars(scope.name) # Noise vars for P network except for layer normalization vars @@ -256,14 +256,16 @@ def __init__(self, observation_space, action_space, config): # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - self.p_t, _ = self._build_p_network(self.obs_t, observation_space) + self.p_t, _ = self._build_p_network(self.obs_t, observation_space, + action_space) p_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: - p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space) + p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space, + action_space) target_p_func_vars = _scope_vars(scope.name) # Action outputs @@ -283,7 +285,7 @@ def __init__(self, observation_space, action_space, config): prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: q_t, self.q_model = self._build_q_network( - self.obs_t, observation_space, self.act_t) + self.obs_t, observation_space, action_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) self.stats = { "mean_q": tf.reduce_mean(q_t), @@ -292,11 +294,11 @@ def __init__(self, observation_space, action_space, config): } with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0, _ = self._build_q_network(self.obs_t, 
observation_space, - output_actions) + action_space, output_actions) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( - self.obs_t, observation_space, self.act_t) + self.obs_t, observation_space, action_space, self.act_t) self.twin_q_func_vars = _scope_vars(scope.name) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) @@ -304,12 +306,14 @@ def __init__(self, observation_space, action_space, config): # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, + action_space, output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network( - self.obs_tp1, observation_space, output_actions_estimated) + self.obs_tp1, observation_space, action_space, + output_actions_estimated) twin_target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: @@ -492,23 +496,23 @@ def set_state(self, state): TFPolicyGraph.set_state(self, state[0]) self.set_epsilon(state[1]) - def _build_q_network(self, obs, obs_space, actions): + def _build_q_network(self, obs, obs_space, action_space, actions): q_net = QNetwork( ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), - }, obs_space, 1, self.config["model"]), actions, + }, obs_space, action_space, 1, self.config["model"]), actions, self.config["critic_hiddens"], self.config["critic_hidden_activation"]) return q_net.value, q_net.model - def _build_p_network(self, obs, obs_space): + def _build_p_network(self, obs, obs_space, action_space): policy_net = PNetwork( ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), - }, obs_space, 1, self.config["model"]), self.dim_actions, - self.config["actor_hiddens"], + }, obs_space, action_space, 
1, self.config["model"]), + self.dim_actions, self.config["actor_hiddens"], self.config["actor_hidden_activation"], self.config["parameter_noise"]) return policy_net.action_scores, policy_net.model diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index 31f7f12cc31e..9c51e565315f 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -12,6 +12,7 @@ from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule +from ray.tune.trial import Resources logger = logging.getLogger(__name__) @@ -141,6 +142,21 @@ class DQNAgent(Agent): _policy_graph = DQNPolicyGraph _optimizer_shared_configs = OPTIMIZER_SHARED_CONFIGS + @classmethod + @override(Agent) + def default_resource_request(cls, config): + cf = dict(cls._default_config, **config) + Agent._validate_config(cf) + if cf["optimizer_class"] == "AsyncReplayOptimizer": + extra = cf["optimizer"]["num_replay_buffer_shards"] + else: + extra = 0 + return Resources( + cpu=cf["num_cpus_for_driver"], + gpu=cf["num_gpus"], + extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] + extra, + extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"]) + @override(Agent) def _init(self): self._validate_config() diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index d422b0f3cef2..686e09312853 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -312,7 +312,7 @@ def __init__(self, observation_space, action_space, config): # Action Q network with tf.variable_scope(Q_SCOPE) as scope: q_values, q_logits, q_dist, _ = self._build_q_network( - self.cur_observations, observation_space) + self.cur_observations, observation_space, action_space) self.q_values = q_values self.q_func_vars = _scope_vars(scope.name) @@ -342,7 +342,7 @@ def 
__init__(self, observation_space, action_space, config): with tf.variable_scope(Q_SCOPE, reuse=True): prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) q_t, q_logits_t, q_dist_t, model = self._build_q_network( - self.obs_t, observation_space) + self.obs_t, observation_space, action_space) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) @@ -350,7 +350,7 @@ def __init__(self, observation_space, action_space, config): # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, q_logits_tp1, q_dist_tp1, _ = self._build_q_network( - self.obs_tp1, observation_space) + self.obs_tp1, observation_space, action_space) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. @@ -364,7 +364,7 @@ def __init__(self, observation_space, action_space, config): with tf.variable_scope(Q_SCOPE, reuse=True): q_tp1_using_online_net, q_logits_tp1_using_online_net, \ q_dist_tp1_using_online_net, _ = self._build_q_network( - self.obs_tp1, observation_space) + self.obs_tp1, observation_space, action_space) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best_one_hot_selection = tf.one_hot( q_tp1_best_using_online_net, self.num_actions) @@ -556,13 +556,14 @@ def update_target(self): def set_epsilon(self, epsilon): self.cur_epsilon = epsilon - def _build_q_network(self, obs, space): + def _build_q_network(self, obs, obs_space, action_space): qnet = QNetwork( ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), - }, space, self.num_actions, self.config["model"]), - self.num_actions, self.config["dueling"], self.config["hiddens"], + }, obs_space, action_space, self.num_actions, + self.config["model"]), self.num_actions, + self.config["dueling"], self.config["hiddens"], self.config["noisy"], self.config["num_atoms"], self.config["v_min"], self.config["v_max"], self.config["sigma0"], 
self.config["parameter_noise"]) diff --git a/python/ray/rllib/agents/es/policies.py b/python/ray/rllib/agents/es/policies.py index 61f748ce007f..78ff29da4f86 100644 --- a/python/ray/rllib/agents/es/policies.py +++ b/python/ray/rllib/agents/es/policies.py @@ -56,7 +56,7 @@ def __init__(self, sess, action_space, obs_space, preprocessor, self.action_space, model_options, dist_type="deterministic") model = ModelCatalog.get_model({ "obs": self.inputs - }, obs_space, dist_dim, model_options) + }, obs_space, action_space, dist_dim, model_options) dist = dist_class(model.outputs) self.sampler = dist.sample() diff --git a/python/ray/rllib/agents/impala/vtrace.py b/python/ray/rllib/agents/impala/vtrace.py index 4031ee4c8d45..ac5abf0e6592 100644 --- a/python/ray/rllib/agents/impala/vtrace.py +++ b/python/ray/rllib/agents/impala/vtrace.py @@ -20,12 +20,6 @@ by Espeholt, Soyer, Munos et al. See https://arxiv.org/abs/1802.01561 for the full paper. - -In addition to the original paper's code, changes have been made -to support MultiDiscrete action spaces. behaviour_policy_logits, -target_policy_logits and actions parameters in the entry point -multi_from_logits method accepts lists of tensors instead of just -tensors. """ from __future__ import absolute_import @@ -47,48 +41,29 @@ def log_probs_from_logits_and_actions(policy_logits, actions): - return multi_log_probs_from_logits_and_actions([policy_logits], - [actions])[0] - - -def multi_log_probs_from_logits_and_actions(policy_logits, actions): """Computes action log-probs from policy logits and actions. In the notation used throughout documentation and comments, T refers to the time dimension ranging from 0 to T-1. B refers to the batch size and - ACTION_SPACE refers to the list of numbers each representing a number of - actions. + NUM_ACTIONS refers to the number of actions. 
Args: - policy_logits: A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B, ACTION_SPACE[0]], - ..., - [T, B, ACTION_SPACE[-1]] - with un-normalized log-probabilities parameterizing a softmax policy. - actions: A list with length of ACTION_SPACE of int32 - tensors of shapes - [T, B], - ..., - [T, B] - with actions. + policy_logits: A float32 tensor of shape [T, B, NUM_ACTIONS] with + un-normalized log-probabilities parameterizing a softmax policy. + actions: An int32 tensor of shape [T, B] with actions. Returns: - A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B], - ..., - [T, B] - corresponding to the sampling log probability - of the chosen action w.r.t. the policy. + A float32 tensor of shape [T, B] corresponding to the sampling log + probability of the chosen action w.r.t. the policy. """ + policy_logits = tf.convert_to_tensor(policy_logits, dtype=tf.float32) + actions = tf.convert_to_tensor(actions, dtype=tf.int32) - log_probs = [] - for i in range(len(policy_logits)): - log_probs.append(-tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=policy_logits[i], labels=actions[i])) + policy_logits.shape.assert_has_rank(3) + actions.shape.assert_has_rank(2) - return log_probs + return -tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=policy_logits, labels=actions) def from_logits(behaviour_policy_logits, @@ -101,39 +76,6 @@ def from_logits(behaviour_policy_logits, clip_rho_threshold=1.0, clip_pg_rho_threshold=1.0, name='vtrace_from_logits'): - """multi_from_logits wrapper used only for tests""" - - res = multi_from_logits( - [behaviour_policy_logits], [target_policy_logits], [actions], - discounts, - rewards, - values, - bootstrap_value, - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold, - name=name) - - return VTraceFromLogitsReturns( - vs=res.vs, - pg_advantages=res.pg_advantages, - log_rhos=res.log_rhos, - behaviour_action_log_probs=tf.squeeze( - 
res.behaviour_action_log_probs, axis=0), - target_action_log_probs=tf.squeeze( - res.target_action_log_probs, axis=0), - ) - - -def multi_from_logits(behaviour_policy_logits, - target_policy_logits, - actions, - discounts, - rewards, - values, - bootstrap_value, - clip_rho_threshold=1.0, - clip_pg_rho_threshold=1.0, - name='vtrace_from_logits'): r"""V-trace for softmax policies. Calculates V-trace actor critic targets for softmax polices as described in @@ -148,30 +90,16 @@ def multi_from_logits(behaviour_policy_logits, In the notation used throughout documentation and comments, T refers to the time dimension ranging from 0 to T-1. B refers to the batch size and - ACTION_SPACE refers to the list of numbers each representing a number of - actions. + NUM_ACTIONS refers to the number of actions. Args: - behaviour_policy_logits: A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B, ACTION_SPACE[0]], - ..., - [T, B, ACTION_SPACE[-1]] - with un-normalized log-probabilities parameterizing the softmax behaviour - policy. - target_policy_logits: A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B, ACTION_SPACE[0]], - ..., - [T, B, ACTION_SPACE[-1]] - with un-normalized log-probabilities parameterizing the softmax target + behaviour_policy_logits: A float32 tensor of shape [T, B, NUM_ACTIONS] with + un-normalized log-probabilities parametrizing the softmax behaviour policy. - actions: A list with length of ACTION_SPACE of int32 - tensors of shapes - [T, B], - ..., - [T, B] - with actions sampled from the behaviour policy. + target_policy_logits: A float32 tensor of shape [T, B, NUM_ACTIONS] with + un-normalized log-probabilities parametrizing the softmax target policy. + actions: An int32 tensor of shape [T, B] of actions sampled from the + behaviour policy. discounts: A float32 tensor of shape [T, B] with the discount encountered when following the behaviour policy. 
rewards: A float32 tensor of shape [T, B] with the rewards generated by @@ -200,19 +128,17 @@ def multi_from_logits(behaviour_policy_logits, target_action_log_probs: A float32 tensor of shape [T, B] containing target policy action probabilities (log \pi(a_t)). """ - - for i in range(len(behaviour_policy_logits)): - behaviour_policy_logits[i] = tf.convert_to_tensor( - behaviour_policy_logits[i], dtype=tf.float32) - target_policy_logits[i] = tf.convert_to_tensor( - target_policy_logits[i], dtype=tf.float32) - actions[i] = tf.convert_to_tensor(actions[i], dtype=tf.int32) - - # Make sure tensor ranks are as expected. - # The rest will be checked by from_action_log_probs. - behaviour_policy_logits[i].shape.assert_has_rank(3) - target_policy_logits[i].shape.assert_has_rank(3) - actions[i].shape.assert_has_rank(2) + behaviour_policy_logits = tf.convert_to_tensor( + behaviour_policy_logits, dtype=tf.float32) + target_policy_logits = tf.convert_to_tensor( + target_policy_logits, dtype=tf.float32) + actions = tf.convert_to_tensor(actions, dtype=tf.int32) + + # Make sure tensor ranks are as expected. + # The rest will be checked by from_action_log_probs. 
+ behaviour_policy_logits.shape.assert_has_rank(3) + target_policy_logits.shape.assert_has_rank(3) + actions.shape.assert_has_rank(2) with tf.name_scope( name, @@ -220,14 +146,11 @@ def multi_from_logits(behaviour_policy_logits, behaviour_policy_logits, target_policy_logits, actions, discounts, rewards, values, bootstrap_value ]): - target_action_log_probs = multi_log_probs_from_logits_and_actions( + target_action_log_probs = log_probs_from_logits_and_actions( target_policy_logits, actions) - behaviour_action_log_probs = multi_log_probs_from_logits_and_actions( + behaviour_action_log_probs = log_probs_from_logits_and_actions( behaviour_policy_logits, actions) - - log_rhos = get_log_rhos(target_action_log_probs, - behaviour_action_log_probs) - + log_rhos = target_action_log_probs - behaviour_action_log_probs vtrace_returns = from_importance_weights( log_rhos=log_rhos, discounts=discounts, @@ -236,7 +159,6 @@ def multi_from_logits(behaviour_policy_logits, bootstrap_value=bootstrap_value, clip_rho_threshold=clip_rho_threshold, clip_pg_rho_threshold=clip_pg_rho_threshold) - return VTraceFromLogitsReturns( log_rhos=log_rhos, behaviour_action_log_probs=behaviour_action_log_probs, @@ -261,13 +183,13 @@ def from_importance_weights(log_rhos, by Espeholt, Soyer, Munos et al. In the notation used throughout documentation and comments, T refers to the - time dimension ranging from 0 to T-1. B refers to the batch size. This code - also supports the case where all tensors have the same number of additional - dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C], - `bootstrap_value` is [B, C]. + time dimension ranging from 0 to T-1. B refers to the batch size and + NUM_ACTIONS refers to the number of actions. This code also supports the + case where all tensors have the same number of additional dimensions, e.g., + `rewards` is [T, B, C], `values` is [T, B, C], `bootstrap_value` is [B, C]. 
Args: - log_rhos: A float32 tensor of shape [T, B] representing the + log_rhos: A float32 tensor of shape [T, B, NUM_ACTIONS] representing the log importance sampling weights, i.e. log(target_policy(a) / behaviour_policy(a)). V-trace performs operations on rhos in log-space for numerical stability. @@ -324,14 +246,6 @@ def from_importance_weights(log_rhos, if clip_rho_threshold is not None: clipped_rhos = tf.minimum( clip_rho_threshold, rhos, name='clipped_rhos') - - tf.summary.histogram('clipped_rhos_1000', tf.minimum(1000.0, rhos)) - tf.summary.scalar( - 'num_of_clipped_rhos', - tf.reduce_sum( - tf.cast( - tf.equal(clipped_rhos, clip_rho_threshold), tf.int32))) - tf.summary.scalar('size_of_clipped_rhos', tf.size(clipped_rhos)) else: clipped_rhos = rhos @@ -384,16 +298,3 @@ def scanfunc(acc, sequence_item): return VTraceReturns( vs=tf.stop_gradient(vs), pg_advantages=tf.stop_gradient(pg_advantages)) - - -def get_log_rhos(behaviour_action_log_probs, target_action_log_probs): - """With the selected log_probs for multi-discrete actions of behaviour - and target policies we compute the log_rhos for calculating the vtrace.""" - log_rhos = [ - t - b - for t, b in zip(target_action_log_probs, behaviour_action_log_probs) - ] - log_rhos = [tf.convert_to_tensor(l, dtype=tf.float32) for l in log_rhos] - log_rhos = tf.reduce_sum(tf.stack(log_rhos), axis=0) - - return log_rhos diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index 7e888cf8589e..7f36e78f75c8 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -6,19 +6,19 @@ from __future__ import division from __future__ import print_function +import tensorflow as tf import gym + import ray -import numpy as np -import tensorflow as tf from ray.rllib.agents.impala import vtrace from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.tf_policy_graph 
import TFPolicyGraph, \ LearningRateSchedule -from ray.rllib.models.action_dist import MultiCategorical from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.annotations import override from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.explained_variance import explained_variance +from ray.rllib.models.action_dist import Categorical class VTraceLoss(object): @@ -45,20 +45,12 @@ def __init__(self, handle episode cut boundaries. Args: - actions: An int32 tensor of shape [T, B, ACTION_SPACE]. + actions: An int32 tensor of shape [T, B, NUM_ACTIONS]. actions_logp: A float32 tensor of shape [T, B]. actions_entropy: A float32 tensor of shape [T, B]. dones: A bool tensor of shape [T, B]. - behaviour_logits: A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B, ACTION_SPACE[0]], - ..., - [T, B, ACTION_SPACE[-1]] - target_logits: A list with length of ACTION_SPACE of float32 - tensors of shapes - [T, B, ACTION_SPACE[0]], - ..., - [T, B, ACTION_SPACE[-1]] + behaviour_logits: A float32 tensor of shape [T, B, NUM_ACTIONS]. + target_logits: A float32 tensor of shape [T, B, NUM_ACTIONS]. discount: A float32 scalar. rewards: A float32 tensor of shape [T, B]. values: A float32 tensor of shape [T, B]. @@ -68,10 +60,10 @@ def __init__(self, # Compute vtrace on the CPU for better perf. with tf.device("/cpu:0"): - self.vtrace_returns = vtrace.multi_from_logits( + self.vtrace_returns = vtrace.from_logits( behaviour_policy_logits=behaviour_logits, target_policy_logits=target_logits, - actions=tf.unstack(tf.cast(actions, tf.int32), axis=2), + actions=tf.cast(actions, tf.int32), discounts=tf.to_float(~dones) * discount, rewards=rewards, values=values, @@ -109,20 +101,6 @@ def __init__(self, "Must use `truncate_episodes` batch mode with V-trace." 
self.config = config self.sess = tf.get_default_session() - self.grads = None - - if isinstance(action_space, gym.spaces.Discrete): - is_multidiscrete = False - actions_shape = [None] - output_hidden_shape = [action_space.n] - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - is_multidiscrete = True - actions_shape = [None, len(action_space.nvec)] - output_hidden_shape = action_space.nvec.astype(np.int32) - else: - raise UnsupportedSpaceException( - "Action space {} is not supported for IMPALA.".format( - action_space)) # Create input placeholders if existing_inputs: @@ -131,21 +109,22 @@ def __init__(self, existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: - actions = tf.placeholder(tf.int64, actions_shape, name="ac") + if isinstance(action_space, gym.spaces.Discrete): + ac_size = action_space.n + actions = tf.placeholder(tf.int64, [None], name="ac") + else: + raise UnsupportedSpaceException( + "Action space {} is not supported for IMPALA.".format( + action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( - tf.float32, [None, sum(output_hidden_shape)], - name="behaviour_logits") + tf.float32, [None, ac_size], name="behaviour_logits") observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None - # Unpack behaviour logits - unpacked_behaviour_logits = tf.split( - behaviour_logits, output_hidden_shape, axis=1) - # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) @@ -159,34 +138,17 @@ def __init__(self, "is_training": self._get_is_training_placeholder(), }, observation_space, + action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) - unpacked_outputs = tf.split( - self.model.outputs, output_hidden_shape, axis=1) - - 
dist_inputs = unpacked_outputs if is_multidiscrete else \ - self.model.outputs - action_dist = dist_class(dist_inputs) - + action_dist = dist_class(self.model.outputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) - def make_time_major(tensor, drop_last=False): - """Swaps batch and trajectory axis. - Args: - tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. - Returns: - res: A tensor with swapped axes or a list of tensors with - swapped axes. - """ - if isinstance(tensor, list): - return [make_time_major(t, drop_last) for t in tensor] - + def to_batches(tensor): if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B @@ -197,16 +159,11 @@ def make_time_major(tensor, drop_last=False): B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) - # swap B and T axes - res = tf.transpose( + return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] - return res - if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) @@ -214,52 +171,31 @@ def make_time_major(tensor, drop_last=False): else: mask = tf.ones_like(rewards, dtype=tf.bool) - # Prepare actions for loss - loss_actions = actions if is_multidiscrete else tf.expand_dims( - actions, axis=1) - # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
self.loss = VTraceLoss( - actions=make_time_major(loss_actions, drop_last=True), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=True), - actions_entropy=make_time_major( - action_dist.entropy(), drop_last=True), - dones=make_time_major(dones, drop_last=True), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=True), - target_logits=make_time_major(unpacked_outputs, drop_last=True), + actions=to_batches(actions)[:-1], + actions_logp=to_batches(action_dist.logp(actions))[:-1], + actions_entropy=to_batches(action_dist.entropy())[:-1], + dones=to_batches(dones)[:-1], + behaviour_logits=to_batches(behaviour_logits)[:-1], + target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], - rewards=make_time_major(rewards, drop_last=True), - values=make_time_major(values, drop_last=True), - bootstrap_value=make_time_major(values)[-1], - valid_mask=make_time_major(mask, drop_last=True), + rewards=to_batches(rewards)[:-1], + values=to_batches(values)[:-1], + bootstrap_value=to_batches(values)[-1], + valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging - model_dist = MultiCategorical(unpacked_outputs) - behaviour_dist = MultiCategorical(unpacked_behaviour_logits) - - kls = model_dist.kl(behaviour_dist) - if len(kls) > 1: - self.KL_stats = {} - - for i, kl in enumerate(kls): - self.KL_stats.update({ - "mean_KL_{}".format(i): tf.reduce_mean(kl), - "max_KL_{}".format(i): tf.reduce_max(kl), - "median_KL_{}".format(i): tf.contrib.distributions. 
- percentile(kl, 50.0), - }) - else: - self.KL_stats = { - "mean_KL": tf.reduce_mean(kls[0]), - "max_KL": tf.reduce_max(kls[0]), - "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), - } + model_dist = Categorical(self.model.outputs) + behaviour_dist = Categorical(behaviour_logits) + self.KLs = model_dist.kl(behaviour_dist) + self.mean_KL = tf.reduce_mean(self.KLs) + self.max_KL = tf.reduce_max(self.KLs) + self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ @@ -295,7 +231,7 @@ def make_time_major(tensor, drop_last=False): self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { - "stats": dict({ + "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, @@ -304,8 +240,11 @@ def make_time_major(tensor, drop_last=False): "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), - tf.reshape(make_time_major(values, drop_last=True), [-1])), - }, **self.KL_stats), + tf.reshape(to_batches(values)[:-1], [-1])), + "mean_KL": self.mean_KL, + "max_KL": self.max_KL, + "median_KL": self.median_KL, + }, } @override(TFPolicyGraph) diff --git a/python/ray/rllib/agents/impala/vtrace_test.py b/python/ray/rllib/agents/impala/vtrace_test.py deleted file mode 100644 index f74798fffdbb..000000000000 --- a/python/ray/rllib/agents/impala/vtrace_test.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for V-trace. - -For details and theory see: - -"IMPALA: Scalable Distributed Deep-RL with -Importance Weighted Actor-Learner Architectures" -by Espeholt, Soyer, Munos et al. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf -import vtrace - - -def _shaped_arange(*shape): - """Runs np.arange, converts to float and reshapes.""" - return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape) - - -def _softmax(logits): - """Applies softmax non-linearity on inputs.""" - return np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True) - - -def _ground_truth_calculation(discounts, log_rhos, rewards, values, - bootstrap_value, clip_rho_threshold, - clip_pg_rho_threshold): - """Calculates the ground truth for V-trace in Python/Numpy.""" - vs = [] - seq_len = len(discounts) - rhos = np.exp(log_rhos) - cs = np.minimum(rhos, 1.0) - clipped_rhos = rhos - if clip_rho_threshold: - clipped_rhos = np.minimum(rhos, clip_rho_threshold) - clipped_pg_rhos = rhos - if clip_pg_rho_threshold: - clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold) - - # This is a very inefficient way to calculate the V-trace ground truth. - # We calculate it this way because it is close to the mathematical notation - # of - # V-trace. - # v_s = V(x_s) - # + \sum^{T-1}_{t=s} \gamma^{t-s} - # * \prod_{i=s}^{t-1} c_i - # * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t)) - # Note that when we take the product over c_i, we write `s:t` as the - # notation - # of the paper is inclusive of the `t-1`, but Python is exclusive. - # Also note that np.prod([]) == 1. - values_t_plus_1 = np.concatenate( - [values, bootstrap_value[None, :]], axis=0) - for s in range(seq_len): - v_s = np.copy(values[s]) # Very important copy. 
- for t in range(s, seq_len): - v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0) - * clipped_rhos[t] * (rewards[t] + discounts[t] * - values_t_plus_1[t + 1] - values[t])) - vs.append(v_s) - vs = np.stack(vs, axis=0) - pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate( - [vs[1:], bootstrap_value[None, :]], axis=0) - values)) - - return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages) - - -class LogProbsFromLogitsAndActionsTest(tf.test.TestCase, - parameterized.TestCase): - @parameterized.named_parameters(('Batch1', 1), ('Batch2', 2)) - def test_log_probs_from_logits_and_actions(self, batch_size): - """Tests log_probs_from_logits_and_actions.""" - seq_len = 7 - num_actions = 3 - - policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10 - actions = np.random.randint( - 0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32) - - action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions( - policy_logits, actions) - - # Ground Truth - # Using broadcasting to create a mask that indexes action logits - action_index_mask = actions[..., None] == np.arange(num_actions) - - def index_with_mask(array, mask): - return array[mask].reshape(*array.shape[:-1]) - - # Note: Normally log(softmax) is not a good idea because it's not - # numerically stable. However, in this test we have well-behaved - # values. - ground_truth_v = index_with_mask( - np.log(_softmax(policy_logits)), action_index_mask) - - with self.test_session() as session: - self.assertAllClose(ground_truth_v, - session.run(action_log_probs_tensor)) - - -class VtraceTest(tf.test.TestCase, parameterized.TestCase): - @parameterized.named_parameters(('Batch1', 1), ('Batch5', 5)) - def test_vtrace(self, batch_size): - """Tests V-trace against ground truth data calculated in python.""" - seq_len = 5 - - # Create log_rhos such that rho will span from near-zero to above the - # clipping thresholds. 
In particular, calculate log_rhos in - # [-2.5, 2.5), - # so that rho is in approx [0.08, 12.2). - log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len) - log_rhos = 5 * (log_rhos - 0.5) # [0.0, 1.0) -> [-2.5, 2.5). - values = { - 'log_rhos': log_rhos, - # T, B where B_i: [0.9 / (i+1)] * T - 'discounts': np.array([[0.9 / (b + 1) for b in range(batch_size)] - for _ in range(seq_len)]), - 'rewards': _shaped_arange(seq_len, batch_size), - 'values': _shaped_arange(seq_len, batch_size) / batch_size, - 'bootstrap_value': _shaped_arange(batch_size) + 1.0, - 'clip_rho_threshold': 3.7, - 'clip_pg_rho_threshold': 2.2, - } - - output = vtrace.from_importance_weights(**values) - - with self.test_session() as session: - output_v = session.run(output) - - ground_truth_v = _ground_truth_calculation(**values) - for a, b in zip(ground_truth_v, output_v): - self.assertAllClose(a, b) - - @parameterized.named_parameters(('Batch1', 1), ('Batch2', 2)) - def test_vtrace_from_logits(self, batch_size): - """Tests V-trace calculated from logits.""" - seq_len = 5 - num_actions = 3 - clip_rho_threshold = None # No clipping. - clip_pg_rho_threshold = None # No clipping. - - # Intentionally leaving shapes unspecified to test if V-trace can - # deal with that. 
- placeholders = { - # T, B, NUM_ACTIONS - 'behaviour_policy_logits': tf.placeholder( - dtype=tf.float32, shape=[None, None, None]), - # T, B, NUM_ACTIONS - 'target_policy_logits': tf.placeholder( - dtype=tf.float32, shape=[None, None, None]), - 'actions': tf.placeholder(dtype=tf.int32, shape=[None, None]), - 'discounts': tf.placeholder(dtype=tf.float32, shape=[None, None]), - 'rewards': tf.placeholder(dtype=tf.float32, shape=[None, None]), - 'values': tf.placeholder(dtype=tf.float32, shape=[None, None]), - 'bootstrap_value': tf.placeholder(dtype=tf.float32, shape=[None]), - } - - from_logits_output = vtrace.from_logits( - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold, - **placeholders) - - target_log_probs = vtrace.log_probs_from_logits_and_actions( - placeholders['target_policy_logits'], placeholders['actions']) - behaviour_log_probs = vtrace.log_probs_from_logits_and_actions( - placeholders['behaviour_policy_logits'], placeholders['actions']) - log_rhos = target_log_probs - behaviour_log_probs - ground_truth = (log_rhos, behaviour_log_probs, target_log_probs) - - values = { - 'behaviour_policy_logits': _shaped_arange(seq_len, batch_size, - num_actions), - 'target_policy_logits': _shaped_arange(seq_len, batch_size, - num_actions), - 'actions': np.random.randint( - 0, num_actions - 1, size=(seq_len, batch_size)), - 'discounts': np.array( # T, B where B_i: [0.9 / (i+1)] * T - [[0.9 / (b + 1) for b in range(batch_size)] - for _ in range(seq_len)]), - 'rewards': _shaped_arange(seq_len, batch_size), - 'values': _shaped_arange(seq_len, batch_size) / batch_size, - 'bootstrap_value': _shaped_arange(batch_size) + 1.0, # B - } - - feed_dict = {placeholders[k]: v for k, v in values.items()} - with self.test_session() as session: - from_logits_output_v = session.run( - from_logits_output, feed_dict=feed_dict) - (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs, - ground_truth_target_action_log_probs) = session.run( - 
ground_truth, feed_dict=feed_dict) - - # Calculate V-trace using the ground truth logits. - from_iw = vtrace.from_importance_weights( - log_rhos=ground_truth_log_rhos, - discounts=values['discounts'], - rewards=values['rewards'], - values=values['values'], - bootstrap_value=values['bootstrap_value'], - clip_rho_threshold=clip_rho_threshold, - clip_pg_rho_threshold=clip_pg_rho_threshold) - - with self.test_session() as session: - from_iw_v = session.run(from_iw) - - self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs) - self.assertAllClose(from_iw_v.pg_advantages, - from_logits_output_v.pg_advantages) - self.assertAllClose(ground_truth_behaviour_action_log_probs, - from_logits_output_v.behaviour_action_log_probs) - self.assertAllClose(ground_truth_target_action_log_probs, - from_logits_output_v.target_action_log_probs) - self.assertAllClose(ground_truth_log_rhos, - from_logits_output_v.log_rhos) - - def test_higher_rank_inputs_for_importance_weights(self): - """Checks support for additional dimensions in inputs.""" - placeholders = { - 'log_rhos': tf.placeholder( - dtype=tf.float32, shape=[None, None, 1]), - 'discounts': tf.placeholder( - dtype=tf.float32, shape=[None, None, 1]), - 'rewards': tf.placeholder( - dtype=tf.float32, shape=[None, None, 42]), - 'values': tf.placeholder(dtype=tf.float32, shape=[None, None, 42]), - 'bootstrap_value': tf.placeholder( - dtype=tf.float32, shape=[None, 42]) - } - output = vtrace.from_importance_weights(**placeholders) - self.assertEqual(output.vs.shape.as_list()[-1], 42) - - def test_inconsistent_rank_inputs_for_importance_weights(self): - """Test one of many possible errors in shape of inputs.""" - placeholders = { - 'log_rhos': tf.placeholder( - dtype=tf.float32, shape=[None, None, 1]), - 'discounts': tf.placeholder( - dtype=tf.float32, shape=[None, None, 1]), - 'rewards': tf.placeholder( - dtype=tf.float32, shape=[None, None, 42]), - 'values': tf.placeholder(dtype=tf.float32, shape=[None, None, 42]), - # Should be 
[None, 42]. - 'bootstrap_value': tf.placeholder(dtype=tf.float32, shape=[None]) - } - with self.assertRaisesRegexp(ValueError, 'must have rank 2'): - vtrace.from_importance_weights(**placeholders) - - -if __name__ == '__main__': - tf.test.main() diff --git a/python/ray/rllib/agents/marwil/marwil_policy_graph.py b/python/ray/rllib/agents/marwil/marwil_policy_graph.py index 8d52807efdcc..7b66350d8d52 100644 --- a/python/ray/rllib/agents/marwil/marwil_policy_graph.py +++ b/python/ray/rllib/agents/marwil/marwil_policy_graph.py @@ -73,7 +73,8 @@ def __init__(self, observation_space, action_space, config): "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), - }, observation_space, logit_dim, self.config["model"]) + }, observation_space, action_space, logit_dim, + self.config["model"]) logits = self.model.outputs self.p_func_vars = _scope_vars(scope.name) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 4907e00ff201..8928bb108346 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -39,7 +39,7 @@ def __init__(self, obs_space, action_space, config): "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), - }, obs_space, self.logit_dim, self.config["model"]) + }, obs_space, action_space, self.logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) # logit for each action # Setup policy loss diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py index 362f93b0721a..4e4b2480a7ef 100644 --- a/python/ray/rllib/agents/ppo/appo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -6,7 +6,6 @@ from __future__ import division from __future__ import print_function -import numpy as np import tensorflow as tf import logging import gym @@ -18,7 
+17,7 @@ from ray.rllib.models.catalog import ModelCatalog from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.explained_variance import explained_variance -from ray.rllib.models.action_dist import MultiCategorical +from ray.rllib.models.action_dist import Categorical from ray.rllib.evaluation.postprocessing import compute_advantages logger = logging.getLogger(__name__) @@ -30,7 +29,7 @@ class PPOSurrogateLoss(object): Arguments: prev_actions_logp: A float32 tensor of shape [T, B]. actions_logp: A float32 tensor of shape [T, B]. - action_kl: A float32 tensor of shape [T, B]. + actions_kl: A float32 tensor of shape [T, B]. actions_entropy: A float32 tensor of shape [T, B]. values: A float32 tensor of shape [T, B]. valid_mask: A bool tensor of valid RNN input elements (#2992). @@ -105,7 +104,7 @@ def __init__(self, actions: An int32 tensor of shape [T, B, NUM_ACTIONS]. prev_actions_logp: A float32 tensor of shape [T, B]. actions_logp: A float32 tensor of shape [T, B]. - action_kl: A float32 tensor of shape [T, B]. + actions_kl: A float32 tensor of shape [T, B]. actions_entropy: A float32 tensor of shape [T, B]. dones: A bool tensor of shape [T, B]. behaviour_logits: A float32 tensor of shape [T, B, NUM_ACTIONS]. @@ -119,10 +118,10 @@ def __init__(self, # Compute vtrace on the CPU for better perf. with tf.device("/cpu:0"): - self.vtrace_returns = vtrace.multi_from_logits( + self.vtrace_returns = vtrace.from_logits( behaviour_policy_logits=behaviour_logits, target_policy_logits=target_logits, - actions=tf.unstack(tf.cast(actions, tf.int32), axis=2), + actions=tf.cast(actions, tf.int32), discounts=tf.to_float(~dones) * discount, rewards=rewards, values=values, @@ -167,20 +166,6 @@ def __init__(self, "Must use `truncate_episodes` batch mode with V-trace." 
self.config = config self.sess = tf.get_default_session() - self.grads = None - - if isinstance(action_space, gym.spaces.Discrete): - is_multidiscrete = False - actions_shape = [None] - output_hidden_shape = [action_space.n] - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - is_multidiscrete = True - actions_shape = [None, len(action_space.nvec)] - output_hidden_shape = action_space.nvec.astype(np.int32) - else: - raise UnsupportedSpaceException( - "Action space {} is not supported for APPO.", - format(action_space)) # Policy network model dist_class, logit_dim = ModelCatalog.get_action_dist( @@ -200,7 +185,12 @@ def __init__(self, existing_state_in = existing_inputs[9:-1] existing_seq_lens = existing_inputs[-1] else: - actions = tf.placeholder(tf.int64, actions_shape, name="ac") + actions = ModelCatalog.get_action_placeholder(action_space) + if (not isinstance(action_space, gym.spaces.Discrete) + and self.config["vtrace"]): + raise UnsupportedSpaceException( + "Action space {} is not supported with vtrace.".format( + action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( @@ -209,7 +199,6 @@ def __init__(self, tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None - if not self.config["vtrace"]: adv_ph = tf.placeholder( tf.float32, name="advantages", shape=(None, )) @@ -217,13 +206,7 @@ def __init__(self, tf.float32, name="value_targets", shape=(None, )) self.observations = observations - # Unpack behaviour logits - unpacked_behaviour_logits = tf.split( - behaviour_logits, output_hidden_shape, axis=1) - # Setup the policy - dist_class, logit_dim = ModelCatalog.get_action_dist( - action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( @@ 
-231,42 +214,23 @@ def __init__(self, "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, - "is_training": self._get_is_training_placeholder(), }, observation_space, + action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) - unpacked_outputs = tf.split( - self.model.outputs, output_hidden_shape, axis=1) - - dist_inputs = unpacked_outputs if is_multidiscrete else \ - self.model.outputs - prev_dist_inputs = unpacked_behaviour_logits if is_multidiscrete else \ - behaviour_logits - action_dist = dist_class(dist_inputs) - prev_action_dist = dist_class(prev_dist_inputs) + action_dist = dist_class(self.model.outputs) + prev_action_dist = dist_class(behaviour_logits) values = self.model.value_function() self.value_function = values self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) - def make_time_major(tensor, drop_last=False): - """Swaps batch and trajectory axis. - Args: - tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. - Returns: - res: A tensor with swapped axes or a list of tensors with - swapped axes. 
- """ - if isinstance(tensor, list): - return [make_time_major(t, drop_last) for t in tensor] - + def to_batches(tensor): if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B @@ -277,16 +241,11 @@ def make_time_major(tensor, drop_last=False): B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) - # swap B and T axes - res = tf.transpose( + return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] - return res - if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) @@ -297,30 +256,21 @@ def make_time_major(tensor, drop_last=False): # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. if self.config["vtrace"]: logger.info("Using V-Trace surrogate loss (vtrace=True)") - - # Prepare actions for loss - loss_actions = actions if is_multidiscrete else tf.expand_dims( - actions, axis=1) - self.loss = VTraceSurrogateLoss( - actions=make_time_major(loss_actions, drop_last=True), - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions), drop_last=True), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=True), + actions=to_batches(actions)[:-1], + prev_actions_logp=to_batches( + prev_action_dist.logp(actions))[:-1], + actions_logp=to_batches(action_dist.logp(actions))[:-1], action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major( - action_dist.entropy(), drop_last=True), - dones=make_time_major(dones, drop_last=True), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=True), - target_logits=make_time_major( - unpacked_outputs, drop_last=True), + actions_entropy=to_batches(action_dist.entropy())[:-1], + dones=to_batches(dones)[:-1], + behaviour_logits=to_batches(behaviour_logits)[:-1], + target_logits=to_batches(self.model.outputs)[:-1], 
discount=config["gamma"], - rewards=make_time_major(rewards, drop_last=True), - values=make_time_major(values, drop_last=True), - bootstrap_value=make_time_major(values)[-1], - valid_mask=make_time_major(mask, drop_last=True), + rewards=to_batches(rewards)[:-1], + values=to_batches(values)[:-1], + bootstrap_value=to_batches(values)[-1], + valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -330,41 +280,25 @@ def make_time_major(tensor, drop_last=False): else: logger.info("Using PPO surrogate loss (vtrace=False)") self.loss = PPOSurrogateLoss( - prev_actions_logp=make_time_major( - prev_action_dist.logp(actions)), - actions_logp=make_time_major(action_dist.logp(actions)), + prev_actions_logp=to_batches(prev_action_dist.logp(actions)), + actions_logp=to_batches(action_dist.logp(actions)), action_kl=prev_action_dist.kl(action_dist), - actions_entropy=make_time_major(action_dist.entropy()), - values=make_time_major(values), - valid_mask=make_time_major(mask), - advantages=make_time_major(adv_ph), - value_targets=make_time_major(value_targets), + actions_entropy=to_batches(action_dist.entropy()), + values=to_batches(values), + valid_mask=to_batches(mask), + advantages=to_batches(adv_ph), + value_targets=to_batches(value_targets), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_param=self.config["clip_param"]) # KL divergence between worker and learner logits for debugging - model_dist = MultiCategorical(unpacked_outputs) - behaviour_dist = MultiCategorical(unpacked_behaviour_logits) - - kls = model_dist.kl(behaviour_dist) - if len(kls) > 1: - self.KL_stats = {} - - for i, kl in enumerate(kls): - self.KL_stats.update({ - "mean_KL_{}".format(i): tf.reduce_mean(kl), - "max_KL_{}".format(i): tf.reduce_max(kl), - "median_KL_{}".format(i): tf.contrib.distributions. 
- percentile(kl, 50.0), - }) - else: - self.KL_stats = { - "mean_KL": tf.reduce_mean(kls[0]), - "max_KL": tf.reduce_max(kls[0]), - "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), - } - + model_dist = Categorical(self.model.outputs) + behaviour_dist = Categorical(behaviour_logits) + self.KLs = model_dist.kl(behaviour_dist) + self.mean_KL = tf.reduce_mean(self.KLs) + self.max_KL = tf.reduce_max(self.KLs) + self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), @@ -401,10 +335,12 @@ def make_time_major(tensor, drop_last=False): self.sess.run(tf.global_variables_initializer()) - values_batched = make_time_major( - values, drop_last=self.config["vtrace"]) + if self.config["vtrace"]: + values_batched = to_batches(values)[:-1] + else: + values_batched = to_batches(values) self.stats_fetches = { - "stats": dict({ + "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, @@ -414,8 +350,12 @@ def make_time_major(tensor, drop_last=False): "vf_explained_var": explained_variance( tf.reshape(self.loss.value_targets, [-1]), tf.reshape(values_batched, [-1])), - }, **self.KL_stats), + "mean_KL": self.mean_KL, + "max_KL": self.max_KL, + "median_KL": self.median_KL, + }, } + self.stats_fetches["kl"] = self.loss.mean_kl def optimizer(self): if self.config["opt_type"] == "adam": diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index bd098e6977e5..cd0e68ab7563 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -169,6 +169,7 @@ def __init__(self, "is_training": self._get_is_training_placeholder(), }, observation_space, + action_space, logit_dim, self.config["model"], state_in=existing_state_in, @@ -208,7 +209,7 @@ def __init__(self, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, 
"is_training": self._get_is_training_placeholder(), - }, observation_space, 1, vf_config).outputs + }, observation_space, action_space, 1, vf_config).outputs self.value_function = tf.reshape(self.value_function, [-1]) else: self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1]) diff --git a/python/ray/rllib/env/base_env.py b/python/ray/rllib/env/base_env.py index 85993f05c862..7dd1921f131d 100644 --- a/python/ray/rllib/env/base_env.py +++ b/python/ray/rllib/env/base_env.py @@ -38,14 +38,22 @@ class BaseEnv(object): "env_0": { "car_0": [2.4, 1.6], "car_1": [3.4, -3.2], - } + }, + "env_1": { + "car_0": [8.0, 4.1], + }, + "env_2": { + "car_0": [2.3, 3.3], + "car_1": [1.4, -0.2], + "car_3": [1.2, 0.1], + }, } >>> env.send_actions( actions={ "env_0": { "car_0": 0, "car_1": 1, - } + }, ... }) >>> obs, rewards, dones, infos, off_policy_actions = env.poll() >>> print(obs) @@ -53,7 +61,7 @@ class BaseEnv(object): "env_0": { "car_0": [4.1, 1.7], "car_1": [3.2, -4.2], - } + }, ... } >>> print(dones) { @@ -61,25 +69,40 @@ class BaseEnv(object): "__all__": False, "car_0": False, "car_1": True, - } + }, ... } """ @staticmethod - def to_base_env(env, make_env=None, num_envs=1, remote_envs=False): + def to_base_env(env, + make_env=None, + num_envs=1, + remote_envs=False, + async_remote_envs=False): """Wraps any env type as needed to expose the async interface.""" - if remote_envs and num_envs == 1: + + from ray.rllib.env.remote_vector_env import RemoteVectorEnv + if (remote_envs or async_remote_envs) and num_envs == 1: raise ValueError( "Remote envs only make sense to use if num_envs > 1 " "(i.e. 
vectorization is enabled).") + if remote_envs and async_remote_envs: + raise ValueError("You can only specify one of remote_envs or " + "async_remote_envs.") + if not isinstance(env, BaseEnv): if isinstance(env, MultiAgentEnv): if remote_envs: - raise NotImplementedError( - "Remote multiagent environments are not implemented") - - env = _MultiAgentEnvToBaseEnv( - make_env=make_env, existing_envs=[env], num_envs=num_envs) + env = RemoteVectorEnv( + make_env, num_envs, multiagent=True, sync=True) + elif async_remote_envs: + env = RemoteVectorEnv( + make_env, num_envs, multiagent=True, sync=False) + else: + env = _MultiAgentEnvToBaseEnv( + make_env=make_env, + existing_envs=[env], + num_envs=num_envs) elif isinstance(env, ExternalEnv): if num_envs != 1: raise ValueError( @@ -88,15 +111,21 @@ def to_base_env(env, make_env=None, num_envs=1, remote_envs=False): elif isinstance(env, VectorEnv): env = _VectorEnvToBaseEnv(env) else: - env = VectorEnv.wrap( - make_env=make_env, - existing_envs=[env], - num_envs=num_envs, - remote_envs=remote_envs, - action_space=env.action_space, - observation_space=env.observation_space) - env = _VectorEnvToBaseEnv(env) - assert isinstance(env, BaseEnv) + if remote_envs: + env = RemoteVectorEnv( + make_env, num_envs, multiagent=False, sync=True) + elif async_remote_envs: + env = RemoteVectorEnv( + make_env, num_envs, multiagent=False, sync=False) + else: + env = VectorEnv.wrap( + make_env=make_env, + existing_envs=[env], + num_envs=num_envs, + action_space=env.action_space, + observation_space=env.observation_space) + env = _VectorEnvToBaseEnv(env) + assert isinstance(env, BaseEnv), env return env @PublicAPI diff --git a/python/ray/rllib/env/remote_vector_env.py b/python/ray/rllib/env/remote_vector_env.py new file mode 100644 index 000000000000..1f33a739b11e --- /dev/null +++ b/python/ray/rllib/env/remote_vector_env.py @@ -0,0 +1,118 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import 
print_function + +import logging + +import ray +from ray.rllib.env.base_env import BaseEnv, _DUMMY_AGENT_ID + +logger = logging.getLogger(__name__) + + +class RemoteVectorEnv(BaseEnv): + """Vector env that executes envs in remote workers. + + This provides dynamic batching of inference as observations are returned + from the remote simulator actors. Both single and multi-agent child envs + are supported, and envs can be stepped synchronously or async. + """ + + def __init__(self, make_env, num_envs, multiagent, sync): + self.make_local_env = make_env + if sync: + self.timeout = 9999999.0 # wait for all envs + else: + self.timeout = 0.0 # wait for only ready envs + + def make_remote_env(i): + logger.info("Launching env {} in remote actor".format(i)) + if multiagent: + return _RemoteMultiAgentEnv.remote(self.make_local_env, i) + else: + return _RemoteSingleAgentEnv.remote(self.make_local_env, i) + + self.actors = [make_remote_env(i) for i in range(num_envs)] + self.pending = None # lazy init + + def poll(self): + if self.pending is None: + self.pending = {a.reset.remote(): a for a in self.actors} + + # each keyed by env_id in [0, num_remote_envs) + obs, rewards, dones, infos = {}, {}, {}, {} + ready = [] + + # Wait for at least 1 env to be ready here + while not ready: + ready, _ = ray.wait( + list(self.pending), + num_returns=len(self.pending), + timeout=self.timeout) + + # Get and return observations for each of the ready envs + env_ids = set() + for obj_id in ready: + actor = self.pending.pop(obj_id) + env_id = self.actors.index(actor) + env_ids.add(env_id) + ob, rew, done, info = ray.get(obj_id) + obs[env_id] = ob + rewards[env_id] = rew + dones[env_id] = done + infos[env_id] = info + + logger.debug("Got obs batch for actors {}".format(env_ids)) + return obs, rewards, dones, infos, {} + + def send_actions(self, action_dict): + for env_id, actions in action_dict.items(): + actor = self.actors[env_id] + obj_id = actor.step.remote(actions) + self.pending[obj_id] = 
actor + + def try_reset(self, env_id): + obs, _, _, _ = ray.get(self.actors[env_id].reset.remote()) + return obs + + +@ray.remote(num_cpus=0) +class _RemoteMultiAgentEnv(object): + """Wrapper class for making a multi-agent env a remote actor.""" + + def __init__(self, make_env, i): + self.env = make_env(i) + + def reset(self): + obs = self.env.reset() + # each keyed by agent_id in the env + rew = {agent_id: 0 for agent_id in obs.keys()} + info = {agent_id: {} for agent_id in obs.keys()} + done = {"__all__": False} + return obs, rew, done, info + + def step(self, action_dict): + return self.env.step(action_dict) + + +@ray.remote(num_cpus=0) +class _RemoteSingleAgentEnv(object): + """Wrapper class for making a gym env a remote actor.""" + + def __init__(self, make_env, i): + self.env = make_env(i) + + def reset(self): + obs = {_DUMMY_AGENT_ID: self.env.reset()} + rew = {agent_id: 0 for agent_id in obs.keys()} + info = {agent_id: {} for agent_id in obs.keys()} + done = {"__all__": False} + return obs, rew, done, info + + def step(self, action): + obs, rew, done, info = self.env.step(action[_DUMMY_AGENT_ID]) + obs, rew, done, info = [{ + _DUMMY_AGENT_ID: x + } for x in [obs, rew, done, info]] + done["__all__"] = done[_DUMMY_AGENT_ID] + return obs, rew, done, info diff --git a/python/ray/rllib/env/vector_env.py b/python/ray/rllib/env/vector_env.py index e1de12375e5b..d0df24177f38 100644 --- a/python/ray/rllib/env/vector_env.py +++ b/python/ray/rllib/env/vector_env.py @@ -5,7 +5,6 @@ import logging import numpy as np -import ray from ray.rllib.utils.annotations import override, PublicAPI logger = logging.getLogger(__name__) @@ -27,12 +26,8 @@ class VectorEnv(object): def wrap(make_env=None, existing_envs=None, num_envs=1, - remote_envs=False, action_space=None, observation_space=None): - if remote_envs: - return _RemoteVectorizedGymEnv(make_env, num_envs, action_space, - observation_space) return _VectorizedGymEnv(make_env, existing_envs or [], num_envs, action_space, 
observation_space) @@ -129,71 +124,3 @@ def vector_step(self, actions): @override(VectorEnv) def get_unwrapped(self): return self.envs - - -@ray.remote(num_cpus=0) -class _RemoteEnv(object): - """Wrapper class for making a gym env a remote actor.""" - - def __init__(self, make_env, i): - self.env = make_env(i) - - def reset(self): - return self.env.reset() - - def step(self, action): - return self.env.step(action) - - -class _RemoteVectorizedGymEnv(_VectorizedGymEnv): - """Internal wrapper for gym envs to implement VectorEnv as remote workers. - """ - - def __init__(self, - make_env, - num_envs, - action_space=None, - observation_space=None): - self.make_local_env = make_env - self.num_envs = num_envs - self.initialized = False - self.action_space = action_space - self.observation_space = observation_space - - def _initialize_if_needed(self): - if self.initialized: - return - - self.initialized = True - - def make_remote_env(i): - logger.info("Launching env {} in remote actor".format(i)) - return _RemoteEnv.remote(self.make_local_env, i) - - _VectorizedGymEnv.__init__(self, make_remote_env, [], self.num_envs, - self.action_space, self.observation_space) - - for env in self.envs: - assert isinstance(env, ray.actor.ActorHandle), env - - @override(_VectorizedGymEnv) - def vector_reset(self): - self._initialize_if_needed() - return ray.get([env.reset.remote() for env in self.envs]) - - @override(_VectorizedGymEnv) - def reset_at(self, index): - return ray.get(self.envs[index].reset.remote()) - - @override(_VectorizedGymEnv) - def vector_step(self, actions): - step_outs = ray.get( - [env.step.remote(act) for env, act in zip(self.envs, actions)]) - - obs_batch, rew_batch, done_batch, info_batch = [], [], [], [] - for obs, rew, done, info in step_outs: - obs_batch.append(obs) - rew_batch.append(rew) - done_batch.append(done) - info_batch.append(info) - return obs_batch, rew_batch, done_batch, info_batch diff --git a/python/ray/rllib/evaluation/episode.py 
b/python/ray/rllib/evaluation/episode.py index 3075689150e9..acf7d85cb9cc 100644 --- a/python/ray/rllib/evaluation/episode.py +++ b/python/ray/rllib/evaluation/episode.py @@ -58,6 +58,7 @@ def __init__(self, policies, policy_mapping_fn, batch_builder_factory, self._agent_to_policy = {} self._agent_to_rnn_state = {} self._agent_to_last_obs = {} + self._agent_to_last_raw_obs = {} self._agent_to_last_info = {} self._agent_to_last_action = {} self._agent_to_last_pi_info = {} @@ -82,6 +83,12 @@ def last_observation_for(self, agent_id=_DUMMY_AGENT_ID): return self._agent_to_last_obs.get(agent_id) + @DeveloperAPI + def last_raw_obs_for(self, agent_id=_DUMMY_AGENT_ID): + """Returns the last un-preprocessed obs for the specified agent.""" + + return self._agent_to_last_raw_obs.get(agent_id) + @DeveloperAPI def last_info_for(self, agent_id=_DUMMY_AGENT_ID): """Returns the last info for the specified agent.""" @@ -149,10 +156,16 @@ def _set_rnn_state(self, agent_id, rnn_state): def _set_last_observation(self, agent_id, obs): self._agent_to_last_obs[agent_id] = obs + def _set_last_raw_obs(self, agent_id, obs): + self._agent_to_last_raw_obs[agent_id] = obs + def _set_last_info(self, agent_id, info): self._agent_to_last_info[agent_id] = info def _set_last_action(self, agent_id, action): + if agent_id in self._agent_to_last_action: + self._agent_to_prev_action[agent_id] = \ + self._agent_to_last_action[agent_id] self._agent_to_last_action[agent_id] = action def _set_last_pi_info(self, agent_id, pi_info): diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 71e2009a1882..9bf2cca532c2 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -122,7 +122,8 @@ def __init__(self, input_creator=lambda ioctx: ioctx.default_sampler_input(), input_evaluation=frozenset([]), output_creator=lambda ioctx: NoopOutput(), - remote_worker_envs=False): + 
remote_worker_envs=False, + async_remote_worker_envs=False): """Initialize a policy evaluator. Arguments: @@ -201,6 +202,8 @@ def __init__(self, those new envs in remote processes instead of in the current process. This adds overheads, but can make sense if your envs are very CPU intensive (e.g., for StarCraft). + async_remote_worker_envs (bool): Similar to remote_worker_envs, + but runs the envs asynchronously in the background. """ if log_level: @@ -307,7 +310,8 @@ def make_env(vector_index): self.env, make_env=make_env, num_envs=num_envs, - remote_envs=remote_worker_envs) + remote_envs=remote_worker_envs, + async_remote_envs=async_remote_worker_envs) self.num_envs = num_envs if self.batch_mode == "truncate_episodes": @@ -658,7 +662,7 @@ def _build_policy_map(self, policy_dict, policy_config): return policy_map, preprocessors def __del__(self): - if hasattr(self, "sampler") and isinstance(self.sampler, AsyncSampler): + if isinstance(self.sampler, AsyncSampler): self.sampler.shutdown = True diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index 76a73274d082..3d91cc44fe01 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -372,6 +372,7 @@ def _process_observations(base_env, policies, batch_builder_pool, last_observation = episode.last_observation_for(agent_id) episode._set_last_observation(agent_id, filtered_obs) + episode._set_last_raw_obs(agent_id, raw_obs) episode._set_last_info(agent_id, infos[env_id].get(agent_id, {})) # Record transition info if applicable diff --git a/python/ray/rllib/examples/custom_loss.py b/python/ray/rllib/examples/custom_loss.py index 85855992c6c0..005428b00a66 100644 --- a/python/ray/rllib/examples/custom_loss.py +++ b/python/ray/rllib/examples/custom_loss.py @@ -41,7 +41,8 @@ def _build_layers_v2(self, input_dict, num_outputs, options): self.obs_in = input_dict["obs"] with tf.variable_scope("shared", reuse=tf.AUTO_REUSE): self.fcnet = 
FullyConnectedNetwork(input_dict, self.obs_space, - num_outputs, options) + self.action_space, num_outputs, + options) return self.fcnet.outputs, self.fcnet.last_layer def custom_loss(self, policy_loss, loss_inputs): diff --git a/python/ray/rllib/examples/custom_metrics_and_callbacks.py b/python/ray/rllib/examples/custom_metrics_and_callbacks.py index af1d25f16cad..0f0dcb040b4e 100644 --- a/python/ray/rllib/examples/custom_metrics_and_callbacks.py +++ b/python/ray/rllib/examples/custom_metrics_and_callbacks.py @@ -20,6 +20,8 @@ def on_episode_start(info): def on_episode_step(info): episode = info["episode"] pole_angle = abs(episode.last_observation_for()[2]) + raw_angle = abs(episode.last_raw_obs_for()[2]) + assert pole_angle == raw_angle episode.user_data["pole_angles"].append(pole_angle) diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 138fd9f8a6a8..724e54fd1fac 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -114,31 +114,6 @@ def _build_sample_op(self): return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1) -class MultiCategorical(ActionDistribution): - """Categorical distribution for discrete action spaces.""" - - def __init__(self, inputs): - self.cats = [Categorical(input_) for input_ in inputs] - self.sample_op = self._build_sample_op() - - def logp(self, actions): - # If tensor is provided, unstack it into list - if isinstance(actions, tf.Tensor): - actions = tf.unstack(actions, axis=1) - logps = tf.stack( - [cat.logp(act) for cat, act in zip(self.cats, actions)]) - return tf.reduce_sum(logps, axis=0) - - def entropy(self): - return tf.stack([cat.entropy() for cat in self.cats], axis=1) - - def kl(self, other): - return [cat.kl(oth_cat) for cat, oth_cat in zip(self.cats, other.cats)] - - def _build_sample_op(self): - return tf.stack([cat.sample() for cat in self.cats], axis=1) - - class DiagGaussian(ActionDistribution): """Action distribution where each 
vector element is a gaussian. diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index 40bc2a13f864..73b55675c8f7 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -12,8 +12,8 @@ _global_registry from ray.rllib.models.extra_spaces import Simplex -from ray.rllib.models.action_dist import (Categorical, MultiCategorical, - Deterministic, DiagGaussian, +from ray.rllib.models.action_dist import (Categorical, Deterministic, + DiagGaussian, MultiActionDistribution, Dirichlet) from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.models.fcnet import FullyConnectedNetwork @@ -136,9 +136,6 @@ def get_action_dist(action_space, config, dist_type=None): input_lens=input_lens), sum(input_lens) elif isinstance(action_space, Simplex): return Dirichlet, action_space.shape[0] - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - return MultiCategorical, sum(action_space.nvec) - raise NotImplementedError("Unsupported args: {} {}".format( action_space, dist_type)) @@ -174,11 +171,6 @@ def get_action_placeholder(action_space): elif isinstance(action_space, Simplex): return tf.placeholder( tf.float32, shape=(None, action_space.shape[0]), name="action") - elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): - return tf.placeholder( - tf.as_dtype(action_space.dtype), - shape=(None, len(action_space.nvec)), - name="action") else: raise NotImplementedError("action space {}" " not supported".format(action_space)) @@ -187,6 +179,7 @@ def get_action_placeholder(action_space): @DeveloperAPI def get_model(input_dict, obs_space, + action_space, num_outputs, options, state_in=None, @@ -197,10 +190,11 @@ def get_model(input_dict, input_dict (dict): Dict of input tensors to the model, including the observation under the "obs" key. obs_space (Space): Observation space of the target gym env. + action_space (Space): Action space of the target gym env. 
num_outputs (int): The size of the output vector of the model. options (dict): Optional args to pass to the model constructor. state_in (list): Optional RNN state in tensors. - seq_in (Tensor): Optional RNN sequence length tensor. + seq_lens (Tensor): Optional RNN sequence length tensor. Returns: model (models.Model): Neural network model. @@ -208,33 +202,36 @@ def get_model(input_dict, assert isinstance(input_dict, dict) options = options or MODEL_DEFAULTS - model = ModelCatalog._get_model(input_dict, obs_space, num_outputs, - options, state_in, seq_lens) + model = ModelCatalog._get_model(input_dict, obs_space, action_space, + num_outputs, options, state_in, + seq_lens) if options.get("use_lstm"): copy = dict(input_dict) copy["obs"] = model.last_layer feature_space = gym.spaces.Box( -1, 1, shape=(model.last_layer.shape[1], )) - model = LSTM(copy, feature_space, num_outputs, options, state_in, - seq_lens) + model = LSTM(copy, feature_space, action_space, num_outputs, + options, state_in, seq_lens) - logger.debug("Created model {}: ({} of {}, {}, {}) -> {}, {}".format( - model, input_dict, obs_space, state_in, seq_lens, model.outputs, - model.state_out)) + logger.debug( + "Created model {}: ({} of {}, {}, {}, {}) -> {}, {}".format( + model, input_dict, obs_space, action_space, state_in, seq_lens, + model.outputs, model.state_out)) model._validate_output_shape() return model @staticmethod - def _get_model(input_dict, obs_space, num_outputs, options, state_in, - seq_lens): + def _get_model(input_dict, obs_space, action_space, num_outputs, options, + state_in, seq_lens): if options.get("custom_model"): model = options["custom_model"] logger.debug("Using custom model {}".format(model)) return _global_registry.get(RLLIB_MODEL, model)( input_dict, obs_space, + action_space, num_outputs, options, state_in=state_in, @@ -243,10 +240,11 @@ def _get_model(input_dict, obs_space, num_outputs, options, state_in, obs_rank = len(input_dict["obs"].shape) - 1 if obs_rank > 1: - 
return VisionNetwork(input_dict, obs_space, num_outputs, options) + return VisionNetwork(input_dict, obs_space, action_space, + num_outputs, options) - return FullyConnectedNetwork(input_dict, obs_space, num_outputs, - options) + return FullyConnectedNetwork(input_dict, obs_space, action_space, + num_outputs, options) @staticmethod @DeveloperAPI diff --git a/python/ray/rllib/models/model.py b/python/ray/rllib/models/model.py index 39324ee81356..b5664057d9a8 100644 --- a/python/ray/rllib/models/model.py +++ b/python/ray/rllib/models/model.py @@ -48,6 +48,7 @@ class Model(object): def __init__(self, input_dict, obs_space, + action_space, num_outputs, options, state_in=None, @@ -59,6 +60,7 @@ def __init__(self, self.state_in = state_in or [] self.state_out = [] self.obs_space = obs_space + self.action_space = action_space self.num_outputs = num_outputs self.options = options self.scope = tf.get_variable_scope() diff --git a/python/ray/rllib/optimizers/async_replay_optimizer.py b/python/ray/rllib/optimizers/async_replay_optimizer.py index a2cfee61a0d7..f1ae1bf71639 100644 --- a/python/ray/rllib/optimizers/async_replay_optimizer.py +++ b/python/ray/rllib/optimizers/async_replay_optimizer.py @@ -225,7 +225,8 @@ def _step(self): return sample_timesteps, train_timesteps -@ray.remote(num_cpus=0) +# reserve 1 CPU so that our method calls don't get stalled +@ray.remote(num_cpus=1) class ReplayActor(object): """A replay buffer shard. 
diff --git a/python/ray/rllib/optimizers/async_samples_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py index 039fea346945..250ec735595f 100644 --- a/python/ray/rllib/optimizers/async_samples_optimizer.py +++ b/python/ray/rllib/optimizers/async_samples_optimizer.py @@ -84,9 +84,6 @@ def _init(self, learner_queue_size) self.learner.start() - if len(self.remote_evaluators) == 0: - logger.warning("Config num_workers=0 means training will hang!") - # Stats self._optimizer_step_timer = TimerStat() self.num_weight_syncs = 0 @@ -137,6 +134,8 @@ def get_mean_stats_and_reset(self): @override(PolicyOptimizer) def step(self): + if len(self.remote_evaluators) == 0: + raise ValueError("Config num_workers=0 means training will hang!") assert self.learner.is_alive() with self._optimizer_step_timer: sample_timesteps, train_timesteps = self._step() diff --git a/python/ray/rllib/tests/test_catalog.py b/python/ray/rllib/tests/test_catalog.py index efa1aba0e2f0..9346e1064c67 100644 --- a/python/ray/rllib/tests/test_catalog.py +++ b/python/ray/rllib/tests/test_catalog.py @@ -73,13 +73,14 @@ def testDefaultModels(self): with tf.variable_scope("test1"): p1 = ModelCatalog.get_model({ "obs": tf.zeros((10, 3), dtype=tf.float32) - }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {}) + }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5, {}) self.assertEqual(type(p1), FullyConnectedNetwork) with tf.variable_scope("test2"): p2 = ModelCatalog.get_model({ "obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32) - }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {}) + }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), Discrete(5), 5, + {}) self.assertEqual(type(p2), VisionNetwork) def testCustomModel(self): @@ -87,7 +88,7 @@ def testCustomModel(self): ModelCatalog.register_custom_model("foo", CustomModel) p1 = ModelCatalog.get_model({ "obs": tf.constant([1, 2, 3]) - }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, + }, Box(0, 1, shape=(3, ), dtype=np.float32), 
Discrete(5), 5, {"custom_model": "foo"}) self.assertEqual(str(type(p1)), str(CustomModel)) diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index 99f00ccaf434..6eeca3ef22a0 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -334,6 +334,38 @@ def testMultiAgentSample(self): self.assertEqual(batch.policy_batches["p0"]["t"].tolist(), list(range(25)) * 6) + def testMultiAgentSampleSyncRemote(self): + act_space = gym.spaces.Discrete(2) + obs_space = gym.spaces.Discrete(2) + ev = PolicyEvaluator( + env_creator=lambda _: BasicMultiAgent(5), + policy_graph={ + "p0": (MockPolicyGraph, obs_space, act_space, {}), + "p1": (MockPolicyGraph, obs_space, act_space, {}), + }, + policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), + batch_steps=50, + num_envs=4, + remote_worker_envs=True) + batch = ev.sample() + self.assertEqual(batch.count, 200) + + def testMultiAgentSampleAsyncRemote(self): + act_space = gym.spaces.Discrete(2) + obs_space = gym.spaces.Discrete(2) + ev = PolicyEvaluator( + env_creator=lambda _: BasicMultiAgent(5), + policy_graph={ + "p0": (MockPolicyGraph, obs_space, act_space, {}), + "p1": (MockPolicyGraph, obs_space, act_space, {}), + }, + policy_mapping_fn=lambda agent_id: "p{}".format(agent_id % 2), + batch_steps=50, + num_envs=4, + async_remote_worker_envs=True) + batch = ev.sample() + self.assertEqual(batch.count, 200) + def testMultiAgentSampleWithHorizon(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) @@ -621,5 +653,5 @@ def testTrainMultiCartpoleManyPolicies(self): if __name__ == "__main__": - ray.init() + ray.init(num_cpus=4) unittest.main(verbosity=2) diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_policy_evaluator.py index a810542adfa1..d5466865bad0 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ 
b/python/ray/rllib/tests/test_policy_evaluator.py @@ -4,6 +4,7 @@ import gym import numpy as np +import random import time import unittest from collections import Counter @@ -27,7 +28,7 @@ def compute_actions(self, prev_reward_batch=None, episodes=None, **kwargs): - return [0] * len(obs_batch), [], {} + return [random.choice([0, 1])] * len(obs_batch), [], {} def postprocess_trajectory(self, batch, @@ -138,6 +139,7 @@ def testBasic(self): "prev_rewards", "prev_actions" ]: self.assertIn(key, batch) + self.assertGreater(np.abs(np.mean(batch[key])), 0) def to_prev(vec): out = np.zeros_like(vec) diff --git a/python/ray/rllib/tests/test_supported_spaces.py b/python/ray/rllib/tests/test_supported_spaces.py index 7d59a04fb223..93a8366bf56c 100644 --- a/python/ray/rllib/tests/test_supported_spaces.py +++ b/python/ray/rllib/tests/test_supported_spaces.py @@ -105,7 +105,7 @@ def check_support_multiagent(alg, config): class ModelSupportedSpaces(unittest.TestCase): def setUp(self): - ray.init(num_cpus=4) + ray.init(num_cpus=10) def tearDown(self): ray.shutdown() diff --git a/python/ray/services.py b/python/ray/services.py index 76be5bc9d63f..4bfa4719c6be 100644 --- a/python/ray/services.py +++ b/python/ray/services.py @@ -10,7 +10,6 @@ import os import random import resource -import shutil import socket import subprocess import sys @@ -943,75 +942,6 @@ def start_dashboard(redis_address, return dashboard_url, process_info -def start_ui(redis_address, notebook_name, stdout_file=None, stderr_file=None): - """Start a UI process. - - Args: - redis_address: The address of the primary Redis shard. - notebook_name: The destination of the notebook file. - stdout_file: A file handle opened for writing to redirect stdout to. If - no redirection should happen, then this should be None. - stderr_file: A file handle opened for writing to redirect stderr to. If - no redirection should happen, then this should be None. 
- - Returns: - A tuple of the web UI url and ProcessInfo for the process that was - started. - """ - - port = 8888 - while True: - try: - port_test_socket = socket.socket() - port_test_socket.bind(("127.0.0.1", port)) - port_test_socket.close() - break - except socket.error: - port += 1 - - notebook_filepath = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "WebUI.ipynb") - # We copy the notebook file so that the original doesn't get modified by - # the user. - shutil.copy(notebook_filepath, notebook_name) - - new_notebook_directory = os.path.dirname(notebook_name) - # We generate the token used for authentication ourselves to avoid - # querying the jupyter server. - token = ray.utils.decode(binascii.hexlify(os.urandom(24))) - # The --ip=0.0.0.0 flag is intended to enable connecting to a notebook - # running within a docker container (from the outside). - command = [ - "jupyter", "notebook", "--no-browser", "--port={}".format(port), - "--ip=0.0.0.0", "--NotebookApp.iopub_data_rate_limit=10000000000", - "--NotebookApp.open_browser=False", - "--NotebookApp.token={}".format(token) - ] - # If the user is root, add the --allow-root flag. - if os.geteuid() == 0: - command.append("--allow-root") - - try: - process_info = start_ray_process( - command, - ray_constants.PROCESS_TYPE_WEB_UI, - env_updates={"REDIS_ADDRESS": redis_address}, - cwd=new_notebook_directory, - stdout_file=stdout_file, - stderr_file=stderr_file) - except Exception: - logger.warning("Failed to start the UI, you may need to run " - "'pip install jupyter'.") - else: - webui_url = ("http://localhost:{}/notebooks/{}?token={}".format( - port, os.path.basename(notebook_name), token)) - print("\n" + "=" * 70) - print("View the web UI at {}".format(webui_url)) - print("=" * 70 + "\n") - return webui_url, process_info - return None, None - - def check_and_update_resources(num_cpus, num_gpus, resources): """Sanity check a resource dictionary and add sensible defaults. 
@@ -1242,7 +1172,7 @@ def build_java_worker_command( """ assert java_worker_options is not None - command = "java {} ".format(java_worker_options) + command = "java ".format(java_worker_options) if redis_address is not None: command += "-Dray.redis.address={} ".format(redis_address) @@ -1259,6 +1189,11 @@ def build_java_worker_command( command += "-Dray.home={} ".format(RAY_HOME) # TODO(suquark): We should use temp_dir as the input of a java worker. command += "-Dray.log-dir={} ".format(os.path.join(temp_dir, "sockets")) + + if java_worker_options: + # Put `java_worker_options` in the last, so it can overwrite the + # above options. + command += java_worker_options + " " command += "org.ray.runtime.runner.worker.DefaultWorker" return command diff --git a/python/ray/tests/test_basic.py b/python/ray/tests/test_basic.py index 8d91766caee0..88c64d3e3f11 100644 --- a/python/ray/tests/test_basic.py +++ b/python/ray/tests/test_basic.py @@ -755,7 +755,7 @@ def testNoArgs(self): def no_op(): pass - self.init_ray() + self.ray_start() ray.get(no_op.remote()) @@ -827,51 +827,63 @@ def m(x): assert ray.get(k2.remote(1)) == 2 assert ray.get(m.remote(1)) == 2 - def test_submit_api(shutdown_only): - ray.init(num_cpus=1, num_gpus=1, resources={"Custom": 1}) - @ray.remote - def f(n): - return list(range(n)) +def test_submit_api(shutdown_only): + ray.init(num_cpus=1, num_gpus=1, resources={"Custom": 1}) - @ray.remote - def g(): - return ray.get_gpu_ids() + @ray.remote + def f(n): + return list(range(n)) - assert f._remote([0], num_return_vals=0) is None - id1 = f._remote(args=[1], num_return_vals=1) - assert ray.get(id1) == [0] - id1, id2 = f._remote(args=[2], num_return_vals=2) - assert ray.get([id1, id2]) == [0, 1] - id1, id2, id3 = f._remote(args=[3], num_return_vals=3) - assert ray.get([id1, id2, id3]) == [0, 1, 2] - assert ray.get( - g._remote( - args=[], num_cpus=1, num_gpus=1, - resources={"Custom": 1})) == [0] - infeasible_id = g._remote(args=[], 
resources={"NonexistentCustom": 1}) - ready_ids, remaining_ids = ray.wait([infeasible_id], timeout=0.05) - assert len(ready_ids) == 0 - assert len(remaining_ids) == 1 + @ray.remote + def g(): + return ray.get_gpu_ids() + + assert f._remote([0], num_return_vals=0) is None + id1 = f._remote(args=[1], num_return_vals=1) + assert ray.get(id1) == [0] + id1, id2 = f._remote(args=[2], num_return_vals=2) + assert ray.get([id1, id2]) == [0, 1] + id1, id2, id3 = f._remote(args=[3], num_return_vals=3) + assert ray.get([id1, id2, id3]) == [0, 1, 2] + assert ray.get( + g._remote(args=[], num_cpus=1, num_gpus=1, + resources={"Custom": 1})) == [0] + infeasible_id = g._remote(args=[], resources={"NonexistentCustom": 1}) + assert ray.get(g._remote()) == [] + ready_ids, remaining_ids = ray.wait([infeasible_id], timeout=0.05) + assert len(ready_ids) == 0 + assert len(remaining_ids) == 1 - @ray.remote - class Actor(object): - def __init__(self, x, y=0): - self.x = x - self.y = y + @ray.remote + class Actor(object): + def __init__(self, x, y=0): + self.x = x + self.y = y - def method(self, a, b=0): - return self.x, self.y, a, b + def method(self, a, b=0): + return self.x, self.y, a, b - def gpu_ids(self): - return ray.get_gpu_ids() + def gpu_ids(self): + return ray.get_gpu_ids() - a = Actor._remote( - args=[0], kwargs={"y": 1}, num_gpus=1, resources={"Custom": 1}) + @ray.remote + class Actor2(object): + def __init__(self): + pass + + def method(self): + pass + + a = Actor._remote( + args=[0], kwargs={"y": 1}, num_gpus=1, resources={"Custom": 1}) + + a2 = Actor2._remote() + ray.get(a2.method._remote()) - id1, id2, id3, id4 = a.method._remote( - args=["test"], kwargs={"b": 2}, num_return_vals=4) - assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2] + id1, id2, id3, id4 = a.method._remote( + args=["test"], kwargs={"b": 2}, num_return_vals=4) + assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2] def test_get_multiple(shutdown_only): @@ -2493,10 +2505,6 @@ def 
wait_for_object_table(): object_table = ray.global_state.object_table() assert len(object_table) == 2 - assert object_table[x_id]["IsEviction"][0] is False - - assert object_table[result_id]["IsEviction"][0] is False - assert object_table[x_id] == ray.global_state.object_table(x_id) object_table_entry = ray.global_state.object_table(result_id) assert object_table[result_id] == object_table_entry diff --git a/python/ray/tests/test_failure.py b/python/ray/tests/test_failure.py index 905b1ee28b4a..21152c35359a 100644 --- a/python/ray/tests/test_failure.py +++ b/python/ray/tests/test_failure.py @@ -655,11 +655,14 @@ def run_one_command(*command): -1) run_failure_test("Index is not a number.", "RAY.TABLE_APPEND", 1, 1, 2, 1, b"a") + run_failure_test("The entry to remove doesn't exist.", "RAY.SET_REMOVE", 1, + 1, 3, 1) run_one_command("RAY.TABLE_APPEND", 1, 1, 2, 1) # It's okay to add duplicate entries. run_one_command("RAY.TABLE_APPEND", 1, 1, 2, 1) run_one_command("RAY.TABLE_APPEND", 1, 1, 2, 1, 0) run_one_command("RAY.TABLE_APPEND", 1, 1, 2, 1, 1) + run_one_command("RAY.SET_ADD", 1, 1, 3, 1) @pytest.fixture diff --git a/python/ray/tests/utils.py b/python/ray/tests/utils.py index 189f9ae35f58..e4249f89a9cb 100644 --- a/python/ray/tests/utils.py +++ b/python/ray/tests/utils.py @@ -2,9 +2,7 @@ from __future__ import division from __future__ import print_function -import json import os -import redis import subprocess import sys import tempfile @@ -12,92 +10,6 @@ import ray -EVENT_KEY = "RAY_MULTI_NODE_TEST_KEY" -"""This key is used internally within this file for coordinating drivers.""" - - -def _wait_for_nodes_to_join(num_nodes, timeout=20): - """Wait until the nodes have joined the cluster. - - This will wait until exactly num_nodes have joined the cluster. - - Args: - num_nodes: The number of nodes to wait for. - timeout: The amount of time in seconds to wait before failing. 
- - Raises: - Exception: An exception is raised if too many nodes join the cluster or - if the timeout expires while we are waiting. - """ - start_time = time.time() - while time.time() - start_time < timeout: - client_table = ray.global_state.client_table() - num_ready_nodes = len(client_table) - if num_ready_nodes == num_nodes: - return - if num_ready_nodes > num_nodes: - # Too many nodes have joined. Something must be wrong. - raise Exception("{} nodes have joined the cluster, but we were " - "expecting {} nodes.".format( - num_ready_nodes, num_nodes)) - time.sleep(0.1) - - # If we get here then we timed out. - raise Exception("Timed out while waiting for {} nodes to join. Only {} " - "nodes have joined so far.".format(num_ready_nodes, - num_nodes)) - - -def _broadcast_event(event_name, redis_address, data=None): - """Broadcast an event. - - This is used to synchronize drivers for the multi-node tests. - - Args: - event_name: The name of the event to wait for. - redis_address: The address of the Redis server to use for - synchronization. - data: Extra data to include in the broadcast (this will be returned by - the corresponding _wait_for_event call). This data must be json - serializable. - """ - redis_host, redis_port = redis_address.split(":") - redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port)) - payload = json.dumps((event_name, data)) - redis_client.rpush(EVENT_KEY, payload) - - -def _wait_for_event(event_name, redis_address, extra_buffer=0): - """Block until an event has been broadcast. - - This is used to synchronize drivers for the multi-node tests. - - Args: - event_name: The name of the event to wait for. - redis_address: The address of the Redis server to use for - synchronization. - extra_buffer: An amount of time in seconds to wait after the event. - - Returns: - The data that was passed into the corresponding _broadcast_event call. 
- """ - redis_host, redis_port = redis_address.split(":") - redis_client = redis.StrictRedis(host=redis_host, port=int(redis_port)) - while True: - event_infos = redis_client.lrange(EVENT_KEY, 0, -1) - events = {} - for event_info in event_infos: - name, data = json.loads(event_info) - if name in events: - raise Exception("The same event {} was broadcast twice." - .format(name)) - events[name] = data - if event_name in events: - # Potentially sleep a little longer and then return the event data. - time.sleep(extra_buffer) - return events[event_name] - time.sleep(0.1) - def _pid_alive(pid): """Check if the process with this PID is alive or not. diff --git a/python/ray/tune/commands.py b/python/ray/tune/commands.py new file mode 100644 index 000000000000..5abf49858dd1 --- /dev/null +++ b/python/ray/tune/commands.py @@ -0,0 +1,258 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import json +import logging +import os +import sys +import subprocess +from datetime import datetime + +import pandas as pd +from ray.tune.util import flatten_dict +from ray.tune.result import TRAINING_ITERATION, MEAN_ACCURACY, MEAN_LOSS +from ray.tune.trial import Trial +try: + from tabulate import tabulate +except ImportError: + tabulate = None + +logger = logging.getLogger(__name__) + +TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S (%A)" + +DEFAULT_EXPERIMENT_INFO_KEYS = ( + "trainable_name", + "experiment_tag", + "trial_id", + "status", + "last_update_time", +) + +DEFAULT_RESULT_KEYS = (TRAINING_ITERATION, MEAN_ACCURACY, MEAN_LOSS) + +DEFAULT_PROJECT_INFO_KEYS = ( + "name", + "total_trials", + "running_trials", + "terminated_trials", + "error_trials", + "last_updated", +) + +try: + TERM_HEIGHT, TERM_WIDTH = subprocess.check_output(['stty', 'size']).split() + TERM_HEIGHT, TERM_WIDTH = int(TERM_HEIGHT), int(TERM_WIDTH) +except subprocess.CalledProcessError: + TERM_HEIGHT, TERM_WIDTH = 100, 100 + +EDITOR = 
os.getenv('EDITOR', 'vim') + + +def _check_tabulate(): + """Checks whether tabulate is installed.""" + if tabulate is None: + raise ImportError( + "Tabulate not installed. Please run `pip install tabulate`.") + + +def print_format_output(dataframe): + """Prints output of given dataframe to fit into terminal. + + Returns: + table (pd.DataFrame): Final outputted dataframe. + dropped_cols (list): Columns dropped due to terminal size. + empty_cols (list): Empty columns (dropped on default). + """ + print_df = pd.DataFrame() + dropped_cols = [] + empty_cols = [] + # column display priority is based on the info_keys passed in + for i, col in enumerate(dataframe): + if dataframe[col].isnull().all(): + # Don't add col to print_df if is fully empty + empty_cols += [col] + continue + + print_df[col] = dataframe[col] + test_table = tabulate(print_df, headers="keys", tablefmt="psql") + if str(test_table).index('\n') > TERM_WIDTH: + # Drop all columns beyond terminal width + print_df.drop(col, axis=1, inplace=True) + dropped_cols += list(dataframe.columns)[i:] + break + + table = tabulate( + print_df, headers="keys", tablefmt="psql", showindex="never") + + print(table) + if dropped_cols: + print("Dropped columns:", dropped_cols) + print("Please increase your terminal size to view remaining columns.") + if empty_cols: + print("Empty columns:", empty_cols) + + return table, dropped_cols, empty_cols + + +def _get_experiment_state(experiment_path, exit_on_fail=False): + experiment_path = os.path.expanduser(experiment_path) + experiment_state_paths = glob.glob( + os.path.join(experiment_path, "experiment_state*.json")) + if not experiment_state_paths: + if exit_on_fail: + print("No experiment state found!") + sys.exit(0) + else: + return + experiment_filename = max(list(experiment_state_paths)) + + with open(experiment_filename) as f: + experiment_state = json.load(f) + return experiment_state + + +def list_trials(experiment_path, + sort=None, + 
info_keys=DEFAULT_EXPERIMENT_INFO_KEYS, + result_keys=DEFAULT_RESULT_KEYS): + """Lists trials in the directory subtree starting at the given path. + + Args: + experiment_path (str): Directory where trials are located. + Corresponds to Experiment.local_dir/Experiment.name. + sort (str): Key to sort by. + info_keys (list): Keys that are displayed. + result_keys (list): Keys of last result that are displayed. + """ + _check_tabulate() + experiment_state = _get_experiment_state( + experiment_path, exit_on_fail=True) + + checkpoint_dicts = experiment_state["checkpoints"] + checkpoint_dicts = [flatten_dict(g) for g in checkpoint_dicts] + checkpoints_df = pd.DataFrame(checkpoint_dicts) + + result_keys = ["last_result:{}".format(k) for k in result_keys] + col_keys = [ + k for k in list(info_keys) + result_keys if k in checkpoints_df + ] + checkpoints_df = checkpoints_df[col_keys] + + if "last_update_time" in checkpoints_df: + with pd.option_context('mode.use_inf_as_null', True): + datetime_series = checkpoints_df["last_update_time"].dropna() + + datetime_series = datetime_series.apply( + lambda t: datetime.fromtimestamp(t).strftime(TIMESTAMP_FORMAT)) + checkpoints_df["last_update_time"] = datetime_series + + if "logdir" in checkpoints_df: + # logdir often too verbose to view in table, so drop experiment_path + checkpoints_df["logdir"] = checkpoints_df["logdir"].str.replace( + experiment_path, '') + + if sort: + if sort not in checkpoints_df: + raise KeyError("Sort Index '{}' not in: {}".format( + sort, list(checkpoints_df))) + checkpoints_df = checkpoints_df.sort_values(by=sort) + + print_format_output(checkpoints_df) + + +def list_experiments(project_path, + sort=None, + info_keys=DEFAULT_PROJECT_INFO_KEYS): + """Lists experiments in the directory subtree. + + Args: + project_path (str): Directory where experiments are located. + Corresponds to Experiment.local_dir. + sort (str): Key to sort by. + info_keys (list): Keys that are displayed. 
+ """ + _check_tabulate() + base, experiment_folders, _ = next(os.walk(project_path)) + + experiment_data_collection = [] + + for experiment_dir in experiment_folders: + experiment_state = _get_experiment_state( + os.path.join(base, experiment_dir)) + if not experiment_state: + logger.debug("No experiment state found in %s", experiment_dir) + continue + + checkpoints = pd.DataFrame(experiment_state["checkpoints"]) + runner_data = experiment_state["runner_data"] + + # Format time-based values. + time_values = { + "start_time": runner_data.get("_start_time"), + "last_updated": experiment_state.get("timestamp"), + } + + formatted_time_values = { + key: datetime.fromtimestamp(val).strftime(TIMESTAMP_FORMAT) + if val else None + for key, val in time_values.items() + } + + experiment_data = { + "name": experiment_dir, + "total_trials": checkpoints.shape[0], + "running_trials": (checkpoints["status"] == Trial.RUNNING).sum(), + "terminated_trials": ( + checkpoints["status"] == Trial.TERMINATED).sum(), + "error_trials": (checkpoints["status"] == Trial.ERROR).sum(), + } + experiment_data.update(formatted_time_values) + experiment_data_collection.append(experiment_data) + + if not experiment_data_collection: + print("No experiments found!") + sys.exit(0) + + info_df = pd.DataFrame(experiment_data_collection) + col_keys = [k for k in list(info_keys) if k in info_df] + + if not col_keys: + print("None of keys {} in experiment data!".format(info_keys)) + sys.exit(0) + + info_df = info_df[col_keys] + + if sort: + if sort not in info_df: + raise KeyError("Sort Index '{}' not in: {}".format( + sort, list(info_df))) + info_df = info_df.sort_values(by=sort) + + print_format_output(info_df) + + +def add_note(path, filename="note.txt"): + """Opens a txt file at the given path where user can add and save notes. + + Args: + path (str): Directory where note will be saved. + filename (str): Name of note. 
Defaults to "note.txt" + """ + path = os.path.expanduser(path) + assert os.path.isdir(path), "{} is not a valid directory.".format(path) + + filepath = os.path.join(path, filename) + exists = os.path.isfile(filepath) + + try: + subprocess.call([EDITOR, filepath]) + except Exception as exc: + logger.error("Editing note failed!") + raise exc + if exists: + print("Note updated at:", filepath) + else: + print("Note created at:", filepath) diff --git a/python/ray/tune/examples/mnist_pytorch.py b/python/ray/tune/examples/mnist_pytorch.py index df4072eaf1e6..ee23297d59fa 100644 --- a/python/ray/tune/examples/mnist_pytorch.py +++ b/python/ray/tune/examples/mnist_pytorch.py @@ -26,9 +26,9 @@ parser.add_argument( '--epochs', type=int, - default=10, + default=1, metavar='N', - help='number of epochs to train (default: 10)') + help='number of epochs to train (default: 1)') parser.add_argument( '--lr', type=float, diff --git a/python/ray/tune/examples/mnist_pytorch_trainable.py b/python/ray/tune/examples/mnist_pytorch_trainable.py index d22beddeead2..b4856c462da3 100644 --- a/python/ray/tune/examples/mnist_pytorch_trainable.py +++ b/python/ray/tune/examples/mnist_pytorch_trainable.py @@ -29,9 +29,9 @@ parser.add_argument( '--epochs', type=int, - default=10, + default=1, metavar='N', - help='number of epochs to train (default: 10)') + help='number of epochs to train (default: 1)') parser.add_argument( '--lr', type=float, diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py index 7f4bddac8015..2aa30ceb596c 100644 --- a/python/ray/tune/logger.py +++ b/python/ray/tune/logger.py @@ -17,15 +17,8 @@ logger = logging.getLogger(__name__) -try: - import tensorflow as tf - use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >= - distutils.version.LooseVersion("1.5.0")) -except ImportError: - tf = None - use_tf150_api = True - logger.warning("Couldn't import TensorFlow - " - "disabling TensorBoard logging.") +tf = None +use_tf150_api = True class Logger(object): @@ 
-121,6 +114,15 @@ def to_tf_values(result, path): class TFLogger(Logger): def _init(self): + try: + global tf, use_tf150_api + import tensorflow + tf = tensorflow + use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >= + distutils.version.LooseVersion("1.5.0")) + except ImportError: + logger.warning("Couldn't import TensorFlow - " + "disabling TensorBoard logging.") self._file_writer = tf.summary.FileWriter(self.logdir) def on_result(self, result): @@ -226,14 +228,13 @@ def on_result(self, result): def close(self): for _logger in self._loggers: _logger.close() - self._log_syncer.sync_now(force=True) + self._log_syncer.sync_now(force=False) self._log_syncer.close() def flush(self): for _logger in self._loggers: _logger.flush() - self._log_syncer.sync_now(force=True) - self._log_syncer.wait() + self._log_syncer.sync_now(force=False) def sync_results_to_new_location(self, worker_ip): """Sends the current log directory to the remote node. diff --git a/python/ray/tune/result.py b/python/ray/tune/result.py index 0d5aeb0d0618..47b53618651e 100644 --- a/python/ray/tune/result.py +++ b/python/ray/tune/result.py @@ -18,6 +18,15 @@ # (Auto-filled) The pid of the training process. PID = "pid" +# (Optional) Mean reward for current training iteration +EPISODE_REWARD_MEAN = "episode_reward_mean" + +# (Optional) Mean loss for training iteration +MEAN_LOSS = "mean_loss" + +# (Optional) Mean accuracy for training iteration +MEAN_ACCURACY = "mean_accuracy" + # Number of episodes in this iteration. 
EPISODES_THIS_ITER = "episodes_this_iter" diff --git a/python/ray/tune/scripts.py b/python/ray/tune/scripts.py new file mode 100644 index 000000000000..3b810ee3ae5c --- /dev/null +++ b/python/ray/tune/scripts.py @@ -0,0 +1,56 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import click +import ray.tune.commands as commands + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.argument("experiment_path", required=True, type=str) +@click.option( + '--sort', default=None, type=str, help='Select which column to sort on.') +def list_trials(experiment_path, sort): + """Lists trials in the directory subtree starting at the given path.""" + commands.list_trials(experiment_path, sort) + + +@cli.command() +@click.argument("project_path", required=True, type=str) +@click.option( + '--sort', default=None, type=str, help='Select which column to sort on.') +def list_experiments(project_path, sort): + """Lists experiments in the directory subtree.""" + commands.list_experiments(project_path, sort) + + +@cli.command() +@click.argument("path", required=True, type=str) +@click.option( + "--filename", + default="note.txt", + type=str, + help='Specify filename for note.') +def add_note(path, filename): + """Adds user notes as a text file at the given path.""" + commands.add_note(path, filename) + + +cli.add_command(list_trials, name="ls") +cli.add_command(list_trials, name="list-trials") +cli.add_command(list_experiments, name="lsx") +cli.add_command(list_experiments, name="list-experiments") +cli.add_command(add_note, name="add-note") + + +def main(): + return cli() + + +if __name__ == "__main__": + main() diff --git a/python/ray/tune/suggest/bayesopt.py b/python/ray/tune/suggest/bayesopt.py index 48f6406aa366..089f1b26d7ae 100644 --- a/python/ray/tune/suggest/bayesopt.py +++ b/python/ray/tune/suggest/bayesopt.py @@ -4,13 +4,16 @@ import copy -try: - import bayes_opt as byo -except Exception: - byo = 
None - from ray.tune.suggest.suggestion import SuggestionAlgorithm +byo = None + + +def _import_bayesopt(): + global byo + import bayes_opt + byo = bayes_opt + class BayesOptSearch(SuggestionAlgorithm): """A wrapper around BayesOpt to provide trial suggestions. @@ -56,6 +59,7 @@ def __init__(self, random_state=1, verbose=0, **kwargs): + _import_bayesopt() assert byo is not None, ( "BayesOpt must be installed!. You can install BayesOpt with" " the command: `pip install bayesian-optimization`.") diff --git a/python/ray/tune/suggest/hyperopt.py b/python/ray/tune/suggest/hyperopt.py index 2c32562505f9..62795cc6cb13 100644 --- a/python/ray/tune/suggest/hyperopt.py +++ b/python/ray/tune/suggest/hyperopt.py @@ -6,17 +6,19 @@ import copy import logging -try: - hyperopt_logger = logging.getLogger("hyperopt") - hyperopt_logger.setLevel(logging.WARNING) - import hyperopt as hpo - from hyperopt.fmin import generate_trials_to_calculate -except Exception: - hpo = None - from ray.tune.error import TuneError from ray.tune.suggest.suggestion import SuggestionAlgorithm +hpo = None + + +def _import_hyperopt(): + global hpo + hyperopt_logger = logging.getLogger("hyperopt") + hyperopt_logger.setLevel(logging.WARNING) + import hyperopt + hpo = hyperopt + class HyperOptSearch(SuggestionAlgorithm): """A wrapper around HyperOpt to provide trial suggestions. @@ -73,7 +75,9 @@ def __init__(self, reward_attr="episode_reward_mean", points_to_evaluate=None, **kwargs): + _import_hyperopt() assert hpo is not None, "HyperOpt must be installed!" 
+ from hyperopt.fmin import generate_trials_to_calculate assert type(max_concurrent) is int and max_concurrent > 0 self._max_concurrent = max_concurrent self._reward_attr = reward_attr diff --git a/python/ray/tune/suggest/skopt.py b/python/ray/tune/suggest/skopt.py index 5618b50521b6..2d3fdbcb1261 100644 --- a/python/ray/tune/suggest/skopt.py +++ b/python/ray/tune/suggest/skopt.py @@ -67,34 +67,21 @@ def __init__(self, **kwargs): assert skopt is not None, """skopt must be installed! You can install Skopt with the command: - `pip install scikit-optimize`.""" - assert type(max_concurrent) is int and max_concurrent > 0 - if points_to_evaluate is None: - points_to_evaluate = [] - elif not isinstance(points_to_evaluate[0], (list, tuple)): - points_to_evaluate = [points_to_evaluate] - if not isinstance(points_to_evaluate, list): - raise ValueError( - "`points_to_evaluate` should be a list, but got %s" % - type(points_to_evaluate)) - if isinstance(evaluated_rewards, Iterable): - evaluated_rewards = list(evaluated_rewards) - elif isinstance(evaluated_rewards, numbers.Number): - evaluated_rewards = [evaluated_rewards] + `pip install scikit-optimize`.""" + assert type(max_concurrent) is int and max_concurrent > 0 + if points_to_evaluate: + self._validate_points_to_evaluate(points_to_evaluate, len(parameter_names)) + if evaluated_rewards: + self._validate_evaluated_rewards(evaluated_rewards) self._initial_points = [] if points_to_evaluate and evaluated_rewards: - if not (isinstance(evaluated_rewards, Iterable) - or isinstance(evaluated_rewards, numbers.Number)): - raise ValueError( - "`evaluated_rewards` should be an iterable or a scalar, got %s" - % type(evaluated_rewards)) if len(points_to_evaluate) != len(evaluated_rewards): raise ValueError( "`points_to_evaluate` and `evaluated_rewards` should have the same length" ) optimizer.tell(points_to_evaluate, evaluated_rewards) elif points_to_evaluate: - self._initial_points = points_to_evaluate + self._initial_points = 
points_to_evaluate self._max_concurrent = max_concurrent self._parameters = parameter_names self._reward_attr = reward_attr @@ -102,6 +89,28 @@ def __init__(self, self._live_trial_mapping = {} super(SkOptSearch, self).__init__(**kwargs) + def _validate_points_to_evaluate(self, points, dimension): + if not isinstance(points, list): + raise TypeError( + "`points_to_evaluate` should be a list, but got %s" % + type(points)) + for point in points: + if not isinstance(point, list): + raise TypeError( + "`points_to_evaluate` should be a list, but got %s" % + type(point)) + if len(point) != dimension: + raise TypeError( + """each point in `points_to_evaluate` should + have the same dimensions as `parameter_names`""" + ) + + def _validate_evaluated_rewards(self, rewards): + if not isinstance(rewards, list): + raise TypeError( + "`evaluated_rewards` should be a list, but got %s" % + type(points_to_evaluate)) + def _suggest(self, trial_id): if self._num_live_trials() >= self._max_concurrent: return None diff --git a/python/ray/tune/tests/test_commands.py b/python/ray/tune/tests/test_commands.py new file mode 100644 index 000000000000..174f356d5b08 --- /dev/null +++ b/python/ray/tune/tests/test_commands.py @@ -0,0 +1,66 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pytest + +import ray +from ray import tune +from ray.rllib import _register_all +from ray.tune import commands + + +@pytest.fixture +def start_ray(): + ray.init() + _register_all() + yield + ray.shutdown() + + +def test_ls(start_ray, capsys, tmpdir): + """This test captures output of list_trials.""" + experiment_name = "test_ls" + experiment_path = os.path.join(str(tmpdir), experiment_name) + num_samples = 2 + with capsys.disabled(): + tune.run_experiments({ + experiment_name: { + "run": "__fake", + "stop": { + "training_iteration": 1 + }, + "num_samples": num_samples, + "local_dir": str(tmpdir) + } + }) + + 
commands.list_trials(experiment_path, info_keys=("status", )) + captured = capsys.readouterr().out.strip() + lines = captured.split("\n") + assert sum("TERMINATED" in line for line in lines) == num_samples + + +def test_lsx(start_ray, capsys, tmpdir): + """This test captures output of list_experiments.""" + project_path = str(tmpdir) + num_experiments = 3 + for i in range(num_experiments): + experiment_name = "test_lsx{}".format(i) + with capsys.disabled(): + tune.run_experiments({ + experiment_name: { + "run": "__fake", + "stop": { + "training_iteration": 1 + }, + "num_samples": 1, + "local_dir": project_path + } + }) + + commands.list_experiments(project_path, info_keys=("total_trials", )) + captured = capsys.readouterr().out.strip() + lines = captured.split("\n") + assert sum("1" in line for line in lines) >= 3 diff --git a/python/ray/tune/tests/test_ray_trial_executor.py b/python/ray/tune/tests/test_ray_trial_executor.py index ee5a98a87067..0341dd487d0e 100644 --- a/python/ray/tune/tests/test_ray_trial_executor.py +++ b/python/ray/tune/tests/test_ray_trial_executor.py @@ -18,6 +18,7 @@ class RayTrialExecutorTest(unittest.TestCase): def setUp(self): self.trial_executor = RayTrialExecutor(queue_trials=False) ray.init() + _register_all() # Needed for flaky tests def tearDown(self): ray.shutdown() diff --git a/python/ray/tune/tests/test_trial_scheduler.py b/python/ray/tune/tests/test_trial_scheduler.py index b5426bb3dd4b..0f68e39f98d8 100644 --- a/python/ray/tune/tests/test_trial_scheduler.py +++ b/python/ray/tune/tests/test_trial_scheduler.py @@ -583,7 +583,7 @@ def __init__(self, i, config): self.logger_running = False self.restored_checkpoint = None self.resources = Resources(1, 0) - self.trial_name = None + self.custom_trial_name = None class PopulationBasedTestingSuite(unittest.TestCase): diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 0989ba3f4617..fb324fcb7885 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ 
-25,7 +25,8 @@ # have been defined yet. See https://github.com/ray-project/ray/issues/1716. import ray.tune.registry from ray.tune.result import (DEFAULT_RESULTS_DIR, DONE, HOSTNAME, PID, - TIME_TOTAL_S, TRAINING_ITERATION, TIMESTEPS_TOTAL) + TIME_TOTAL_S, TRAINING_ITERATION, TIMESTEPS_TOTAL, + EPISODE_REWARD_MEAN, MEAN_LOSS, MEAN_ACCURACY) from ray.utils import _random_string, binary_to_hex, hex_to_binary DEBUG_PRINT_INTERVAL = 5 @@ -299,9 +300,27 @@ def __init__(self, self.error_file = None self.num_failures = 0 - self.trial_name = None + self.custom_trial_name = None + + # AutoML fields + self.results = None + self.best_result = None + self.param_config = None + self.extra_arg = None + + self._nonjson_fields = [ + "_checkpoint", + "config", + "loggers", + "sync_function", + "last_result", + "results", + "best_result", + "param_config", + "extra_arg", + ] if trial_name_creator: - self.trial_name = trial_name_creator(self) + self.custom_trial_name = trial_name_creator(self) @classmethod def _registration_check(cls, trainable_name): @@ -429,17 +448,17 @@ def location_string(hostname, pid): if self.last_result.get(TIMESTEPS_TOTAL) is not None: pieces.append('{} ts'.format(self.last_result[TIMESTEPS_TOTAL])) - if self.last_result.get("episode_reward_mean") is not None: + if self.last_result.get(EPISODE_REWARD_MEAN) is not None: pieces.append('{} rew'.format( - format(self.last_result["episode_reward_mean"], '.3g'))) + format(self.last_result[EPISODE_REWARD_MEAN], '.3g'))) - if self.last_result.get("mean_loss") is not None: + if self.last_result.get(MEAN_LOSS) is not None: pieces.append('{} loss'.format( - format(self.last_result["mean_loss"], '.3g'))) + format(self.last_result[MEAN_LOSS], '.3g'))) - if self.last_result.get("mean_accuracy") is not None: + if self.last_result.get(MEAN_ACCURACY) is not None: pieces.append('{} acc'.format( - format(self.last_result["mean_accuracy"], '.3g'))) + format(self.last_result[MEAN_ACCURACY], '.3g'))) return ', '.join(pieces) @@ 
-496,8 +515,8 @@ def __str__(self): Can be overriden with a custom string creator. """ - if self.trial_name: - return self.trial_name + if self.custom_trial_name: + return self.custom_trial_name if "env" in self.config: env = self.config["env"] @@ -521,22 +540,11 @@ def __getstate__(self): state = self.__dict__.copy() state["resources"] = resources_to_json(self.resources) - # These are non-pickleable entries. - pickle_data = { - "_checkpoint": self._checkpoint, - "config": self.config, - "loggers": self.loggers, - "sync_function": self.sync_function, - "last_result": self.last_result - } - - for key, value in pickle_data.items(): - state[key] = binary_to_hex(cloudpickle.dumps(value)) + for key in self._nonjson_fields: + state[key] = binary_to_hex(cloudpickle.dumps(state.get(key))) state["runner"] = None state["result_logger"] = None - if self.status == Trial.RUNNING: - state["status"] = Trial.PENDING if self.result_logger: self.result_logger.flush() state["__logger_started__"] = True @@ -547,10 +555,9 @@ def __getstate__(self): def __setstate__(self, state): logger_started = state.pop("__logger_started__") state["resources"] = json_to_resources(state["resources"]) - for key in [ - "_checkpoint", "config", "loggers", "sync_function", - "last_result" - ]: + if state["status"] == Trial.RUNNING: + state["status"] = Trial.PENDING + for key in self._nonjson_fields: state[key] = cloudpickle.loads(hex_to_binary(state[key])) self.__dict__.update(state) diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index afce6863a98c..96dfaa5deef6 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -112,7 +112,9 @@ def __init__(self, self._stop_queue = [] self._metadata_checkpoint_dir = metadata_checkpoint_dir - self._session = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") + self._start_time = time.time() + self._session_str = datetime.fromtimestamp( + self._start_time).strftime("%Y-%m-%d_%H-%M-%S") @classmethod def 
checkpoint_exists(cls, directory): @@ -136,7 +138,8 @@ def checkpoint(self): runner_state = { "checkpoints": list( self.trial_executor.get_checkpoints().values()), - "runner_data": self.__getstate__() + "runner_data": self.__getstate__(), + "timestamp": time.time() } tmp_file_name = os.path.join(metadata_checkpoint_dir, ".tmp_checkpoint") @@ -146,7 +149,7 @@ def checkpoint(self): os.rename( tmp_file_name, os.path.join(metadata_checkpoint_dir, - TrialRunner.CKPT_FILE_TMPL.format(self._session))) + TrialRunner.CKPT_FILE_TMPL.format(self._session_str))) return metadata_checkpoint_dir @classmethod @@ -558,8 +561,12 @@ def __getstate__(self): """ state = self.__dict__.copy() for k in [ - "_trials", "_stop_queue", "_server", "_search_alg", - "_scheduler_alg", "trial_executor", "_session" + "_trials", + "_stop_queue", + "_server", + "_search_alg", + "_scheduler_alg", + "trial_executor", ]: del state[k] state["launch_web_server"] = bool(self._server) @@ -567,6 +574,14 @@ def __getstate__(self): def __setstate__(self, state): launch_web_server = state.pop("launch_web_server") + + # Use session_str from previous checkpoint if does not exist + session_str = state.pop("_session_str") + self.__dict__.setdefault("_session_str", session_str) + # Use start_time from previous checkpoint if does not exist + start_time = state.pop("_start_time") + self.__dict__.setdefault("_start_time", start_time) + self.__dict__.update(state) if launch_web_server: self._server = TuneServer(self, self._server_port) diff --git a/python/ray/tune/util.py b/python/ray/tune/util.py index ce4047f2e7b4..75ac57ef188a 100644 --- a/python/ray/tune/util.py +++ b/python/ray/tune/util.py @@ -61,7 +61,7 @@ def deep_update(original, new_dict, new_keys_allowed, whitelist): if k not in original: if not new_keys_allowed: raise Exception("Unknown config parameter `{}` ".format(k)) - if type(original.get(k)) is dict: + if isinstance(original.get(k), dict): if k in whitelist: deep_update(original[k], value, True, []) 
else: @@ -71,6 +71,21 @@ def deep_update(original, new_dict, new_keys_allowed, whitelist): return original +def flatten_dict(dt): + while any(isinstance(v, dict) for v in dt.values()): + remove = [] + add = {} + for key, value in dt.items(): + if isinstance(value, dict): + for subkey, v in value.items(): + add[":".join([key, subkey])] = v + remove.append(key) + dt.update(add) + for k in remove: + del dt[k] + return dt + + def _to_pinnable(obj): """Converts obj to a form that can be pinned in object store memory. diff --git a/python/ray/tune/visual_utils.py b/python/ray/tune/visual_utils.py index 9273a91542c3..4a68bcec9d4e 100644 --- a/python/ray/tune/visual_utils.py +++ b/python/ray/tune/visual_utils.py @@ -10,22 +10,9 @@ import numpy as np import json -logger = logging.getLogger(__name__) - +from ray.tune.util import flatten_dict -def _flatten_dict(dt): - while any(type(v) is dict for v in dt.values()): - remove = [] - add = {} - for key, value in dt.items(): - if type(value) is dict: - for subkey, v in value.items(): - add[":".join([key, subkey])] = v - remove.append(key) - dt.update(add) - for k in remove: - del dt[k] - return dt +logger = logging.getLogger(__name__) def _parse_results(res_path): @@ -35,7 +22,7 @@ def _parse_results(res_path): # Get last line in file for line in f: pass - res_dict = _flatten_dict(json.loads(line.strip())) + res_dict = flatten_dict(json.loads(line.strip())) except Exception: logger.exception("Importing %s failed...Perhaps empty?" 
% res_path) return res_dict @@ -44,7 +31,7 @@ def _parse_results(res_path): def _parse_configs(cfg_path): try: with open(cfg_path) as f: - cfg_dict = _flatten_dict(json.load(f)) + cfg_dict = flatten_dict(json.load(f)) except Exception: logger.exception("Config parsing failed.") return cfg_dict diff --git a/python/ray/worker.py b/python/ray/worker.py index c7ee4e4d7214..3938d9256d86 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -1342,7 +1342,7 @@ def init(redis_address=None, huge_pages: Boolean flag indicating whether to start the Object Store with hugetlbfs support. Requires plasma_directory. include_webui: Boolean flag indicating whether to start the web - UI, which is a Jupyter notebook. + UI, which displays the status of the Ray cluster. driver_id: The ID of driver. configure_logging: True if allow the logging cofiguration here. Otherwise, the users may want to configure it by their own. diff --git a/python/setup.py b/python/setup.py index 2177e0c6b5a4..07b9b3b92164 100644 --- a/python/setup.py +++ b/python/setup.py @@ -24,9 +24,8 @@ "ray/core/src/ray/gcs/redis_module/libray_redis_module.so", "ray/core/src/plasma/plasma_store_server", "ray/_raylet.so", "ray/core/src/ray/raylet/raylet_monitor", "ray/core/src/ray/raylet/raylet", - "ray/WebUI.ipynb", "ray/dashboard/dashboard.py", - "ray/dashboard/index.html", "ray/dashboard/res/main.css", - "ray/dashboard/res/main.js" + "ray/dashboard/dashboard.py", "ray/dashboard/index.html", + "ray/dashboard/res/main.css", "ray/dashboard/res/main.js" ] # These are the directories where automatically generated Python flatbuffer @@ -38,11 +37,6 @@ optional_ray_files = [] -ray_ui_files = [ - "ray/core/src/catapult_files/index.html", - "ray/core/src/catapult_files/trace_viewer_full.html" -] - ray_autoscaler_files = [ "ray/autoscaler/aws/example-full.yaml", "ray/autoscaler/gcp/example-full.yaml", @@ -56,13 +50,6 @@ "ray/core/src/credis/redis/src/redis-server" ] -# The UI files are mandatory if the INCLUDE_UI 
environment variable equals 1. -# Otherwise, they are optional. -if "INCLUDE_UI" in os.environ and os.environ["INCLUDE_UI"] == "1": - ray_files += ray_ui_files -else: - optional_ray_files += ray_ui_files - optional_ray_files += ray_autoscaler_files extras = { @@ -80,7 +67,11 @@ def run(self): # version of Python to build pyarrow inside the build.sh script. Note # that certain flags will not be passed along such as --user or sudo. # TODO(rkn): Fix this. - subprocess.check_call(["../build.sh", "-p", sys.executable]) + command = ["../build.sh", "-p", sys.executable] + if os.getenv("RAY_INSTALL_JAVA") == "1": + # Also build binaries for Java if the above env variable exists. + command += ["-l", "python,java"] + subprocess.check_call(command) # We also need to install pyarrow along with Ray, so make sure that the # relevant non-Python pyarrow files get copied. @@ -184,7 +175,7 @@ def find_version(*filepath): entry_points={ "console_scripts": [ "ray=ray.scripts.scripts:main", - "rllib=ray.rllib.scripts:cli [rllib]" + "rllib=ray.rllib.scripts:cli [rllib]", "tune=ray.tune.scripts:cli" ] }, include_package_data=True, diff --git a/src/ray/common/client_connection.cc b/src/ray/common/client_connection.cc index 5cbaa4cff2b8..de9e71f05e0a 100644 --- a/src/ray/common/client_connection.cc +++ b/src/ray/common/client_connection.cc @@ -12,6 +12,12 @@ namespace ray { ray::Status TcpConnect(boost::asio::ip::tcp::socket &socket, const std::string &ip_address_string, int port) { + // Disable Nagle's algorithm, which caused transfer delays of 10s of ms in + // certain cases. 
+ socket.open(boost::asio::ip::tcp::v4()); + boost::asio::ip::tcp::no_delay option(true); + socket.set_option(option); + boost::asio::ip::address ip_address = boost::asio::ip::address::from_string(ip_address_string); boost::asio::ip::tcp::endpoint endpoint(ip_address, port); diff --git a/src/ray/common/common_protocol.cc b/src/ray/common/common_protocol.cc index f5ed40af570c..adce684fc299 100644 --- a/src/ray/common/common_protocol.cc +++ b/src/ray/common/common_protocol.cc @@ -2,74 +2,6 @@ #include "ray/util/logging.h" -flatbuffers::Offset to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - ray::ObjectID object_id) { - return fbb.CreateString(reinterpret_cast(object_id.data()), - sizeof(ray::ObjectID)); -} - -ray::ObjectID from_flatbuf(const flatbuffers::String &string) { - ray::ObjectID object_id; - RAY_CHECK(string.size() == sizeof(ray::ObjectID)); - memcpy(object_id.mutable_data(), string.data(), sizeof(ray::ObjectID)); - return object_id; -} - -const std::vector from_flatbuf( - const flatbuffers::Vector> &vector) { - std::vector object_ids; - for (int64_t i = 0; i < vector.Length(); i++) { - object_ids.push_back(from_flatbuf(*vector.Get(i))); - } - return object_ids; -} - -const std::vector object_ids_from_flatbuf( - const flatbuffers::String &string) { - const auto &object_ids = string_from_flatbuf(string); - std::vector ret; - RAY_CHECK(object_ids.size() % kUniqueIDSize == 0); - auto count = object_ids.size() / kUniqueIDSize; - - for (size_t i = 0; i < count; ++i) { - auto pos = static_cast(kUniqueIDSize * i); - const auto &id = object_ids.substr(pos, kUniqueIDSize); - ret.push_back(ray::ObjectID::from_binary(id)); - } - - return ret; -} - -flatbuffers::Offset object_ids_to_flatbuf( - flatbuffers::FlatBufferBuilder &fbb, const std::vector &object_ids) { - std::string result; - for (const auto &id : object_ids) { - result += id.binary(); - } - - return fbb.CreateString(result); -} - -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, 
ray::ObjectID object_ids[], - int64_t num_objects) { - std::vector> results; - for (int64_t i = 0; i < num_objects; i++) { - results.push_back(to_flatbuf(fbb, object_ids[i])); - } - return fbb.CreateVector(results); -} - -flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - const std::vector &object_ids) { - std::vector> results; - for (auto object_id : object_ids) { - results.push_back(to_flatbuf(fbb, object_id)); - } - return fbb.CreateVector(results); -} - std::string string_from_flatbuf(const flatbuffers::String &string) { return std::string(string.data(), string.size()); } diff --git a/src/ray/common/common_protocol.h b/src/ray/common/common_protocol.h index bea4a5b92542..bc3d9b646a4b 100644 --- a/src/ray/common/common_protocol.h +++ b/src/ray/common/common_protocol.h @@ -6,63 +6,68 @@ #include #include "ray/id.h" +#include "ray/util/logging.h" -/// Convert an object ID to a flatbuffer string. +/// Convert an unique ID to a flatbuffer string. /// /// @param fbb Reference to the flatbuffer builder. -/// @param object_id The object ID to be converted. -/// @return The flatbuffer string contining the object ID. +/// @param id The ID to be converted. +/// @return The flatbuffer string containing the ID. +template flatbuffers::Offset to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - ray::ObjectID object_id); + ID id); -/// Convert a flatbuffer string to an object ID. +/// Convert a flatbuffer string to an unique ID. /// /// @param string The flatbuffer string. -/// @return The object ID. -ray::ObjectID from_flatbuf(const flatbuffers::String &string); +/// @return The ID. +template +ID from_flatbuf(const flatbuffers::String &string); -/// Convert a flatbuffer vector of strings to a vector of object IDs. +/// Convert a flatbuffer vector of strings to a vector of unique IDs. /// /// @param vector The flatbuffer vector. -/// @return The vector of object IDs. -const std::vector from_flatbuf( +/// @return The vector of IDs. 
+template +const std::vector from_flatbuf( const flatbuffers::Vector> &vector); /// Convert a flatbuffer of string that concatenated -/// object IDs to a vector of object IDs. +/// unique IDs to a vector of unique IDs. /// /// @param vector The flatbuffer vector. -/// @return The vector of object IDs. -const std::vector object_ids_from_flatbuf( - const flatbuffers::String &string); +/// @return The vector of IDs. +template +const std::vector ids_from_flatbuf(const flatbuffers::String &string); -/// Convert a vector of object IDs to a flatbuffer string. +/// Convert a vector of unique IDs to a flatbuffer string. /// The IDs are concatenated to a string with binary. /// /// @param fbb Reference to the flatbuffer builder. -/// @param object_ids The vector of object IDs. +/// @param ids The vector of IDs. /// @return Flatbuffer string of concatenated IDs. -flatbuffers::Offset object_ids_to_flatbuf( - flatbuffers::FlatBufferBuilder &fbb, const std::vector &object_ids); +template +flatbuffers::Offset ids_to_flatbuf( + flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids); -/// Convert an array of object IDs to a flatbuffer vector of strings. +/// Convert an array of unique IDs to a flatbuffer vector of strings. /// /// @param fbb Reference to the flatbuffer builder. -/// @param object_ids Array of object IDs. -/// @param num_objects Number of elements in the array. +/// @param ids Array of unique IDs. +/// @param num_ids Number of elements in the array. /// @return Flatbuffer vector of strings. +template flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, ray::ObjectID object_ids[], - int64_t num_objects); +to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, ID ids[], int64_t num_ids); -/// Convert a vector of object IDs to a flatbuffer vector of strings. +/// Convert a vector of unique IDs to a flatbuffer vector of strings. /// /// @param fbb Reference to the flatbuffer builder. -/// @param object_ids Vector of object IDs. 
+/// @param ids Vector of IDs. /// @return Flatbuffer vector of strings. +template flatbuffers::Offset>> -to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, - const std::vector &object_ids); +to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids); /// Convert a flatbuffer string to a std::string. /// @@ -95,4 +100,76 @@ std::vector string_vec_from_flatbuf( flatbuffers::Offset>> string_vec_to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::vector &string_vector); + +template +flatbuffers::Offset to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, + ID id) { + return fbb.CreateString(reinterpret_cast(id.data()), sizeof(ID)); +} + +template +ID from_flatbuf(const flatbuffers::String &string) { + ID id; + RAY_CHECK(string.size() == sizeof(ID)); + memcpy(id.mutable_data(), string.data(), sizeof(ID)); + return id; +} + +template +const std::vector from_flatbuf( + const flatbuffers::Vector> &vector) { + std::vector ids; + for (int64_t i = 0; i < vector.Length(); i++) { + ids.push_back(from_flatbuf(*vector.Get(i))); + } + return ids; +} + +template +const std::vector ids_from_flatbuf(const flatbuffers::String &string) { + const auto &ids = string_from_flatbuf(string); + std::vector ret; + RAY_CHECK(ids.size() % kUniqueIDSize == 0); + auto count = ids.size() / kUniqueIDSize; + + for (size_t i = 0; i < count; ++i) { + auto pos = static_cast(kUniqueIDSize * i); + const auto &id = ids.substr(pos, kUniqueIDSize); + ret.push_back(ID::from_binary(id)); + } + + return ret; +} + +template +flatbuffers::Offset ids_to_flatbuf( + flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids) { + std::string result; + for (const auto &id : ids) { + result += id.binary(); + } + + return fbb.CreateString(result); +} + +template +flatbuffers::Offset>> +to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, ID ids[], int64_t num_ids) { + std::vector> results; + for (int64_t i = 0; i < num_ids; i++) { + results.push_back(to_flatbuf(fbb, ids[i])); + } + return 
fbb.CreateVector(results); +} + +template +flatbuffers::Offset>> +to_flatbuf(flatbuffers::FlatBufferBuilder &fbb, const std::vector &ids) { + std::vector> results; + for (auto id : ids) { + results.push_back(to_flatbuf(fbb, id)); + } + return fbb.CreateVector(results); +} + #endif diff --git a/src/ray/gcs/client.cc b/src/ray/gcs/client.cc index 4ce6a07b49fb..b51421e10a14 100644 --- a/src/ray/gcs/client.cc +++ b/src/ray/gcs/client.cc @@ -112,7 +112,7 @@ AsyncGcsClient::AsyncGcsClient(const std::string &address, int port, driver_table_.reset(new DriverTable({primary_context_}, this)); heartbeat_batch_table_.reset(new HeartbeatBatchTable({primary_context_}, this)); // Tables below would be sharded. - object_table_.reset(new ObjectTable(shard_contexts_, this, command_type)); + object_table_.reset(new ObjectTable(shard_contexts_, this)); raylet_task_table_.reset(new raylet::TaskTable(shard_contexts_, this, command_type)); task_reconstruction_log_.reset(new TaskReconstructionLog(shard_contexts_, this)); task_lease_table_.reset(new TaskLeaseTable(shard_contexts_, this)); diff --git a/src/ray/gcs/client_test.cc b/src/ray/gcs/client_test.cc index 6bf2a53156be..0d1f812a52be 100644 --- a/src/ray/gcs/client_test.cc +++ b/src/ray/gcs/client_test.cc @@ -131,40 +131,42 @@ TEST_MACRO(TestGcsWithChainAsio, TestTableLookup); void TestLogLookup(const JobID &job_id, std::shared_ptr client) { // Append some entries to the log at an object ID. - ObjectID object_id = ObjectID::from_random(); - std::vector managers = {"abc", "def", "ghi"}; - for (auto &manager : managers) { - auto data = std::make_shared(); - data->manager = manager; + TaskID task_id = TaskID::from_random(); + std::vector node_manager_ids = {"abc", "def", "ghi"}; + for (auto &node_manager_id : node_manager_ids) { + auto data = std::make_shared(); + data->node_manager_id = node_manager_id; // Check that we added the correct object entries. 
- auto add_callback = [object_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, - const ObjectTableDataT &d) { - ASSERT_EQ(id, object_id); - ASSERT_EQ(data->manager, d.manager); + auto add_callback = [task_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, + const TaskReconstructionDataT &d) { + ASSERT_EQ(id, task_id); + ASSERT_EQ(data->node_manager_id, d.node_manager_id); }; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id, data, add_callback)); + RAY_CHECK_OK( + client->task_reconstruction_log().Append(job_id, task_id, data, add_callback)); } // Check that lookup returns the added object entries. - auto lookup_callback = [object_id, managers]( - gcs::AsyncGcsClient *client, const ObjectID &id, - const std::vector &data) { - ASSERT_EQ(id, object_id); + auto lookup_callback = [task_id, node_manager_ids]( + gcs::AsyncGcsClient *client, const UniqueID &id, + const std::vector &data) { + ASSERT_EQ(id, task_id); for (const auto &entry : data) { - ASSERT_EQ(entry.manager, managers[test->NumCallbacks()]); + ASSERT_EQ(entry.node_manager_id, node_manager_ids[test->NumCallbacks()]); test->IncrementNumCallbacks(); } - if (test->NumCallbacks() == managers.size()) { + if (test->NumCallbacks() == node_manager_ids.size()) { test->Stop(); } }; // Do a lookup at the object ID. - RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + RAY_CHECK_OK( + client->task_reconstruction_log().Lookup(job_id, task_id, lookup_callback)); // Run the event loop. The loop will only stop if the Lookup callback is // called (or an assertion failure). 
test->Start(); - ASSERT_EQ(test->NumCallbacks(), managers.size()); + ASSERT_EQ(test->NumCallbacks(), node_manager_ids.size()); } TEST_F(TestGcsWithAsio, TestLogLookup) { @@ -201,11 +203,11 @@ TEST_MACRO(TestGcsWithChainAsio, TestTableLookupFailure); void TestLogAppendAt(const JobID &job_id, std::shared_ptr client) { TaskID task_id = TaskID::from_random(); - std::vector managers = {"A", "B"}; + std::vector node_manager_ids = {"A", "B"}; std::vector> data_log; - for (const auto &manager : managers) { + for (const auto &node_manager_id : node_manager_ids) { auto data = std::make_shared(); - data->node_manager_id = manager; + data->node_manager_id = node_manager_id; data_log.push_back(data); } @@ -234,13 +236,14 @@ void TestLogAppendAt(const JobID &job_id, std::shared_ptr c job_id, task_id, data_log[1], /*done callback=*/nullptr, failure_callback, /*log_length=*/1)); - auto lookup_callback = [managers](gcs::AsyncGcsClient *client, const UniqueID &id, - const std::vector &data) { + auto lookup_callback = [node_manager_ids]( + gcs::AsyncGcsClient *client, const UniqueID &id, + const std::vector &data) { std::vector appended_managers; for (const auto &entry : data) { appended_managers.push_back(entry.node_manager_id); } - ASSERT_EQ(appended_managers, managers); + ASSERT_EQ(appended_managers, node_manager_ids); test->Stop(); }; RAY_CHECK_OK( @@ -256,14 +259,13 @@ TEST_F(TestGcsWithAsio, TestLogAppendAt) { TestLogAppendAt(job_id_, client_); } -void TestDeleteKeysFromLog(const JobID &job_id, - std::shared_ptr client, - std::vector> &data_vector) { - std::vector ids; - ObjectID object_id; - for (auto &data : data_vector) { - object_id = ObjectID::from_random(); - ids.push_back(object_id); +void TestSet(const JobID &job_id, std::shared_ptr client) { + // Add some entries to the set at an object ID. 
+ ObjectID object_id = ObjectID::from_random(); + std::vector managers = {"abc", "def", "ghi"}; + for (auto &manager : managers) { + auto data = std::make_shared(); + data->manager = manager; // Check that we added the correct object entries. auto add_callback = [object_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, const ObjectTableDataT &d) { @@ -271,32 +273,102 @@ void TestDeleteKeysFromLog(const JobID &job_id, ASSERT_EQ(data->manager, d.manager); test->IncrementNumCallbacks(); }; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id, data, add_callback)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id, data, add_callback)); } - for (const auto &object_id : ids) { - // Check that lookup returns the added object entries. - auto lookup_callback = [object_id, data_vector]( - gcs::AsyncGcsClient *client, const ObjectID &id, - const std::vector &data) { + + // Check that lookup returns the added object entries. + auto lookup_callback = [object_id, managers]( + gcs::AsyncGcsClient *client, const ObjectID &id, + const std::vector &data) { + ASSERT_EQ(id, object_id); + ASSERT_EQ(data.size(), managers.size()); + test->IncrementNumCallbacks(); + }; + + // Do a lookup at the object ID. + RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + + for (auto &manager : managers) { + auto data = std::make_shared(); + data->manager = manager; + // Check that we added the correct object entries. + auto remove_entry_callback = [object_id, data]( + gcs::AsyncGcsClient *client, const UniqueID &id, const ObjectTableDataT &d) { ASSERT_EQ(id, object_id); + ASSERT_EQ(data->manager, d.manager); + test->IncrementNumCallbacks(); + }; + RAY_CHECK_OK( + client->object_table().Remove(job_id, object_id, data, remove_entry_callback)); + } + + // Check that the entries are removed. 
+ auto lookup_callback2 = [object_id, managers]( + gcs::AsyncGcsClient *client, const ObjectID &id, + const std::vector &data) { + ASSERT_EQ(id, object_id); + ASSERT_EQ(data.size(), 0); + test->IncrementNumCallbacks(); + test->Stop(); + }; + + // Do a lookup at the object ID. + RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback2)); + // Run the event loop. The loop will only stop if the Lookup callback is + // called (or an assertion failure). + test->Start(); + ASSERT_EQ(test->NumCallbacks(), managers.size() * 2 + 2); +} + +TEST_F(TestGcsWithAsio, TestSet) { + test = this; + TestSet(job_id_, client_); +} + +void TestDeleteKeysFromLog( + const JobID &job_id, std::shared_ptr client, + std::vector> &data_vector) { + std::vector ids; + TaskID task_id; + for (auto &data : data_vector) { + task_id = TaskID::from_random(); + ids.push_back(task_id); + // Check that we added the correct object entries. + auto add_callback = [task_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, + const TaskReconstructionDataT &d) { + ASSERT_EQ(id, task_id); + ASSERT_EQ(data->node_manager_id, d.node_manager_id); + test->IncrementNumCallbacks(); + }; + RAY_CHECK_OK( + client->task_reconstruction_log().Append(job_id, task_id, data, add_callback)); + } + for (const auto &task_id : ids) { + // Check that lookup returns the added object entries. 
+ auto lookup_callback = [task_id, data_vector]( + gcs::AsyncGcsClient *client, const UniqueID &id, + const std::vector &data) { + ASSERT_EQ(id, task_id); ASSERT_EQ(data.size(), 1); test->IncrementNumCallbacks(); }; - RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + RAY_CHECK_OK( + client->task_reconstruction_log().Lookup(job_id, task_id, lookup_callback)); } if (ids.size() == 1) { - client->object_table().Delete(job_id, ids[0]); + client->task_reconstruction_log().Delete(job_id, ids[0]); } else { - client->object_table().Delete(job_id, ids); + client->task_reconstruction_log().Delete(job_id, ids); } - for (const auto &object_id : ids) { - auto lookup_callback = [object_id](gcs::AsyncGcsClient *client, const ObjectID &id, - const std::vector &data) { - ASSERT_EQ(id, object_id); + for (const auto &task_id : ids) { + auto lookup_callback = [task_id](gcs::AsyncGcsClient *client, const TaskID &id, + const std::vector &data) { + ASSERT_EQ(id, task_id); ASSERT_TRUE(data.size() == 0); test->IncrementNumCallbacks(); }; - RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + RAY_CHECK_OK( + client->task_reconstruction_log().Lookup(job_id, task_id, lookup_callback)); } } @@ -349,34 +421,80 @@ void TestDeleteKeysFromTable(const JobID &job_id, } } +void TestDeleteKeysFromSet(const JobID &job_id, + std::shared_ptr client, + std::vector> &data_vector) { + std::vector ids; + ObjectID object_id; + for (auto &data : data_vector) { + object_id = ObjectID::from_random(); + ids.push_back(object_id); + // Check that we added the correct object entries. 
+ auto add_callback = [object_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, + const ObjectTableDataT &d) { + ASSERT_EQ(id, object_id); + ASSERT_EQ(data->manager, d.manager); + test->IncrementNumCallbacks(); + }; + RAY_CHECK_OK(client->object_table().Add(job_id, object_id, data, add_callback)); + } + for (const auto &object_id : ids) { + // Check that lookup returns the added object entries. + auto lookup_callback = [object_id, data_vector]( + gcs::AsyncGcsClient *client, const ObjectID &id, + const std::vector &data) { + ASSERT_EQ(id, object_id); + ASSERT_EQ(data.size(), 1); + test->IncrementNumCallbacks(); + }; + RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + } + if (ids.size() == 1) { + client->object_table().Delete(job_id, ids[0]); + } else { + client->object_table().Delete(job_id, ids); + } + for (const auto &object_id : ids) { + auto lookup_callback = [object_id](gcs::AsyncGcsClient *client, const ObjectID &id, + const std::vector &data) { + ASSERT_EQ(id, object_id); + ASSERT_TRUE(data.size() == 0); + test->IncrementNumCallbacks(); + }; + RAY_CHECK_OK(client->object_table().Lookup(job_id, object_id, lookup_callback)); + } +} + // Test delete function for keys of Log or Table. void TestDeleteKeys(const JobID &job_id, std::shared_ptr client) { // Test delete function for keys of Log. - std::vector> object_vector; - auto AppendObjectData = [&object_vector](size_t add_count) { + std::vector> task_reconstruction_vector; + auto AppendTaskReconstructionData = [&task_reconstruction_vector](size_t add_count) { for (size_t i = 0; i < add_count; ++i) { - auto data = std::make_shared(); - data->manager = ObjectID::from_random().hex(); - object_vector.push_back(data); + auto data = std::make_shared(); + data->node_manager_id = ObjectID::from_random().hex(); + task_reconstruction_vector.push_back(data); } }; // Test one element case. 
- AppendObjectData(1); - ASSERT_EQ(object_vector.size(), 1); - TestDeleteKeysFromLog(job_id, client, object_vector); + AppendTaskReconstructionData(1); + ASSERT_EQ(task_reconstruction_vector.size(), 1); + TestDeleteKeysFromLog(job_id, client, task_reconstruction_vector); // Test the case for more than one elements and less than // maximum_gcs_deletion_batch_size. - AppendObjectData(RayConfig::instance().maximum_gcs_deletion_batch_size() / 2); - ASSERT_GT(object_vector.size(), 1); - ASSERT_LT(object_vector.size(), + AppendTaskReconstructionData(RayConfig::instance().maximum_gcs_deletion_batch_size() / + 2); + ASSERT_GT(task_reconstruction_vector.size(), 1); + ASSERT_LT(task_reconstruction_vector.size(), RayConfig::instance().maximum_gcs_deletion_batch_size()); - TestDeleteKeysFromLog(job_id, client, object_vector); + TestDeleteKeysFromLog(job_id, client, task_reconstruction_vector); // Test the case for more than maximum_gcs_deletion_batch_size. // The Delete function will split the data into two commands. - AppendObjectData(RayConfig::instance().maximum_gcs_deletion_batch_size() / 2); - ASSERT_GT(object_vector.size(), + AppendTaskReconstructionData(RayConfig::instance().maximum_gcs_deletion_batch_size() / + 2); + ASSERT_GT(task_reconstruction_vector.size(), RayConfig::instance().maximum_gcs_deletion_batch_size()); - TestDeleteKeysFromLog(job_id, client, object_vector); + TestDeleteKeysFromLog(job_id, client, task_reconstruction_vector); // Test delete function for keys of Table. std::vector> task_vector; @@ -403,6 +521,33 @@ void TestDeleteKeys(const JobID &job_id, std::shared_ptr cl test->Start(); ASSERT_GT(test->NumCallbacks(), 9 * RayConfig::instance().maximum_gcs_deletion_batch_size()); + + // Test delete function for keys of Set. 
+ std::vector> object_vector; + auto AppendObjectData = [&object_vector](size_t add_count) { + for (size_t i = 0; i < add_count; ++i) { + auto data = std::make_shared(); + data->manager = ObjectID::from_random().hex(); + object_vector.push_back(data); + } + }; + // Test one element case. + AppendObjectData(1); + ASSERT_EQ(object_vector.size(), 1); + TestDeleteKeysFromSet(job_id, client, object_vector); + // Test the case for more than one elements and less than + // maximum_gcs_deletion_batch_size. + AppendObjectData(RayConfig::instance().maximum_gcs_deletion_batch_size() / 2); + ASSERT_GT(object_vector.size(), 1); + ASSERT_LT(object_vector.size(), + RayConfig::instance().maximum_gcs_deletion_batch_size()); + TestDeleteKeysFromSet(job_id, client, object_vector); + // Test the case for more than maximum_gcs_deletion_batch_size. + // The Delete function will split the data into two commands. + AppendObjectData(RayConfig::instance().maximum_gcs_deletion_batch_size() / 2); + ASSERT_GT(object_vector.size(), + RayConfig::instance().maximum_gcs_deletion_batch_size()); + TestDeleteKeysFromSet(job_id, client, object_vector); } TEST_F(TestGcsWithAsio, TestDeleteKey) { @@ -451,22 +596,77 @@ void TaskLookupAfterUpdateFailure(gcs::AsyncGcsClient *client, const TaskID &id) void TestLogSubscribeAll(const JobID &job_id, std::shared_ptr client) { - std::vector managers = {"abc", "def", "ghi"}; + std::vector driver_ids; + for (int i = 0; i < 3; i++) { + driver_ids.emplace_back(DriverID::from_random()); + } + // Callback for a notification. + auto notification_callback = [driver_ids](gcs::AsyncGcsClient *client, + const UniqueID &id, + const std::vector data) { + ASSERT_EQ(id, driver_ids[test->NumCallbacks()]); + // Check that we get notifications in the same order as the writes. 
+ for (const auto &entry : data) { + ASSERT_EQ(entry.driver_id, driver_ids[test->NumCallbacks()].binary()); + test->IncrementNumCallbacks(); + } + if (test->NumCallbacks() == driver_ids.size()) { + test->Stop(); + } + }; + + // Callback for subscription success. We are guaranteed to receive + // notifications after this is called. + auto subscribe_callback = [driver_ids](gcs::AsyncGcsClient *client) { + // We have subscribed. Do the writes to the table. + for (size_t i = 0; i < driver_ids.size(); i++) { + RAY_CHECK_OK(client->driver_table().AppendDriverData(driver_ids[i], false)); + } + }; + + // Subscribe to all driver table notifications. Once we have successfully + // subscribed, we will append to the key several times and check that we get + // notified for each. + RAY_CHECK_OK(client->driver_table().Subscribe( + job_id, ClientID::nil(), notification_callback, subscribe_callback)); + + // Run the event loop. The loop will only stop if the registered subscription + // callback is called (or an assertion failure). + test->Start(); + // Check that we received one notification callback for each write. + ASSERT_EQ(test->NumCallbacks(), driver_ids.size()); +} + +TEST_F(TestGcsWithAsio, TestLogSubscribeAll) { + test = this; + TestLogSubscribeAll(job_id_, client_); +} + +void TestSetSubscribeAll(const JobID &job_id, + std::shared_ptr client) { std::vector object_ids; - for (size_t i = 0; i < managers.size(); i++) { - object_ids.push_back(ObjectID::from_random()); + for (int i = 0; i < 3; i++) { + object_ids.emplace_back(ObjectID::from_random()); } + std::vector managers = {"abc", "def", "ghi"}; + // Callback for a notification. 
auto notification_callback = [object_ids, managers]( gcs::AsyncGcsClient *client, const UniqueID &id, + const GcsTableNotificationMode notification_mode, const std::vector data) { - ASSERT_EQ(id, object_ids[test->NumCallbacks()]); + if (test->NumCallbacks() < 3 * 3) { + ASSERT_EQ(notification_mode, GcsTableNotificationMode::APPEND_OR_ADD); + } else { + ASSERT_EQ(notification_mode, GcsTableNotificationMode::REMOVE); + } + ASSERT_EQ(id, object_ids[test->NumCallbacks() / 3 % 3]); // Check that we get notifications in the same order as the writes. for (const auto &entry : data) { - ASSERT_EQ(entry.manager, managers[test->NumCallbacks()]); + ASSERT_EQ(entry.manager, managers[test->NumCallbacks() % 3]); test->IncrementNumCallbacks(); } - if (test->NumCallbacks() == managers.size()) { + if (test->NumCallbacks() == object_ids.size() * 3 * 2) { test->Stop(); } }; @@ -476,13 +676,26 @@ void TestLogSubscribeAll(const JobID &job_id, auto subscribe_callback = [job_id, object_ids, managers](gcs::AsyncGcsClient *client) { // We have subscribed. Do the writes to the table. for (size_t i = 0; i < object_ids.size(); i++) { - auto data = std::make_shared(); - data->manager = managers[i]; - RAY_CHECK_OK(client->object_table().Append(job_id, object_ids[i], data, nullptr)); + for (size_t j = 0; j < managers.size(); j++) { + auto data = std::make_shared(); + data->manager = managers[j]; + for (int k = 0; k < 3; k++) { + // Add the same entry several times. + // Expect no notification if the entry already exists. + RAY_CHECK_OK(client->object_table().Add(job_id, object_ids[i], data, nullptr)); + } + } + } + for (size_t i = 0; i < object_ids.size(); i++) { + for (size_t j = 0; j < managers.size(); j++) { + auto data = std::make_shared(); + data->manager = managers[j]; + RAY_CHECK_OK(client->object_table().Remove(job_id, object_ids[i], data, nullptr)); + } } }; - // Subscribe to all object table notifications. Once we have successfully + // Subscribe to all driver table notifications. 
Once we have successfully // subscribed, we will append to the key several times and check that we get // notified for each. RAY_CHECK_OK(client->object_table().Subscribe( @@ -492,12 +705,12 @@ void TestLogSubscribeAll(const JobID &job_id, // callback is called (or an assertion failure). test->Start(); // Check that we received one notification callback for each write. - ASSERT_EQ(test->NumCallbacks(), managers.size()); + ASSERT_EQ(test->NumCallbacks(), object_ids.size() * 3 * 2); } -TEST_F(TestGcsWithAsio, TestLogSubscribeAll) { +TEST_F(TestGcsWithAsio, TestSetSubscribeAll) { test = this; - TestLogSubscribeAll(job_id_, client_); + TestSetSubscribeAll(job_id_, client_); } void TestTableSubscribeId(const JobID &job_id, @@ -579,24 +792,100 @@ TEST_MACRO(TestGcsWithChainAsio, TestTableSubscribeId); void TestLogSubscribeId(const JobID &job_id, std::shared_ptr client) { // Add a log entry. + DriverID driver_id1 = DriverID::from_random(); + std::vector driver_ids1 = {"abc", "def", "ghi"}; + auto data1 = std::make_shared(); + data1->driver_id = driver_ids1[0]; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id1, data1, nullptr)); + + // Add a log entry at a second key. + DriverID driver_id2 = DriverID::from_random(); + std::vector driver_ids2 = {"jkl", "mno", "pqr"}; + auto data2 = std::make_shared(); + data2->driver_id = driver_ids2[0]; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id2, data2, nullptr)); + + // The callback for a notification from the table. This should only be + // received for keys that we requested notifications for. + auto notification_callback = [driver_id2, driver_ids2]( + gcs::AsyncGcsClient *client, const UniqueID &id, + const std::vector &data) { + // Check that we only get notifications for the requested key. + ASSERT_EQ(id, driver_id2); + // Check that we get notifications in the same order as the writes. 
+ for (const auto &entry : data) { + ASSERT_EQ(entry.driver_id, driver_ids2[test->NumCallbacks()]); + test->IncrementNumCallbacks(); + } + if (test->NumCallbacks() == driver_ids2.size()) { + test->Stop(); + } + }; + + // The callback for subscription success. Once we've subscribed, request + // notifications for only one of the keys, then write to both keys. + auto subscribe_callback = [job_id, driver_id1, driver_id2, driver_ids1, + driver_ids2](gcs::AsyncGcsClient *client) { + // Request notifications for one of the keys. + RAY_CHECK_OK(client->driver_table().RequestNotifications( + job_id, driver_id2, client->client_table().GetLocalClientId())); + // Write both keys. We should only receive notifications for the key that + // we requested them for. + auto remaining = std::vector(++driver_ids1.begin(), driver_ids1.end()); + for (const auto &driver_id : remaining) { + auto data = std::make_shared(); + data->driver_id = driver_id; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id1, data, nullptr)); + } + remaining = std::vector(++driver_ids2.begin(), driver_ids2.end()); + for (const auto &driver_id : remaining) { + auto data = std::make_shared(); + data->driver_id = driver_id; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id2, data, nullptr)); + } + }; + + // Subscribe to notifications for this client. This allows us to request and + // receive notifications for specific keys. + RAY_CHECK_OK( + client->driver_table().Subscribe(job_id, client->client_table().GetLocalClientId(), + notification_callback, subscribe_callback)); + // Run the event loop. The loop will only stop if the registered subscription + // callback is called for the requested key. + test->Start(); + // Check that we received one notification callback for each write to the + // requested key. 
+ ASSERT_EQ(test->NumCallbacks(), driver_ids2.size()); +} + +TEST_F(TestGcsWithAsio, TestLogSubscribeId) { + test = this; + TestLogSubscribeId(job_id_, client_); +} + +void TestSetSubscribeId(const JobID &job_id, + std::shared_ptr client) { + // Add a set entry. ObjectID object_id1 = ObjectID::from_random(); std::vector managers1 = {"abc", "def", "ghi"}; auto data1 = std::make_shared(); data1->manager = managers1[0]; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id1, data1, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id1, data1, nullptr)); - // Add a log entry at a second key. + // Add a set entry at a second key. ObjectID object_id2 = ObjectID::from_random(); std::vector managers2 = {"jkl", "mno", "pqr"}; auto data2 = std::make_shared(); data2->manager = managers2[0]; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id2, data2, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id2, data2, nullptr)); // The callback for a notification from the table. This should only be // received for keys that we requested notifications for. auto notification_callback = [object_id2, managers2]( gcs::AsyncGcsClient *client, const ObjectID &id, + const GcsTableNotificationMode notification_mode, const std::vector &data) { + ASSERT_EQ(notification_mode, GcsTableNotificationMode::APPEND_OR_ADD); // Check that we only get notifications for the requested key. ASSERT_EQ(id, object_id2); // Check that we get notifications in the same order as the writes. 
@@ -622,13 +911,13 @@ void TestLogSubscribeId(const JobID &job_id, for (const auto &manager : remaining) { auto data = std::make_shared(); data->manager = manager; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id1, data, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id1, data, nullptr)); } remaining = std::vector(++managers2.begin(), managers2.end()); for (const auto &manager : remaining) { auto data = std::make_shared(); data->manager = manager; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id2, data, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id2, data, nullptr)); } }; @@ -645,9 +934,9 @@ void TestLogSubscribeId(const JobID &job_id, ASSERT_EQ(test->NumCallbacks(), managers2.size()); } -TEST_F(TestGcsWithAsio, TestLogSubscribeId) { +TEST_F(TestGcsWithAsio, TestSetSubscribeId) { test = this; - TestLogSubscribeId(job_id_, client_); + TestSetSubscribeId(job_id_, client_); } void TestTableSubscribeCancel(const JobID &job_id, @@ -727,28 +1016,110 @@ TEST_MACRO(TestGcsWithChainAsio, TestTableSubscribeCancel); void TestLogSubscribeCancel(const JobID &job_id, std::shared_ptr client) { // Add a log entry. + DriverID driver_id = DriverID::from_random(); + std::vector driver_ids = {"jkl", "mno", "pqr"}; + auto data = std::make_shared(); + data->driver_id = driver_ids[0]; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id, data, nullptr)); + + // The callback for a notification from the object table. This should only be + // received for the object that we requested notifications for. + auto notification_callback = [driver_id, driver_ids]( + gcs::AsyncGcsClient *client, const UniqueID &id, + const std::vector &data) { + ASSERT_EQ(id, driver_id); + // Check that we get a duplicate notification for the first write. We get a + // duplicate notification because the log is append-only and notifications + // are canceled after the first write, then requested again. 
+ auto driver_ids_copy = driver_ids; + driver_ids_copy.insert(driver_ids_copy.begin(), driver_ids_copy.front()); + for (const auto &entry : data) { + ASSERT_EQ(entry.driver_id, driver_ids_copy[test->NumCallbacks()]); + test->IncrementNumCallbacks(); + } + if (test->NumCallbacks() == driver_ids_copy.size()) { + test->Stop(); + } + }; + + // The callback for a notification from the table. This should only be + // received for keys that we requested notifications for. + auto subscribe_callback = [job_id, driver_id, driver_ids](gcs::AsyncGcsClient *client) { + // Request notifications, then cancel immediately. We should receive a + // notification for the current value at the key. + RAY_CHECK_OK(client->driver_table().RequestNotifications( + job_id, driver_id, client->client_table().GetLocalClientId())); + RAY_CHECK_OK(client->driver_table().CancelNotifications( + job_id, driver_id, client->client_table().GetLocalClientId())); + // Append to the key. Since we canceled notifications, we should not + // receive a notification for these writes. + auto remaining = std::vector(++driver_ids.begin(), driver_ids.end()); + for (const auto &remaining_driver_id : remaining) { + auto data = std::make_shared(); + data->driver_id = remaining_driver_id; + RAY_CHECK_OK(client->driver_table().Append(job_id, driver_id, data, nullptr)); + } + // Request notifications again. We should receive a notification for the + // current values at the key. + RAY_CHECK_OK(client->driver_table().RequestNotifications( + job_id, driver_id, client->client_table().GetLocalClientId())); + }; + + // Subscribe to notifications for this client. This allows us to request and + // receive notifications for specific keys. + RAY_CHECK_OK( + client->driver_table().Subscribe(job_id, client->client_table().GetLocalClientId(), + notification_callback, subscribe_callback)); + // Run the event loop. The loop will only stop if the registered subscription + // callback is called for the requested key. 
+ test->Start(); + // Check that we received a notification callback for the first append to the + // key, then a notification for all of the appends, because we cancel + // notifications in between. + ASSERT_EQ(test->NumCallbacks(), driver_ids.size() + 1); +} + +TEST_F(TestGcsWithAsio, TestLogSubscribeCancel) { + test = this; + TestLogSubscribeCancel(job_id_, client_); +} + +void TestSetSubscribeCancel(const JobID &job_id, + std::shared_ptr client) { + // Add a set entry. ObjectID object_id = ObjectID::from_random(); std::vector managers = {"jkl", "mno", "pqr"}; auto data = std::make_shared(); data->manager = managers[0]; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id, data, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id, data, nullptr)); // The callback for a notification from the object table. This should only be // received for the object that we requested notifications for. auto notification_callback = [object_id, managers]( gcs::AsyncGcsClient *client, const ObjectID &id, + const GcsTableNotificationMode notification_mode, const std::vector &data) { + ASSERT_EQ(notification_mode, GcsTableNotificationMode::APPEND_OR_ADD); ASSERT_EQ(id, object_id); // Check that we get a duplicate notification for the first write. We get a - // duplicate notification because the log is append-only and notifications + // duplicate notification because notifications // are canceled after the first write, then requested again. 
- auto managers_copy = managers; - managers_copy.insert(managers_copy.begin(), managers_copy.front()); - for (const auto &entry : data) { - ASSERT_EQ(entry.manager, managers_copy[test->NumCallbacks()]); + if (data.size() == 1) { + // first notification + ASSERT_EQ(data[0].manager, managers[0]); test->IncrementNumCallbacks(); + } else { + // second notification + ASSERT_EQ(data.size(), managers.size()); + std::unordered_set managers_set(managers.begin(), managers.end()); + std::unordered_set data_managers_set; + for (const auto &entry : data) { + data_managers_set.insert(entry.manager); + test->IncrementNumCallbacks(); + } + ASSERT_EQ(managers_set, data_managers_set); } - if (test->NumCallbacks() == managers_copy.size()) { + if (test->NumCallbacks() == managers.size() + 1) { test->Stop(); } }; @@ -762,13 +1133,13 @@ void TestLogSubscribeCancel(const JobID &job_id, job_id, object_id, client->client_table().GetLocalClientId())); RAY_CHECK_OK(client->object_table().CancelNotifications( job_id, object_id, client->client_table().GetLocalClientId())); - // Append to the key. Since we canceled notifications, we should not + // Add to the key. Since we canceled notifications, we should not // receive a notification for these writes. auto remaining = std::vector(++managers.begin(), managers.end()); for (const auto &manager : remaining) { auto data = std::make_shared(); data->manager = manager; - RAY_CHECK_OK(client->object_table().Append(job_id, object_id, data, nullptr)); + RAY_CHECK_OK(client->object_table().Add(job_id, object_id, data, nullptr)); } // Request notifications again. We should receive a notification for the // current values at the key. 
@@ -790,9 +1161,9 @@ void TestLogSubscribeCancel(const JobID &job_id, ASSERT_EQ(test->NumCallbacks(), managers.size() + 1); } -TEST_F(TestGcsWithAsio, TestLogSubscribeCancel) { +TEST_F(TestGcsWithAsio, TestSetSubscribeCancel) { test = this; - TestLogSubscribeCancel(job_id_, client_); + TestSetSubscribeCancel(job_id_, client_); } void ClientTableNotification(gcs::AsyncGcsClient *client, const ClientID &client_id, @@ -814,7 +1185,7 @@ void TestClientTableConnect(const JobID &job_id, // Register callbacks for when a client gets added and removed. The latter // event will stop the event loop. client->client_table().RegisterClientAddedCallback( - [](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) { + [](gcs::AsyncGcsClient *client, const ClientID &id, const ClientTableDataT &data) { ClientTableNotification(client, id, data, true); test->Stop(); }); @@ -839,14 +1210,14 @@ void TestClientTableDisconnect(const JobID &job_id, // Register callbacks for when a client gets added and removed. The latter // event will stop the event loop. client->client_table().RegisterClientAddedCallback( - [](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) { + [](gcs::AsyncGcsClient *client, const ClientID &id, const ClientTableDataT &data) { ClientTableNotification(client, id, data, /*is_insertion=*/true); // Disconnect from the client table. We should receive a notification // for the removal of our own entry. RAY_CHECK_OK(client->client_table().Disconnect()); }); client->client_table().RegisterClientRemovedCallback( - [](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) { + [](gcs::AsyncGcsClient *client, const ClientID &id, const ClientTableDataT &data) { ClientTableNotification(client, id, data, /*is_insertion=*/false); test->Stop(); }); @@ -870,11 +1241,11 @@ void TestClientTableImmediateDisconnect(const JobID &job_id, // Register callbacks for when a client gets added and removed. 
The latter // event will stop the event loop. client->client_table().RegisterClientAddedCallback( - [](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) { + [](gcs::AsyncGcsClient *client, const ClientID &id, const ClientTableDataT &data) { ClientTableNotification(client, id, data, true); }); client->client_table().RegisterClientRemovedCallback( - [](gcs::AsyncGcsClient *client, const UniqueID &id, const ClientTableDataT &data) { + [](gcs::AsyncGcsClient *client, const ClientID &id, const ClientTableDataT &data) { ClientTableNotification(client, id, data, false); test->Stop(); }); diff --git a/src/ray/gcs/format/gcs.fbs b/src/ray/gcs/format/gcs.fbs index c826d97a66de..5595c365731c 100644 --- a/src/ray/gcs/format/gcs.fbs +++ b/src/ray/gcs/format/gcs.fbs @@ -108,7 +108,13 @@ table ResourcePair { value: double; } +enum GcsTableNotificationMode:int { + APPEND_OR_ADD = 0, + REMOVE, +} + table GcsTableEntry { + notification_mode: GcsTableNotificationMode; id: string; entries: [string]; } @@ -124,8 +130,6 @@ table ObjectTableData { object_size: long; // The node manager ID that this object appeared on or was evicted by. manager: string; - // Whether this entry is an addition or a deletion. - is_eviction: bool; } table TaskReconstructionData { diff --git a/src/ray/gcs/redis_module/ray_redis_module.cc b/src/ray/gcs/redis_module/ray_redis_module.cc index ee1e00f85f03..f1fa99a0f8b3 100644 --- a/src/ray/gcs/redis_module/ray_redis_module.cc +++ b/src/ray/gcs/redis_module/ray_redis_module.cc @@ -181,7 +181,7 @@ flatbuffers::Offset RedisStringToFlatbuf( return fbb.CreateString(redis_string_str, redis_string_size); } -/// Publish a notification for a new entry at a key. This publishes a +/// Publish a notification for an entry update at a key. This publishes a /// notification to all subscribers of the table, as well as every client that /// has requested notifications for this key. 
/// @@ -189,15 +189,18 @@ flatbuffers::Offset RedisStringToFlatbuf( /// this key should be published to. When publishing to a specific /// client, the channel name should be :. /// \param id The ID of the key that the notification is about. -/// \param data The data to publish. +/// \param mode the update mode, such as append or remove. +/// \param data The appended/removed data. /// \return OK if there is no error during a publish. -int PublishTableAdd(RedisModuleCtx *ctx, RedisModuleString *pubsub_channel_str, - RedisModuleString *id, RedisModuleString *data) { +int PublishTableUpdate(RedisModuleCtx *ctx, RedisModuleString *pubsub_channel_str, + RedisModuleString *id, GcsTableNotificationMode notification_mode, + RedisModuleString *data) { // Serialize the notification to send. flatbuffers::FlatBufferBuilder fbb; auto data_flatbuf = RedisStringToFlatbuf(fbb, data); - auto message = CreateGcsTableEntry(fbb, RedisStringToFlatbuf(fbb, id), - fbb.CreateVector(&data_flatbuf, 1)); + auto message = + CreateGcsTableEntry(fbb, notification_mode, RedisStringToFlatbuf(fbb, id), + fbb.CreateVector(&data_flatbuf, 1)); fbb.Finish(message); // Write the data back to any subscribers that are listening to all table @@ -265,7 +268,8 @@ int TableAdd_DoPublish(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) if (pubsub_channel != TablePubsub::NO_PUBLISH) { // All other pubsub channels write the data back directly onto the channel. - return PublishTableAdd(ctx, pubsub_channel_str, id, data); + return PublishTableUpdate(ctx, pubsub_channel_str, id, + GcsTableNotificationMode::APPEND_OR_ADD, data); } else { return RedisModule_ReplyWithSimpleString(ctx, "OK"); } @@ -364,7 +368,8 @@ int TableAppend_DoPublish(RedisModuleCtx *ctx, RedisModuleString **argv, int /*a if (pubsub_channel != TablePubsub::NO_PUBLISH) { // All other pubsub channels write the data back directly onto the // channel. 
- return PublishTableAdd(ctx, pubsub_channel_str, id, data); + return PublishTableUpdate(ctx, pubsub_channel_str, id, + GcsTableNotificationMode::APPEND_OR_ADD, data); } else { return RedisModule_ReplyWithSimpleString(ctx, "OK"); } @@ -407,6 +412,112 @@ int ChainTableAppend_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, } #endif +int Set_DoPublish(RedisModuleCtx *ctx, RedisModuleString **argv, bool is_add) { + RedisModuleString *pubsub_channel_str = argv[2]; + RedisModuleString *id = argv[3]; + RedisModuleString *data = argv[4]; + // Publish a message on the requested pubsub channel if necessary. + TablePubsub pubsub_channel; + REPLY_AND_RETURN_IF_NOT_OK(ParseTablePubsub(&pubsub_channel, pubsub_channel_str)); + if (pubsub_channel != TablePubsub::NO_PUBLISH) { + // All other pubsub channels write the data back directly onto the + // channel. + return PublishTableUpdate(ctx, pubsub_channel_str, id, + is_add ? GcsTableNotificationMode::APPEND_OR_ADD + : GcsTableNotificationMode::REMOVE, + data); + } else { + return RedisModule_ReplyWithSimpleString(ctx, "OK"); + } +} + +int Set_DoWrite(RedisModuleCtx *ctx, RedisModuleString **argv, int argc, bool is_add, + bool *changed) { + if (argc != 5) { + return RedisModule_WrongArity(ctx); + } + + RedisModuleString *prefix_str = argv[1]; + RedisModuleString *id = argv[3]; + RedisModuleString *data = argv[4]; + + RedisModuleString *key_string = PrefixedKeyString(ctx, prefix_str, id); + // TODO(kfstorm): According to https://redis.io/topics/modules-intro, + // set type API is not available yet. We can change RedisModule_Call to + // set type API later. + RedisModuleCallReply *reply = + RedisModule_Call(ctx, is_add ? "SADD" : "SREM", "ss", key_string, data); + if (RedisModule_CallReplyType(reply) != REDISMODULE_REPLY_ERROR) { + *changed = RedisModule_CallReplyInteger(reply) > 0; + if (!is_add && *changed) { + // try to delete the empty set. 
+ RedisModuleKey *key; + REPLY_AND_RETURN_IF_NOT_OK( + OpenPrefixedKey(&key, ctx, prefix_str, id, REDISMODULE_WRITE)); + auto size = RedisModule_ValueLength(key); + if (size == 0) { + REPLY_AND_RETURN_IF_FALSE(RedisModule_DeleteKey(key) == REDISMODULE_OK, + "ERR Failed to delete empty set."); + } + } + return REDISMODULE_OK; + } else { + // the SADD/SREM command failed + RedisModule_ReplyWithCallReply(ctx, reply); + return REDISMODULE_ERR; + } +} + +/// Add an entry to the set stored at a key. Publishes a notification about +/// the update to all subscribers, if a pubsub channel is provided. +/// +/// This is called from a client with the command: +// +/// RAY.SET_ADD +/// +/// \param table_prefix The prefix string for keys in this set. +/// \param pubsub_channel The pubsub channel name that notifications for +/// this key should be published to. When publishing to a specific +/// client, the channel name should be :. +/// \param id The ID of the key to add to. +/// \param data The data to add to the key. +/// \return OK if the add succeeds, or an error message string if the add +/// fails. +int SetAdd_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + bool changed; + if (Set_DoWrite(ctx, argv, argc, /*is_add=*/true, &changed) != REDISMODULE_OK) { + return REDISMODULE_ERR; + } + if (changed) { + return Set_DoPublish(ctx, argv, /*is_add=*/true); + } + return REDISMODULE_OK; +} + +/// Remove an entry from the set stored at a key. Publishes a notification about +/// the update to all subscribers, if a pubsub channel is provided. +/// +/// This is called from a client with the command: +// +/// RAY.SET_REMOVE +/// +/// \param table_prefix The prefix string for keys in this table. +/// \param pubsub_channel The pubsub channel name that notifications for +/// this key should be published to. When publishing to a specific +/// client, the channel name should be :. +/// \param id The ID of the key to remove from. 
+/// \param data The data to remove from the key. +/// \return OK if the remove succeeds, or an error message string if the remove +/// fails. +int SetRemove_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) { + bool changed; + if (Set_DoWrite(ctx, argv, argc, /*is_add=*/false, &changed) != REDISMODULE_OK) { + return REDISMODULE_ERR; + } + REPLY_AND_RETURN_IF_FALSE(changed, "ERR The entry to remove doesn't exist."); + return Set_DoPublish(ctx, argv, /*is_add=*/false); +} + /// A helper function to create and finish a GcsTableEntry, based on the /// current value or values at the given key. /// @@ -428,11 +539,13 @@ Status TableEntryToFlatbuf(RedisModuleCtx *ctx, RedisModuleKey *table_key, size_t data_len = 0; char *data_buf = RedisModule_StringDMA(table_key, &data_len, REDISMODULE_READ); auto data = fbb.CreateString(data_buf, data_len); - auto message = CreateGcsTableEntry(fbb, RedisStringToFlatbuf(fbb, entry_id), + auto message = CreateGcsTableEntry(fbb, GcsTableNotificationMode::APPEND_OR_ADD, + RedisStringToFlatbuf(fbb, entry_id), fbb.CreateVector(&data, 1)); fbb.Finish(message); } break; - case REDISMODULE_KEYTYPE_LIST: { + case REDISMODULE_KEYTYPE_LIST: + case REDISMODULE_KEYTYPE_SET: { RedisModule_CloseKey(table_key); // Close the key before executing the command. NOTE(swang): According to // https://github.com/RedisLabs/RedisModulesSDK/blob/master/API.md, "While @@ -440,10 +553,17 @@ Status TableEntryToFlatbuf(RedisModuleCtx *ctx, RedisModuleKey *table_key, RedisModuleString *table_key_str = PrefixedKeyString(ctx, prefix_str, entry_id); // TODO(swang): This could potentially be replaced with the native redis // server list iterator, once it is implemented for redis modules. 
- RedisModuleCallReply *reply = - RedisModule_Call(ctx, "LRANGE", "sll", table_key_str, 0, -1); + RedisModuleCallReply *reply = nullptr; + switch (key_type) { + case REDISMODULE_KEYTYPE_LIST: + reply = RedisModule_Call(ctx, "LRANGE", "sll", table_key_str, 0, -1); + break; + case REDISMODULE_KEYTYPE_SET: + reply = RedisModule_Call(ctx, "SMEMBERS", "s", table_key_str); + break; + } // Build the flatbuffer from the set of log entries. - if (RedisModule_CallReplyType(reply) != REDISMODULE_REPLY_ARRAY) { + if (reply == nullptr || RedisModule_CallReplyType(reply) != REDISMODULE_REPLY_ARRAY) { return Status::RedisError("Empty list or wrong type"); } std::vector> data; @@ -453,13 +573,14 @@ Status TableEntryToFlatbuf(RedisModuleCtx *ctx, RedisModuleKey *table_key, const char *element_str = RedisModule_CallReplyStringPtr(element, &len); data.push_back(fbb.CreateString(element_str, len)); } - auto message = CreateGcsTableEntry(fbb, RedisStringToFlatbuf(fbb, entry_id), - fbb.CreateVector(data)); + auto message = + CreateGcsTableEntry(fbb, GcsTableNotificationMode::APPEND_OR_ADD, + RedisStringToFlatbuf(fbb, entry_id), fbb.CreateVector(data)); fbb.Finish(message); } break; case REDISMODULE_KEYTYPE_EMPTY: { auto message = CreateGcsTableEntry( - fbb, RedisStringToFlatbuf(fbb, entry_id), + fbb, GcsTableNotificationMode::APPEND_OR_ADD, RedisStringToFlatbuf(fbb, entry_id), fbb.CreateVector(std::vector>())); fbb.Finish(message); } break; @@ -752,6 +873,8 @@ int DebugString_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int // Wrap all Redis commands with Redis' auto memory management. 
AUTO_MEMORY(TableAdd_RedisCommand); AUTO_MEMORY(TableAppend_RedisCommand); +AUTO_MEMORY(SetAdd_RedisCommand); +AUTO_MEMORY(SetRemove_RedisCommand); AUTO_MEMORY(TableLookup_RedisCommand); AUTO_MEMORY(TableRequestNotifications_RedisCommand); AUTO_MEMORY(TableDelete_RedisCommand); @@ -781,7 +904,17 @@ int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) } if (RedisModule_CreateCommand(ctx, "ray.table_append", TableAppend_RedisCommand, - "write", 0, 0, 0) == REDISMODULE_ERR) { + "write pubsub", 0, 0, 0) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + + if (RedisModule_CreateCommand(ctx, "ray.set_add", SetAdd_RedisCommand, "write pubsub", + 0, 0, 0) == REDISMODULE_ERR) { + return REDISMODULE_ERR; + } + + if (RedisModule_CreateCommand(ctx, "ray.set_remove", SetRemove_RedisCommand, + "write pubsub", 0, 0, 0) == REDISMODULE_ERR) { return REDISMODULE_ERR; } diff --git a/src/ray/gcs/tables.cc b/src/ray/gcs/tables.cc index 87a72258ba2a..9b41c9460b3e 100644 --- a/src/ray/gcs/tables.cc +++ b/src/ray/gcs/tables.cc @@ -91,7 +91,7 @@ Status Log::Lookup(const JobID &job_id, const ID &id, const Callback & std::vector results; if (!data.empty()) { auto root = flatbuffers::GetRoot(data.data()); - RAY_CHECK(from_flatbuf(*root->id()) == id); + RAY_CHECK(from_flatbuf(*root->id()) == id); for (size_t i = 0; i < root->entries()->size(); i++) { DataT result; auto data_root = flatbuffers::GetRoot(root->entries()->Get(i)->data()); @@ -112,6 +112,19 @@ template Status Log::Subscribe(const JobID &job_id, const ClientID &client_id, const Callback &subscribe, const SubscriptionCallback &done) { + auto subscribe_wrapper = [subscribe](AsyncGcsClient *client, const ID &id, + const GcsTableNotificationMode notification_mode, + const std::vector &data) { + RAY_CHECK(notification_mode != GcsTableNotificationMode::REMOVE); + subscribe(client, id, data); + }; + return Subscribe(job_id, client_id, subscribe_wrapper, done); +} + +template +Status Log::Subscribe(const JobID 
&job_id, const ClientID &client_id, + const NotificationCallback &subscribe, + const SubscriptionCallback &done) { RAY_CHECK(subscribe_callback_index_ == -1) << "Client called Subscribe twice on the same table"; auto callback = [this, subscribe, done](const std::string &data) { @@ -128,7 +141,7 @@ Status Log::Subscribe(const JobID &job_id, const ClientID &client_id, auto root = flatbuffers::GetRoot(data.data()); ID id; if (root->id()->size() > 0) { - id = from_flatbuf(*root->id()); + id = from_flatbuf(*root->id()); } std::vector results; for (size_t i = 0; i < root->entries()->size(); i++) { @@ -137,7 +150,7 @@ Status Log::Subscribe(const JobID &job_id, const ClientID &client_id, data_root->UnPackTo(&result); results.emplace_back(std::move(result)); } - subscribe(client_, id, results); + subscribe(client_, id, root->notification_mode(), results); } } // We do not delete the callback after calling it since there may be @@ -274,18 +287,62 @@ std::string Table::DebugString() const { return result.str(); } -Status ErrorTable::PushErrorToDriver(const JobID &job_id, const std::string &type, +template +Status Set::Add(const JobID &job_id, const ID &id, + std::shared_ptr &dataT, const WriteCallback &done) { + num_adds_++; + auto callback = [this, id, dataT, done](const std::string &data) { + if (done != nullptr) { + (done)(client_, id, *dataT); + } + return true; + }; + flatbuffers::FlatBufferBuilder fbb; + fbb.ForceDefaults(true); + fbb.Finish(Data::Pack(fbb, dataT.get())); + return GetRedisContext(id)->RunAsync("RAY.SET_ADD", id, fbb.GetBufferPointer(), + fbb.GetSize(), prefix_, pubsub_channel_, + std::move(callback)); +} + +template +Status Set::Remove(const JobID &job_id, const ID &id, + std::shared_ptr &dataT, const WriteCallback &done) { + num_removes_++; + auto callback = [this, id, dataT, done](const std::string &data) { + if (done != nullptr) { + (done)(client_, id, *dataT); + } + return true; + }; + flatbuffers::FlatBufferBuilder fbb; + fbb.ForceDefaults(true); + 
fbb.Finish(Data::Pack(fbb, dataT.get())); + return GetRedisContext(id)->RunAsync("RAY.SET_REMOVE", id, fbb.GetBufferPointer(), + fbb.GetSize(), prefix_, pubsub_channel_, + std::move(callback)); +} + +template +std::string Set::DebugString() const { + std::stringstream result; + result << "num lookups: " << num_lookups_ << ", num adds: " << num_adds_ + << ", num removes: " << num_removes_; + return result.str(); +} + +Status ErrorTable::PushErrorToDriver(const DriverID &driver_id, const std::string &type, const std::string &error_message, double timestamp) { auto data = std::make_shared(); - data->job_id = job_id.binary(); + data->job_id = driver_id.binary(); data->type = type; data->error_message = error_message; data->timestamp = timestamp; - return Append(job_id, job_id, data, /*done_callback=*/nullptr); + return Append(JobID(driver_id), driver_id, data, /*done_callback=*/nullptr); } std::string ErrorTable::DebugString() const { - return Log::DebugString(); + return Log::DebugString(); } Status ProfileTable::AddProfileEventBatch(const ProfileTableData &profile_events) { @@ -302,11 +359,11 @@ std::string ProfileTable::DebugString() const { return Log::DebugString(); } -Status DriverTable::AppendDriverData(const JobID &driver_id, bool is_dead) { +Status DriverTable::AppendDriverData(const DriverID &driver_id, bool is_dead) { auto data = std::make_shared(); data->driver_id = driver_id.binary(); data->is_dead = is_dead; - return Append(driver_id, driver_id, data, /*done_callback=*/nullptr); + return Append(JobID(driver_id), driver_id, data, /*done_callback=*/nullptr); } void ClientTable::RegisterClientAddedCallback(const ClientTableCallback &callback) { @@ -492,7 +549,7 @@ Status ClientTable::Lookup(const Callback &lookup) { std::string ClientTable::DebugString() const { std::stringstream result; - result << Log::DebugString(); + result << Log::DebugString(); result << ", cache size: " << client_cache_.size() << ", num removed: " << removed_clients_.size(); return 
result.str(); @@ -500,7 +557,7 @@ std::string ClientTable::DebugString() const { Status ActorCheckpointIdTable::AddCheckpointId(const JobID &job_id, const ActorID &actor_id, - const UniqueID &checkpoint_id) { + const ActorCheckpointID &checkpoint_id) { auto lookup_callback = [this, checkpoint_id, job_id, actor_id]( ray::gcs::AsyncGcsClient *client, const UniqueID &id, const ActorCheckpointIdDataT &data) { @@ -512,7 +569,7 @@ Status ActorCheckpointIdTable::AddCheckpointId(const JobID &job_id, while (copy->timestamps.size() > num_to_keep) { // Delete the checkpoint from actor checkpoint table. const auto &checkpoint_id = - UniqueID::from_binary(copy->checkpoint_ids.substr(0, kUniqueIDSize)); + ActorCheckpointID::from_binary(copy->checkpoint_ids.substr(0, kUniqueIDSize)); RAY_LOG(DEBUG) << "Deleting checkpoint " << checkpoint_id << " for actor " << actor_id; copy->timestamps.erase(copy->timestamps.begin()); @@ -534,6 +591,7 @@ Status ActorCheckpointIdTable::AddCheckpointId(const JobID &job_id, } template class Log; +template class Set; template class Log; template class Table; template class Table; @@ -542,9 +600,9 @@ template class Log; template class Table; template class Table; template class Table; -template class Log; -template class Log; -template class Log; +template class Log; +template class Log; +template class Log; template class Log; template class Table; template class Table; diff --git a/src/ray/gcs/tables.h b/src/ray/gcs/tables.h index 71e1c39d6da7..54f7a68da68d 100644 --- a/src/ray/gcs/tables.h +++ b/src/ray/gcs/tables.h @@ -67,8 +67,6 @@ class LogInterface { /// pubsub_channel_ member if pubsub is required. /// /// Example tables backed by Log: -/// ObjectTable: Stores a log of which clients have added or evicted an -/// object. /// ClientTable: Stores a log of which GCS clients have been added or deleted /// from the system. 
template @@ -77,6 +75,9 @@ class Log : public LogInterface, virtual public PubsubInterface { using DataT = typename Data::NativeTableType; using Callback = std::function &data)>; + using NotificationCallback = std::function &data)>; /// The callback to call when a write to a key succeeds. using WriteCallback = typename LogInterface::WriteCallback; /// The callback to call when a SUBSCRIBE call completes and we are ready to @@ -208,6 +209,29 @@ class Log : public LogInterface, virtual public PubsubInterface { static std::hash index; return shard_contexts_[index(id) % shard_contexts_.size()]; } + + /// Subscribe to any modifications to the key. The caller may choose + /// to subscribe to all modifications, or to subscribe only to keys that it + /// requests notifications for. This may only be called once per Log + /// instance. This function is different from public version due to + /// an additional parameter notification_mode in NotificationCallback. Therefore this + /// function supports notifications of remove operations. + /// + /// \param job_id The ID of the job (= driver). + /// \param client_id The type of update to listen to. If this is nil, then a + /// message for each Add to the table will be received. Else, only + /// messages for the given client will be received. In the latter + /// case, the client may request notifications on specific keys in the + /// table via `RequestNotifications`. + /// \param subscribe Callback that is called on each received message. If the + /// callback is called with an empty vector, then there was no data at the key. + /// \param done Callback that is called when subscription is complete and we + /// are ready to receive messages. + /// \return Status + Status Subscribe(const JobID &job_id, const ClientID &client_id, + const NotificationCallback &subscribe, + const SubscriptionCallback &done); + /// The connection to the GCS. std::vector> shard_contexts_; /// The GCS client. 
@@ -228,7 +252,6 @@ class Log : public LogInterface, virtual public PubsubInterface { /// Commands to a GCS table can either be regular (default) or chain-replicated. CommandType command_type_ = CommandType::kRegular; - private: int64_t num_appends_ = 0; int64_t num_lookups_ = 0; }; @@ -337,26 +360,104 @@ class Table : private Log, using Log::command_type_; using Log::GetRedisContext; - private: int64_t num_adds_ = 0; int64_t num_lookups_ = 0; }; -class ObjectTable : public Log { +template +class SetInterface { + public: + using DataT = typename Data::NativeTableType; + using WriteCallback = typename Log::WriteCallback; + virtual Status Add(const JobID &job_id, const ID &id, std::shared_ptr &data, + const WriteCallback &done) = 0; + virtual Status Remove(const JobID &job_id, const ID &id, std::shared_ptr &data, + const WriteCallback &done) = 0; + virtual ~SetInterface(){}; +}; + +/// \class Set +/// +/// A GCS table where every entry is an addable & removable set. This class is not +/// meant to be used directly. All set classes should derive from this class +/// and override the prefix_ member with a unique prefix for that set, and the +/// pubsub_channel_ member if pubsub is required. +/// +/// Example tables backed by Set: +/// ObjectTable: Stores a set of which clients have added an object. +template +class Set : private Log, + public SetInterface, + virtual public PubsubInterface { + public: + using DataT = typename Log::DataT; + using Callback = typename Log::Callback; + using WriteCallback = typename Log::WriteCallback; + using NotificationCallback = typename Log::NotificationCallback; + using SubscriptionCallback = typename Log::SubscriptionCallback; + + Set(const std::vector> &contexts, AsyncGcsClient *client) + : Log(contexts, client) {} + + using Log::RequestNotifications; + using Log::CancelNotifications; + using Log::Lookup; + using Log::Delete; + + /// Add an entry to the set. + /// + /// \param job_id The ID of the job (= driver). 
+ /// \param id The ID of the data that is added to the GCS. + /// \param data Data to add to the set. + /// \param done Callback that is called once the data has been written to the + /// GCS. + /// \return Status + Status Add(const JobID &job_id, const ID &id, std::shared_ptr &data, + const WriteCallback &done); + + /// Remove an entry from the set. + /// + /// \param job_id The ID of the job (= driver). + /// \param id The ID of the data that is removed from the GCS. + /// \param data Data to remove from the set. + /// \param done Callback that is called once the data has been written to the + /// GCS. + /// \return Status + Status Remove(const JobID &job_id, const ID &id, std::shared_ptr &data, + const WriteCallback &done); + + Status Subscribe(const JobID &job_id, const ClientID &client_id, + const NotificationCallback &subscribe, + const SubscriptionCallback &done) { + return Log::Subscribe(job_id, client_id, subscribe, done); + } + + /// Returns debug string for class. + /// + /// \return string. 
+ std::string DebugString() const; + + protected: + using Log::shard_contexts_; + using Log::client_; + using Log::pubsub_channel_; + using Log::prefix_; + using Log::GetRedisContext; + + int64_t num_adds_ = 0; + int64_t num_removes_ = 0; + using Log::num_lookups_; +}; + +class ObjectTable : public Set { public: ObjectTable(const std::vector> &contexts, AsyncGcsClient *client) - : Log(contexts, client) { + : Set(contexts, client) { pubsub_channel_ = TablePubsub::OBJECT; prefix_ = TablePrefix::OBJECT; }; - ObjectTable(const std::vector> &contexts, - AsyncGcsClient *client, gcs::CommandType command_type) - : ObjectTable(contexts, client) { - command_type_ = command_type; - }; - virtual ~ObjectTable(){}; }; @@ -382,7 +483,7 @@ class HeartbeatBatchTable : public Table { virtual ~HeartbeatBatchTable() {} }; -class DriverTable : public Log { +class DriverTable : public Log { public: DriverTable(const std::vector> &contexts, AsyncGcsClient *client) @@ -398,7 +499,7 @@ class DriverTable : public Log { /// \param driver_id The driver id. /// \param is_dead Whether the driver is dead. /// \return The return status. - Status AppendDriverData(const JobID &driver_id, bool is_dead); + Status AppendDriverData(const DriverID &driver_id, bool is_dead); }; class FunctionTable : public Table { @@ -488,7 +589,7 @@ class ActorCheckpointIdTable : public Table { /// \param checkpoint_id ID of the checkpoint. /// \return Status. Status AddCheckpointId(const JobID &job_id, const ActorID &actor_id, - const UniqueID &checkpoint_id); + const ActorCheckpointID &checkpoint_id); }; namespace raylet { @@ -511,7 +612,7 @@ class TaskTable : public Table { } // namespace raylet -class ErrorTable : private Log { +class ErrorTable : private Log { public: ErrorTable(const std::vector> &contexts, AsyncGcsClient *client) @@ -532,7 +633,7 @@ class ErrorTable : private Log { /// \param error_message The error message to push. /// \param timestamp The timestamp of the error. /// \return Status. 
- Status PushErrorToDriver(const JobID &job_id, const std::string &type, + Status PushErrorToDriver(const DriverID &driver_id, const std::string &type, const std::string &error_message, double timestamp); /// Returns debug string for class. @@ -574,7 +675,7 @@ using ConfigTable = Table; /// it should append an entry to the log indicating that it is dead. A client /// that is marked as dead should never again be marked as alive; if it needs /// to reconnect, it must connect with a different ClientID. -class ClientTable : private Log { +class ClientTable : private Log { public: using ClientTableCallback = std::function; @@ -678,7 +779,7 @@ class ClientTable : private Log { /// The key at which the log of client information is stored. This key must /// be kept the same across all instances of the ClientTable, so that all /// clients append and read from the same key. - UniqueID client_log_key_; + ClientID client_log_key_; /// Whether this client has called Disconnect(). bool disconnected_; /// This client's ID. diff --git a/src/ray/id.cc b/src/ray/id.cc index 70454bbdfb0d..a9d9c5a7e765 100644 --- a/src/ray/id.cc +++ b/src/ray/id.cc @@ -165,7 +165,7 @@ std::ostream &operator<<(std::ostream &os, const UniqueID &id) { const ObjectID ComputeObjectId(const TaskID &task_id, int64_t object_index) { RAY_CHECK(object_index <= kMaxTaskReturns && object_index >= -kMaxTaskPuts); - ObjectID return_id = task_id; + ObjectID return_id = ObjectID(task_id); int64_t *first_bytes = reinterpret_cast(&return_id); // Zero out the lowest kObjectIdIndexSize bits of the first byte of the // object ID. 
@@ -176,7 +176,9 @@ const ObjectID ComputeObjectId(const TaskID &task_id, int64_t object_index) { return return_id; } -const TaskID FinishTaskId(const TaskID &task_id) { return ComputeObjectId(task_id, 0); } +const TaskID FinishTaskId(const TaskID &task_id) { + return TaskID(ComputeObjectId(task_id, 0)); +} const ObjectID ComputeReturnId(const TaskID &task_id, int64_t return_index) { RAY_CHECK(return_index >= 1 && return_index <= kMaxTaskReturns); @@ -190,7 +192,7 @@ const ObjectID ComputePutId(const TaskID &task_id, int64_t put_index) { } const TaskID ComputeTaskId(const ObjectID &object_id) { - TaskID task_id = object_id; + TaskID task_id = TaskID(object_id); int64_t *first_bytes = reinterpret_cast(&task_id); // Zero out the lowest kObjectIdIndexSize bits of the first byte of the // object ID. diff --git a/src/ray/id.h b/src/ray/id.h index 562365951fc2..35c67b220faf 100644 --- a/src/ray/id.h +++ b/src/ray/id.h @@ -30,7 +30,7 @@ class RAY_EXPORT UniqueID { std::string hex() const; plasma::UniqueID to_plasma_id() const; - private: + protected: uint8_t id_[kUniqueIDSize]; }; @@ -38,18 +38,24 @@ static_assert(std::is_standard_layout::value, "UniqueID must be standa std::ostream &operator<<(std::ostream &os, const UniqueID &id); -typedef UniqueID TaskID; -typedef UniqueID JobID; -typedef UniqueID ObjectID; -typedef UniqueID FunctionID; -typedef UniqueID ActorClassID; -typedef UniqueID ActorID; -typedef UniqueID ActorHandleID; -typedef UniqueID ActorCheckpointID; -typedef UniqueID WorkerID; -typedef UniqueID DriverID; -typedef UniqueID ConfigID; -typedef UniqueID ClientID; +#define DEFINE_UNIQUE_ID(type) \ + class RAY_EXPORT type : public UniqueID { \ + public: \ + explicit type(const UniqueID &from) { \ + std::memcpy(&id_, from.data(), kUniqueIDSize); \ + } \ + type() : UniqueID() {} \ + static type from_random() { return type(UniqueID::from_random()); } \ + static type from_binary(const std::string &binary) { return type(binary); } \ + static type nil() { return 
type(UniqueID::nil()); } \ + \ + private: \ + type(const std::string &binary) { std::memcpy(id_, binary.data(), kUniqueIDSize); } \ + }; + +#include "id_def.h" + +#undef DEFINE_UNIQUE_ID // TODO(swang): ObjectID and TaskID should derive from UniqueID. Then, we // can make these methods of the derived classes. @@ -101,14 +107,20 @@ int64_t ComputeObjectIndex(const ObjectID &object_id); } // namespace ray namespace std { -template <> -struct hash<::ray::UniqueID> { - size_t operator()(const ::ray::UniqueID &id) const { return id.hash(); } -}; -template <> -struct hash { - size_t operator()(const ::ray::UniqueID &id) const { return id.hash(); } -}; -} +#define DEFINE_UNIQUE_ID(type) \ + template <> \ + struct hash<::ray::type> { \ + size_t operator()(const ::ray::type &id) const { return id.hash(); } \ + }; \ + template <> \ + struct hash { \ + size_t operator()(const ::ray::type &id) const { return id.hash(); } \ + }; + +DEFINE_UNIQUE_ID(UniqueID); +#include "id_def.h" + +#undef DEFINE_UNIQUE_ID +} // namespace std #endif // RAY_ID_H_ diff --git a/src/ray/id_def.h b/src/ray/id_def.h new file mode 100644 index 000000000000..8e8b2b3fb717 --- /dev/null +++ b/src/ray/id_def.h @@ -0,0 +1,18 @@ +// This header file is used to avoid code duplication. +// It can be included multiple times in id.h, and each inclusion +// could use a different definition of the DEFINE_UNIQUE_ID macro. +// Macro definition format: DEFINE_UNIQUE_ID(id_type). +// NOTE: This file should NOT be included in any file other than id.h. 
+ +DEFINE_UNIQUE_ID(TaskID); +DEFINE_UNIQUE_ID(JobID); +DEFINE_UNIQUE_ID(ObjectID); +DEFINE_UNIQUE_ID(FunctionID); +DEFINE_UNIQUE_ID(ActorClassID); +DEFINE_UNIQUE_ID(ActorID); +DEFINE_UNIQUE_ID(ActorHandleID); +DEFINE_UNIQUE_ID(ActorCheckpointID); +DEFINE_UNIQUE_ID(WorkerID); +DEFINE_UNIQUE_ID(DriverID); +DEFINE_UNIQUE_ID(ConfigID); +DEFINE_UNIQUE_ID(ClientID); diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index 51cb2600beb3..f9ec353658c5 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -8,30 +8,19 @@ ObjectDirectory::ObjectDirectory(boost::asio::io_service &io_service, namespace { -/// Process a suffix of the object table log and store the result in +/// Process a notification of the object table entries and store the result in /// client_ids. This assumes that client_ids already contains the result of the -/// object table log up to but not including this suffix. This also stores a -/// bool in has_been_created indicating whether the object has ever been -/// created before. -void UpdateObjectLocations(const std::vector &location_history, +/// object table entries up to but not including this notification. +void UpdateObjectLocations(const GcsTableNotificationMode notification_mode, + const std::vector &location_updates, const ray::gcs::ClientTable &client_table, - std::unordered_set *client_ids, - bool *has_been_created) { - // location_history contains the history of locations of the object (it is a log), - // which might look like the following: - // client1.is_eviction = false - // client1.is_eviction = true - // client2.is_eviction = false - // In such a scenario, we want to indicate client2 is the only client that contains - // the object, which the following code achieves. - if (!location_history.empty()) { - // If there are entries, then the object has been created. Once this flag - // is set to true, it should never go back to false. 
- *has_been_created = true; - } - for (const auto &object_table_data : location_history) { + std::unordered_set *client_ids) { + // location_updates contains the updates of locations of the object. + // with GcsTableNotificationMode, we can determine whether the update mode is + // addition or deletion. + for (const auto &object_table_data : location_updates) { ClientID client_id = ClientID::from_binary(object_table_data.manager); - if (!object_table_data.is_eviction) { + if (notification_mode != GcsTableNotificationMode::REMOVE) { client_ids->insert(client_id); } else { client_ids->erase(client_id); @@ -52,17 +41,22 @@ void UpdateObjectLocations(const std::vector &location_history void ObjectDirectory::RegisterBackend() { auto object_notification_callback = [this]( gcs::AsyncGcsClient *client, const ObjectID &object_id, - const std::vector &location_history) { + const GcsTableNotificationMode notification_mode, + const std::vector &location_updates) { // Objects are added to this map in SubscribeObjectLocations. auto it = listeners_.find(object_id); // Do nothing for objects we are not listening for. if (it == listeners_.end()) { return; } + + // Once this flag is set to true, it should never go back to false. + it->second.subscribed = true; + // Update entries for this object. - UpdateObjectLocations(location_history, gcs_client_->client_table(), - &it->second.current_object_locations, - &it->second.has_been_created); + UpdateObjectLocations(notification_mode, location_updates, + gcs_client_->client_table(), + &it->second.current_object_locations); // Copy the callbacks so that the callbacks can unsubscribe without interrupting // looping over the callbacks. auto callbacks = it->second.callbacks; @@ -73,12 +67,11 @@ void ObjectDirectory::RegisterBackend() { for (const auto &callback_pair : callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. 
- callback_pair.second(object_id, it->second.current_object_locations, - it->second.has_been_created); + callback_pair.second(object_id, it->second.current_object_locations); } }; RAY_CHECK_OK(gcs_client_->object_table().Subscribe( - UniqueID::nil(), gcs_client_->client_table().GetLocalClientId(), + JobID::nil(), gcs_client_->client_table().GetLocalClientId(), object_notification_callback, nullptr)); } @@ -89,22 +82,22 @@ ray::Status ObjectDirectory::ReportObjectAdded( // Append the addition entry to the object table. auto data = std::make_shared(); data->manager = client_id.binary(); - data->is_eviction = false; data->object_size = object_info.data_size; ray::Status status = - gcs_client_->object_table().Append(JobID::nil(), object_id, data, nullptr); + gcs_client_->object_table().Add(JobID::nil(), object_id, data, nullptr); return status; } -ray::Status ObjectDirectory::ReportObjectRemoved(const ObjectID &object_id, - const ClientID &client_id) { +ray::Status ObjectDirectory::ReportObjectRemoved( + const ObjectID &object_id, const ClientID &client_id, + const object_manager::protocol::ObjectInfoT &object_info) { RAY_LOG(DEBUG) << "Reporting object removed to GCS " << object_id; // Append the eviction entry to the object table. auto data = std::make_shared(); data->manager = client_id.binary(); - data->is_eviction = true; + data->object_size = object_info.data_size; ray::Status status = - gcs_client_->object_table().Append(JobID::nil(), object_id, data, nullptr); + gcs_client_->object_table().Remove(JobID::nil(), object_id, data, nullptr); return status; }; @@ -141,17 +134,16 @@ void ObjectDirectory::HandleClientRemoved(const ClientID &client_id) { const ObjectID &object_id = listener.first; if (listener.second.current_object_locations.count(client_id) > 0) { // If the subscribed object has the removed client as a location, update - // its locations with an empty log so that the location will be removed. 
- UpdateObjectLocations({}, gcs_client_->client_table(), - &listener.second.current_object_locations, - &listener.second.has_been_created); + // its locations with an empty update so that the location will be removed. + UpdateObjectLocations(GcsTableNotificationMode::APPEND_OR_ADD, {}, + gcs_client_->client_table(), + &listener.second.current_object_locations); // Re-call all the subscribed callbacks for the object, since its // locations have changed. for (const auto &callback_pair : listener.second.callbacks) { // It is safe to call the callback directly since this is already running // in the subscription callback stack. - callback_pair.second(object_id, listener.second.current_object_locations, - listener.second.has_been_created); + callback_pair.second(object_id, listener.second.current_object_locations); } } } @@ -175,11 +167,10 @@ ray::Status ObjectDirectory::SubscribeObjectLocations(const UniqueID &callback_i listener_state.callbacks.emplace(callback_id, callback); // If we previously received some notifications about the object's locations, // immediately notify the caller of the current known locations. 
- if (listener_state.has_been_created) { + if (listener_state.subscribed) { auto &locations = listener_state.current_object_locations; - io_service_.post([callback, locations, object_id]() { - callback(object_id, locations, /*has_been_created=*/true); - }); + io_service_.post( + [callback, locations, object_id]() { callback(object_id, locations); }); } return status; } @@ -204,16 +195,14 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, const OnLocationsFound &callback) { ray::Status status; auto it = listeners_.find(object_id); - if (it != listeners_.end() && it->second.has_been_created) { + if (it != listeners_.end() && it->second.subscribed) { // If we have locations cached due to a concurrent SubscribeObjectLocations // call, and we have received at least one notification from the GCS about // the object's creation, then call the callback immediately with the // cached locations. auto &locations = it->second.current_object_locations; - bool has_been_created = it->second.has_been_created; - io_service_.post([callback, object_id, locations, has_been_created]() { - callback(object_id, locations, has_been_created); - }); + io_service_.post( + [callback, object_id, locations]() { callback(object_id, locations); }); } else { // We do not have any locations cached due to a concurrent // SubscribeObjectLocations call, so look up the object's locations @@ -221,15 +210,14 @@ ray::Status ObjectDirectory::LookupLocations(const ObjectID &object_id, status = gcs_client_->object_table().Lookup( JobID::nil(), object_id, [this, callback](gcs::AsyncGcsClient *client, const ObjectID &object_id, - const std::vector &location_history) { + const std::vector &location_updates) { // Build the set of current locations based on the entries in the log. 
std::unordered_set client_ids; - bool has_been_created = false; - UpdateObjectLocations(location_history, gcs_client_->client_table(), - &client_ids, &has_been_created); + UpdateObjectLocations(GcsTableNotificationMode::APPEND_OR_ADD, location_updates, + gcs_client_->client_table(), &client_ids); // It is safe to call the callback directly since this is already running // in the GCS client's lookup callback stack. - callback(object_id, client_ids, has_been_created); + callback(object_id, client_ids); }); } return status; diff --git a/src/ray/object_manager/object_directory.h b/src/ray/object_manager/object_directory.h index 0559ad534514..96a2d726e241 100644 --- a/src/ray/object_manager/object_directory.h +++ b/src/ray/object_manager/object_directory.h @@ -51,8 +51,7 @@ class ObjectDirectoryInterface { /// Callback for object location notifications. using OnLocationsFound = std::function &, - bool has_been_created)>; + const std::unordered_set &)>; /// Lookup object locations. Callback may be invoked with empty list of client ids. /// @@ -110,9 +109,11 @@ class ObjectDirectoryInterface { /// /// \param object_id The object id that was removed from the store. /// \param client_id The client id corresponding to this node. + /// \param object_info Additional information about the object. /// \return Status of whether this method succeeded. 
- virtual ray::Status ReportObjectRemoved(const ObjectID &object_id, - const ClientID &client_id) = 0; + virtual ray::Status ReportObjectRemoved( + const ObjectID &object_id, const ClientID &client_id, + const object_manager::protocol::ObjectInfoT &object_info) = 0; /// Get local client id /// @@ -159,8 +160,9 @@ class ObjectDirectory : public ObjectDirectoryInterface { ray::Status ReportObjectAdded( const ObjectID &object_id, const ClientID &client_id, const object_manager::protocol::ObjectInfoT &object_info) override; - ray::Status ReportObjectRemoved(const ObjectID &object_id, - const ClientID &client_id) override; + ray::Status ReportObjectRemoved( + const ObjectID &object_id, const ClientID &client_id, + const object_manager::protocol::ObjectInfoT &object_info) override; ray::ClientID GetLocalClientID() override; @@ -176,12 +178,12 @@ class ObjectDirectory : public ObjectDirectoryInterface { std::unordered_map callbacks; /// The current set of known locations of this object. std::unordered_set current_object_locations; - /// This flag will get set to true if the object has ever been created. It + /// This flag will get set to true if received any notification of the object. + /// It means current_object_locations is up-to-date with GCS. It /// should never go back to false once set to true. If this is true, and /// the current_object_locations is empty, then this means that the object - /// does not exist on any nodes due to eviction (rather than due to the - /// object never getting created, for instance). - bool has_been_created; + /// does not exist on any nodes due to eviction or the object never getting created. + bool subscribed; }; /// Reference to the event loop. 
diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index 5459985e5b61..29338b165294 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -93,8 +93,10 @@ void ObjectManager::HandleObjectAdded( void ObjectManager::NotifyDirectoryObjectDeleted(const ObjectID &object_id) { auto it = local_objects_.find(object_id); RAY_CHECK(it != local_objects_.end()); + auto object_info = it->second.object_info; local_objects_.erase(it); - ray::Status status = object_directory_->ReportObjectRemoved(object_id, client_id_); + ray::Status status = + object_directory_->ReportObjectRemoved(object_id, client_id_, object_info); } ray::Status ObjectManager::SubscribeObjAdded( @@ -127,8 +129,7 @@ ray::Status ObjectManager::Pull(const ObjectID &object_id) { // no ordering guarantee between notifications. return object_directory_->SubscribeObjectLocations( object_directory_pull_callback_id_, object_id, - [this](const ObjectID &object_id, const std::unordered_set &client_ids, - bool created) { + [this](const ObjectID &object_id, const std::unordered_set &client_ids) { // Exit if the Pull request has already been fulfilled or canceled. auto it = pull_requests_.find(object_id); if (it == pull_requests_.end()) { @@ -578,9 +579,8 @@ ray::Status ObjectManager::LookupRemainingWaitObjects(const UniqueID &wait_id) { // Lookup remaining objects. 
wait_state.requested_objects.insert(object_id); RAY_RETURN_NOT_OK(object_directory_->LookupLocations( - object_id, - [this, wait_id](const ObjectID &lookup_object_id, - const std::unordered_set &client_ids, bool created) { + object_id, [this, wait_id](const ObjectID &lookup_object_id, + const std::unordered_set &client_ids) { auto &wait_state = active_wait_requests_.find(wait_id)->second; if (!client_ids.empty()) { wait_state.remaining.erase(lookup_object_id); @@ -618,7 +618,7 @@ void ObjectManager::SubscribeRemainingWaitObjects(const UniqueID &wait_id) { RAY_CHECK_OK(object_directory_->SubscribeObjectLocations( wait_id, object_id, [this, wait_id](const ObjectID &subscribe_object_id, - const std::unordered_set &client_ids, bool created) { + const std::unordered_set &client_ids) { if (!client_ids.empty()) { RAY_LOG(DEBUG) << "Wait request " << wait_id << ": subscription notification received for object " @@ -767,7 +767,7 @@ void ObjectManager::ConnectClient(std::shared_ptr &conn, // TODO: trash connection on failure. auto info = flatbuffers::GetRoot(message); - ClientID client_id = ObjectID::from_binary(info->client_id()->str()); + ClientID client_id = ClientID::from_binary(info->client_id()->str()); bool is_transfer = info->is_transfer(); conn->SetClientID(client_id); if (is_transfer) { @@ -885,7 +885,7 @@ void ObjectManager::ReceiveFreeRequest(std::shared_ptr &con const uint8_t *message) { auto free_request = flatbuffers::GetRoot(message); - std::vector object_ids = from_flatbuf(*free_request->object_ids()); + std::vector object_ids = from_flatbuf(*free_request->object_ids()); // This RPC should come from another Object Manager. // Keep this request local. 
bool local_only = true; diff --git a/src/ray/object_manager/object_store_notification_manager.cc b/src/ray/object_manager/object_store_notification_manager.cc index aa19787f3c37..746f4d622d5a 100644 --- a/src/ray/object_manager/object_store_notification_manager.cc +++ b/src/ray/object_manager/object_store_notification_manager.cc @@ -58,7 +58,7 @@ void ObjectStoreNotificationManager::ProcessStoreNotification( const auto &object_info = flatbuffers::GetRoot(notification_.data()); - const auto &object_id = from_flatbuf(*object_info->object_id()); + const auto &object_id = from_flatbuf(*object_info->object_id()); if (object_info->is_deletion()) { ProcessStoreRemove(object_id); } else { diff --git a/src/ray/object_manager/test/object_manager_test.cc b/src/ray/object_manager/test/object_manager_test.cc index 699d119e41b3..a373ea9b9365 100644 --- a/src/ray/object_manager/test/object_manager_test.cc +++ b/src/ray/object_manager/test/object_manager_test.cc @@ -291,10 +291,9 @@ class TestObjectManager : public TestObjectManagerBase { UniqueID sub_id = ray::ObjectID::from_random(); RAY_CHECK_OK(server1->object_manager_.object_directory_->SubscribeObjectLocations( - sub_id, object_1, - [this, sub_id, object_1, object_2]( - const ray::ObjectID &object_id, - const std::unordered_set &clients, bool created) { + sub_id, object_1, [this, sub_id, object_1, object_2]( + const ray::ObjectID &object_id, + const std::unordered_set &clients) { if (!clients.empty()) { TestWaitWhileSubscribed(sub_id, object_1, object_2); } diff --git a/src/ray/raylet/format/node_manager.fbs b/src/ray/raylet/format/node_manager.fbs index 710928cdbd88..20bb1c735c1c 100644 --- a/src/ray/raylet/format/node_manager.fbs +++ b/src/ray/raylet/format/node_manager.fbs @@ -196,7 +196,7 @@ table WaitReply { // This struct is the same as ErrorTableData. table PushErrorRequest { // The ID of the job that the error is for. - job_id: string; + driver_id: string; // The type of the error. type: string; // The error message. 
diff --git a/src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.cc b/src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.cc index 68004a37bf21..c55b2608b2fd 100644 --- a/src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.cc +++ b/src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.cc @@ -6,31 +6,30 @@ #include "ray/raylet/raylet_client.h" #include "ray/util/logging.h" -#ifdef __cplusplus -extern "C" { -#endif - +template class UniqueIdFromJByteArray { - private: - JNIEnv *_env; - jbyteArray _bytes; - public: - UniqueID *PID; + const ID &GetId() const { return *id_pointer_; } - UniqueIdFromJByteArray(JNIEnv *env, jbyteArray wid) { - _env = env; - _bytes = wid; - - jbyte *b = reinterpret_cast(_env->GetByteArrayElements(_bytes, nullptr)); - PID = reinterpret_cast(b); + UniqueIdFromJByteArray(JNIEnv *env, jbyteArray bytes) : env_(env), bytes_(bytes) { + jbyte *b = reinterpret_cast(env_->GetByteArrayElements(bytes_, nullptr)); + id_pointer_ = reinterpret_cast(b); } ~UniqueIdFromJByteArray() { - _env->ReleaseByteArrayElements(_bytes, reinterpret_cast(PID), 0); + env_->ReleaseByteArrayElements(bytes_, reinterpret_cast(id_pointer_), 0); } + + private: + JNIEnv *env_; + jbyteArray bytes_; + ID *id_pointer_; }; +#ifdef __cplusplus +extern "C" { +#endif + inline bool ThrowRayExceptionIfNotOK(JNIEnv *env, const ray::Status &status) { if (!status.ok()) { jclass exception_class = env->FindClass("org/ray/api/exception/RayException"); @@ -49,11 +48,11 @@ inline bool ThrowRayExceptionIfNotOK(JNIEnv *env, const ray::Status &status) { JNIEXPORT jlong JNICALL Java_org_ray_runtime_raylet_RayletClientImpl_nativeInit( JNIEnv *env, jclass, jstring sockName, jbyteArray workerId, jboolean isWorker, jbyteArray driverId) { - UniqueIdFromJByteArray worker_id(env, workerId); - UniqueIdFromJByteArray driver_id(env, driverId); + UniqueIdFromJByteArray worker_id(env, workerId); + UniqueIdFromJByteArray driver_id(env, driverId); const char 
*nativeString = env->GetStringUTFChars(sockName, JNI_FALSE); - auto raylet_client = new RayletClient(nativeString, *worker_id.PID, isWorker, - *driver_id.PID, Language::JAVA); + auto raylet_client = new RayletClient(nativeString, worker_id.GetId(), isWorker, + driver_id.GetId(), Language::JAVA); env->ReleaseStringUTFChars(sockName, nativeString); return reinterpret_cast(raylet_client); } @@ -70,8 +69,8 @@ JNIEXPORT void JNICALL Java_org_ray_runtime_raylet_RayletClientImpl_nativeSubmit std::vector execution_dependencies; if (cursorId != nullptr) { - UniqueIdFromJByteArray cursor_id(env, cursorId); - execution_dependencies.push_back(*cursor_id.PID); + UniqueIdFromJByteArray cursor_id(env, cursorId); + execution_dependencies.push_back(cursor_id.GetId()); } auto data = reinterpret_cast(env->GetDirectBufferAddress(taskBuff)) + pos; @@ -143,14 +142,14 @@ Java_org_ray_runtime_raylet_RayletClientImpl_nativeFetchOrReconstruct( for (int i = 0; i < len; i++) { jbyteArray object_id_bytes = static_cast(env->GetObjectArrayElement(objectIds, i)); - UniqueIdFromJByteArray object_id(env, object_id_bytes); - object_ids.push_back(*object_id.PID); + UniqueIdFromJByteArray object_id(env, object_id_bytes); + object_ids.push_back(object_id.GetId()); env->DeleteLocalRef(object_id_bytes); } - UniqueIdFromJByteArray current_task_id(env, currentTaskId); + UniqueIdFromJByteArray current_task_id(env, currentTaskId); auto raylet_client = reinterpret_cast(client); auto status = - raylet_client->FetchOrReconstruct(object_ids, fetchOnly, *current_task_id.PID); + raylet_client->FetchOrReconstruct(object_ids, fetchOnly, current_task_id.GetId()); ThrowRayExceptionIfNotOK(env, status); } @@ -161,9 +160,9 @@ Java_org_ray_runtime_raylet_RayletClientImpl_nativeFetchOrReconstruct( */ JNIEXPORT void JNICALL Java_org_ray_runtime_raylet_RayletClientImpl_nativeNotifyUnblocked( JNIEnv *env, jclass, jlong client, jbyteArray currentTaskId) { - UniqueIdFromJByteArray current_task_id(env, currentTaskId); + 
UniqueIdFromJByteArray current_task_id(env, currentTaskId); auto raylet_client = reinterpret_cast(client); - auto status = raylet_client->NotifyUnblocked(*current_task_id.PID); + auto status = raylet_client->NotifyUnblocked(current_task_id.GetId()); ThrowRayExceptionIfNotOK(env, status); } @@ -181,19 +180,19 @@ Java_org_ray_runtime_raylet_RayletClientImpl_nativeWaitObject( for (int i = 0; i < len; i++) { jbyteArray object_id_bytes = static_cast(env->GetObjectArrayElement(objectIds, i)); - UniqueIdFromJByteArray object_id(env, object_id_bytes); - object_ids.push_back(*object_id.PID); + UniqueIdFromJByteArray object_id(env, object_id_bytes); + object_ids.push_back(object_id.GetId()); env->DeleteLocalRef(object_id_bytes); } - UniqueIdFromJByteArray current_task_id(env, currentTaskId); + UniqueIdFromJByteArray current_task_id(env, currentTaskId); auto raylet_client = reinterpret_cast(client); // Invoke wait. WaitResultPair result; - auto status = - raylet_client->Wait(object_ids, numReturns, timeoutMillis, - static_cast(isWaitLocal), *current_task_id.PID, &result); + auto status = raylet_client->Wait(object_ids, numReturns, timeoutMillis, + static_cast(isWaitLocal), + current_task_id.GetId(), &result); if (ThrowRayExceptionIfNotOK(env, status)) { return nullptr; } @@ -231,15 +230,12 @@ JNIEXPORT jbyteArray JNICALL Java_org_ray_runtime_raylet_RayletClientImpl_nativeGenerateTaskId( JNIEnv *env, jclass, jbyteArray driverId, jbyteArray parentTaskId, jint parent_task_counter) { - UniqueIdFromJByteArray object_id1(env, driverId); - ray::DriverID driver_id = *object_id1.PID; + UniqueIdFromJByteArray driver_id(env, driverId); + UniqueIdFromJByteArray parent_task_id(env, parentTaskId); - UniqueIdFromJByteArray object_id2(env, parentTaskId); - ray::TaskID parent_task_id = *object_id2.PID; - - ray::TaskID task_id = - ray::GenerateTaskId(driver_id, parent_task_id, parent_task_counter); - jbyteArray result = env->NewByteArray(sizeof(ray::TaskID)); + TaskID task_id = + 
ray::GenerateTaskId(driver_id.GetId(), parent_task_id.GetId(), parent_task_counter); + jbyteArray result = env->NewByteArray(sizeof(TaskID)); if (nullptr == result) { return nullptr; } @@ -261,8 +257,8 @@ Java_org_ray_runtime_raylet_RayletClientImpl_nativeFreePlasmaObjects( for (int i = 0; i < len; i++) { jbyteArray object_id_bytes = static_cast(env->GetObjectArrayElement(objectIds, i)); - UniqueIdFromJByteArray object_id(env, object_id_bytes); - object_ids.push_back(*object_id.PID); + UniqueIdFromJByteArray object_id(env, object_id_bytes); + object_ids.push_back(object_id.GetId()); env->DeleteLocalRef(object_id_bytes); } auto raylet_client = reinterpret_cast(client); @@ -280,9 +276,9 @@ Java_org_ray_runtime_raylet_RayletClientImpl_nativePrepareCheckpoint(JNIEnv *env jlong client, jbyteArray actorId) { auto raylet_client = reinterpret_cast(client); - UniqueIdFromJByteArray actor_id(env, actorId); + UniqueIdFromJByteArray actor_id(env, actorId); ActorCheckpointID checkpoint_id; - auto status = raylet_client->PrepareActorCheckpoint(*actor_id.PID, checkpoint_id); + auto status = raylet_client->PrepareActorCheckpoint(actor_id.GetId(), checkpoint_id); if (ThrowRayExceptionIfNotOK(env, status)) { return nullptr; } @@ -301,10 +297,10 @@ JNIEXPORT void JNICALL Java_org_ray_runtime_raylet_RayletClientImpl_nativeNotifyActorResumedFromCheckpoint( JNIEnv *env, jclass, jlong client, jbyteArray actorId, jbyteArray checkpointId) { auto raylet_client = reinterpret_cast(client); - UniqueIdFromJByteArray actor_id(env, actorId); - UniqueIdFromJByteArray checkpoint_id(env, checkpointId); - auto status = - raylet_client->NotifyActorResumedFromCheckpoint(*actor_id.PID, *checkpoint_id.PID); + UniqueIdFromJByteArray actor_id(env, actorId); + UniqueIdFromJByteArray checkpoint_id(env, checkpointId); + auto status = raylet_client->NotifyActorResumedFromCheckpoint(actor_id.GetId(), + checkpoint_id.GetId()); ThrowRayExceptionIfNotOK(env, status); } diff --git a/src/ray/raylet/lineage_cache.cc 
b/src/ray/raylet/lineage_cache.cc index 93e56a93a81b..949dc9eca1c2 100644 --- a/src/ray/raylet/lineage_cache.cc +++ b/src/ray/raylet/lineage_cache.cc @@ -358,8 +358,9 @@ void LineageCache::FlushTask(const TaskID &task_id) { auto task_data = std::make_shared(); auto root = flatbuffers::GetRoot(fbb.GetBufferPointer()); root->UnPackTo(task_data.get()); - RAY_CHECK_OK(task_storage_.Add(task->TaskData().GetTaskSpecification().DriverId(), - task_id, task_data, task_callback)); + RAY_CHECK_OK( + task_storage_.Add(JobID(task->TaskData().GetTaskSpecification().DriverId()), + task_id, task_data, task_callback)); // We successfully wrote the task, so mark it as committing. // TODO(swang): Use a batched interface and write with all object entries. diff --git a/src/ray/raylet/lineage_cache_test.cc b/src/ray/raylet/lineage_cache_test.cc index 973483759e4b..1ed0dcc84f39 100644 --- a/src/ray/raylet/lineage_cache_test.cc +++ b/src/ray/raylet/lineage_cache_test.cc @@ -113,9 +113,9 @@ static inline Task ExampleTask(const std::vector &arguments, task_arguments.emplace_back(std::make_shared(references)); } std::vector function_descriptor(3); - auto spec = TaskSpecification(UniqueID::nil(), UniqueID::from_random(), 0, - task_arguments, num_returns, required_resources, - Language::PYTHON, function_descriptor); + auto spec = TaskSpecification(DriverID::nil(), TaskID::from_random(), 0, task_arguments, + num_returns, required_resources, Language::PYTHON, + function_descriptor); auto execution_spec = TaskExecutionSpecification(std::vector()); execution_spec.IncrementNumForwards(); Task task = Task(execution_spec, spec); diff --git a/src/ray/raylet/monitor.cc b/src/ray/raylet/monitor.cc index 30f05de226c4..d18edbad8238 100644 --- a/src/ray/raylet/monitor.cc +++ b/src/ray/raylet/monitor.cc @@ -35,7 +35,7 @@ void Monitor::Start() { HandleHeartbeat(id, heartbeat_data); }; RAY_CHECK_OK(gcs_client_.heartbeat_table().Subscribe( - UniqueID::nil(), UniqueID::nil(), heartbeat_callback, nullptr, 
nullptr)); + JobID::nil(), ClientID::nil(), heartbeat_callback, nullptr, nullptr)); Tick(); } @@ -69,7 +69,7 @@ void Monitor::Tick() { << " has missed too many heartbeats from it."; // We use the nil JobID to broadcast the message to all drivers. RAY_CHECK_OK(gcs_client_.error_table().PushErrorToDriver( - JobID::nil(), type, error_message.str(), current_time_ms())); + DriverID::nil(), type, error_message.str(), current_time_ms())); } }; RAY_CHECK_OK(gcs_client_.client_table().Lookup(lookup_callback)); @@ -88,7 +88,7 @@ void Monitor::Tick() { batch->batch.push_back(std::unique_ptr( new HeartbeatTableDataT(heartbeat.second))); } - RAY_CHECK_OK(gcs_client_.heartbeat_batch_table().Add(UniqueID::nil(), UniqueID::nil(), + RAY_CHECK_OK(gcs_client_.heartbeat_batch_table().Add(JobID::nil(), ClientID::nil(), batch, nullptr)); heartbeat_buffer_.clear(); } diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 684cad003b87..f94ddaeb147c 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -60,9 +60,7 @@ NodeManager::NodeManager(boost::asio::io_service &io_service, scheduling_policy_(local_queues_), reconstruction_policy_( io_service_, - [this](const TaskID &task_id, bool return_values_lost) { - HandleTaskReconstruction(task_id); - }, + [this](const TaskID &task_id) { HandleTaskReconstruction(task_id); }, RayConfig::instance().initial_reconstruction_timeout_milliseconds(), gcs_client_->client_table().GetLocalClientId(), gcs_client_->task_lease_table(), object_directory_, gcs_client_->task_reconstruction_log()), @@ -145,7 +143,7 @@ ray::Status NodeManager::RegisterGcs() { }; RAY_RETURN_NOT_OK(gcs_client_->actor_table().Subscribe( - UniqueID::nil(), UniqueID::nil(), actor_notification_callback, nullptr)); + JobID::nil(), ClientID::nil(), actor_notification_callback, nullptr)); // Register a callback on the client table for new clients. 
auto node_manager_client_added = [this](gcs::AsyncGcsClient *client, const UniqueID &id, @@ -167,17 +165,17 @@ ray::Status NodeManager::RegisterGcs() { HeartbeatBatchAdded(heartbeat_batch); }; RAY_RETURN_NOT_OK(gcs_client_->heartbeat_batch_table().Subscribe( - UniqueID::nil(), UniqueID::nil(), heartbeat_batch_added, + JobID::nil(), ClientID::nil(), heartbeat_batch_added, /*subscribe_callback=*/nullptr, /*done_callback=*/nullptr)); // Subscribe to driver table updates. const auto driver_table_handler = [this]( - gcs::AsyncGcsClient *client, const ClientID &client_id, + gcs::AsyncGcsClient *client, const DriverID &client_id, const std::vector &driver_data) { HandleDriverTableUpdate(client_id, driver_data); }; - RAY_RETURN_NOT_OK(gcs_client_->driver_table().Subscribe(JobID::nil(), UniqueID::nil(), + RAY_RETURN_NOT_OK(gcs_client_->driver_table().Subscribe(JobID::nil(), ClientID::nil(), driver_table_handler, nullptr)); // Start sending heartbeats to the GCS. @@ -210,12 +208,12 @@ void NodeManager::KillWorker(std::shared_ptr worker) { } void NodeManager::HandleDriverTableUpdate( - const ClientID &id, const std::vector &driver_data) { + const DriverID &id, const std::vector &driver_data) { for (const auto &entry : driver_data) { RAY_LOG(DEBUG) << "HandleDriverTableUpdate " << UniqueID::from_binary(entry.driver_id) << " " << entry.is_dead; if (entry.is_dead) { - auto driver_id = UniqueID::from_binary(entry.driver_id); + auto driver_id = DriverID::from_binary(entry.driver_id); auto workers = worker_pool_.GetWorkersRunningTasksForDriver(driver_id); // Kill all the workers. 
The actual cleanup for these workers is done @@ -270,7 +268,7 @@ void NodeManager::Heartbeat() { } ray::Status status = heartbeat_table.Add( - UniqueID::nil(), gcs_client_->client_table().GetLocalClientId(), heartbeat_data, + JobID::nil(), gcs_client_->client_table().GetLocalClientId(), heartbeat_data, /*success_callback=*/nullptr); RAY_CHECK_OK_PREPEND(status, "Heartbeat failed"); @@ -351,7 +349,7 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { << ". This may be since the node was recently removed."; // We use the nil JobID to broadcast the message to all drivers. RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver( - JobID::nil(), type, error_message.str(), current_time_ms())); + DriverID::nil(), type, error_message.str(), current_time_ms())); return; } @@ -684,7 +682,7 @@ void NodeManager::ProcessClientMessage( } break; case protocol::MessageType::NotifyUnblocked: { auto message = flatbuffers::GetRoot(message_data); - HandleTaskUnblocked(client, from_flatbuf(*message->task_id())); + HandleTaskUnblocked(client, from_flatbuf(*message->task_id())); } break; case protocol::MessageType::WaitRequest: { ProcessWaitRequestMessage(client, message_data); @@ -698,7 +696,7 @@ void NodeManager::ProcessClientMessage( } break; case protocol::MessageType::FreeObjectsInObjectStoreRequest: { auto message = flatbuffers::GetRoot(message_data); - std::vector object_ids = from_flatbuf(*message->object_ids()); + std::vector object_ids = from_flatbuf(*message->object_ids()); object_manager_.FreeObjects(object_ids, message->local_only()); } break; case protocol::MessageType::PrepareActorCheckpointRequest: { @@ -719,7 +717,7 @@ void NodeManager::ProcessClientMessage( void NodeManager::ProcessRegisterClientRequestMessage( const std::shared_ptr &client, const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - client->SetClientID(from_flatbuf(*message->client_id())); + client->SetClientID(from_flatbuf(*message->client_id())); auto 
worker = std::make_shared(message->worker_pid(), message->language(), client); if (message->is_worker()) { @@ -731,11 +729,11 @@ void NodeManager::ProcessRegisterClientRequestMessage( // message is actually the ID of the driver task, while client_id represents the // real driver ID, which can associate all the tasks/actors for a given driver, // which is set to the worker ID. - const JobID driver_task_id = from_flatbuf(*message->driver_id()); - worker->AssignTaskId(driver_task_id); - worker->AssignDriverId(from_flatbuf(*message->client_id())); + const JobID driver_task_id = from_flatbuf(*message->driver_id()); + worker->AssignTaskId(TaskID(driver_task_id)); + worker->AssignDriverId(from_flatbuf(*message->client_id())); worker_pool_.RegisterDriver(std::move(worker)); - local_queues_.AddDriverTaskId(driver_task_id); + local_queues_.AddDriverTaskId(TaskID(driver_task_id)); } } @@ -865,14 +863,14 @@ void NodeManager::ProcessDisconnectClientMessage( if (!intentional_disconnect) { // Push the error to driver. - const JobID &job_id = worker->GetAssignedDriverId(); + const DriverID &driver_id = worker->GetAssignedDriverId(); // TODO(rkn): Define this constant somewhere else. std::string type = "worker_died"; std::ostringstream error_message; error_message << "A worker died or was killed while executing task " << task_id << "."; RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver( - job_id, type, error_message.str(), current_time_ms())); + driver_id, type, error_message.str(), current_time_ms())); } } @@ -899,8 +897,9 @@ void NodeManager::ProcessDisconnectClientMessage( DispatchTasks(local_queues_.GetReadyTasksWithResources()); } else if (is_driver) { // The client is a driver. 
- RAY_CHECK_OK(gcs_client_->driver_table().AppendDriverData(client->GetClientId(), - /*is_dead=*/true)); + RAY_CHECK_OK( + gcs_client_->driver_table().AppendDriverData(DriverID(client->GetClientId()), + /*is_dead=*/true)); auto driver_id = worker->GetAssignedTaskId(); RAY_CHECK(!driver_id.is_nil()); local_queues_.RemoveDriverTaskId(driver_id); @@ -919,7 +918,7 @@ void NodeManager::ProcessSubmitTaskMessage(const uint8_t *message_data) { // Read the task submitted by the client. auto message = flatbuffers::GetRoot(message_data); TaskExecutionSpecification task_execution_spec( - from_flatbuf(*message->execution_dependencies())); + from_flatbuf(*message->execution_dependencies())); TaskSpecification task_spec(*message->task_spec()); Task task(task_execution_spec, task_spec); // Submit the task to the local scheduler. Since the task was submitted @@ -932,7 +931,7 @@ void NodeManager::ProcessFetchOrReconstructMessage( auto message = flatbuffers::GetRoot(message_data); std::vector required_object_ids; for (size_t i = 0; i < message->object_ids()->size(); ++i) { - ObjectID object_id = from_flatbuf(*message->object_ids()->Get(i)); + ObjectID object_id = from_flatbuf(*message->object_ids()->Get(i)); if (message->fetch_only()) { // If only a fetch is required, then do not subscribe to the // dependencies to the task dependency manager. @@ -950,7 +949,7 @@ void NodeManager::ProcessFetchOrReconstructMessage( } if (!required_object_ids.empty()) { - const TaskID task_id = from_flatbuf(*message->task_id()); + const TaskID task_id = from_flatbuf(*message->task_id()); HandleTaskBlocked(client, required_object_ids, task_id); } } @@ -959,7 +958,7 @@ void NodeManager::ProcessWaitRequestMessage( const std::shared_ptr &client, const uint8_t *message_data) { // Read the data. 
auto message = flatbuffers::GetRoot(message_data); - std::vector object_ids = from_flatbuf(*message->object_ids()); + std::vector object_ids = from_flatbuf(*message->object_ids()); int64_t wait_ms = message->timeout(); uint64_t num_required_objects = static_cast(message->num_ready_objects()); bool wait_local = message->wait_local(); @@ -974,7 +973,7 @@ void NodeManager::ProcessWaitRequestMessage( } } - const TaskID ¤t_task_id = from_flatbuf(*message->task_id()); + const TaskID ¤t_task_id = from_flatbuf(*message->task_id()); bool client_blocked = !required_object_ids.empty(); if (client_blocked) { HandleTaskBlocked(client, required_object_ids, current_task_id); @@ -1012,20 +1011,20 @@ void NodeManager::ProcessWaitRequestMessage( void NodeManager::ProcessPushErrorRequestMessage(const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - JobID job_id = from_flatbuf(*message->job_id()); + DriverID driver_id = from_flatbuf(*message->driver_id()); auto const &type = string_from_flatbuf(*message->type()); auto const &error_message = string_from_flatbuf(*message->error_message()); double timestamp = message->timestamp(); - RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(job_id, type, error_message, - timestamp)); + RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver(driver_id, type, + error_message, timestamp)); } void NodeManager::ProcessPrepareActorCheckpointRequest( const std::shared_ptr &client, const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - ActorID actor_id = from_flatbuf(*message->actor_id()); + ActorID actor_id = from_flatbuf(*message->actor_id()); RAY_LOG(DEBUG) << "Preparing checkpoint for actor " << actor_id; const auto &actor_entry = actor_registry_.find(actor_id); RAY_CHECK(actor_entry != actor_registry_.end()); @@ -1037,15 +1036,15 @@ void NodeManager::ProcessPrepareActorCheckpointRequest( const auto task_id = worker->GetAssignedTaskId(); const Task &task = 
local_queues_.GetTaskOfState(task_id, TaskState::RUNNING); // Generate checkpoint id and data. - ActorCheckpointID checkpoint_id = UniqueID::from_random(); + ActorCheckpointID checkpoint_id = ActorCheckpointID::from_random(); auto checkpoint_data = actor_entry->second.GenerateCheckpointData(actor_entry->first, task); // Write checkpoint data to GCS. RAY_CHECK_OK(gcs_client_->actor_checkpoint_table().Add( - UniqueID::nil(), checkpoint_id, checkpoint_data, + JobID::nil(), checkpoint_id, checkpoint_data, [worker, actor_id, this](ray::gcs::AsyncGcsClient *client, - const UniqueID &checkpoint_id, + const ActorCheckpointID &checkpoint_id, const ActorCheckpointDataT &data) { RAY_LOG(DEBUG) << "Checkpoint " << checkpoint_id << " saved for actor " << worker->GetActorId(); @@ -1072,8 +1071,9 @@ void NodeManager::ProcessPrepareActorCheckpointRequest( void NodeManager::ProcessNotifyActorResumedFromCheckpoint(const uint8_t *message_data) { auto message = flatbuffers::GetRoot(message_data); - ActorID actor_id = from_flatbuf(*message->actor_id()); - ActorCheckpointID checkpoint_id = from_flatbuf(*message->checkpoint_id()); + ActorID actor_id = from_flatbuf(*message->actor_id()); + ActorCheckpointID checkpoint_id = + from_flatbuf(*message->checkpoint_id()); RAY_LOG(DEBUG) << "Actor " << actor_id << " was resumed from checkpoint " << checkpoint_id; checkpoint_id_to_restore_.emplace(actor_id, checkpoint_id); @@ -1093,12 +1093,12 @@ void NodeManager::ProcessNodeManagerMessage(TcpClientConnection &node_manager_cl switch (message_type_value) { case protocol::MessageType::ConnectClient: { auto message = flatbuffers::GetRoot(message_data); - auto client_id = from_flatbuf(*message->client_id()); + auto client_id = from_flatbuf(*message->client_id()); node_manager_client.SetClientID(client_id); } break; case protocol::MessageType::ForwardTaskRequest: { auto message = flatbuffers::GetRoot(message_data); - TaskID task_id = from_flatbuf(*message->task_id()); + TaskID task_id = 
from_flatbuf(*message->task_id()); Lineage uncommitted_lineage(*message); const Task &task = uncommitted_lineage.GetEntry(task_id)->TaskData(); @@ -1285,14 +1285,13 @@ void NodeManager::TreatTaskAsFailedIfLost(const Task &task) { const ObjectID object_id = spec.ReturnId(i); // Lookup the return value's locations. RAY_CHECK_OK(object_directory_->LookupLocations( - object_id, - [this, task_marked_as_failed, task]( - const ray::ObjectID &object_id, - const std::unordered_set &clients, bool has_been_created) { + object_id, [this, task_marked_as_failed, task]( + const ray::ObjectID &object_id, + const std::unordered_set &clients) { if (!*task_marked_as_failed) { // Only process the object locations if we haven't already marked the // task as failed. - if (clients.empty() && has_been_created) { + if (clients.empty()) { // The object does not exist on any nodes but has been created // before, so the object has been lost. Mark the task as failed to // prevent any tasks that depend on this object from hanging. @@ -1589,7 +1588,7 @@ bool NodeManager::AssignTask(const Task &task) { const std::string warning_message = worker_pool_.WarningAboutSize(); if (warning_message != "") { RAY_CHECK_OK(gcs_client_->error_table().PushErrorToDriver( - JobID::nil(), "worker_pool_large", warning_message, current_time_ms())); + DriverID::nil(), "worker_pool_large", warning_message, current_time_ms())); } } // We couldn't assign this task, as no worker available. @@ -1902,7 +1901,6 @@ void NodeManager::HandleTaskReconstruction(const TaskID &task_id) { // Use a copy of the cached task spec to re-execute the task. const Task task = lineage_cache_.GetTaskOrDie(task_id); ResubmitTask(task); - })); } diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 061ef5ef8969..1e97c380b1f5 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -326,7 +326,7 @@ class NodeManager { /// \param id An unused value. TODO(rkn): Should this be removed? 
/// \param driver_data Data associated with a driver table event. /// \return Void. - void HandleDriverTableUpdate(const ClientID &id, + void HandleDriverTableUpdate(const DriverID &id, const std::vector &driver_data); /// Check if certain invariants associated with the task dependency manager diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index c5ce98b1cc74..3b0ebd5b691f 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -9,8 +9,12 @@ namespace { -const std::vector GenerateEnumNames(const char *const *enum_names_ptr) { +const std::vector GenerateEnumNames(const char *const *enum_names_ptr, + int start_index, int end_index) { std::vector enum_names; + for (int i = 0; i < start_index; ++i) { + enum_names.push_back("EmptyMessageType"); + } size_t i = 0; while (true) { const char *name = enum_names_ptr[i]; @@ -20,13 +24,19 @@ const std::vector GenerateEnumNames(const char *const *enum_names_p enum_names.push_back(name); i++; } + RAY_CHECK(static_cast(end_index) == enum_names.size() - 1) + << "Message Type mismatch!"; return enum_names; } static const std::vector node_manager_message_enum = - GenerateEnumNames(ray::protocol::EnumNamesMessageType()); + GenerateEnumNames(ray::protocol::EnumNamesMessageType(), + static_cast(ray::protocol::MessageType::MIN), + static_cast(ray::protocol::MessageType::MAX)); static const std::vector object_manager_message_enum = - GenerateEnumNames(ray::object_manager::protocol::EnumNamesMessageType()); + GenerateEnumNames(ray::object_manager::protocol::EnumNamesMessageType(), + static_cast(ray::object_manager::protocol::MessageType::MIN), + static_cast(ray::object_manager::protocol::MessageType::MAX)); } namespace ray { diff --git a/src/ray/raylet/raylet_client.cc b/src/ray/raylet/raylet_client.cc index 13e92d0c4ccc..28a51c7e10fd 100644 --- a/src/ray/raylet/raylet_client.cc +++ b/src/ray/raylet/raylet_client.cc @@ -201,8 +201,8 @@ ray::Status RayletConnection::AtomicRequestReply( return 
ReadMessage(reply_type, reply_message); } -RayletClient::RayletClient(const std::string &raylet_socket, const UniqueID &client_id, - bool is_worker, const JobID &driver_id, +RayletClient::RayletClient(const std::string &raylet_socket, const ClientID &client_id, + bool is_worker, const DriverID &driver_id, const Language &language) : client_id_(client_id), is_worker_(is_worker), @@ -323,11 +323,11 @@ ray::Status RayletClient::Wait(const std::vector &object_ids, int num_ return ray::Status::OK(); } -ray::Status RayletClient::PushError(const JobID &job_id, const std::string &type, +ray::Status RayletClient::PushError(const DriverID &driver_id, const std::string &type, const std::string &error_message, double timestamp) { flatbuffers::FlatBufferBuilder fbb; auto message = ray::protocol::CreatePushErrorRequest( - fbb, to_flatbuf(fbb, job_id), fbb.CreateString(type), + fbb, to_flatbuf(fbb, driver_id), fbb.CreateString(type), fbb.CreateString(error_message), timestamp); fbb.Finish(message); @@ -373,7 +373,7 @@ ray::Status RayletClient::PrepareActorCheckpoint(const ActorID &actor_id, if (!status.ok()) return status; auto reply_message = flatbuffers::GetRoot(reply.get()); - checkpoint_id = ObjectID::from_binary(reply_message->checkpoint_id()->str()); + checkpoint_id = ActorCheckpointID::from_binary(reply_message->checkpoint_id()->str()); return ray::Status::OK(); } diff --git a/src/ray/raylet/raylet_client.h b/src/ray/raylet/raylet_client.h index d3ea765df65c..2e07becfc245 100644 --- a/src/ray/raylet/raylet_client.h +++ b/src/ray/raylet/raylet_client.h @@ -9,13 +9,14 @@ #include "ray/raylet/task_spec.h" #include "ray/status.h" -using ray::ActorID; using ray::ActorCheckpointID; +using ray::ActorID; +using ray::ClientID; +using ray::DriverID; using ray::JobID; using ray::ObjectID; using ray::TaskID; using ray::UniqueID; -using ray::ClientID; using MessageType = ray::protocol::MessageType; using ResourceMappingType = @@ -68,8 +69,8 @@ class RayletClient { /// additional 
message will be sent to register as one. /// \param driver_id The ID of the driver. This is non-nil if the client is a driver. /// \return The connection information. - RayletClient(const std::string &raylet_socket, const UniqueID &client_id, - bool is_worker, const JobID &driver_id, const Language &language); + RayletClient(const std::string &raylet_socket, const ClientID &client_id, + bool is_worker, const DriverID &driver_id, const Language &language); ray::Status Disconnect() { return conn_->Disconnect(); }; @@ -130,7 +131,7 @@ class RayletClient { /// \param The error message. /// \param The timestamp of the error. /// \return ray::Status. - ray::Status PushError(const JobID &job_id, const std::string &type, + ray::Status PushError(const DriverID &driver_id, const std::string &type, const std::string &error_message, double timestamp); /// Store some profile events in the GCS. diff --git a/src/ray/raylet/reconstruction_policy.cc b/src/ray/raylet/reconstruction_policy.cc index d698402994a4..d75f8799fe76 100644 --- a/src/ray/raylet/reconstruction_policy.cc +++ b/src/ray/raylet/reconstruction_policy.cc @@ -6,7 +6,7 @@ namespace raylet { ReconstructionPolicy::ReconstructionPolicy( boost::asio::io_service &io_service, - std::function reconstruction_handler, + std::function reconstruction_handler, int64_t initial_reconstruction_timeout_ms, const ClientID &client_id, gcs::PubsubInterface &task_lease_pubsub, std::shared_ptr object_directory, @@ -74,14 +74,13 @@ void ReconstructionPolicy::HandleReconstructionLogAppend(const TaskID &task_id, SetTaskTimeout(it, initial_reconstruction_timeout_ms_); if (success) { - reconstruction_handler_(task_id, it->second.return_values_lost); + reconstruction_handler_(task_id); } } void ReconstructionPolicy::AttemptReconstruction(const TaskID &task_id, const ObjectID &required_object_id, - int reconstruction_attempt, - bool created) { + int reconstruction_attempt) { // If we are no longer listening for objects created by this task, give 
up. auto it = listening_tasks_.find(task_id); if (it == listening_tasks_.end()) { @@ -93,10 +92,6 @@ void ReconstructionPolicy::AttemptReconstruction(const TaskID &task_id, return; } - if (created) { - it->second.return_values_lost = true; - } - // Suppress duplicate reconstructions of the same task. This can happen if, // for example, a task creates two different objects that both require // reconstruction. @@ -142,14 +137,13 @@ void ReconstructionPolicy::HandleTaskLeaseExpired(const TaskID &task_id) { // attempted asynchronously. for (const auto &created_object_id : it->second.created_objects) { RAY_CHECK_OK(object_directory_->LookupLocations( - created_object_id, - [this, task_id, reconstruction_attempt]( - const ray::ObjectID &object_id, - const std::unordered_set &clients, bool created) { + created_object_id, [this, task_id, reconstruction_attempt]( + const ray::ObjectID &object_id, + const std::unordered_set &clients) { if (clients.empty()) { // The required object no longer exists on any live nodes. Attempt // reconstruction. - AttemptReconstruction(task_id, object_id, reconstruction_attempt, created); + AttemptReconstruction(task_id, object_id, reconstruction_attempt); } })); } diff --git a/src/ray/raylet/reconstruction_policy.h b/src/ray/raylet/reconstruction_policy.h index d936a632e1f1..f18290aa3725 100644 --- a/src/ray/raylet/reconstruction_policy.h +++ b/src/ray/raylet/reconstruction_policy.h @@ -40,7 +40,7 @@ class ReconstructionPolicy : public ReconstructionPolicyInterface { /// lease notifications from. 
ReconstructionPolicy( boost::asio::io_service &io_service, - std::function reconstruction_handler, + std::function reconstruction_handler, int64_t initial_reconstruction_timeout_ms, const ClientID &client_id, gcs::PubsubInterface &task_lease_pubsub, std::shared_ptr object_directory, @@ -93,7 +93,6 @@ class ReconstructionPolicy : public ReconstructionPolicyInterface { bool subscribed; // The number of times we've attempted reconstructing this task so far. int reconstruction_attempt; - bool return_values_lost; // The task's reconstruction timer. If this expires before a lease // notification is received, then the task will be reconstructed. std::unique_ptr reconstruction_timer; @@ -116,7 +115,7 @@ class ReconstructionPolicy : public ReconstructionPolicyInterface { /// reconstructions of the same task (e.g., if a task creates two objects /// that both require reconstruction). void AttemptReconstruction(const TaskID &task_id, const ObjectID &required_object_id, - int reconstruction_attempt, bool created); + int reconstruction_attempt); /// Handle expiration of a task lease. void HandleTaskLeaseExpired(const TaskID &task_id); @@ -128,7 +127,7 @@ class ReconstructionPolicy : public ReconstructionPolicyInterface { /// The event loop. boost::asio::io_service &io_service_; /// The handler to call for tasks that require reconstruction. - const std::function reconstruction_handler_; + const std::function reconstruction_handler_; /// The initial timeout within which a task lease notification must be /// received. Otherwise, reconstruction will be triggered. 
const int64_t initial_reconstruction_timeout_ms_; diff --git a/src/ray/raylet/reconstruction_policy_test.cc b/src/ray/raylet/reconstruction_policy_test.cc index 5e9ae6d7e521..c5678d6cea41 100644 --- a/src/ray/raylet/reconstruction_policy_test.cc +++ b/src/ray/raylet/reconstruction_policy_test.cc @@ -29,10 +29,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { const ObjectID object_id = callback.first; auto it = locations_.find(object_id); if (it == locations_.end()) { - callback.second(object_id, std::unordered_set(), - /*created=*/false); + callback.second(object_id, std::unordered_set()); } else { - callback.second(object_id, it->second, /*created=*/true); + callback.second(object_id, it->second); } } callbacks_.clear(); @@ -63,7 +62,9 @@ class MockObjectDirectory : public ObjectDirectoryInterface { MOCK_METHOD3(ReportObjectAdded, ray::Status(const ObjectID &, const ClientID &, const object_manager::protocol::ObjectInfoT &)); - MOCK_METHOD2(ReportObjectRemoved, ray::Status(const ObjectID &, const ClientID &)); + MOCK_METHOD3(ReportObjectRemoved, + ray::Status(const ObjectID &, const ClientID &, + const object_manager::protocol::ObjectInfoT &)); private: std::vector> callbacks_; @@ -151,8 +152,8 @@ class ReconstructionPolicyTest : public ::testing::Test { mock_object_directory_(std::make_shared()), reconstruction_timeout_ms_(50), reconstruction_policy_(std::make_shared( - io_service_, [this](const TaskID &task_id, - bool created) { TriggerReconstruction(task_id); }, + io_service_, + [this](const TaskID &task_id) { TriggerReconstruction(task_id); }, reconstruction_timeout_ms_, ClientID::from_random(), mock_gcs_, mock_object_directory_, mock_gcs_)), timer_canceled_(false) { @@ -322,7 +323,7 @@ TEST_F(ReconstructionPolicyTest, TestReconstructionSuppressed) { task_lease_data->node_manager_id = ClientID::from_random().binary(); task_lease_data->acquired_at = current_sys_time_ms(); task_lease_data->timeout = 2 * test_period; - 
mock_gcs_.Add(DriverID::nil(), task_id, task_lease_data); + mock_gcs_.Add(JobID::nil(), task_id, task_lease_data); // Listen for an object. reconstruction_policy_->ListenAndMaybeReconstruct(object_id); @@ -350,7 +351,7 @@ TEST_F(ReconstructionPolicyTest, TestReconstructionContinuallySuppressed) { task_lease_data->node_manager_id = ClientID::from_random().binary(); task_lease_data->acquired_at = current_sys_time_ms(); task_lease_data->timeout = reconstruction_timeout_ms_; - mock_gcs_.Add(DriverID::nil(), task_id, task_lease_data); + mock_gcs_.Add(JobID::nil(), task_id, task_lease_data); }); // Run the test for much longer than the reconstruction timeout. Run(reconstruction_timeout_ms_ * 2); @@ -404,7 +405,7 @@ TEST_F(ReconstructionPolicyTest, TestSimultaneousReconstructionSuppressed) { task_reconstruction_data->node_manager_id = ClientID::from_random().binary(); task_reconstruction_data->num_reconstructions = 0; RAY_CHECK_OK( - mock_gcs_.AppendAt(DriverID::nil(), task_id, task_reconstruction_data, nullptr, + mock_gcs_.AppendAt(JobID::nil(), task_id, task_reconstruction_data, nullptr, /*failure_callback=*/ [](ray::gcs::AsyncGcsClient *client, const TaskID &task_id, const TaskReconstructionDataT &data) { ASSERT_TRUE(false); }, diff --git a/src/ray/raylet/task_dependency_manager.cc b/src/ray/raylet/task_dependency_manager.cc index fe4364c4491f..2f1b64a87480 100644 --- a/src/ray/raylet/task_dependency_manager.cc +++ b/src/ray/raylet/task_dependency_manager.cc @@ -263,7 +263,7 @@ void TaskDependencyManager::AcquireTaskLease(const TaskID &task_id) { task_lease_data->node_manager_id = client_id_.hex(); task_lease_data->acquired_at = current_sys_time_ms(); task_lease_data->timeout = it->second.lease_period; - RAY_CHECK_OK(task_lease_table_.Add(DriverID::nil(), task_id, task_lease_data, nullptr)); + RAY_CHECK_OK(task_lease_table_.Add(JobID::nil(), task_id, task_lease_data, nullptr)); auto period = boost::posix_time::milliseconds(it->second.lease_period / 2); 
it->second.lease_timer->expires_from_now(period); diff --git a/src/ray/raylet/task_dependency_manager_test.cc b/src/ray/raylet/task_dependency_manager_test.cc index f414d7469565..e0d30bf9ebd6 100644 --- a/src/ray/raylet/task_dependency_manager_test.cc +++ b/src/ray/raylet/task_dependency_manager_test.cc @@ -75,9 +75,9 @@ static inline Task ExampleTask(const std::vector &arguments, task_arguments.emplace_back(std::make_shared(references)); } std::vector function_descriptor(3); - auto spec = TaskSpecification(UniqueID::nil(), UniqueID::from_random(), 0, - task_arguments, num_returns, required_resources, - Language::PYTHON, function_descriptor); + auto spec = TaskSpecification(DriverID::nil(), TaskID::from_random(), 0, task_arguments, + num_returns, required_resources, Language::PYTHON, + function_descriptor); auto execution_spec = TaskExecutionSpecification(std::vector()); execution_spec.IncrementNumForwards(); Task task = Task(execution_spec, spec); diff --git a/src/ray/raylet/task_spec.cc b/src/ray/raylet/task_spec.cc index a8c0f40fed60..da8bafc60fd4 100644 --- a/src/ray/raylet/task_spec.cc +++ b/src/ray/raylet/task_spec.cc @@ -17,7 +17,7 @@ TaskArgumentByReference::TaskArgumentByReference(const std::vector &re flatbuffers::Offset TaskArgumentByReference::ToFlatbuffer( flatbuffers::FlatBufferBuilder &fbb) const { - return CreateArg(fbb, object_ids_to_flatbuf(fbb, references_)); + return CreateArg(fbb, ids_to_flatbuf(fbb, references_)); } TaskArgumentByValue::TaskArgumentByValue(const uint8_t *value, size_t length) { @@ -57,7 +57,7 @@ TaskSpecification::TaskSpecification(const std::string &string) { } TaskSpecification::TaskSpecification( - const UniqueID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, + const DriverID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, const std::vector> &task_arguments, int64_t num_returns, const std::unordered_map &required_resources, const Language &language, const std::vector 
&function_descriptor) @@ -68,7 +68,7 @@ TaskSpecification::TaskSpecification( function_descriptor) {} TaskSpecification::TaskSpecification( - const UniqueID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, + const DriverID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, const ActorID &actor_creation_id, const ObjectID &actor_creation_dummy_object_id, const int64_t max_actor_reconstructions, const ActorID &actor_id, const ActorHandleID &actor_handle_id, int64_t actor_counter, @@ -100,8 +100,8 @@ TaskSpecification::TaskSpecification( to_flatbuf(fbb, parent_task_id), parent_counter, to_flatbuf(fbb, actor_creation_id), to_flatbuf(fbb, actor_creation_dummy_object_id), max_actor_reconstructions, to_flatbuf(fbb, actor_id), to_flatbuf(fbb, actor_handle_id), actor_counter, - object_ids_to_flatbuf(fbb, new_actor_handles), fbb.CreateVector(arguments), - object_ids_to_flatbuf(fbb, returns), map_to_flatbuf(fbb, required_resources), + ids_to_flatbuf(fbb, new_actor_handles), fbb.CreateVector(arguments), + ids_to_flatbuf(fbb, returns), map_to_flatbuf(fbb, required_resources), map_to_flatbuf(fbb, required_placement_resources), language, string_vec_to_flatbuf(fbb, function_descriptor)); fbb.Finish(spec); @@ -122,15 +122,15 @@ size_t TaskSpecification::size() const { return spec_.size(); } // Task specification getter methods. 
TaskID TaskSpecification::TaskId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->task_id()); + return from_flatbuf(*message->task_id()); } -UniqueID TaskSpecification::DriverId() const { +DriverID TaskSpecification::DriverId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->driver_id()); + return from_flatbuf(*message->driver_id()); } TaskID TaskSpecification::ParentTaskId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->parent_task_id()); + return from_flatbuf(*message->parent_task_id()); } int64_t TaskSpecification::ParentCounter() const { auto message = flatbuffers::GetRoot(spec_.data()); @@ -168,7 +168,7 @@ int64_t TaskSpecification::NumReturns() const { ObjectID TaskSpecification::ReturnId(int64_t return_index) const { auto message = flatbuffers::GetRoot(spec_.data()); - return object_ids_from_flatbuf(*message->returns())[return_index]; + return ids_from_flatbuf(*message->returns())[return_index]; } bool TaskSpecification::ArgByRef(int64_t arg_index) const { @@ -184,7 +184,7 @@ int TaskSpecification::ArgIdCount(int64_t arg_index) const { ObjectID TaskSpecification::ArgId(int64_t arg_index, int64_t id_index) const { auto message = flatbuffers::GetRoot(spec_.data()); const auto &object_ids = - object_ids_from_flatbuf(*message->args()->Get(arg_index)->object_ids()); + ids_from_flatbuf(*message->args()->Get(arg_index)->object_ids()); return object_ids[id_index]; } @@ -232,12 +232,12 @@ bool TaskSpecification::IsActorTask() const { return !ActorId().is_nil(); } ActorID TaskSpecification::ActorCreationId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->actor_creation_id()); + return from_flatbuf(*message->actor_creation_id()); } ObjectID TaskSpecification::ActorCreationDummyObjectId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return 
from_flatbuf(*message->actor_creation_dummy_object_id()); + return from_flatbuf(*message->actor_creation_dummy_object_id()); } int64_t TaskSpecification::MaxActorReconstructions() const { @@ -247,12 +247,12 @@ int64_t TaskSpecification::MaxActorReconstructions() const { ActorID TaskSpecification::ActorId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->actor_id()); + return from_flatbuf(*message->actor_id()); } ActorHandleID TaskSpecification::ActorHandleId() const { auto message = flatbuffers::GetRoot(spec_.data()); - return from_flatbuf(*message->actor_handle_id()); + return from_flatbuf(*message->actor_handle_id()); } int64_t TaskSpecification::ActorCounter() const { @@ -267,7 +267,7 @@ ObjectID TaskSpecification::ActorDummyObject() const { std::vector TaskSpecification::NewActorHandles() const { auto message = flatbuffers::GetRoot(spec_.data()); - return object_ids_from_flatbuf(*message->new_actor_handles()); + return ids_from_flatbuf(*message->new_actor_handles()); } } // namespace raylet diff --git a/src/ray/raylet/task_spec.h b/src/ray/raylet/task_spec.h index 11e93050b9d1..baa6165c9ede 100644 --- a/src/ray/raylet/task_spec.h +++ b/src/ray/raylet/task_spec.h @@ -96,7 +96,7 @@ class TaskSpecification { /// \param num_returns The number of values returned by the task. /// \param required_resources The task's resource demands. /// \param language The language of the worker that must execute the function. - TaskSpecification(const UniqueID &driver_id, const TaskID &parent_task_id, + TaskSpecification(const DriverID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, const std::vector> &task_arguments, int64_t num_returns, @@ -129,7 +129,7 @@ class TaskSpecification { /// \param language The language of the worker that must execute the function. /// \param function_descriptor The function descriptor. 
TaskSpecification( - const UniqueID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, + const DriverID &driver_id, const TaskID &parent_task_id, int64_t parent_counter, const ActorID &actor_creation_id, const ObjectID &actor_creation_dummy_object_id, int64_t max_actor_reconstructions, const ActorID &actor_id, const ActorHandleID &actor_handle_id, int64_t actor_counter, @@ -164,7 +164,7 @@ class TaskSpecification { // TODO(swang): Finalize and document these methods. TaskID TaskId() const; - UniqueID DriverId() const; + DriverID DriverId() const; TaskID ParentTaskId() const; int64_t ParentCounter() const; std::vector FunctionDescriptor() const; diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index 4a7f71ea81ea..c548fc924d67 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -75,7 +75,7 @@ static inline TaskSpecification ExampleTaskSpec( const ActorID actor_id = ActorID::nil(), const Language &language = Language::PYTHON) { std::vector function_descriptor(3); - return TaskSpecification(UniqueID::nil(), TaskID::nil(), 0, ActorID::nil(), + return TaskSpecification(DriverID::nil(), TaskID::nil(), 0, ActorID::nil(), ObjectID::nil(), 0, actor_id, ActorHandleID::nil(), 0, {}, {}, 0, {{}}, {{}}, language, function_descriptor); } diff --git a/thirdparty/scripts/build_ui.sh b/thirdparty/scripts/build_ui.sh deleted file mode 100755 index eeab6ca91cd6..000000000000 --- a/thirdparty/scripts/build_ui.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -set -x - -# Cause the script to exit if a single command fails. -set -e - -TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)/../ - -CATAPULT_COMMIT=18cd334755701cf0c3b90b7172126c686d2eb787 -CATAPULT_HOME=$TP_DIR/pkg/catapult -VULCANIZE_BIN=$CATAPULT_HOME/tracing/bin/vulcanize_trace_viewer - -CATAPULT_FILES=$TP_DIR/../build/src/catapult_files - -# This is where we will copy the files that need to be packaged with the wheels. 
-mkdir -p $CATAPULT_FILES - -if [[ "$INCLUDE_UI" == "0" ]]; then - # Let installation continue without building the UI. - exit 0 -fi - -if ! type python2 > /dev/null; then - echo "cannot properly set up UI without a python2 executable" - if [[ "$INCLUDE_UI" == "1" ]]; then - # Since the UI is explicitly supposed to be included, fail here. - exit 1 - else - # Let installation continue without building the UI. - exit 0 - fi -fi - -# Download catapult and use it to autogenerate some static html if it isn't -# already present. -if [[ ! -d $CATAPULT_HOME ]]; then - echo "setting up catapult" - # The git clone command seems to fail in Travis, so retry up to 20 times. - for COUNT in {1..20}; do - # Attempt to git clone catapult and break from the retry loop if it succeeds. - git clone -q https://github.com/ray-project/catapult.git $CATAPULT_HOME && break - # If none of the retries succeeded at getting boost, then fail. - if [[ $COUNT == 20 ]]; then - exit 1 - fi - done -fi - -REBUILD=off - -# Check out the appropriate commit from catapult. -pushd $CATAPULT_HOME -if [ "$CATAPULT_COMMIT" != `git rev-parse HEAD` ]; then - git fetch origin master - git checkout $CATAPULT_COMMIT - REBUILD=on -fi -popd - -# If the autogenerated catapult files aren't present, then generate them. -if [[ ! 
-f $CATAPULT_FILES/index.html || "$REBUILD" == "on" ]]; then - python2 $VULCANIZE_BIN --config chrome --output $CATAPULT_FILES/trace_viewer_full.html - cp $CATAPULT_HOME/tracing/bin/index.html $CATAPULT_FILES/index.html -fi diff --git a/thirdparty/scripts/setup.sh b/thirdparty/scripts/setup.sh index da283bd3b2bb..4327a668d84f 100755 --- a/thirdparty/scripts/setup.sh +++ b/thirdparty/scripts/setup.sh @@ -57,12 +57,6 @@ bash "$TP_SCRIPT_DIR/build_credis.sh" #RAY_BUILD_JAVA=$RAY_BUILD_JAVA \ #bash "$TP_SCRIPT_DIR/build_arrow.sh" $PYTHON_EXECUTABLE -############################################## -# catapult -############################################## -# Clone catapult and build the static HTML needed for the UI. -bash "$TP_SCRIPT_DIR/build_ui.sh" - ############################################## # rDSN (optional) ##############################################