This repository was archived by the owner on Nov 15, 2021. It is now read-only.
Merged
Changes from all commits
Commits
79 commits
351753a
[rllib] Remove dependency on TensorFlow (#4764)
ericl May 11, 2019
004440f
Dynamic Custom Resources - create and delete resources (#3742)
romilbhardwaj May 11, 2019
f3b8b90
Update tutorial link in doc (#4777)
May 12, 2019
69352e3
[rllib] Implement learn_on_batch() in torch policy graph
ericl May 13, 2019
62c949b
Fix `ray stop` by killing raylet before plasma (#4778)
jovany-wang May 13, 2019
1622fc2
Fatal check if object store dies (#4763)
stephanie-wang May 13, 2019
c5161a2
[rllib] fix clip by value issue as TF upgraded (#4697)
joneswong May 13, 2019
67af103
Merge with ray master
stefanpantic May 14, 2019
3bbafc7
[autoscaler] Fix submit (#4782)
richardliaw May 15, 2019
d6bf680
Merge with master
stefanpantic May 15, 2019
cb1a195
Queue tasks in the raylet in between async callbacks (#4766)
stephanie-wang May 15, 2019
643f62d
[Java][Bazel] Refine auto-generated pom files (#4780)
raulchen May 16, 2019
1490a98
Bump version to 0.7.0 (#4791)
devin-petersohn May 16, 2019
98dd033
[JAVA] setDefaultUncaughtExceptionHandler to log uncaught exception i…
May 16, 2019
9f2645d
[tune] Fix CLI test (#4801)
richardliaw May 16, 2019
ffd596d
Fix pom file generation (#4800)
raulchen May 17, 2019
7d5ef6d
[rllib] Support continuous action distributions in IMPALA/APPO (#4771)
ericl May 17, 2019
3807fb5
[rllib] TensorFlow 2 compatibility (#4802)
ericl May 17, 2019
84cf474
Change tagline in documentation and README. (#4807)
pcmoritz May 17, 2019
ffe61fc
[tune] Support non-arg submit (#4803)
richardliaw May 17, 2019
88b45a5
[autoscaler] rsync cluster (#4785)
richardliaw May 17, 2019
e20855c
[tune] Remove extra parsing functionality (#4804)
richardliaw May 17, 2019
dcd6d49
Fix Java worker log dir (#4781)
jovany-wang May 17, 2019
1ef9c07
[tune] Initial track integration (#4362)
noahgolmant May 17, 2019
6cb5b90
[rllib] [RFC] Dynamic definition of loss functions and modularization…
ericl May 18, 2019
04294d9
Merge remote-tracking branch 'remotes/main/master' into unstable
pimpke May 20, 2019
02583a8
[rllib] Rename PolicyGraph => Policy, move from evaluation/ to policy…
ericl May 20, 2019
081708b
[Java] Dynamic resource API in Java (#4824)
jovany-wang May 21, 2019
ac47d03
Merge with master
stefanpantic May 21, 2019
5391b61
Add default values for Wgym flags
stefanpantic May 21, 2019
87bb2e5
Fix import
stefanpantic May 21, 2019
259cdfa
Fix issue when starting `raylet_monitor` (#4829)
jovany-wang May 22, 2019
1a39fee
Refactor ID Serial 1: Separate ObjectID and TaskID from UniqueID (#4776)
guoyuhong May 22, 2019
2015085
Fix bug in which actor classes are not exported multiple times. (#4838)
robertnishihara May 23, 2019
ba6c595
Bump Ray master version to 0.8.0.dev0 (#4845)
devin-petersohn May 24, 2019
4e281ba
Add section to bump version of master branch and cleanup release docs…
devin-petersohn May 24, 2019
71f95e1
Fix import
stefanpantic May 24, 2019
be1850f
Merge branch 'unstable' of github.com:wingman-ai/ray into unstable
stefanpantic May 24, 2019
49fe894
Export remote functions when first used and also fix bug in which rem…
robertnishihara May 24, 2019
a7d01ab
Update wheel versions in documentation to 0.8.0.dev0 and 0.7.0. (#4847)
devin-petersohn May 24, 2019
0ce0ecb
[tune] Later expansion of local_dir (#4806)
richardliaw May 25, 2019
7237ea7
[rllib] [RFC] Deprecate Python 2 / RLlib (#4832)
ericl May 25, 2019
ea8d7b4
Fix a typo in kubernetes yaml (#4872)
ikedaosushi May 26, 2019
6703519
Move global state API out of global_state object. (#4857)
robertnishihara May 26, 2019
7a78e1e
Install bazel in autoscaler development configs. (#4874)
robertnishihara May 26, 2019
574e1c7
[tune] Fix up Ax Search and Examples (#4851)
richardliaw May 27, 2019
a45c61e
[rllib] Update concepts docs and add "Building Policies in Torch/Tens…
ericl May 27, 2019
d7be5a5
[rllib] Fix error getting kl when simple_optimizer: True in multi-age…
ericl May 28, 2019
fa0892f
Replace ReturnIds with NumReturns in TaskInfo to reduce the size (#4854)
guoyuhong May 28, 2019
64a01b2
Update deps commits of opencensus to support building with bzl 0.25.x…
jovany-wang May 28, 2019
0bcc589
Merge with master
stefanpantic May 28, 2019
64eb7b3
Upgrade arrow to latest master (#4858)
pcmoritz May 28, 2019
acee89b
[tune] Auto-init Ray + default SearchAlg (#4815)
richardliaw May 29, 2019
a218a14
Bump version from 0.8.0.dev0 to 0.7.1. (#4890)
robertnishihara May 29, 2019
2dd0beb
[rllib] Allow access to batches prior to postprocessing (#4871)
ericl May 30, 2019
3f4d37c
[rllib] Fix Multidiscrete support (#4869)
ericl May 30, 2019
b7c284a
Refactor redis callback handling (#4841)
jovany-wang May 30, 2019
2912a7c
Initial high-level code structure of CoreWorker. (#4875)
raulchen May 30, 2019
4e0be8b
Drop duplicated string format (#4897)
suquark May 30, 2019
1f0809e
Refactor ID Serial 2: change all ID functions to `CamelCase` (#4896)
May 31, 2019
0066d7c
Hotfix for change of from_random to FromRandom (#4909)
May 31, 2019
1c073e9
[rllib] Fix documentation on custom policies (#4910)
ericl Jun 1, 2019
9aa1cd6
[rllib] Allow Torch policies access to full action input dict in extr…
ericl Jun 1, 2019
88bab5d
[tune] Pretty print params json in logger.py (#4903)
hartikainen Jun 1, 2019
c2ade07
[sgd] Distributed Training via PyTorch (#4797)
pschafhalter Jun 2, 2019
665d081
[rllib] Rough port of DQN to build_tf_policy() pattern (#4823)
ericl Jun 2, 2019
d86ee8c
fetching objects in parallel in _get_arguments_for_execution (#4775)
ajgokhale Jun 2, 2019
99eae05
[tune] Disallow setting resources_per_trial when it is already config…
ericl Jun 2, 2019
7501ee5
[rllib] Rename PolicyEvaluator => RolloutWorker (#4820)
ericl Jun 2, 2019
084b221
Fix local cluster yaml (#4918)
richardliaw Jun 3, 2019
89722ff
[tune] Directional metrics for components (#4120) (#4915)
hershg Jun 3, 2019
b674c4a
[Core Worker] implement ObjectInterface and add test framework (#4899)
zhijunfu Jun 3, 2019
c2253d2
[tune] Make PBT Quantile fraction configurable (#4912)
timonbimon Jun 4, 2019
d106283
Better organize ray_common module (#4898)
raulchen Jun 5, 2019
649af18
Merge branches 'master' and 'unstable' of github.com:wingman-ai/ray i…
stefanpantic Jun 5, 2019
d7680ab
Merge with ray master
stefanpantic Jun 5, 2019
ffaae1c
Fix error
stefanpantic Jun 5, 2019
b2581c4
Merge branch 'master' of github.com:wingman-ai/ray into unstable
stefanpantic Jun 5, 2019
82b3972
Fix compute actions return value
stefanpantic Jun 6, 2019
68 changes: 52 additions & 16 deletions BUILD.bazel
Original file line number Diff line number Diff line change
@@ -77,6 +77,7 @@ cc_library(
"src/ray/raylet/mock_gcs_client.cc",
"src/ray/raylet/monitor_main.cc",
"src/ray/raylet/*_test.cc",
"src/ray/raylet/main.cc",
],
),
hdrs = glob([
@@ -105,6 +106,39 @@ cc_library(
],
)

cc_library(
name = "core_worker_lib",
srcs = glob(
[
"src/ray/core_worker/*.cc",
],
exclude = [
"src/ray/core_worker/*_test.cc",
],
),
hdrs = glob([
"src/ray/core_worker/*.h",
]),
copts = COPTS,
deps = [
":ray_common",
":ray_util",
":raylet_lib",
],
)

# This test is run by src/ray/test/run_core_worker_tests.sh
cc_binary(
name = "core_worker_test",
srcs = ["src/ray/core_worker/core_worker_test.cc"],
copts = COPTS,
deps = [
":core_worker_lib",
":gcs",
"@com_google_googletest//:gtest_main",
],
)

cc_test(
name = "lineage_cache_test",
srcs = ["src/ray/raylet/lineage_cache_test.cc"],
@@ -247,16 +281,13 @@ cc_library(
name = "ray_util",
srcs = glob(
[
"src/ray/*.cc",
"src/ray/util/*.cc",
],
exclude = [
"src/ray/util/logging_test.cc",
"src/ray/util/signal_test.cc",
"src/ray/util/*_test.cc",
],
),
hdrs = glob([
"src/ray/*.h",
"src/ray/util/*.h",
]),
copts = COPTS,
@@ -272,23 +303,28 @@ cc_library(

cc_library(
name = "ray_common",
srcs = [
"src/ray/common/client_connection.cc",
"src/ray/common/common_protocol.cc",
],
hdrs = [
"src/ray/common/client_connection.h",
"src/ray/common/common_protocol.h",
],
srcs = glob(
[
"src/ray/common/*.cc",
],
exclude = [
"src/ray/common/*_test.cc",
],
),
hdrs = glob(
[
"src/ray/common/*.h",
],
),
copts = COPTS,
includes = [
"src/ray/gcs/format",
],
deps = [
":gcs_fbs",
":node_manager_fbs",
":ray_util",
"@boost//:asio",
"@plasma//:plasma_client",
],
)

@@ -432,7 +468,7 @@ cc_binary(
srcs = [
"src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.h",
"src/ray/raylet/lib/java/org_ray_runtime_raylet_RayletClientImpl.cc",
"src/ray/id.h",
"src/ray/common/id.h",
"src/ray/raylet/raylet_client.h",
"src/ray/util/logging.h",
"@bazel_tools//tools/jdk:jni_header",
@@ -637,8 +673,8 @@ genrule(
cp -f $(location //:raylet) $$WORK_DIR/python/ray/core/src/ray/raylet/ &&
for f in $(locations //:python_gcs_fbs); do cp -f $$f $$WORK_DIR/python/ray/core/generated/; done &&
mkdir -p $$WORK_DIR/python/ray/core/generated/ray/protocol/ &&
for f in $(locations //:python_node_manager_fbs); do
cp -f $$f $$WORK_DIR/python/ray/core/generated/ray/protocol/;
for f in $(locations //:python_node_manager_fbs); do
cp -f $$f $$WORK_DIR/python/ray/core/generated/ray/protocol/;
done &&
echo $$WORK_DIR > $@
""",
6 changes: 6 additions & 0 deletions bazel/BUILD.plasma
@@ -25,11 +25,13 @@ cc_library(
name = "arrow",
srcs = [
"cpp/src/arrow/buffer.cc",
"cpp/src/arrow/io/interfaces.cc",
"cpp/src/arrow/memory_pool.cc",
"cpp/src/arrow/status.cc",
"cpp/src/arrow/util/io-util.cc",
"cpp/src/arrow/util/logging.cc",
"cpp/src/arrow/util/memory.cc",
"cpp/src/arrow/util/string_builder.cc",
"cpp/src/arrow/util/thread-pool.cc",
],
hdrs = [
@@ -42,6 +44,7 @@ cc_library(
"cpp/src/arrow/util/logging.h",
"cpp/src/arrow/util/macros.h",
"cpp/src/arrow/util/memory.h",
"cpp/src/arrow/util/stl.h",
"cpp/src/arrow/util/string_builder.h",
"cpp/src/arrow/util/string_view.h",
"cpp/src/arrow/util/thread-pool.h",
@@ -53,6 +56,9 @@ cc_library(
"cpp/src/arrow/vendored/xxhash/xxhash.h",
],
strip_include_prefix = "cpp/src",
deps = [
"@boost//:filesystem",
],
)

cc_library(
30 changes: 15 additions & 15 deletions bazel/ray_deps_setup.bzl
@@ -3,9 +3,9 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

def ray_deps_setup():
RULES_JVM_EXTERNAL_TAG = "1.2"

RULES_JVM_EXTERNAL_SHA = "e5c68b87f750309a79f59c2b69ead5c3221ffa54ff9496306937bfa1c9c8c86b"

http_archive(
name = "rules_jvm_external",
sha256 = RULES_JVM_EXTERNAL_SHA,
@@ -18,72 +18,72 @@ def ray_deps_setup():
strip_prefix = "bazel-common-f1115e0f777f08c3cdb115526c4e663005bec69b",
url = "https://github.com/google/bazel-common/archive/f1115e0f777f08c3cdb115526c4e663005bec69b.zip",
)

BAZEL_SKYLIB_TAG = "0.6.0"

http_archive(
name = "bazel_skylib",
strip_prefix = "bazel-skylib-%s" % BAZEL_SKYLIB_TAG,
url = "https://github.com/bazelbuild/bazel-skylib/archive/%s.tar.gz" % BAZEL_SKYLIB_TAG,
)

git_repository(
name = "com_github_checkstyle_java",
commit = "85f37871ca03b9d3fee63c69c8107f167e24e77b",
remote = "https://github.com/ruifangChen/checkstyle_java",
)

git_repository(
name = "com_github_nelhage_rules_boost",
commit = "5171b9724fbb39c5fdad37b9ca9b544e8858d8ac",
remote = "https://github.com/ray-project/rules_boost",
)

git_repository(
name = "com_github_google_flatbuffers",
commit = "63d51afd1196336a7d1f56a988091ef05deb1c62",
remote = "https://github.com/google/flatbuffers.git",
)

git_repository(
name = "com_google_googletest",
commit = "3306848f697568aacf4bcca330f6bdd5ce671899",
remote = "https://github.com/google/googletest",
)

git_repository(
name = "com_github_gflags_gflags",
remote = "https://github.com/gflags/gflags.git",
tag = "v2.2.2",
)

new_git_repository(
name = "com_github_google_glog",
build_file = "@//bazel:BUILD.glog",
commit = "5c576f78c49b28d89b23fbb1fc80f54c879ec02e",
remote = "https://github.com/google/glog",
)

new_git_repository(
name = "plasma",
build_file = "@//bazel:BUILD.plasma",
commit = "d00497b38be84fd77c40cbf77f3422f2a81c44f9",
commit = "9fcc12fc094b85ec2e3e9798bae5c8151d14df5e",
remote = "https://github.com/apache/arrow",
)

new_git_repository(
name = "cython",
build_file = "@//bazel:BUILD.cython",
commit = "49414dbc7ddc2ca2979d6dbe1e44714b10d72e7e",
remote = "https://github.com/cython/cython",
)

http_archive(
name = "io_opencensus_cpp",
strip_prefix = "opencensus-cpp-3aa11f20dd610cb8d2f7c62e58d1e69196aadf11",
urls = ["https://github.com/census-instrumentation/opencensus-cpp/archive/3aa11f20dd610cb8d2f7c62e58d1e69196aadf11.zip"],
)

# OpenCensus depends on Abseil so we have to explicitly pull it in.
# This is how diamond dependencies are prevented.
git_repository(
@@ -96,7 +96,7 @@ def ray_deps_setup():
http_archive(
name = "com_github_jupp0r_prometheus_cpp",
strip_prefix = "prometheus-cpp-master",

# TODO(qwang): We should use the repository of `jupp0r` here when this PR
# `https://github.com/jupp0r/prometheus-cpp/pull/225` getting merged.
urls = ["https://github.com/jovany-wang/prometheus-cpp/archive/master.zip"],
4 changes: 2 additions & 2 deletions build.sh
@@ -101,8 +101,8 @@ pushd "$BUILD_DIR"
# generated from https://github.com/ray-project/arrow-build from
# the commit listed in the command.
$PYTHON_EXECUTABLE -m pip install \
--target="$ROOT_DIR/python/ray/pyarrow_files" pyarrow==0.12.0.RAY \
--find-links https://s3-us-west-2.amazonaws.com/arrow-wheels/ca1fa51f0901f5a4298f0e4faea00f24e5dd7bb7/index.html
--target="$ROOT_DIR/python/ray/pyarrow_files" pyarrow==0.14.0.RAY \
--find-links https://s3-us-west-2.amazonaws.com/arrow-wheels/9f35817b35f9d0614a736a497d70de2cf07fed52/index.html
export PYTHON_BIN_PATH="$PYTHON_EXECUTABLE"

if [ "$RAY_BUILD_JAVA" == "YES" ]; then
23 changes: 1 addition & 22 deletions ci/jenkins_tests/run_multi_node_tests.sh
@@ -31,25 +31,4 @@ $SUPPRESS_OUTPUT docker run --rm --shm-size=60G --memory=60G $DOCKER_SHA \
######################## SGD TESTS #################################

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/test_sgd.py --num-iters=2 \
--batch-size=1 --strategy=simple

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/test_sgd.py --num-iters=2 \
--batch-size=1 --strategy=ps

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/test_save_and_restore.py --num-iters=2 \
--batch-size=1 --strategy=simple

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/test_save_and_restore.py --num-iters=2 \
--batch-size=1 --strategy=ps

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/mnist_example.py --num-iters=1 \
--num-workers=1 --devices-per-worker=1 --strategy=ps

$SUPPRESS_OUTPUT docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
python /ray/python/ray/experimental/sgd/mnist_example.py --num-iters=1 \
--num-workers=1 --devices-per-worker=1 --strategy=ps --tune
python -m pytest /ray/python/ray/experimental/sgd/tests
13 changes: 11 additions & 2 deletions ci/jenkins_tests/run_rllib_tests.sh
@@ -302,7 +302,7 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_checkpoint_restore.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_policy_evaluator.py
/ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_rollout_worker.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_nested_spaces.py
@@ -390,7 +390,16 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_loss.py --iters=2

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/policy_evaluator_custom_workflow.py
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_torch_policy.py --iters=2

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py

docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_metrics_and_callbacks.py --num-iters=2
3 changes: 2 additions & 1 deletion ci/long_running_tests/workloads/pbt.py
@@ -37,7 +37,8 @@

pbt = PopulationBasedTraining(
time_attr="training_iteration",
reward_attr="episode_reward_mean",
metric="episode_reward_mean",
mode="max",
perturbation_interval=10,
hyperparam_mutations={
"lr": [0.1, 0.01, 0.001, 0.0001],
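The ``pbt.py`` change above tracks a Tune API migration: the single ``reward_attr`` argument was split into ``metric`` (which result field to compare) and ``mode`` (``"max"`` or ``"min"``). A hypothetical shim, purely for illustration (``translate_reward_attr`` is not a real Tune helper), makes the mapping explicit:

```python
def translate_reward_attr(reward_attr, mode="max"):
    """Map the legacy single-argument form to the newer metric/mode pair.

    Under the old API, reward_attr named the result field to compare and
    maximization was implied; the new API names the field via metric and
    makes the optimization direction explicit via mode.
    """
    return {"metric": reward_attr, "mode": mode}

# The legacy call PopulationBasedTraining(..., reward_attr="episode_reward_mean")
# becomes PopulationBasedTraining(..., **translate_reward_attr("episode_reward_mean")).
kwargs = translate_reward_attr("episode_reward_mean")
print(kwargs)  # {'metric': 'episode_reward_mean', 'mode': 'max'}
```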
4 changes: 4 additions & 0 deletions doc/source/conf.py
@@ -53,6 +53,10 @@
"tensorflow.python",
"tensorflow.python.client",
"tensorflow.python.util",
"torch",
"torch.distributed",
"torch.nn",
"torch.utils.data",
]
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = mock.Mock()
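The ``conf.py`` change above extends Sphinx's mock-import list so the docs build can introspect PyTorch-dependent modules without PyTorch installed. A minimal standalone sketch of the same pattern (module names taken from the diff; the ``layer`` usage line is illustrative):

```python
import sys
from unittest import mock

# Replace heavy optional dependencies with mock modules before anything
# imports them; autodoc can then import code that uses torch even when
# PyTorch is absent from the docs-build environment.
MOCK_MODULES = [
    "torch",
    "torch.distributed",
    "torch.nn",
    "torch.utils.data",
]
for mod_name in MOCK_MODULES:
    sys.modules[mod_name] = mock.Mock()

# Importing the mocked package now succeeds; attribute access and calls
# simply yield further Mock objects instead of real PyTorch layers.
import torch.nn
layer = torch.nn.Linear(4, 2)  # a Mock standing in for a real layer
```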
48 changes: 48 additions & 0 deletions doc/source/distributed_training.rst
@@ -0,0 +1,48 @@
Distributed Training (Experimental)
===================================


Ray includes abstractions for distributed model training that integrate with
deep learning frameworks, such as PyTorch.

Ray Train is built on top of the Ray task and actor abstractions to provide
seamless integration into existing Ray applications.

PyTorch Interface
-----------------

To use Ray Train with PyTorch, pass model and data creator functions to the
``ray.experimental.sgd.pytorch.PyTorchTrainer`` class.
To drive the distributed training, ``trainer.train()`` can be called
repeatedly.

.. code-block:: python

model_creator = lambda config: YourPyTorchModel()
data_creator = lambda config: (YourTrainingSet(), YourValidationSet())

trainer = PyTorchTrainer(
model_creator,
data_creator,
optimizer_creator=utils.sgd_mse_optimizer,
config={"lr": 1e-4},
num_replicas=2,
resources_per_replica=Resources(num_gpus=1),
batch_size=16,
backend="auto")

for i in range(NUM_EPOCHS):
trainer.train()

Under the hood, Ray Train will create *replicas* of your model
(controlled by ``num_replicas``) which are each managed by a worker.
Multiple devices (e.g. GPUs) can be managed by each replica (controlled by ``resources_per_replica``),
which allows training of large models across multiple GPUs.
The ``PyTorchTrainer`` class coordinates the distributed computation and training to improve the model.
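The creator-function indirection matters because each replica constructs its own model and datasets locally on its worker, rather than receiving live objects serialized from the driver. A framework-free sketch of that contract (``build_replica`` and the stand-in creators are illustrative, not Ray APIs):

```python
def build_replica(model_creator, data_creator, config):
    # Each replica calls the creators locally with the shared config,
    # mirroring how the trainer defers construction to its workers.
    model = model_creator(config)
    train_set, validation_set = data_creator(config)
    return model, train_set, validation_set

model, train_set, validation_set = build_replica(
    lambda config: {"name": "toy_model", "lr": config["lr"]},  # stand-in model
    lambda config: (list(range(8)), list(range(2))),           # stand-in datasets
    {"lr": 1e-4},
)
```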

The full documentation for ``PyTorchTrainer`` is as follows:

.. autoclass:: ray.experimental.sgd.pytorch.PyTorchTrainer
:members:

.. automethod:: __init__