-
Notifications
You must be signed in to change notification settings - Fork 7k
[Collective][PR 2/6] Driver program declarative interfaces #12874
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 50 commits
18ea0e2
bf1051c
3c5628a
0714c4a
20df179
5267df1
8ff63ad
88fbea1
1e66354
c41f046
912bd0f
5db388f
bd91da9
d971237
3f2f86b
135b9ec
03e49e7
ec02002
5588322
49e59a3
be40e84
0133c6a
cbeaafe
893142d
ec1c07a
8f15ba4
5b40ec3
c76a645
793830c
f8587df
d7e4aee
cd62a50
bdb90de
63973ec
e027891
4136fa9
ac603ad
a8f6898
a3aafba
af15ca5
9abf10f
6aad76d
82170bf
93567cf
d521fcb
5a71f2c
38daf7b
c5c414a
b0ab663
322c822
7c5f414
9018ccd
f50758d
1b3ba3b
918c905
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| import ray.worker | ||
| from ray.util.placement_group import ( | ||
| PlacementGroup, check_placement_group_index, get_current_placement_group) | ||
| from ray.util.collective_utils import collective_to_envs | ||
|
|
||
| from ray import ActorClassID, Language | ||
| from ray._raylet import PythonFunctionDescriptor | ||
|
|
@@ -424,7 +425,8 @@ def options(self, | |
| placement_group=None, | ||
| placement_group_bundle_index=-1, | ||
| placement_group_capture_child_tasks=None, | ||
| override_environment_variables=None): | ||
| override_environment_variables=None, | ||
| collective=None): | ||
| """Configures and overrides the actor instantiation parameters. | ||
|
|
||
| The arguments are the same as those that can be passed | ||
|
|
@@ -466,7 +468,8 @@ def remote(self, *args, **kwargs): | |
| placement_group_capture_child_tasks=( | ||
| placement_group_capture_child_tasks), | ||
| override_environment_variables=( | ||
| override_environment_variables)) | ||
| override_environment_variables), | ||
| collective=collective) | ||
|
|
||
| return ActorOptionWrapper() | ||
|
|
||
|
|
@@ -487,7 +490,8 @@ def _remote(self, | |
| placement_group=None, | ||
| placement_group_bundle_index=-1, | ||
| placement_group_capture_child_tasks=None, | ||
| override_environment_variables=None): | ||
| override_environment_variables=None, | ||
| collective=None): | ||
| """Create an actor. | ||
|
|
||
| This method allows more flexibility than the remote method because | ||
|
|
@@ -527,6 +531,7 @@ def _remote(self, | |
| override_environment_variables: Environment variables to override | ||
| and/or introduce for this actor. This is a dictionary mapping | ||
| variable names to their values. | ||
| collective: what colletive configuration to use | ||
|
|
||
| Returns: | ||
| A handle to the newly created actor. | ||
|
|
@@ -657,6 +662,11 @@ def _remote(self, | |
| function_signature = meta.method_meta.signatures["__init__"] | ||
| creation_args = signature.flatten_args(function_signature, args, | ||
| kwargs) | ||
|
|
||
| if collective: | ||
| override_environment_variables = collective_to_envs( | ||
| collective, override_environment_variables) | ||
|
||
|
|
||
| actor_id = worker.core_worker.create_actor( | ||
| meta.language, | ||
| meta.actor_creation_function_descriptor, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,10 @@ | ||
| from .collective import nccl_available, mpi_available, is_group_initialized, \ | ||
zhisbug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| init_collective_group, destroy_collective_group, get_rank, \ | ||
| get_world_size, allreduce, barrier | ||
| init_collective_group, declare_collective_group, \ | ||
| destroy_collective_group, get_rank, get_world_size, allreduce, barrier | ||
|
|
||
| __all__ = [ | ||
| "nccl_available", "mpi_available", "is_group_initialized", | ||
| "init_collective_group", "destroy_collective_group", "get_rank", | ||
| "get_world_size", "allreduce", "barrier" | ||
| "init_collective_group", "declare_collective_group", | ||
| "destroy_collective_group", "get_rank", "get_world_size", "allreduce", | ||
| "barrier" | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| """APIs exposed under the namespace ray.util.collective.""" | ||
| import logging | ||
| import os | ||
|
|
||
| import numpy as np | ||
| import ray | ||
|
|
@@ -150,6 +151,54 @@ def init_collective_group(world_size: int, | |
| _group_mgr.create_collective_group(backend, world_size, rank, group_name) | ||
|
|
||
|
|
||
| def declare_collective_group(actors, group_options): | ||
|
||
| """ | ||
| Declare a list of actors in a collective group with group options. | ||
|
|
||
| Note: This function should be called in a driver process. | ||
|
|
||
| Args: | ||
| actors (list): a list of actors to be set in a collective group. | ||
| group_options (dict): a dictionary that contains group_name(str), | ||
| world_size(int), rank(list of int, e.g. [0,1] | ||
| means the first actor is rank 0, and the second | ||
| actor is rank 1), backend(str). | ||
| """ | ||
zhisbug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| try: | ||
| group_name = group_options["group_name"] | ||
| world_size = group_options["world_size"] | ||
| rank = group_options["rank"] | ||
| backend = group_options["backend"] | ||
| except KeyError: | ||
| raise ValueError("group options incomplete.") | ||
|
|
||
| backend = types.Backend(backend) | ||
| _check_backend_availability(backend) | ||
|
|
||
| name = "info_" + group_name | ||
| try: | ||
| ray.get_actor(name) | ||
| raise RuntimeError("Trying to initialize a group twice.") | ||
| except ValueError: | ||
| pass | ||
|
|
||
| if len(rank) != len(actors): | ||
| raise RuntimeError("Each actor should correspond to one rank.") | ||
|
|
||
| if set(rank) != set(range(len(rank))): | ||
| raise RuntimeError("Rank must be a permutation from 0 to len-1.") | ||
|
|
||
| assert world_size > 0 | ||
| assert all(rank) >= 0 and all(rank) < world_size | ||
|
|
||
| from ray.util.collective.util import Info | ||
| # store the information into a NamedActor that can be accessed later/ | ||
| name = "info_" + group_name | ||
| actors_id = [a._ray_actor_id for a in actors] | ||
| info = Info.options(name=name, lifetime="detached").remote() | ||
| ray.wait([info.set_info.remote(actors_id, world_size, rank, backend)]) | ||
|
|
||
|
|
||
| def destroy_collective_group(group_name: str = "default") -> None: | ||
| """Destroy a collective group given its group name.""" | ||
| _check_inside_actor() | ||
|
|
@@ -231,9 +280,33 @@ def barrier(group_name): | |
| def _check_and_get_group(group_name): | ||
| """Check the existence and return the group handle.""" | ||
| _check_inside_actor() | ||
| global _group_mgr | ||
| if not is_group_initialized(group_name): | ||
| raise RuntimeError("The collective group '{}' is not " | ||
| "initialized in the process.".format(group_name)) | ||
| # try loading from remote info store | ||
| try: | ||
| # if the information is stored in an Info object, | ||
| # get and create the group. | ||
| name = "info_" + group_name | ||
| mgr = ray.get_actor(name=name) | ||
| ids, world_size, rank, backend = ray.get(mgr.get_info.remote()) | ||
| worker = ray.worker.global_worker | ||
| id_ = worker.core_worker.get_actor_id() | ||
| r = rank[ids.index(id_)] | ||
| _group_mgr.create_collective_group(backend, world_size, r, | ||
| group_name) | ||
| except ValueError: | ||
zhisbug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| # check if this group is initialized using options() | ||
| if "collective_group_name" in os.environ and \ | ||
| os.environ["collective_group_name"] == group_name: | ||
| rank = int(os.environ["collective_rank"]) | ||
| world_size = int(os.environ["collective_world_size"]) | ||
| backend = os.environ["collective_backend"] | ||
| _group_mgr.create_collective_group(backend, world_size, rank, | ||
| group_name) | ||
| else: | ||
| raise RuntimeError( | ||
| "The collective group '{}' is not " | ||
| "initialized in the process.".format(group_name)) | ||
zhisbug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| g = _group_mgr.get_group_by_name(group_name) | ||
| return g | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| import cupy as cp | ||
| import ray | ||
|
|
||
| import ray.util.collective as collective | ||
|
|
||
|
|
||
| @ray.remote(num_gpus=1) | ||
| class Worker: | ||
| def __init__(self): | ||
| self.send = cp.ones((4, ), dtype=cp.float32) | ||
|
|
||
| def compute(self): | ||
| collective.allreduce(self.send, "177") | ||
| return self.send | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| ray.init(num_gpus=2) | ||
|
|
||
| num_workers = 2 | ||
| workers = [] | ||
| for i in range(num_workers): | ||
| _options = { | ||
| "group_name": "177", | ||
| "world_size": 2, | ||
| "rank": i, | ||
| "backend": "nccl" | ||
| } | ||
| w = Worker.options(collective=_options).remote() | ||
| workers.append(w) | ||
| results = ray.get([w.compute.remote() for w in workers]) | ||
| print(results) | ||
| ray.shutdown() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| import cupy as cp | ||
| import ray | ||
|
|
||
| import ray.util.collective as collective | ||
|
|
||
|
|
||
| @ray.remote(num_gpus=1) | ||
| class Worker: | ||
| def __init__(self): | ||
| self.send = cp.ones((4, ), dtype=cp.float32) | ||
|
|
||
| def compute(self): | ||
| collective.allreduce(self.send, "177") | ||
| return self.send | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| ray.init(num_gpus=2) | ||
|
|
||
| num_workers = 2 | ||
| workers = [] | ||
| for i in range(num_workers): | ||
| w = Worker.remote() | ||
| workers.append(w) | ||
| _options = { | ||
| "group_name": "177", | ||
| "world_size": 2, | ||
| "rank": [0, 1], | ||
| "backend": "nccl" | ||
| } | ||
| collective.declare_collective_group(workers, _options) | ||
| results = ray.get([w.compute.remote() for w in workers]) | ||
| print(results) | ||
| ray.shutdown() |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -40,3 +40,29 @@ def get_id(self): | |
| logger.warning("The NCCL ID has not been " | ||
| "set yet for store {}.".format(self.name)) | ||
| return self.nccl_id | ||
|
|
||
|
|
||
| @ray.remote | ||
| class Info: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please rename? Also, this is just a makeshift kvstore right? Can we maybe use
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @zhisbug let's chat offline about possible alternatives here? this may seem harmless but I think it could easily be a source of issues later on.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's talk tomorrow. If the |
||
| """ | ||
| Store the group information created via `declare_collective_group`. | ||
zhisbug marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| Note: Should be used as a NamedActor. | ||
| """ | ||
|
|
||
| def __init__(self): | ||
| self.ids = None | ||
| self.world_size = -1 | ||
| self.rank = -1 | ||
| self.backend = None | ||
|
|
||
| def set_info(self, ids, world_size, rank, backend): | ||
| """Store collective information.""" | ||
| self.ids = ids | ||
| self.world_size = world_size | ||
| self.rank = rank | ||
| self.backend = backend | ||
|
|
||
| def get_info(self): | ||
| """Get previously stored collective information.""" | ||
| return self.ids, self.world_size, self.rank, self.backend | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| """Collections of collective util functions""" | ||
|
|
||
|
|
||
| def collective_to_envs(collective, envs): | ||
| """A helper method that get information from collective and add to envs. | ||
|
|
||
| Args: | ||
| collective (dict): collective information | ||
| envs (dict): os environment dict | ||
|
|
||
| Returns: | ||
| envs (dict): modified os environment dict | ||
| """ | ||
|
|
||
| if envs is not None: | ||
| assert not any(k in envs for k in [ | ||
| "collective_group_name", "collective_rank", | ||
| "collective_world_size", "collective_backend" | ||
| ]) | ||
|
|
||
| else: | ||
| envs = {} | ||
| envs["collective_group_name"] = str(collective["group_name"]) | ||
| envs["collective_rank"] = str(collective["rank"]) | ||
| envs["collective_world_size"] = str(collective["world_size"]) | ||
| envs["collective_backend"] = str(collective["backend"]) | ||
|
|
||
| return envs |
Uh oh!
There was an error while loading. Please reload this page.