diff --git a/.style.yapf b/.style.yapf index 8572636c2268..8acfe3e7b8e7 100644 --- a/.style.yapf +++ b/.style.yapf @@ -14,9 +14,19 @@ allow_multiline_dictionary_keys=False # Allow lambdas to be formatted on more than one line. allow_multiline_lambdas=False +# Allow splits before the dictionary value. +allow_split_before_dict_value=True + +# Number of blank lines surrounding top-level function and class +# definitions. +blank_lines_around_top_level_definition=2 + # Insert a blank line before a class-level docstring. blank_line_before_class_docstring=False +# Insert a blank line before a module docstring. +blank_line_before_module_docstring=False + # Insert a blank line before a 'def' or 'class' immediately nested # within another 'def' or 'class'. For example: # @@ -42,10 +52,26 @@ blank_line_before_nested_class_or_def=False # 'key1': 'value1', # 'key2': 'value2', # }) -coalesce_brackets=True +coalesce_brackets=False # The column limit. -column_limit=79 +column_limit=80 + +# The style for continuation alignment. Possible values are: +# +# - SPACE: Use spaces for continuation alignment. This is default behavior. +# - FIXED: Use fixed number (CONTINUATION_INDENT_WIDTH) of columns +# (ie: CONTINUATION_INDENT_WIDTH/INDENT_WIDTH tabs) for continuation +# alignment. +# - LESS: Slightly left if cannot vertically align continuation lines with +# indent characters. +# - VALIGN-RIGHT: Vertically align continuation lines with indent +# characters. Slightly right (one more indent character) if cannot +# vertically align continuation lines with indent characters. +# +# For options FIXED, and VALIGN-RIGHT are only available when USE_TABS is +# enabled. +continuation_align_style=SPACE # Indent width used for line continuations. continuation_indent_width=4 @@ -66,7 +92,7 @@ continuation_indent_width=4 # start_ts=now()-timedelta(days=3), # end_ts=now(), # ) # <--- this bracket is dedented and on a separate line -dedent_closing_brackets=False +dedent_closing_brackets=True # Place each dictionary entry onto its own line. each_dict_entry_on_separate_line=True @@ -90,13 +116,13 @@ i18n_function_call= # 'key2': value1 + # value2, # } -indent_dictionary_value=True +indent_dictionary_value=False # The number of columns to use for indentation. indent_width=4 # Join short lines into one line. E.g., single line 'if' statements. -join_multiple_lines=True +join_multiple_lines=False # Do not include spaces around selected binary operators. For example: # @@ -106,7 +132,7 @@ join_multiple_lines=True # # 1 + 2*3 - 4/5 # -no_spaces_around_selected_binary_operators=set([]) +no_spaces_around_selected_binary_operators=set() # Use spaces around default or named assigns. spaces_around_default_or_named_assign=False @@ -123,12 +149,16 @@ space_between_ending_comma_and_closing_bracket=True # Split before arguments if the argument list is terminated by a # comma. -split_arguments_when_comma_terminated=False +split_arguments_when_comma_terminated=True # Set to True to prefer splitting before '&', '|' or '^' rather than # after. split_before_bitwise_operator=True +# Split before the closing bracket if a list or dict literal doesn't fit on +# a single line. +split_before_closing_bracket=True + # Split before a dictionary or set generator (comp_for). For example, note # the split before the 'for': # @@ -138,6 +168,10 @@ split_before_bitwise_operator=True # } split_before_dict_set_generator=True +# Split after the opening paren which surrounds an expression if it doesn't +# fit on a single line. 
+split_before_expression_after_opening_paren=True + # If an argument / parameter list is going to be split, then split before # the first argument. split_before_first_argument=False @@ -149,6 +183,22 @@ split_before_logical_operator=True # Split named assignments onto individual lines. split_before_named_assigns=True +# Set to True to split list comprehensions and generators that have +# non-trivial expressions and multiple clauses before each of these +# clauses. For example: +# +# result = [ +# a_long_var + 100 for a_long_var in xrange(1000) +# if a_long_var % 10] +# +# would reformat to something like: +# +# result = [ +# a_long_var + 100 +# for a_long_var in xrange(1000) +# if a_long_var % 10] +split_complex_comprehension=True + # The penalty for splitting right after the opening bracket. split_penalty_after_opening_bracket=30 @@ -162,8 +212,12 @@ split_penalty_before_if_expr=0 # operators. split_penalty_bitwise_operator=300 +# The penalty for splitting a list comprehension or generator +# expression. +split_penalty_comprehension=80 + # The penalty for characters over the column limit. -split_penalty_excess_character=4500 +split_penalty_excess_character=1000 # The penalty incurred by adding a line split to the unwrapped line. The # more line splits added the higher the penalty. @@ -187,3 +241,4 @@ split_penalty_logical_operator=300 # Use the Tab character for indentation. use_tabs=False + diff --git a/.travis/yapf.sh b/.travis/yapf.sh index b8af8656c040..894483af274b 100755 --- a/.travis/yapf.sh +++ b/.travis/yapf.sh @@ -1,27 +1,32 @@ #!/usr/bin/env bash # Cause the script to exit if a single command fails -set -e - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -pushd $ROOT_DIR/../test - find . -name '*.py' -type f -exec yapf --style=pep8 -i -r {} \; -popd - -pushd $ROOT_DIR/../python - find . -name '*.py' -type f -not -path './ray/dataframe/*' -not -path './ray/rllib/*' -not -path './ray/cloudpickle/*' -exec yapf --style=pep8 -i -r {} \; -popd - -CHANGED_FILES=(`git diff --name-only`) -if [ "$CHANGED_FILES" ]; then - echo 'Reformatted staged files. Please review and stage the changes.' - echo - echo 'Files updated:' - for file in ${CHANGED_FILES[@]}; do - echo " $file" - done - exit 1 -else - exit 0 +set -eo pipefail + +# this stops git rev-parse from failing if we run this from the .git directory +builtin cd "$(dirname "${BASH_SOURCE:-$0}")" + +ROOT="$(git rev-parse --show-toplevel)" +builtin cd "$ROOT" + +yapf \ + --style "$ROOT/.style.yapf" \ + --in-place --recursive --parallel \ + --exclude 'python/ray/dataframe/' \ + --exclude 'python/ray/rllib/' \ + --exclude 'python/ray/cloudpickle/' \ + -- \ + 'test/' 'python/' + +CHANGED_FILES=($(git diff --name-only)) + +if [[ "${#CHANGED_FILES[@]}" -gt 0 ]]; then + echo 'Reformatted staged files. Please review and stage the changes.' + echo 'Files updated:' + + for file in "${CHANGED_FILES[@]}"; do + echo "$file" + done + + exit 1 fi diff --git a/doc/source/api.rst b/doc/source/api.rst index 7d5f96d17be7..b61f86060f84 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -227,7 +227,7 @@ lists is equal to the list passed in to ``ray.wait`` (up to ordering). ready_ids, remaining_ids = ray.wait(results, num_returns=2) # Start 5 tasks with different durations. - results = [f.remote(i) for i in range(3)] + results = [f.remote(i) for i in range(5)] # Block until 4 of them have finished or 2.5 seconds pass. 
ready_ids, remaining_ids = ray.wait(results, num_returns=4, timeout=2500) diff --git a/doc/source/pbt.rst b/doc/source/pbt.rst index 2174956450c6..5a5a4858816e 100644 --- a/doc/source/pbt.rst +++ b/doc/source/pbt.rst @@ -17,7 +17,7 @@ Ray Tune's PBT scheduler can be plugged in on top of an existing grid or random time_attr='time_total_s', reward_attr='mean_accuracy', perturbation_interval=600.0, - hyperparameter_mutations={ + hyperparam_mutations={ "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5], "alpha": lambda: random.uniform(0.0, 1.0), ... diff --git a/doc/source/tune.rst b/doc/source/tune.rst index fda47491f489..d97612021a1a 100644 --- a/doc/source/tune.rst +++ b/doc/source/tune.rst @@ -26,7 +26,7 @@ Quick Start tune.run_experiments({ "my_experiment": { "run": "train_func", - "stop": {"mean_accuracy": 99} + "stop": {"mean_accuracy": 99}, "config": { "lr": tune.grid_search([0.2, 0.4, 0.6]), "momentum": tune.grid_search([0.1, 0.2]), diff --git a/examples/resnet/resnet_main.py b/examples/resnet/resnet_main.py index 4b42fbee93cf..5175b6de374b 100644 --- a/examples/resnet/resnet_main.py +++ b/examples/resnet/resnet_main.py @@ -230,7 +230,7 @@ def train(): # testing task with the current weights every 200 steps. acc = ray.get(acc_id) acc_id = test_actor.accuracy.remote(weight_id, step) - print("Step {0}: {1:.6f}".format(step - 200, acc)) + print("Step {}: {:.6f}".format(step - 200, acc)) except KeyboardInterrupt: pass diff --git a/python/ray/__init__.py b/python/ray/__init__.py index eee900c1d7dc..c45b81d5ec7a 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -46,7 +46,7 @@ e.args += (helpful_message, ) raise -from ray.local_scheduler import _config # noqa: E402 +from ray.local_scheduler import ObjectID, _config # noqa: E402 from ray.worker import (error_info, init, connect, disconnect, get, put, wait, remote, log_event, log_span, flush_log, get_gpu_ids, get_webui_url, @@ -68,7 +68,7 @@ "remote", "log_event", "log_span", "flush_log", "actor", "method", "get_gpu_ids", "get_webui_url", "register_custom_serializer", "SCRIPT_MODE", "WORKER_MODE", "PYTHON_MODE", "SILENT_MODE", "global_state", - "_config", "__version__" + "ObjectID", "_config", "__version__" ] import ctypes # noqa: E402 diff --git a/python/ray/actor.py b/python/ray/actor.py index 505dd07c6a53..1023e334d486 100644 --- a/python/ray/actor.py +++ b/python/ray/actor.py @@ -12,12 +12,13 @@ import ray.local_scheduler import ray.signature as signature import ray.worker -from ray.utils import (FunctionProperties, _random_string, is_cython, - push_error_to_driver) +from ray.utils import _random_string, is_cython, push_error_to_driver + +DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS = 1 def compute_actor_handle_id(actor_handle_id, num_forks): - """Deterministically comopute an actor handle ID. + """Deterministically compute an actor handle ID. A new actor handle ID is generated when it is forked from another actor handle. The new handle ID is computed as hash(old_handle_id || num_forks). @@ -28,14 +29,46 @@ def compute_actor_handle_id(actor_handle_id, num_forks): forked so far. Returns: - An object ID for the new actor handle. + An ID for the new actor handle. 
""" handle_id_hash = hashlib.sha1() handle_id_hash.update(actor_handle_id.id()) handle_id_hash.update(str(num_forks).encode("ascii")) handle_id = handle_id_hash.digest() assert len(handle_id) == 20 - return ray.local_scheduler.ObjectID(handle_id) + return ray.ObjectID(handle_id) + + +def compute_actor_handle_id_non_forked(actor_id, actor_handle_id, + current_task_id): + """Deterministically compute an actor handle ID in the non-forked case. + + This code path is used whenever an actor handle is pickled and unpickled + (for example, if a remote function closes over an actor handle). Then, + whenever the actor handle is used, a new actor handle ID will be generated + on the fly as a deterministic function of the actor ID, the previous actor + handle ID and the current task ID. + + TODO(rkn): It may be possible to cause problems by closing over multiple + actor handles in a remote function, which then get unpickled and give rise + to the same actor handle IDs. + + Args: + actor_id: The actor ID. + actor_handle_id: The original actor handle ID. + num_forks: The number of times the original actor handle has been + forked so far. + + Returns: + An ID for the new actor handle. + """ + handle_id_hash = hashlib.sha1() + handle_id_hash.update(actor_id.id()) + handle_id_hash.update(actor_handle_id.id()) + handle_id_hash.update(current_task_id.id()) + handle_id = handle_id_hash.digest() + assert len(handle_id) == 20 + return ray.ObjectID(handle_id) def compute_actor_creation_function_id(class_id): @@ -47,7 +80,7 @@ def compute_actor_creation_function_id(class_id): Returns: The function ID of the actor creation event. """ - return ray.local_scheduler.ObjectID(class_id) + return ray.ObjectID(class_id) def compute_actor_method_function_id(class_name, attr): @@ -61,11 +94,11 @@ def compute_actor_method_function_id(class_name, attr): Function ID corresponding to the method. """ function_id_hash = hashlib.sha1() - function_id_hash.update(class_name) + function_id_hash.update(class_name.encode("ascii")) function_id_hash.update(attr.encode("ascii")) function_id = function_id_hash.digest() assert len(function_id) == 20 - return ray.local_scheduler.ObjectID(function_id) + return ray.ObjectID(function_id) def set_actor_checkpoint(worker, actor_id, checkpoint_index, checkpoint, @@ -225,7 +258,7 @@ def actor_method_executor(dummy_return_id, actor, *args): return actor_method_executor -def fetch_and_register_actor(actor_class_key, resources, worker): +def fetch_and_register_actor(actor_class_key, worker): """Import an actor. This will be called by the worker's import thread when the worker receives @@ -234,25 +267,20 @@ def fetch_and_register_actor(actor_class_key, resources, worker): Args: actor_class_key: The key in Redis to use to fetch the actor. - resources: The resources required for this actor's lifetime. worker: The worker to use. 
""" actor_id_str = worker.actor_id (driver_id, class_id, class_name, module, pickled_class, - checkpoint_interval, actor_method_names, - actor_method_num_return_vals) = worker.redis_client.hmget( + checkpoint_interval, actor_method_names) = worker.redis_client.hmget( actor_class_key, [ "driver_id", "class_id", "class_name", "module", "class", - "checkpoint_interval", "actor_method_names", - "actor_method_num_return_vals" + "checkpoint_interval", "actor_method_names" ]) - actor_name = class_name.decode("ascii") + class_name = class_name.decode("ascii") module = module.decode("ascii") checkpoint_interval = int(checkpoint_interval) actor_method_names = json.loads(actor_method_names.decode("ascii")) - actor_method_num_return_vals = json.loads( - actor_method_num_return_vals.decode("ascii")) # Create a temporary actor with some temporary methods so that if the actor # fails to be unpickled, the temporary actor can be used (just to produce @@ -265,11 +293,8 @@ class TemporaryActor(object): def temporary_actor_method(*xs): raise Exception("The actor with name {} failed to be imported, and so " - "cannot execute this method".format(actor_name)) + "cannot execute this method".format(class_name)) - # Register the actor method signatures. - register_actor_signatures(worker, driver_id, class_id, class_name, - actor_method_names, actor_method_num_return_vals) # Register the actor method executors. for actor_method_name in actor_method_names: function_id = compute_actor_method_function_id(class_name, @@ -279,8 +304,11 @@ def temporary_actor_method(*xs): actor_method_name, temporary_actor_method, actor_imported=False) - worker.functions[driver_id][function_id] = (actor_method_name, - temporary_executor) + worker.function_execution_info[driver_id][function_id] = ( + ray.worker.FunctionExecutionInfo( + function=temporary_executor, + function_name=actor_method_name, + max_calls=0)) worker.num_task_executions[driver_id][function_id] = 0 try: @@ -315,63 +343,16 @@ def pred(x): class_name, actor_method_name).id() executor = make_actor_method_executor( worker, actor_method_name, actor_method, actor_imported=True) - worker.functions[driver_id][function_id] = (actor_method_name, - executor) + worker.function_execution_info[driver_id][function_id] = ( + ray.worker.FunctionExecutionInfo( + function=executor, + function_name=actor_method_name, + max_calls=0)) # We do not set worker.function_properties[driver_id][function_id] # because we currently do need the actor worker to submit new tasks # for the actor. -def register_actor_signatures(worker, - driver_id, - class_id, - class_name, - actor_method_names, - actor_method_num_return_vals, - actor_creation_resources=None, - actor_method_cpus=None): - """Register an actor's method signatures in the worker. - - Args: - worker: The worker to register the signatures on. - driver_id: The ID of the driver that this actor is associated with. - class_id: The ID of the actor class. - class_name: The name of the actor class. - actor_method_names: The names of the methods to register. - actor_method_num_return_vals: A list of the number of return values for - each of the actor's methods. - actor_creation_resources: The resources required by the actor creation - task. - actor_method_cpus: The number of CPUs required by each actor method. 
- """ - assert len(actor_method_names) == len(actor_method_num_return_vals) - for actor_method_name, num_return_vals in zip( - actor_method_names, actor_method_num_return_vals): - # TODO(rkn): When we create a second actor, we are probably overwriting - # the values from the first actor here. This may or may not be a - # problem. - function_id = compute_actor_method_function_id(class_name, - actor_method_name).id() - worker.function_properties[driver_id][function_id] = ( - # The extra return value is an actor dummy object. - # In the cases where actor_method_cpus is None, that value should - # never be used. - FunctionProperties( - num_return_vals=num_return_vals + 1, - resources={"CPU": actor_method_cpus}, - max_calls=0)) - - if actor_creation_resources is not None: - # Also register the actor creation task. - function_id = compute_actor_creation_function_id(class_id) - worker.function_properties[driver_id][function_id.id()] = ( - # The extra return value is an actor dummy object. - FunctionProperties( - num_return_vals=0 + 1, - resources=actor_creation_resources, - max_calls=0)) - - def publish_actor_class_to_key(key, actor_class_info, worker): """Push an actor class definition to Redis. @@ -392,17 +373,14 @@ def publish_actor_class_to_key(key, actor_class_info, worker): def export_actor_class(class_id, Class, actor_method_names, - actor_method_num_return_vals, checkpoint_interval, - worker): + checkpoint_interval, worker): key = b"ActorClass:" + class_id actor_class_info = { "class_name": Class.__name__, "module": Class.__module__, "class": pickle.dumps(Class), "checkpoint_interval": checkpoint_interval, - "actor_method_names": json.dumps(list(actor_method_names)), - "actor_method_num_return_vals": - json.dumps(actor_method_num_return_vals) + "actor_method_names": json.dumps(list(actor_method_names)) } if worker.mode is None: @@ -423,43 +401,6 @@ def export_actor_class(class_id, Class, actor_method_names, # https://github.com/ray-project/ray/issues/1146. -def export_actor(actor_id, class_id, class_name, actor_method_names, - actor_method_num_return_vals, actor_creation_resources, - actor_method_cpus, worker): - """Export an actor to redis. - - Args: - actor_id (common.ObjectID): The ID of the actor. - class_id (str): A random ID for the actor class. - class_name (str): The actor class name. - actor_method_names (list): A list of the names of this actor's methods. - actor_method_num_return_vals: A list of the number of return values for - each of the actor's methods. - actor_creation_resources: A dictionary mapping resource name to the - quantity of that resource required by the actor. - actor_method_cpus: The number of CPUs required by actor methods. - """ - ray.worker.check_main_thread() - if worker.mode is None: - raise Exception("Actors cannot be created before Ray has been " - "started. You can start Ray with 'ray.init()'.") - - driver_id = worker.task_driver_id.id() - register_actor_signatures( - worker, - driver_id, - class_id, - class_name, - actor_method_names, - actor_method_num_return_vals, - actor_creation_resources=actor_creation_resources, - actor_method_cpus=actor_method_cpus) - - args = [class_id] - function_id = compute_actor_creation_function_id(class_id) - return worker.submit_task(function_id, args, actor_creation_id=actor_id)[0] - - def method(*args, **kwargs): assert len(args) == 0 assert len(kwargs) == 1 @@ -476,9 +417,10 @@ def annotate_method(method): # Create objects to wrap method invocations. 
This is done so that we can # invoke methods with actor.method.remote() instead of actor.method(). class ActorMethod(object): - def __init__(self, actor, method_name): + def __init__(self, actor, method_name, num_return_vals): self._actor = actor self._method_name = method_name + self._num_return_vals = num_return_vals def __call__(self, *args, **kwargs): raise Exception("Actor methods cannot be called directly. Instead " @@ -487,385 +429,534 @@ def __call__(self, *args, **kwargs): self._method_name)) def remote(self, *args, **kwargs): + return self._submit(args, kwargs) + + def _submit(self, args, kwargs, num_return_vals=None): + if num_return_vals is None: + num_return_vals = self._num_return_vals + return self._actor._actor_method_call( self._method_name, args=args, kwargs=kwargs, + num_return_vals=num_return_vals, dependency=self._actor._ray_actor_cursor) -class ActorHandleWrapper(object): - """A wrapper for the contents of an ActorHandle. +class ActorClass(object): + """An actor class. - This is essentially just a dictionary, but it is used so that the recipient - can tell that an argument is an ActorHandle. - """ - - def __init__(self, actor_id, class_id, actor_handle_id, actor_cursor, - actor_counter, actor_method_names, - actor_method_num_return_vals, method_signatures, - checkpoint_interval, class_name, - actor_creation_dummy_object_id, actor_creation_resources, - actor_method_cpus): - # TODO(rkn): Some of these fields are probably not necessary. We should - # strip out the unnecessary fields to keep actor handles lightweight. - self.actor_id = actor_id - self.class_id = class_id - self.actor_handle_id = actor_handle_id - self.actor_cursor = actor_cursor - self.actor_counter = actor_counter - self.actor_method_names = actor_method_names - self.actor_method_num_return_vals = actor_method_num_return_vals - # TODO(swang): Fetch this information from Redis so that we don't have - # to fall back to pickle. - self.method_signatures = method_signatures - self.checkpoint_interval = checkpoint_interval - self.class_name = class_name - self.actor_creation_dummy_object_id = actor_creation_dummy_object_id - self.actor_creation_resources = actor_creation_resources - self.actor_method_cpus = actor_method_cpus - - -def wrap_actor_handle(actor_handle): - """Wrap the ActorHandle to store the fields. + This is a decorated class. It can be used to create actors. - Args: - actor_handle: The ActorHandle instance to wrap. - - Returns: - An ActorHandleWrapper instance that stores the ActorHandle's fields. + Attributes: + _modified_class: The original class that was decorated (with some + additional methods added like __ray_terminate__). + _class_id: The ID of this actor class. + _class_name: The name of this class. + _checkpoint_interval: The interval at which to checkpoint actor state. + _num_cpus: The default number of CPUs required by the actor creation + task. + _num_gpus: The default number of GPUs required by the actor creation + task. + _resources: The default resources required by the actor creation task. + _actor_method_cpus: The number of CPUs required by actor method tasks. + _exported: True if the actor class has been exported and false + otherwise. + _actor_methods: The actor methods. + _method_signatures: The signatures of the methods. + _actor_method_names: The names of the actor methods. + _actor_method_num_return_vals: The default number of return values for + each actor method. 
""" - wrapper = ActorHandleWrapper( - actor_handle._ray_actor_id, - actor_handle._ray_class_id, - compute_actor_handle_id(actor_handle._ray_actor_handle_id, - actor_handle._ray_actor_forks), - actor_handle._ray_actor_cursor, - 0, # Reset the actor counter. - actor_handle._ray_actor_method_names, - actor_handle._ray_actor_method_num_return_vals, - actor_handle._ray_method_signatures, - actor_handle._ray_checkpoint_interval, - actor_handle._ray_class_name, - actor_handle._ray_actor_creation_dummy_object_id, - actor_handle._ray_actor_creation_resources, - actor_handle._ray_actor_method_cpus) - actor_handle._ray_actor_forks += 1 - return wrapper - - -def unwrap_actor_handle(worker, wrapper): - """Make an ActorHandle from the stored fields. - Args: - worker: The worker that is unwrapping the actor handle. - wrapper: An ActorHandleWrapper instance to unwrap. + def __init__(self, modified_class, class_id, checkpoint_interval, num_cpus, + num_gpus, resources, actor_method_cpus): + self._modified_class = modified_class + self._class_id = class_id + self._class_name = modified_class.__name__ + self._checkpoint_interval = checkpoint_interval + self._num_cpus = num_cpus + self._num_gpus = num_gpus + self._resources = resources + self._actor_method_cpus = actor_method_cpus + self._exported = False + + # Get the actor methods of the given class. + def pred(x): + return (inspect.isfunction(x) or inspect.ismethod(x) + or is_cython(x)) - Returns: - The unwrapped ActorHandle instance. - """ - driver_id = worker.task_driver_id.id() - register_actor_signatures( - worker, driver_id, wrapper.class_id, wrapper.class_name, - wrapper.actor_method_names, wrapper.actor_method_num_return_vals, - wrapper.actor_creation_resources, wrapper.actor_method_cpus) - - actor_handle_class = make_actor_handle_class(wrapper.class_name) - actor_object = actor_handle_class.__new__(actor_handle_class) - actor_object._manual_init( - wrapper.actor_id, wrapper.class_id, wrapper.actor_handle_id, - wrapper.actor_cursor, wrapper.actor_counter, - wrapper.actor_method_names, wrapper.actor_method_num_return_vals, - wrapper.method_signatures, wrapper.checkpoint_interval, - wrapper.actor_creation_dummy_object_id, - wrapper.actor_creation_resources, wrapper.actor_method_cpus) - return actor_object - - -class ActorHandleParent(object): - """This is the parent class of all ActorHandle classes. - - This enables us to identify actor handles by checking if an object obj - satisfies isinstance(obj, ActorHandleParent). - """ - pass + self._actor_methods = inspect.getmembers( + self._modified_class, predicate=pred) + # Extract the signatures of each of the methods. This will be used + # to catch some errors if the methods are called with inappropriate + # arguments. + self._method_signatures = dict() + self._actor_method_num_return_vals = dict() + for method_name, method in self._actor_methods: + # Print a warning message if the method signature is not + # supported. We don't raise an exception because if the actor + # inherits from a class that has a method whose signature we + # don't support, there may not be much the user can do about it. + signature.check_signature_supported(method, warn=True) + self._method_signatures[method_name] = signature.extract_signature( + method, ignore_first=True) + + # Set the default number of return values for this method. 
+ if hasattr(method, "__ray_num_return_vals__"): + self._actor_method_num_return_vals[method_name] = ( + method.__ray_num_return_vals__) + else: + self._actor_method_num_return_vals[method_name] = ( + DEFAULT_ACTOR_METHOD_NUM_RETURN_VALS) + self._actor_method_names = [ + method_name for method_name, _ in self._actor_methods + ] -def make_actor_handle_class(class_name): - class ActorHandle(ActorHandleParent): - def __init__(self, *args, **kwargs): - raise Exception("Actor classes cannot be instantiated directly. " - "Instead of running '{}()', try '{}.remote()'." - .format(class_name, class_name)) + def __call__(self, *args, **kwargs): + raise Exception("Actors methods cannot be instantiated directly. " + "Instead of running '{}()', try '{}.remote()'.".format( + self._class_name, self._class_name)) - @classmethod - def remote(cls, *args, **kwargs): - raise NotImplementedError("The classmethod remote() can only be " - "called on the original Class.") - - def _manual_init(self, actor_id, class_id, actor_handle_id, - actor_cursor, actor_counter, actor_method_names, - actor_method_num_return_vals, method_signatures, - checkpoint_interval, actor_creation_dummy_object_id, - actor_creation_resources, actor_method_cpus): - self._ray_actor_id = actor_id - self._ray_class_id = class_id - self._ray_actor_handle_id = actor_handle_id - self._ray_actor_cursor = actor_cursor - self._ray_actor_counter = actor_counter - self._ray_actor_method_names = actor_method_names - self._ray_actor_method_num_return_vals = ( - actor_method_num_return_vals) - self._ray_method_signatures = method_signatures - self._ray_checkpoint_interval = checkpoint_interval - self._ray_class_name = class_name - self._ray_actor_forks = 0 - self._ray_actor_creation_dummy_object_id = ( - actor_creation_dummy_object_id) - self._ray_actor_creation_resources = actor_creation_resources - self._ray_actor_method_cpus = actor_method_cpus - - def _actor_method_call(self, - method_name, - args=None, - kwargs=None, - dependency=None): - """Method execution stub for an actor handle. - - This is the function that executes when - `actor.method_name.remote(*args, **kwargs)` is called. Instead of - executing locally, the method is packaged as a task and scheduled - to the remote actor instance. - - Args: - self: The local actor handle. - method_name: The name of the actor method to execute. - args: A list of arguments for the actor method. - kwargs: A dictionary of keyword arguments for the actor method. - dependency: The object ID that this method is dependent on. - Defaults to None, for no dependencies. Most tasks should - pass in the dummy object returned by the preceding task. - Some tasks, such as checkpoint and terminate methods, have - no dependencies. + def remote(self, *args, **kwargs): + """Create an actor. - Returns: - object_ids: A list of object IDs returned by the remote actor - method. - """ - ray.worker.check_connected() - ray.worker.check_main_thread() - function_signature = self._ray_method_signatures[method_name] - if args is None: - args = [] - if kwargs is None: - kwargs = {} - args = signature.extend_args(function_signature, args, kwargs) - - # Execute functions locally if Ray is run in PYTHON_MODE - # Copy args to prevent the function from mutating them. - if ray.worker.global_worker.mode == ray.PYTHON_MODE: - return getattr( - ray.worker.global_worker.actors[self._ray_actor_id], - method_name)(*copy.deepcopy(args)) - - # Add the execution dependency. 
- if dependency is None: - execution_dependencies = [] - else: - execution_dependencies = [dependency] + Args: + args: These arguments are forwarded directly to the actor + constructor. + kwargs: These arguments are forwarded directly to the actor + constructor. - is_actor_checkpoint_method = (method_name == "__ray_checkpoint__") + Returns: + A handle to the newly created actor. + """ + return self._submit(args=args, kwargs=kwargs) - function_id = compute_actor_method_function_id( - self._ray_class_name, method_name) - object_ids = ray.worker.global_worker.submit_task( - function_id, + def _submit(self, args, - actor_id=self._ray_actor_id, - actor_handle_id=self._ray_actor_handle_id, - actor_counter=self._ray_actor_counter, - is_actor_checkpoint_method=is_actor_checkpoint_method, - actor_creation_dummy_object_id=( - self._ray_actor_creation_dummy_object_id), - execution_dependencies=execution_dependencies) - # Update the actor counter and cursor to reflect the most recent - # invocation. - self._ray_actor_counter += 1 - self._ray_actor_cursor = object_ids.pop() - - # The last object returned is the dummy object that should be - # passed in to the next actor method. Do not return it to the user. - if len(object_ids) == 1: - return object_ids[0] - elif len(object_ids) > 1: - return object_ids - - # Make tab completion work. - def __dir__(self): - return self._ray_actor_method_names - - def __getattribute__(self, attr): - try: - # Check whether this is an actor method. - actor_method_names = object.__getattribute__( - self, "_ray_actor_method_names") - if attr in actor_method_names: - # We create the ActorMethod on the fly here so that the - # ActorHandle doesn't need a reference to the ActorMethod. - # The ActorMethod has a reference to the ActorHandle and - # this was causing cyclic references which were prevent - # object deallocation from behaving in a predictable - # manner. - actor_method_cls = ActorMethod - return actor_method_cls(self, attr) - except AttributeError: - pass - - # If the requested attribute is not a registered method, fall back - # to default __getattribute__. - return object.__getattribute__(self, attr) - - def __repr__(self): - return "Actor(" + self._ray_actor_id.hex() + ")" - - def __reduce__(self): - raise Exception("Actor objects cannot be pickled.") - - def __del__(self): - """Kill the worker that is running this actor.""" - # TODO(swang): Also clean up forked actor handles. - # Kill the worker if this is the original actor handle, created - # with Class.remote(). - if (ray.worker.global_worker.connected and - self._ray_actor_handle_id.id() == ray.worker.NIL_ACTOR_ID): - # TODO(rkn): Should we be passing in the actor cursor as a - # dependency here? - self._actor_method_call( - "__ray_terminate__", args=[self._ray_actor_id.id()]) - - return ActorHandle - - -def actor_handle_from_class(Class, class_id, actor_creation_resources, - checkpoint_interval, actor_method_cpus): - class_name = Class.__name__.encode("ascii") - actor_handle_class = make_actor_handle_class(class_name) - exported = [] - - class ActorHandle(actor_handle_class): - @classmethod - def remote(cls, *args, **kwargs): - if ray.worker.global_worker.mode is None: - raise Exception("Actors cannot be created before ray.init() " - "has been called.") - - actor_id = ray.local_scheduler.ObjectID(_random_string()) - # The ID for this instance of ActorHandle. These should be unique - # across instances with the same _ray_actor_id. 
- actor_handle_id = ray.local_scheduler.ObjectID( - ray.worker.NIL_ACTOR_ID) - # The actor cursor is a dummy object representing the most recent - # actor method invocation. For each subsequent method invocation, - # the current cursor should be added as a dependency, and then - # updated to reflect the new invocation. - actor_cursor = None - # The number of actor method invocations that we've called so far. - actor_counter = 0 - - # Get the actor methods of the given class. - def pred(x): - return (inspect.isfunction(x) or inspect.ismethod(x) - or is_cython(x)) - - actor_methods = inspect.getmembers(Class, predicate=pred) - # Extract the signatures of each of the methods. This will be used - # to catch some errors if the methods are called with inappropriate - # arguments. - method_signatures = dict() - for k, v in actor_methods: - # Print a warning message if the method signature is not - # supported. We don't raise an exception because if the actor - # inherits from a class that has a method whose signature we - # don't support, we there may not be much the user can do about - # it. - signature.check_signature_supported(v, warn=True) - method_signatures[k] = signature.extract_signature( - v, ignore_first=True) - - actor_method_names = [ - method_name for method_name, _ in actor_methods - ] - actor_method_num_return_vals = [] - for _, method in actor_methods: - if hasattr(method, "__ray_num_return_vals__"): - actor_method_num_return_vals.append( - method.__ray_num_return_vals__) - else: - actor_method_num_return_vals.append(1) - # Do not export the actor class or the actor if run in PYTHON_MODE - # Instead, instantiate the actor locally and add it to - # global_worker's dictionary - if ray.worker.global_worker.mode == ray.PYTHON_MODE: - ray.worker.global_worker.actors[actor_id] = ( - Class.__new__(Class)) - else: - # Export the actor. - if not exported: - export_actor_class(class_id, Class, actor_method_names, - actor_method_num_return_vals, - checkpoint_interval, - ray.worker.global_worker) - exported.append(0) - actor_cursor = export_actor( - actor_id, class_id, class_name, actor_method_names, - actor_method_num_return_vals, actor_creation_resources, - actor_method_cpus, ray.worker.global_worker) - # Increment the actor counter to account for the creation task. - actor_counter += 1 - - # Instantiate the actor handle. - actor_object = cls.__new__(cls) - actor_object._manual_init( - actor_id, class_id, actor_handle_id, actor_cursor, - actor_counter, actor_method_names, - actor_method_num_return_vals, method_signatures, - checkpoint_interval, actor_cursor, actor_creation_resources, - actor_method_cpus) - - # Call __init__ as a remote function. - if "__init__" in actor_object._ray_actor_method_names: - actor_object._actor_method_call( - "__init__", - args=args, - kwargs=kwargs, - dependency=actor_cursor) - else: - print("WARNING: this object has no __init__ method.") + kwargs, + num_cpus=None, + num_gpus=None, + resources=None): + """Create an actor. + + This method allows more flexibility than the remote method because + resource requirements can be specified and override the defaults in the + decorator. + + Args: + args: The arguments to forward to the actor constructor. + kwargs: The keyword arguments to forward to the actor constructor. + num_cpus: The number of CPUs required by the actor creation task. + num_gpus: The number of GPUs required by the actor creation task. + resources: The custom resources required by the actor creation + task. 
+ + Returns: + A handle to the newly created actor. + """ + worker = ray.worker.get_global_worker() + ray.worker.check_main_thread() + if worker.mode is None: + raise Exception("Actors cannot be created before ray.init() " + "has been called.") + + actor_id = ray.ObjectID(_random_string()) + # The actor cursor is a dummy object representing the most recent + # actor method invocation. For each subsequent method invocation, + # the current cursor should be added as a dependency, and then + # updated to reflect the new invocation. + actor_cursor = None + + # Do not export the actor class or the actor if run in PYTHON_MODE + # Instead, instantiate the actor locally and add it to the worker's + # dictionary + if worker.mode == ray.PYTHON_MODE: + worker.actors[actor_id] = self._modified_class.__new__( + self._modified_class) + else: + # Export the actor. + if not self._exported: + export_actor_class(self._class_id, self._modified_class, + self._actor_method_names, + self._checkpoint_interval, worker) + self._exported = True + + resources = ray.utils.resources_from_resource_arguments( + self._num_cpus, self._num_gpus, self._resources, num_cpus, + num_gpus, resources) + + creation_args = [self._class_id] + function_id = compute_actor_creation_function_id(self._class_id) + [actor_cursor] = worker.submit_task( + function_id, + creation_args, + actor_creation_id=actor_id, + num_return_vals=1, + resources=resources) + + # We initialize the actor counter at 1 to account for the actor + # creation task. + actor_counter = 1 + actor_handle = ActorHandle( + actor_id, self._class_name, actor_cursor, actor_counter, + self._actor_method_names, self._method_signatures, + self._actor_method_num_return_vals, actor_cursor, + self._actor_method_cpus, worker.task_driver_id) + + # Call __init__ as a remote function. + if "__init__" in actor_handle._ray_actor_method_names: + actor_handle.__init__.remote(*args, **kwargs) + else: + if len(args) != 0 or len(kwargs) != 0: + raise Exception("Arguments cannot be passed to the actor " + "constructor because this actor class has no " + "__init__ method.") + + return actor_handle + + @property + def class_id(self): + return self._class_id + + +class ActorHandle(object): + """A handle to an actor. + + The fields in this class are prefixed with _ray_ to hide them from the user + and to avoid collision with actor method names. + + An ActorHandle can be created in three ways. First, by calling .remote() on + an ActorClass. Second, by passing an actor handle into a task (forking the + ActorHandle). Third, by directly serializing the ActorHandle (e.g., with + cloudpickle). + + Attributes: + _ray_actor_id: The ID of the corresponding actor. + _ray_actor_handle_id: The ID of this handle. If this is the "original" + handle for an actor (as opposed to one created by passing another + handle into a task), then this ID must be NIL_ID. If this + ActorHandle was created by forking an existing ActorHandle, then + this ID must be computed deterministically via + compute_actor_handle_id. If this ActorHandle was created by an + out-of-band mechanism (e.g., pickling), then this must be None (in + this case, a new actor handle ID will be generated on the fly every + time a method is invoked). + _ray_actor_cursor: The actor cursor is a dummy object representing the + most recent actor method invocation. For each subsequent method + invocation, the current cursor should be added as a dependency, and + then updated to reflect the new invocation. 
+ _ray_actor_counter: The number of actor method invocations that we've + called so far. + _ray_actor_method_names: The names of the actor methods. + _ray_method_signatures: The signatures of the actor methods. + _ray_method_num_return_vals: The default number of return values for + each method. + _ray_class_name: The name of the actor class. + _ray_actor_forks: The number of times this handle has been forked. + _ray_actor_creation_dummy_object_id: The dummy object ID from the actor + creation task. + _ray_actor_method_cpus: The number of CPUs required by actor methods. + _ray_original_handle: True if this is the original actor handle for a + given actor. If this is true, then the actor will be destroyed when + this handle goes out of scope. + _ray_actor_driver_id: The driver ID of the job that created the actor + (it is possible that this ActorHandle exists on a driver with a + different driver ID). + _ray_previous_actor_handle_id: If this actor handle is not an original + handle, (e.g., it was created by forking or pickling), then + this is the ID of the handle that this handle was created from. + Otherwise, this is None. + """ - return actor_object + def __init__(self, + actor_id, + class_name, + actor_cursor, + actor_counter, + actor_method_names, + method_signatures, + method_num_return_vals, + actor_creation_dummy_object_id, + actor_method_cpus, + actor_driver_id, + actor_handle_id=None, + previous_actor_handle_id=None): + # False if this actor handle was created by forking or pickling. True + # if it was created by the _serialization_helper function. + self._ray_original_handle = previous_actor_handle_id is None + + self._ray_actor_id = actor_id + if self._ray_original_handle: + self._ray_actor_handle_id = ray.ObjectID( + ray.worker.NIL_ACTOR_HANDLE_ID) + else: + self._ray_actor_handle_id = actor_handle_id + self._ray_actor_cursor = actor_cursor + self._ray_actor_counter = actor_counter + self._ray_actor_method_names = actor_method_names + self._ray_method_signatures = method_signatures + self._ray_method_num_return_vals = method_num_return_vals + self._ray_class_name = class_name + self._ray_actor_forks = 0 + self._ray_actor_creation_dummy_object_id = ( + actor_creation_dummy_object_id) + self._ray_actor_method_cpus = actor_method_cpus + self._ray_actor_driver_id = actor_driver_id + self._ray_previous_actor_handle_id = previous_actor_handle_id + + def _actor_method_call(self, + method_name, + args=None, + kwargs=None, + num_return_vals=None, + dependency=None): + """Method execution stub for an actor handle. + + This is the function that executes when + `actor.method_name.remote(*args, **kwargs)` is called. Instead of + executing locally, the method is packaged as a task and scheduled + to the remote actor instance. + + Args: + method_name: The name of the actor method to execute. + args: A list of arguments for the actor method. + kwargs: A dictionary of keyword arguments for the actor method. + dependency: The object ID that this method is dependent on. + Defaults to None, for no dependencies. Most tasks should + pass in the dummy object returned by the preceding task. + Some tasks, such as checkpoint and terminate methods, have + no dependencies. + + Returns: + object_ids: A list of object IDs returned by the remote actor + method. 
+ """ + worker = ray.worker.get_global_worker() + + worker.check_connected() + ray.worker.check_main_thread() + + function_signature = self._ray_method_signatures[method_name] + if args is None: + args = [] + if kwargs is None: + kwargs = {} + args = signature.extend_args(function_signature, args, kwargs) + + # Execute functions locally if Ray is run in PYTHON_MODE + # Copy args to prevent the function from mutating them. + if worker.mode == ray.PYTHON_MODE: + return getattr(worker.actors[self._ray_actor_id], + method_name)(*copy.deepcopy(args)) + + # Add the execution dependency. + if dependency is None: + execution_dependencies = [] + else: + execution_dependencies = [dependency] - return ActorHandle + is_actor_checkpoint_method = (method_name == "__ray_checkpoint__") + if self._ray_actor_handle_id is None: + actor_handle_id = compute_actor_handle_id_non_forked( + self._ray_actor_id, self._ray_previous_actor_handle_id, + worker.current_task_id) + else: + actor_handle_id = self._ray_actor_handle_id + + function_id = compute_actor_method_function_id(self._ray_class_name, + method_name) + object_ids = worker.submit_task( + function_id, + args, + actor_id=self._ray_actor_id, + actor_handle_id=actor_handle_id, + actor_counter=self._ray_actor_counter, + is_actor_checkpoint_method=is_actor_checkpoint_method, + actor_creation_dummy_object_id=( + self._ray_actor_creation_dummy_object_id), + execution_dependencies=execution_dependencies, + # We add one for the dummy return ID. + num_return_vals=num_return_vals + 1, + resources={"CPU": self._ray_actor_method_cpus}, + driver_id=self._ray_actor_driver_id) + # Update the actor counter and cursor to reflect the most recent + # invocation. + self._ray_actor_counter += 1 + # The last object returned is the dummy object that should be + # passed in to the next actor method. Do not return it to the user. + self._ray_actor_cursor = object_ids.pop() + + if len(object_ids) == 1: + object_ids = object_ids[0] + elif len(object_ids) == 0: + object_ids = None + + return object_ids + + # Make tab completion work. + def __dir__(self): + return self._ray_actor_method_names + + def __getattribute__(self, attr): + try: + # Check whether this is an actor method. + actor_method_names = object.__getattribute__( + self, "_ray_actor_method_names") + if attr in actor_method_names: + # We create the ActorMethod on the fly here so that the + # ActorHandle doesn't need a reference to the ActorMethod. + # The ActorMethod has a reference to the ActorHandle and + # this was causing cyclic references which were prevent + # object deallocation from behaving in a predictable + # manner. + return ActorMethod(self, attr, + self._ray_method_num_return_vals[attr]) + except AttributeError: + pass + + # If the requested attribute is not a registered method, fall back + # to default __getattribute__. + return object.__getattribute__(self, attr) + + def __repr__(self): + return "Actor(" + self._ray_actor_id.hex() + ")" + + def __del__(self): + """Kill the worker that is running this actor.""" + # TODO(swang): Also clean up forked actor handles. + # Kill the worker if this is the original actor handle, created + # with Class.remote(). TODO(rkn): Even without passing handles around, + # this is not the right policy. the actor should be alive as long as + # there are ANY handles in scope in the process that created the actor, + # not just the first one. 
+ worker = ray.worker.get_global_worker() + if worker.connected and self._ray_original_handle: + # TODO(rkn): Should we be passing in the actor cursor as a + # dependency here? + self.__ray_terminate__.remote() + + @property + def _actor_id(self): + return self._ray_actor_id + + @property + def _actor_handle_id(self): + return self._ray_actor_handle_id + + def _serialization_helper(self, ray_forking): + """This is defined in order to make pickling work. + + Args: + ray_forking: True if this is being called because Ray is forking + the actor handle and false if it is being called by pickling. + + Returns: + A dictionary of the information needed to reconstruct the object. + """ + state = { + "actor_id": + self._ray_actor_id.id(), + "class_name": + self._ray_class_name, + "actor_forks": + self._ray_actor_forks, + "actor_cursor": + self._ray_actor_cursor.id(), + "actor_counter": + 0, # Reset the actor counter. + "actor_method_names": + self._ray_actor_method_names, + "method_signatures": + self._ray_method_signatures, + "method_num_return_vals": + self._ray_method_num_return_vals, + "actor_creation_dummy_object_id": + self._ray_actor_creation_dummy_object_id.id(), + "actor_method_cpus": + self._ray_actor_method_cpus, + "actor_driver_id": + self._ray_actor_driver_id.id(), + "previous_actor_handle_id": + self._ray_actor_handle_id.id(), + "ray_forking": + ray_forking + } + + if ray_forking: + self._ray_actor_forks += 1 + + return state + + def _deserialization_helper(self, state, ray_forking): + """This is defined in order to make pickling work. + + Args: + state: The serialized state of the actor handle. + ray_forking: True if this is being called because Ray is forking + the actor handle and false if it is being called by pickling. + """ + worker = ray.worker.get_global_worker() + worker.check_connected() + ray.worker.check_main_thread() + + if state["ray_forking"]: + actor_handle_id = compute_actor_handle_id( + ray.ObjectID(state["previous_actor_handle_id"]), + state["actor_forks"]) + else: + actor_handle_id = None + + # This is the driver ID of the driver that owns the actor, not + # necessarily the driver that owns this actor handle. + actor_driver_id = ray.ObjectID(state["actor_driver_id"]) + + self.__init__( + ray.ObjectID(state["actor_id"]), + state["class_name"], + ray.ObjectID(state["actor_cursor"]), + state["actor_counter"], + state["actor_method_names"], + state["method_signatures"], + state["method_num_return_vals"], + ray.ObjectID(state["actor_creation_dummy_object_id"]), + state["actor_method_cpus"], + actor_driver_id, + actor_handle_id=actor_handle_id, + previous_actor_handle_id=ray.ObjectID( + state["previous_actor_handle_id"])) + + def __getstate__(self): + """This code path is used by pickling but not by Ray forking.""" + return self._serialization_helper(False) + + def __setstate__(self, state): + """This code path is used by pickling but not by Ray forking.""" + return self._deserialization_helper(state, False) + + +def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus, + checkpoint_interval): + if checkpoint_interval is None: + checkpoint_interval = -1 -def make_actor(cls, resources, checkpoint_interval, actor_method_cpus): if checkpoint_interval == 0: raise Exception("checkpoint_interval must be greater than 0.") # Modify the class to have an additional method that will be used for # terminating the worker. 
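+    # __ray_terminate__ is submitted like any other actor task (see
+    # ActorHandle.__del__ above), so it runs on the actor's worker process
+    # and can shut that process down directly.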
class Class(cls): - def __ray_terminate__(self, actor_id): - # Record that this actor has been removed so that if this node - # dies later, the actor won't be recreated. Alternatively, we could - # remove the actor key from Redis here. - ray.worker.global_worker.redis_client.hset(b"Actor:" + actor_id, - "removed", True) - # Disconnect the worker from the local scheduler. The point of this - # is so that when the worker kills itself below, the local - # scheduler won't push an error message to the driver. - ray.worker.global_worker.local_scheduler_client.disconnect() - import os - os._exit(0) + def __ray_terminate__(self): + worker = ray.worker.get_global_worker() + if worker.mode != ray.PYTHON_MODE: + # Disconnect the worker from the local scheduler. The point of + # this is so that when the worker kills itself below, the local + # scheduler won't push an error message to the driver. + worker.local_scheduler_client.disconnect() + import os + os._exit(0) def __ray_save_checkpoint__(self): if hasattr(self, "__ray_save__"): @@ -903,7 +994,7 @@ def __ray_checkpoint__(self): # scheduler has seen. Handle IDs for which no task has yet reached # the local scheduler will not be included, and may not be runnable # on checkpoint resumption. - actor_id = ray.local_scheduler.ObjectID(worker.actor_id) + actor_id = ray.ObjectID(worker.actor_id) frontier = worker.local_scheduler_client.get_actor_frontier( actor_id) # Save the checkpoint in Redis. TODO(rkn): Checkpoints @@ -945,8 +1036,8 @@ def __ray_checkpoint_restore__(self): class_id = _random_string() - return actor_handle_from_class(Class, class_id, resources, - checkpoint_interval, actor_method_cpus) + return ActorClass(Class, class_id, checkpoint_interval, num_cpus, num_gpus, + resources, actor_method_cpus) ray.worker.global_worker.fetch_and_register_actor = fetch_and_register_actor diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index d5a5336f41b8..502b4c6c783e 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -142,6 +142,7 @@ def prune_active_ips(self, active_ips): def prune(mapping): unwanted = set(mapping) - active_ips for unwanted_key in unwanted: + print("Removed mapping", unwanted_key, mapping[unwanted_key]) del mapping[unwanted_key] if unwanted: print("Removed {} stale ip mappings: {} not in {}".format( @@ -454,9 +455,8 @@ def launch_new_node(self, count): TAG_RAY_NODE_STATUS: "Uninitialized", TAG_RAY_LAUNCH_CONFIG: self.launch_hash, }, count) - # TODO(ekl) be less conservative in this check - assert len(self.workers()) > num_before, \ - "Num nodes failed to increase after creating a new node" + if len(self.workers()) <= num_before: + print("Warning: Num nodes failed to increase after node creation") def workers(self): return self.provider.nodes(tag_filters={ diff --git a/python/ray/common/test/test.py b/python/ray/common/test/test.py index 5892d289fa73..4ae867ff9525 100644 --- a/python/ray/common/test/test.py +++ b/python/ray/common/test/test.py @@ -133,7 +133,7 @@ def test_hashability(self): x = random_object_id() y = random_object_id() {x: y} - set([x, y]) + {x, y} class TestTask(unittest.TestCase): diff --git a/python/ray/dataframe/__init__.py b/python/ray/dataframe/__init__.py index 7eea37f99f2a..6e91a3c6623e 100644 --- a/python/ray/dataframe/__init__.py +++ b/python/ray/dataframe/__init__.py @@ -3,7 +3,13 @@ from __future__ import print_function import pandas as pd -from pandas import (eval, Panel, date_range, MultiIndex) +# TODO: In the future 
`set_option` or similar needs to run on every node +# in order to keep all pandas instances across nodes consistent +from pandas import (eval, unique, value_counts, cut, to_numeric, factorize, + test, qcut, match, Panel, date_range, Index, MultiIndex, + CategoricalIndex, Series, bdate_range, DatetimeIndex, + Timedelta, Timestamp, to_timedelta, set_eng_float_format, + set_option, NaT) import threading pd_version = pd.__version__ @@ -29,16 +35,21 @@ def get_npartitions(): # We import these file after above two function # because they depend on npartitions. from .dataframe import DataFrame # noqa: 402 -from .series import Series # noqa: 402 from .io import (read_csv, read_parquet, read_json, read_html, # noqa: 402 read_clipboard, read_excel, read_hdf, read_feather, # noqa: 402 read_msgpack, read_stata, read_sas, read_pickle, # noqa: 402 read_sql) # noqa: 402 from .concat import concat # noqa: 402 +from .datetimes import to_datetime # noqa: 402 +from .reshape import get_dummies # noqa: 402 __all__ = [ "DataFrame", "Series", "read_csv", "read_parquet", "concat", "eval", - "Panel", "date_range", "MultiIndex" + "unique", "value_counts", "cut", "to_numeric", "factorize", "test", "qcut", + "match", "to_datetime", "get_dummies", "Panel", "date_range", "Index", + "MultiIndex", "Series", "bdate_range", "DatetimeIndex", "to_timedelta", + "set_eng_float_format", "set_option", "CategoricalIndex", "Timedelta", + "Timestamp", "NaT" ] try: diff --git a/python/ray/dataframe/dataframe.py b/python/ray/dataframe/dataframe.py index b96c4c836453..41924d801bd7 100644 --- a/python/ray/dataframe/dataframe.py +++ b/python/ray/dataframe/dataframe.py @@ -3,20 +3,23 @@ from __future__ import print_function import pandas as pd +import functools from pandas.api.types import is_scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.index import _ensure_index_from_sequences from pandas._libs import lib from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas import compat -from pandas.compat import lzip, string_types, cPickle as pkl +from pandas.compat import lzip, to_str, string_types, cPickle as pkl import pandas.core.common as com from pandas.core.dtypes.common import ( is_bool_dtype, is_list_like, is_numeric_dtype, - is_timedelta64_dtype) + is_timedelta64_dtype, + _get_dtype_from_object) from pandas.core.indexing import check_bool_indexer +from pandas.errors import MergeError import warnings import numpy as np @@ -37,7 +40,10 @@ _create_block_partitions, _inherit_docstrings, _reindex_helper, - _co_op_helper) + _co_op_helper, + _match_partitioning, + _concat_index, + _correct_column_dtypes) from . import get_npartitions from .index_metadata import _IndexMetadata @@ -47,7 +53,8 @@ class DataFrame(object): def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, col_partitions=None, row_partitions=None, - block_partitions=None, row_metadata=None, col_metadata=None): + block_partitions=None, row_metadata=None, col_metadata=None, + dtypes_cache=None): """Distributed DataFrame object backed by Pandas dataframes. 
Args: @@ -71,7 +78,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, col_metadata (_IndexMetadata): Metadata for the new dataframe's columns """ - self._row_metadata = self._col_metadata = None + self._dtypes_cache = dtypes_cache # Check type of data and use appropriate constructor if data is not None or (col_partitions is None and @@ -81,6 +88,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, pd_df = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy) + # Cache dtypes + self._dtypes_cache = pd_df.dtypes + # TODO convert _partition_pandas_dataframe to block partitioning. row_partitions = \ _partition_pandas_dataframe(pd_df, @@ -97,9 +107,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, else: # created this invariant to make sure we never have to go into the # partitions to get the columns - assert columns is not None, \ - "Columns not defined, must define columns for internal " \ - "DataFrame creations" + assert columns is not None or col_metadata is not None, \ + "Columns not defined, must define columns or col_metadata " \ + "for internal DataFrame creations" if block_partitions is not None: # put in numpy array here to make accesses easier since it's 2D @@ -109,18 +119,23 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if row_partitions is not None: axis = 0 partitions = row_partitions + axis_length = len(columns) if columns is not None else \ + len(col_metadata) elif col_partitions is not None: axis = 1 partitions = col_partitions - + axis_length = None + # All partitions will already have correct dtypes + self._dtypes_cache = [ + _deploy_func.remote(lambda df: df.dtypes, pd_df) + for pd_df in col_partitions + ] + + # TODO: write explicit tests for "short and wide" + # column partitions self._block_partitions = \ _create_block_partitions(partitions, axis=axis, - length=len(columns)) - - if row_metadata is not None: - self._row_metadata = row_metadata.copy() - if col_metadata is not None: - self._col_metadata = col_metadata.copy() + length=axis_length) # Sometimes we only get a single column or row, which is # problematic for building blocks from the partitions, so we @@ -133,13 +148,25 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, # Create the row and column index objects for using our partitioning. # If the objects haven't been inherited, then generate them - if self._row_metadata is None: + if row_metadata is not None: + self._row_metadata = row_metadata.copy() + if index is not None: + self.index = index + else: self._row_metadata = _IndexMetadata(self._block_partitions[:, 0], index=index, axis=0) - if self._col_metadata is None: + + if col_metadata is not None: + self._col_metadata = col_metadata.copy() + if columns is not None: + self.columns = columns + else: self._col_metadata = _IndexMetadata(self._block_partitions[0, :], index=columns, axis=1) + if self._dtypes_cache is None: + self._correct_dtypes() + def _get_row_partitions(self): return [_blocks_to_row.remote(*part) for part in self._block_partitions] @@ -336,7 +363,7 @@ def _arithmetic_helper(self, remote_func, axis, level=None): # We use the index to get the internal index. 
oid_series = [(oid_series[i], i) for i in range(len(oid_series))] - if len(oid_series) > 1: + if len(oid_series) > 0: for df, partition in oid_series: this_partition = \ self._col_metadata.partition_series(partition) @@ -403,6 +430,24 @@ def ftypes(self): result.index = self.columns return result + def _correct_dtypes(self): + """Corrects dtypes by concatenating column blocks and then splitting them + apart back into the original blocks. + + Also caches ObjectIDs for the dtypes of every column. + + Args: + block_partitions: arglist of column blocks. + """ + if self._block_partitions.shape[0] > 1: + self._block_partitions = np.array( + [_correct_column_dtypes._submit( + args=column, num_return_vals=len(column)) + for column in self._block_partitions.T]).T + + self._dtypes_cache = [_deploy_func.remote(lambda df: df.dtypes, pd_df) + for pd_df in self._block_partitions[0]] + @property def dtypes(self): """Get the dtypes for this DataFrame. @@ -410,12 +455,15 @@ def dtypes(self): Returns: The dtypes for this DataFrame. """ - # The dtypes are common across all partitions. - # The first partition will be enough. - result = ray.get(_deploy_func.remote(lambda df: df.dtypes, - self._row_partitions[0])) - result.index = self.columns - return result + assert self._dtypes_cache is not None + + if isinstance(self._dtypes_cache, list) and \ + isinstance(self._dtypes_cache[0], + ray.ObjectID): + self._dtypes_cache = pd.concat(ray.get(self._dtypes_cache)) + self._dtypes_cache.index = self.columns + + return self._dtypes_cache @property def empty(self): @@ -489,6 +537,7 @@ def _update_inplace(self, row_partitions=None, col_partitions=None, if block_partitions is not None: self._block_partitions = block_partitions + elif row_partitions is not None: self._row_partitions = row_partitions @@ -499,7 +548,7 @@ def _update_inplace(self, row_partitions=None, col_partitions=None, self._col_metadata = col_metadata else: assert columns is not None, \ - "Columns must be passed without col_metadata" + "If col_metadata is None, columns must be passed in" self._col_metadata = _IndexMetadata( self._block_partitions[0, :], index=columns, axis=1) if row_metadata is not None: @@ -509,6 +558,9 @@ def _update_inplace(self, row_partitions=None, col_partitions=None, self._row_metadata = _IndexMetadata( self._block_partitions[:, 0], index=index, axis=0) + # Update dtypes + self._correct_dtypes() + def add_prefix(self, prefix): """Add a prefix to each of the column names. @@ -518,7 +570,8 @@ def add_prefix(self, prefix): new_cols = self.columns.map(lambda x: str(prefix) + str(x)) return DataFrame(block_partitions=self._block_partitions, columns=new_cols, - index=self.index) + col_metadata=self._col_metadata, + row_metadata=self._row_metadata) def add_suffix(self, suffix): """Add a suffix to each of the column names. @@ -529,7 +582,8 @@ def add_suffix(self, suffix): new_cols = self.columns.map(lambda x: str(x) + str(suffix)) return DataFrame(block_partitions=self._block_partitions, columns=new_cols, - index=self.index) + col_metadata=self._col_metadata, + row_metadata=self._row_metadata) def applymap(self, func): """Apply a function to a DataFrame elementwise. @@ -546,8 +600,8 @@ def applymap(self, func): for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def copy(self, deep=True): """Creates a shallow copy of the DataFrame. 
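For orientation, a minimal usage sketch of the metadata-preserving paths touched above; the `ray.dataframe` import path is taken from this patch's `__init__.py`, and the comments describe the intended behaviour of the cached dtypes and reused `_row_metadata`/`_col_metadata` rather than verified output:

    import ray
    import ray.dataframe as rdf

    ray.init()

    df = rdf.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

    # dtypes are now served from the per-column cache built by
    # _correct_dtypes() instead of being recomputed from a row partition.
    print(df.dtypes)

    # add_prefix()/applymap() pass the existing row/column metadata through
    # to the new frame rather than rebuilding it from `columns`/`index`.
    prefixed = df.add_prefix("col_")
    squared = df.applymap(lambda x: x * x)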
@@ -557,7 +611,8 @@ def copy(self, deep=True): """ return DataFrame(block_partitions=self._block_partitions, columns=self.columns, - index=self.index) + index=self.index, + dtypes_cache=self.dtypes) def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs): @@ -579,6 +634,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, elif isinstance(by, compat.string_types): by = self.__getitem__(by).values.tolist() elif is_list_like(by): + if isinstance(by, pd.Series): + by = by.values.tolist() + mismatch = len(by) != len(self) if axis == 0 \ else len(by) != len(self.columns) @@ -591,7 +649,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, return DataFrameGroupBy(self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs) - def sum(self, axis=None, skipna=True, level=None, numeric_only=None): + def sum(self, axis=None, skipna=True, level=None, numeric_only=None, + min_count=1, **kwargs): """Perform a sum across the DataFrame. Args: @@ -603,7 +662,8 @@ def sum(self, axis=None, skipna=True, level=None, numeric_only=None): """ def remote_func(df): return df.sum(axis=axis, skipna=skipna, level=level, - numeric_only=numeric_only) + numeric_only=numeric_only, min_count=min_count, + **kwargs) return self._arithmetic_helper(remote_func, axis, level) @@ -659,8 +719,6 @@ def isna(self): lambda df: df.isna(), block) for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index, row_metadata=self._row_metadata, col_metadata=self._col_metadata) @@ -678,8 +736,8 @@ def isnull(self): for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def keys(self): """Get the info axis for the DataFrame. @@ -705,7 +763,8 @@ def transpose(self, *args, **kwargs): T = property(transpose) - def dropna(self, axis, how, thresh=None, subset=[], inplace=False): + def dropna(self, axis=0, how='any', thresh=None, subset=None, + inplace=False): """Create a new DataFrame from the removed NA values from this one. Args: @@ -723,7 +782,99 @@ def dropna(self, axis, how, thresh=None, subset=[], inplace=False): If inplace is set to True, returns None, otherwise returns a new DataFrame with the dropna applied. 
""" - raise NotImplementedError("Not yet") + inplace = validate_bool_kwarg(inplace, "inplace") + + if is_list_like(axis): + axis = [pd.DataFrame()._get_axis_number(ax) for ax in axis] + + result = self + # TODO(kunalgosar): this builds an intermediate dataframe, + # which does unnecessary computation + for ax in axis: + result = result.dropna( + axis=ax, how=how, thresh=thresh, subset=subset) + if not inplace: + return result + + self._update_inplace(block_partitions=result._block_partitions, + columns=result.columns, + index=result.index) + + return None + + axis = pd.DataFrame()._get_axis_number(axis) + + if how is not None and how not in ['any', 'all']: + raise ValueError('invalid how option: %s' % how) + if how is None and thresh is None: + raise TypeError('must specify how or thresh') + + indices = None + if subset is not None: + if axis == 1: + indices = self.index.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + else: + indices = self.columns.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + + def dropna_helper(df): + new_df = df.dropna(axis=axis, how=how, thresh=thresh, + subset=indices, inplace=False) + + if axis == 1: + new_index = new_df.columns + new_df.columns = pd.RangeIndex(0, len(new_df.columns)) + else: + new_index = new_df.index + new_df.reset_index(drop=True, inplace=True) + + return new_df, new_index + + parts = self._col_partitions if axis == 1 else self._row_partitions + result = [_deploy_func._submit(args=(dropna_helper, df), + num_return_vals=2) for df in parts] + new_parts, new_vals = [list(t) for t in zip(*result)] + + if axis == 1: + new_vals = [self._col_metadata.get_global_indices(i, vals) + for i, vals in enumerate(ray.get(new_vals))] + + # This flattens the 2d array to 1d + new_vals = [i for j in new_vals for i in j] + new_cols = self.columns[new_vals] + + if not inplace: + return DataFrame(col_partitions=new_parts, + columns=new_cols, + index=self.index) + + self._update_inplace(col_partitions=new_parts, + columns=new_cols, + index=self.index) + + else: + new_vals = [self._row_metadata.get_global_indices(i, vals) + for i, vals in enumerate(ray.get(new_vals))] + + # This flattens the 2d array to 1d + new_vals = [i for j in new_vals for i in j] + new_rows = self.index[new_vals] + + if not inplace: + return DataFrame(row_partitions=new_parts, + index=new_rows, + columns=self.columns) + + self._update_inplace(row_partitions=new_parts, + index=new_rows, + columns=self.columns) + + return None def add(self, other, axis='columns', level=None, fill_value=None): """Add this DataFrame to another or a scalar/list. 
@@ -776,16 +927,7 @@ def _aggregate(self, arg, *args, **kwargs): "To contribute to Pandas on Ray, please visit " "github.com/ray-project/ray.") elif is_list_like(arg): - from .concat import concat - - x = [self._aggregate(func, *args, **kwargs) - for func in arg] - - new_dfs = [x[i] if not isinstance(x[i], pd.Series) - else pd.DataFrame(x[i], columns=[arg[i]]).T - for i in range(len(x))] - - return concat(new_dfs) + return self.apply(arg, axis=_axis, args=args, **kwargs) elif callable(arg): self._callable_function(arg, _axis, *args, **kwargs) else: @@ -996,24 +1138,58 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, """ axis = pd.DataFrame()._get_axis_number(axis) - if is_list_like(func) and not all([isinstance(obj, str) - for obj in func]): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") - - if axis == 0 and is_list_like(func): - return self.aggregate(func, axis, *args, **kwds) if isinstance(func, compat.string_types): if axis == 1: kwds['axis'] = axis return getattr(self, func)(*args, **kwds) + elif isinstance(func, dict): + if axis == 1: + raise TypeError( + "(\"'dict' object is not callable\", " + "'occurred at index {0}'".format(self.index[0])) + if len(self.columns) != len(set(self.columns)): + warnings.warn( + 'duplicate column names not supported with apply().', + FutureWarning, stacklevel=2) + has_list = list in map(type, func.values()) + part_ind_tuples = [(self._col_metadata[key], key) for key in func] + + if has_list: + # if input dict has a list, the function to apply must wrap + # single functions in lists as well to get the desired output + # format + result = [_deploy_func.remote( + lambda df: df.iloc[:, ind].apply( + func[key] if is_list_like(func[key]) + else [func[key]]), + self._col_partitions[part]) + for (part, ind), key in part_ind_tuples] + return pd.concat(ray.get(result), axis=1) + else: + result = [_deploy_func.remote( + lambda df: df.iloc[:, ind].apply(func[key]), + self._col_partitions[part]) + for (part, ind), key in part_ind_tuples] + return pd.Series(ray.get(result), index=func.keys()) + + elif is_list_like(func): + if axis == 1: + raise TypeError( + "(\"'list' object is not callable\", " + "'occurred at index {0}'".format(self.index[0])) + # TODO: some checking on functions that return Series or Dataframe + new_cols = _map_partitions(lambda df: df.apply(func), + self._col_partitions) + + # resolve function names for the DataFrame index + new_index = [f_name if isinstance(f_name, compat.string_types) + else f_name.__name__ for f_name in func] + return DataFrame(col_partitions=new_cols, + columns=self.columns, + index=new_index, + col_metadata=self._col_metadata) elif callable(func): return self._callable_function(func, axis=axis, *args, **kwds) - else: - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") def as_blocks(self, copy=True): raise NotImplementedError( @@ -1021,9 +1197,17 @@ def as_blocks(self, copy=True): "github.com/ray-project/ray.") def as_matrix(self, columns=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Convert the frame to its Numpy-array representation. + + Args: + columns: If None, return all columns, otherwise, + returns specified columns. 
+ + Returns: + values: ndarray + """ + # TODO this is very inneficient, also see __array__ + return to_pandas(self).as_matrix(columns) def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): @@ -1042,9 +1226,42 @@ def assign(self, **kwargs): "github.com/ray-project/ray.") def astype(self, dtype, copy=True, errors='raise', **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + if isinstance(dtype, dict): + if (not set(dtype.keys()).issubset(set(self.columns)) and + errors == 'raise'): + raise KeyError( + "Only a column name can be used for the key in" + "a dtype mappings argument.") + columns = list(dtype.keys()) + col_idx = [(self.columns.get_loc(columns[i]), columns[i]) + if columns[i] in self.columns + else (columns[i], columns[i]) + for i in range(len(columns))] + new_dict = {} + for idx, key in col_idx: + new_dict[idx] = dtype[key] + new_rows = _map_partitions(lambda df, dt: df.astype(dtype=dt, + copy=True, + errors=errors, + **kwargs), + self._row_partitions, new_dict) + if copy: + return DataFrame(row_partitions=new_rows, + columns=self.columns, + index=self.index) + self._row_partitions = new_rows + else: + new_blocks = [_map_partitions(lambda d: d.astype(dtype=dtype, + copy=True, + errors=errors, + **kwargs), + block) + for block in self._block_partitions] + if copy: + return DataFrame(block_partitions=new_blocks, + columns=self.columns, + index=self.index) + self._block_partitions = new_blocks def at_time(self, time, asof=False): raise NotImplementedError( @@ -1173,13 +1390,13 @@ def _cumulative_helper(self, func, axis): if axis == 0: new_cols = _map_partitions(func, self._col_partitions) return DataFrame(col_partitions=new_cols, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) else: new_rows = _map_partitions(func, self._row_partitions) return DataFrame(row_partitions=new_rows, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def cummax(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative maximum across the DataFrame. @@ -1280,9 +1497,31 @@ def describe_helper(df): return result def diff(self, periods=1, axis=0): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Finds the difference between elements on the axis requested + + Args: + periods: Periods to shift for forming difference + axis: Take difference over rows or columns + + Returns: + DataFrame with the diff applied + """ + axis = pd.DataFrame()._get_axis_number(axis) + partitions = (self._col_partitions if + axis == 0 else self._row_partitions) + + result = _map_partitions(lambda df: + df.diff(axis=axis, periods=periods), + partitions) + + if (axis == 1): + return DataFrame(row_partitions=result, + columns=self.columns, + index=self.index) + if (axis == 0): + return DataFrame(col_partitions=result, + columns=self.columns, + index=self.index) def div(self, other, axis='columns', level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. 
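As a usage-level illustration of the `astype` and `diff` additions above, a sketch assuming pandas-equivalent semantics (the column names and values are arbitrary):

    import ray
    import ray.dataframe as rdf

    ray.init()

    df = rdf.DataFrame({"a": [1, 2, 4], "b": [10, 20, 40]})

    # A dict argument takes the per-column branch of astype(), which maps the
    # requested dtypes onto the internal positional column labels.
    as_float = df.astype({"a": "float64"})

    # axis=0 computes row-over-row differences on the column partitions.
    deltas = df.diff(periods=1, axis=0)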
@@ -1497,7 +1736,7 @@ def helper(df, index, other_series): # TODO: group series here into full df partitions to reduce # the number of remote calls to helper other_series = other_df.iloc[idx['index_within_partition']] - curr_index = self._row_metadata._coord_df.iloc[i] + curr_index = self._row_metadata._coord_df.loc[i] curr_df = self._row_partitions[int(curr_index['partition'])] results.append(_deploy_func.remote(helper, curr_df, @@ -1721,9 +1960,45 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return new_obj def filter(self, items=None, like=None, regex=None, axis=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Subset rows or columns based on their labels + + Args: + items (list): list of labels to subset + like (string): retain labels where `arg in label == True` + regex (string): retain labels matching regex input + axis: axis to filter on + + Returns: + A new dataframe with the filter applied. + """ + nkw = com._count_not_none(items, like, regex) + if nkw > 1: + raise TypeError('Keyword arguments `items`, `like`, or `regex` ' + 'are mutually exclusive') + if nkw == 0: + raise TypeError('Must pass either `items`, `like`, or `regex`') + + if axis is None: + axis = 'columns' # This is the default info axis for dataframes + + axis = pd.DataFrame()._get_axis_number(axis) + labels = self.columns if axis else self.index + + if items is not None: + bool_arr = labels.isin(items) + elif like is not None: + def f(x): + return like in to_str(x) + bool_arr = labels.map(f).tolist() + else: + def f(x): + return matcher.search(to_str(x)) is not None + matcher = re.compile(regex) + bool_arr = labels.map(f).tolist() + + if not axis: + return self[bool_arr] + return self[self.columns[bool_arr]] def first(self, offset): raise NotImplementedError( @@ -1869,7 +2144,7 @@ def head(self, n=5): index = self._row_metadata.index[:n] return DataFrame(col_partitions=new_dfs, - columns=self.columns, + col_metadata=self._col_metadata, index=index) def hist(self, data, column=None, by=None, grid=True, xlabelsize=None, @@ -2396,9 +2671,145 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Database style join, where common columns in "on" are merged. + + Args: + right: The DataFrame to merge against. + how: What type of join to use. + on: The common column name(s) to join on. If None, and left_on and + right_on are also None, will default to all commonly named + columns. + left_on: The column(s) on the left to use for the join. + right_on: The column(s) on the right to use for the join. + left_index: Use the index from the left as the join keys. + right_index: Use the index from the right as the join keys. + sort: Sort the join keys lexicographically in the result. + suffixes: Add this suffix to the common names not in the "on". + copy: Does nothing in our implementation + indicator: Adds a column named _merge to the DataFrame with + metadata from the merge about each row. + validate: Checks if merge is a specific type. 
+ + Returns: + A merged Dataframe + """ + + if not isinstance(right, DataFrame): + raise ValueError("can not merge DataFrame with instance of type " + "{}".format(type(right))) + + args = (how, on, left_on, right_on, left_index, right_index, sort, + suffixes, False, indicator, validate) + + left_cols = ray.put(self.columns) + right_cols = ray.put(right.columns) + + # This can be put in a remote function because we don't need it until + # the end, and the columns can be built asynchronously. This takes the + # columns defining off the critical path and speeds up the overall + # merge. + new_columns = _merge_columns.remote(left_cols, right_cols, *args) + + if on is not None: + if left_on is not None or right_on is not None: + raise MergeError("Can only pass argument \"on\" OR \"left_on\"" + " and \"right_on\", not a combination of " + "both.") + if not is_list_like(on): + on = [on] + + if next((True for key in on if key not in self), False) or \ + next((True for key in on if key not in right), False): + + missing_key = \ + next((str(key) for key in on if key not in self), "") + \ + next((str(key) for key in on if key not in right), "") + raise KeyError(missing_key) + + elif right_on is not None or right_index is True: + if left_on is None and left_index is False: + # Note: This is not the same error as pandas, but pandas throws + # a ValueError NoneType has no len(), and I don't think that + # helps enough. + raise TypeError("left_on must be specified or left_index must " + "be true if right_on is specified.") + + elif left_on is not None or left_index is True: + if right_on is None and right_index is False: + # Note: See note above about TypeError. + raise TypeError("right_on must be specified or right_index " + "must be true if right_on is specified.") + + if left_on is not None: + if not is_list_like(left_on): + left_on = [left_on] + + if next((True for key in left_on if key not in self), False): + raise KeyError(next(key for key in left_on + if key not in self)) + + if right_on is not None: + if not is_list_like(right_on): + right_on = [right_on] + + if next((True for key in right_on if key not in right), False): + raise KeyError(next(key for key in right_on + if key not in right)) + + # There's a small chance that our partitions are already perfect, but + # if it's not, we need to adjust them. We adjust the right against the + # left because the defaults of merge rely on the order of the left. We + # have to push the index down here, so if we're joining on the right's + # index we go ahead and push it down here too. + if not np.array_equal(self._row_metadata._lengths, + right._row_metadata._lengths) or right_index: + + repartitioned_right = np.array([_match_partitioning._submit( + args=(df, self._row_metadata._lengths, right.index), + num_return_vals=len(self._row_metadata._lengths)) + for df in right._col_partitions]).T + else: + repartitioned_right = right._block_partitions + + if not left_index and not right_index: + # Passing None to each call specifies that we don't care about the + # left's index for the join. + left_idx = itertools.repeat(None) + + # We only return the index if we need to update it, and that only + # happens when either left_index or right_index is True. We will + # use this value to add the return vals if we are getting an index + # back. + return_index = False + else: + # We build this to push the index down so that we can use it for + # the join. 
+ left_idx = \ + (v.index for k, v in + self._row_metadata._coord_df.copy().groupby('partition')) + return_index = True + + new_blocks = \ + np.array([_co_op_helper._submit( + args=tuple([lambda x, y: x.merge(y, *args), + left_cols, right_cols, + len(self._block_partitions.T), next(left_idx)] + + np.concatenate(obj).tolist()), + num_return_vals=len(self._block_partitions.T) + return_index) + for obj in zip(self._block_partitions, + repartitioned_right)]) + + if not return_index: + # Default to RangeIndex if left_index and right_index both false. + new_index = None + else: + new_index_parts = new_blocks[:, -1] + new_index = _concat_index.remote(*new_index_parts) + new_blocks = new_blocks[:, :-1] + + return DataFrame(block_partitions=new_blocks, + columns=new_columns, + index=new_index) def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): @@ -2433,9 +2844,44 @@ def mod(self, other, axis='columns', level=None, fill_value=None): fill_value) def mode(self, axis=0, numeric_only=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Perform mode across the DataFrame. + + Args: + axis (int): The axis to take the mode on. + numeric_only (bool): if True, only apply to numeric columns. + + Returns: + DataFrame: The mode of the DataFrame. + """ + axis = pd.DataFrame()._get_axis_number(axis) + + def mode_helper(df): + mode_df = df.mode(axis=axis, numeric_only=numeric_only) + return mode_df, mode_df.shape[axis] + + def fix_length(df, *lengths): + max_len = max(lengths[0]) + df = df.reindex(pd.RangeIndex(max_len), axis=axis) + return df + + parts = self._col_partitions if axis == 0 else self._row_partitions + + result = [_deploy_func._submit(args=(lambda df: mode_helper(df), + part), num_return_vals=2) + for part in parts] + + parts, lengths = [list(t) for t in zip(*result)] + + parts = [_deploy_func.remote( + lambda df, *l: fix_length(df, l), part, *lengths) + for part in parts] + + if axis == 0: + return DataFrame(col_partitions=parts, + columns=self.columns) + else: + return DataFrame(row_partitions=parts, + index=self.index) def mul(self, other, axis='columns', level=None, fill_value=None): """Multiplies this DataFrame against another DataFrame/Series/scalar. @@ -2498,8 +2944,8 @@ def notna(self): lambda df: df.notna(), block) for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def notnull(self): """Perform notnull across the DataFrame. @@ -2516,8 +2962,8 @@ def notnull(self): for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def nsmallest(self, n, columns, keep='first'): raise NotImplementedError( @@ -2525,9 +2971,20 @@ def nsmallest(self, n, columns, keep='first'): "github.com/ray-project/ray.") def nunique(self, axis=0, dropna=True): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Return Series with number of distinct + observations over requested axis. 
+ + Args: + axis : {0 or 'index', 1 or 'columns'}, default 0 + dropna : boolean, default True + + Returns: + nunique : Series + """ + def remote_func(df): + return df.nunique(axis=axis, dropna=dropna) + + return self._arithmetic_helper(remote_func, axis) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs): @@ -2536,9 +2993,17 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, "github.com/ray-project/ray.") def pipe(self, func, *args, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Apply func(self, *args, **kwargs) + + Args: + func: function to apply to the df. + args: positional arguments passed into ``func``. + kwargs: a dictionary of keyword arguments passed into ``func``. + + Returns: + object: the return type of ``func``. + """ + return com._pipe(self, func, *args, **kwargs) def pivot(self, index=None, columns=None, values=None): raise NotImplementedError( @@ -2593,16 +3058,43 @@ def pow(self, other, axis='columns', level=None, fill_value=None): fill_value) def prod(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=0, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + min_count=1, **kwargs): + """Return the product of the values for the requested axis + + Args: + axis : {index (0), columns (1)} + skipna : boolean, default True + level : int or level name, default None + numeric_only : boolean, default None + min_count : int, default 1 + + Returns: + prod : Series or DataFrame (if level specified) + """ + def remote_func(df): + return df.prod(axis=axis, skipna=skipna, level=level, + numeric_only=numeric_only, min_count=min_count, + **kwargs) + + return self._arithmetic_helper(remote_func, axis, level) def product(self, axis=None, skipna=None, level=None, numeric_only=None, - min_count=0, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + min_count=1, **kwargs): + """Return the product of the values for the requested axis + + Args: + axis : {index (0), columns (1)} + skipna : boolean, default True + level : int or level name, default None + numeric_only : boolean, default None + min_count : int, default 1 + + Returns: + product : Series or DataFrame (if level specified) + """ + return self.prod(axis=axis, skipna=skipna, level=level, + numeric_only=numeric_only, min_count=min_count, + **kwargs) def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation='linear'): @@ -2627,35 +3119,80 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, are the quantiles. """ - def quantile_helper(df, q, axis, numeric_only, interpolation): - try: + def check_bad_dtype(t): + return t == np.dtype('O') or is_timedelta64_dtype(t) + + if not numeric_only: + # check if there are any object columns + if all(check_bad_dtype(t) for t in self.dtypes): + raise TypeError("can't multiply sequence by non-int of type " + "'float'") + else: + if next((True for t in self.dtypes if check_bad_dtype(t)), + False): + dtype = next(t for t in self.dtypes if check_bad_dtype(t)) + raise ValueError("Cannot compare type '{}' with type '{}'" + .format(type(dtype), float)) + else: + # Normally pandas returns this near the end of the quantile, but we + # can't afford the overhead of running the entire operation before + # we error. 
+ if all(check_bad_dtype(t) for t in self.dtypes): + raise ValueError("need at least one array to concatenate") + + # check that all qs are between 0 and 1 + pd.DataFrame()._check_percentile(q) + + def quantile_helper(df, base_object): + """Quantile to be run inside each partitoin. + + Args: + df: The DataFrame composing the partition. + base_object: An empty pd.Series or pd.DataFrame depending on q. + + Returns: + A new Series or DataFrame depending on q. + """ + # This if call prevents ValueErrors with object only partitions + if (numeric_only and + all([dtype == np.dtype('O') or + is_timedelta64_dtype(dtype) + for dtype in df.dtypes])): + return base_object + else: return df.quantile(q=q, axis=axis, numeric_only=numeric_only, interpolation=interpolation) - except ValueError: - return pd.Series() + + axis = pd.DataFrame()._get_axis_number(axis) if isinstance(q, (pd.Series, np.ndarray, pd.Index, list)): - # In the case of a list, we build it one at a time. - # TODO Revisit for performance - quantiles = [] - for q_i in q: - def remote_func(df): - return quantile_helper(df, q=q_i, axis=axis, - numeric_only=numeric_only, - interpolation=interpolation) - - result = self._arithmetic_helper(remote_func, axis) - result.name = q_i - quantiles.append(result) - - return pd.concat(quantiles, axis=1).T - else: - def remote_func(df): - return quantile_helper(df, q=q, axis=axis, - numeric_only=numeric_only, - interpolation=interpolation) - result = self._arithmetic_helper(remote_func, axis) + q_index = pd.Float64Index(q) + + if axis == 0: + new_partitions = _map_partitions( + lambda df: quantile_helper(df, pd.DataFrame()), + self._col_partitions) + + # select only correct dtype columns + new_columns = self.dtypes[self.dtypes.apply( + lambda x: is_numeric_dtype(x))].index + + else: + new_partitions = _map_partitions( + lambda df: quantile_helper(df, pd.DataFrame()), + self._row_partitions) + new_columns = self.index + + return DataFrame(col_partitions=new_partitions, + index=q_index, + columns=new_columns) + + else: + # When q is a single float, we return a Series, so using + # arithmetic_helper works well here. + result = self._arithmetic_helper( + lambda df: quantile_helper(df, pd.Series()), axis) result.name = q return result @@ -2682,16 +3219,60 @@ def query_helper(df): if inplace: self._update_inplace(row_partitions=new_rows) else: - return DataFrame(row_partitions=new_rows, columns=self.columns) + return DataFrame(row_partitions=new_rows, + col_metadata=self._col_metadata) def radd(self, other, axis='columns', level=None, fill_value=None): return self.add(other, axis, level, fill_value) def rank(self, axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + + """ + Compute numerical data ranks (1 through n) along axis. + Equal values are assigned a rank that is the [method] of + the ranks of those values. + + Args: + axis (int): 0 or 'index' for row-wise, + 1 or 'columns' for column-wise + interpolation: {'average', 'min', 'max', 'first', 'dense'} + Specifies which method to use for equal vals + numeric_only (boolean) + Include only float, int, boolean data. 
+ na_option: {'keep', 'top', 'bottom'} + Specifies how to handle NA options + ascending (boolean): + Decedes ranking order + pct (boolean): + Computes percentage ranking of data + Returns: + A new DataFrame + """ + + def rank_helper(df): + return df.rank(axis=axis, method=method, + numeric_only=numeric_only, + na_option=na_option, + ascending=ascending, pct=pct) + + axis = pd.DataFrame()._get_axis_number(axis) + + if (axis == 1): + new_cols = self.dtypes[self.dtypes.apply( + lambda x: is_numeric_dtype(x))].index + result = _map_partitions(rank_helper, + self._row_partitions) + return DataFrame(row_partitions=result, + columns=new_cols, + index=self.index) + + if (axis == 0): + result = _map_partitions(rank_helper, + self._col_partitions) + return DataFrame(col_partitions=result, + columns=self.columns, + index=self.index) def rdiv(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( @@ -2939,8 +3520,8 @@ def round(self, decimals=0, *args, **kwargs): for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def rpow(self, other, axis='columns', level=None, fill_value=None): return self._single_df_op_helper( @@ -2966,7 +3547,7 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Default = 1 if frac = None. frac: Fraction of axis items to return. Cannot be used with n. replace: Sample with or without replacement. Default = False. - weights: Default ‘None’ results in equal probability weighting. + weights: Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will @@ -3126,9 +3707,42 @@ def select(self, crit, axis=0): "github.com/ray-project/ray.") def select_dtypes(self, include=None, exclude=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + # Validates arguments for whether both include and exclude are None or + # if they are disjoint. Also invalidates string dtypes. 
+ pd.DataFrame().select_dtypes(include, exclude) + + if include and not is_list_like(include): + include = [include] + elif not include: + include = [] + + if exclude and not is_list_like(exclude): + exclude = [exclude] + elif not exclude: + exclude = [] + + sel = tuple(map(set, (include, exclude))) + + include, exclude = map( + lambda x: set(map(_get_dtype_from_object, x)), sel) + + include_these = pd.Series(not bool(include), index=self.columns) + exclude_these = pd.Series(not bool(exclude), index=self.columns) + + def is_dtype_instance_mapper(column, dtype): + return column, functools.partial(issubclass, dtype.type) + + for column, f in itertools.starmap(is_dtype_instance_mapper, + self.dtypes.iteritems()): + if include: # checks for the case of empty include or exclude + include_these[column] = any(map(f, include)) + if exclude: + exclude_these[column] = not any(map(f, exclude)) + + dtype_indexer = include_these & exclude_these + indicate = [i for i in range(len(dtype_indexer.values)) + if not dtype_indexer.values[i]] + return self.drop(columns=self.columns[indicate], inplace=False) def sem(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs): @@ -3260,9 +3874,23 @@ def shift(self, periods=1, freq=None, axis=0): def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Return unbiased skew over requested axis Normalized by N-1 + + Args: + axis : {index (0), columns (1)} + skipna : boolean, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + numeric_only : boolean, default None + + Returns: + skew : Series or DataFrame (if level specified) + """ + def remote_func(df): + return df.skew(axis=axis, skipna=skipna, level=level, + numeric_only=numeric_only, **kwargs) + + return self._arithmetic_helper(remote_func, axis, level) def slice_shift(self, periods=1, axis=0): raise NotImplementedError( @@ -3272,15 +3900,175 @@ def slice_shift(self, periods=1, axis=0): def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Sort a DataFrame by one of the indices (columns or index). + + Args: + axis: The axis to sort over. + level: The MultiIndex level to sort over. + ascending: Ascending or descending + inplace: Whether or not to update this DataFrame inplace. + kind: How to perform the sort. + na_position: Where to position NA on the sort. + sort_remaining: On Multilevel Index sort based on all levels. + by: (Deprecated) argument to pass to sort_values. 
+ + Returns: + A sorted DataFrame + """ + if level is not None: + raise NotImplementedError("Multilevel index not yet implemented.") + + if by is not None: + warnings.warn("by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, stacklevel=2) + if level is not None: + raise ValueError("unable to simultaneously sort by and level") + return self.sort_values(by, axis=axis, ascending=ascending, + inplace=inplace) + + axis = pd.DataFrame()._get_axis_number(axis) + + args = (axis, level, ascending, False, kind, na_position, + sort_remaining) + + def _sort_helper(df, index, axis, *args): + if axis == 0: + df.index = index + else: + df.columns = index + + result = df.sort_index(*args) + df.reset_index(drop=True, inplace=True) + df.columns = pd.RangeIndex(len(df.columns)) + return result + + if axis == 0: + index = self.index + new_column_parts = _map_partitions( + lambda df: _sort_helper(df, index, axis, *args), + self._col_partitions) + + new_columns = self.columns + new_index = self.index.sort_values() + new_row_parts = None + else: + columns = self.columns + new_row_parts = _map_partitions( + lambda df: _sort_helper(df, columns, axis, *args), + self._row_partitions) + + new_columns = self.columns.sort_values() + new_index = self.index + new_column_parts = None + + if not inplace: + return DataFrame(col_partitions=new_column_parts, + row_partitions=new_row_parts, + index=new_index, + columns=new_columns) + else: + self._update_inplace(row_partitions=new_row_parts, + col_partitions=new_column_parts, + columns=new_columns, + index=new_index) def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Sorts by a column/row or list of columns/rows. + + Args: + by: A list of labels for the axis to sort over. + axis: The axis to sort. + ascending: Sort in ascending or descending order. + inplace: If true, do the operation inplace. + kind: How to sort. + na_position: Where to put np.nan values. + + Returns: + A sorted DataFrame. + """ + + axis = pd.DataFrame()._get_axis_number(axis) + + if not is_list_like(by): + by = [by] + + if axis == 0: + broadcast_value_dict = {str(col): self[col] for col in by} + broadcast_values = pd.DataFrame(broadcast_value_dict) + else: + broadcast_value_list = [to_pandas(self[row::len(self.index)]) + for row in by] + + index_builder = list(zip(broadcast_value_list, by)) + + for row, idx in index_builder: + row.index = [str(idx)] + + broadcast_values = pd.concat([row for row, idx in index_builder]) + + # We are converting the by to string here so that we don't have a + # collision with the RangeIndex on the inner frame. It is cheap and + # gaurantees that we sort by the correct column. + by = [str(col) for col in by] + + args = (by, axis, ascending, False, kind, na_position) + + def _sort_helper(df, broadcast_values, axis, *args): + """Sorts the data on a partition. + + Args: + df: The DataFrame to sort. + broadcast_values: The by DataFrame to use for the sort. + axis: The axis to sort over. + args: The args for the sort. + + Returns: + A new sorted DataFrame. 
+ """ + if axis == 0: + broadcast_values.index = df.index + names = broadcast_values.columns + else: + broadcast_values.columns = df.columns + names = broadcast_values.index + + return pd.concat([df, broadcast_values], axis=axis ^ 1, + copy=False).sort_values(*args)\ + .drop(names, axis=axis ^ 1) + + if axis == 0: + new_column_partitions = _map_partitions( + lambda df: _sort_helper(df, broadcast_values, axis, *args), + self._col_partitions) + + new_row_partitions = None + new_columns = self.columns + + # This is important because it allows us to get the axis that we + # aren't sorting over. We need the order of the columns/rows and + # this will provide that in the return value. + new_index = broadcast_values.sort_values(*args).index + else: + new_row_partitions = _map_partitions( + lambda df: _sort_helper(df, broadcast_values, axis, *args), + self._row_partitions) + + new_column_partitions = None + new_columns = broadcast_values.sort_values(*args).columns + new_index = self.index + + if inplace: + self._update_inplace(row_partitions=new_row_partitions, + col_partitions=new_column_partitions, + columns=new_columns, + index=new_index) + else: + return DataFrame(row_partitions=new_row_partitions, + col_partitions=new_column_partitions, + columns=new_columns, + index=new_index) def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, sort_remaining=True): @@ -3372,7 +4160,7 @@ def tail(self, n=5): index = self._row_metadata.index[-n:] return DataFrame(col_partitions=new_dfs, - columns=self.columns, + col_metadata=self._col_metadata, index=index) def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): @@ -3635,9 +4423,34 @@ def unstack(self, level=-1, fill_value=None): def update(self, other, join='left', overwrite=True, filter_func=None, raise_conflict=False): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Modify DataFrame in place using non-NA values from other. + + Args: + other: DataFrame, or object coercible into a DataFrame + join: {'left'}, default 'left' + overwrite: If True then overwrite values for common keys in frame + filter_func: Can choose to replace values other than NA. + raise_conflict: If True, will raise an error if the DataFrame and + other both contain data in the same place. + + Returns: + None + """ + if raise_conflict: + raise NotImplementedError( + "raise_conflict parameter not yet supported. " + "To contribute to Pandas on Ray, please visit " + "github.com/ray-project/ray.") + + if not isinstance(other, DataFrame): + other = DataFrame(other) + + def update_helper(x, y): + x.update(y, join, overwrite, filter_func, False) + return x + + self._inter_df_op_helper(update_helper, other, join, 0, None, + inplace=True) def var(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs): @@ -3659,9 +4472,105 @@ def remote_func(df): def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, errors='raise', try_cast=False, raise_on_error=None): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + """Replaces values not meeting condition with values in other. + + Args: + cond: A condition to be met, can be callable, array-like or a + DataFrame. + other: A value or DataFrame of values to use for setting this. + inplace: Whether or not to operate inplace. + axis: The axis to apply over. Only valid when a Series is passed + as other. + level: The MultiLevel index level to apply over. 
+ errors: Whether or not to raise errors. Does nothing in Pandas. + try_cast: Try to cast the result back to the input type. + raise_on_error: Whether to raise invalid datatypes (deprecated). + + Returns: + A new DataFrame with the replaced values. + """ + + inplace = validate_bool_kwarg(inplace, 'inplace') + + if isinstance(other, pd.Series) and axis is None: + raise ValueError("Must specify axis=0 or 1") + + if level is not None: + raise NotImplementedError("Multilevel Index not yet supported on " + "Pandas on Ray.") + + axis = pd.DataFrame()._get_axis_number(axis) if axis is not None else 0 + + cond = cond(self) if callable(cond) else cond + + if not isinstance(cond, DataFrame): + if not hasattr(cond, 'shape'): + cond = np.asanyarray(cond) + if cond.shape != self.shape: + raise ValueError("Array conditional must be same shape as " + "self") + cond = DataFrame(cond, index=self.index, columns=self.columns) + + zipped_partitions = self._copartition(cond, self.index) + args = (False, axis, level, errors, try_cast, raise_on_error) + + if isinstance(other, DataFrame): + other_zipped = (v for k, v in self._copartition(other, + self.index)) + + new_partitions = [_where_helper.remote(k, v, next(other_zipped), + self.columns, cond.columns, + other.columns, *args) + for k, v in zipped_partitions] + + # Series has to be treated specially because we're operating on row + # partitions from here on. + elif isinstance(other, pd.Series): + if axis == 0: + # Pandas determines which index to use based on axis. + other = other.reindex(self.index) + other.index = pd.RangeIndex(len(other)) + + # Since we're working on row partitions, we have to partition + # the Series based on the partitioning of self (since both + # self and cond are co-partitioned by self. + other_builder = [] + for length in self._row_metadata._lengths: + other_builder.append(other[:length]) + other = other[length:] + # Resetting the index here ensures that we apply each part + # to the correct row within the partitions. 
+ other.index = pd.RangeIndex(len(other)) + + other = (obj for obj in other_builder) + + new_partitions = [_where_helper.remote(k, v, next(other, + pd.Series()), + self.columns, + cond.columns, + None, *args) + for k, v in zipped_partitions] + else: + other = other.reindex(self.columns) + new_partitions = [_where_helper.remote(k, v, other, + self.columns, + cond.columns, + None, *args) + for k, v in zipped_partitions] + + else: + new_partitions = [_where_helper.remote(k, v, other, self.columns, + cond.columns, None, *args) + for k, v in zipped_partitions] + + if inplace: + self._update_inplace(row_partitions=new_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) + else: + return DataFrame(row_partitions=new_partitions, + row_metadata=self._row_metadata, + col_metadata=self._col_metadata) def xs(self, key, axis=0, level=None, drop_level=True): raise NotImplementedError( @@ -3734,9 +4643,9 @@ def _getitem_array(self, key): index=index) else: columns = self._col_metadata[key].index - - indices_for_rows = [self.columns.index(new_col) - for new_col in columns] + indices_for_rows = \ + [i for i, item in enumerate(self.columns) + if item in set(columns)] new_parts = [_deploy_func.remote( lambda df: df.__getitem__(indices_for_rows), @@ -3764,8 +4673,8 @@ def _getitem_slice(self, key): index = self.index[key] return DataFrame(col_partitions=new_cols, - index=index, - columns=self.columns) + col_metadata=self._col_metadata, + index=index) def __getattr__(self, key): """After regular attribute access, looks up the name in the columns @@ -3784,9 +4693,16 @@ def __getattr__(self, key): raise e def __setitem__(self, key, value): - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + if not isinstance(key, str): + raise NotImplementedError( + "To contribute to Pandas on Ray, please visit " + "github.com/ray-project/ray.") + if key not in self.columns: + self.insert(loc=len(self.columns), column=key, value=value) + else: + loc = self.columns.get_loc(key) + self.__delitem__(key) + self.insert(loc=loc, column=key, value=value) def __len__(self): """Gets the length of the dataframe. 
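A hedged sketch of the column assignment now supported by `__setitem__` above; string keys only, and `insert` is assumed to accept list-like values as in pandas:

    import ray
    import ray.dataframe as rdf

    ray.init()

    df = rdf.DataFrame({"a": [1, 2, 3]})

    # A new key is appended at the end via insert(loc=len(columns), ...).
    df["b"] = [4, 5, 6]

    # An existing key is deleted and re-inserted at its original position.
    df["a"] = [7, 8, 9]

    # Non-string keys still raise NotImplementedError in this patch.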
@@ -3854,8 +4770,8 @@ def __round__(self, decimals=0): "github.com/ray-project/ray.") def __array__(self, dtype=None): - # TODO: This is very inefficient and needs fix - return np.array(to_pandas(self)) + # TODO: This is very inefficient and needs fix, also see as_matrix + return to_pandas(self).__array__(dtype=dtype) def __array_wrap__(self, result, context=None): raise NotImplementedError( @@ -4073,8 +4989,8 @@ def __neg__(self): for block in self._block_partitions]) return DataFrame(block_partitions=new_block_partitions, - columns=self.columns, - index=self.index) + col_metadata=self._col_metadata, + row_metadata=self._row_metadata) def __sizeof__(self): raise NotImplementedError( @@ -4111,9 +5027,8 @@ def loc(self): We currently support: single label, list array, slice object We do not support: boolean array, callable """ - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + from .indexing import _Loc_Indexer + return _Loc_Indexer(self) @property def is_copy(self): @@ -4138,9 +5053,8 @@ def iloc(self): We currently support: single label, list array, slice object We do not support: boolean array, callable """ - raise NotImplementedError( - "To contribute to Pandas on Ray, please visit " - "github.com/ray-project/ray.") + from .indexing import _iLoc_Indexer + return _iLoc_Indexer(self) def _copartition(self, other, new_index): """Colocates the values of other with this for certain operations. @@ -4184,36 +5098,40 @@ def _operator_helper(self, func, other, axis, level, *args): if isinstance(other, DataFrame): return self._inter_df_op_helper( lambda x, y: func(x, y, axis, level, *args), - other, axis, level) + other, "outer", axis, level) else: return self._single_df_op_helper( lambda df: func(df, other, axis, level, *args), other, axis, level) - def _inter_df_op_helper(self, func, other, axis, level): + def _inter_df_op_helper(self, func, other, how, axis, level, + inplace=False): if level is not None: raise NotImplementedError("Mutlilevel index not yet supported " "in Pandas on Ray") axis = pd.DataFrame()._get_axis_number(axis) - # Adding two DataFrames causes an outer join. - if isinstance(other, DataFrame): - new_column_index = self.columns.join(other.columns, how="outer") - new_index = self.index.join(other.index, how="outer") - copartitions = self._copartition(other, new_index) - - new_blocks = \ - np.array([_co_op_helper._submit( - args=tuple([func, self.columns, other.columns, - len(part[0])] + - np.concatenate(part).tolist()), - num_return_vals=len(part[0])) - for part in copartitions]) + new_column_index = self.columns.join(other.columns, how=how) + new_index = self.index.join(other.index, how=how) + copartitions = self._copartition(other, new_index) + new_blocks = \ + np.array([_co_op_helper._submit( + args=tuple([func, self.columns, other.columns, + len(part[0]), None] + + np.concatenate(part).tolist()), + num_return_vals=len(part[0])) + for part in copartitions]) + + if not inplace: # TODO join the Index Metadata objects together for performance. 
return DataFrame(block_partitions=new_blocks, columns=new_column_index, index=new_index) + else: + self._update_inplace(block_partitions=new_blocks, + columns=new_column_index, + index=new_index) def _single_df_op_helper(self, func, other, axis, level): if level is not None: @@ -4260,3 +5178,44 @@ def _single_df_op_helper(self, func, other, axis, level): columns=new_column_index, col_metadata=new_col_metadata, row_metadata=new_row_metadata) + + +@ray.remote +def _merge_columns(left_columns, right_columns, *args): + """Merge two columns to get the correct column names and order. + + Args: + left_columns: The columns on the left side of the merge. + right_columns: The columns on the right side of the merge. + args: The arguments for the merge. + + Returns: + The columns for the merge operation. + """ + return pd.DataFrame(columns=left_columns, index=[0], dtype='uint8').merge( + pd.DataFrame(columns=right_columns, index=[0], dtype='uint8'), + *args).columns + + +@ray.remote +def _where_helper(left, cond, other, left_columns, cond_columns, + other_columns, *args): + + left = pd.concat(ray.get(left.tolist()), axis=1) + # We have to reset the index and columns here because we are coming + # from blocks and the axes are set according to the blocks. We have + # already correctly copartitioned everything, so there's no + # correctness problems with doing this. + left.reset_index(inplace=True, drop=True) + left.columns = left_columns + + cond = pd.concat(ray.get(cond.tolist()), axis=1) + cond.reset_index(inplace=True, drop=True) + cond.columns = cond_columns + + if isinstance(other, np.ndarray): + other = pd.concat(ray.get(other.tolist()), axis=1) + other.reset_index(inplace=True, drop=True) + other.columns = other_columns + + return left.where(cond, other, *args) diff --git a/python/ray/dataframe/datetimes.py b/python/ray/dataframe/datetimes.py new file mode 100644 index 000000000000..48a437cebc7a --- /dev/null +++ b/python/ray/dataframe/datetimes.py @@ -0,0 +1,64 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pandas +import ray + +from .dataframe import DataFrame +from .utils import _map_partitions + + +def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, + box=True, format=None, exact=True, unit=None, + infer_datetime_format=False, origin='unix'): + """Convert the arg to datetime format. If not Ray DataFrame, this falls + back on pandas. + + Args: + errors ('raise' or 'ignore'): If 'ignore', errors are silenced. + dayfirst (bool): Date format is passed in as day first. + yearfirst (bool): Date format is passed in as year first. + utc (bool): retuns a UTC DatetimeIndex if True. + box (bool): If True, returns a DatetimeIndex. + format (string): strftime to parse time, eg "%d/%m/%Y". + exact (bool): If True, require an exact format match. + unit (string, default 'ns'): unit of the arg. + infer_datetime_format (bool): Whether or not to infer the format. + origin (string): Define the reference date. 
+ + Returns: + Type depends on input: + + - list-like: DatetimeIndex + - Series: Series of datetime64 dtype + - scalar: Timestamp + """ + if not isinstance(arg, DataFrame): + return pandas.to_datetime(arg, errors=errors, dayfirst=dayfirst, + yearfirst=yearfirst, utc=utc, box=box, + format=format, exact=exact, unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin) + if errors == 'raise': + pandas.to_datetime(pandas.DataFrame(columns=arg.columns), + errors=errors, dayfirst=dayfirst, + yearfirst=yearfirst, utc=utc, box=box, + format=format, exact=exact, unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin) + + def datetime_helper(df, cols): + df.columns = cols + return pandas.to_datetime(df, errors=errors, dayfirst=dayfirst, + yearfirst=yearfirst, utc=utc, box=box, + format=format, exact=exact, unit=unit, + infer_datetime_format=infer_datetime_format, + origin=origin) + + datetime_series = _map_partitions(datetime_helper, arg._row_partitions, + arg.columns) + result = pandas.concat(ray.get(datetime_series), copy=False) + result.index = arg.index + + return result diff --git a/python/ray/dataframe/groupby.py b/python/ray/dataframe/groupby.py index 892bc8f74e19..733943fc956d 100644 --- a/python/ray/dataframe/groupby.py +++ b/python/ray/dataframe/groupby.py @@ -3,7 +3,6 @@ from __future__ import print_function import pandas.core.groupby -import numpy as np import pandas as pd from pandas.core.dtypes.common import is_list_like import ray @@ -34,7 +33,7 @@ def __init__(self, df, by, axis, level, as_index, sort, group_keys, self._index_grouped = pd.Series(self._columns, index=self._index)\ .groupby(by=by, sort=sort) - self._keys_and_values = [(k, np.array(v)) + self._keys_and_values = [(k, v) for k, v in self._index_grouped] self._grouped_partitions = \ @@ -44,7 +43,7 @@ def __init__(self, df, by, axis, level, as_index, sort, group_keys, as_index, sort, group_keys, - squeeze) + part, + squeeze) + tuple(part.tolist()), num_return_vals=len(self)) for part in partitions))) @@ -106,7 +105,7 @@ def tshift(self): @property def groups(self): - return dict([(k, pd.Index(v)) for k, v in self._keys_and_values]) + return {k: pd.Index(v) for k, v in self._keys_and_values} def min(self, **kwargs): return self._apply_agg_function(lambda df: df.min(**kwargs)) diff --git a/python/ray/dataframe/index_metadata.py b/python/ray/dataframe/index_metadata.py index 235809ec7a35..11c23d8855cc 100644 --- a/python/ray/dataframe/index_metadata.py +++ b/python/ray/dataframe/index_metadata.py @@ -3,148 +3,159 @@ import ray from .utils import ( - _build_index, - _build_columns) + _build_row_lengths, + _build_col_widths, + _build_coord_df) from pandas.core.indexing import convert_to_index_sliceable -class _IndexMetadataBase(object): +class _IndexMetadata(object): """Wrapper for Pandas indexes in Ray DataFrames. Handles all of the metadata specific to the axis of partition (setting indexes, - calculating the index within partition of a value, etc.) since the - dataframe may be partitioned across either axis. This way we can unify the - possible index operations over one axis-agnostic interface. - - This class is the abstract superclass for IndexMetadata and - WrappingIndexMetadata, which handle indexes along the partitioned and - non-partitioned axes, respectively. + calculating the index within partition of a value, etc.). This + implementation assumes the underlying index lies across multiple + partitions. IMPORTANT NOTE: Currently all operations, as implemented, are inplace. 
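
For intuition, the coordinate frame (coord_df) kept by this class conceptually maps each label to a (partition, index_within_partition) pair. An illustrative plain-pandas sketch of that mapping (the layout and names below are for exposition only, not the exact internal representation):

    import numpy as np
    import pandas as pd

    # Five row labels spread over two partitions of lengths 3 and 2.
    labels = ["a", "b", "c", "d", "e"]
    lengths = [3, 2]

    coord_df = pd.DataFrame({
        "partition": np.repeat(np.arange(len(lengths)), lengths),
        "index_within_partition": np.concatenate(
            [np.arange(n) for n in lengths]),
    }, index=labels)

    # Looking up a label yields its block coordinates.
    assert tuple(coord_df.loc["d"]) == (1, 0)
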
- """ - def _get__coord_df(self): - if isinstance(self._coord_df_cache, ray.local_scheduler.ObjectID): - self._coord_df_cache = ray.get(self._coord_df_cache) - return self._coord_df_cache - - def _set__coord_df(self, coord_df): - self._coord_df_cache = coord_df + WARNING: Currently, the `_lengths` item is the source of truth for an + _IndexMetadata object, since it is easy to manage, and that the coord_df + item may be deprecated in the future. As such, it is _very_ important that + any functions that mutate the coord_df splits in anyway first modify the + lengths. Otherwise bad things might happen! + """ - _coord_df = property(_get__coord_df, _set__coord_df) + def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None, + coord_df_oid=None): + """Inits a IndexMetadata from Ray DataFrame partitions - def _get_index(self): - """Get the index wrapped by this IndexDF. + Args: + dfs ([ObjectID]): ObjectIDs of dataframe partitions + index (pd.Index): Index of the Ray DataFrame. + axis: Axis of partition (0=row partitions, 1=column partitions) Returns: - The index wrapped by this IndexDF - """ - return self._coord_df.index - - def _set_index(self, new_index): - """Set the index wrapped by this IndexDF. - - Args: - new_index: The new index to wrap + A IndexMetadata backed by the specified pd.Index, partitioned off + specified partitions """ - self._coord_df.index = new_index + assert (lengths_oid is None) == (coord_df_oid is None), \ + "Must pass both or neither of lengths_oid and coord_df_oid" - index = property(_get_index, _set_index) - - def coords_of(self, key): - raise NotImplementedError() + if dfs is not None and lengths_oid is None: + if axis == 0: + lengths_oid = _build_row_lengths.remote(dfs) + else: + lengths_oid = _build_col_widths.remote(dfs) + coord_df_oid = _build_coord_df.remote(lengths_oid, index) - def __getitem__(self, key): - return self.coords_of(key) + self._lengths = lengths_oid + self._coord_df = coord_df_oid + self._index_cache = index + self._cached_index = False - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): - raise NotImplementedError() + def _get__lengths(self): + if isinstance(self._lengths_cache, ray.ObjectID) or \ + (isinstance(self._lengths_cache, list) and + isinstance(self._lengths_cache[0], ray.ObjectID)): + self._lengths_cache = ray.get(self._lengths_cache) + return self._lengths_cache - def __len__(self): - return len(self._coord_df) + def _set__lengths(self, lengths): + self._lengths_cache = lengths - def first_valid_index(self): - return self._coord_df.first_valid_index() + _lengths = property(_get__lengths, _set__lengths) - def last_valid_index(self): - return self._coord_df.last_valid_index() + def _get__coord_df(self): + """Get the coordinate dataframe wrapped by this _IndexMetadata. - def insert(self, key, loc=None, partition=None, - index_within_partition=None): - raise NotImplementedError() + Since we may have had an index set before our coord_df was + materialized, we'll have to apply it to the newly materialized df + """ + if isinstance(self._coord_df_cache, ray.ObjectID): + self._coord_df_cache = ray.get(self._coord_df_cache) + if self._cached_index: + self._coord_df_cache.index = self._index_cache + self._cached_index = False + return self._coord_df_cache - def drop(self, labels, errors='raise'): - """Drop the specified labels from the IndexMetadata + def _set__coord_df(self, coord_df): + """Set the coordinate dataframe wrapped by this _IndexMetadata. 
- Args: - labels (scalar or list-like): - The labels to drop - errors ('raise' or 'ignore'): - If 'ignore', suppress errors for when labels don't exist + Sometimes we set the _IndexMetadata's coord_df outside of the + constructor, generally using fxns like drop(). This produces a modified + index, so we need to reflect the change on the index cache. - Returns: - DataFrame with coordinates of dropped labels + If the set _IndexMetadata is an OID instead (due to a copy or whatever + reason), we fall back relying on `_index_cache`. """ - # TODO(patyang): This produces inconsistent indexes. - dropped = self.coords_of(labels) - self._coord_df = self._coord_df.drop(labels, errors=errors) - return dropped + if not isinstance(coord_df, ray.ObjectID): + self._index_cache = coord_df.index + self._coord_df_cache = coord_df - def rename_index(self, mapper): - """Rename the index. + _coord_df = property(_get__coord_df, _set__coord_df) - Args: - mapper: name to rename the index as + def _get_index(self): + """Get the index wrapped by this _IndexMetadata. + + The only time `self._index_cache` would be None is in a newly created + _IndexMetadata object without a specified `index` parameter (See the + _IndexMetadata constructor for more details) """ - self._coord_df = self._coord_df.rename_axis(mapper, axis=0) + if isinstance(self._coord_df_cache, ray.ObjectID): + return self._index_cache + else: + return self._coord_df_cache.index - def convert_to_index_sliceable(self, key): - """Converts and performs error checking on the passed slice + def _set_index(self, new_index): + """Set the index wrapped by this _IndexMetadata. - Args: - key: slice to convert and check + It is important to always set `_index_cache` even if the coord_df is + materialized due to the possibility that it is set to an OID later on. + This design is more straightforward than caching indexes on setting the + coord_df to an OID due to the possibility of an OID-to-OID change. """ - return convert_to_index_sliceable(self._coord_df, key) + new_index = pd.DataFrame(index=new_index).index + assert len(new_index) == len(self) + self._index_cache = new_index + if isinstance(self._coord_df_cache, ray.ObjectID): + self._cached_index = True + else: + self._coord_df_cache.index = new_index -class _IndexMetadata(_IndexMetadataBase): - """IndexMetadata implementation for index across a partitioned axis. This - implementation assumes the underlying index lies across multiple - partitions. - """ + index = property(_get_index, _set_index) - def __init__(self, dfs=None, index=None, axis=0, lengths_oid=None, - coord_df_oid=None): - """Inits a IndexMetadata from Ray DataFrame partitions + def _get_index_cache(self): + """Get the cached Index object, which may sometimes be an OID. - Args: - dfs ([ObjectID]): ObjectIDs of dataframe partitions - index (pd.Index): Index of the Ray DataFrame. - axis: Axis of partition (0=row partitions, 1=column partitions) + This will ray.get the Index object out of the Ray store lazily, such + that it is not grabbed until it is needed in the driver. This layer of + abstraction is important for allowing this object to be instantiated + with a remote Index object. Returns: - A IndexMetadata backed by the specified pd.Index, partitioned off - specified partitions + The Index object in _index_cache. 
""" - if dfs is not None: - lengths_oid, coord_df_oid = \ - _build_index.remote(dfs, index) if axis == 0 else \ - _build_columns.remote(dfs, index) - self._coord_df = coord_df_oid - self._lengths = lengths_oid + if self._index_cache_validator is None: + self._index_cache_validator = pd.RangeIndex(len(self)) + elif isinstance(self._index_cache_validator, + ray.ObjectID): + self._index_cache_validator = ray.get(self._index_cache_validator) - def _get__lengths(self): - if isinstance(self._lengths_cache, ray.local_scheduler.ObjectID) or \ - (isinstance(self._lengths_cache, list) and - isinstance(self._lengths_cache[0], ray.local_scheduler.ObjectID)): - self._lengths_cache = ray.get(self._lengths_cache) - return self._lengths_cache + return self._index_cache_validator - def _set__lengths(self, lengths): - self._lengths_cache = lengths + def _set_index_cache(self, new_index): + """Sets the new index cache. - _lengths = property(_get__lengths, _set__lengths) + Args: + new_index: The Index to set the _index_cache to. + """ + self._index_cache_validator = new_index + + # _index_cache_validator is an extra layer of abstraction to allow the + # cache to accept ObjectIDs and ray.get them when needed. + _index_cache = property(_get_index_cache, _set_index_cache) def coords_of(self, key): """Returns the coordinates (partition, index_within_partition) of the @@ -154,7 +165,7 @@ def coords_of(self, key): Args: key: item to get coordinates of. Can also be a tuple of item - and {partition, index_within_partition} if caller only + and {"partition", "index_within_partition"} if caller only needs one of the coordinates Returns: @@ -180,8 +191,6 @@ def partition_series(self, partition): 'index_within_partition'] def __len__(self): - # Hard to say if this is faster than IndexMetadataBase.__len__ if - # self._coord_df is non-resident return sum(self._lengths) def reset_partition_coords(self, partitions=None): @@ -262,7 +271,20 @@ def insert(self, key, loc=None, partition=None, # Return inserted coordinate for callee return coord_to_insert + def get_global_indices(self, partition, index_within_partition_list): + total = 0 + for i in range(partition): + total += self._lengths[i] + + return [total + i for i in index_within_partition_list] + def squeeze(self, partition, index_within_partition): + """Prepare a single coordinate for removal by "squeezing" the + subsequent coordinates "up" one index within that partition. To be used + with "_IndexMetadata.drop" for when all the "squeezed" coordinates are + dropped in batch. Note that this function doesn't actually mutate the + coord_df. + """ self._coord_df = self._coord_df.copy() partition_mask = self._coord_df.partition == partition @@ -272,76 +294,101 @@ def squeeze(self, partition, index_within_partition): 'index_within_partition'] -= 1 def copy(self): - return _IndexMetadata(coord_df_oid=self._coord_df, - lengths_oid=self._lengths) + # TODO: Investigate copy-on-write wrapper for metadata objects + coord_df_copy = self._coord_df_cache + if not isinstance(self._coord_df_cache, ray.ObjectID): + coord_df_copy = self._coord_df_cache.copy() + lengths_copy = self._lengths_cache + if not isinstance(self._lengths_cache, ray.ObjectID): + lengths_copy = self._lengths_cache.copy() -class _WrappingIndexMetadata(_IndexMetadata): - """IndexMetadata implementation for index across a non-partitioned axis. - This implementation assumes the underlying index lies across one partition. 
- """ + index_copy = self._index_cache + if self._index_cache is not None: + index_copy = self._index_cache.copy() - def __init__(self, index): - """Inits a IndexMetadata from Pandas Index only. + return _IndexMetadata(index=index_copy, + coord_df_oid=coord_df_copy, + lengths_oid=lengths_copy) + + def __getitem__(self, key): + """Returns the coordinates (partition, index_within_partition) of the + provided key in the index. Essentially just an alias for + `_IndexMetadata.coords_of` that allows for slice passing, since + slices cannot be passed with slice notation other than through + `__getitem__` calls. Args: - index (pd.Index): Index to wrap. + key: + item to get coordinates of. Can also be a tuple of item + and {"partition", "index_within_partition"} if caller only + needs one of the coordinates Returns: - A IndexMetadata backed by the specified pd.Index. + Pandas object with the keys specified. If key is a single object + it will be a pd.Series with items `partition` and + `index_within_partition`, and if key is a slice or if the key is + duplicate it will be a pd.DataFrame with said items as columns. """ - self._coord_df = pd.DataFrame(index=index) - # Set _lengths as a dummy variable for future-proof method inheritance - self._lengths = [len(index)] + return self.coords_of(key) - def coords_of(self, key): - """Returns the coordinates (partition, index_within_partition) of the - provided key in the index + def first_valid_index(self): + return self._coord_df.first_valid_index() + + def last_valid_index(self): + return self._coord_df.last_valid_index() + + def drop(self, labels, errors='raise'): + """Drop the specified labels from the IndexMetadata Args: - key: item to get coordinates of + labels (scalar or list-like): + The labels to drop + errors ('raise' or 'ignore'): + If 'ignore', suppress errors for when labels don't exist Returns: - Pandas object with the keys specified. If key is a single object - it will be a pd.Series with items `partition` and - `index_within_partition`, and if key is a slice it will be a - pd.DataFrame with said items as columns. + DataFrame with coordinates of dropped labels """ - locs = self.index.get_loc(key) - # locs may be a single int, a slice, or a boolean mask. - # Convert here to iterable of integers - loc_idxs = pd.RangeIndex(len(self.index))[locs] - # TODO: Investigate "modify view/copy" warning - ret_obj = self._coord_df.loc[key] - ret_obj['partition'] = 0 - ret_obj['index_within_partition'] = loc_idxs - return ret_obj + dropped = self.coords_of(labels) - def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, - group_keys=True, squeeze=False, **kwargs): - raise NotImplementedError() + # Update first lengths to prevent possible length inconsistencies + if isinstance(dropped, pd.DataFrame): + drop_per_part = dropped.groupby(["partition"]).size()\ + .reindex(index=pd.RangeIndex(len(self._lengths)), + fill_value=0) + elif isinstance(dropped, pd.Series): + drop_per_part = np.zeros_like(self._lengths) + drop_per_part[dropped["partition"]] = 1 + else: + raise AssertionError("Unrecognized result from `coords_of`") + self._lengths = self._lengths - drop_per_part - def insert(self, key, loc=None, partition=None, - index_within_partition=None): - """Inserts a key at a certain location in the index, or a certain coord - in a partition. Called with either `loc` or `partition` and - `index_within_partition`. If called with both, `loc` will be used. 
+ self._coord_df = self._coord_df.drop(labels, errors=errors) + return dropped + + def rename_index(self, mapper): + """Rename the index. Args: - key: item to insert into index - loc: location to insert into index - partition: partition to insert into - index_within_partition: index within partition to insert into + mapper: name to rename the index as + """ + self._coord_df = self._coord_df.rename_axis(mapper, axis=0) - Returns: - DataFrame with coordinates of insert + def convert_to_index_sliceable(self, key): + """Converts and performs error checking on the passed slice + + Args: + key: slice to convert and check """ - # Generate new index - new_index = self.index.insert(loc, key) + return convert_to_index_sliceable(self._coord_df, key) - # Make new empty coord_df - self._coord_df = pd.DataFrame(index=new_index) + def get_partition(self, partition_id): + """Return a view of coord_df where partition = partition_id + """ + return self._coord_df[self._coord_df.partition == partition_id] - # Shouldn't really need this, but here to maintain API consistency - return pd.DataFrame({'partition': 0, 'index_within_partition': loc}, - index=[key]) + def sorted_index(self): + return (self._coord_df + .sort_values(['partition', 'index_within_partition']) + .index) diff --git a/python/ray/dataframe/indexing.py b/python/ray/dataframe/indexing.py index cba4ff8728fc..9df3cdffb8c4 100644 --- a/python/ray/dataframe/indexing.py +++ b/python/ray/dataframe/indexing.py @@ -1,104 +1,458 @@ +"""Indexing Helper Class works as follows: + +_Location_Indexer_Base provide methods framework for __getitem__ + and __setitem__ that work with Ray DataFrame's internal index. Base + class's __{get,set}item__ takes in partitions & idx_in_partition data + and perform lookup/item write. + +_LocIndexer and _iLocIndexer is responsible for indexer specific logic and + lookup computation. Loc will take care of enlarge dataframe. Both indexer + will take care of translating pandas's lookup to Ray DataFrame's internal + lookup. + +An illustration is available at +https://github.com/ray-project/ray/pull/1955#issuecomment-386781826 +""" import pandas as pd +import numpy as np import ray -from .dataframe import _deploy_func +from warnings import warn + +from pandas.api.types import (is_scalar, is_list_like, is_bool) +from pandas.core.dtypes.common import is_integer +from pandas.core.indexing import IndexingError + +from .utils import (_blocks_to_col, _get_nan_block_id, extractor, + _mask_block_partitions, writer) +from .index_metadata import _IndexMetadata +from .dataframe import DataFrame + + +def is_slice(x): return isinstance(x, slice) + + +def is_2d(x): return is_list_like(x) or is_slice(x) + + +def is_tuple(x): return isinstance(x, tuple) + + +def is_boolean_array(x): return is_list_like(x) and all(map(is_bool, x)) + + +def is_integer_slice(x): + if not is_slice(x): + return False + for pos in [x.start, x.stop, x.step]: + if not ((pos is None) or is_integer(pos)): + return False # one position is neither None nor int + return True + + +_ENLARGEMENT_WARNING = """ +Passing list-likes to .loc or [] with any missing label will raise +KeyError in the future, you can use .reindex() as an alternative. + +See the documentation here: +http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike +""" + +_ILOC_INT_ONLY_ERROR = """ +Location based indexing can only have [integer, integer slice (START point is +INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types. 
+""" + + +def _parse_tuple(tup): + """Unpack the user input for getitem and setitem and compute ndim + + loc[a] -> ([a], :), 1D + loc[[a,b],] -> ([a,b], :), + loc[a,b] -> ([a], [b]), 0D + """ + row_loc, col_loc = slice(None), slice(None) + + if is_tuple(tup): + row_loc = tup[0] + if len(tup) == 2: + col_loc = tup[1] + if len(tup) > 2: + raise IndexingError('Too many indexers') + else: + row_loc = tup + + ndim = _compute_ndim(row_loc, col_loc) + row_loc = [row_loc] if is_scalar(row_loc) else row_loc + col_loc = [col_loc] if is_scalar(col_loc) else col_loc + + return row_loc, col_loc, ndim + + +def _is_enlargement(locator, coord_df): + """Determine if a locator will enlarge the corrd_df. + + Enlargement happens when you trying to locate using labels isn't in the + original index. In other words, enlargement == adding NaNs ! + """ + if is_list_like(locator) and not is_slice( + locator) and len(locator) > 0 and not is_boolean_array(locator): + n_diff_elems = len(pd.Index(locator).difference(coord_df.index)) + is_enlargement_boolean = n_diff_elems > 0 + return is_enlargement_boolean + return False + + +def _warn_enlargement(): + warn(FutureWarning(_ENLARGEMENT_WARNING)) + + +def _compute_ndim(row_loc, col_loc): + """Compute the ndim of result from locators + """ + row_scaler = is_scalar(row_loc) + col_scaler = is_scalar(col_loc) + + if row_scaler and col_scaler: + ndim = 0 + elif row_scaler ^ col_scaler: + ndim = 1 + else: + ndim = 2 + + return ndim class _Location_Indexer_Base(): """Base class for location indexer like loc and iloc - This class abstract away commonly used method """ def __init__(self, ray_df): self.df = ray_df + self.col_coord_df = ray_df._col_metadata._coord_df + self.row_coord_df = ray_df._row_metadata._coord_df + self.block_oids = ray_df._block_partitions - def __getitem__(self, key): - if not isinstance(key, tuple): - # The one argument case is equivalent to full slice in 2nd dim. - return self.locate_2d(key, slice(None)) - else: - return self.locate_2d(*key) - - def _get_lookup_dict(self, ray_partition_idx): - if ray_partition_idx.ndim == 1: # Single row matched - position = (ray_partition_idx['partition'], - ray_partition_idx['index_within_partition']) - rows_to_lookup = {position[0]: [position[1]]} - if ray_partition_idx.ndim == 2: # Multiple rows matched - # We copy ray_partition_idx because it allows us to - # do groupby. This might not be the most efficient method. - # And have room to optimize. - ray_partition_idx = ray_partition_idx.copy() - rows_to_lookup = ray_partition_idx.groupby('partition').aggregate( - lambda x: list(x)).to_dict()['index_within_partition'] - return rows_to_lookup - - def locate_2d(self, row_label, col_label): - pass - - def _map_partition(self, lookup_dict, col_lst, indexer='loc'): - """Apply retrieval function to a lookup_dict - in the form of {partition_id: [idx]}. 
+ self.is_view = False + if isinstance(ray_df, DataFrameView): + self.block_oids = ray_df._block_partitions_data + self.is_view = True - Returns: - retrieved_rows_remote: a list of object ids for pd_df + def __getitem__(self, row_lookup, col_lookup, ndim): + """ + Args: + row_lookup: A pd dataframe, a partial view from row_coord_df + col_lookup: A pd dataframe, a partial view from col_coord_df + ndim: the dimension of returned data + """ + if ndim == 2: + return self._generate_view(row_lookup, col_lookup) + + extracted = self._retrive_items(row_lookup, col_lookup) + if ndim == 1: + result = ray.get(_blocks_to_col.remote(*extracted)).squeeze() + + if is_scalar(result): + result = pd.Series(result) + + scaler_axis = row_lookup if len(row_lookup) == 1 else col_lookup + series_name = scaler_axis.iloc[0].name + result.name = series_name + + index_axis = row_lookup if len(col_lookup) == 1 else col_lookup + result.index = index_axis.index + + if ndim == 0: + result = ray.get(extracted[0]).squeeze() + + return result + + def _retrive_items(self, row_lookup, col_lookup): + """Given lookup dataframes, return a list of result oids + """ + result_oids = [] + + # We have to copy before we groupby because + # https://github.com/pandas-dev/pandas/issues/10043 + row_groups = row_lookup.copy().groupby('partition') + col_groups = col_lookup.copy().groupby('partition') + for row_blk, row_data in row_groups: + for col_blk, col_data in col_groups: + block_oid = self.block_oids[row_blk, col_blk] + row_idx = row_data['index_within_partition'] + col_idx = col_data['index_within_partition'] + + result_oid = extractor.remote(block_oid, row_idx, col_idx) + result_oids.append(result_oid) + return result_oids + + def _generate_view(self, row_lookup, col_lookup): + """Generate a DataFrameView from lookup + """ + row_metadata_view = _IndexMetadata( + _coord_df=row_lookup, _lengths=self.df._row_metadata._lengths) + + col_metadata_view = _IndexMetadata( + _coord_df=col_lookup, _lengths=self.df._col_metadata._lengths) + + df_view = DataFrameView( + block_partitions=self.block_oids, + row_metadata=row_metadata_view, + col_metadata=col_metadata_view, + index=row_metadata_view.index, + columns=col_metadata_view.index) + + return df_view + + def __setitem__(self, row_lookup, col_lookup, item): + """ + Args: + row_lookup: A pd dataframe, a partial view from row_coord_df + col_lookup: A pd dataframe, a partial view from col_coord_df + item: The new item needs to be set. It can be any shape that's + broadcastable to the product of the lookup tables. + """ + to_shape = (len(row_lookup), len(col_lookup)) + item = self._broadcast_item(item, to_shape) + self._write_items(row_lookup, col_lookup, item) + + def _broadcast_item(self, item, to_shape): + """Use numpy to broadcast or reshape item. + + Notes: + - Numpy is memory efficent, there shouldn't be performance issue. + """ + try: + item = np.array(item) + if np.prod(to_shape) == np.prod(item.shape): + return item.reshape(to_shape) + else: + return np.broadcast_to(item, to_shape) + except ValueError: + from_shape = np.array(item).shape + raise ValueError( + "could not broadcast input array from \ + shape {from_shape} into shape {to_shape}".format( + from_shape=from_shape, to_shape=to_shape)) + + def _write_items(self, row_lookup, col_lookup, item): + """Perform remote write and replace blocks. 
""" - assert indexer in ['loc', 'iloc'], "indexer must be loc or iloc" - if indexer == 'loc': + # We have to copy before we groupby because + # https://github.com/pandas-dev/pandas/issues/10043 + row_groups = row_lookup.copy().groupby('partition') + col_groups = col_lookup.copy().groupby('partition') - def retrieve_func(df, idx_lst, col_label): - return df.loc[idx_lst, col_label] - elif indexer == 'iloc': + row_item_index = 0 + for row_blk, row_data in row_groups: + row_len = len(row_data) - def retrieve_func(df, idx_lst, col_idx): - return df.iloc[idx_lst, col_idx] + col_item_index = 0 + for col_blk, col_data in col_groups: + col_len = len(col_data) - retrieved_rows_remote = [ - _deploy_func.remote(retrieve_func, - self.df._row_partitions[partition], - idx_to_lookup, col_lst) - for partition, idx_to_lookup in lookup_dict.items() - ] - return retrieved_rows_remote + block_oid = self.block_oids[row_blk, col_blk] + row_idx = row_data['index_within_partition'] + col_idx = col_data['index_within_partition'] + + item_to_write = item[row_item_index:row_item_index + row_len, + col_item_index:col_item_index + col_len] + + result_oid = writer.remote(block_oid, row_idx, col_idx, + item_to_write) + + if self.is_view: + self.df._block_partitions_data[row_blk, + col_blk] = result_oid + else: + self.df._block_partitions[row_blk, col_blk] = result_oid + + col_item_index += col_len + row_item_index += row_len class _Loc_Indexer(_Location_Indexer_Base): """A indexer for ray_df.loc[] functionality""" - def locate_2d(self, row_label, col_label): - index_loc = self.df._row_index.loc[row_label] - lookup_dict = self._get_lookup_dict(index_loc) - retrieved_rows_remote = self._map_partition( - lookup_dict, col_label, indexer='loc') - joined_df = pd.concat(ray.get(retrieved_rows_remote)) + def __getitem__(self, key): + row_loc, col_loc, ndim = _parse_tuple(key) + self._handle_enlargement(row_loc, col_loc) + row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) + ndim = self._expand_dim(row_lookup, col_lookup, ndim) + result = super(_Loc_Indexer, self).__getitem__(row_lookup, col_lookup, + ndim) + return result + + def __setitem__(self, key, item): + row_loc, col_loc, _ = _parse_tuple(key) + self._handle_enlargement(row_loc, col_loc) + row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) + super(_Loc_Indexer, self).__setitem__(row_lookup, col_lookup, + item) + + def _handle_enlargement(self, row_loc, col_loc): + """Handle Enlargement (if there is one). + + Returns: + None + """ + locators = [row_loc, col_loc] + coord_dfs = [self.row_coord_df, self.col_coord_df] + axis = ['row', 'col'] + metadata = {'row': self.df._row_metadata, 'col': self.df._col_metadata} + + for loc, coord, axis in zip(locators, coord_dfs, axis): + if _is_enlargement(loc, coord): + new_meta = self._enlarge_axis(loc, axis=axis) + _warn_enlargement() + metadata[axis] = new_meta + + self.row_coord_df = metadata['row']._coord_df + self.col_coord_df = metadata['col']._coord_df + + def _enlarge_axis(self, locator, axis): + """Add rows/columns to block partitions according to locator. + + Returns: + metadata (_IndexMetadata) + """ + # 1. Prepare variables + row_based_bool = axis == 'row' + # major == the axis of the locator + major_meta = self.df._row_metadata if row_based_bool \ + else self.df._col_metadata + minor_meta = self.df._col_metadata if row_based_bool \ + else self.df._row_metadata + + # 2. 
Compute the nan labels and add blocks + nan_labels = self._compute_enlarge_labels(locator, major_meta.index) + num_nan_labels = len(nan_labels) + blk_part_n_row, blk_part_n_col = self.block_oids.shape + + nan_blk_lens = minor_meta._lengths + nan_blks = np.array([[ + _get_nan_block_id( + num_nan_labels, n_cols, transpose=not row_based_bool) + for n_cols in nan_blk_lens + ]]) + nan_blks = nan_blks.T if not row_based_bool else nan_blks + + self.block_oids = np.concatenate( + [self.block_oids, nan_blks], axis=0 if row_based_bool else 1) + + # 3. Prepare metadata to return + nan_coord_df = pd.DataFrame(data=[{ + '': name, + 'partition': blk_part_n_row if row_based_bool else blk_part_n_col, + 'index_within_partition': i + } for name, i in zip(nan_labels, np.arange(num_nan_labels)) + ]).set_index('') + + coord_df = pd.concat([major_meta._coord_df, nan_coord_df]) + coord_df = coord_df.loc[locator] # Re-index that allows duplicates - if index_loc.ndim == 2: - # The returned result need to be indexed series/df - # Re-index is needed. - joined_df.index = index_loc.index + lens = major_meta._lengths + lens = np.concatenate([lens, np.array([num_nan_labels])]) - if isinstance(row_label, int) or isinstance(row_label, str): - return joined_df.squeeze(axis=0) - else: - return joined_df + metadata_view = _IndexMetadata(_coord_df=coord_df, _lengths=lens) + return metadata_view + + def _compute_enlarge_labels(self, locator, base_index): + """Helper for _enlarge_axis, compute common labels and extra labels. + + Returns: + nan_labels: The labels needs to be added + """ + locator_as_index = pd.Index(locator) + + nan_labels = locator_as_index.difference(base_index) + common_labels = locator_as_index.intersection(base_index) + + if len(common_labels) == 0: + raise KeyError( + 'None of [{labels}] are in the [{base_index_name}]'.format( + labels=list(locator_as_index), base_index_name=base_index)) + + return nan_labels + + def _expand_dim(self, row_lookup, col_lookup, ndim): + """Expand the dimension if necessary. + This method is for cases like duplicate labels. + """ + many_rows = len(row_lookup) > 1 + many_cols = len(col_lookup) > 1 + + if ndim == 0 and (many_rows or many_cols): + ndim = 1 + if ndim == 1 and (many_rows and many_cols): + ndim = 2 + + return ndim + + def _compute_lookup(self, row_loc, col_loc): + # We use reindex for list to avoid duplicates. + row_lookup = self.row_coord_df.loc[row_loc] + col_lookup = self.col_coord_df.loc[col_loc] + return row_lookup, col_lookup class _iLoc_Indexer(_Location_Indexer_Base): """A indexer for ray_df.iloc[] functionality""" - def locate_2d(self, row_idx, col_idx): - index_loc = self.df._row_index.iloc[row_idx] - lookup_dict = self._get_lookup_dict(index_loc) - retrieved_rows_remote = self._map_partition( - lookup_dict, col_idx, indexer='iloc') - joined_df = pd.concat(ray.get(retrieved_rows_remote)) - - if index_loc.ndim == 2: - # The returned result need to be indexed series/df - # Re-index is needed. 
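
To make the lookup translation concrete: a locator is first resolved against the coordinate frame, and the resulting rows are grouped by partition so that each block is only asked for its own positions. A small stand-alone sketch with toy data and illustrative names (plain pandas, not the indexer classes above):

    import pandas as pd

    # Toy coordinate frame: label -> (partition, index_within_partition).
    row_coord_df = pd.DataFrame({
        "partition": [0, 0, 1, 1],
        "index_within_partition": [0, 1, 0, 1],
    }, index=["a", "b", "c", "d"])

    # .loc-style translation: label locator -> coordinate rows.
    row_lookup = row_coord_df.loc[["b", "d"]]

    # Group by partition to know which positions to pull from each block.
    per_block = {
        part: group["index_within_partition"].tolist()
        for part, group in row_lookup.groupby("partition")
    }
    assert per_block == {0: [1], 1: [1]}
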
- joined_df.index = index_loc.index - - if isinstance(row_idx, int) or isinstance(row_idx, str): - return joined_df.squeeze(axis=0) - else: - return joined_df + def __getitem__(self, key): + row_loc, col_loc, ndim = _parse_tuple(key) + + self._check_dtypes(row_loc) + self._check_dtypes(col_loc) + + row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) + result = super(_iLoc_Indexer, self).__getitem__( + row_lookup, col_lookup, ndim) + return result + + def __setitem__(self, key, item): + row_loc, col_loc, _ = _parse_tuple(key) + + self._check_dtypes(row_loc) + self._check_dtypes(col_loc) + + row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) + super(_iLoc_Indexer, self).__setitem__( + row_lookup, col_lookup, item) + + def _compute_lookup(self, row_loc, col_loc): + # We use reindex for list to avoid duplicates. + return self.row_coord_df.iloc[row_loc], self.col_coord_df.iloc[col_loc] + + def _check_dtypes(self, locator): + is_int = is_integer(locator) + is_int_slice = is_integer_slice(locator) + is_int_list = is_list_like(locator) and all(map(is_integer, locator)) + is_bool_arr = is_boolean_array(locator) + + if not any([is_int, is_int_slice, is_int_list, is_bool_arr]): + raise ValueError(_ILOC_INT_ONLY_ERROR) + + +class DataFrameView(DataFrame): + """A subclass of DataFrame where the index can be smaller than blocks. + """ + + def __init__(self, block_partitions, row_metadata, col_metadata, index, + columns): + self._block_partitions = block_partitions + self._row_metadata = row_metadata + self._col_metadata = col_metadata + self.index = index + self.columns = columns + + def _get_block_partitions(self): + oid_arr = _mask_block_partitions(self._block_partitions_data, + self._row_metadata, + self._col_metadata) + return oid_arr + + def _set_block_partitions(self, new_block_partitions): + self._block_partitions_data = new_block_partitions + + _block_partitions = property(_get_block_partitions, _set_block_partitions) diff --git a/python/ray/dataframe/io.py b/python/ray/dataframe/io.py index c1abc0ec474c..cf91dbe5d647 100644 --- a/python/ray/dataframe/io.py +++ b/python/ray/dataframe/io.py @@ -261,7 +261,6 @@ def read_csv(filepath, df = _read_csv_with_offset.remote( filepath, start, end, kwargs=kwargs) df_obj_ids.append(df) - return DataFrame(row_partitions=df_obj_ids, columns=columns) diff --git a/python/ray/dataframe/reshape.py b/python/ray/dataframe/reshape.py new file mode 100644 index 000000000000..1883f11b78d7 --- /dev/null +++ b/python/ray/dataframe/reshape.py @@ -0,0 +1,125 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import ray +import pandas +import numpy as np + +from pandas import compat +from pandas.core.dtypes.common import is_list_like +from itertools import cycle + +from .dataframe import DataFrame +from .utils import _deploy_func + + +def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, + columns=None, sparse=False, drop_first=False): + """Convert categorical variable into indicator variables. + + Args: + data (array-like, Series, or DataFrame): data to encode. + prefix (string, [string]): Prefix to apply to each encoded column + label. + prefix_sep (string, [string]): Separator between prefix and value. + dummy_na (bool): Add a column to indicate NaNs. + columns: Which columns to encode. + sparse (bool): Not Implemented: If True, returns SparseDataFrame. + drop_first (bool): Whether to remove the first level of encoded data. 
+ + Returns: + DataFrame or one-hot encoded data. + """ + if not isinstance(data, DataFrame): + return pandas.get_dummies(data, prefix=prefix, prefix_sep=prefix_sep, + dummy_na=dummy_na, columns=columns, + sparse=sparse, drop_first=drop_first) + + if sparse: + raise NotImplementedError( + "SparseDataFrame is not implemented. " + "To contribute to Pandas on Ray, please visit " + "github.com/ray-project/ray.") + + if columns is None: + columns_to_encode = data.dtypes.isin([np.dtype("O"), 'category']) + columns_to_encode = data.columns[columns_to_encode] + else: + columns_to_encode = columns + + def check_len(item, name): + len_msg = ("Length of '{name}' ({len_item}) did not match the " + "length of the columns being encoded ({len_enc}).") + + if is_list_like(item): + if not len(item) == len(columns_to_encode): + len_msg = len_msg.format(name=name, len_item=len(item), + len_enc=len(columns_to_encode)) + raise ValueError(len_msg) + + check_len(prefix, 'prefix') + check_len(prefix_sep, 'prefix_sep') + if isinstance(prefix, compat.string_types): + prefix = cycle([prefix]) + prefix = [next(prefix) for i in range(len(columns_to_encode))] + if isinstance(prefix, dict): + prefix = [prefix[col] for col in columns_to_encode] + + if prefix is None: + prefix = columns_to_encode + + # validate separators + if isinstance(prefix_sep, compat.string_types): + prefix_sep = cycle([prefix_sep]) + prefix_sep = [next(prefix_sep) for i in range(len(columns_to_encode))] + elif isinstance(prefix_sep, dict): + prefix_sep = [prefix_sep[col] for col in columns_to_encode] + + if set(columns_to_encode) == set(data.columns): + with_dummies = [] + dropped_columns = pandas.Index() + else: + with_dummies = data.drop(columns_to_encode, axis=1)._col_partitions + dropped_columns = data.columns.drop(columns_to_encode) + + def get_dummies_remote(df, to_drop, prefix, prefix_sep): + df = df.drop(to_drop, axis=1) + + if df.size == 0: + return df, df.columns + + df = pandas.get_dummies(df, prefix=prefix, prefix_sep=prefix_sep, + dummy_na=dummy_na, columns=None, sparse=sparse, + drop_first=drop_first) + columns = df.columns + df.columns = pandas.RangeIndex(0, len(df.columns)) + return df, columns + + total = 0 + columns = [] + for i, part in enumerate(data._col_partitions): + col_index = data._col_metadata.partition_series(i) + + # TODO(kunalgosar): Handle the case of duplicate columns here + to_encode = col_index.index.isin(columns_to_encode) + + to_encode = col_index[to_encode] + to_drop = col_index.drop(to_encode.index) + + result = _deploy_func._submit( + args=(get_dummies_remote, part, to_drop, + prefix[total:total + len(to_encode)], + prefix_sep[total:total + len(to_encode)]), + num_return_vals=2) + + with_dummies.append(result[0]) + columns.append(result[1]) + total += len(to_encode) + + columns = ray.get(columns) + dropped_columns = dropped_columns.append(columns) + + return DataFrame(col_partitions=with_dummies, + columns=dropped_columns, + index=data.index) diff --git a/python/ray/dataframe/test/test_dataframe.py b/python/ray/dataframe/test/test_dataframe.py index 60d2862d9cf9..6cd72ec4b99e 100644 --- a/python/ray/dataframe/test/test_dataframe.py +++ b/python/ray/dataframe/test/test_dataframe.py @@ -190,6 +190,11 @@ def test_int_dataframe(): 'col3', 'col4'] + filter_by = {'items': ['col1', 'col5'], + 'regex': '4$|3$', + 'like': 'col'} + + test_filter(ray_df, pandas_df, filter_by) test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) @@ -207,6 +212,8 @@ def test_int_dataframe(): 
test_copy(ray_df) test_sum(ray_df, pandas_df) + test_prod(ray_df, pandas_df) + test_product(ray_df, pandas_df) test_abs(ray_df, pandas_df) test_keys(ray_df, pandas_df) test_transpose(ray_df, pandas_df) @@ -221,6 +228,8 @@ def test_int_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -262,6 +271,7 @@ def test_int_dataframe(): test_cummin(ray_df, pandas_df) test_cumprod(ray_df, pandas_df) test_cumsum(ray_df, pandas_df) + test_pipe(ray_df, pandas_df) # test_loc(ray_df, pandas_df) # test_iloc(ray_df, pandas_df) @@ -302,25 +312,22 @@ def test_int_dataframe(): test_apply(ray_df, pandas_df, func, 1) test_aggregate(ray_df, pandas_df, func, 1) else: - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) func = ['sum', lambda df: df.sum()] - with pytest.raises(NotImplementedError): - test_apply(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_aggregate(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): + test_apply(ray_df, pandas_df, func, 0) + test_aggregate(ray_df, pandas_df, func, 0) + test_agg(ray_df, pandas_df, func, 0) + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) test_transform(ray_df, pandas_df) @@ -350,6 +357,11 @@ def test_float_dataframe(): 'col3', 'col4'] + filter_by = {'items': ['col1', 'col5'], + 'regex': '4$|3$', + 'like': 'col'} + + test_filter(ray_df, pandas_df, filter_by) test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) @@ -367,6 +379,8 @@ def test_float_dataframe(): test_copy(ray_df) test_sum(ray_df, pandas_df) + test_prod(ray_df, pandas_df) + test_product(ray_df, pandas_df) test_abs(ray_df, pandas_df) test_keys(ray_df, pandas_df) test_transpose(ray_df, pandas_df) @@ -382,6 +396,8 @@ def test_float_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -408,6 +424,7 @@ def test_float_dataframe(): test_cummin(ray_df, pandas_df) test_cumprod(ray_df, pandas_df) test_cumsum(ray_df, pandas_df) + test_pipe(ray_df, pandas_df) test___len__(ray_df, pandas_df) test_first_valid_index(ray_df, pandas_df) @@ -464,25 +481,22 @@ def test_float_dataframe(): test_apply(ray_df, pandas_df, func, 1) test_aggregate(ray_df, pandas_df, func, 1) else: - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) func = ['sum', lambda df: 
df.sum()] - with pytest.raises(NotImplementedError): - test_apply(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_aggregate(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): + test_apply(ray_df, pandas_df, func, 0) + test_aggregate(ray_df, pandas_df, func, 0) + test_agg(ray_df, pandas_df, func, 0) + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) test_transform(ray_df, pandas_df) @@ -510,6 +524,11 @@ def test_mixed_dtype_dataframe(): 'col3', 'col4'] + filter_by = {'items': ['col1', 'col5'], + 'regex': '4$|3$', + 'like': 'col'} + + test_filter(ray_df, pandas_df, filter_by) test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) @@ -547,6 +566,9 @@ def test_mixed_dtype_dataframe(): test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + # TODO Reolve once Pandas-20962 is resolved. + # test_rank(ray_df, pandas_df) + test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) test___getitem__(ray_df, pandas_df) @@ -574,6 +596,7 @@ def test_mixed_dtype_dataframe(): test_min(ray_df, pandas_df) test_notna(ray_df, pandas_df) test_notnull(ray_df, pandas_df) + test_pipe(ray_df, pandas_df) # TODO Fix pandas so that the behavior is correct # We discovered a bug where argmax does not always give the same result @@ -632,17 +655,14 @@ def test_mixed_dtype_dataframe(): test_agg(ray_df, pandas_df, func, 0) func = ['sum', lambda df: df.sum()] - with pytest.raises(NotImplementedError): - test_apply(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_aggregate(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): + test_apply(ray_df, pandas_df, func, 0) + test_aggregate(ray_df, pandas_df, func, 0) + test_agg(ray_df, pandas_df, func, 0) + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) test_transform(ray_df, pandas_df) @@ -670,6 +690,11 @@ def test_nan_dataframe(): 'col3', 'col4'] + filter_by = {'items': ['col1', 'col5'], + 'regex': '4$|3$', + 'like': 'col'} + + test_filter(ray_df, pandas_df, filter_by) test_roundtrip(ray_df, pandas_df) test_index(ray_df, pandas_df) test_size(ray_df, pandas_df) @@ -701,6 +726,8 @@ def test_nan_dataframe(): test_quantile(ray_df, pandas_df, .5) test_quantile(ray_df, pandas_df, .75) test_describe(ray_df, pandas_df) + test_diff(ray_df, pandas_df) + test_rank(ray_df, pandas_df) test_all(ray_df, pandas_df) test_any(ray_df, pandas_df) @@ -727,6 +754,7 @@ def test_nan_dataframe(): test_cummin(ray_df, pandas_df) test_cumprod(ray_df, pandas_df) test_cumsum(ray_df, pandas_df) + test_pipe(ray_df, pandas_df) test___len__(ray_df, pandas_df) test_first_valid_index(ray_df, pandas_df) @@ -782,30 +810,49 @@ def test_nan_dataframe(): test_apply(ray_df, pandas_df, func, 1) test_aggregate(ray_df, pandas_df, func, 1) else: - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): 
test_agg(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) func = ['sum', lambda df: df.sum()] - with pytest.raises(NotImplementedError): - test_apply(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_aggregate(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): - test_agg(ray_df, pandas_df, func, 0) - with pytest.raises(NotImplementedError): + test_apply(ray_df, pandas_df, func, 0) + test_aggregate(ray_df, pandas_df, func, 0) + test_agg(ray_df, pandas_df, func, 0) + with pytest.raises(TypeError): test_apply(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - with pytest.raises(NotImplementedError): + with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) test_transform(ray_df, pandas_df) +def test_dense_nan_df(): + ray_df = rdf.DataFrame([[np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5]], + columns=list('ABCD')) + + pd_df = pd.DataFrame([[np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5]], + columns=list('ABCD')) + + column_subsets = [list('AD'), list('BC'), list('CD')] + row_subsets = [[0, 1], [0, 1, 2], [2, 0]] + + test_dropna(ray_df, pd_df) + test_dropna_inplace(ray_df, pd_df) + test_dropna_multiple_axes(ray_df, pd_df) + test_dropna_multiple_axes_inplace(ray_df, pd_df) + test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets) + test_dropna_subset_error(ray_df) + + @pytest.fixture def test_inter_df_math(op, simple=False): ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], @@ -855,8 +902,8 @@ def test_comparison_inter_ops(op): ray_df2 = rdf.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) pandas_df2 = pd.DataFrame({"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}) - ray_df_equals_pandas(getattr(ray_df, op)(ray_df2), - getattr(pandas_df, op)(pandas_df2)) + ray_df_equals_pandas(getattr(ray_df2, op)(ray_df2), + getattr(pandas_df2, op)(pandas_df2)) @pytest.fixture @@ -927,8 +974,6 @@ def test_append(): pandas_df2 = pd.DataFrame({"col5": [0], "col6": [1]}) - print(ray_df.append(ray_df2)) - assert ray_df_equals_pandas(ray_df.append(ray_df2), pandas_df.append(pandas_df2)) @@ -954,10 +999,31 @@ def test_as_blocks(): def test_as_matrix(): - ray_df = create_test_dataframe() + test_data = TestData() + frame = rdf.DataFrame(test_data.frame) + mat = frame.as_matrix() + + frame_columns = frame.columns + for i, row in enumerate(mat): + for j, value in enumerate(row): + col = frame_columns[j] + if np.isnan(value): + assert np.isnan(frame[col][i]) + else: + assert value == frame[col][i] - with pytest.raises(NotImplementedError): - ray_df.as_matrix() + # mixed type + mat = rdf.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) + assert mat[0, 0] == 'bar' + + df = rdf.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) + mat = df.as_matrix() + assert mat[0, 0] == 1j + + # single block corner case + mat = rdf.DataFrame(test_data.frame).as_matrix(['A', 'B']) + expected = test_data.frame.reindex(columns=['A', 'B']).values + tm.assert_almost_equal(mat, expected) def test_asfreq(): @@ -982,10 +1048,28 @@ def test_assign(): def test_astype(): - ray_df = create_test_dataframe() + td = TestData() + ray_df_frame = from_pandas(td.frame, 2) + our_df_casted = 
ray_df_frame.astype(np.int32) + expected_df_casted = pd.DataFrame(td.frame.values.astype(np.int32), + index=td.frame.index, + columns=td.frame.columns) - with pytest.raises(NotImplementedError): - ray_df.astype(None) + assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) + + our_df_casted = ray_df_frame.astype(np.float64) + expected_df_casted = pd.DataFrame(td.frame.values.astype(np.float64), + index=td.frame.index, + columns=td.frame.columns) + + assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) + + our_df_casted = ray_df_frame.astype(str) + expected_df_casted = pd.DataFrame(td.frame.values.astype(str), + index=td.frame.index, + columns=td.frame.columns) + + assert(ray_df_equals_pandas(our_df_casted, expected_df_casted)) def test_at_time(): @@ -1141,11 +1225,12 @@ def test_describe(ray_df, pandas_df): assert(ray_df.describe().equals(pandas_df.describe())) -def test_diff(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.diff() +@pytest.fixture +def test_diff(ray_df, pandas_df): + assert(ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())) + assert(ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))) + assert(ray_df_equals_pandas(ray_df.diff(periods=1), + pandas_df.diff(periods=1))) def test_div(): @@ -1260,6 +1345,102 @@ def test_drop_duplicates(): ray_df.drop_duplicates() +@pytest.fixture +def test_dropna(ray_df, pd_df): + assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='all'), + pd_df.dropna(axis=1, how='all')) + + assert ray_df_equals_pandas(ray_df.dropna(axis=1, how='any'), + pd_df.dropna(axis=1, how='any')) + + assert ray_df_equals_pandas(ray_df.dropna(axis=0, how='all'), + pd_df.dropna(axis=0, how='all')) + + assert ray_df_equals_pandas(ray_df.dropna(thresh=2), + pd_df.dropna(thresh=2)) + + +@pytest.fixture +def test_dropna_inplace(ray_df, pd_df): + ray_df = ray_df.copy() + pd_df = pd_df.copy() + + ray_df.dropna(thresh=2, inplace=True) + pd_df.dropna(thresh=2, inplace=True) + + assert ray_df_equals_pandas(ray_df, pd_df) + + ray_df.dropna(axis=1, how='any', inplace=True) + pd_df.dropna(axis=1, how='any', inplace=True) + + assert ray_df_equals_pandas(ray_df, pd_df) + + +@pytest.fixture +def test_dropna_multiple_axes(ray_df, pd_df): + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=[0, 1]), + pd_df.dropna(how='all', axis=[0, 1]) + ) + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=(0, 1)), + pd_df.dropna(how='all', axis=(0, 1)) + ) + + +@pytest.fixture +def test_dropna_multiple_axes_inplace(ray_df, pd_df): + ray_df_copy = ray_df.copy() + pd_df_copy = pd_df.copy() + + ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True) + pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True) + + assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) + + ray_df_copy = ray_df.copy() + pd_df_copy = pd_df.copy() + + ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True) + pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True) + + assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) + + +@pytest.fixture +def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets): + for subset in column_subsets: + assert ray_df_equals_pandas( + ray_df.dropna(how='all', subset=subset), + pd_df.dropna(how='all', subset=subset) + ) + + assert ray_df_equals_pandas( + ray_df.dropna(how='any', subset=subset), + pd_df.dropna(how='any', subset=subset) + ) + + for subset in row_subsets: + assert ray_df_equals_pandas( + ray_df.dropna(how='all', axis=1, subset=subset), + pd_df.dropna(how='all', axis=1, 
subset=subset) + ) + + assert ray_df_equals_pandas( + ray_df.dropna(how='any', axis=1, subset=subset), + pd_df.dropna(how='any', axis=1, subset=subset) + ) + + +@pytest.fixture +def test_dropna_subset_error(ray_df): + with pytest.raises(KeyError): + ray_df.dropna(subset=list('EF')) + + with pytest.raises(KeyError): + ray_df.dropna(axis=1, subset=[4, 5]) + + def test_duplicated(): ray_df = create_test_dataframe() @@ -1755,11 +1936,16 @@ def test_fillna_datetime_columns(num_partitions=2): """ -def test_filter(): - ray_df = create_test_dataframe() +@pytest.fixture +def test_filter(ray_df, pandas_df, by): + ray_df_equals_pandas(ray_df.filter(items=by['items']), + pandas_df.filter(items=by['items'])) - with pytest.raises(NotImplementedError): - ray_df.filter() + ray_df_equals_pandas(ray_df.filter(regex=by['regex']), + pandas_df.filter(regex=by['regex'])) + + ray_df_equals_pandas(ray_df.filter(like=by['like']), + pandas_df.filter(like=by['like'])) def test_first(): @@ -2043,10 +2229,57 @@ def test_memory_usage(ray_df): def test_merge(): - ray_df = create_test_dataframe() + ray_df = rdf.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) - with pytest.raises(NotImplementedError): - ray_df.merge(None) + pandas_df = pd.DataFrame({"col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], + "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6]}) + + ray_df2 = rdf.DataFrame({"col1": [0, 1, 2], "col2": [1, 5, 6]}) + + pandas_df2 = pd.DataFrame({"col1": [0, 1, 2], "col2": [1, 5, 6]}) + + join_types = ["outer", "inner"] + for how in join_types: + # Defaults + ray_result = ray_df.merge(ray_df2, how=how) + pandas_result = pandas_df.merge(pandas_df2, how=how) + ray_df_equals_pandas(ray_result, pandas_result) + + # left_on and right_index + ray_result = ray_df.merge(ray_df2, how=how, left_on='col1', + right_index=True) + pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1', + right_index=True) + ray_df_equals_pandas(ray_result, pandas_result) + + # left_index and right_index + ray_result = ray_df.merge(ray_df2, how=how, left_index=True, + right_index=True) + pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True, + right_index=True) + ray_df_equals_pandas(ray_result, pandas_result) + + # left_index and right_on + ray_result = ray_df.merge(ray_df2, how=how, left_index=True, + right_on='col1') + pandas_result = pandas_df.merge(pandas_df2, how=how, left_index=True, + right_on='col1') + ray_df_equals_pandas(ray_result, pandas_result) + + # left_on and right_on col1 + ray_result = ray_df.merge(ray_df2, how=how, left_on='col1', + right_on='col1') + pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col1', + right_on='col1') + ray_df_equals_pandas(ray_result, pandas_result) + + # left_on and right_on col2 + ray_result = ray_df.merge(ray_df2, how=how, left_on='col2', + right_on='col2') + pandas_result = pandas_df.merge(pandas_df2, how=how, left_on='col2', + right_on='col2') + ray_df_equals_pandas(ray_result, pandas_result) @pytest.fixture @@ -2059,11 +2292,11 @@ def test_mod(): test_inter_df_math("mod", simple=False) -def test_mode(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.mode() +@pytest.fixture +def test_mode(ray_df, pandas_df): + assert(ray_series_equals_pandas(ray_df.mode(), pandas_df.mode())) + assert(ray_series_equals_pandas(ray_df.mode(axis=1), + pandas_df.mode(axis=1))) def test_mul(): @@ -2102,11 +2335,12 @@ def test_nsmallest(): ray_df.nsmallest(None, None) -def test_nunique(): - ray_df = 
create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.nunique() +@pytest.fixture +def test_nunique(ray_df, pandas_df): + assert(ray_df_equals_pandas(ray_df.nunique(), + pandas_df.nunique())) + assert(ray_df_equals_pandas(ray_df.nunique(axis=1), + pandas_df.nunique(axis=1))) def test_pct_change(): @@ -2116,11 +2350,34 @@ def test_pct_change(): ray_df.pct_change() -def test_pipe(): - ray_df = create_test_dataframe() +@pytest.fixture +def test_pipe(ray_df, pandas_df): + n = len(ray_df.index) + a, b, c = 2 % n, 0, 3 % n + col = ray_df.columns[3 % len(ray_df.columns)] - with pytest.raises(NotImplementedError): - ray_df.pipe(None) + def h(x): + return x.drop(columns=[col]) + + def g(x, arg1=0): + for _ in range(arg1): + x = x.append(x) + return x + + def f(x, arg2=0, arg3=0): + return x.drop([arg2, arg3]) + + assert ray_df_equals(f(g(h(ray_df), arg1=a), arg2=b, arg3=c), + (ray_df.pipe(h) + .pipe(g, arg1=a) + .pipe(f, arg2=b, arg3=c))) + + assert ray_df_equals_pandas((ray_df.pipe(h) + .pipe(g, arg1=a) + .pipe(f, arg2=b, arg3=c)), + (pandas_df.pipe(h) + .pipe(g, arg1=a) + .pipe(f, arg2=b, arg3=c))) def test_pivot(): @@ -2158,18 +2415,14 @@ def test_pow(): test_inter_df_math("pow", simple=False) -def test_prod(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.prod(None) - +@pytest.fixture +def test_prod(ray_df, pandas_df): + assert(ray_df.prod().equals(pandas_df.prod())) -def test_product(): - ray_df = create_test_dataframe() - with pytest.raises(NotImplementedError): - ray_df.product() +@pytest.fixture +def test_product(ray_df, pandas_df): + assert(ray_df.product().equals(pandas_df.product())) @pytest.fixture @@ -2188,11 +2441,10 @@ def test_radd(): test_inter_df_math_right_ops("radd") -def test_rank(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.rank() +@pytest.fixture +def test_rank(ray_df, pandas_df): + assert(ray_df_equals_pandas(ray_df.rank(), pandas_df.rank())) + assert(ray_df_equals_pandas(ray_df.rank(axis=1), pandas_df.rank(axis=1))) def test_rdiv(): @@ -2558,10 +2810,25 @@ def test_select(): def test_select_dtypes(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.select_dtypes() + df = pd.DataFrame({'test1': list('abc'), + 'test2': np.arange(3, 6).astype('u1'), + 'test3': np.arange(8.0, 11.0, dtype='float64'), + 'test4': [True, False, True], + 'test5': pd.date_range('now', periods=3).values, + 'test6': list(range(5, 8))}) + include = np.float, 'integer' + exclude = np.bool_, + rd = from_pandas(df, 2) + r = rd.select_dtypes(include=include, exclude=exclude) + + e = df[["test2", "test3", "test6"]] + assert(ray_df_equals_pandas(r, e)) + + try: + rdf.DataFrame().select_dtypes() + assert(False) + except ValueError: + assert(True) def test_sem(): @@ -2604,11 +2871,12 @@ def test_shift(): ray_df.shift() -def test_skew(): - ray_df = create_test_dataframe() - - with pytest.raises(NotImplementedError): - ray_df.skew() +@pytest.fixture +def test_skew(ray_df, pandas_df): + assert(ray_df_equals_pandas(ray_df.skew(), + pandas_df.skew())) + assert(ray_df_equals_pandas(ray_df.skew(axis=1), + pandas_df.skew(axis=1))) def test_slice_shift(): @@ -2619,17 +2887,43 @@ def test_slice_shift(): def test_sort_index(): - ray_df = create_test_dataframe() + pandas_df = pd.DataFrame(np.random.randint(0, 100, size=(1000, 100))) + ray_df = rdf.DataFrame(pandas_df) - with pytest.raises(NotImplementedError): - ray_df.sort_index() + pandas_result = pandas_df.sort_index() 
+ ray_result = ray_df.sort_index() + + ray_df_equals_pandas(ray_result, pandas_result) + + pandas_result = pandas_df.sort_index(ascending=False) + ray_result = ray_df.sort_index(ascending=False) + + ray_df_equals_pandas(ray_result, pandas_result) def test_sort_values(): - ray_df = create_test_dataframe() + pandas_df = pd.DataFrame(np.random.randint(0, 100, size=(1000, 100))) + ray_df = rdf.DataFrame(pandas_df) - with pytest.raises(NotImplementedError): - ray_df.sort_values(None) + pandas_result = pandas_df.sort_values(by=1) + ray_result = ray_df.sort_values(by=1) + + ray_df_equals_pandas(ray_result, pandas_result) + + pandas_result = pandas_df.sort_values(by=1, axis=1) + ray_result = ray_df.sort_values(by=1, axis=1) + + ray_df_equals_pandas(ray_result, pandas_result) + + pandas_result = pandas_df.sort_values(by=[1, 3]) + ray_result = ray_df.sort_values(by=[1, 3]) + + ray_df_equals_pandas(ray_result, pandas_result) + + pandas_result = pandas_df.sort_values(by=[1, 67], axis=1) + ray_result = ray_df.sort_values(by=[1, 67], axis=1) + + ray_df_equals_pandas(ray_result, pandas_result) def test_sortlevel(): @@ -2775,10 +3069,21 @@ def test_unstack(): def test_update(): - ray_df = create_test_dataframe() + df = rdf.DataFrame([[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3], + [1.5, np.nan, 3]]) - with pytest.raises(NotImplementedError): - ray_df.update(None) + other = rdf.DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = rdf.DataFrame([[1.5, np.nan, 3], + [3.6, 2, 3], + [1.5, np.nan, 3], + [1.5, np.nan, 7.]]) + assert ray_df_equals(df, expected) @pytest.fixture @@ -2787,10 +3092,36 @@ def test_var(ray_df, pandas_df): def test_where(): - ray_df = create_test_dataframe() + pandas_df = pd.DataFrame(np.random.randn(100, 10), + columns=list('abcdefghij')) + ray_df = rdf.DataFrame(pandas_df) - with pytest.raises(NotImplementedError): - ray_df.where(None) + pandas_cond_df = pandas_df % 5 < 2 + ray_cond_df = ray_df % 5 < 2 + + pandas_result = pandas_df.where(pandas_cond_df, -pandas_df) + ray_result = ray_df.where(ray_cond_df, -ray_df) + + assert ray_df_equals_pandas(ray_result, pandas_result) + + other = pandas_df.loc[3] + + pandas_result = pandas_df.where(pandas_cond_df, other, axis=1) + ray_result = ray_df.where(ray_cond_df, other, axis=1) + + assert ray_df_equals_pandas(ray_result, pandas_result) + + other = pandas_df['e'] + + pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) + ray_result = ray_df.where(ray_cond_df, other, axis=0) + + assert ray_df_equals_pandas(ray_result, pandas_result) + + pandas_result = pandas_df.where(pandas_df < 2, True) + ray_result = ray_df.where(ray_df < 2, True) + + assert ray_df_equals_pandas(ray_result, pandas_result) def test_xs(): @@ -3058,3 +3389,25 @@ def test__doc__(): pd_obj = getattr(pd.DataFrame, attr, None) if callable(pd_obj) or isinstance(pd_obj, property): assert obj.__doc__ == pd_obj.__doc__ + + +def test_to_datetime(): + ray_df = rdf.DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + pd_df = pd.DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + + rdf.to_datetime(ray_df).equals(pd.to_datetime(pd_df)) + + +def test_get_dummies(): + ray_df = rdf.DataFrame({'A': ['a', 'b', 'a'], + 'B': ['b', 'a', 'c'], + 'C': [1, 2, 3]}) + pd_df = pd.DataFrame({'A': ['a', 'b', 'a'], + 'B': ['b', 'a', 'c'], + 'C': [1, 2, 3]}) + + ray_df_equals_pandas(rdf.get_dummies(ray_df), pd.get_dummies(pd_df)) diff --git a/python/ray/dataframe/utils.py 
b/python/ray/dataframe/utils.py index 97c166d09413..26a97c2af885 100644 --- a/python/ray/dataframe/utils.py +++ b/python/ray/dataframe/utils.py @@ -9,6 +9,29 @@ from . import get_npartitions +_NAN_BLOCKS = dict() + + +def _get_nan_block_id(n_row=1, n_col=1, transpose=False): + """A memory-efficient way to get a block of NaNs. + + Args: + n_row(int): number of rows + n_col(int): number of columns + transpose(bool): if True, swap rows and columns + Returns: + ObjectID of the NaN block + """ + global _NAN_BLOCKS + if transpose: + n_row, n_col = n_col, n_row + shape = (n_row, n_col) + if shape not in _NAN_BLOCKS: + arr = np.tile(np.array(np.NaN), shape) + _NAN_BLOCKS[shape] = ray.put(pd.DataFrame(data=arr)) + return _NAN_BLOCKS[shape] + + def _get_lengths(df): """Gets the length of the dataframe. Args: @@ -72,9 +95,11 @@ def _partition_pandas_dataframe(df, num_partitions=None, row_chunksize=None): row_partitions.append(top) temp_df = temp_df[row_chunksize:] else: - if len(df) > row_chunksize: - temp_df.reset_index(drop=True, inplace=True) - temp_df.columns = pd.RangeIndex(0, len(temp_df.columns)) + # Handle the last chunk correctly. + # This call is necessary to prevent modifying the original df. + temp_df = temp_df[:] + temp_df.reset_index(drop=True, inplace=True) + temp_df.columns = pd.RangeIndex(0, len(temp_df.columns)) row_partitions.append(ray.put(temp_df)) return row_partitions @@ -107,16 +132,64 @@ def to_pandas(df): Returns: A new pandas DataFrame. """ - if df._row_partitions is not None: - pd_df = pd.concat(ray.get(df._row_partitions)) - else: - pd_df = pd.concat(ray.get(df._col_partitions), - axis=1) + pd_df = pd.concat(ray.get(df._row_partitions), copy=False) pd_df.index = df.index pd_df.columns = df.columns return pd_df + +@ray.remote +def extractor(df_chunk, row_loc, col_loc): + """Retrieve an item from a remote block. + """ + # We currently have to do the writable flag trick because of a pandas bug + # https://github.com/pandas-dev/pandas/issues/17192 + try: + row_loc.flags.writeable = True + col_loc.flags.writeable = True + except AttributeError: + # Locators might be scalars or Python lists + pass + return df_chunk.iloc[row_loc, col_loc] + + +@ray.remote +def writer(df_chunk, row_loc, col_loc, item): + """Make a copy of the block and write the new item to it. + """ + df_chunk = df_chunk.copy() + df_chunk.iloc[row_loc, col_loc] = item + return df_chunk + + +def _mask_block_partitions(blk_partitions, row_metadata, col_metadata): + """Return the squeezed/expanded block partitions as defined by + row_metadata and col_metadata. + + Note: + Very naive implementation. Extracts one scalar at a time in a double + for loop. + """ + col_df = col_metadata._coord_df + row_df = row_metadata._coord_df + + result_oids = [] + shape = (len(row_df.index), len(col_df.index)) + + for _, row_partition_data in row_df.iterrows(): + for _, col_partition_data in col_df.iterrows(): + row_part = row_partition_data.partition + col_part = col_partition_data.partition + block_oid = blk_partitions[row_part, col_part] + + row_idx = row_partition_data['index_within_partition'] + col_idx = col_partition_data['index_within_partition'] + + result_oid = extractor.remote(block_oid, [row_idx], [col_idx]) + result_oids.append(result_oid) + return np.array(result_oids).reshape(shape) + + @ray.remote def _deploy_func(func, dataframe, *args): """Deploys a function for the _map_partitions call. @@ -140,7 +213,7 @@ def _map_partitions(func, partitions, *argslists): partitions ([ObjectID]): The list of partitions to map func on.
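# --- Editorial aside, not part of the patch: a minimal sketch of the caching
# --- idea behind _get_nan_block_id above. Each distinct shape is materialized
# --- and put into the object store once; later callers reuse the cached
# --- ObjectID, so partitions can all reference a single NaN block. Names here
# --- are illustrative only and the snippet assumes ray.init() has been called.
import numpy as np
import pandas as pd
import ray

_NAN_CACHE = {}


def nan_block_id(n_row=1, n_col=1):
    # Put each shape into the object store once, then hand back the same ID.
    shape = (n_row, n_col)
    if shape not in _NAN_CACHE:
        _NAN_CACHE[shape] = ray.put(pd.DataFrame(np.full(shape, np.nan)))
    return _NAN_CACHE[shape]


# Both calls return the identical ObjectID; no second block is stored.
assert nan_block_id(2, 3) == nan_block_id(2, 3)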
Returns: - A new Dataframe containing the result of the function + A list of partitions ([ObjectID]) with the result of the function """ if partitions is None: return None @@ -157,34 +230,32 @@ def _map_partitions(func, partitions, *argslists): for part, args in zip(partitions, *argslists)] -@ray.remote(num_return_vals=2) -def _build_columns(df_col, columns): - """Build columns and compute lengths for each partition.""" - # Columns and width +@ray.remote +def _build_col_widths(df_col): + """Compute widths (# of columns) for each partition.""" widths = np.array(ray.get([_deploy_func.remote(_get_widths, d) for d in df_col])) - dest_indices = [(p_idx, p_sub_idx) for p_idx in range(len(widths)) - for p_sub_idx in range(widths[p_idx])] - - col_names = ("partition", "index_within_partition") - column_df = pd.DataFrame(dest_indices, index=columns, columns=col_names) - return widths, column_df + return widths -@ray.remote(num_return_vals=2) -def _build_index(df_row, index): - """Build index and compute lengths for each partition.""" - # Rows and length +@ray.remote +def _build_row_lengths(df_row): + """Compute lengths (# of rows) for each partition.""" lengths = np.array(ray.get([_deploy_func.remote(_get_lengths, d) for d in df_row])) - dest_indices = [(p_idx, p_sub_idx) for p_idx in range(len(lengths)) - for p_sub_idx in range(lengths[p_idx])] - col_names = ("partition", "index_within_partition") - index_df = pd.DataFrame(dest_indices, index=index, columns=col_names) + return lengths + + +@ray.remote +def _build_coord_df(lengths, index): + """Build the coordinate dataframe over all partitions.""" + coords = np.vstack([np.column_stack((np.full(l, i), np.arange(l))) + for i, l in enumerate(lengths)]) - return lengths, index_df + col_names = ("partition", "index_within_partition") + return pd.DataFrame(coords, index=index, columns=col_names) def _create_block_partitions(partitions, axis=0, length=None): @@ -233,8 +304,11 @@ def create_blocks_helper(df, npartitions, axis): @ray.remote def _blocks_to_col(*partition): - return pd.concat(partition, axis=0, copy=False)\ - .reset_index(drop=True) + if len(partition): + return pd.concat(partition, axis=0, copy=False)\ + .reset_index(drop=True) + else: + return pd.Series() @ray.remote @@ -313,7 +387,8 @@ def _reindex_helper(old_index, new_index, axis, npartitions, *df): @ray.remote -def _co_op_helper(func, left_columns, right_columns, left_df_len, *zipped): +def _co_op_helper(func, left_columns, right_columns, left_df_len, left_idx, + *zipped): """Copartition operation where two DataFrames must have aligned indexes. NOTE: This function assumes things are already copartitioned. Requires that @@ -330,11 +405,64 @@ def _co_op_helper(func, left_columns, right_columns, left_df_len, *zipped): Returns: A new set of blocks for the partitioned DataFrame. 
""" - left = pd.concat(zipped[:left_df_len], axis=1, copy=False) + left = pd.concat(zipped[:left_df_len], axis=1, copy=False).copy() left.columns = left_columns + if left_idx is not None: + left.index = left_idx - right = pd.concat(zipped[left_df_len:], axis=1, copy=False) + right = pd.concat(zipped[left_df_len:], axis=1, copy=False).copy() right.columns = right_columns new_rows = func(left, right) - return create_blocks_helper(new_rows, left_df_len, 0) + + new_blocks = create_blocks_helper(new_rows, left_df_len, 0) + + if left_idx is not None: + new_blocks.append(new_rows.index) + + return new_blocks + + +@ray.remote +def _match_partitioning(column_partition, lengths, index): + """Match the number of rows on each partition. Used in df.merge(). + + Args: + column_partition: The column partition to change. + lengths: The lengths of each row partition to match to. + index: The index index of the column_partition. This is used to push + down to the inner frame for correctness in the merge. + + Returns: + A list of blocks created from this column partition. + """ + partitioned_list = [] + + columns = column_partition.columns + # We set this because this is the only place we can guarantee correct + # placement. We use it in the case the user wants to join on the index. + column_partition.index = index + for length in lengths: + if len(column_partition) == 0: + partitioned_list.append(pd.DataFrame(columns=columns)) + continue + + partitioned_list.append(column_partition.iloc[:length, :]) + column_partition = column_partition.iloc[length:, :] + return partitioned_list + + +@ray.remote +def _concat_index(*index_parts): + return index_parts[0].append(index_parts[1:]) + + +@ray.remote +def _correct_column_dtypes(*column): + """Corrects dtypes of a column by concatenating column partitions and + splitting the column back into partitions. + + Args: + """ + concat_column = pd.concat(column, copy=False) + return create_blocks_helper(concat_column, len(column), 1) diff --git a/python/ray/experimental/state.py b/python/ray/experimental/state.py index 964b0f71e290..0cea70b9ecee 100644 --- a/python/ray/experimental/state.py +++ b/python/ray/experimental/state.py @@ -184,8 +184,8 @@ def _object_table(self, object_id): A dictionary with information about the object ID in question. """ # Allow the argument to be either an ObjectID or a hex string. - if not isinstance(object_id, ray.local_scheduler.ObjectID): - object_id = ray.local_scheduler.ObjectID(hex_to_binary(object_id)) + if not isinstance(object_id, ray.ObjectID): + object_id = ray.ObjectID(hex_to_binary(object_id)) # Return information about a single object ID. 
object_locations = self._execute_command(object_id, @@ -297,7 +297,7 @@ def _task_table(self, task_id): TaskExecutionDependencies.GetRootAsTaskExecutionDependencies( task_table_message.ExecutionDependencies(), 0)) execution_dependencies = [ - ray.local_scheduler.ObjectID( + ray.ObjectID( execution_dependencies_message.ExecutionDependencies(i)) for i in range( execution_dependencies_message.ExecutionDependenciesLength()) @@ -335,7 +335,7 @@ def task_table(self, task_id=None): """ self._check_connected() if task_id is not None: - task_id = ray.local_scheduler.ObjectID(hex_to_binary(task_id)) + task_id = ray.ObjectID(hex_to_binary(task_id)) return self._task_table(task_id) else: task_table_keys = self._keys(TASK_PREFIX + "*") @@ -343,7 +343,7 @@ def task_table(self, task_id=None): for key in task_table_keys: task_id_binary = key[len(TASK_PREFIX):] results[binary_to_hex(task_id_binary)] = self._task_table( - ray.local_scheduler.ObjectID(task_id_binary)) + ray.ObjectID(task_id_binary)) return results def function_table(self, function_id=None): @@ -628,8 +628,7 @@ def micros_rel(ts): # modify it in place since we will use the original values later. total_info = copy.copy(task_table[task_id]["TaskSpec"]) total_info["Args"] = [ - oid.hex() - if isinstance(oid, ray.local_scheduler.ObjectID) else oid + oid.hex() if isinstance(oid, ray.ObjectID) else oid for oid in task_t_info["TaskSpec"]["Args"] ] total_info["ReturnObjectIDs"] = [ @@ -855,7 +854,7 @@ def micros_rel(ts): args = task_table[task_id]["TaskSpec"]["Args"] for arg in args: # Don't visualize arguments that are not object IDs. - if isinstance(arg, ray.local_scheduler.ObjectID): + if isinstance(arg, ray.ObjectID): object_info = self._object_table(arg) # Don't visualize objects that were created by calls to # put. diff --git a/python/ray/experimental/tfutils.py b/python/ray/experimental/tfutils.py index 10d5fb4bc308..fc33900b8a27 100644 --- a/python/ray/experimental/tfutils.py +++ b/python/ray/experimental/tfutils.py @@ -49,7 +49,7 @@ def __init__(self, loss, sess=None, input_variables=None): self.sess = sess queue = deque([loss]) variable_names = [] - explored_inputs = set([loss]) + explored_inputs = {loss} # We do a BFS on the dependency graph of the input function to find # the variables. diff --git a/python/ray/plasma/test/test.py b/python/ray/plasma/test/test.py index 8b0d62fe1d2a..0ec424d6a71f 100644 --- a/python/ray/plasma/test/test.py +++ b/python/ray/plasma/test/test.py @@ -297,7 +297,7 @@ def test_wait(self): self.client1.seal(obj_id1) ready, waiting = self.client1.wait( [obj_id1], timeout=100, num_returns=1) - self.assertEqual(set(ready), set([obj_id1])) + self.assertEqual(set(ready), {obj_id1}) self.assertEqual(waiting, []) # Test wait if only one object available and only one object waited @@ -307,8 +307,8 @@ def test_wait(self): # Don't seal. ready, waiting = self.client1.wait( [obj_id2, obj_id1], timeout=100, num_returns=1) - self.assertEqual(set(ready), set([obj_id1])) - self.assertEqual(set(waiting), set([obj_id2])) + self.assertEqual(set(ready), {obj_id1}) + self.assertEqual(set(waiting), {obj_id2}) # Test wait if object is sealed later. 
obj_id3 = random_object_id() @@ -321,14 +321,14 @@ def finish(): t.start() ready, waiting = self.client1.wait( [obj_id3, obj_id2, obj_id1], timeout=1000, num_returns=2) - self.assertEqual(set(ready), set([obj_id1, obj_id3])) - self.assertEqual(set(waiting), set([obj_id2])) + self.assertEqual(set(ready), {obj_id1, obj_id3}) + self.assertEqual(set(waiting), {obj_id2}) # Test if the appropriate number of objects is shown if some objects # are not ready. ready, waiting = self.client1.wait([obj_id3, obj_id2, obj_id1], 100, 3) - self.assertEqual(set(ready), set([obj_id1, obj_id3])) - self.assertEqual(set(waiting), set([obj_id2])) + self.assertEqual(set(ready), {obj_id1, obj_id3}) + self.assertEqual(set(waiting), {obj_id2}) # Don't forget to seal obj_id2. self.client1.seal(obj_id2) diff --git a/python/ray/ray_constants.py b/python/ray/ray_constants.py index 4f337ad41e60..7e5df96500e5 100644 --- a/python/ray/ray_constants.py +++ b/python/ray/ray_constants.py @@ -3,19 +3,30 @@ from __future__ import print_function """Ray constants used in the Python code.""" +import os + + +def env_integer(key, default): + if key in os.environ: + return int(os.environ[key]) + return default + + # Abort autoscaling if more than this number of errors are encountered. This # is a safety feature to prevent e.g. runaway node launches. -AUTOSCALER_MAX_NUM_FAILURES = 5 +AUTOSCALER_MAX_NUM_FAILURES = env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5) # Max number of nodes to launch at a time. -AUTOSCALER_MAX_CONCURRENT_LAUNCHES = 10 +AUTOSCALER_MAX_CONCURRENT_LAUNCHES = env_integer( + "AUTOSCALER_MAX_CONCURRENT_LAUNCHES", 10) # Interval at which to perform autoscaling updates. -AUTOSCALER_UPDATE_INTERVAL_S = 5 +AUTOSCALER_UPDATE_INTERVAL_S = env_integer("AUTOSCALER_UPDATE_INTERVAL_S", 5) # The autoscaler will attempt to restart Ray on nodes it hasn't heard from # in more than this interval. -AUTOSCALER_HEARTBEAT_TIMEOUT_S = 30 +AUTOSCALER_HEARTBEAT_TIMEOUT_S = env_integer("AUTOSCALER_HEARTBEAT_TIMEOUT_S", + 30) # Max number of retries to AWS (default is 5, time increases exponentially) -BOTO_MAX_RETRIES = 12 +BOTO_MAX_RETRIES = env_integer("BOTO_MAX_RETRIES", 12) diff --git a/python/ray/remote_function.py b/python/ray/remote_function.py new file mode 100644 index 000000000000..4e09e4016b1d --- /dev/null +++ b/python/ray/remote_function.py @@ -0,0 +1,158 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import hashlib +import inspect + +import ray.signature + +# Default parameters for remote functions. +DEFAULT_REMOTE_FUNCTION_CPUS = 1 +DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS = 1 +DEFAULT_REMOTE_FUNCTION_MAX_CALLS = 0 + + +def in_ipython(): + """Return true if we are in an IPython interpreter and false otherwise.""" + try: + __IPYTHON__ + return True + except NameError: + return False + + +def compute_function_id(function): + """Compute a function ID for a function. + + Args: + function: The actual function. + + Returns: + The ID of the function. + """ + function_id_hash = hashlib.sha1() + # Include the function module and name in the hash. + function_id_hash.update(function.__module__.encode("ascii")) + function_id_hash.update(function.__name__.encode("ascii")) + # If we are running a script or are in IPython, include the source code in + # the hash. If we are in a regular Python interpreter we skip this part + # because the source code is not accessible. If the function is a built-in + # (e.g., Cython), the source code is not accessible.
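# --- Editorial aside, not part of the patch: the intended behavior of the
# --- env_integer helper added to ray_constants.py in the hunk above. Each
# --- constant keeps its default unless an integer-valued environment variable
# --- with the same name is set. Minimal self-contained sketch; it re-declares
# --- the helper with the corrected os.environ[key] lookup.
import os


def env_integer(key, default):
    if key in os.environ:
        return int(os.environ[key])
    return default


os.environ["AUTOSCALER_MAX_NUM_FAILURES"] = "2"
assert env_integer("AUTOSCALER_MAX_NUM_FAILURES", 5) == 2  # env override wins
assert env_integer("SOME_UNSET_CONSTANT", 7) == 7          # default is kept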
+ import __main__ as main + if (hasattr(main, "__file__") or in_ipython()) \ + and inspect.isfunction(function): + function_id_hash.update(inspect.getsource(function).encode("ascii")) + # Compute the function ID. + function_id = function_id_hash.digest() + assert len(function_id) == 20 + function_id = ray.ObjectID(function_id) + + return function_id + + +class RemoteFunction(object): + """A remote function. + + This is a decorated function. It can be used to spawn tasks. + + Attributes: + _function: The original function. + _function_id: The ID of the function. + _function_name: The module and function name. + _num_cpus: The default number of CPUs to use for invocations of this + remote function. + _num_gpus: The default number of GPUs to use for invocations of this + remote function. + _resources: The default custom resource requirements for invocations of + this remote function. + _num_return_vals: The default number of return values for invocations + of this remote function. + _max_calls: The number of times a worker can execute this function + before executing. + _function_signature: The function signature. + """ + + def __init__(self, function, num_cpus, num_gpus, resources, + num_return_vals, max_calls): + self._function = function + # TODO(rkn): We store the function ID as a string, so that + # RemoteFunction objects can be pickled. We should undo this when + # we allow ObjectIDs to be pickled. + self._function_id = compute_function_id(self._function).id() + self._function_name = ( + self._function.__module__ + '.' + self._function.__name__) + self._num_cpus = (DEFAULT_REMOTE_FUNCTION_CPUS + if num_cpus is None else num_cpus) + self._num_gpus = num_gpus + self._resources = resources + self._num_return_vals = (DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS if + num_return_vals is None else num_return_vals) + self._max_calls = (DEFAULT_REMOTE_FUNCTION_MAX_CALLS + if max_calls is None else max_calls) + + ray.signature.check_signature_supported(self._function) + self._function_signature = ray.signature.extract_signature( + self._function) + + # # Export the function. + worker = ray.worker.get_global_worker() + if worker.mode in [ray.worker.SCRIPT_MODE, ray.worker.SILENT_MODE]: + self._export() + elif worker.mode is None: + worker.cached_remote_functions_and_actors.append( + ("remote_function", self)) + + def __call__(self, *args, **kwargs): + raise Exception("Remote functions cannot be called directly. Instead " + "of running '{}()', try '{}.remote()'.".format( + self._function_name, self._function_name)) + + def remote(self, *args, **kwargs): + """This runs immediately when a remote function is called.""" + return self._submit(args=args, kwargs=kwargs) + + def _submit(self, + args=None, + kwargs=None, + num_return_vals=None, + num_cpus=None, + num_gpus=None, + resources=None): + """An experimental alternate way to submit remote functions.""" + worker = ray.worker.get_global_worker() + worker.check_connected() + ray.worker.check_main_thread() + kwargs = {} if kwargs is None else kwargs + args = ray.signature.extend_args(self._function_signature, args, + kwargs) + + if num_return_vals is None: + num_return_vals = self._num_return_vals + + resources = ray.utils.resources_from_resource_arguments( + self._num_cpus, self._num_gpus, self._resources, num_cpus, + num_gpus, resources) + if worker.mode == ray.worker.PYTHON_MODE: + # In PYTHON_MODE, remote calls simply execute the function. 
+ # We copy the arguments to prevent the function call from + # mutating them and to match the usual behavior of + # immutable remote objects. + result = self._function(*copy.deepcopy(args)) + return result + object_ids = worker.submit_task( + ray.ObjectID(self._function_id), + args, + num_return_vals=num_return_vals, + resources=resources) + if len(object_ids) == 1: + return object_ids[0] + elif len(object_ids) > 1: + return object_ids + + def _export(self): + worker = ray.worker.get_global_worker() + worker.export_remote_function( + ray.ObjectID(self._function_id), self._function_name, + self._function, self._max_calls, self) diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index fa10db9c918c..15f5aa187bc9 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -126,7 +126,7 @@ def _fetch_metrics_from_remote_evaluators(self): def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 for ev in self.remote_evaluators: - ev.__ray_terminate__.remote(ev._ray_actor_id.id()) + ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): checkpoint_path = os.path.join( diff --git a/python/ray/rllib/a3c/shared_torch_policy.py b/python/ray/rllib/a3c/shared_torch_policy.py index 59b7a2577008..36b39dcfc7bf 100644 --- a/python/ray/rllib/a3c/shared_torch_policy.py +++ b/python/ray/rllib/a3c/shared_torch_policy.py @@ -17,9 +17,9 @@ class SharedTorchPolicy(TorchPolicy): other_output = ["vf_preds"] is_recurrent = False - def __init__(self, ob_space, ac_space, config, **kwargs): + def __init__(self, registry, ob_space, ac_space, config, **kwargs): super(SharedTorchPolicy, self).__init__( - ob_space, ac_space, config, **kwargs) + registry, ob_space, ac_space, config, **kwargs) def _setup_graph(self, ob_space, ac_space): _, self.logit_dim = ModelCatalog.get_action_dist(ac_space) diff --git a/python/ray/rllib/bc/policy.py b/python/ray/rllib/bc/policy.py index 7566422fa154..11178a50d23a 100644 --- a/python/ray/rllib/bc/policy.py +++ b/python/ray/rllib/bc/policy.py @@ -2,8 +2,10 @@ from __future__ import division from __future__ import print_function -import ray import tensorflow as tf +import gym + +import ray from ray.rllib.a3c.policy import Policy from ray.rllib.models.catalog import ModelCatalog @@ -38,7 +40,16 @@ def _setup_graph(self, ob_space, ac_space): tf.get_variable_scope().name) def setup_loss(self, action_space): - self.ac = tf.placeholder(tf.int64, [None], name="ac") + if isinstance(action_space, gym.spaces.Box): + self.ac = tf.placeholder(tf.float32, + [None] + list(action_space.shape), + name="ac") + elif isinstance(action_space, gym.spaces.Discrete): + self.ac = tf.placeholder(tf.int64, [None], name="ac") + else: + raise NotImplementedError( + "action space" + str(type(action_space)) + + "currently not supported") log_prob = self.curr_dist.logp(self.ac) self.pi_loss = - tf.reduce_sum(log_prob) self.loss = self.pi_loss diff --git a/python/ray/rllib/ddpg/ddpg.py b/python/ray/rllib/ddpg/ddpg.py index 343b323948b3..ae5e65e5bc3b 100644 --- a/python/ray/rllib/ddpg/ddpg.py +++ b/python/ray/rllib/ddpg/ddpg.py @@ -234,7 +234,7 @@ def _train_stats(self, start_timestep): def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 for ev in self.remote_evaluators: - ev.__ray_terminate__.remote(ev._ray_actor_id.id()) + ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): checkpoint_path = self.saver.save( diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index cd7e85847a93..9a0da4e80704 
100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -232,7 +232,7 @@ def _train_stats(self, start_timestep): def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 for ev in self.remote_evaluators: - ev.__ray_terminate__.remote(ev._ray_actor_id.id()) + ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): checkpoint_path = self.saver.save( diff --git a/python/ray/rllib/es/es.py b/python/ray/rllib/es/es.py index ca1bf4da69fe..f5ea4fa373ed 100644 --- a/python/ray/rllib/es/es.py +++ b/python/ray/rllib/es/es.py @@ -311,7 +311,7 @@ def _train(self): def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 for w in self.workers: - w.__ray_terminate__.remote(w._ray_actor_id.id()) + w.__ray_terminate__.remote() def _save(self, checkpoint_dir): checkpoint_path = os.path.join( diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 03e88bd1fc5e..14b058488b7e 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -43,21 +43,21 @@ def logp(self, x): def entropy(self): a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1], - keep_dims=True) + keepdims=True) ea0 = tf.exp(a0) - z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True) + z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1]) def kl(self, other): a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1], - keep_dims=True) + keepdims=True) a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1], - keep_dims=True) + keepdims=True) ea0 = tf.exp(a0) ea1 = tf.exp(a1) - z0 = tf.reduce_sum(ea0, reduction_indices=[1], keep_dims=True) - z1 = tf.reduce_sum(ea1, reduction_indices=[1], keep_dims=True) + z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True) + z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True) p0 = ea0 / z0 return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1]) diff --git a/python/ray/rllib/optimizers/apex_optimizer.py b/python/ray/rllib/optimizers/apex_optimizer.py index ded738f622fc..e113213b4940 100644 --- a/python/ray/rllib/optimizers/apex_optimizer.py +++ b/python/ray/rllib/optimizers/apex_optimizer.py @@ -7,12 +7,12 @@ from __future__ import print_function import os -import queue import random import time import threading import numpy as np +from six.moves import queue import ray from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer @@ -22,6 +22,7 @@ from ray.rllib.utils.timer import TimerStat from ray.rllib.utils.window_stat import WindowStat + SAMPLE_QUEUE_DEPTH = 2 REPLAY_QUEUE_DEPTH = 4 LEARNER_QUEUE_MAX_SIZE = 16 diff --git a/python/ray/rllib/ppo/ppo.py b/python/ray/rllib/ppo/ppo.py index 8f550d318f43..a8c695033e9a 100644 --- a/python/ray/rllib/ppo/ppo.py +++ b/python/ray/rllib/ppo/ppo.py @@ -269,7 +269,7 @@ def _fetch_metrics_from_remote_evaluators(self): def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 for ev in self.remote_evaluators: - ev.__ray_terminate__.remote(ev._ray_actor_id.id()) + ev.__ray_terminate__.remote() def _save(self, checkpoint_dir): checkpoint_path = self.saver.save( diff --git a/python/ray/rllib/utils/actors.py b/python/ray/rllib/utils/actors.py index d42114cb0011..a7f604bc2098 100644 --- a/python/ray/rllib/utils/actors.py +++ b/python/ray/rllib/utils/actors.py @@ -30,7 +30,7 @@ def count(self): def 
drop_colocated(actors): colocated, non_colocated = split_colocated(actors) for a in colocated: - a.__ray_terminate__.remote(a._ray_actor_id.id()) + a.__ray_terminate__.remote() return non_colocated diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index d06d4606890c..530de154be2c 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -4,6 +4,7 @@ import click import json +import os import subprocess import ray.services as services @@ -144,7 +145,7 @@ def cli(): @click.option( "--use-raylet", is_flag=True, - default=False, + default=None, help="use the raylet code path, this is not supported yet") def start(node_ip_address, redis_address, redis_port, num_redis_shards, redis_max_clients, redis_shard_ports, object_manager_port, @@ -157,6 +158,11 @@ def start(node_ip_address, redis_address, redis_port, num_redis_shards, if redis_address is not None: redis_address = services.address_to_ip(redis_address) + if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1": + # This environment variable is used in our testing setup. + print("Detected environment variable 'RAY_USE_XRAY'.") + use_raylet = True + try: resources = json.loads(resources) except Exception as e: diff --git a/python/ray/tune/registry.py b/python/ray/tune/registry.py index f17267eaadd4..cb9505771e63 100644 --- a/python/ray/tune/registry.py +++ b/python/ray/tune/registry.py @@ -23,7 +23,7 @@ def register_trainable(name, trainable): Args: name (str): Name to register. - trainable (obj): Function or tune.Trainable clsas. Functions must + trainable (obj): Function or tune.Trainable class. Functions must take (config, status_reporter) as arguments and will be automatically converted into a class during registration. """ diff --git a/python/ray/tune/test/trial_runner_test.py b/python/ray/tune/test/trial_runner_test.py index d51f9ec6f988..2eba2693d107 100644 --- a/python/ray/tune/test/trial_runner_test.py +++ b/python/ray/tune/test/trial_runner_test.py @@ -161,6 +161,26 @@ def train(config, reporter): } }) + def testLogdirStartingWithTilde(self): + local_dir = '~/ray_results/local_dir' + + def train(config, reporter): + cwd = os.getcwd() + assert cwd.startswith(os.path.expanduser(local_dir)), cwd + assert not cwd.startswith('~'), cwd + reporter(timesteps_total=1) + + register_trainable('f1', train) + run_experiments({ + 'foo': { + 'run': 'f1', + 'local_dir': local_dir, + 'config': { + 'a': 'b' + }, + } + }) + def testLongFilename(self): def train(config, reporter): assert "/tmp/logdir/foo" in os.getcwd(), os.getcwd() diff --git a/python/ray/tune/test/trial_scheduler_test.py b/python/ray/tune/test/trial_scheduler_test.py index b008af3c7d6a..a15448db79c9 100644 --- a/python/ray/tune/test/trial_scheduler_test.py +++ b/python/ray/tune/test/trial_scheduler_test.py @@ -688,36 +688,36 @@ def assertProduces(fn, values): # Categorical case assertProduces( lambda: explore({"v": 4}, {"v": [3, 4, 8, 10]}, 0.0, lambda x: x), - set([3, 8])) + {3, 8}) assertProduces( lambda: explore({"v": 3}, {"v": [3, 4, 8, 10]}, 0.0, lambda x: x), - set([3, 4])) + {3, 4}) assertProduces( lambda: explore({"v": 10}, {"v": [3, 4, 8, 10]}, 0.0, lambda x: x), - set([8, 10])) + {8, 10}) assertProduces( lambda: explore({"v": 7}, {"v": [3, 4, 8, 10]}, 0.0, lambda x: x), - set([3, 4, 8, 10])) + {3, 4, 8, 10}) assertProduces( lambda: explore({"v": 4}, {"v": [3, 4, 8, 10]}, 1.0, lambda x: x), - set([3, 4, 8, 10])) + {3, 4, 8, 10}) # Continuous case assertProduces( lambda: explore( {"v": 100}, {"v": lambda: 
random.choice([10, 100])}, 0.0, lambda x: x), - set([80, 120])) + {80, 120}) assertProduces( lambda: explore( {"v": 100.0}, {"v": lambda: random.choice([10, 100])}, 0.0, lambda x: x), - set([80.0, 120.0])) + {80.0, 120.0}) assertProduces( lambda: explore( {"v": 100.0}, {"v": lambda: random.choice([10, 100])}, 1.0, lambda x: x), - set([10.0, 100.0])) + {10.0, 100.0}) def testYieldsTimeToOtherTrials(self): pbt, runner = self.basicSetup() diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py index 03976300e27e..a801b89f5886 100644 --- a/python/ray/tune/trainable.py +++ b/python/ray/tune/trainable.py @@ -112,7 +112,7 @@ def train(self): Subclasses should override ``_train()`` instead to return results. This method auto-fills many fields, so only ``timesteps_this_iter`` - is requied to be present. + is required to be present. Returns: A TrainingResult that describes training progress. diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 9d12e768ce8d..21e8907e1aa6 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -110,7 +110,7 @@ def __init__(self, # Trial config self.trainable_name = trainable_name self.config = config or {} - self.local_dir = local_dir + self.local_dir = os.path.expanduser(local_dir) self.experiment_tag = experiment_tag self.resources = ( resources @@ -182,9 +182,7 @@ def stop(self, error=False, error_msg=None, stop_logger=True): if self.runner: stop_tasks = [] stop_tasks.append(self.runner.stop.remote()) - stop_tasks.append( - self.runner.__ray_terminate__.remote( - self.runner._ray_actor_id.id())) + stop_tasks.append(self.runner.__ray_terminate__.remote()) # TODO(ekl) seems like wait hangs when killing actors _, unfinished = ray.wait( stop_tasks, num_returns=2, timeout=250) diff --git a/python/ray/tune/trial_runner.py b/python/ray/tune/trial_runner.py index 56474c119495..3f07e7bb51ab 100644 --- a/python/ray/tune/trial_runner.py +++ b/python/ray/tune/trial_runner.py @@ -172,7 +172,7 @@ def debug_string(self, max_debug=MAX_DEBUG_TRIALS): if max_debug == start_num: break - for local_dir in sorted(set([t.local_dir for t in self._trials])): + for local_dir in sorted({t.local_dir for t in self._trials}): messages.append("Result logdir: {}".format(local_dir)) for state, trials in sorted(states.items()): limit = limit_per_state[state] diff --git a/python/ray/utils.py b/python/ray/utils.py index 0ef47daf971f..9fa3a4fe165f 100644 --- a/python/ray/utils.py +++ b/python/ray/utils.py @@ -3,7 +3,6 @@ from __future__ import print_function import binascii -import collections import hashlib import numpy as np import os @@ -125,7 +124,7 @@ def decode(byte_str): def binary_to_object_id(binary_object_id): - return ray.local_scheduler.ObjectID(binary_object_id) + return ray.ObjectID(binary_object_id) def binary_to_hex(identifier): @@ -139,11 +138,6 @@ def hex_to_binary(hex_identifier): return binascii.unhexlify(hex_identifier) -FunctionProperties = collections.namedtuple( - "FunctionProperties", ["num_return_vals", "resources", "max_calls"]) -"""FunctionProperties: A named tuple storing remote functions information.""" - - def get_cuda_visible_devices(): """Get the device IDs in the CUDA_VISIBLE_DEVICES environment variable. @@ -169,3 +163,48 @@ def set_cuda_visible_devices(gpu_ids): gpu_ids: This is a list of integers representing GPU IDs. 
""" os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids]) + + +def resources_from_resource_arguments(default_num_cpus, default_num_gpus, + default_resources, runtime_num_cpus, + runtime_num_gpus, runtime_resources): + """Determine a task's resource requirements. + + Args: + default_num_cpus: The default number of CPUs required by this function + or actor method. + default_num_gpus: The default number of GPUs required by this function + or actor method. + default_resources: The default custom resources required by this + function or actor method. + runtime_num_cpus: The number of CPUs requested when the task was + invoked. + runtime_num_gpus: The number of GPUs requested when the task was + invoked. + runtime_resources: The custom resources requested when the task was + invoked. + + Returns: + A dictionary of the resource requirements for the task. + """ + if runtime_resources is not None: + resources = runtime_resources.copy() + elif default_resources is not None: + resources = default_resources.copy() + else: + resources = {} + + if "CPU" in resources or "GPU" in resources: + raise ValueError("The resources dictionary must not " + "contain the key 'CPU' or 'GPU'") + + assert default_num_cpus is not None + resources["CPU"] = (default_num_cpus + if runtime_num_cpus is None else runtime_num_cpus) + + if runtime_num_gpus is not None: + resources["GPU"] = runtime_num_gpus + elif default_num_gpus is not None: + resources["GPU"] = default_num_gpus + + return resources diff --git a/python/ray/worker.py b/python/ray/worker.py index a12f93a541b1..3b4495d25cfd 100644 --- a/python/ray/worker.py +++ b/python/ray/worker.py @@ -5,7 +5,6 @@ import atexit import collections import colorama -import copy import hashlib import inspect import json @@ -23,13 +22,13 @@ import pyarrow.plasma as plasma import ray.cloudpickle as pickle import ray.experimental.state as state +import ray.remote_function import ray.serialization as serialization import ray.services as services -import ray.signature as signature +import ray.signature import ray.local_scheduler import ray.plasma -from ray.utils import (FunctionProperties, random_string, binary_to_hex, - is_cython) +from ray.utils import random_string, binary_to_hex, is_cython # Import flatbuffer bindings. from ray.core.generated.ClientTableData import ClientTableData @@ -63,9 +62,6 @@ # This must be kept in sync with the `scheduling_state` enum in common/task.h. TASK_STATUS_RUNNING = 8 -# Default resource requirements for remote functions. -DEFAULT_REMOTE_FUNCTION_CPUS = 1 -DEFAULT_REMOTE_FUNCTION_GPUS = 0 # Default resource requirements for actors when no resource requirements are # specified. DEFAULT_ACTOR_METHOD_CPUS_SIMPLE_CASE = 1 @@ -74,15 +70,6 @@ # specified. DEFAULT_ACTOR_METHOD_CPUS_SPECIFIED_CASE = 0 DEFAULT_ACTOR_CREATION_CPUS_SPECIFIED_CASE = 1 -DEFAULT_ACTOR_CREATION_GPUS_SPECIFIED_CASE = 0 - - -class FunctionID(object): - def __init__(self, function_id): - self.function_id = function_id - - def id(self): - return self.function_id class RayTaskError(Exception): @@ -182,6 +169,11 @@ def __str__(self): self.task_error)) +FunctionExecutionInfo = collections.namedtuple( + "FunctionExecutionInfo", ["function", "function_name", "max_calls"]) +"""FunctionExecutionInfo: A named tuple storing remote function information.""" + + class Worker(object): """A class used to define the control flow of a worker process. @@ -190,9 +182,10 @@ class Worker(object): functions outside of this class are considered exposed. 
Attributes: - functions (Dict[str, Callable]): A dictionary mapping the name of a - remote function to the remote function itself. This is the set of - remote functions that can be executed by this worker. + function_execution_info (Dict[str, FunctionExecutionInfo]): A + dictionary mapping the name of a remote function to the remote + function itself. This is the set of remote functions that can be + executed by this worker. connected (bool): True if Ray has been started and False otherwise. mode: The mode of the worker. One of SCRIPT_MODE, PYTHON_MODE, SILENT_MODE, and WORKER_MODE. @@ -208,20 +201,12 @@ class Worker(object): def __init__(self): """Initialize a Worker object.""" - # The functions field is a dictionary that maps a driver ID to a - # dictionary of functions that have been registered for that driver - # (this inner dictionary maps function IDs to a tuple of the function - # name and the function itself). This should only be used on workers - # that execute remote functions. - self.functions = collections.defaultdict(lambda: {}) - # The function_properties field is a dictionary that maps a driver ID - # to a dictionary of functions that have been registered for that - # driver (this inner dictionary maps function IDs to a tuple of the - # number of values returned by that function, the number of CPUs - # required by that function, and the number of GPUs required by that - # function). This is used when submitting a function (which can be done - # both on workers and on drivers). - self.function_properties = collections.defaultdict(lambda: {}) + # This field is a dictionary that maps a driver ID to a dictionary of + # functions (and information about those functions) that have been + # registered for that driver (this inner dictionary maps function IDs + # to a FunctionExecutionInfo object. This should only be used on + # workers that execute remote functions. + self.function_execution_info = collections.defaultdict(lambda: {}) # This is a dictionary mapping driver ID to a dictionary that maps # remote function IDs for that driver to a counter of the number of # times that remote function has been executed on this worker. The @@ -248,6 +233,16 @@ def __init__(self): # CUDA_VISIBLE_DEVICES environment variable. self.original_gpu_ids = ray.utils.get_cuda_visible_devices() + def check_connected(self): + """Check if the worker is connected. + + Raises: + Exception: An exception is raised if the worker is not connected. + """ + if not self.connected: + raise RayConnectionError("Ray has not been started yet. You can " + "start Ray with 'ray.init()'.") + def set_mode(self, mode): """Set the mode of the worker. @@ -267,7 +262,7 @@ def set_mode(self, mode): print any information about errors because some of the tests intentionally fail. - args: + Args: mode: One of SCRIPT_MODE, WORKER_MODE, PYTHON_MODE, and SILENT_MODE. """ @@ -356,18 +351,13 @@ def put_object(self, object_id, value): full. """ # Make sure that the value is not an object ID. - if isinstance(value, ray.local_scheduler.ObjectID): + if isinstance(value, ray.ObjectID): raise Exception("Calling 'put' on an ObjectID is not allowed " "(similarly, returning an ObjectID from a remote " "function is not allowed). 
If you really want to " "do this, you can wrap the ObjectID in a list and " "call 'put' on it (or return it).") - if isinstance(value, ray.actor.ActorHandleParent): - raise Exception("Calling 'put' on an actor handle is currently " - "not allowed (similarly, returning an actor " - "handle from a remote function is not allowed).") - # Serialize and put the object in the object store. try: self.store_and_register(object_id, value) @@ -443,7 +433,7 @@ def get_object(self, object_ids): """ # Make sure that the values are object IDs. for object_id in object_ids: - if not isinstance(object_id, ray.local_scheduler.ObjectID): + if not isinstance(object_id, ray.ObjectID): raise Exception("Attempting to call `get` on the value {}, " "which is not an ObjectID.".format(object_id)) # Do an initial fetch for remote objects. We divide the fetch into @@ -464,9 +454,11 @@ def get_object(self, object_ids): final_results = self.retrieve_and_deserialize(plain_object_ids, 0) # Construct a dictionary mapping object IDs that we haven't gotten yet # to their original index in the object_ids argument. - unready_ids = dict((plain_object_ids[i].binary(), i) - for (i, val) in enumerate(final_results) - if val is plasma.ObjectNotAvailable) + unready_ids = { + plain_object_ids[i].binary(): i + for (i, val) in enumerate(final_results) + if val is plasma.ObjectNotAvailable + } was_blocked = (len(unready_ids) > 0) # Try reconstructing any objects we haven't gotten yet. Try to get them # until at least get_timeout_milliseconds milliseconds passes, then @@ -521,9 +513,8 @@ def submit_task(self, actor_creation_dummy_object_id=None, execution_dependencies=None, num_return_vals=None, - num_cpus=None, - num_gpus=None, - resources=None): + resources=None, + driver_id=None): """Submit a remote task to the scheduler. Tell the scheduler to schedule the execution of the function with ID @@ -547,9 +538,12 @@ def submit_task(self, execution_dependencies: The execution dependencies for this task. num_return_vals: The number of return values this function should have. - num_cpus: The number of CPUs required by this task. - num_gpus: The number of GPUs required by this task. resources: The resource requirements for this task. + driver_id: The ID of the relevant driver. This is almost always the + driver ID of the driver that is currently running. However, in + the exceptional case that an actor task is being dispatched to + an actor created by a different driver, this should be the + driver ID of the driver that created the actor. Returns: The return object IDs for this task. @@ -558,28 +552,23 @@ def submit_task(self, check_main_thread() if actor_id is None: assert actor_handle_id is None - actor_id = ray.local_scheduler.ObjectID(NIL_ACTOR_ID) - actor_handle_id = ray.local_scheduler.ObjectID( - NIL_ACTOR_HANDLE_ID) + actor_id = ray.ObjectID(NIL_ACTOR_ID) + actor_handle_id = ray.ObjectID(NIL_ACTOR_HANDLE_ID) else: assert actor_handle_id is not None if actor_creation_id is None: - actor_creation_id = ray.local_scheduler.ObjectID(NIL_ACTOR_ID) + actor_creation_id = ray.ObjectID(NIL_ACTOR_ID) if actor_creation_dummy_object_id is None: - actor_creation_dummy_object_id = ( - ray.local_scheduler.ObjectID(NIL_ID)) + actor_creation_dummy_object_id = (ray.ObjectID(NIL_ID)) # Put large or complex arguments that are passed by value in the # object store first. 
args_for_local_scheduler = [] for arg in args: - if isinstance(arg, ray.local_scheduler.ObjectID): + if isinstance(arg, ray.ObjectID): args_for_local_scheduler.append(arg) - elif isinstance(arg, ray.actor.ActorHandleParent): - args_for_local_scheduler.append( - put(ray.actor.wrap_actor_handle(arg))) elif ray.local_scheduler.check_simple_value(arg): args_for_local_scheduler.append(arg) else: @@ -589,27 +578,15 @@ def submit_task(self, if execution_dependencies is None: execution_dependencies = [] - # Look up the various function properties. - function_properties = self.function_properties[ - self.task_driver_id.id()][function_id.id()] + if driver_id is None: + driver_id = self.task_driver_id - if num_return_vals is None: - num_return_vals = function_properties.num_return_vals - - if resources is None and num_cpus is None and num_gpus is None: - resources = function_properties.resources - else: - resources = {} if resources is None else resources - if "CPU" in resources or "GPU" in resources: - raise ValueError("The resources dictionary must not " - "contain the keys 'CPU' or 'GPU'") - resources["CPU"] = num_cpus - resources["GPU"] = num_gpus + if resources is None: + raise ValueError("The resources dictionary is required.") # Submit the task to local scheduler. task = ray.local_scheduler.Task( - self.task_driver_id, - ray.local_scheduler.ObjectID( + driver_id, ray.ObjectID( function_id.id()), args_for_local_scheduler, num_return_vals, self.current_task_id, self.task_index, actor_creation_id, actor_creation_dummy_object_id, actor_id, @@ -622,6 +599,55 @@ def submit_task(self, return task.returns() + def export_remote_function(self, function_id, function_name, function, + max_calls, decorated_function): + """Export a remote function. + + Args: + function_id: The ID of the function. + function_name: The name of the function. + function: The raw undecorated function to export. + max_calls: The maximum number of times a given worker can execute + this function before exiting. + decorated_function: The decorated function (this is used to enable + the remote function to recursively call itself). + """ + check_main_thread() + if self.mode not in [SCRIPT_MODE, SILENT_MODE]: + raise Exception("export_remote_function can only be called on a " + "driver.") + + key = (b"RemoteFunction:" + self.task_driver_id.id() + b":" + + function_id.id()) + + # Work around limitations of Python pickling. + function_name_global_valid = function.__name__ in function.__globals__ + function_name_global_value = function.__globals__.get( + function.__name__) + # Allow the function to reference itself as a global variable + if not is_cython(function): + function.__globals__[function.__name__] = decorated_function + try: + pickled_function = pickle.dumps(function) + finally: + # Undo our changes + if function_name_global_valid: + function.__globals__[function.__name__] = ( + function_name_global_value) + else: + del function.__globals__[function.__name__] + + self.redis_client.hmset( + key, { + "driver_id": self.task_driver_id.id(), + "function_id": function_id.id(), + "name": function_name, + "module": function.__module__, + "function": pickled_function, + "max_calls": max_calls + }) + self.redis_client.rpush("Exports", key) + def run_function_on_all_workers(self, function): """Run arbitrary code on all of the workers. 
@@ -695,7 +721,8 @@ def _wait_for_function(self, function_id, driver_id, timeout=10): while True: with self.lock: if (self.actor_id == NIL_ACTOR_ID - and (function_id.id() in self.functions[driver_id])): + and (function_id.id() in + self.function_execution_info[driver_id])): break elif self.actor_id != NIL_ACTOR_ID and ( self.actor_id in self.actors): @@ -739,7 +766,7 @@ def _get_arguments_for_execution(self, function_name, serialized_args): """ arguments = [] for (i, arg) in enumerate(serialized_args): - if isinstance(arg, ray.local_scheduler.ObjectID): + if isinstance(arg, ray.ObjectID): # get the object from the local object store argument = self.get_object([arg])[0] if isinstance(argument, RayTaskError): @@ -747,8 +774,6 @@ def _get_arguments_for_execution(self, function_name, serialized_args): # created this object failed, and we should propagate the # error message here. raise RayGetArgumentError(function_name, i, arg, argument) - elif isinstance(argument, ray.actor.ActorHandleWrapper): - argument = ray.actor.unwrap_actor_handle(self, argument) else: # pass the argument by value argument = arg @@ -777,6 +802,10 @@ def _store_outputs_in_objstore(self, object_ids, outputs): passed into this function. """ for i in range(len(object_ids)): + if isinstance(outputs[i], ray.actor.ActorHandle): + raise Exception("Returning an actor handle from a remote " + "function is not allowed).") + self.put_object(object_ids[i], outputs[i]) def _process_task(self, task): @@ -794,7 +823,6 @@ def _process_task(self, task): # message to the correct driver. self.task_driver_id = task.driver_id() self.current_task_id = task.task_id() - self.current_function_id = task.function_id().id() self.task_index = 0 self.put_index = 1 function_id = task.function_id() @@ -802,8 +830,10 @@ def _process_task(self, task): return_object_ids = task.returns() if task.actor_id().id() != NIL_ACTOR_ID: dummy_return_id = return_object_ids.pop() - function_name, function_executor = ( - self.functions[self.task_driver_id.id()][function_id.id()]) + function_executor = self.function_execution_info[ + self.task_driver_id.id()][function_id.id()].function + function_name = self.function_execution_info[self.task_driver_id.id()][ + function_id.id()].function_name # Get task arguments from the object store. try: @@ -825,7 +855,7 @@ def _process_task(self, task): try: with log_span("ray:task:execute", worker=self): if task.actor_id().id() == NIL_ACTOR_ID: - outputs = function_executor.executor(arguments) + outputs = function_executor(*arguments) else: outputs = function_executor( dummy_return_id, self.actors[task.actor_id().id()], @@ -858,8 +888,8 @@ def _process_task(self, task): def _handle_process_task_failure(self, function_id, return_object_ids, error, backtrace): - function_name, _ = self.functions[self.task_driver_id.id()][ - function_id.id()] + function_name = self.function_execution_info[self.task_driver_id.id()][ + function_id.id()].function_name failure_object = RayTaskError(function_name, error, backtrace) failure_objects = [ failure_object for _ in range(len(return_object_ids)) @@ -898,7 +928,7 @@ def _become_actor(self, task): time.sleep(0.001) with self.lock: - self.fetch_and_register_actor(key, task.required_resources(), self) + self.fetch_and_register_actor(key, self) def _wait_for_and_process_task(self, task): """Wait for a task to be ready and process the task. @@ -907,11 +937,11 @@ def _wait_for_and_process_task(self, task): task: The task to execute. 
""" function_id = task.function_id() + driver_id = task.driver_id().id() # TODO(rkn): It would be preferable for actor creation tasks to share # more of the code path with regular task execution. - if (task.actor_creation_id() != - ray.local_scheduler.ObjectID(NIL_ACTOR_ID)): + if (task.actor_creation_id() != ray.ObjectID(NIL_ACTOR_ID)): self._become_actor(task) return @@ -919,7 +949,7 @@ def _wait_for_and_process_task(self, task): # on this worker. We will push warnings to the user if we spend too # long in this loop. with log_span("ray:wait_for_function", worker=self): - self._wait_for_function(function_id, task.driver_id().id()) + self._wait_for_function(function_id, driver_id) # Execute the task. # TODO(rkn): Consider acquiring this lock with a timeout and pushing a @@ -930,8 +960,8 @@ def _wait_for_and_process_task(self, task): with self.lock: log(event_type="ray:acquire_lock", kind=LOG_SPAN_END, worker=self) - function_name, _ = ( - self.functions[task.driver_id().id()][function_id.id()]) + function_name = (self.function_execution_info[driver_id][ + function_id.id()]).function_name contents = { "function_name": function_name, "task_id": task.task_id().hex(), @@ -944,14 +974,13 @@ def _wait_for_and_process_task(self, task): flush_log() # Increase the task execution counter. - (self.num_task_executions[task.driver_id().id()][function_id.id()] - ) += 1 + self.num_task_executions[driver_id][function_id.id()] += 1 - reached_max_executions = (self.num_task_executions[task.driver_id().id( - )][function_id.id()] == self.function_properties[task.driver_id().id()] - [function_id.id()].max_calls) + reached_max_executions = ( + self.num_task_executions[driver_id][function_id.id()] == self. + function_execution_info[driver_id][function_id.id()].max_calls) if reached_max_executions: - ray.worker.global_worker.local_scheduler_client.disconnect() + self.local_scheduler_client.disconnect() os._exit(0) def _get_next_task_from_local_scheduler(self): @@ -1065,18 +1094,6 @@ def check_main_thread(): .format(threading.current_thread().getName())) -def check_connected(worker=global_worker): - """Check if the worker is connected. - - Raises: - Exception: An exception is raised if the worker is not connected. - """ - if not worker.connected: - raise RayConnectionError("This command cannot be called before Ray " - "has been started. You can start Ray with " - "'ray.init()'.") - - def print_failed_task(task_status): """Print information about failed tasks. @@ -1110,7 +1127,7 @@ def error_applies_to_driver(error_key, worker=global_worker): def error_info(worker=global_worker): """Return information about failed tasks.""" - check_connected(worker) + worker.check_connected() check_main_thread() error_keys = worker.redis_client.lrange("ErrorKeys", 0, -1) errors = [] @@ -1135,18 +1152,39 @@ def _initialize_serialization(worker=global_worker): pyarrow.register_torch_serialization_handlers(worker.serialization_context) # Define a custom serializer and deserializer for handling Object IDs. - def objectid_custom_serializer(obj): + def object_id_custom_serializer(obj): return obj.id() - def objectid_custom_deserializer(serialized_obj): - return ray.local_scheduler.ObjectID(serialized_obj) + def object_id_custom_deserializer(serialized_obj): + return ray.ObjectID(serialized_obj) + # We register this serializer on each worker instead of calling + # register_custom_serializer from the driver so that isinstance still + # works. 
worker.serialization_context.register_type( - ray.local_scheduler.ObjectID, + ray.ObjectID, "ray.ObjectID", pickle=False, - custom_serializer=objectid_custom_serializer, - custom_deserializer=objectid_custom_deserializer) + custom_serializer=object_id_custom_serializer, + custom_deserializer=object_id_custom_deserializer) + + def actor_handle_serializer(obj): + return obj._serialization_helper(True) + + def actor_handle_deserializer(serialized_obj): + new_handle = ray.actor.ActorHandle.__new__(ray.actor.ActorHandle) + new_handle._deserialization_helper(serialized_obj, True) + return new_handle + + # We register this serializer on each worker instead of calling + # register_custom_serializer from the driver so that isinstance still + # works. + worker.serialization_context.register_type( + ray.actor.ActorHandle, + "ray.ActorHandle", + pickle=False, + custom_serializer=actor_handle_serializer, + custom_deserializer=actor_handle_deserializer) if worker.mode in [SCRIPT_MODE, SILENT_MODE]: # These should only be called on the driver because @@ -1159,8 +1197,6 @@ def objectid_custom_deserializer(serialized_obj): register_custom_serializer(type(lambda: 0), use_pickle=True) # Tell Ray to serialize types with pickle. register_custom_serializer(type(int), use_pickle=True) - # Ray can serialize actor handles that have been wrapped. - register_custom_serializer(ray.actor.ActorHandleWrapper, use_dict=True) # Tell Ray to serialize FunctionSignatures as dictionaries. This is # used when passing around actor handles. register_custom_serializer( @@ -1417,6 +1453,11 @@ def _init(address_info=None, raise Exception("Driver_mode must be in [ray.SCRIPT_MODE, " "ray.PYTHON_MODE, ray.SILENT_MODE].") + if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1": + # This environment variable is used in our testing setup. + print("Detected environment variable 'RAY_USE_XRAY'.") + use_raylet = True + # Get addresses of existing services. if address_info is None: address_info = {} @@ -1557,7 +1598,7 @@ def init(redis_address=None, huge_pages=False, include_webui=True, object_store_memory=None, - use_raylet=False): + use_raylet=None): """Connect to an existing Ray cluster or start one and connect to it. This method handles two cases. Either a Ray cluster already exists and we @@ -1612,6 +1653,11 @@ def init(redis_address=None, Exception: An exception is raised if an inappropriate combination of arguments is passed in. """ + if use_raylet is None and os.environ.get("RAY_USE_XRAY") == "1": + # This environment variable is used in our testing setup. + print("Detected environment variable 'RAY_USE_XRAY'.") + use_raylet = True + # Convert hostnames to numerical IP address. if node_ip_address is not None: node_ip_address = services.address_to_ip(node_ip_address) @@ -1753,12 +1799,9 @@ def fetch_and_register_remote_function(key, worker=global_worker): "driver_id", "function_id", "name", "function", "num_return_vals", "module", "resources", "max_calls" ]) - function_id = ray.local_scheduler.ObjectID(function_id_str) + function_id = ray.ObjectID(function_id_str) function_name = function_name.decode("ascii") - function_properties = FunctionProperties( - num_return_vals=int(num_return_vals), - resources=json.loads(resources.decode("ascii")), - max_calls=int(max_calls)) + max_calls = int(max_calls) module = module.decode("ascii") # This is a placeholder in case the function can't be unpickled. 
This will @@ -1766,11 +1809,9 @@ def fetch_and_register_remote_function(key, worker=global_worker): def f(): raise Exception("This function was not imported properly.") - remote_f_placeholder = remote(function_id=function_id)(lambda *xs: f()) - worker.functions[driver_id][function_id.id()] = (function_name, - remote_f_placeholder) - worker.function_properties[driver_id][function_id.id()] = ( - function_properties) + worker.function_execution_info[driver_id][function_id.id()] = ( + FunctionExecutionInfo( + function=f, function_name=function_name, max_calls=max_calls)) worker.num_task_executions[driver_id][function_id.id()] = 0 try: @@ -1792,8 +1833,11 @@ def f(): else: # TODO(rkn): Why is the below line necessary? function.__module__ = module - worker.functions[driver_id][function_id.id()] = ( - function_name, remote(function_id=function_id)(function)) + worker.function_execution_info[driver_id][function_id.id()] = ( + FunctionExecutionInfo( + function=function, + function_name=function_name, + max_calls=max_calls)) # Add the function to the function table. worker.redis_client.rpush(b"FunctionTable:" + function_id.id(), worker.worker_id) @@ -1940,6 +1984,14 @@ def connect(info, assert worker.cached_remote_functions_and_actors is not None, error_message # Initialize some fields. worker.worker_id = random_string() + + # When tasks are executed on remote workers in the context of multiple + # drivers, the task driver ID is used to keep track of which driver is + # responsible for the task so that error messages will be propagated to + # the correct driver. + if mode != WORKER_MODE: + worker.task_driver_id = ray.ObjectID(worker.worker_id) + # All workers start out as non-actors. A worker can be turned into an actor # after it is created. worker.actor_id = NIL_ACTOR_ID @@ -2069,13 +2121,7 @@ def connect(info, else: # Try to use true randomness. np.random.seed(None) - worker.current_task_id = ray.local_scheduler.ObjectID( - np.random.bytes(20)) - # When tasks are executed on remote workers in the context of multiple - # drivers, the task driver ID is used to keep track of which driver is - # responsible for the task so that error messages will be propagated to - # the correct driver. - worker.task_driver_id = ray.local_scheduler.ObjectID(worker.worker_id) + worker.current_task_id = ray.ObjectID(np.random.bytes(20)) # Reset the state of the numpy random number generator. np.random.set_state(numpy_state) # Set other fields needed for computing task IDs. @@ -2091,14 +2137,11 @@ def connect(info, nil_actor_counter = 0 driver_task = ray.local_scheduler.Task( - worker.task_driver_id, - ray.local_scheduler.ObjectID(NIL_FUNCTION_ID), [], 0, + worker.task_driver_id, ray.ObjectID(NIL_FUNCTION_ID), [], 0, worker.current_task_id, worker.task_index, - ray.local_scheduler.ObjectID(NIL_ACTOR_ID), - ray.local_scheduler.ObjectID(NIL_ACTOR_ID), - ray.local_scheduler.ObjectID(NIL_ACTOR_ID), - ray.local_scheduler.ObjectID(NIL_ACTOR_ID), nil_actor_counter, - False, [], {"CPU": 0}, worker.use_raylet) + ray.ObjectID(NIL_ACTOR_ID), ray.ObjectID(NIL_ACTOR_ID), + ray.ObjectID(NIL_ACTOR_ID), ray.ObjectID(NIL_ACTOR_ID), + nil_actor_counter, False, [], {"CPU": 0}, worker.use_raylet) global_state._execute_command( driver_task.task_id(), "RAY.TASK_TABLE_ADD", driver_task.task_id().id(), @@ -2161,11 +2204,7 @@ def connect(info, # Export cached remote functions to the workers. 
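For readability of the function_execution_info accesses above and below: each entry bundles the callable, its fully qualified name, and its max_calls limit. The concrete FunctionExecutionInfo definition lives elsewhere in worker.py, so the namedtuple below is an assumption about its shape, shown only as a sketch:

    import collections

    FunctionExecutionInfo = collections.namedtuple(
        "FunctionExecutionInfo", ["function", "function_name", "max_calls"])

    # Example entry mirroring how the table is populated above.
    info = FunctionExecutionInfo(
        function=lambda *args: None,
        function_name="my_module.my_function",  # hypothetical name
        max_calls=0)                             # 0 assumed to mean "no limit"
    print(info.function_name, info.max_calls)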
for cached_type, info in worker.cached_remote_functions_and_actors: if cached_type == "remote_function": - (function_id, func_name, func, func_invoker, - function_properties) = info - export_remote_function(function_id, func_name, func, - func_invoker, function_properties, - worker) + info._export() elif cached_type == "actor": (key, actor_class_info) = info ray.actor.publish_actor_class_to_key(key, actor_class_info, @@ -2417,7 +2456,7 @@ def get(object_ids, worker=global_worker): Returns: A Python object or a list of Python objects. """ - check_connected(worker) + worker.check_connected() with log_span("ray:get", worker=worker): check_main_thread() @@ -2450,7 +2489,7 @@ def put(value, worker=global_worker): Returns: The object ID assigned to this value. """ - check_connected(worker) + worker.check_connected() with log_span("ray:put", worker=worker): check_main_thread() @@ -2491,7 +2530,7 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker): print("plasma_client.wait has not been implemented yet") return - if isinstance(object_ids, ray.local_scheduler.ObjectID): + if isinstance(object_ids, ray.ObjectID): raise TypeError( "wait() expected a list of ObjectID, got a single ObjectID") @@ -2501,12 +2540,12 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker): if worker.mode != PYTHON_MODE: for object_id in object_ids: - if not isinstance(object_id, ray.local_scheduler.ObjectID): + if not isinstance(object_id, ray.ObjectID): raise TypeError("wait() expected a list of ObjectID, " "got list containing {}".format( type(object_id))) - check_connected(worker) + worker.check_connected() with log_span("ray:wait", worker=worker): check_main_thread() @@ -2528,27 +2567,14 @@ def wait(object_ids, num_returns=1, timeout=None, worker=global_worker): ready_ids, remaining_ids = worker.plasma_client.wait( object_id_strs, timeout, num_returns) ready_ids = [ - ray.local_scheduler.ObjectID(object_id.binary()) - for object_id in ready_ids + ray.ObjectID(object_id.binary()) for object_id in ready_ids ] remaining_ids = [ - ray.local_scheduler.ObjectID(object_id.binary()) - for object_id in remaining_ids + ray.ObjectID(object_id.binary()) for object_id in remaining_ids ] return ready_ids, remaining_ids -def _submit_task(function_id, *args, **kwargs): - """This is a wrapper around worker.submit_task. - - We use this wrapper so that in the remote decorator, we can call - _submit_task instead of worker.submit_task. The difference is that when we - attempt to serialize remote functions, we don't attempt to serialize the - worker object, which cannot be serialized. - """ - return global_worker.submit_task(function_id, *args, **kwargs) - - def _mode(worker=global_worker): """This is a wrapper around worker.mode. @@ -2560,278 +2586,104 @@ def _mode(worker=global_worker): return worker.mode -def export_remote_function(function_id, - func_name, - func, - func_invoker, - function_properties, - worker=global_worker): - check_main_thread() - if _mode(worker) not in [SCRIPT_MODE, SILENT_MODE]: - raise Exception("export_remote_function can only be called on a " - "driver.") - - worker.function_properties[worker.task_driver_id.id()][ - function_id.id()] = function_properties - task_driver_id = worker.task_driver_id - key = b"RemoteFunction:" + task_driver_id.id() + b":" + function_id.id() - - # Work around limitations of Python pickling. 
- func_name_global_valid = func.__name__ in func.__globals__ - func_name_global_value = func.__globals__.get(func.__name__) - # Allow the function to reference itself as a global variable - if not is_cython(func): - func.__globals__[func.__name__] = func_invoker - try: - pickled_func = pickle.dumps(func) - finally: - # Undo our changes - if func_name_global_valid: - func.__globals__[func.__name__] = func_name_global_value - else: - del func.__globals__[func.__name__] - - worker.redis_client.hmset( - key, { - "driver_id": worker.task_driver_id.id(), - "function_id": function_id.id(), - "name": func_name, - "module": func.__module__, - "function": pickled_func, - "num_return_vals": function_properties.num_return_vals, - "resources": json.dumps(function_properties.resources), - "max_calls": function_properties.max_calls - }) - worker.redis_client.rpush("Exports", key) - - -def in_ipython(): - """Return true if we are in an IPython interpreter and false otherwise.""" - try: - __IPYTHON__ - return True - except NameError: - return False - +def get_global_worker(): + return global_worker + + +def make_decorator(num_return_vals=None, + num_cpus=None, + num_gpus=None, + resources=None, + max_calls=None, + checkpoint_interval=None, + worker=None): + def decorator(function_or_class): + if (inspect.isfunction(function_or_class) + or is_cython(function_or_class)): + # Set the remote function default resources. + if checkpoint_interval is not None: + raise Exception("The keyword 'checkpoint_interval' is not " + "allowed for remote functions.") + + return ray.remote_function.RemoteFunction( + function_or_class, num_cpus, num_gpus, resources, + num_return_vals, max_calls) + + if inspect.isclass(function_or_class): + if num_return_vals is not None: + raise Exception("The keyword 'num_return_vals' is not allowed " + "for actors.") + if max_calls is not None: + raise Exception("The keyword 'max_calls' is not allowed for " + "actors.") + + # Set the actor default resources. + if num_cpus is None and num_gpus is None and resources is None: + # In the default case, actors acquire no resources for + # their lifetime, and actor methods will require 1 CPU. + cpus_to_use = DEFAULT_ACTOR_CREATION_CPUS_SIMPLE_CASE + actor_method_cpus = DEFAULT_ACTOR_METHOD_CPUS_SIMPLE_CASE + else: + # If any resources are specified, then all resources are + # acquired for the actor's lifetime and no resources are + # associated with methods. + cpus_to_use = (DEFAULT_ACTOR_CREATION_CPUS_SPECIFIED_CASE + if num_cpus is None else num_cpus) + actor_method_cpus = DEFAULT_ACTOR_METHOD_CPUS_SPECIFIED_CASE -def compute_function_id(func_name, func): - """Compute an function ID for a function. + return worker.make_actor(function_or_class, cpus_to_use, num_gpus, + resources, actor_method_cpus, + checkpoint_interval) - Args: - func_name: The name of the function (this includes the module name plus - the function name). - func: The actual function. + raise Exception("The @ray.remote decorator must be applied to " + "either a function or to a class.") - Returns: - This returns the function ID. - """ - function_id_hash = hashlib.sha1() - # Include the function name in the hash. - function_id_hash.update(func_name.encode("ascii")) - # If we are running a script or are in IPython, include the source code in - # the hash. If we are in a regular Python interpreter we skip this part - # because the source code is not accessible. If the function is a built-in - # (e.g., Cython), the source code is not accessible. 
- import __main__ as main - if (hasattr(main, "__file__") or in_ipython()) \ - and inspect.isfunction(func): - function_id_hash.update(inspect.getsource(func).encode("ascii")) - # Compute the function ID. - function_id = function_id_hash.digest() - assert len(function_id) == 20 - function_id = FunctionID(function_id) - - return function_id + return decorator def remote(*args, **kwargs): - """This decorator is used to define remote functions and to define actors. - - Args: - num_return_vals (int): The number of object IDs that a call to this - function should return. - num_cpus (int): The number of CPUs needed to execute this function. - num_gpus (int): The number of GPUs needed to execute this function. - resources: A dictionary mapping resource name to the required quantity - of that resource. - max_calls (int): The maximum number of tasks of this kind that can be - run on a worker before the worker needs to be restarted. - checkpoint_interval (int): The number of tasks to run between - checkpoints of the actor state. - """ - worker = global_worker - - def make_remote_decorator(num_return_vals, - num_cpus, - num_gpus, - resources, - max_calls, - checkpoint_interval, - func_id=None): - def remote_decorator(func_or_class): - if inspect.isfunction(func_or_class) or is_cython(func_or_class): - # Set the remote function default resources. - resources["CPU"] = (DEFAULT_REMOTE_FUNCTION_CPUS - if num_cpus is None else num_cpus) - resources["GPU"] = (DEFAULT_REMOTE_FUNCTION_GPUS - if num_gpus is None else num_gpus) - - function_properties = FunctionProperties( - num_return_vals=num_return_vals, - resources=resources, - max_calls=max_calls) - return remote_function_decorator(func_or_class, - function_properties) - if inspect.isclass(func_or_class): - # Set the actor default resources. - if num_cpus is None and num_gpus is None and resources == {}: - # In the default case, actors acquire no resources for - # their lifetime, and actor methods will require 1 CPU. - resources["CPU"] = DEFAULT_ACTOR_CREATION_CPUS_SIMPLE_CASE - actor_method_cpus = DEFAULT_ACTOR_METHOD_CPUS_SIMPLE_CASE - else: - # If any resources are specified, then all resources are - # acquired for the actor's lifetime and no resources are - # associated with methods. - resources["CPU"] = ( - DEFAULT_ACTOR_CREATION_CPUS_SPECIFIED_CASE - if num_cpus is None else num_cpus) - resources["GPU"] = ( - DEFAULT_ACTOR_CREATION_GPUS_SPECIFIED_CASE - if num_gpus is None else num_gpus) - actor_method_cpus = ( - DEFAULT_ACTOR_METHOD_CPUS_SPECIFIED_CASE) - - return worker.make_actor(func_or_class, resources, - checkpoint_interval, - actor_method_cpus) - raise Exception("The @ray.remote decorator must be applied to " - "either a function or to a class.") - - def remote_function_decorator(func, function_properties): - func_name = "{}.{}".format(func.__module__, func.__name__) - if func_id is None: - function_id = compute_function_id(func_name, func) - else: - function_id = func_id - - def func_call(*args, **kwargs): - """This runs immediately when a remote function is called.""" - return _submit(args=args, kwargs=kwargs) - - def _submit(args=None, - kwargs=None, - num_return_vals=None, - num_cpus=None, - num_gpus=None, - resources=None): - """An experimental alternate way to submit remote functions.""" - check_connected() - check_main_thread() - kwargs = {} if kwargs is None else kwargs - args = signature.extend_args(function_signature, args, kwargs) - - if _mode() == PYTHON_MODE: - # In PYTHON_MODE, remote calls simply execute the function. 
- # We copy the arguments to prevent the function call from - # mutating them and to match the usual behavior of - # immutable remote objects. - result = func(*copy.deepcopy(args)) - return result - object_ids = _submit_task( - function_id, - args, - num_return_vals=num_return_vals, - num_cpus=num_cpus, - num_gpus=num_gpus, - resources=resources) - if len(object_ids) == 1: - return object_ids[0] - elif len(object_ids) > 1: - return object_ids - - def func_executor(arguments): - """This gets run when the remote function is executed.""" - result = func(*arguments) - return result - - def func_invoker(*args, **kwargs): - """This is used to invoke the function.""" - raise Exception("Remote functions cannot be called directly. " - "Instead of running '{}()', try '{}.remote()'." - .format(func_name, func_name)) - - func_invoker.remote = func_call - func_invoker._submit = _submit - func_invoker.executor = func_executor - func_invoker.is_remote = True - func_name = "{}.{}".format(func.__module__, func.__name__) - func_invoker.func_name = func_name - if sys.version_info >= (3, 0) or is_cython(func): - func_invoker.__doc__ = func.__doc__ - else: - func_invoker.func_doc = func.func_doc - - signature.check_signature_supported(func) - function_signature = signature.extract_signature(func) - - # Everything ready - export the function - if worker.mode in [SCRIPT_MODE, SILENT_MODE]: - export_remote_function(function_id, func_name, func, - func_invoker, function_properties) - elif worker.mode is None: - worker.cached_remote_functions_and_actors.append( - ("remote_function", (function_id, func_name, func, - func_invoker, function_properties))) - return func_invoker + worker = get_global_worker() - return remote_decorator + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + # This is the case where the decorator is just @ray.remote. + return make_decorator(worker=worker)(args[0]) + + # Parse the keyword arguments from the decorator. + error_string = ("The @ray.remote decorator must be applied either " + "with no arguments and no parentheses, for example " + "'@ray.remote', or it must be applied using some of " + "the arguments 'num_return_vals', 'num_cpus', 'num_gpus', " + "'resources', 'max_calls', or 'checkpoint_interval', like " + "'@ray.remote(num_return_vals=2, " + "resources={\"CustomResource\": 1})'.") + assert len(args) == 0 and len(kwargs) > 0, error_string + for key in kwargs: + assert key in [ + "num_return_vals", "num_cpus", "num_gpus", "resources", + "max_calls", "checkpoint_interval" + ], error_string - # Handle resource arguments num_cpus = kwargs["num_cpus"] if "num_cpus" in kwargs else None num_gpus = kwargs["num_gpus"] if "num_gpus" in kwargs else None - resources = kwargs.get("resources", {}) - if not isinstance(resources, dict): + resources = kwargs.get("resources") + if not isinstance(resources, dict) and resources is not None: raise Exception("The 'resources' keyword argument must be a " "dictionary, but received type {}.".format( type(resources))) - assert "CPU" not in resources, "Use the 'num_cpus' argument." - assert "GPU" not in resources, "Use the 'num_gpus' argument." + if resources is not None: + assert "CPU" not in resources, "Use the 'num_cpus' argument." + assert "GPU" not in resources, "Use the 'num_gpus' argument." + # Handle other arguments. 
- num_return_vals = (kwargs["num_return_vals"] - if "num_return_vals" in kwargs else 1) - max_calls = kwargs["max_calls"] if "max_calls" in kwargs else 0 - checkpoint_interval = (kwargs["checkpoint_interval"] - if "checkpoint_interval" in kwargs else -1) - - if _mode() == WORKER_MODE: - if "function_id" in kwargs: - function_id = kwargs["function_id"] - return make_remote_decorator(num_return_vals, num_cpus, num_gpus, - resources, max_calls, - checkpoint_interval, function_id) + num_return_vals = kwargs.get("num_return_vals") + max_calls = kwargs.get("max_calls") + checkpoint_interval = kwargs.get("checkpoint_interval") - if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): - # This is the case where the decorator is just @ray.remote. - return make_remote_decorator(num_return_vals, num_cpus, num_gpus, - resources, max_calls, - checkpoint_interval)(args[0]) - else: - # This is the case where the decorator is something like - # @ray.remote(num_return_vals=2). - error_string = ("The @ray.remote decorator must be applied either " - "with no arguments and no parentheses, for example " - "'@ray.remote', or it must be applied using some of " - "the arguments 'num_return_vals', 'resources', " - "or 'max_calls', like " - "'@ray.remote(num_return_vals=2, " - "resources={\"GPU\": 1})'.") - assert len(args) == 0 and len(kwargs) > 0, error_string - for key in kwargs: - assert key in [ - "num_return_vals", "num_cpus", "num_gpus", "resources", - "max_calls", "checkpoint_interval" - ], error_string - assert "function_id" not in kwargs - return make_remote_decorator(num_return_vals, num_cpus, num_gpus, - resources, max_calls, checkpoint_interval) + return make_decorator( + num_return_vals=num_return_vals, + num_cpus=num_cpus, + num_gpus=num_gpus, + resources=resources, + max_calls=max_calls, + checkpoint_interval=checkpoint_interval, + worker=worker) diff --git a/src/global_scheduler/CMakeLists.txt b/src/global_scheduler/CMakeLists.txt index 892bfd6e50ad..fc8e53067d40 100644 --- a/src/global_scheduler/CMakeLists.txt +++ b/src/global_scheduler/CMakeLists.txt @@ -7,4 +7,4 @@ include(${CMAKE_CURRENT_LIST_DIR}/../common/cmake/Common.cmake) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall") add_executable(global_scheduler global_scheduler.cc global_scheduler_algorithm.cc) -target_link_libraries(global_scheduler common ${HIREDIS_LIB} ray_static ${PLASMA_STATIC_LIB} ${ARROW_STATIC_LIB} ${Boost_SYSTEM_LIBRARY}) +target_link_libraries(global_scheduler common ${HIREDIS_LIB} ray_static ${PLASMA_STATIC_LIB} ${ARROW_STATIC_LIB} ${Boost_SYSTEM_LIBRARY} pthread) diff --git a/src/ray/common/client_connection.cc b/src/ray/common/client_connection.cc index 1f42068e5b9a..90b5ed6a56c0 100644 --- a/src/ray/common/client_connection.cc +++ b/src/ray/common/client_connection.cc @@ -97,7 +97,7 @@ std::shared_ptr> ClientConnection::Create( std::shared_ptr> self( new ClientConnection(message_handler, std::move(socket))); // Let our manager process our new connection. 
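Returning to the decorator parsing rewritten in worker.py above: @ray.remote is accepted either bare or with keyword arguments only, never with positional arguments. A usage sketch under that contract, assuming a local Ray build contemporary with this diff (keyword names follow the error string above):

    import ray

    @ray.remote
    def f(x):
        return x + 1

    @ray.remote(num_return_vals=2, max_calls=100)
    def g(x):
        return x, 2 * x

    ray.init()
    print(ray.get(f.remote(1)))   # 2
    a, b = g.remote(3)            # two ObjectIDs because num_return_vals=2
    print(ray.get([a, b]))        # [3, 6]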
- client_handler(self); + client_handler(*self); return self; } diff --git a/src/ray/common/client_connection.h b/src/ray/common/client_connection.h index 55efed3dd2f6..0f65e43277f5 100644 --- a/src/ray/common/client_connection.h +++ b/src/ray/common/client_connection.h @@ -62,7 +62,7 @@ template class ClientConnection; template -using ClientHandler = std::function>)>; +using ClientHandler = std::function &)>; template using MessageHandler = std::function>, int64_t, const uint8_t *)>; diff --git a/src/ray/constants.h b/src/ray/constants.h index bdae39ff2b6f..eab76e37417b 100644 --- a/src/ray/constants.h +++ b/src/ray/constants.h @@ -1,12 +1,17 @@ #ifndef RAY_CONSTANTS_H_ #define RAY_CONSTANTS_H_ +#include + /// Length of Ray IDs in bytes. constexpr int64_t kUniqueIDSize = 20; /// An ObjectID's bytes are split into the task ID itself and the index of the /// object's creation. This is the maximum width of the object index in bits. constexpr int kObjectIdIndexSize = 32; +static_assert(kObjectIdIndexSize % CHAR_BIT == 0, + "ObjectID prefix not a multiple of bytes"); + /// The maximum number of objects that can be returned by a task when finishing /// execution. An ObjectID's bytes are split into the task ID itself and the /// index of the object's creation. A positive index indicates an object diff --git a/src/ray/gcs/client.cc b/src/ray/gcs/client.cc index f29c1f6ffb8b..d4aed268ce8d 100644 --- a/src/ray/gcs/client.cc +++ b/src/ray/gcs/client.cc @@ -7,7 +7,7 @@ namespace ray { namespace gcs { AsyncGcsClient::AsyncGcsClient(const ClientID &client_id) { - context_.reset(new RedisContext()); + context_ = std::make_shared(); client_table_.reset(new ClientTable(context_, this, client_id)); object_table_.reset(new ObjectTable(context_, this)); actor_table_.reset(new ActorTable(context_, this)); diff --git a/src/ray/gcs/client_test.cc b/src/ray/gcs/client_test.cc index 679177e45c0a..92ff7f854c31 100644 --- a/src/ray/gcs/client_test.cc +++ b/src/ray/gcs/client_test.cc @@ -93,9 +93,9 @@ void TestTableLookup(const JobID &job_id, std::shared_ptr c // Check that we added the correct task. auto add_callback = [task_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, - const std::shared_ptr d) { + const protocol::TaskT &d) { ASSERT_EQ(id, task_id); - ASSERT_EQ(data->task_specification, d->task_specification); + ASSERT_EQ(data->task_specification, d.task_specification); }; // Check that the lookup returns the added task. @@ -139,9 +139,9 @@ void TestLogLookup(const JobID &job_id, std::shared_ptr cli data->manager = manager; // Check that we added the correct object entries. auto add_callback = [object_id, data](gcs::AsyncGcsClient *client, const UniqueID &id, - const std::shared_ptr d) { + const ObjectTableDataT &d) { ASSERT_EQ(id, object_id); - ASSERT_EQ(data->manager, d->manager); + ASSERT_EQ(data->manager, d.manager); }; RAY_CHECK_OK(client->object_table().Append(job_id, object_id, data, add_callback)); } @@ -222,7 +222,7 @@ void TestLogAppendAt(const JobID &job_id, std::shared_ptr c // Check that we added the correct task. auto failure_callback = [task_id](gcs::AsyncGcsClient *client, const UniqueID &id, - const std::shared_ptr d) { + const TaskReconstructionDataT &d) { ASSERT_EQ(id, task_id); test->IncrementNumCallbacks(); }; @@ -265,8 +265,8 @@ TEST_F(TestGcsWithAsio, TestLogAppendAt) { // Task table callbacks. 
void TaskAdded(gcs::AsyncGcsClient *client, const TaskID &id, - const std::shared_ptr data) { - ASSERT_EQ(data->scheduling_state, SchedulingState_SCHEDULED); + const TaskTableDataT &data) { + ASSERT_EQ(data.scheduling_state, SchedulingState_SCHEDULED); } void TaskLookup(gcs::AsyncGcsClient *client, const TaskID &id, diff --git a/src/ray/gcs/redis_context.cc b/src/ray/gcs/redis_context.cc index 43778f70b9d2..df061700be3a 100644 --- a/src/ray/gcs/redis_context.cc +++ b/src/ray/gcs/redis_context.cc @@ -100,15 +100,13 @@ void SubscribeRedisCallback(void *c, void *r, void *privdata) { int64_t RedisCallbackManager::add(const RedisCallback &function) { num_callbacks += 1; - callbacks_.emplace(num_callbacks, std::unique_ptr( - new RedisCallback(function))); + callbacks_.emplace(num_callbacks, function); return num_callbacks; } -RedisCallbackManager::RedisCallback &RedisCallbackManager::get( - int64_t callback_index) { +RedisCallback &RedisCallbackManager::get(int64_t callback_index) { RAY_CHECK(callbacks_.find(callback_index) != callbacks_.end()); - return *callbacks_[callback_index]; + return callbacks_[callback_index]; } void RedisCallbackManager::remove(int64_t callback_index) { @@ -185,7 +183,9 @@ Status RedisContext::AttachToEventLoop(aeEventLoop *loop) { Status RedisContext::RunAsync(const std::string &command, const UniqueID &id, const uint8_t *data, int64_t length, const TablePrefix prefix, const TablePubsub pubsub_channel, - int64_t callback_index, int log_length) { + RedisCallback redisCallback, int log_length) { + int64_t callback_index = + redisCallback != nullptr ? RedisCallbackManager::instance().add(redisCallback) : -1; if (length > 0) { if (log_length >= 0) { std::string redis_command = command + " %d %d %b %b %d"; @@ -222,10 +222,11 @@ Status RedisContext::RunAsync(const std::string &command, const UniqueID &id, Status RedisContext::SubscribeAsync(const ClientID &client_id, const TablePubsub pubsub_channel, - int64_t callback_index) { + const RedisCallback &redisCallback) { RAY_CHECK(pubsub_channel != TablePubsub_NO_PUBLISH) << "Client requested subscribe on a table that does not support pubsub"; + int64_t callback_index = RedisCallbackManager::instance().add(redisCallback); int status = 0; if (client_id.is_nil()) { // Subscribe to all messages. diff --git a/src/ray/gcs/redis_context.h b/src/ray/gcs/redis_context.h index 4d9a296eee60..c2371dff5809 100644 --- a/src/ray/gcs/redis_context.h +++ b/src/ray/gcs/redis_context.h @@ -18,13 +18,13 @@ struct aeEventLoop; namespace ray { namespace gcs { +/// Every callback should take in a vector of the results from the Redis +/// operation and return a bool indicating whether the callback should be +/// deleted once called. +using RedisCallback = std::function; class RedisCallbackManager { public: - /// Every callback should take in a vector of the results from the Redis - /// operation and return a bool indicating whether the callback should be - /// deleted once called. - using RedisCallback = std::function; static RedisCallbackManager &instance() { static RedisCallbackManager instance; @@ -44,7 +44,7 @@ class RedisCallbackManager { ~RedisCallbackManager() { printf("shut down callback manager\n"); } int64_t num_callbacks; - std::unordered_map> callbacks_; + std::unordered_map callbacks_; }; class RedisContext { @@ -70,11 +70,11 @@ class RedisContext { /// -1 for unused. If set, then data must be provided. 
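The RedisContext changes above and below move callback registration into RunAsync and SubscribeAsync themselves: callers now pass a RedisCallback, the manager hands back an index used for dispatch, and each callback receives the reply payload and returns whether it should be removed after firing (one-shot writes return true, subscriptions return false). A Python sketch of that convention, purely illustrative and not part of this change:

    class CallbackManager:
        def __init__(self):
            self._callbacks = {}
            self._next_index = 0

        def add(self, callback):
            # Register a callback and return the index used to dispatch it.
            self._next_index += 1
            self._callbacks[self._next_index] = callback
            return self._next_index

        def dispatch(self, index, payload):
            # A callback that returns True is one-shot and is removed;
            # returning False keeps it alive for further messages.
            if self._callbacks[index](payload):
                del self._callbacks[index]

    manager = CallbackManager()
    write_idx = manager.add(lambda data: True)   # one-shot acknowledgement
    sub_idx = manager.add(lambda data: False)    # long-lived subscription
    manager.dispatch(write_idx, b"")             # removed after this call
    manager.dispatch(sub_idx, b"notification")   # stays registered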
Status RunAsync(const std::string &command, const UniqueID &id, const uint8_t *data, int64_t length, const TablePrefix prefix, - const TablePubsub pubsub_channel, int64_t callback_index, + const TablePubsub pubsub_channel, RedisCallback redisCallback, int log_length = -1); Status SubscribeAsync(const ClientID &client_id, const TablePubsub pubsub_channel, - int64_t callback_index); + const RedisCallback &redisCallback); redisAsyncContext *async_context() { return async_context_; } redisAsyncContext *subscribe_context() { return subscribe_context_; }; diff --git a/src/ray/gcs/tables.cc b/src/ray/gcs/tables.cc index be73aedf890e..3c014798b930 100644 --- a/src/ray/gcs/tables.cc +++ b/src/ray/gcs/tables.cc @@ -9,76 +9,66 @@ namespace gcs { template Status Log::Append(const JobID &job_id, const ID &id, - std::shared_ptr data, const WriteCallback &done) { - auto d = std::shared_ptr( - new CallbackData({id, data, nullptr, nullptr, this, client_})); - int64_t callback_index = - RedisCallbackManager::instance().add([d, done](const std::string &data) { - RAY_CHECK(data.empty()); - if (done != nullptr) { - (done)(d->client, d->id, d->data); - } - return true; - }); + std::shared_ptr &dataT, const WriteCallback &done) { + auto callback = [this, id, dataT, done](const std::string &data) { + RAY_CHECK(data.empty()); + if (done != nullptr) { + (done)(client_, id, *dataT); + } + return true; + }; flatbuffers::FlatBufferBuilder fbb; fbb.ForceDefaults(true); - fbb.Finish(Data::Pack(fbb, data.get())); + fbb.Finish(Data::Pack(fbb, dataT.get())); return context_->RunAsync("RAY.TABLE_APPEND", id, fbb.GetBufferPointer(), fbb.GetSize(), - prefix_, pubsub_channel_, callback_index); + prefix_, pubsub_channel_, std::move(callback)); } template Status Log::AppendAt(const JobID &job_id, const ID &id, - std::shared_ptr data, const WriteCallback &done, + std::shared_ptr &dataT, const WriteCallback &done, const WriteCallback &failure, int log_length) { - auto d = std::shared_ptr( - new CallbackData({id, data, nullptr, nullptr, this, client_})); - int64_t callback_index = - RedisCallbackManager::instance().add([d, done, failure](const std::string &data) { - if (data.empty()) { - if (done != nullptr) { - (done)(d->client, d->id, d->data); - } - } else { - if (failure != nullptr) { - (failure)(d->client, d->id, d->data); - } - } - return true; - }); + auto callback = [this, id, dataT, done, failure](const std::string &data) { + if (data.empty()) { + if (done != nullptr) { + (done)(client_, id, *dataT); + } + } else { + if (failure != nullptr) { + (failure)(client_, id, *dataT); + } + } + return true; + }; flatbuffers::FlatBufferBuilder fbb; fbb.ForceDefaults(true); - fbb.Finish(Data::Pack(fbb, data.get())); + fbb.Finish(Data::Pack(fbb, dataT.get())); return context_->RunAsync("RAY.TABLE_APPEND", id, fbb.GetBufferPointer(), fbb.GetSize(), - prefix_, pubsub_channel_, callback_index, log_length); + prefix_, pubsub_channel_, std::move(callback), log_length); } template Status Log::Lookup(const JobID &job_id, const ID &id, const Callback &lookup) { - auto d = std::shared_ptr( - new CallbackData({id, nullptr, lookup, nullptr, this, client_})); - int64_t callback_index = - RedisCallbackManager::instance().add([d](const std::string &data) { - if (d->callback != nullptr) { - std::vector results; - if (!data.empty()) { - auto root = flatbuffers::GetRoot(data.data()); - RAY_CHECK(from_flatbuf(*root->id()) == d->id); - for (size_t i = 0; i < root->entries()->size(); i++) { - DataT result; - auto data_root = - 
flatbuffers::GetRoot(root->entries()->Get(i)->data()); - data_root->UnPackTo(&result); - results.emplace_back(std::move(result)); - } - } - (d->callback)(d->client, d->id, results); + auto callback = [this, id, lookup](const std::string &data) { + if (lookup != nullptr) { + std::vector results; + if (!data.empty()) { + auto root = flatbuffers::GetRoot(data.data()); + RAY_CHECK(from_flatbuf(*root->id()) == id); + for (size_t i = 0; i < root->entries()->size(); i++) { + DataT result; + auto data_root = flatbuffers::GetRoot(root->entries()->Get(i)->data()); + data_root->UnPackTo(&result); + results.emplace_back(std::move(result)); } - return true; - }); + } + lookup(client_, id, results); + } + return true; + }; std::vector nil; return context_->RunAsync("RAY.TABLE_LOOKUP", id, nil.data(), nil.size(), prefix_, - pubsub_channel_, callback_index); + pubsub_channel_, std::move(callback)); } template @@ -87,42 +77,38 @@ Status Log::Subscribe(const JobID &job_id, const ClientID &client_id, const SubscriptionCallback &done) { RAY_CHECK(subscribe_callback_index_ == -1) << "Client called Subscribe twice on the same table"; - auto d = std::shared_ptr( - new CallbackData({client_id, nullptr, subscribe, done, this, client_})); - int64_t callback_index = - RedisCallbackManager::instance().add([d](const std::string &data) { - if (data.empty()) { - // No notification data is provided. This is the callback for the - // initial subscription request. - if (d->subscription_callback != nullptr) { - (d->subscription_callback)(d->client); - } - } else { - // Data is provided. This is the callback for a message. - if (d->callback != nullptr) { - // Parse the notification. - auto root = flatbuffers::GetRoot(data.data()); - ID id = UniqueID::nil(); - if (root->id()->size() > 0) { - id = from_flatbuf(*root->id()); - } - std::vector results; - for (size_t i = 0; i < root->entries()->size(); i++) { - DataT result; - auto data_root = - flatbuffers::GetRoot(root->entries()->Get(i)->data()); - data_root->UnPackTo(&result); - results.emplace_back(std::move(result)); - } - (d->callback)(d->client, id, results); - } + auto callback = [this, subscribe, done](const std::string &data) { + if (data.empty()) { + // No notification data is provided. This is the callback for the + // initial subscription request. + if (done != nullptr) { + done(client_); + } + } else { + // Data is provided. This is the callback for a message. + if (subscribe != nullptr) { + // Parse the notification. + auto root = flatbuffers::GetRoot(data.data()); + ID id = UniqueID::nil(); + if (root->id()->size() > 0) { + id = from_flatbuf(*root->id()); } - // We do not delete the callback after calling it since there may be - // more subscription messages. - return false; - }); - subscribe_callback_index_ = callback_index; - return context_->SubscribeAsync(client_id, pubsub_channel_, callback_index); + std::vector results; + for (size_t i = 0; i < root->entries()->size(); i++) { + DataT result; + auto data_root = flatbuffers::GetRoot(root->entries()->Get(i)->data()); + data_root->UnPackTo(&result); + results.emplace_back(std::move(result)); + } + subscribe(client_, id, results); + } + } + // We do not delete the callback after calling it since there may be + // more subscription messages. 
+ return false; + }; + subscribe_callback_index_ = 1; + return context_->SubscribeAsync(client_id, pubsub_channel_, std::move(callback)); } template @@ -131,8 +117,7 @@ Status Log::RequestNotifications(const JobID &job_id, const ID &id, RAY_CHECK(subscribe_callback_index_ >= 0) << "Client requested notifications on a key before Subscribe completed"; return context_->RunAsync("RAY.TABLE_REQUEST_NOTIFICATIONS", id, client_id.data(), - client_id.size(), prefix_, pubsub_channel_, - /*callback_index=*/-1); + client_id.size(), prefix_, pubsub_channel_, nullptr); } template @@ -141,27 +126,23 @@ Status Log::CancelNotifications(const JobID &job_id, const ID &id, RAY_CHECK(subscribe_callback_index_ >= 0) << "Client canceled notifications on a key before Subscribe completed"; return context_->RunAsync("RAY.TABLE_CANCEL_NOTIFICATIONS", id, client_id.data(), - client_id.size(), prefix_, pubsub_channel_, - /*callback_index=*/-1); + client_id.size(), prefix_, pubsub_channel_, nullptr); } template Status Table::Add(const JobID &job_id, const ID &id, - std::shared_ptr data, const WriteCallback &done) { - auto d = std::shared_ptr( - new CallbackData({id, data, nullptr, nullptr, this, client_})); - int64_t callback_index = - RedisCallbackManager::instance().add([d, done](const std::string &data) { - if (done != nullptr) { - (done)(d->client, d->id, d->data); - } - return true; - }); + std::shared_ptr &dataT, const WriteCallback &done) { + auto callback = [this, id, dataT, done](const std::string &data) { + if (done != nullptr) { + (done)(client_, id, *dataT); + } + return true; + }; flatbuffers::FlatBufferBuilder fbb; fbb.ForceDefaults(true); - fbb.Finish(Data::Pack(fbb, data.get())); + fbb.Finish(Data::Pack(fbb, dataT.get())); return context_->RunAsync("RAY.TABLE_ADD", id, fbb.GetBufferPointer(), fbb.GetSize(), - prefix_, pubsub_channel_, callback_index); + prefix_, pubsub_channel_, std::move(callback)); } template @@ -259,9 +240,8 @@ void ClientTable::HandleNotification(AsyncGcsClient *client, } } -void ClientTable::HandleConnected(AsyncGcsClient *client, - const std::shared_ptr data) { - auto connected_client_id = ClientID::from_binary(data->client_id); +void ClientTable::HandleConnected(AsyncGcsClient *client, const ClientTableDataT &data) { + auto connected_client_id = ClientID::from_binary(data.client_id); RAY_CHECK(client_id_ == connected_client_id) << connected_client_id << " " << client_id_; } @@ -282,7 +262,7 @@ Status ClientTable::Connect(const ClientTableDataT &local_client) { // Callback to handle our own successful connection once we've added // ourselves. auto add_callback = [this](AsyncGcsClient *client, const UniqueID &log_key, - std::shared_ptr data) { + const ClientTableDataT &data) { RAY_CHECK(log_key == client_log_key_); HandleConnected(client, data); @@ -311,7 +291,7 @@ Status ClientTable::Disconnect() { auto data = std::make_shared(local_client_); data->is_insertion = false; auto add_callback = [this](AsyncGcsClient *client, const ClientID &id, - std::shared_ptr data) { + const ClientTableDataT &data) { HandleConnected(client, data); RAY_CHECK_OK(CancelNotifications(JobID::nil(), client_log_key_, id)); }; diff --git a/src/ray/gcs/tables.h b/src/ray/gcs/tables.h index d5c4df088aa7..a90519953f9a 100644 --- a/src/ray/gcs/tables.h +++ b/src/ray/gcs/tables.h @@ -57,8 +57,8 @@ class Log : virtual public PubsubInterface { using Callback = std::function &data)>; /// The callback to call when a write to a key succeeds. 
- using WriteCallback = std::function data)>; + using WriteCallback = + std::function; /// The callback to call when a SUBSCRIBE call completes and we are ready to /// request and receive notifications. using SubscriptionCallback = std::function; @@ -89,7 +89,7 @@ class Log : virtual public PubsubInterface { /// \param done Callback that is called once the data has been written to the /// GCS. /// \return Status - Status Append(const JobID &job_id, const ID &id, std::shared_ptr data, + Status Append(const JobID &job_id, const ID &id, std::shared_ptr &data, const WriteCallback &done); /// Append a log entry to a key if and only if the log has the given number @@ -105,7 +105,7 @@ class Log : virtual public PubsubInterface { /// \param log_length The number of entries that the log must have for the /// append to succeed. /// \return Status - Status AppendAt(const JobID &job_id, const ID &id, std::shared_ptr data, + Status AppendAt(const JobID &job_id, const ID &id, std::shared_ptr &data, const WriteCallback &done, const WriteCallback &failure, int log_length); @@ -187,7 +187,7 @@ class TableInterface { public: using DataT = typename Data::NativeTableType; using WriteCallback = typename Log::WriteCallback; - virtual Status Add(const JobID &job_id, const ID &task_id, std::shared_ptr data, + virtual Status Add(const JobID &job_id, const ID &task_id, std::shared_ptr &data, const WriteCallback &done) = 0; virtual ~TableInterface(){}; }; @@ -212,17 +212,6 @@ class Table : private Log, /// request and receive notifications. using SubscriptionCallback = typename Log::SubscriptionCallback; - struct CallbackData { - ID id; - std::shared_ptr data; - Callback callback; - // An optional callback to call for subscription operations, where the - // first message is a notification of subscription success. - SubscriptionCallback subscription_callback; - Log *log; - AsyncGcsClient *client; - }; - Table(const std::shared_ptr &context, AsyncGcsClient *client) : Log(context, client) {} @@ -237,7 +226,7 @@ class Table : private Log, /// \param done Callback that is called once the data has been written to the /// GCS. /// \return Status - Status Add(const JobID &job_id, const ID &id, std::shared_ptr data, + Status Add(const JobID &job_id, const ID &id, std::shared_ptr &data, const WriteCallback &done); /// Lookup an entry asynchronously. @@ -358,19 +347,18 @@ class TaskTable : public Table { Status TestAndUpdate(const JobID &job_id, const TaskID &id, std::shared_ptr data, const TestAndUpdateCallback &callback) { - int64_t callback_index = RedisCallbackManager::instance().add( - [this, callback, id](const std::string &data) { - auto result = std::make_shared(); - auto root = flatbuffers::GetRoot(data.data()); - root->UnPackTo(result.get()); - callback(client_, id, *result, root->updated()); - return true; - }); + auto redisCallback = [this, callback, id](const std::string &data) { + auto result = std::make_shared(); + auto root = flatbuffers::GetRoot(data.data()); + root->UnPackTo(result.get()); + callback(client_, id, *result, root->updated()); + return true; + }; flatbuffers::FlatBufferBuilder fbb; fbb.Finish(TaskTableTestAndUpdate::Pack(fbb, data.get())); RAY_RETURN_NOT_OK(context_->RunAsync("RAY.TABLE_TEST_AND_UPDATE", id, fbb.GetBufferPointer(), fbb.GetSize(), prefix_, - pubsub_channel_, callback_index)); + pubsub_channel_, redisCallback)); return Status::OK(); } @@ -499,8 +487,7 @@ class ClientTable : private Log { /// Handle a client table notification. 
void HandleNotification(AsyncGcsClient *client, const ClientTableDataT ¬ifications); /// Handle this client's successful connection to the GCS. - void HandleConnected(AsyncGcsClient *client, - const std::shared_ptr client_data); + void HandleConnected(AsyncGcsClient *client, const ClientTableDataT &client_data); /// The key at which the log of client information is stored. This key must /// be kept the same across all instances of the ClientTable, so that all diff --git a/src/ray/gcs/task_table.cc b/src/ray/gcs/task_table.cc index 1e3471cc41f5..6a82c06fa2d4 100644 --- a/src/ray/gcs/task_table.cc +++ b/src/ray/gcs/task_table.cc @@ -46,9 +46,9 @@ Status TaskTableAdd(AsyncGcsClient *gcs_client, Task *task) { TaskSpec *spec = execution_spec.Spec(); auto data = MakeTaskTableData(execution_spec, Task_local_scheduler(task), static_cast(Task_state(task))); - return gcs_client->task_table().Add(ray::JobID::nil(), TaskSpec_task_id(spec), data, - [](gcs::AsyncGcsClient *client, const TaskID &id, - std::shared_ptr data) {}); + return gcs_client->task_table().Add( + ray::JobID::nil(), TaskSpec_task_id(spec), data, + [](gcs::AsyncGcsClient *client, const TaskID &id, const TaskTableDataT &data) {}); } // TODO(pcm): This is a helper method that should go away once we get rid of diff --git a/src/ray/id.cc b/src/ray/id.cc index 4d9634623a78..c61c306c6ccc 100644 --- a/src/ray/id.cc +++ b/src/ray/id.cc @@ -1,5 +1,6 @@ #include "ray/id.h" +#include #include #include "ray/constants.h" @@ -83,7 +84,8 @@ bool UniqueID::operator==(const UniqueID &rhs) const { size_t UniqueID::hash() const { size_t result; - std::memcpy(&result, id_, sizeof(size_t)); + // Skip the bytes for the object prefix. + std::memcpy(&result, id_ + (kObjectIdIndexSize / CHAR_BIT), sizeof(size_t)); return result; } diff --git a/src/ray/object_manager/connection_pool.cc b/src/ray/object_manager/connection_pool.cc index efe5d68891f4..ec06b6d9aac0 100644 --- a/src/ray/object_manager/connection_pool.cc +++ b/src/ray/object_manager/connection_pool.cc @@ -53,7 +53,7 @@ ray::Status ConnectionPool::GetSender(ConnectionType type, const ClientID &clien } ray::Status ConnectionPool::ReleaseSender(ConnectionType type, - std::shared_ptr conn) { + std::shared_ptr &conn) { std::unique_lock guard(connection_mutex); SenderMapType &conn_map = (type == ConnectionType::MESSAGE) ? 
available_message_send_connections_ @@ -64,20 +64,21 @@ ray::Status ConnectionPool::ReleaseSender(ConnectionType type, void ConnectionPool::Add(ReceiverMapType &conn_map, const ClientID &client_id, std::shared_ptr conn) { - conn_map[client_id].push_back(conn); + conn_map[client_id].push_back(std::move(conn)); } void ConnectionPool::Add(SenderMapType &conn_map, const ClientID &client_id, std::shared_ptr conn) { - conn_map[client_id].push_back(conn); + conn_map[client_id].push_back(std::move(conn)); } void ConnectionPool::Remove(ReceiverMapType &conn_map, const ClientID &client_id, - std::shared_ptr conn) { - if (conn_map.count(client_id) == 0) { + std::shared_ptr &conn) { + auto it = conn_map.find(client_id); + if (it == conn_map.end()) { return; } - std::vector> &connections = conn_map[client_id]; + auto &connections = it->second; int64_t pos = std::find(connections.begin(), connections.end(), conn) - connections.begin(); if (pos >= (int64_t)connections.size()) { @@ -87,15 +88,16 @@ void ConnectionPool::Remove(ReceiverMapType &conn_map, const ClientID &client_id } uint64_t ConnectionPool::Count(SenderMapType &conn_map, const ClientID &client_id) { - if (conn_map.count(client_id) == 0) { + auto it = conn_map.find(client_id); + if (it == conn_map.end()) { return 0; - }; - return conn_map[client_id].size(); + } + return it->second.size(); } std::shared_ptr ConnectionPool::Borrow(SenderMapType &conn_map, const ClientID &client_id) { - std::shared_ptr conn = conn_map[client_id].back(); + std::shared_ptr conn = std::move(conn_map[client_id].back()); conn_map[client_id].pop_back(); RAY_LOG(DEBUG) << "Borrow " << client_id << " " << conn_map[client_id].size(); return conn; @@ -103,7 +105,7 @@ std::shared_ptr ConnectionPool::Borrow(SenderMapType &conn_map void ConnectionPool::Return(SenderMapType &conn_map, const ClientID &client_id, std::shared_ptr conn) { - conn_map[client_id].push_back(conn); + conn_map[client_id].push_back(std::move(conn)); RAY_LOG(DEBUG) << "Return " << client_id << " " << conn_map[client_id].size(); } diff --git a/src/ray/object_manager/connection_pool.h b/src/ray/object_manager/connection_pool.h index 15774a28798c..132083d55895 100644 --- a/src/ray/object_manager/connection_pool.h +++ b/src/ray/object_manager/connection_pool.h @@ -74,7 +74,7 @@ class ConnectionPool { /// \param type The type of connection. /// \param conn The actual connection. /// \return Status of invoking this method. - ray::Status ReleaseSender(ConnectionType type, std::shared_ptr conn); + ray::Status ReleaseSender(ConnectionType type, std::shared_ptr &conn); // TODO(hme): Implement with error handling. /// Remove a sender connection. This is invoked if the connection is no longer @@ -106,7 +106,7 @@ class ConnectionPool { /// Removes the given receiver for ClientID from the given map. void Remove(ReceiverMapType &conn_map, const ClientID &client_id, - std::shared_ptr conn); + std::shared_ptr &conn); /// Returns the count of sender connections to ClientID. 
uint64_t Count(SenderMapType &conn_map, const ClientID &client_id); diff --git a/src/ray/object_manager/object_directory.cc b/src/ray/object_manager/object_directory.cc index e7a6c8504c4a..990727e403ef 100644 --- a/src/ray/object_manager/object_directory.cc +++ b/src/ray/object_manager/object_directory.cc @@ -16,8 +16,8 @@ ray::Status ObjectDirectory::ReportObjectAdded(const ObjectID &object_id, data->is_eviction = false; data->object_size = object_info.data_size; ray::Status status = gcs_client_->object_table().Append( - job_id, object_id, data, [](gcs::AsyncGcsClient *client, const UniqueID &id, - const std::shared_ptr data) { + job_id, object_id, data, + [](gcs::AsyncGcsClient *client, const UniqueID &id, const ObjectTableDataT &data) { // Do nothing. }); return status; @@ -58,7 +58,7 @@ ray::Status ObjectDirectory::GetLocations(const ObjectID &object_id, }; ray::Status ObjectDirectory::ExecuteGetLocations(const ObjectID &object_id) { - JobID job_id = JobID::from_random(); + JobID job_id = JobID::nil(); // Note: Lookup must be synchronous for thread-safe access. // For now, this is only accessed by the main thread. ray::Status status = gcs_client_->object_table().Lookup( diff --git a/src/ray/object_manager/object_manager.cc b/src/ray/object_manager/object_manager.cc index cb857fec4f0b..42ef8164aee7 100644 --- a/src/ray/object_manager/object_manager.cc +++ b/src/ray/object_manager/object_manager.cc @@ -110,8 +110,8 @@ ray::Status ObjectManager::Pull(const ObjectID &object_id) { } void ObjectManager::SchedulePull(const ObjectID &object_id, int wait_ms) { - pull_requests_[object_id] = std::shared_ptr( - new asio::deadline_timer(*main_service_, boost::posix_time::milliseconds(wait_ms))); + pull_requests_[object_id] = std::make_shared( + *main_service_, boost::posix_time::milliseconds(wait_ms)); pull_requests_[object_id]->async_wait( [this, object_id](const boost::system::error_code &error_code) { pull_requests_.erase(object_id); @@ -184,7 +184,7 @@ ray::Status ObjectManager::PullEstablishConnection(const ObjectID &object_id, } ray::Status ObjectManager::PullSendRequest(const ObjectID &object_id, - std::shared_ptr conn) { + std::shared_ptr &conn) { flatbuffers::FlatBufferBuilder fbb; auto message = object_manager_protocol::CreatePullRequestMessage( fbb, fbb.CreateString(client_id_.binary()), fbb.CreateString(object_id.binary())); @@ -209,7 +209,7 @@ ray::Status ObjectManager::Push(const ObjectID &object_id, const ClientID &clien Status status = object_directory_->GetInformation( client_id, [this, object_id, client_id](const RemoteConnectionInfo &info) { - ObjectInfoT object_info = local_objects_[object_id]; + const ObjectInfoT &object_info = local_objects_[object_id]; uint64_t data_size = static_cast(object_info.data_size + object_info.metadata_size); uint64_t metadata_size = static_cast(object_info.metadata_size); @@ -251,7 +251,7 @@ void ObjectManager::ExecuteSendObject(const ClientID &client_id, ray::Status ObjectManager::SendObjectHeaders(const ObjectID &object_id, uint64_t data_size, uint64_t metadata_size, uint64_t chunk_index, - std::shared_ptr conn) { + std::shared_ptr &conn) { std::pair chunk_status = buffer_pool_.GetChunk(object_id, data_size, metadata_size, chunk_index); ObjectBufferPool::ChunkInfo chunk_info = chunk_status.first; @@ -276,7 +276,7 @@ ray::Status ObjectManager::SendObjectHeaders(const ObjectID &object_id, ray::Status ObjectManager::SendObjectData(const ObjectID &object_id, const ObjectBufferPool::ChunkInfo &chunk_info, - std::shared_ptr conn) { + std::shared_ptr 
&conn) { boost::system::error_code ec; std::vector buffer; buffer.push_back(asio::buffer(chunk_info.data, chunk_info.buffer_length)); @@ -328,11 +328,11 @@ std::shared_ptr ObjectManager::CreateSenderConnection( return conn; } -void ObjectManager::ProcessNewClient(std::shared_ptr conn) { - conn->ProcessMessages(); +void ObjectManager::ProcessNewClient(TcpClientConnection &conn) { + conn.ProcessMessages(); } -void ObjectManager::ProcessClientMessage(std::shared_ptr conn, +void ObjectManager::ProcessClientMessage(std::shared_ptr &conn, int64_t message_type, const uint8_t *message) { switch (message_type) { case object_manager_protocol::MessageType_PushRequest: { @@ -389,7 +389,7 @@ void ObjectManager::ReceivePullRequest(std::shared_ptr &con conn->ProcessMessages(); } -void ObjectManager::ReceivePushRequest(std::shared_ptr conn, +void ObjectManager::ReceivePushRequest(std::shared_ptr &conn, const uint8_t *message) { // Serialize. auto object_header = @@ -400,14 +400,14 @@ void ObjectManager::ReceivePushRequest(std::shared_ptr conn uint64_t metadata_size = object_header->metadata_size(); receive_service_.post([this, object_id, data_size, metadata_size, chunk_index, conn]() { ExecuteReceiveObject(conn->GetClientID(), object_id, data_size, metadata_size, - chunk_index, conn); + chunk_index, *conn); }); } void ObjectManager::ExecuteReceiveObject(const ClientID &client_id, const ObjectID &object_id, uint64_t data_size, uint64_t metadata_size, uint64_t chunk_index, - std::shared_ptr conn) { + TcpClientConnection &conn) { RAY_LOG(DEBUG) << "ExecuteReceiveObject " << client_id << " " << object_id << " " << chunk_index; @@ -419,7 +419,7 @@ void ObjectManager::ExecuteReceiveObject(const ClientID &client_id, std::vector buffer; buffer.push_back(asio::buffer(chunk_info.data, chunk_info.buffer_length)); boost::system::error_code ec; - conn->ReadBuffer(buffer, ec); + conn.ReadBuffer(buffer, ec); if (ec.value() == 0) { buffer_pool_.SealChunk(object_id, chunk_index); } else { @@ -435,13 +435,13 @@ void ObjectManager::ExecuteReceiveObject(const ClientID &client_id, std::vector buffer; buffer.push_back(asio::buffer(mutable_vec, buffer_length)); boost::system::error_code ec; - conn->ReadBuffer(buffer, ec); + conn.ReadBuffer(buffer, ec); if (ec.value() != 0) { RAY_LOG(ERROR) << ec.message(); } // TODO(hme): If the object isn't local, create a pull request for this chunk. } - conn->ProcessMessages(); + conn.ProcessMessages(); RAY_LOG(DEBUG) << "ReceiveCompleted " << client_id_ << " " << object_id << " " << "/" << config_.max_receives; } diff --git a/src/ray/object_manager/object_manager.h b/src/ray/object_manager/object_manager.h index 117a3073d414..d34d50762f24 100644 --- a/src/ray/object_manager/object_manager.h +++ b/src/ray/object_manager/object_manager.h @@ -110,7 +110,7 @@ class ObjectManager { /// /// \param conn The connection. /// \return Status of whether the connection was successfully established. - void ProcessNewClient(std::shared_ptr conn); + void ProcessNewClient(TcpClientConnection &conn); /// Process messages sent from other nodes. We only establish /// transfer connections using this method; all other transfer communication @@ -119,7 +119,7 @@ class ObjectManager { /// \param conn The connection. /// \param message_type The message type. /// \param message A pointer set to the beginning of the message. 
- void ProcessClientMessage(std::shared_ptr conn, + void ProcessClientMessage(std::shared_ptr &conn, int64_t message_type, const uint8_t *message); /// Cancels all requests (Push/Pull) associated with the given ObjectID. @@ -226,7 +226,7 @@ class ObjectManager { /// Synchronously send a pull request via remote object manager connection. /// Executes on main_service_ thread. ray::Status PullSendRequest(const ObjectID &object_id, - std::shared_ptr conn); + std::shared_ptr &conn); std::shared_ptr CreateSenderConnection( ConnectionPool::ConnectionType type, RemoteConnectionInfo info); @@ -241,23 +241,22 @@ class ObjectManager { /// Executes on send_service_ thread pool. ray::Status SendObjectHeaders(const ObjectID &object_id, uint64_t data_size, uint64_t metadata_size, uint64_t chunk_index, - std::shared_ptr conn); + std::shared_ptr &conn); /// This method initiates the actual object transfer. /// Executes on send_service_ thread pool. ray::Status SendObjectData(const ObjectID &object_id, const ObjectBufferPool::ChunkInfo &chunk_info, - std::shared_ptr conn); + std::shared_ptr &conn); /// Invoked when a remote object manager pushes an object to this object manager. /// This will invoke the object receive on the receive_service_ thread pool. - void ReceivePushRequest(std::shared_ptr conn, + void ReceivePushRequest(std::shared_ptr &conn, const uint8_t *message); /// Execute a receive on the receive_service_ thread pool. void ExecuteReceiveObject(const ClientID &client_id, const ObjectID &object_id, uint64_t data_size, uint64_t metadata_size, - uint64_t chunk_index, - std::shared_ptr conn); + uint64_t chunk_index, TcpClientConnection &conn); /// Handles receiving a pull request message. void ReceivePullRequest(std::shared_ptr &conn, diff --git a/src/ray/object_manager/object_manager_client_connection.cc b/src/ray/object_manager/object_manager_client_connection.cc index b904e5d90ab5..1fcfb7a44304 100644 --- a/src/ray/object_manager/object_manager_client_connection.cc +++ b/src/ray/object_manager/object_manager_client_connection.cc @@ -11,7 +11,7 @@ std::shared_ptr SenderConnection::Create( RAY_CHECK_OK(TcpConnect(socket, ip, port)); std::shared_ptr conn = std::make_shared(std::move(socket)); - return std::make_shared(conn, client_id); + return std::make_shared(std::move(conn), client_id); }; SenderConnection::SenderConnection(std::shared_ptr conn, diff --git a/src/ray/object_manager/test/object_manager_stress_test.cc b/src/ray/object_manager/test/object_manager_stress_test.cc index 350b37c4caed..827d19818979 100644 --- a/src/ray/object_manager/test/object_manager_stress_test.cc +++ b/src/ray/object_manager/test/object_manager_stress_test.cc @@ -65,9 +65,7 @@ class MockServer { void HandleAcceptObjectManager(const boost::system::error_code &error) { ClientHandler client_handler = - [this](std::shared_ptr client) { - object_manager_.ProcessNewClient(client); - }; + [this](TcpClientConnection &client) { object_manager_.ProcessNewClient(client); }; MessageHandler message_handler = [this]( std::shared_ptr client, int64_t message_type, const uint8_t *message) { diff --git a/src/ray/object_manager/test/object_manager_test.cc b/src/ray/object_manager/test/object_manager_test.cc index 259c3ea82287..faef6850a465 100644 --- a/src/ray/object_manager/test/object_manager_test.cc +++ b/src/ray/object_manager/test/object_manager_test.cc @@ -56,9 +56,7 @@ class MockServer { void HandleAcceptObjectManager(const boost::system::error_code &error) { ClientHandler client_handler = - [this](std::shared_ptr client) { - 
object_manager_.ProcessNewClient(client); - }; + [this](TcpClientConnection &client) { object_manager_.ProcessNewClient(client); }; MessageHandler message_handler = [this]( std::shared_ptr client, int64_t message_type, const uint8_t *message) { diff --git a/src/ray/raylet/lineage_cache.cc b/src/ray/raylet/lineage_cache.cc index 592c26481c24..9cc9cad1b919 100644 --- a/src/ray/raylet/lineage_cache.cc +++ b/src/ray/raylet/lineage_cache.cc @@ -258,8 +258,9 @@ Status LineageCache::Flush() { // Write back all ready tasks whose arguments have been committed to the GCS. gcs::raylet::TaskTable::WriteCallback task_callback = [this]( - ray::gcs::AsyncGcsClient *client, const TaskID &id, - const std::shared_ptr data) { HandleEntryCommitted(id); }; + ray::gcs::AsyncGcsClient *client, const TaskID &id, const protocol::TaskT &data) { + HandleEntryCommitted(id); + }; for (const auto &ready_task_id : ready_task_ids) { auto task = lineage_.GetEntry(ready_task_id); // TODO(swang): Make this better... diff --git a/src/ray/raylet/lineage_cache_test.cc b/src/ray/raylet/lineage_cache_test.cc index 9a51a3cf9321..1a19feb5c2ca 100644 --- a/src/ray/raylet/lineage_cache_test.cc +++ b/src/ray/raylet/lineage_cache_test.cc @@ -23,7 +23,7 @@ class MockGcs : public gcs::TableInterface, } Status Add(const JobID &job_id, const TaskID &task_id, - std::shared_ptr task_data, + std::shared_ptr &task_data, const gcs::TableInterface::WriteCallback &done) { task_table_[task_id] = task_data; callbacks_.push_back( @@ -38,7 +38,7 @@ class MockGcs : public gcs::TableInterface, bool send_notification = (subscribed_tasks_.count(task_id) == 1); auto callback = [this, send_notification](ray::gcs::AsyncGcsClient *client, const TaskID &task_id, - std::shared_ptr data) { + const protocol::TaskT &data) { if (send_notification) { notification_callback_(client, task_id, data); } @@ -63,7 +63,7 @@ class MockGcs : public gcs::TableInterface, void Flush() { for (const auto &callback : callbacks_) { - callback.first(NULL, callback.second, task_table_[callback.second]); + callback.first(NULL, callback.second, *task_table_[callback.second]); } callbacks_.clear(); } @@ -86,7 +86,7 @@ class LineageCacheTest : public ::testing::Test { LineageCacheTest() : mock_gcs_(), lineage_cache_(ClientID::from_random(), mock_gcs_, mock_gcs_) { mock_gcs_.Subscribe([this](ray::gcs::AsyncGcsClient *client, const TaskID &task_id, - std::shared_ptr data) { + const ray::protocol::TaskT &data) { lineage_cache_.HandleEntryCommitted(task_id); }); } diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 82740f1f5985..3fc99a2a6080 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -156,7 +156,7 @@ void NodeManager::Heartbeat() { ray::Status status = heartbeat_table.Add( UniqueID::nil(), gcs_client_->client_table().GetLocalClientId(), heartbeat_data, [](ray::gcs::AsyncGcsClient *client, const ClientID &id, - std::shared_ptr data) { + const HeartbeatTableDataT &data) { RAY_LOG(DEBUG) << "[HEARTBEAT] heartbeat sent callback"; }); @@ -251,37 +251,45 @@ void NodeManager::HandleActorCreation(const ActorID &actor_id, // Register the new actor. ActorRegistration actor_registration(data.back()); + ClientID received_node_manager_id = actor_registration.GetNodeManagerId(); // Extend the frontier to include the actor creation task. NOTE(swang): The // creator of the actor is always assigned nil as the actor handle ID. 
actor_registration.ExtendFrontier(ActorHandleID::nil(), actor_registration.GetActorCreationDependency()); auto inserted = actor_registry_.emplace(actor_id, std::move(actor_registration)); - RAY_CHECK(inserted.second); - - // Dequeue any methods that were submitted before the actor's location was - // known. - const auto &methods = local_queues_.GetUncreatedActorMethods(); - std::unordered_set created_actor_method_ids; - for (const auto &method : methods) { - if (method.GetTaskSpecification().ActorId() == actor_id) { - created_actor_method_ids.insert(method.GetTaskSpecification().TaskId()); + if (!inserted.second) { + // If we weren't able to insert the actor's location, check that the + // existing entry is the same as the new one. + // TODO(swang): This is not true in the case of failures. + RAY_CHECK(received_node_manager_id == inserted.first->second.GetNodeManagerId()) + << "Actor scheduled on " << inserted.first->second.GetNodeManagerId() + << ", but received notification for " << received_node_manager_id; + } else { + // The actor's location is now known. Dequeue any methods that were + // submitted before the actor's location was known. + const auto &methods = local_queues_.GetUncreatedActorMethods(); + std::unordered_set created_actor_method_ids; + for (const auto &method : methods) { + if (method.GetTaskSpecification().ActorId() == actor_id) { + created_actor_method_ids.insert(method.GetTaskSpecification().TaskId()); + } + } + // Resubmit the methods that were submitted before the actor's location was + // known. + auto created_actor_methods = local_queues_.RemoveTasks(created_actor_method_ids); + for (const auto &method : created_actor_methods) { + lineage_cache_.RemoveWaitingTask(method.GetTaskSpecification().TaskId()); + // The task's uncommitted lineage was already added to the local lineage + // cache upon the initial submission, so it's okay to resubmit it with an + // empty lineage this time. + SubmitTask(method, Lineage()); } - } - // Resubmit the methods that were submitted before the actor's location was - // known. - auto created_actor_methods = local_queues_.RemoveTasks(created_actor_method_ids); - for (const auto &method : created_actor_methods) { - lineage_cache_.RemoveWaitingTask(method.GetTaskSpecification().TaskId()); - // The task's uncommitted lineage was already added to the local lineage - // cache upon the initial submission, so it's okay to resubmit it with an - // empty lineage this time. - SubmitTask(method, Lineage()); } } -void NodeManager::ProcessNewClient(std::shared_ptr client) { +void NodeManager::ProcessNewClient(LocalClientConnection &client) { // The new client is a worker, so begin listening for messages. - client->ProcessMessages(); + client.ProcessMessages(); } void NodeManager::DispatchTasks() { @@ -309,9 +317,9 @@ void NodeManager::DispatchTasks() { } } -void NodeManager::ProcessClientMessage(std::shared_ptr client, - int64_t message_type, - const uint8_t *message_data) { +void NodeManager::ProcessClientMessage( + const std::shared_ptr &client, int64_t message_type, + const uint8_t *message_data) { RAY_LOG(DEBUG) << "Message of type " << message_type; switch (message_type) { @@ -319,7 +327,7 @@ void NodeManager::ProcessClientMessage(std::shared_ptr cl auto message = flatbuffers::GetRoot(message_data); if (message->is_worker()) { // Create a new worker from the registration request. - std::shared_ptr worker(new Worker(message->worker_pid(), client)); + auto worker = std::make_shared(message->worker_pid(), client); // Register the new worker. 
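// --- Editor's note (illustrative sketch, not part of the patch) ---
// The duplicate-registration branch above relies on unordered_map::emplace
// returning an {iterator, bool} pair: the bool is false when the key already
// exists, and in that case the existing entry is left untouched. Standalone
// example with made-up types:
#include <cassert>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<int, std::string> actor_registry;
  auto first = actor_registry.emplace(42, "node-A");
  assert(first.second);  // inserted
  auto second = actor_registry.emplace(42, "node-B");
  assert(!second.second);                    // key was already present
  assert(second.first->second == "node-A");  // existing value unchanged
  return 0;
}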
worker_pool_.RegisterWorker(std::move(worker)); } @@ -329,10 +337,10 @@ void NodeManager::ProcessClientMessage(std::shared_ptr cl RAY_CHECK(worker); // If the worker was assigned a task, mark it as finished. if (!worker->GetAssignedTaskId().is_nil()) { - FinishAssignedTask(worker); + FinishAssignedTask(*worker); } // Return the worker to the idle pool. - worker_pool_.PushWorker(worker); + worker_pool_.PushWorker(std::move(worker)); // Call task dispatch to assign work to the new worker. DispatchTasks(); @@ -436,14 +444,13 @@ void NodeManager::ProcessClientMessage(std::shared_ptr cl client->ProcessMessages(); } -void NodeManager::ProcessNewNodeManager( - std::shared_ptr node_manager_client) { - node_manager_client->ProcessMessages(); +void NodeManager::ProcessNewNodeManager(TcpClientConnection &node_manager_client) { + node_manager_client.ProcessMessages(); } -void NodeManager::ProcessNodeManagerMessage( - std::shared_ptr node_manager_client, int64_t message_type, - const uint8_t *message_data) { +void NodeManager::ProcessNodeManagerMessage(TcpClientConnection &node_manager_client, + int64_t message_type, + const uint8_t *message_data) { switch (message_type) { case protocol::MessageType_ForwardTaskRequest: { auto message = flatbuffers::GetRoot(message_data); @@ -458,7 +465,7 @@ void NodeManager::ProcessNodeManagerMessage( default: RAY_LOG(FATAL) << "Received unexpected message type " << message_type; } - node_manager_client->ProcessMessages(); + node_manager_client.ProcessMessages(); } void NodeManager::HandleWaitingTaskReady(const TaskID &task_id) { @@ -639,8 +646,8 @@ void NodeManager::AssignTask(Task &task) { } } -void NodeManager::FinishAssignedTask(std::shared_ptr worker) { - TaskID task_id = worker->GetAssignedTaskId(); +void NodeManager::FinishAssignedTask(Worker &worker) { + TaskID task_id = worker.GetAssignedTaskId(); RAY_LOG(DEBUG) << "Finished task " << task_id; auto tasks = local_queues_.RemoveTasks({task_id}); auto task = *tasks.begin(); @@ -648,7 +655,7 @@ void NodeManager::FinishAssignedTask(std::shared_ptr worker) { if (task.GetTaskSpecification().IsActorCreationTask()) { // If this was an actor creation task, then convert the worker to an actor. auto actor_id = task.GetTaskSpecification().ActorCreationId(); - worker->AssignActorId(actor_id); + worker.AssignActorId(actor_id); // Publish the actor creation event to all other nodes so that methods for // the actor will be forwarded directly to this node. @@ -684,7 +691,7 @@ void NodeManager::FinishAssignedTask(std::shared_ptr worker) { } // Unset the worker's assigned task. - worker->AssignTaskId(TaskID::nil()); + worker.AssignTaskId(TaskID::nil()); } void NodeManager::ResubmitTask(const TaskID &task_id) { diff --git a/src/ray/raylet/node_manager.h b/src/ray/raylet/node_manager.h index 3cf77327d08d..268e61f2301b 100644 --- a/src/ray/raylet/node_manager.h +++ b/src/ray/raylet/node_manager.h @@ -37,7 +37,7 @@ class NodeManager { std::shared_ptr gcs_client); /// Process a new client connection. - void ProcessNewClient(std::shared_ptr client); + void ProcessNewClient(LocalClientConnection &client); /// Process a message from a client. This method is responsible for /// explicitly listening for more messages from the client if the client is @@ -46,12 +46,12 @@ class NodeManager { /// \param client The client that sent the message. /// \param message_type The message type (e.g., a flatbuffer enum). /// \param message A pointer to the message data. 
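// --- Editor's note (illustrative sketch, not part of the patch) ---
// Two idioms recur in the hunks above: std::make_shared<T>(...) allocates the
// object and its control block in a single allocation, unlike
// std::shared_ptr<T>(new T(...)); and std::move(ptr) hands the local
// shared_ptr to the container instead of copying it, so no extra refcount
// round-trip occurs. The Worker type below is made up:
#include <memory>
#include <vector>

struct Worker {
  explicit Worker(int pid) : pid(pid) {}
  int pid;
};

int main() {
  std::vector<std::shared_ptr<Worker>> registered_workers;
  auto worker = std::make_shared<Worker>(1234);      // single allocation
  registered_workers.push_back(std::move(worker));   // no copy, no refcount bump
  // 'worker' is now null; use registered_workers.back() from here on.
  return 0;
}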
- void ProcessClientMessage(std::shared_ptr client, + void ProcessClientMessage(const std::shared_ptr &client, int64_t message_type, const uint8_t *message); - void ProcessNewNodeManager(std::shared_ptr node_manager_client); + void ProcessNewNodeManager(TcpClientConnection &node_manager_client); - void ProcessNodeManagerMessage(std::shared_ptr node_manager_client, + void ProcessNodeManagerMessage(TcpClientConnection &node_manager_client, int64_t message_type, const uint8_t *message); ray::Status RegisterGcs(); @@ -69,7 +69,7 @@ class NodeManager { /// Assign a task. The task is assumed to not be queued in local_queues_. void AssignTask(Task &task); /// Handle a worker finishing its assigned task. - void FinishAssignedTask(std::shared_ptr worker); + void FinishAssignedTask(Worker &worker); /// Schedule tasks. void ScheduleTasks(); /// Handle a task whose local dependencies were missing and are now available. diff --git a/src/ray/raylet/raylet.cc b/src/ray/raylet/raylet.cc index 21ad56541240..bc23eb54d299 100644 --- a/src/ray/raylet/raylet.cc +++ b/src/ray/raylet/raylet.cc @@ -86,14 +86,12 @@ void Raylet::DoAcceptNodeManager() { void Raylet::HandleAcceptNodeManager(const boost::system::error_code &error) { if (!error) { - ClientHandler client_handler = - [this](std::shared_ptr client) { - node_manager_.ProcessNewNodeManager(client); - }; + ClientHandler client_handler = [this]( + TcpClientConnection &client) { node_manager_.ProcessNewNodeManager(client); }; MessageHandler message_handler = [this]( std::shared_ptr client, int64_t message_type, const uint8_t *message) { - node_manager_.ProcessNodeManagerMessage(client, message_type, message); + node_manager_.ProcessNodeManagerMessage(*client, message_type, message); }; // Accept a new local client and dispatch it to the node manager. auto new_connection = TcpClientConnection::Create(client_handler, message_handler, @@ -111,9 +109,7 @@ void Raylet::DoAcceptObjectManager() { void Raylet::HandleAcceptObjectManager(const boost::system::error_code &error) { ClientHandler client_handler = - [this](std::shared_ptr client) { - object_manager_.ProcessNewClient(client); - }; + [this](TcpClientConnection &client) { object_manager_.ProcessNewClient(client); }; MessageHandler message_handler = [this]( std::shared_ptr client, int64_t message_type, const uint8_t *message) { @@ -134,9 +130,7 @@ void Raylet::HandleAccept(const boost::system::error_code &error) { if (!error) { // TODO: typedef these handlers. 
ClientHandler client_handler = - [this](std::shared_ptr client) { - node_manager_.ProcessNewClient(client); - }; + [this](LocalClientConnection &client) { node_manager_.ProcessNewClient(client); }; MessageHandler message_handler = [this]( std::shared_ptr client, int64_t message_type, const uint8_t *message) { diff --git a/src/ray/raylet/worker_pool.cc b/src/ray/raylet/worker_pool.cc index 4b58b5a637f8..1a15dd6c22c0 100644 --- a/src/ray/raylet/worker_pool.cc +++ b/src/ray/raylet/worker_pool.cc @@ -87,14 +87,15 @@ void WorkerPool::StartWorker(bool force_start) { } void WorkerPool::RegisterWorker(std::shared_ptr worker) { - RAY_LOG(DEBUG) << "Registering worker with pid " << worker->Pid(); - registered_workers_.push_back(worker); - RAY_CHECK(started_worker_pids_.count(worker->Pid()) > 0); - started_worker_pids_.erase(worker->Pid()); + auto pid = worker->Pid(); + RAY_LOG(DEBUG) << "Registering worker with pid " << pid; + registered_workers_.push_back(std::move(worker)); + RAY_CHECK(started_worker_pids_.count(pid) > 0); + started_worker_pids_.erase(pid); } std::shared_ptr WorkerPool::GetRegisteredWorker( - std::shared_ptr connection) const { + const std::shared_ptr &connection) const { for (auto it = registered_workers_.begin(); it != registered_workers_.end(); it++) { if ((*it)->Connection() == connection) { return (*it); @@ -135,7 +136,7 @@ std::shared_ptr WorkerPool::PopWorker(const ActorID &actor_id) { // A helper function to remove a worker from a list. Returns true if the worker // was found and removed. bool removeWorker(std::list> &worker_pool, - std::shared_ptr worker) { + const std::shared_ptr &worker) { for (auto it = worker_pool.begin(); it != worker_pool.end(); it++) { if (*it == worker) { worker_pool.erase(it); diff --git a/src/ray/raylet/worker_pool.h b/src/ray/raylet/worker_pool.h index 8b6ef1e54d24..d1c6def3d473 100644 --- a/src/ray/raylet/worker_pool.h +++ b/src/ray/raylet/worker_pool.h @@ -60,7 +60,7 @@ class WorkerPool { /// \return The Worker that owns the given client connection. Returns nullptr /// if the client has not registered a worker yet. std::shared_ptr GetRegisteredWorker( - std::shared_ptr connection) const; + const std::shared_ptr &connection) const; /// Disconnect a registered worker. 
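// --- Editor's note (illustrative sketch, not part of the patch) ---
// removeWorker and GetRegisteredWorker above now take the shared_ptr as a
// const reference, which avoids a refcount bump per call, and they compare
// shared_ptrs with ==, which compares the managed pointers (object identity),
// not the pointed-to values. Standalone example with a made-up Worker type:
#include <cassert>
#include <list>
#include <memory>

struct Worker {};

bool RemoveWorker(std::list<std::shared_ptr<Worker>> &pool,
                  const std::shared_ptr<Worker> &worker) {
  for (auto it = pool.begin(); it != pool.end(); it++) {
    if (*it == worker) {  // same Worker instance
      pool.erase(it);
      return true;
    }
  }
  return false;
}

int main() {
  auto a = std::make_shared<Worker>();
  auto b = std::make_shared<Worker>();
  std::list<std::shared_ptr<Worker>> pool = {a, b};
  assert(RemoveWorker(pool, a));
  assert(!RemoveWorker(pool, std::make_shared<Worker>()));  // different object
  return 0;
}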
/// diff --git a/src/ray/raylet/worker_pool_test.cc b/src/ray/raylet/worker_pool_test.cc index 8d6c526b4945..28c5ef730d82 100644 --- a/src/ray/raylet/worker_pool_test.cc +++ b/src/ray/raylet/worker_pool_test.cc @@ -30,8 +30,8 @@ class WorkerPoolTest : public ::testing::Test { WorkerPoolTest() : worker_pool_({}), io_service_() {} std::shared_ptr CreateWorker(pid_t pid) { - std::function)> client_handler = [this]( - std::shared_ptr client) { HandleNewClient(client); }; + std::function client_handler = + [this](LocalClientConnection &client) { HandleNewClient(client); }; std::function, int64_t, const uint8_t *)> message_handler = [this](std::shared_ptr client, int64_t message_type, const uint8_t *message) { @@ -49,7 +49,7 @@ class WorkerPoolTest : public ::testing::Test { boost::asio::io_service io_service_; private: - void HandleNewClient(std::shared_ptr){}; + void HandleNewClient(LocalClientConnection &){}; void HandleMessage(std::shared_ptr, int64_t, const uint8_t *){}; }; diff --git a/test/actor_test.py b/test/actor_test.py index 7e040185b9fc..1d1c5ed18b20 100644 --- a/test/actor_test.py +++ b/test/actor_test.py @@ -18,6 +18,9 @@ class ActorAPI(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testKeywordArgs(self): ray.init(num_workers=0, driver_mode=ray.SILENT_MODE) @@ -68,6 +71,9 @@ def get_values(self, arg0, arg1=2, arg2="b"): with self.assertRaises(Exception): ray.get(actor.get_values.remote()) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testVariableNumberOfArgs(self): ray.init(num_workers=0) @@ -234,6 +240,9 @@ class Actor(object): def __init__(self): pass + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testRandomIDGeneration(self): ray.init(num_workers=0) @@ -327,6 +336,9 @@ def f(self, y): with self.assertRaises(Exception): t.f(1) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorDeletion(self): ray.init(num_workers=0) @@ -359,6 +371,9 @@ def method(self): # called. self.assertEqual(ray.get(Actor.remote().method.remote()), 1) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorDeletionWithGPUs(self): ray.init(num_workers=0, num_gpus=1) @@ -549,6 +564,9 @@ def get_values(self, z): actor2 = Actor2.remote(3, 4) self.assertEqual(ray.get(actor2.get_values.remote(5)), (3, 4)) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testDefineActorWithinRemoteFunction(self): # Make sure we can define and actors within remote funtions. ray.init(num_cpus=10) @@ -684,6 +702,9 @@ class ActorsOnMultipleNodes(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorsOnNodesWithNoCPUs(self): ray.init(num_cpus=0) @@ -774,7 +795,7 @@ def get_location_and_ids(self): # Make sure that no two actors are assigned to the same GPU. 
locations_and_ids = ray.get( [actor.get_location_and_ids.remote() for actor in actors]) - node_names = set([location for location, gpu_id in locations_and_ids]) + node_names = {location for location, gpu_id in locations_and_ids} self.assertEqual(len(node_names), num_local_schedulers) location_actor_combinations = [] for node_name in node_names: @@ -815,7 +836,7 @@ def get_location_and_ids(self): # Make sure that no two actors are assigned to the same GPU. locations_and_ids = ray.get( [actor.get_location_and_ids.remote() for actor in actors1]) - node_names = set([location for location, gpu_id in locations_and_ids]) + node_names = {location for location, gpu_id in locations_and_ids} self.assertEqual(len(node_names), num_local_schedulers) # Keep track of which GPU IDs are being used for each location. @@ -847,9 +868,9 @@ def get_location_and_ids(self): # Make sure that no two actors are assigned to the same GPU. locations_and_ids = ray.get( [actor.get_location_and_ids.remote() for actor in actors2]) - self.assertEqual( - node_names, - set([location for location, gpu_id in locations_and_ids])) + self.assertEqual(node_names, + {location + for location, gpu_id in locations_and_ids}) for location, gpu_ids in locations_and_ids: gpus_in_use[location].extend(gpu_ids) for node_name in node_names: @@ -887,7 +908,7 @@ def get_location_and_ids(self): # Make sure that no two actors are assigned to the same GPU. locations_and_ids = ray.get( [actor.get_location_and_ids.remote() for actor in actors]) - node_names = set([location for location, gpu_id in locations_and_ids]) + node_names = {location for location, gpu_id in locations_and_ids} self.assertEqual(len(node_names), 2) for node_name in node_names: node_gpu_ids = [ @@ -896,8 +917,8 @@ def get_location_and_ids(self): ] self.assertIn(len(node_gpu_ids), [5, 10]) self.assertEqual( - set(node_gpu_ids), - set([(i, ) for i in range(len(node_gpu_ids))])) + set(node_gpu_ids), {(i, ) + for i in range(len(node_gpu_ids))}) # Creating a new actor should fail because all of the GPUs are being # used. 
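# --- Editor's note (illustrative, not part of the patch) ---
# Several hunks above replace set([<expr> for ...]) with a set comprehension.
# Both forms build the same set, but the comprehension skips the intermediate
# list and is the idiomatic spelling. Self-contained example with made-up data:
locations_and_ids = [("node1", (0, )), ("node2", (1, )), ("node1", (2, ))]

node_names_old = set([location for location, gpu_id in locations_and_ids])
node_names_new = {location for location, gpu_id in locations_and_ids}

assert node_names_old == node_names_new == {"node1", "node2"}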
@@ -1098,6 +1119,9 @@ def locations_to_intervals_for_many_tasks(): ready_ids, remaining_ids = ray.wait(results, timeout=1000) self.assertEqual(len(ready_ids), 0) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorsAndTasksWithGPUsVersionTwo(self): # Create tasks and actors that both use GPUs and make sure that they # are given different GPUs @@ -1170,6 +1194,9 @@ def sleep(self): self.assertLess(interval1[1], interval2[0]) self.assertLess(interval2[0], interval2[1]) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testBlockingActorTask(self): ray.init(num_cpus=1, num_gpus=1) @@ -1763,6 +1790,9 @@ def read(self): return Queue.remote() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testFork(self): queue = self.setup_queue_actor() @@ -1778,6 +1808,9 @@ def fork(queue, key, item): filtered_items = [item[1] for item in items if item[0] == i] self.assertEqual(filtered_items, list(range(1))) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testForkConsistency(self): queue = self.setup_queue_actor() @@ -1822,7 +1855,12 @@ def testCallingPutOnActorHandle(self): @ray.remote class Counter(object): - pass + def __init__(self): + self.x = 0 + + def inc(self): + self.x += 1 + return self.x @ray.remote def f(): @@ -1832,24 +1870,43 @@ def f(): def g(): return [Counter.remote()] - with self.assertRaises(Exception): - ray.put(Counter.remote()) + # Currently, calling ray.put on an actor handle is allowed, but is + # there a good use case? + counter = Counter.remote() + counter_id = ray.put(counter) + new_counter = ray.get(counter_id) + assert ray.get(new_counter.inc.remote()) == 1 + assert ray.get(counter.inc.remote()) == 2 + assert ray.get(new_counter.inc.remote()) == 3 with self.assertRaises(Exception): ray.get(f.remote()) - # The below test is commented out because it currently does not behave - # properly. The call to g.remote() does not raise an exception because - # even though the actor handle cannot be pickled, pyarrow attempts to - # serialize it as a dictionary of its fields which kind of works. - # self.assertRaises(Exception): - # ray.get(g.remote()) + # The below test works, but do we want to disallow this usage? + ray.get(g.remote()) + + def testPicklingActorHandle(self): + ray.worker.init(num_workers=1) + + @ray.remote + class Foo(object): + def method(self): + pass + + f = Foo.remote() + new_f = ray.worker.pickle.loads(ray.worker.pickle.dumps(f)) + # Verify that we can call a method on the unpickled handle. TODO(rkn): + # we should also test this from a different driver. 
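# --- Editor's note (illustrative sketch mirroring the updated test above;
# assumes a local Ray installation and a fresh Python process) ---
# The test now exercises storing an actor handle with ray.put and retrieving
# it with ray.get; both the original and the retrieved handle drive the same
# actor instance.
import ray

ray.init(num_cpus=1)


@ray.remote
class Counter(object):
    def __init__(self):
        self.x = 0

    def inc(self):
        self.x += 1
        return self.x


counter = Counter.remote()
handle_id = ray.put(counter)        # store the handle itself
same_counter = ray.get(handle_id)   # retrieve a usable handle
assert ray.get(same_counter.inc.remote()) == 1
assert ray.get(counter.inc.remote()) == 2  # same underlying actor state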
+ ray.get(new_f.method.remote()) class ActorPlacementAndResources(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testLifetimeAndTransientResources(self): ray.init(num_cpus=1) @@ -1907,6 +1964,9 @@ def get_location(self): for location in locations2: self.assertNotEqual(location, local_plasma) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testCreatingMoreActorsThanResources(self): ray.init( num_workers=0, @@ -1942,7 +2002,7 @@ def method(self): results = ray.get([result1, result2, result3]) self.assertEqual(results[0], results[2]) - self.assertEqual(set(results), set([0, 1])) + self.assertEqual(set(results), {0, 1}) # Make sure that when one actor goes out of scope a new actor is # created because some resources have been freed up. diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py index c3b84ca968f1..c56bd3ce55aa 100644 --- a/test/autoscaler_test.py +++ b/test/autoscaler_test.py @@ -365,14 +365,6 @@ def testMaxFailures(self): autoscaler.update() self.assertRaises(Exception, autoscaler.update) - def testAbortOnCreationFailures(self): - config_path = self.write_config(SMALL_CLUSTER) - self.provider = MockProvider() - self.provider.fail_creates = True - autoscaler = StandardAutoscaler( - config_path, LoadMetrics(), max_failures=0, update_interval_s=0) - self.assertRaises(AssertionError, autoscaler.update) - def testLaunchNewNodeOnOutOfBandTerminate(self): config_path = self.write_config(SMALL_CLUSTER) self.provider = MockProvider() diff --git a/test/failure_test.py b/test/failure_test.py index 560bc020506d..b3260a1744e2 100644 --- a/test/failure_test.py +++ b/test/failure_test.py @@ -257,6 +257,9 @@ class WorkerDeath(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testWorkerRaisingException(self): ray.init(num_workers=1, driver_mode=ray.SILENT_MODE) @@ -272,6 +275,9 @@ def f(): wait_for_errors(b"worker_died", 1) self.assertEqual(len(ray.error_info()), 2) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testWorkerDying(self): ray.init(num_workers=0, driver_mode=ray.SILENT_MODE) @@ -288,6 +294,9 @@ def f(): self.assertIn("died or was killed while executing the task", ray.error_info()[0][b"message"].decode("ascii")) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorWorkerDying(self): ray.init(num_workers=0, driver_mode=ray.SILENT_MODE) @@ -306,6 +315,9 @@ def consume(x): self.assertRaises(Exception, lambda: ray.get(consume.remote(obj))) wait_for_errors(b"worker_died", 1) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorWorkerDyingFutureTasks(self): ray.init(num_workers=0, driver_mode=ray.SILENT_MODE) @@ -328,6 +340,9 @@ def sleep(self): wait_for_errors(b"worker_died", 1) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testActorWorkerDyingNothingInProgress(self): ray.init(num_workers=0, driver_mode=ray.SILENT_MODE) diff --git a/test/multi_node_test.py b/test/multi_node_test.py index 97c39d89544e..116e78fd4fa3 100644 --- a/test/multi_node_test.py +++ b/test/multi_node_test.py @@ -2,12 +2,13 @@ from __future__ import division from 
__future__ import print_function -import unittest +import os import ray import subprocess import sys import tempfile import time +import unittest from ray.test.test_utils import run_and_get_output @@ -153,6 +154,9 @@ def g(x): # Make sure the other driver succeeded. self.assertIn("success", out) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testDriverExitingQuickly(self): # This test will create some drivers that submit some tasks and then # exit without waiting for the tasks to complete. diff --git a/test/runtest.py b/test/runtest.py index a44543a21294..7ad98702bad0 100644 --- a/test/runtest.py +++ b/test/runtest.py @@ -255,7 +255,7 @@ def temp(): # Test sets. self.assertEqual(ray.get(f.remote(set())), set()) - s = set([1, (1, 2, "hi")]) + s = {1, (1, 2, "hi")} self.assertEqual(ray.get(f.remote(s)), s) # Test types. @@ -689,6 +689,9 @@ def m(x): self.assertEqual(ray.get(k2.remote(1)), 2) self.assertEqual(ray.get(m.remote(1)), 2) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testSubmitAPI(self): self.init_ray(num_gpus=1, resources={"Custom": 1}, num_workers=1) @@ -701,13 +704,39 @@ def g(): return ray.get_gpu_ids() assert f._submit([0], num_return_vals=0) is None - assert ray.get(f._submit(args=[1], num_return_vals=1)) == [0] - assert ray.get(f._submit(args=[2], num_return_vals=2)) == [0, 1] - assert ray.get(f._submit(args=[3], num_return_vals=3)) == [0, 1, 2] + id1 = f._submit(args=[1], num_return_vals=1) + assert ray.get(id1) == [0] + id1, id2 = f._submit(args=[2], num_return_vals=2) + assert ray.get([id1, id2]) == [0, 1] + id1, id2, id3 = f._submit(args=[3], num_return_vals=3) + assert ray.get([id1, id2, id3]) == [0, 1, 2] assert ray.get( g._submit( args=[], num_cpus=1, num_gpus=1, resources={"Custom": 1})) == [0] + infeasible_id = g._submit(args=[], resources={"NonexistentCustom": 1}) + ready_ids, remaining_ids = ray.wait([infeasible_id], timeout=50) + assert len(ready_ids) == 0 + assert len(remaining_ids) == 1 + + @ray.remote + class Actor(object): + def __init__(self, x, y=0): + self.x = x + self.y = y + + def method(self, a, b=0): + return self.x, self.y, a, b + + def gpu_ids(self): + return ray.get_gpu_ids() + + a = Actor._submit( + args=[0], kwargs={"y": 1}, num_gpus=1, resources={"Custom": 1}) + + id1, id2, id3, id4 = a.method._submit( + args=["test"], kwargs={"b": 2}, num_return_vals=4) + assert ray.get([id1, id2, id3, id4]) == [0, 1, "test", 2] def testGetMultiple(self): self.init_ray() @@ -720,6 +749,9 @@ def testGetMultiple(self): results = ray.get([object_ids[i] for i in indices]) self.assertEqual(results, indices) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testWait(self): self.init_ray(num_cpus=1) @@ -785,10 +817,13 @@ def f(delay): with self.assertRaises(TypeError): ray.wait([1]) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testMultipleWaitsAndGets(self): # It is important to use three workers here, so that the three tasks # launched in this experiment can run at the same time. 
- self.init_ray() + self.init_ray(num_cpus=3) @ray.remote def f(delay): @@ -887,6 +922,9 @@ def get_path2(): self.assertTrue("fake_directory" not in ray.get(get_path2.remote())) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testLoggingAPI(self): self.init_ray(driver_mode=ray.SILENT_MODE) @@ -1033,6 +1071,9 @@ class PythonModeTest(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testPythonMode(self): reload(test_functions) ray.init(driver_mode=ray.PYTHON_MODE) @@ -1229,6 +1270,9 @@ def g(n): self.assertLess(duration, 1 + time_buffer) self.assertGreater(duration, 1) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testGPUIDs(self): num_gpus = 10 ray.init(num_cpus=10, num_gpus=num_gpus) @@ -1317,8 +1361,8 @@ def f(): self.assertEqual(list_of_ids, 10 * [[]]) list_of_ids = ray.get([f1.remote() for _ in range(10)]) - set_of_ids = set([tuple(gpu_ids) for gpu_ids in list_of_ids]) - self.assertEqual(set_of_ids, set([(i, ) for i in range(10)])) + set_of_ids = {tuple(gpu_ids) for gpu_ids in list_of_ids} + self.assertEqual(set_of_ids, {(i, ) for i in range(10)}) list_of_ids = ray.get([f2.remote(), f4.remote(), f4.remote()]) all_ids = [gpu_id for gpu_ids in list_of_ids for gpu_id in gpu_ids] @@ -1659,6 +1703,9 @@ def tearDown(self): else: del os.environ["CUDA_VISIBLE_DEVICES"] + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testSpecificGPUs(self): allowed_gpu_ids = [4, 5, 6] os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( @@ -1699,8 +1746,11 @@ def f(): ray.get([f.remote() for _ in range(100)]) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testBlockingTasks(self): - ray.init(num_workers=1) + ray.init(num_cpus=1) @ray.remote def f(i, j): @@ -1710,24 +1760,27 @@ def f(i, j): def g(i): # Each instance of g submits and blocks on the result of another # remote task. - object_ids = [f.remote(i, j) for j in range(10)] + object_ids = [f.remote(i, j) for j in range(2)] return ray.get(object_ids) - ray.get([g.remote(i) for i in range(100)]) + ray.get([g.remote(i) for i in range(4)]) @ray.remote def _sleep(i): - time.sleep(1) + time.sleep(0.01) return (i) @ray.remote def sleep(): # Each instance of sleep submits and blocks on the result of - # another remote task, which takes one second to execute. + # another remote task, which takes some time to execute. 
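# --- Editor's note (illustrative sketch, not part of the patch; assumes a
# local Ray installation and a fresh Python process) ---
# The hunk above only shrinks the workload of the blocking-task test. The
# pattern under test is a remote task that itself submits tasks and blocks on
# them with ray.get; a minimal version:
import ray

ray.init(num_cpus=2)


@ray.remote
def leaf(i):
    return i


@ray.remote
def parent(n):
    # Blocks inside a worker until all leaf tasks have finished.
    return ray.get([leaf.remote(i) for i in range(n)])


assert ray.get(parent.remote(3)) == [0, 1, 2]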
ray.get([_sleep.remote(i) for i in range(10)]) ray.get(sleep.remote()) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testMaxCallTasks(self): ray.init(num_cpus=1) @@ -1838,6 +1891,9 @@ class GlobalStateAPI(unittest.TestCase): def tearDown(self): ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testGlobalStateAPI(self): with self.assertRaises(Exception): ray.global_state.object_table() @@ -1995,6 +2051,9 @@ def f(): self.assertEqual(found_message, True) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testTaskProfileAPI(self): ray.init(redirect_output=True) @@ -2036,7 +2095,7 @@ def testWorkers(self): @ray.remote def f(): - return id(ray.worker.global_worker) + return id(ray.worker.global_worker), os.getpid() # Wait until all of the workers have started. worker_ids = set() @@ -2053,6 +2112,9 @@ def f(): self.assertIn("stderr_file", info) self.assertIn("stdout_file", info) + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testDumpTraceFile(self): ray.init(redirect_output=True) @@ -2091,6 +2153,9 @@ def method(self): # the visualization actually renders (e.g., the context of the dumped # trace could be malformed). + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testFlushAPI(self): ray.init(num_cpus=1) diff --git a/test/stress_tests.py b/test/stress_tests.py index 62bf3604e72a..12e0e1aa6693 100644 --- a/test/stress_tests.py +++ b/test/stress_tests.py @@ -71,6 +71,9 @@ def g(*xs): self.assertTrue(ray.services.all_processes_alive()) ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testSubmittingManyTasks(self): ray.init() @@ -118,6 +121,9 @@ def f(): self.assertTrue(ray.services.all_processes_alive()) ray.worker.cleanup() + @unittest.skipIf( + os.environ.get("RAY_USE_XRAY") == "1", + "This test does not work with xray yet.") def testWait(self): for num_local_schedulers in [1, 4]: for num_workers_per_scheduler in [4]: @@ -210,8 +216,10 @@ def tearDown(self): state._initialize_global_state(self.redis_ip_address, self.redis_port) if os.environ.get('RAY_USE_NEW_GCS', False): tasks = state.task_table() - local_scheduler_ids = set( - task["LocalSchedulerID"] for task in tasks.values()) + local_scheduler_ids = { + task["LocalSchedulerID"] + for task in tasks.values() + } # Make sure that all nodes in the cluster were used by checking that # the set of local scheduler IDs that had a task scheduled or submitted
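# --- Editor's note (illustrative, not part of the patch) ---
# The decorator added throughout these test files gates individual tests on an
# environment variable. A minimal standalone version of the same pattern:
import os
import unittest


class ExampleTest(unittest.TestCase):
    @unittest.skipIf(
        os.environ.get("RAY_USE_XRAY") == "1",
        "This test does not work with xray yet.")
    def testSomething(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()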