diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000000..508150ef2f5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +# Adopted from *Black*'s config + +[flake8] +ignore = E203, E266, E501, W503 +max-line-length = 88 +select = B,C,E,F,W,T4,B9 diff --git a/.travis.yml b/.travis.yml index b966cc6a3cb..db1bf229863 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,8 +25,8 @@ matrix: env: LINT=1 script: - export PATH="$HOME/miniconda/bin:$PATH" - - yapf -dr modin/pandas - - flake8 --max-line-length=88 . + - black --check modin/ + - flake8 . install: - ./.travis/install-dependencies.sh diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh index d91af1d9887..73059a2b007 100755 --- a/.travis/install-dependencies.sh +++ b/.travis/install-dependencies.sh @@ -46,6 +46,7 @@ elif [[ "$LINT" == "1" ]]; then bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" conda install -y python==3.6.5 + pip install black flake8 flake8-comprehensions else echo "Unrecognized environment." @@ -53,5 +54,5 @@ else fi pip install -r requirements.txt -pip install -q pytest flake8 flake8-comprehensions yapf feather-format lxml openpyxl xlrd numpy +pip install -q pytest feather-format lxml openpyxl xlrd numpy diff --git a/README.rst b/README.rst index 0525ef1e051..9d4b176ce61 100644 --- a/README.rst +++ b/README.rst @@ -7,6 +7,8 @@ Modin .. image:: https://readthedocs.org/projects/modin/badge/?version=latest :target: https://modin.readthedocs.io/en/latest/?badge=latest +.. image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/ambv/black | *Modin is a library for unifying the way you interact with your data* diff --git a/modin/__init__.py b/modin/__init__.py index 40a6bd47928..a0f845e1209 100644 --- a/modin/__init__.py +++ b/modin/__init__.py @@ -6,19 +6,18 @@ def git_version(): def _execute_cmd_in_temp_env(cmd): # construct environment env = {} - for k in ['SYSTEMROOT', 'PATH', 'HOME']: + for k in ["SYSTEMROOT", "PATH", "HOME"]: v = os.environ.get(k) if v is not None: env[k] = v # LANGUAGE is used on win32 - env['LANGUAGE'] = 'C' - env['LANG'] = 'C' - env['LC_ALL'] = 'C' - return subprocess.Popen( - cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + env["LANGUAGE"] = "C" + env["LANG"] = "C" + env["LC_ALL"] = "C" + return subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0] try: - git_revision = _execute_cmd_in_temp_env(['git', 'rev-parse', 'HEAD']) + git_revision = _execute_cmd_in_temp_env(["git", "rev-parse", "HEAD"]) return git_revision.strip().decode() except OSError: return "Unknown" diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py index 8aab730f22f..11e67012658 100644 --- a/modin/data_management/data_manager.py +++ b/modin/data_management/data_manager.py @@ -7,9 +7,12 @@ from pandas.compat import string_types from pandas.core.dtypes.cast import find_common_type -from pandas.core.dtypes.common import (_get_dtype_from_object, is_list_like, - is_numeric_dtype, - is_datetime_or_timedelta_dtype) +from pandas.core.dtypes.common import ( + _get_dtype_from_object, + is_list_like, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, +) from pandas.core.index import _ensure_index from .partitioning.partition_collections import BlockPartitions @@ -19,11 +22,13 @@ class PandasDataManager(object): """This class implements the logic necessary for operating on partitions with a Pandas backend. This logic is specific to Pandas.""" - def __init__(self, - block_partitions_object: BlockPartitions, - index: pandas.Index, - columns: pandas.Index, - dtypes=None): + def __init__( + self, + block_partitions_object: BlockPartitions, + index: pandas.Index, + columns: pandas.Index, + dtypes=None, + ): assert isinstance(block_partitions_object, BlockPartitions) self.data = block_partitions_object self.index = index @@ -31,11 +36,7 @@ def __init__(self, if dtypes is not None: self._dtype_cache = dtypes - def __constructor__(self, - block_paritions_object, - index, - columns, - dtypes=None): + def __constructor__(self, block_paritions_object, index, columns, dtypes=None): """By default, constructor method will invoke an init""" return type(self)(block_paritions_object, index, columns, dtypes) @@ -48,11 +49,9 @@ def _get_dtype(self): map_func = self._prepare_method(lambda df: df.dtypes) def dtype_builder(df): - return df.apply( - lambda row: find_common_type(row.values), axis=0) + return df.apply(lambda row: find_common_type(row.values), axis=0) - self._dtype_cache = self.data.full_reduce(map_func, dtype_builder, - 0) + self._dtype_cache = self.data.full_reduce(map_func, dtype_builder, 0) self._dtype_cache.index = self.columns return self._dtype_cache @@ -77,8 +76,9 @@ def _validate_set_axis(self, new_labels, old_labels): new_len = len(new_labels) if old_len != new_len: raise ValueError( - 'Length mismatch: Expected axis has %d elements, ' - 'new values have %d elements' % (old_len, new_len)) + "Length mismatch: Expected axis has %d elements, " + "new values have %d elements" % (old_len, new_len) + ) return new_labels def _set_index(self, new_index): @@ -92,8 +92,7 @@ def _set_columns(self, new_columns): if self._columns_cache is None: self._columns_cache = _ensure_index(new_columns) else: - new_columns = self._validate_set_axis(new_columns, - self._columns_cache) + new_columns = self._validate_set_axis(new_columns, self._columns_cache) self._columns_cache = new_columns columns = property(_get_columns, _set_columns) @@ -133,7 +132,8 @@ def pandas_index_extraction(df, axis): new_indices = data_object.get_indices( axis=axis, index_func=lambda df: pandas_index_extraction(df, axis), - old_blocks=old_blocks) + old_blocks=old_blocks, + ) return index_obj[new_indices] if compute_diff else new_indices @@ -154,6 +154,7 @@ def _prepare_method(self, pandas_func, **kwargs): def helper(df, internal_indices=[]): return pandas_func(df.T, **kwargs) + else: def helper(df, internal_indices=[]): @@ -190,7 +191,8 @@ def numeric_function_clean_dataframe(self, axis): result = pandas.Series(dtype=np.float64) nonnumeric = [ - col for col, dtype in zip(self.columns, self.dtypes) + col + for col, dtype in zip(self.columns, self.dtypes) if not is_numeric_dtype(dtype) ] if len(nonnumeric) == len(self.columns): @@ -206,13 +208,15 @@ def numeric_function_clean_dataframe(self, axis): # Metadata modification methods def add_prefix(self, prefix): new_column_names = self.columns.map(lambda x: str(prefix) + str(x)) - return self.__constructor__(self.data, self.index, new_column_names, - self._dtype_cache) + return self.__constructor__( + self.data, self.index, new_column_names, self._dtype_cache + ) def add_suffix(self, suffix): new_column_names = self.columns.map(lambda x: str(x) + str(suffix)) - return self.__constructor__(self.data, self.index, new_column_names, - self._dtype_cache) + return self.__constructor__( + self.data, self.index, new_column_names, self._dtype_cache + ) # END Metadata modification methods @@ -221,8 +225,9 @@ def add_suffix(self, suffix): # copies if we end up modifying something here. We copy all of the metadata # to prevent that. def copy(self): - return self.__constructor__(self.data.copy(), self.index.copy(), - self.columns.copy(), self._dtype_cache) + return self.__constructor__( + self.data.copy(), self.index.copy(), self.columns.copy(), self._dtype_cache + ) # Append/Concat/Join (Not Merge) # The append/concat/join operations should ideally never trigger remote @@ -287,8 +292,9 @@ def concat(self, axis, other, **kwargs): def _append_list_of_managers(self, others, axis, **kwargs): if not isinstance(others, list): others = [others] - assert all(isinstance(other, type(self)) for other in others), \ - "Different Manager objects are being used. This is not allowed" + assert all( + isinstance(other, type(self)) for other in others + ), "Different Manager objects are being used. This is not allowed" sort = kwargs.get("sort", None) join = kwargs.get("join", "outer") @@ -301,13 +307,12 @@ def _append_list_of_managers(self, others, axis, **kwargs): axis, [other.columns if axis == 0 else other.index for other in others], join, - sort=sort) + sort=sort, + ) # Since we are concatenating a list of managers, we will align all of # the indices based on the `joined_axis` computed above. - to_append = [ - other.reindex(axis ^ 1, joined_axis).data for other in others - ] + to_append = [other.reindex(axis ^ 1, joined_axis).data for other in others] new_self = self.reindex(axis ^ 1, joined_axis).data new_data = new_self.concat(axis, to_append) @@ -316,20 +321,23 @@ def _append_list_of_managers(self, others, axis, **kwargs): # If `ignore_index` is true, we create a RangeIndex that is the # length of all of the index objects combined. This is the same # behavior as pandas. - new_index = self.index.append([ - other.index for other in others - ]) if not ignore_index else pandas.RangeIndex( - len(self.index) + sum(len(other.index) for other in others)) + new_index = ( + self.index.append([other.index for other in others]) + if not ignore_index + else pandas.RangeIndex( + len(self.index) + sum(len(other.index) for other in others) + ) + ) return self.__constructor__(new_data, new_index, joined_axis) else: # The columns will be appended to form the final columns. - new_columns = self.columns.append( - [other.columns for other in others]) + new_columns = self.columns.append([other.columns for other in others]) return self.__constructor__(new_data, joined_axis, new_columns) def _join_data_manager(self, other, **kwargs): - assert isinstance(other, type(self)), \ - "This method is for data manager objects only" + assert isinstance( + other, type(self) + ), "This method is for data manager objects only" # Uses join's default value (though should not revert to default) how = kwargs.get("how", "left") @@ -349,15 +357,18 @@ def _join_data_manager(self, other, **kwargs): self_proxy = pandas.DataFrame(columns=self.columns) other_proxy = pandas.DataFrame(columns=other.columns) new_columns = self_proxy.join( - other_proxy, lsuffix=lsuffix, rsuffix=rsuffix).columns + other_proxy, lsuffix=lsuffix, rsuffix=rsuffix + ).columns return self.__constructor__(new_data, joined_index, new_columns) def _join_list_of_managers(self, others, **kwargs): - assert isinstance(others, list), \ - "This method is for lists of DataManager objects only" - assert all(isinstance(other, type(self)) for other in others), \ - "Different Manager objects are being used. This is not allowed" + assert isinstance( + others, list + ), "This method is for lists of DataManager objects only" + assert all( + isinstance(other, type(self)) for other in others + ), "Different Manager objects are being used. This is not allowed" # Uses join's default value (though should not revert to default) how = kwargs.get("how", "left") @@ -366,7 +377,8 @@ def _join_list_of_managers(self, others, **kwargs): rsuffix = kwargs.get("rsuffix", "") joined_index = self._join_index_objects( - 1, [other.index for other in others], how, sort=sort) + 1, [other.index for other in others], how, sort=sort + ) to_join = [other.reindex(0, joined_index).data for other in others] new_self = self.reindex(0, joined_index).data @@ -376,11 +388,10 @@ def _join_list_of_managers(self, others, **kwargs): # This stage is to efficiently get the resulting columns, including the # suffixes. self_proxy = pandas.DataFrame(columns=self.columns) - others_proxy = [ - pandas.DataFrame(columns=other.columns) for other in others - ] + others_proxy = [pandas.DataFrame(columns=other.columns) for other in others] new_columns = self_proxy.join( - others_proxy, lsuffix=lsuffix, rsuffix=rsuffix).columns + others_proxy, lsuffix=lsuffix, rsuffix=rsuffix + ).columns return self.__constructor__(new_data, joined_index, new_columns) @@ -401,13 +412,14 @@ def inter_manager_operations(self, other, how_to_join, func): Returns: New DataManager with new data and index. """ - assert isinstance(other, type(self)), \ - "Must have the same DataManager subclass to perform this operation" + assert isinstance( + other, type(self) + ), "Must have the same DataManager subclass to perform this operation" - joined_index = self._join_index_objects( - 1, other.index, how_to_join, sort=False) + joined_index = self._join_index_objects(1, other.index, how_to_join, sort=False) new_columns = self._join_index_objects( - 0, other.columns, how_to_join, sort=False) + 0, other.columns, how_to_join, sort=False + ) reindexed_other = other.reindex(0, joined_index).data reindexed_self = self.reindex(0, joined_index).data @@ -429,7 +441,8 @@ def inter_data_op_builder(left, right, self_cols, other_cols, func): new_data = reindexed_self.inter_data_operation( 1, lambda l, r: inter_data_op_builder(l, r, self_cols, other_cols, func), - reindexed_other) + reindexed_other, + ) return self.__constructor__(new_data, joined_index, new_columns) @@ -447,10 +460,12 @@ def _inter_df_op_handler(self, func, other, **kwargs): if isinstance(other, type(self)): return self.inter_manager_operations( - other, "outer", lambda x, y: func(x, y, **kwargs)) + other, "outer", lambda x, y: func(x, y, **kwargs) + ) else: - return self.scalar_operations(axis, other, - lambda df: func(df, other, **kwargs)) + return self.scalar_operations( + axis, other, lambda df: func(df, other, **kwargs) + ) def add(self, other, **kwargs): """Adds this manager with other object (manager or scalar). @@ -667,8 +682,9 @@ def update(self, other, **kwargs): Returns: New DataManager with updated data and index. """ - assert isinstance(other, type(self)), \ - "Must have the same DataManager subclass to perform this operation" + assert isinstance( + other, type(self) + ), "Must have the same DataManager subclass to perform this operation" def update_builder(df, other, **kwargs): df.update(other, **kwargs) @@ -686,8 +702,9 @@ def where(self, cond, other, **kwargs): New DataManager with updated data and index. """ - assert isinstance(cond, type(self)), \ - "Must have the same DataManager subclass to perform this operation" + assert isinstance( + cond, type(self) + ), "Must have the same DataManager subclass to perform this operation" if isinstance(other, type(self)): # Note: Currently we are doing this with two maps across the entire @@ -708,11 +725,13 @@ def where_builder_second_pass(df, new_other, **kwargs): reindexed_self = self.reindex(0, self.index).data first_pass = reindexed_cond.inter_data_operation( - 1, lambda l, r: where_builder_first_pass(l, r, **kwargs), - reindexed_other) + 1, + lambda l, r: where_builder_first_pass(l, r, **kwargs), + reindexed_other, + ) final_pass = reindexed_self.inter_data_operation( - 1, lambda l, r: where_builder_second_pass(l, r, **kwargs), - first_pass) + 1, lambda l, r: where_builder_second_pass(l, r, **kwargs), first_pass + ) return self.__constructor__(final_pass, self.index, self.columns) else: axis = kwargs.get("axis", 0) @@ -725,13 +744,17 @@ def where_builder_series(df, cond, other, **kwargs): return df.where(cond, other, **kwargs) reindexed_self = self.reindex( - axis, self.index if not axis else self.columns).data + axis, self.index if not axis else self.columns + ).data reindexed_cond = cond.reindex( - axis, self.index if not axis else self.columns).data + axis, self.index if not axis else self.columns + ).data new_data = reindexed_self.inter_data_operation( - axis, lambda l, r: where_builder_series(l, r, other, **kwargs), - reindexed_cond) + axis, + lambda l, r: where_builder_series(l, r, other, **kwargs), + reindexed_cond, + ) return self.__constructor__(new_data, self.index, self.columns) # END Inter-Data operations @@ -791,7 +814,8 @@ def reindex_builer(df, axis, old_labels, new_labels, **kwargs): new_columns = labels if axis else self.columns func = self._prepare_method( - lambda df: reindex_builer(df, axis, old_labels, labels, **kwargs)) + lambda df: reindex_builer(df, axis, old_labels, labels, **kwargs) + ) # The reindex can just be mapped over the axis we are modifying. This # is for simplicity in implementation. We specify num_splits here @@ -819,8 +843,9 @@ def reset_index(self, **kwargs): else: # The copies here are to ensure that we do not give references to # this object for the purposes of updates. - return self.__constructor__(self.data.copy(), new_index, - self.columns.copy(), self._dtype_cache) + return self.__constructor__( + self.data.copy(), new_index, self.columns.copy(), self._dtype_cache + ) # END Reindex/reset_index @@ -858,8 +883,7 @@ def transpose(self, *args, **kwargs): # Currently, this means a Pandas Series will be returned, but in the future # we will implement a Distributed Series, and this will be returned # instead. - def full_reduce(self, axis, map_func, reduce_func=None, - numeric_only=False): + def full_reduce(self, axis, map_func, reduce_func=None, numeric_only=False): """Apply function that will reduce the data to a Pandas Series. Args: @@ -883,8 +907,9 @@ def full_reduce(self, axis, map_func, reduce_func=None, # The XOR here will ensure that we reduce over the correct axis that # exists on the internal partitions. We flip the axis - result = data_manager.data.full_reduce(map_func, reduce_func, - axis ^ self._is_transposed) + result = data_manager.data.full_reduce( + map_func, reduce_func, axis ^ self._is_transposed + ) if not axis: result.index = data_manager.columns else: @@ -967,36 +992,39 @@ def sum(self, **kwargs): # These operations are operations that apply a function to every partition. def map_partitions(self, func, new_dtypes=None): return self.__constructor__( - self.data.map_across_blocks(func), self.index, self.columns, - new_dtypes) + self.data.map_across_blocks(func), self.index, self.columns, new_dtypes + ) def abs(self): func = self._prepare_method(pandas.DataFrame.abs) - new_dtypes = pandas.Series([np.dtype('float64') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("float64") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def applymap(self, func): - remote_func = self._prepare_method( - pandas.DataFrame.applymap, func=func) + remote_func = self._prepare_method(pandas.DataFrame.applymap, func=func) return self.map_partitions(remote_func) def isin(self, **kwargs): func = self._prepare_method(pandas.DataFrame.isin, **kwargs) - new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def isna(self): func = self._prepare_method(pandas.DataFrame.isna) - new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def isnull(self): func = self._prepare_method(pandas.DataFrame.isnull) - new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def negative(self, **kwargs): @@ -1005,14 +1033,16 @@ def negative(self, **kwargs): def notna(self): func = self._prepare_method(pandas.DataFrame.notna) - new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def notnull(self): func = self._prepare_method(pandas.DataFrame.notnull) - new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns], - index=self.columns) + new_dtypes = pandas.Series( + [np.dtype("bool") for _ in self.columns], index=self.columns + ) return self.map_partitions(func, new_dtypes=new_dtypes) def round(self, **kwargs): @@ -1053,9 +1083,9 @@ def astype(self, col_dtypes, **kwargs): # Update the new dtype series to the proper pandas dtype new_dtype = np.dtype(dtype) if dtype != np.int32 and new_dtype == np.int32: - new_dtype = np.dtype('int64') + new_dtype = np.dtype("int64") elif dtype != np.float32 and new_dtype == np.float32: - new_dtype = np.dtype('float64') + new_dtype = np.dtype("float64") new_dtypes[column] = new_dtype # Update partitions for each dtype that is updated @@ -1069,10 +1099,10 @@ def astype(df, internal_indices=[]): return df.astype(block_dtypes) new_data = new_data.apply_func_to_select_indices( - 0, astype, dtype_indices[dtype], keep_remaining=True) + 0, astype, dtype_indices[dtype], keep_remaining=True + ) - return self.__constructor__(new_data, self.index, self.columns, - new_dtypes) + return self.__constructor__(new_data, self.index, self.columns, new_dtypes) # END Map partitions across select indices @@ -1092,7 +1122,8 @@ def full_axis_reduce(self, func, axis): Pandas series containing the reduced data. """ result = self.data.map_across_full_axis(axis, func).to_pandas( - self._is_transposed) + self._is_transposed + ) if not axis: result.index = self.columns @@ -1223,7 +1254,7 @@ def memory_usage(self, **kwargs): def memory_usage_builder(df, **kwargs): return df.memory_usage(index=False, deep=deep) - deep = kwargs.get('deep', False) + deep = kwargs.get("deep", False) func = self._prepare_method(memory_usage_builder, **kwargs) return self.full_axis_reduce(func, 0) @@ -1260,11 +1291,9 @@ def to_datetime_builder(df, **kwargs): # Currently, this means a Pandas Series will be returned, but in the future # we will implement a Distributed Series, and this will be returned # instead. - def full_axis_reduce_along_select_indices(self, - func, - axis, - index, - pandas_result=True): + def full_axis_reduce_along_select_indices( + self, func, axis, index, pandas_result=True + ): """Reduce Manger along select indices using function that needs full axis. Args: @@ -1279,11 +1308,10 @@ def full_axis_reduce_along_select_indices(self, """ # Convert indices to numeric indices old_index = self.index if axis else self.columns - numeric_indices = [ - i for i, name in enumerate(old_index) if name in index - ] + numeric_indices = [i for i, name in enumerate(old_index) if name in index] result = self.data.apply_func_to_select_indices_along_full_axis( - axis, func, numeric_indices) + axis, func, numeric_indices + ) if pandas_result: result = result.to_pandas(self._is_transposed) @@ -1311,19 +1339,19 @@ def describe_builder(df, **kwargs): # Apply describe and update indices, columns, and dtypes func = self._prepare_method(describe_builder, **kwargs) - new_data = self.full_axis_reduce_along_select_indices( - func, 0, new_index, False) + new_data = self.full_axis_reduce_along_select_indices(func, 0, new_index, False) new_index = self.compute_index(0, new_data, False) new_columns = self.compute_index(1, new_data, True) if numeric: - new_dtypes = pandas.Series([np.float64 for _ in new_columns], - index=new_columns) + new_dtypes = pandas.Series( + [np.float64 for _ in new_columns], index=new_columns + ) else: - new_dtypes = pandas.Series([np.object for _ in new_columns], - index=new_columns) + new_dtypes = pandas.Series( + [np.object for _ in new_columns], index=new_columns + ) - return self.__constructor__(new_data, new_index, new_columns, - new_dtypes) + return self.__constructor__(new_data, new_index, new_columns, new_dtypes) def median(self, **kwargs): """Returns median of each column or row. @@ -1440,8 +1468,9 @@ def _cumulative_builder(self, func, **kwargs): axis = kwargs.get("axis", 0) func = self._prepare_method(func, **kwargs) new_data = self.map_across_full_axis(axis, func) - return self.__constructor__(new_data, self.index, self.columns, - self._dtype_cache) + return self.__constructor__( + new_data, self.index, self.columns, self._dtype_cache + ) def cumsum(self, **kwargs): return self._cumulative_builder(pandas.DataFrame.cumsum, **kwargs) @@ -1493,13 +1522,11 @@ def dropna(self, **kwargs): # Count the number of NA values and specify which are higher than # thresh. drop_values = { - ax ^ 1: compute_na.isna().sum(axis=ax ^ 1) > thresh - for ax in axis + ax ^ 1: compute_na.isna().sum(axis=ax ^ 1) > thresh for ax in axis } else: drop_values = { - ax ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1) - for ax in axis + ax ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1) for ax in axis } if 0 not in drop_values: @@ -1508,16 +1535,25 @@ def dropna(self, **kwargs): if 1 not in drop_values: drop_values[1] = None - rm_from_index = [obj for obj in compute_na.index[drop_values[1]] - ] if drop_values[1] is not None else None - rm_from_columns = [ - obj for obj in compute_na.columns[drop_values[0]] - ] if drop_values[0] is not None else None + rm_from_index = ( + [obj for obj in compute_na.index[drop_values[1]]] + if drop_values[1] is not None + else None + ) + rm_from_columns = ( + [obj for obj in compute_na.columns[drop_values[0]]] + if drop_values[0] is not None + else None + ) else: - rm_from_index = compute_na.index[ - drop_values[1]] if drop_values[1] is not None else None - rm_from_columns = compute_na.columns[ - drop_values[0]] if drop_values[0] is not None else None + rm_from_index = ( + compute_na.index[drop_values[1]] if drop_values[1] is not None else None + ) + rm_from_columns = ( + compute_na.columns[drop_values[0]] + if drop_values[0] is not None + else None + ) return self.drop(index=rm_from_index, columns=rm_from_columns) @@ -1545,8 +1581,7 @@ def eval(self, expr, **kwargs): # in the first column if expect_series: if inplace: - raise ValueError( - "Cannot operate inplace if there is no assignment") + raise ValueError("Cannot operate inplace if there is no assignment") else: expr = "{0} = {1}".format(columns[0], expr) @@ -1578,13 +1613,15 @@ def mode(self, **kwargs): func = self._prepare_method(pandas.DataFrame.mode, **kwargs) new_data = self.map_across_full_axis(axis, func) - counts = self.__constructor__(new_data, self.index, - self.columns).notnull().sum(axis=axis) + counts = ( + self.__constructor__(new_data, self.index, self.columns) + .notnull() + .sum(axis=axis) + ) max_count = counts.max() new_index = pandas.RangeIndex(max_count) if not axis else self.index - new_columns = self.columns if not axis else pandas.RangeIndex( - max_count) + new_columns = self.columns if not axis else pandas.RangeIndex(max_count) # We have to reindex the DataFrame so that all of the partitions are # matching in shape. The next steps ensure this happens. @@ -1592,10 +1629,13 @@ def mode(self, **kwargs): # We build these intermediate objects to avoid depending directly on # the underlying implementation. final_data = self.__constructor__( - new_data, new_index, new_columns).map_across_full_axis( - axis, lambda df: df.reindex(axis=axis, labels=final_labels)) - return self.__constructor__(final_data, new_index, new_columns, - self._dtype_cache) + new_data, new_index, new_columns + ).map_across_full_axis( + axis, lambda df: df.reindex(axis=axis, labels=final_labels) + ) + return self.__constructor__( + final_data, new_index, new_columns, self._dtype_cache + ) def fillna(self, **kwargs): """Replaces NaN values with the method provided. @@ -1614,15 +1654,15 @@ def fillna(self, **kwargs): else: index = self.index value = { - idx: value[key] - for key in value for idx in index.get_indexer_for([key]) + idx: value[key] for key in value for idx in index.get_indexer_for([key]) } def fillna_dict_builder(df, func_dict={}): return df.fillna(value=func_dict, **kwargs) new_data = self.data.apply_func_to_select_indices( - axis, fillna_dict_builder, value, keep_remaining=True) + axis, fillna_dict_builder, value, keep_remaining=True + ) return self.__constructor__(new_data, self.index, self.columns) else: func = self._prepare_method(pandas.DataFrame.fillna, **kwargs) @@ -1655,8 +1695,7 @@ def query_builder(df, **kwargs): # Query removes rows, so we need to update the index new_index = self.compute_index(0, new_data, True) - return self.__constructor__(new_data, new_index, self.columns, - self.dtypes) + return self.__constructor__(new_data, new_index, self.columns, self.dtypes) def rank(self, **kwargs): """Computes numerical rank along axis. Equal values are set to the average. @@ -1676,10 +1715,8 @@ def rank(self, **kwargs): new_columns = self.compute_index(1, new_data, True) else: new_columns = self.columns - new_dtypes = pandas.Series([np.float64 for _ in new_columns], - index=new_columns) - return self.__constructor__(new_data, self.index, new_columns, - new_dtypes) + new_dtypes = pandas.Series([np.float64 for _ in new_columns], index=new_columns) + return self.__constructor__(new_data, self.index, new_columns, new_dtypes) # END Map across rows/columns @@ -1687,11 +1724,9 @@ def rank(self, **kwargs): # These operations require some global knowledge of the full column/row # that is being operated on. This means that we have to put all of that # data in the same place. - def map_across_full_axis_select_indices(self, - axis, - func, - indices, - keep_remaining=False): + def map_across_full_axis_select_indices( + self, axis, func, indices, keep_remaining=False + ): """Maps function to select indices along full axis. Args: @@ -1704,7 +1739,8 @@ def map_across_full_axis_select_indices(self, BlockPartitions containing the result of mapping func over axis on indices. """ return self.data.apply_func_to_select_indices_along_full_axis( - axis, func, indices, keep_remaining) + axis, func, indices, keep_remaining + ) def quantile_for_list_of_values(self, **kwargs): """Returns Manager containing quantiles along an axis for numeric columns. @@ -1721,22 +1757,22 @@ def quantile_for_list_of_values(self, **kwargs): new_columns = self.numeric_columns() else: new_columns = [ - col for col, dtype in zip(self.columns, self.dtypes) - if (is_numeric_dtype(dtype) - or is_datetime_or_timedelta_dtype(dtype)) + col + for col, dtype in zip(self.columns, self.dtypes) + if (is_numeric_dtype(dtype) or is_datetime_or_timedelta_dtype(dtype)) ] if axis: # If along rows, then drop the nonnumeric columns, record the index, and # take transpose. We have to do this because if we don't, the result is all # in one column for some reason. nonnumeric = [ - col for col, dtype in zip(self.columns, self.dtypes) + col + for col, dtype in zip(self.columns, self.dtypes) if not is_numeric_dtype(dtype) ] data_manager = self.drop(columns=nonnumeric) new_columns = data_manager.index - numeric_indices = list( - data_manager.index.get_indexer_for(new_columns)) + numeric_indices = list(data_manager.index.get_indexer_for(new_columns)) data_manager = data_manager.transpose() kwargs.pop("axis") else: @@ -1749,7 +1785,8 @@ def quantile_builder(df, internal_indices=[], **kwargs): func = self._prepare_method(quantile_builder, **kwargs) q_index = pandas.Float64Index(q) new_data = data_manager.map_across_full_axis_select_indices( - 0, func, numeric_indices) + 0, func, numeric_indices + ) return self.__constructor__(new_data, q_index, new_columns) # END Map across rows/columns @@ -1773,13 +1810,16 @@ def head(self, n): # on a transposed manager is already set to the correct value, so # we need to only take the head of that instead of re-transposing. result = self.__constructor__( - self.data.transpose().take(1, n).transpose(), self.index[:n], - self.columns, self._dtype_cache) + self.data.transpose().take(1, n).transpose(), + self.index[:n], + self.columns, + self._dtype_cache, + ) result._is_transposed = True else: result = self.__constructor__( - self.data.take(0, n), self.index[:n], self.columns, - self._dtype_cache) + self.data.take(0, n), self.index[:n], self.columns, self._dtype_cache + ) return result def tail(self, n): @@ -1794,13 +1834,16 @@ def tail(self, n): # See head for an explanation of the transposed behavior if self._is_transposed: result = self.__constructor__( - self.data.transpose().take(1, -n).transpose(), self.index[-n:], - self.columns, self._dtype_cache) + self.data.transpose().take(1, -n).transpose(), + self.index[-n:], + self.columns, + self._dtype_cache, + ) result._is_transposed = True else: result = self.__constructor__( - self.data.take(0, -n), self.index[-n:], self.columns, - self._dtype_cache) + self.data.take(0, -n), self.index[-n:], self.columns, self._dtype_cache + ) return result @@ -1816,13 +1859,16 @@ def front(self, n): # See head for an explanation of the transposed behavior if self._is_transposed: result = self.__constructor__( - self.data.transpose().take(0, n).transpose(), self.index, - self.columns[:n], self.dtypes[:n]) + self.data.transpose().take(0, n).transpose(), + self.index, + self.columns[:n], + self.dtypes[:n], + ) result._is_transposed = True else: result = self.__constructor__( - self.data.take(1, n), self.index, self.columns[:n], - self.dtypes[:n]) + self.data.take(1, n), self.index, self.columns[:n], self.dtypes[:n] + ) return result def back(self, n): @@ -1837,13 +1883,16 @@ def back(self, n): # See head for an explanation of the transposed behavior if self._is_transposed: result = self.__constructor__( - self.data.transpose().take(0, -n).transpose(), self.index, - self.columns[-n:], self.dtypes[-n:]) + self.data.transpose().take(0, -n).transpose(), + self.index, + self.columns[-n:], + self.dtypes[-n:], + ) result._is_transposed = True else: result = self.__constructor__( - self.data.take(1, -n), self.index, self.columns[-n:], - self.dtypes[-n:]) + self.data.take(1, -n), self.index, self.columns[-n:], self.dtypes[-n:] + ) return result # End Head/Tail/Front/Back @@ -1928,14 +1977,14 @@ def getitem(df, internal_indices=[]): return df.iloc[:, internal_indices] result = self.data.apply_func_to_select_indices( - 0, getitem, numeric_indices, keep_remaining=False) + 0, getitem, numeric_indices, keep_remaining=False + ) # We can't just set the columns to key here because there may be # multiple instances of a key. new_columns = self.columns[numeric_indices] new_dtypes = self.dtypes[numeric_indices] - return self.__constructor__(result, self.index, new_columns, - new_dtypes) + return self.__constructor__(result, self.index, new_columns, new_dtypes) def getitem_row_array(self, key): """Get row data for target labels. @@ -1953,12 +2002,12 @@ def getitem(df, internal_indices=[]): return df.iloc[internal_indices] result = self.data.apply_func_to_select_indices( - 1, getitem, numeric_indices, keep_remaining=False) + 1, getitem, numeric_indices, keep_remaining=False + ) # We can't just set the index to key here because there may be multiple # instances of a key. new_index = self.index[numeric_indices] - return self.__constructor__(result, new_index, self.columns, - self._dtype_cache) + return self.__constructor__(result, new_index, self.columns, self._dtype_cache) # END __getitem__ methods @@ -1987,11 +2036,13 @@ def delitem(df, internal_indices=[]): numeric_indices = list(self.index.get_indexer_for(index)) new_data = self.data.apply_func_to_select_indices( - 1, delitem, numeric_indices, keep_remaining=True) + 1, delitem, numeric_indices, keep_remaining=True + ) # We can't use self.index.drop with duplicate keys because in Pandas # it throws an error. new_index = [ - self.index[i] for i in range(len(self.index)) + self.index[i] + for i in range(len(self.index)) if i not in numeric_indices ] @@ -2005,16 +2056,17 @@ def delitem(df, internal_indices=[]): numeric_indices = list(self.columns.get_indexer_for(columns)) new_data = new_data.apply_func_to_select_indices( - 0, delitem, numeric_indices, keep_remaining=True) + 0, delitem, numeric_indices, keep_remaining=True + ) # We can't use self.columns.drop with duplicate keys because in Pandas # it throws an error. new_columns = [ - self.columns[i] for i in range(len(self.columns)) + self.columns[i] + for i in range(len(self.columns)) if i not in numeric_indices ] new_dtypes = self.dtypes.drop(columns) - return self.__constructor__(new_data, new_index, new_columns, - new_dtypes) + return self.__constructor__(new_data, new_index, new_columns, new_dtypes) # END __delitem__ and drop @@ -2041,7 +2093,8 @@ def insert(df, internal_indices=[]): return df new_data = self.data.apply_func_to_select_indices_along_full_axis( - 0, insert, loc, keep_remaining=True) + 0, insert, loc, keep_remaining=True + ) new_columns = self.columns.insert(loc, column) # Because a Pandas Series does not allow insert, we make a DataFrame @@ -2050,8 +2103,7 @@ def insert(df, internal_indices=[]): temp_dtypes.insert(loc, column, _get_dtype_from_object(value)) new_dtypes = temp_dtypes.iloc[0] - return self.__constructor__(new_data, self.index, new_columns, - new_dtypes) + return self.__constructor__(new_data, self.index, new_columns, new_dtypes) # END Insert @@ -2108,11 +2160,17 @@ def _post_process_apply(self, result_data, axis, try_scale=True): # this logic here. if len(columns) == 0: series_result = result_data.to_pandas(False) - if not axis and len(series_result) == len( - self.columns) and len(index) != len(series_result): + if ( + not axis + and len(series_result) == len(self.columns) + and len(index) != len(series_result) + ): index = self.columns - elif axis and len(series_result) == len( - self.index) and len(index) != len(series_result): + elif ( + axis + and len(series_result) == len(self.index) + and len(index) != len(series_result) + ): index = self.index series_result.index = index @@ -2138,16 +2196,14 @@ def _dict_func(self, func, axis, *args, **kwargs): else: index = self.index - func = { - idx: func[key] - for key in func for idx in index.get_indexer_for([key]) - } + func = {idx: func[key] for key in func for idx in index.get_indexer_for([key])} def dict_apply_builder(df, func_dict={}): return df.apply(func_dict, *args, **kwargs) result_data = self.data.apply_func_to_select_indices_along_full_axis( - axis, dict_apply_builder, func, keep_remaining=False) + axis, dict_apply_builder, func, keep_remaining=False + ) full_result = self._post_process_apply(result_data, axis) @@ -2169,14 +2225,11 @@ def _list_like_func(self, func, axis, *args, **kwargs): Returns: A new PandasDataManager. """ - func_prepared = self._prepare_method( - lambda df: df.apply(func, *args, **kwargs)) + func_prepared = self._prepare_method(lambda df: df.apply(func, *args, **kwargs)) new_data = self.map_across_full_axis(axis, func_prepared) # When the function is list-like, the function names become the index - new_index = [ - f if isinstance(f, string_types) else f.__name__ for f in func - ] + new_index = [f if isinstance(f, string_types) else f.__name__ for f in func] return self.__constructor__(new_data, new_index, self.columns) def _callable_func(self, func, axis, *args, **kwargs): @@ -2236,11 +2289,9 @@ def groupby_agg_builder(df): df.index = remote_index else: df.columns = remote_index - return agg_func( - df.groupby(by=by, axis=axis, **groupby_args), **agg_args) + return agg_func(df.groupby(by=by, axis=axis, **groupby_args), **agg_args) - func_prepared = self._prepare_method( - lambda df: groupby_agg_builder(df)) + func_prepared = self._prepare_method(lambda df: groupby_agg_builder(df)) result_data = self.map_across_full_axis(axis, func_prepared) return self._post_process_apply(result_data, axis, try_scale=False) @@ -2260,9 +2311,7 @@ def get_dummies(self, columns, **kwargs): # `columns` as None does not mean all columns, by default it means only # non-numeric columns. if columns is None: - columns = [ - c for c in self.columns if not is_numeric_dtype(self.dtypes[c]) - ] + columns = [c for c in self.columns if not is_numeric_dtype(self.dtypes[c])] # If we aren't computing any dummies, there is no need for any # remote compute. @@ -2286,7 +2335,8 @@ def set_columns(df, columns): set_cols = self.columns columns_applied = self.map_across_full_axis( - 1, lambda df: set_columns(df, set_cols)) + 1, lambda df: set_columns(df, set_cols) + ) # In some cases, we are mapping across all of the data. It is more # efficient if we are mapping over all of the data to do it this way @@ -2307,11 +2357,13 @@ def get_dummies_builder(df): def get_dummies_builder(df, internal_indices=[]): return pandas.get_dummies( - df.iloc[:, internal_indices], columns=None, **kwargs) + df.iloc[:, internal_indices], columns=None, **kwargs + ) numeric_indices = list(self.columns.get_indexer_for(columns)) new_data = columns_applied.apply_func_to_select_indices_along_full_axis( - 0, get_dummies_builder, numeric_indices, keep_remaining=False) + 0, get_dummies_builder, numeric_indices, keep_remaining=False + ) untouched_data = self.drop(columns=columns) # Since we set the columns in the beginning, we can just extract them @@ -2324,26 +2376,30 @@ def get_dummies_builder(df, internal_indices=[]): # not selected. if len(columns) != len(self.columns): new_data = untouched_data.data.concat(1, new_data) - final_columns = untouched_data.columns.append( - pandas.Index(final_columns)) + final_columns = untouched_data.columns.append(pandas.Index(final_columns)) return cls(new_data, self.index, final_columns) # Indexing def view(self, index=None, columns=None): - index_map_series = pandas.Series( - np.arange(len(self.index)), index=self.index) + index_map_series = pandas.Series(np.arange(len(self.index)), index=self.index) column_map_series = pandas.Series( - np.arange(len(self.columns)), index=self.columns) + np.arange(len(self.columns)), index=self.columns + ) if index is not None: index_map_series = index_map_series.reindex(index) if columns is not None: column_map_series = column_map_series.reindex(columns) - return PandasDataManagerView(self.data, index_map_series.index, - column_map_series.index, self.dtypes, - index_map_series, column_map_series) + return PandasDataManagerView( + self.data, + index_map_series.index, + column_map_series.index, + self.dtypes, + index_map_series, + column_map_series, + ) def squeeze(self, ndim=0, axis=None): squeezed = self.data.to_pandas().squeeze() @@ -2358,10 +2414,8 @@ def squeeze(self, ndim=0, axis=None): return squeezed - def write_items(self, row_numeric_index, col_numeric_index, - broadcasted_items): - def iloc_mut(partition, row_internal_indices, col_internal_indices, - item): + def write_items(self, row_numeric_index, col_numeric_index, broadcasted_items): + def iloc_mut(partition, row_internal_indices, col_internal_indices, item): partition = partition.copy() partition.iloc[row_internal_indices, col_internal_indices] = item return partition @@ -2371,7 +2425,8 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices, row_indices=row_numeric_index, col_indices=col_numeric_index, mutate=True, - item_to_distribute=broadcasted_items) + item_to_distribute=broadcasted_items, + ) self.data = mutated_blk_partitions def global_idx_to_numeric_idx(self, axis, indices): @@ -2385,25 +2440,34 @@ def global_idx_to_numeric_idx(self, axis, indices): Returns: An Index object. """ - assert axis in ['row', 'col', 'columns'] - if axis == 'row': + assert axis in ["row", "col", "columns"] + if axis == "row": return pandas.Index( - pandas.Series(np.arange(len(self.index)), - index=self.index).loc[indices].values) - elif axis in ['col', 'columns']: + pandas.Series(np.arange(len(self.index)), index=self.index) + .loc[indices] + .values + ) + elif axis in ["col", "columns"]: return pandas.Index( - pandas.Series( - np.arange(len(self.columns)), - index=self.columns).loc[indices].values) + pandas.Series(np.arange(len(self.columns)), index=self.columns) + .loc[indices] + .values + ) def enlarge_partitions(self, new_row_labels=None, new_col_labels=None): new_data = self.data.enlarge_partitions( - len(new_row_labels), len(new_col_labels)) - concated_index = self.index.append(type( - self.index)(new_row_labels)) if new_row_labels else self.index - concated_columns = self.columns.append( - type(self.columns)( - new_col_labels)) if new_col_labels else self.columns + len(new_row_labels), len(new_col_labels) + ) + concated_index = ( + self.index.append(type(self.index)(new_row_labels)) + if new_row_labels + else self.index + ) + concated_columns = ( + self.columns.append(type(self.columns)(new_col_labels)) + if new_col_labels + else self.columns + ) return self.__constructor__(new_data, concated_index, concated_columns) @@ -2415,13 +2479,15 @@ class PandasDataManagerView(PandasDataManager): - (len(self.index), len(self.columns)) != self.data.shape """ - def __init__(self, - block_partitions_object: BlockPartitions, - index: pandas.Index, - columns: pandas.Index, - dtypes=None, - index_map_series: pandas.Series = None, - columns_map_series: pandas.Series = None): + def __init__( + self, + block_partitions_object: BlockPartitions, + index: pandas.Index, + columns: pandas.Index, + dtypes=None, + index_map_series: pandas.Series = None, + columns_map_series: pandas.Series = None, + ): """ Args: index_map_series: a Pandas Series Object mapping user-facing index to @@ -2438,19 +2504,28 @@ def __init__(self, self.columns_map = columns_map_series self.is_view = True - PandasDataManager.__init__(self, block_partitions_object, index, - columns, dtypes) + PandasDataManager.__init__( + self, block_partitions_object, index, columns, dtypes + ) - def __constructor__(self, - block_partitions_object: BlockPartitions, - index: pandas.Index, - columns: pandas.Index, - dtypes=None): + def __constructor__( + self, + block_partitions_object: BlockPartitions, + index: pandas.Index, + columns: pandas.Index, + dtypes=None, + ): new_index_map = self.index_map.reindex(index) new_columns_map = self.columns_map.reindex(columns) - return type(self)(block_partitions_object, index, columns, dtypes, - new_index_map, new_columns_map) + return type(self)( + block_partitions_object, + index, + columns, + dtypes, + new_index_map, + new_columns_map, + ) def _get_data(self) -> BlockPartitions: """Perform the map step @@ -2467,7 +2542,8 @@ def iloc(partition, row_internal_indices, col_internal_indices): row_indices=self.index_map.values, col_indices=self.columns_map.values, lazy=True, - keep_remaining=False) + keep_remaining=False, + ) return masked_data def _set_data(self, new_data): @@ -2479,8 +2555,8 @@ def _set_data(self, new_data): data = property(_get_data, _set_data) def global_idx_to_numeric_idx(self, axis, indices): - assert axis in ['row', 'col', 'columns'] - if axis == 'row': + assert axis in ["row", "col", "columns"] + if axis == "row": return self.index_map.loc[indices].index - elif axis in ['col', 'columns']: + elif axis in ["col", "columns"]: return self.columns_map.loc[indices].index diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py index fea12f12b7b..9702a6634ec 100644 --- a/modin/data_management/factories.py +++ b/modin/data_management/factories.py @@ -13,8 +13,7 @@ class BaseFactory(object): @classmethod def _determine_engine(cls): - factory_name = \ - partition_format + "Backed" + execution_engine + "Factory" + factory_name = partition_format + "Backed" + execution_engine + "Factory" return getattr(sys.modules[__name__], factory_name) diff --git a/modin/data_management/partitioning/axis_partition.py b/modin/data_management/partitioning/axis_partition.py index dc8e760647c..007e92a9047 100644 --- a/modin/data_management/partitioning/axis_partition.py +++ b/modin/data_management/partitioning/axis_partition.py @@ -28,8 +28,7 @@ class AxisPartition(object): The only abstract method needed to implement is the `apply` method. """ - def apply(self, func, num_splits=None, other_axis_partition=None, - **kwargs): + def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): """Applies a function to a full axis. Note: The procedures that invoke this method assume full axis @@ -71,8 +70,7 @@ def __init__(self, list_of_blocks): # Unwrap from RemotePartition object for ease of use self.list_of_blocks = [obj.oid for obj in list_of_blocks] - def apply(self, func, num_splits=None, other_axis_partition=None, - **kwargs): + def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs): """Applies func to the object in the plasma store. See notes in Parent class about this method. @@ -93,18 +91,17 @@ def apply(self, func, num_splits=None, other_axis_partition=None, return [ RayRemotePartition(obj) for obj in deploy_ray_func_between_two_axis_partitions._submit( - args=(self.axis, func, num_splits, - len(self.list_of_blocks), kwargs) + - tuple(self.list_of_blocks + - other_axis_partition.list_of_blocks), - num_return_vals=num_splits) + args=(self.axis, func, num_splits, len(self.list_of_blocks), kwargs) + + tuple(self.list_of_blocks + other_axis_partition.list_of_blocks), + num_return_vals=num_splits, + ) ] args = [self.axis, func, num_splits, kwargs] args.extend(self.list_of_blocks) return [ - RayRemotePartition(obj) for obj in deploy_ray_axis_func._submit( - args, num_return_vals=num_splits) + RayRemotePartition(obj) + for obj in deploy_ray_axis_func._submit(args, num_return_vals=num_splits) ] def shuffle(self, func, num_splits=None, **kwargs): @@ -123,8 +120,8 @@ def shuffle(self, func, num_splits=None, **kwargs): args = [self.axis, func, num_splits, kwargs] args.extend(self.list_of_blocks) return [ - RayRemotePartition(obj) for obj in deploy_ray_axis_func._submit( - args, num_return_vals=num_splits) + RayRemotePartition(obj) + for obj in deploy_ray_axis_func._submit(args, num_return_vals=num_splits) ] @@ -133,6 +130,7 @@ class RayColumnPartition(RayAxisPartition): for this class is in the parent class, and this class defines the axis to perform the computation over. """ + axis = 0 @@ -141,6 +139,7 @@ class RayRowPartition(RayAxisPartition): for this class is in the parent class, and this class defines the axis to perform the computation over. """ + axis = 1 @@ -160,13 +159,12 @@ def split_result_of_axis_func_pandas(axis, num_splits, result): if axis == 0 or type(result) is pandas.Series: chunksize = compute_chunksize(len(result), num_splits) return [ - result.iloc[chunksize * i:chunksize * (i + 1)] - for i in range(num_splits) + result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits) ] else: chunksize = compute_chunksize(len(result.columns), num_splits) return [ - result.iloc[:, chunksize * i:chunksize * (i + 1)] + result.iloc[:, chunksize * i : chunksize * (i + 1)] for i in range(num_splits) ] @@ -193,7 +191,8 @@ def deploy_ray_axis_func(axis, func, num_splits, kwargs, *partitions): @ray.remote def deploy_ray_func_between_two_axis_partitions( - axis, func, num_splits, len_of_left, kwargs, *partitions): + axis, func, num_splits, len_of_left, kwargs, *partitions +): """Deploy a function along a full axis between two data sets in Ray. Args: @@ -210,10 +209,8 @@ def deploy_ray_func_between_two_axis_partitions( Returns: A list of Pandas DataFrames. """ - lt_frame = pandas.concat( - list(partitions[:len_of_left]), axis=axis, copy=False) - rt_frame = pandas.concat( - list(partitions[len_of_left:]), axis=axis, copy=False) + lt_frame = pandas.concat(list(partitions[:len_of_left]), axis=axis, copy=False) + rt_frame = pandas.concat(list(partitions[len_of_left:]), axis=axis, copy=False) result = func(lt_frame, rt_frame, **kwargs) return split_result_of_axis_func_pandas(axis, num_splits, result) diff --git a/modin/data_management/partitioning/partition_collections.py b/modin/data_management/partitioning/partition_collections.py index 9a7ba70451c..43bfd893cc0 100644 --- a/modin/data_management/partitioning/partition_collections.py +++ b/modin/data_management/partitioning/partition_collections.py @@ -102,9 +102,7 @@ def block_lengths(self): # The first column will have the correct lengths. We have an # invariant that requires that all blocks be the same length in a # row of blocks. - self._lengths_cache = [ - obj.length().get() for obj in self.partitions.T[0] - ] + self._lengths_cache = [obj.length().get() for obj in self.partitions.T[0]] return self._lengths_cache # Widths of the blocks @@ -121,9 +119,7 @@ def block_widths(self): # The first column will have the correct lengths. We have an # invariant that requires that all blocks be the same width in a # column of blocks. - self._widths_cache = [ - obj.width().get() for obj in self.partitions[0] - ] + self._widths_cache = [obj.width().get() for obj in self.partitions[0]] return self._widths_cache @property @@ -158,12 +154,16 @@ def full_reduce(self, map_func, reduce_func, axis): # DataFrame. The individual partitions return Series objects, and those # cannot be concatenated the correct way without casting them as # DataFrames. - full_frame = pandas.concat([ - pandas.concat( - [pandas.DataFrame(part.get()).T for part in row_of_parts], - axis=axis ^ 1) for row_of_parts in mapped_parts - ], - axis=axis) + full_frame = pandas.concat( + [ + pandas.concat( + [pandas.DataFrame(part.get()).T for part in row_of_parts], + axis=axis ^ 1, + ) + for row_of_parts in mapped_parts + ], + axis=axis, + ) # Transpose because operations where axis == 1 assume that the # operation is performed across the other axis @@ -186,17 +186,25 @@ def map_across_blocks(self, map_func): preprocessed_map_func = self.preprocess_func(map_func) new_partitions = np.array( - [[part.apply(preprocessed_map_func) for part in row_of_parts] - for row_of_parts in self.partitions]) + [ + [part.apply(preprocessed_map_func) for part in row_of_parts] + for row_of_parts in self.partitions + ] + ) return cls(new_partitions) def lazy_map_across_blocks(self, map_func, kwargs): cls = type(self) preprocessed_map_func = self.preprocess_func(map_func) - new_partitions = np.array([[ - part.add_to_apply_calls(preprocessed_map_func, kwargs) - for part in row_of_parts - ] for row_of_parts in self.partitions]) + new_partitions = np.array( + [ + [ + part.add_to_apply_calls(preprocessed_map_func, kwargs) + for part in row_of_parts + ] + for row_of_parts in self.partitions + ] + ) return cls(new_partitions) def map_across_full_axis(self, axis, map_func): @@ -220,10 +228,9 @@ def map_across_full_axis(self, axis, map_func): preprocessed_map_func = self.preprocess_func(map_func) partitions = self.column_partitions if not axis else self.row_partitions - result_blocks = np.array([ - part.apply(preprocessed_map_func, num_splits) - for part in partitions - ]) + result_blocks = np.array( + [part.apply(preprocessed_map_func, num_splits) for part in partitions] + ) # If we are mapping over columns, they are returned to use the same as # rows, so we need to transpose the returned 2D numpy array to return # the structure to the correct order. @@ -272,16 +279,22 @@ def take(self, axis, n): # We build this iloc to avoid creating a bunch of helper methods. # This code creates slice objects to be passed to `iloc` to grab # the last n rows or columns depending on axis. - slice_obj = slice(-remaining, None) if axis == 0 else ( - slice(None), slice(-remaining, None)) + slice_obj = ( + slice(-remaining, None) + if axis == 0 + else (slice(None), slice(-remaining, None)) + ) func = self.preprocess_func(lambda df: df.iloc[slice_obj]) # We use idx + 1 here because the loop is not inclusive, and we # need to iterate through idx. - result = np.array([ - partitions[i] if i != idx else - [obj.apply(func) for obj in partitions[i]] - for i in range(idx + 1) - ])[::-1] + result = np.array( + [ + partitions[i] + if i != idx + else [obj.apply(func) for obj in partitions[i]] + for i in range(idx + 1) + ] + )[::-1] else: length_bins = np.cumsum(bin_lengths) idx = int(np.digitize(n, length_bins)) @@ -296,15 +309,19 @@ def take(self, axis, n): # We build this iloc to avoid creating a bunch of helper methods. # This code creates slice objects to be passed to `iloc` to grab # the first n rows or columns depending on axis. - slice_obj = slice(remaining) if axis == 0 else ( - slice(None), slice(remaining)) + slice_obj = ( + slice(remaining) if axis == 0 else (slice(None), slice(remaining)) + ) func = self.preprocess_func(lambda df: df.iloc[slice_obj]) # See note above about idx + 1 - result = np.array([ - partitions[i] if i != idx else - [obj.apply(func) for obj in partitions[i]] - for i in range(idx + 1) - ]) + result = np.array( + [ + partitions[i] + if i != idx + else [obj.apply(func) for obj in partitions[i]] + for i in range(idx + 1) + ] + ) return cls(result.T) if axis else cls(result) @@ -326,11 +343,9 @@ def concat(self, axis, other_blocks): cls = type(self) if type(other_blocks) is list: other_blocks = [blocks.partitions for blocks in other_blocks] - return cls( - np.concatenate([self.partitions] + other_blocks, axis=axis)) + return cls(np.concatenate([self.partitions] + other_blocks, axis=axis)) else: - return cls( - np.append(self.partitions, other_blocks.partitions, axis=axis)) + return cls(np.append(self.partitions, other_blocks.partitions, axis=axis)) def copy(self): """Create a copy of this object. @@ -367,16 +382,21 @@ def to_pandas(self, is_transposed=False): if is_transposed: return self.transpose().to_pandas(False).T else: - retrieved_objects = [[obj.to_pandas() for obj in part] - for part in self.partitions] + retrieved_objects = [ + [obj.to_pandas() for obj in part] for part in self.partitions + ] if all( - isinstance(part, pandas.Series) - for row in retrieved_objects for part in row): + isinstance(part, pandas.Series) + for row in retrieved_objects + for part in row + ): axis = 0 retrieved_objects = np.array(retrieved_objects).T elif all( - isinstance(part, pandas.DataFrame) - for row in retrieved_objects for part in row): + isinstance(part, pandas.DataFrame) + for row in retrieved_objects + for part in row + ): axis = 1 else: raise ValueError( @@ -403,15 +423,15 @@ def from_pandas(cls, df): # Each chunk must have a RangeIndex that spans its length and width # according to our invariant. def chunk_builder(i, j): - chunk = df.iloc[i:i + row_chunksize, j:j + col_chunksize] + chunk = df.iloc[i : i + row_chunksize, j : j + col_chunksize] chunk.index = pandas.RangeIndex(len(chunk.index)) chunk.columns = pandas.RangeIndex(len(chunk.columns)) return put_func(chunk) - parts = [[ - chunk_builder(i, j) - for j in range(0, len(df.columns), col_chunksize) - ] for i in range(0, len(df), row_chunksize)] + parts = [ + [chunk_builder(i, j) for j in range(0, len(df.columns), col_chunksize)] + for i in range(0, len(df), row_chunksize) + ] return cls(np.array(parts)) @@ -431,31 +451,25 @@ def get_indices(self, axis=0, index_func=None, old_blocks=None): Returns: A Pandas Index object. """ - assert callable(index_func), \ - "Must tell this function how to extract index" + assert callable(index_func), "Must tell this function how to extract index" if axis == 0: func = self.preprocess_func(index_func) # We grab the first column of blocks and extract the indices - new_indices = [ - idx.apply(func).get() for idx in self.partitions.T[0] - ] + new_indices = [idx.apply(func).get() for idx in self.partitions.T[0]] # This is important because sometimes we have resized the data. The new # sizes will not be valid if we are trying to compute the index on a # new object that has a different length. if old_blocks is not None: - cumulative_block_lengths = np.array( - old_blocks.block_lengths).cumsum() + cumulative_block_lengths = np.array(old_blocks.block_lengths).cumsum() else: - cumulative_block_lengths = np.array( - self.block_lengths).cumsum() + cumulative_block_lengths = np.array(self.block_lengths).cumsum() else: func = self.preprocess_func(index_func) new_indices = [idx.apply(func).get() for idx in self.partitions[0]] if old_blocks is not None: - cumulative_block_lengths = np.array( - old_blocks.block_widths).cumsum() + cumulative_block_lengths = np.array(old_blocks.block_widths).cumsum() else: cumulative_block_lengths = np.array(self.block_widths).cumsum() @@ -469,8 +483,7 @@ def get_indices(self, axis=0, index_func=None, old_blocks=None): # The try-except here is intended to catch issues where we are # trying to get a string index out of the internal index. try: - append_val = new_indices[i] + cumulative_block_lengths[i - - 1] + append_val = new_indices[i] + cumulative_block_lengths[i - 1] except TypeError: append_val = new_indices[i] @@ -488,6 +501,7 @@ def _compute_num_partitions(cls): :return: """ from ...pandas import DEFAULT_NPARTITIONS + return DEFAULT_NPARTITIONS # Extracting rows/columns @@ -511,15 +525,21 @@ def _get_blocks_containing_index(self, axis, index): block_idx = int(np.digitize(index, cumulative_column_widths)) # Compute the internal index based on the previous lengths. This # is a global index, so we must subtract the lengths first. - internal_idx = index if not block_idx else index - cumulative_column_widths[ - block_idx - 1] + internal_idx = ( + index + if not block_idx + else index - cumulative_column_widths[block_idx - 1] + ) return block_idx, internal_idx else: cumulative_row_lengths = np.array(self.block_lengths).cumsum() block_idx = int(np.digitize(index, cumulative_row_lengths)) # See note above about internal index - internal_idx = index if not block_idx else index - cumulative_row_lengths[ - block_idx - 1] + internal_idx = ( + index + if not block_idx + else index - cumulative_row_lengths[block_idx - 1] + ) return block_idx, internal_idx def _get_dict_of_block_index(self, axis, indices): @@ -567,11 +587,7 @@ def _apply_func_to_list_of_partitions(self, func, partitions, **kwargs): preprocessed_func = self.preprocess_func(func) return [obj.apply(preprocessed_func, **kwargs) for obj in partitions] - def apply_func_to_select_indices(self, - axis, - func, - indices, - keep_remaining=False): + def apply_func_to_select_indices(self, axis, func, indices, keep_remaining=False): """Applies a function to select indices. Note: Your internal function must take a kwarg `internal_indices` for @@ -617,58 +633,70 @@ def apply_func_to_select_indices(self, # accept a keyword argument `func_dict`. if dict_indices is not None: if not keep_remaining: - result = np.array([ - self._apply_func_to_list_of_partitions( - func, - partitions_for_apply[i], - func_dict={ - idx: dict_indices[idx] - for idx in partitions_dict[i] - }) for i in partitions_dict - ]) + result = np.array( + [ + self._apply_func_to_list_of_partitions( + func, + partitions_for_apply[i], + func_dict={ + idx: dict_indices[idx] for idx in partitions_dict[i] + }, + ) + for i in partitions_dict + ] + ) else: - result = np.array([ - partitions_for_apply[i] if i not in partitions_dict else - self._apply_func_to_list_of_partitions( - func, - partitions_for_apply[i], - func_dict={ - idx: dict_indices[i] - for idx in partitions_dict[i] - }) for i in range(len(partitions_for_apply)) - ]) + result = np.array( + [ + partitions_for_apply[i] + if i not in partitions_dict + else self._apply_func_to_list_of_partitions( + func, + partitions_for_apply[i], + func_dict={ + idx: dict_indices[i] for idx in partitions_dict[i] + }, + ) + for i in range(len(partitions_for_apply)) + ] + ) else: if not keep_remaining: # We are passing internal indices in here. In order for func to # actually be able to use this information, it must be able to take in # the internal indices. This might mean an iloc in the case of Pandas # or some other way to index into the internal representation. - result = np.array([ - self._apply_func_to_list_of_partitions( - func, - partitions_for_apply[i], - internal_indices=partitions_dict[i]) - for i in partitions_dict - ]) + result = np.array( + [ + self._apply_func_to_list_of_partitions( + func, + partitions_for_apply[i], + internal_indices=partitions_dict[i], + ) + for i in partitions_dict + ] + ) else: # The difference here is that we modify a subset and return the # remaining (non-updated) blocks in their original position. - result = np.array([ - partitions_for_apply[i] if i not in partitions_dict else - self._apply_func_to_list_of_partitions( - func, - partitions_for_apply[i], - internal_indices=partitions_dict[i]) - for i in range(len(partitions_for_apply)) - ]) + result = np.array( + [ + partitions_for_apply[i] + if i not in partitions_dict + else self._apply_func_to_list_of_partitions( + func, + partitions_for_apply[i], + internal_indices=partitions_dict[i], + ) + for i in range(len(partitions_for_apply)) + ] + ) return cls(result.T) if not axis else cls(result) - def apply_func_to_select_indices_along_full_axis(self, - axis, - func, - indices, - keep_remaining=False): + def apply_func_to_select_indices_along_full_axis( + self, axis, func, indices, keep_remaining=False + ): """Applies a function to a select subset of full columns/rows. Note: This should be used when you need to apply a function that relies @@ -718,52 +746,68 @@ def apply_func_to_select_indices_along_full_axis(self, # accept a keyword argument `func_dict`. if dict_indices is not None: if not keep_remaining: - result = np.array([ - partitions_for_apply[i].apply( - preprocessed_func, - func_dict={ - idx: dict_indices[idx] - for idx in partitions_dict[i] - }) for i in partitions_dict - ]) + result = np.array( + [ + partitions_for_apply[i].apply( + preprocessed_func, + func_dict={ + idx: dict_indices[idx] for idx in partitions_dict[i] + }, + ) + for i in partitions_dict + ] + ) else: - result = np.array([ - partitions_for_remaining[i] if i not in partitions_dict - else self._apply_func_to_list_of_partitions( - preprocessed_func, - partitions_for_apply[i], - func_dict={ - idx: dict_indices[idx] - for idx in partitions_dict[i] - }) for i in range(len(partitions_for_apply)) - ]) + result = np.array( + [ + partitions_for_remaining[i] + if i not in partitions_dict + else self._apply_func_to_list_of_partitions( + preprocessed_func, + partitions_for_apply[i], + func_dict={ + idx: dict_indices[idx] for idx in partitions_dict[i] + }, + ) + for i in range(len(partitions_for_apply)) + ] + ) else: if not keep_remaining: # See notes in `apply_func_to_select_indices` - result = np.array([ - partitions_for_apply[i].apply( - preprocessed_func, internal_indices=partitions_dict[i]) - for i in partitions_dict - ]) + result = np.array( + [ + partitions_for_apply[i].apply( + preprocessed_func, internal_indices=partitions_dict[i] + ) + for i in partitions_dict + ] + ) else: # See notes in `apply_func_to_select_indices` - result = np.array([ - partitions_for_remaining[i] if i not in partitions_dict - else partitions_for_apply[i].apply( - preprocessed_func, internal_indices=partitions_dict[i]) - for i in range(len(partitions_for_remaining)) - ]) + result = np.array( + [ + partitions_for_remaining[i] + if i not in partitions_dict + else partitions_for_apply[i].apply( + preprocessed_func, internal_indices=partitions_dict[i] + ) + for i in range(len(partitions_for_remaining)) + ] + ) return cls(result.T) if not axis else cls(result) - def apply_func_to_indices_both_axis(self, - func, - row_indices, - col_indices, - lazy=False, - keep_remaining=True, - mutate=False, - item_to_distribute=None): + def apply_func_to_indices_both_axis( + self, + func, + row_indices, + col_indices, + lazy=False, + keep_remaining=True, + mutate=False, + item_to_distribute=None, + ): """ Apply a function to along both axis @@ -782,18 +826,22 @@ def apply_func_to_indices_both_axis(self, row_position_counter = 0 for row_blk_idx, row_internal_idx in self._get_dict_of_block_index( - 1, row_indices).items(): + 1, row_indices + ).items(): col_position_counter = 0 for col_blk_idx, col_internal_idx in self._get_dict_of_block_index( - 0, col_indices).items(): + 0, col_indices + ).items(): remote_part = partition_copy[row_blk_idx, col_blk_idx] if item_to_distribute is not None: item = item_to_distribute[ - row_position_counter:row_position_counter + - len(row_internal_idx), col_position_counter: - col_position_counter + len(col_internal_idx)] - item = {'item': item} + row_position_counter : row_position_counter + + len(row_internal_idx), + col_position_counter : col_position_counter + + len(col_internal_idx), + ] + item = {"item": item} else: item = {} @@ -802,13 +850,15 @@ def apply_func_to_indices_both_axis(self, func, row_internal_indices=row_internal_idx, col_internal_indices=col_internal_idx, - **item) + **item + ) else: result = remote_part.apply( func, row_internal_indices=row_internal_idx, col_internal_indices=col_internal_idx, - **item) + **item + ) partition_copy[row_blk_idx, col_blk_idx] = result operation_mask[row_blk_idx, col_blk_idx] = True @@ -845,13 +895,16 @@ def inter_data_operation(self, axis, func, other): func = self.preprocess_func(func) - result = np.array([ - partitions[i].apply( - func, - num_splits=cls._compute_num_partitions(), - other_axis_partition=other_partitions[i]) - for i in range(len(partitions)) - ]) + result = np.array( + [ + partitions[i].apply( + func, + num_splits=cls._compute_num_partitions(), + other_axis_partition=other_partitions[i], + ) + for i in range(len(partitions)) + ] + ) return cls(result) if axis else cls(result.T) def manual_shuffle(self, axis, shuffle_func): @@ -872,10 +925,12 @@ def manual_shuffle(self, axis, shuffle_func): partitions = self.column_partitions func = self.preprocess_func(shuffle_func) - result = np.array([ - part.shuffle(func, num_splits=cls._compute_num_partitions()) - for part in partitions - ]) + result = np.array( + [ + part.shuffle(func, num_splits=cls._compute_num_partitions()) + for part in partitions + ] + ) return cls(result) if axis else cls(result.T) def __getitem__(self, key): @@ -893,7 +948,8 @@ def enlarge_partitions(self, n_rows=None, n_cols=None): n_cols_lst = self.block_widths nan_oids_lst = [ self._partition_class( - _get_nan_block_id(self._partition_class, n_rows, n_cols_)) + _get_nan_block_id(self._partition_class, n_rows, n_cols_) + ) for n_cols_ in n_cols_lst ] new_chunk = block_partitions_cls(np.array([nan_oids_lst])) @@ -903,7 +959,8 @@ def enlarge_partitions(self, n_rows=None, n_cols=None): n_rows_lst = self.block_lengths nan_oids_lst = [ self._partition_class( - _get_nan_block_id(self._partition_class, n_rows_, n_cols)) + _get_nan_block_id(self._partition_class, n_rows_, n_cols) + ) for n_rows_ in n_rows_lst ] new_chunk = block_partitions_cls(np.array([nan_oids_lst]).T) @@ -939,7 +996,8 @@ def block_lengths(self): # invariant that requires that all blocks be the same length in a # row of blocks. self._lengths_cache = ray.get( - [obj.length().oid for obj in self.partitions.T[0]]) + [obj.length().oid for obj in self.partitions.T[0]] + ) return self._lengths_cache # Widths of the blocks @@ -957,7 +1015,8 @@ def block_widths(self): # invariant that requires that all blocks be the same width in a # column of blocks. self._widths_cache = ray.get( - [obj.width().oid for obj in self.partitions[0]]) + [obj.width().oid for obj in self.partitions[0]] + ) return self._widths_cache @property diff --git a/modin/data_management/partitioning/remote_partition.py b/modin/data_management/partitioning/remote_partition.py index 9e94e26abfa..49493d28953 100644 --- a/modin/data_management/partitioning/remote_partition.py +++ b/modin/data_management/partitioning/remote_partition.py @@ -188,7 +188,8 @@ def call_queue_closure(oid_obj, call_queues): return oid_obj oid = deploy_ray_func.remote( - call_queue_closure, oid, kwargs={'call_queues': self.call_queue}) + call_queue_closure, oid, kwargs={"call_queues": self.call_queue} + ) self.call_queue = [] return RayRemotePartition(oid) @@ -207,8 +208,7 @@ def to_pandas(self): A Pandas DataFrame. """ dataframe = self.get() - assert type(dataframe) is pandas.DataFrame or type( - dataframe) is pandas.Series + assert type(dataframe) is pandas.DataFrame or type(dataframe) is pandas.Series return dataframe diff --git a/modin/data_management/partitioning/utils.py b/modin/data_management/partitioning/utils.py index c47be187193..f63eeef7042 100644 --- a/modin/data_management/partitioning/utils.py +++ b/modin/data_management/partitioning/utils.py @@ -8,8 +8,9 @@ def compute_chunksize(length, num_splits): # We do this to avoid zeros and having an extremely large last partition - return length // num_splits if length % num_splits == 0 \ - else length // num_splits + 1 + return ( + length // num_splits if length % num_splits == 0 else length // num_splits + 1 + ) def _get_nan_block_id(partition_class, n_row=1, n_col=1, transpose=False): diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 8b1aebcb53a..d0c1eb46592 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -4,11 +4,33 @@ # TODO: In the future `set_option` or similar needs to run on every node # in order to keep all pandas instances across nodes consistent -from pandas import (eval, unique, value_counts, cut, to_numeric, factorize, - test, qcut, match, Panel, date_range, Index, MultiIndex, - CategoricalIndex, Series, bdate_range, DatetimeIndex, - Timedelta, Timestamp, to_timedelta, set_eng_float_format, - set_option, NaT, PeriodIndex, Categorical) +from pandas import ( + eval, + unique, + value_counts, + cut, + to_numeric, + factorize, + test, + qcut, + match, + Panel, + date_range, + Index, + MultiIndex, + CategoricalIndex, + Series, + bdate_range, + DatetimeIndex, + Timedelta, + Timestamp, + to_timedelta, + set_eng_float_format, + set_option, + NaT, + PeriodIndex, + Categorical, +) import threading import os import ray @@ -17,34 +39,79 @@ from .concat import concat from .dataframe import DataFrame from .datetimes import to_datetime -from .io import (read_csv, read_parquet, read_json, read_html, read_clipboard, - read_excel, read_hdf, read_feather, read_msgpack, read_stata, - read_sas, read_pickle, read_sql) +from .io import ( + read_csv, + read_parquet, + read_json, + read_html, + read_clipboard, + read_excel, + read_hdf, + read_feather, + read_msgpack, + read_stata, + read_sas, + read_pickle, + read_sql, +) from .reshape import get_dummies # Set this so that Pandas doesn't try to multithread by itself -os.environ['OMP_NUM_THREADS'] = "1" +os.environ["OMP_NUM_THREADS"] = "1" try: if threading.current_thread().name == "MainThread": - ray.init( - redirect_output=True, - include_webui=False, - redirect_worker_output=True) + ray.init(redirect_output=True, include_webui=False, redirect_worker_output=True) except AssertionError: pass -num_cpus = ray.global_state.cluster_resources()['CPU'] +num_cpus = ray.global_state.cluster_resources()["CPU"] DEFAULT_NPARTITIONS = max(4, int(num_cpus)) __all__ = [ - "DataFrame", "Series", "read_csv", "read_parquet", "read_json", - "read_html", "read_clipboard", "read_excel", "read_hdf", "read_feather", - "read_msgpack", "read_stata", "read_sas", "read_pickle", "read_sql", - "concat", "eval", "unique", "value_counts", "cut", "to_numeric", - "factorize", "test", "qcut", "match", "to_datetime", "get_dummies", - "Panel", "date_range", "Index", "MultiIndex", "Series", "bdate_range", - "DatetimeIndex", "to_timedelta", "set_eng_float_format", "set_option", - "CategoricalIndex", "Timedelta", "Timestamp", "NaT", "PeriodIndex", - "Categorical", "__git_revision__", "__version__" + "DataFrame", + "Series", + "read_csv", + "read_parquet", + "read_json", + "read_html", + "read_clipboard", + "read_excel", + "read_hdf", + "read_feather", + "read_msgpack", + "read_stata", + "read_sas", + "read_pickle", + "read_sql", + "concat", + "eval", + "unique", + "value_counts", + "cut", + "to_numeric", + "factorize", + "test", + "qcut", + "match", + "to_datetime", + "get_dummies", + "Panel", + "date_range", + "Index", + "MultiIndex", + "Series", + "bdate_range", + "DatetimeIndex", + "to_timedelta", + "set_eng_float_format", + "set_option", + "CategoricalIndex", + "Timedelta", + "Timestamp", + "NaT", + "PeriodIndex", + "Categorical", + "__git_revision__", + "__version__", ] diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py index d6bc52ab867..15cf62693cb 100644 --- a/modin/pandas/concat.py +++ b/modin/pandas/concat.py @@ -7,16 +7,18 @@ from .dataframe import DataFrame -def concat(objs, - axis=0, - join='outer', - join_axes=None, - ignore_index=False, - keys=None, - levels=None, - names=None, - verify_integrity=False, - copy=True): +def concat( + objs, + axis=0, + join="outer", + join_axes=None, + ignore_index=False, + keys=None, + levels=None, + names=None, + verify_integrity=False, + copy=True, +): if keys is not None: objs = [objs[k] for k in keys] @@ -33,41 +35,60 @@ def concat(objs, try: type_check = next( - obj for obj in objs - if not isinstance(obj, (pandas.Series, pandas.DataFrame, - DataFrame))) + obj + for obj in objs + if not isinstance(obj, (pandas.Series, pandas.DataFrame, DataFrame)) + ) except StopIteration: type_check = None if type_check is not None: raise ValueError( - "cannot concatenate object of type \"{0}\"; only " + 'cannot concatenate object of type "{0}"; only ' "pandas.Series, pandas.DataFrame, " "and modin.pandas.DataFrame objs are " - "valid", type(type_check)) + "valid", + type(type_check), + ) all_series = all(isinstance(obj, pandas.Series) for obj in objs) if all_series: return DataFrame( - pandas.concat(objs, axis, join, join_axes, ignore_index, keys, - levels, names, verify_integrity, copy)) + pandas.concat( + objs, + axis, + join, + join_axes, + ignore_index, + keys, + levels, + names, + verify_integrity, + copy, + ) + ) if isinstance(objs, dict): raise NotImplementedError( "Obj as dicts not implemented. To contribute to " - "Pandas on Ray, please visit github.com/ray-project/ray.") + "Pandas on Ray, please visit github.com/ray-project/ray." + ) axis = pandas.DataFrame()._get_axis_number(axis) - if join not in ['inner', 'outer']: - raise ValueError("Only can inner (intersect) or outer (union) join the" - " other axis") + if join not in ["inner", "outer"]: + raise ValueError( + "Only can inner (intersect) or outer (union) join the" " other axis" + ) # We have the weird Series and axis check because, when concatenating a # dataframe to a series on axis=0, pandas ignores the name of the series, # and this check aims to mirror that (possibly buggy) functionality objs = [ - obj if isinstance(obj, DataFrame) else DataFrame(obj.rename()) - if isinstance(obj, pandas.Series) and axis == 0 else DataFrame(obj) + obj + if isinstance(obj, DataFrame) + else DataFrame(obj.rename()) + if isinstance(obj, pandas.Series) and axis == 0 + else DataFrame(obj) for obj in objs ] df = objs[0] @@ -82,5 +103,6 @@ def concat(objs, levels=None, names=None, verify_integrity=False, - copy=True) + copy=True, + ) return DataFrame(data_manager=new_manager) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 757679320ff..3c554b62cc0 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -7,11 +7,15 @@ from pandas.compat import to_str, string_types, cPickle as pkl import pandas.core.common as com from pandas.core.dtypes.common import ( - _get_dtype_from_object, is_bool_dtype, is_list_like, is_numeric_dtype, - is_datetime_or_timedelta_dtype, is_dtype_equal) + _get_dtype_from_object, + is_bool_dtype, + is_list_like, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, +) from pandas.core.index import _ensure_index_from_sequences -from pandas.core.indexing import (check_bool_indexer, - convert_to_index_sliceable) +from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.util._validators import validate_bool_kwarg import itertools @@ -22,20 +26,23 @@ import sys import warnings -from .utils import (from_pandas, to_pandas, _inherit_docstrings) +from .utils import from_pandas, to_pandas, _inherit_docstrings from .iterator import PartitionIterator @_inherit_docstrings( - pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__]) + pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__] +) class DataFrame(object): - def __init__(self, - data=None, - index=None, - columns=None, - dtype=None, - copy=False, - data_manager=None): + def __init__( + self, + data=None, + index=None, + columns=None, + dtype=None, + copy=False, + data_manager=None, + ): """Distributed DataFrame object backed by Pandas dataframes. Args: @@ -60,11 +67,8 @@ def __init__(self, if data is not None or data_manager is None: pandas_df = pandas.DataFrame( - data=data, - index=index, - columns=columns, - dtype=dtype, - copy=copy) + data=data, index=index, columns=columns, dtype=dtype, copy=copy + ) self._data_manager = from_pandas(pandas_df)._data_manager else: @@ -120,9 +124,9 @@ def __repr__(self): result = repr(self._build_repr_df(num_rows, num_cols)) if len(self.index) > num_rows or len(self.columns) > num_cols: # The split here is so that we don't repr pandas row lengths. - return result.rsplit("\n\n", - 1)[0] + "\n\n[{0} rows x {1} columns]".format( - len(self.index), len(self.columns)) + return result.rsplit("\n\n", 1)[0] + "\n\n[{0} rows x {1} columns]".format( + len(self.index), len(self.columns) + ) else: return result @@ -142,9 +146,11 @@ def _repr_html_(self): result = self._build_repr_df(num_rows, num_cols)._repr_html_() if len(self.index) > num_rows or len(self.columns) > num_cols: # We split so that we insert our correct dataframe dimensions. - return result.split( - "

")[0] + "

{0} rows x {1} columns

\n".format( - len(self.index), len(self.columns)) + return result.split("

")[ + 0 + ] + "

{0} rows x {1} columns

\n".format( + len(self.index), len(self.columns) + ) else: return result @@ -193,15 +199,14 @@ def _validate_eval_query(self, expr, **kwargs): expr: The expression to evaluate. This string cannot contain any Python statements, only Python expressions. """ - if isinstance(expr, str) and expr is '': + if isinstance(expr, str) and expr is "": raise ValueError("expr cannot be an empty string") - if isinstance(expr, str) and '@' in expr: - raise NotImplementedError("Local variables not yet supported in " - "eval.") + if isinstance(expr, str) and "@" in expr: + raise NotImplementedError("Local variables not yet supported in " "eval.") - if isinstance(expr, str) and 'not' in expr: - if 'parser' in kwargs and kwargs['parser'] == 'python': + if isinstance(expr, str) and "not" in expr: + if "parser" in kwargs and kwargs["parser"] == "python": raise NotImplementedError("'Not' nodes are not implemented.") @property @@ -316,8 +321,7 @@ def applymap(self, func): func (callable): The function to apply. """ if not callable(func): - raise ValueError("\'{0}\' object is not callable".format( - type(func))) + raise ValueError("'{0}' object is not callable".format(type(func))) return DataFrame(data_manager=self._data_manager.applymap(func)) @@ -329,15 +333,17 @@ def copy(self, deep=True): """ return DataFrame(data_manager=self._data_manager.copy()) - def groupby(self, - by=None, - axis=0, - level=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - **kwargs): + def groupby( + self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + **kwargs + ): """Apply a groupby to this DataFrame. See _groupby() remote task. Args: by: The value to groupby. @@ -359,26 +365,32 @@ def groupby(self, if isinstance(by, pandas.Series): by = by.values.tolist() - mismatch = len(by) != len(self) if axis == 0 \ - else len(by) != len(self.columns) + mismatch = ( + len(by) != len(self) if axis == 0 else len(by) != len(self.columns) + ) if all(obj in self for obj in by) and mismatch: raise NotImplementedError( - "Groupby with lists of columns not yet supported.") + "Groupby with lists of columns not yet supported." + ) elif mismatch: raise KeyError(next(x for x in by if x not in self)) from .groupby import DataFrameGroupBy - return DataFrameGroupBy(self, by, axis, level, as_index, sort, - group_keys, squeeze, **kwargs) - - def sum(self, - axis=None, - skipna=True, - level=None, - numeric_only=None, - min_count=1, - **kwargs): + + return DataFrameGroupBy( + self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs + ) + + def sum( + self, + axis=None, + skipna=True, + level=None, + numeric_only=None, + min_count=1, + **kwargs + ): """Perform a sum across the DataFrame. Args: @@ -388,8 +400,7 @@ def sum(self, Returns: The sum of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.sum( axis=axis, @@ -397,7 +408,8 @@ def sum(self, level=level, numeric_only=numeric_only, min_count=min_count, - **kwargs) + **kwargs + ) def abs(self): """Apply an absolute value function to all numeric columns. @@ -406,7 +418,7 @@ def abs(self): A new DataFrame with the applied absolute value. """ for t in self.dtypes: - if np.dtype('O') == t: + if np.dtype("O") == t: # TODO Give a more accurate error to Pandas raise TypeError("bad operand type for abs():", "str") @@ -462,17 +474,11 @@ def transpose(self, *args, **kwargs): Returns: A new DataFrame transposed from this DataFrame. """ - return DataFrame( - data_manager=self._data_manager.transpose(*args, **kwargs)) + return DataFrame(data_manager=self._data_manager.transpose(*args, **kwargs)) T = property(transpose) - def dropna(self, - axis=0, - how='any', - thresh=None, - subset=None, - inplace=False): + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): """Create a new DataFrame from the removed NA values from this one. Args: @@ -498,8 +504,7 @@ def dropna(self, result = self for ax in axis: - result = result.dropna( - axis=ax, how=how, thresh=thresh, subset=subset) + result = result.dropna(axis=ax, how=how, thresh=thresh, subset=subset) if not inplace: return result @@ -508,10 +513,10 @@ def dropna(self, axis = pandas.DataFrame()._get_axis_number(axis) - if how is not None and how not in ['any', 'all']: - raise ValueError('invalid how option: %s' % how) + if how is not None and how not in ["any", "all"]: + raise ValueError("invalid how option: %s" % how) if how is None and thresh is None: - raise TypeError('must specify how or thresh') + raise TypeError("must specify how or thresh") if subset is not None: if axis == 1: @@ -526,14 +531,15 @@ def dropna(self, raise KeyError(list(np.compress(check, subset))) new_manager = self._data_manager.dropna( - axis=axis, how=how, thresh=thresh, subset=subset) + axis=axis, how=how, thresh=thresh, subset=subset + ) if not inplace: return DataFrame(data_manager=new_manager) else: self._update_inplace(new_manager=new_manager) - def add(self, other, axis='columns', level=None, fill_value=None): + def add(self, other, axis="columns", level=None, fill_value=None): """Add this DataFrame to another or a scalar/list. Args: @@ -547,12 +553,14 @@ def add(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the applied addition. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.add( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) def agg(self, func, axis=0, *args, **kwargs): @@ -570,16 +578,16 @@ def aggregate(self, func, axis=0, *args, **kwargs): pass if result is None: - kwargs.pop('is_transform', None) + kwargs.pop("is_transform", None) return self.apply(func, axis=axis, args=args, **kwargs) return result def _aggregate(self, arg, *args, **kwargs): - _axis = kwargs.pop('_axis', None) + _axis = kwargs.pop("_axis", None) if _axis is None: - _axis = getattr(self, 'axis', 0) - kwargs.pop('_level', None) + _axis = getattr(self, "axis", 0) + kwargs.pop("_level", None) if isinstance(arg, string_types): return self._string_function(arg, *args, **kwargs) @@ -588,7 +596,8 @@ def _aggregate(self, arg, *args, **kwargs): elif isinstance(arg, dict): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) elif is_list_like(arg) or callable(arg): return self.apply(arg, axis=_axis, args=args, **kwargs) else: @@ -605,9 +614,9 @@ def _string_function(self, func, *args, **kwargs): return f(*args, **kwargs) assert len(args) == 0 - assert len([ - kwarg for kwarg in kwargs if kwarg not in ['axis', '_level'] - ]) == 0 + assert ( + len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 + ) return f f = getattr(np, func, None) @@ -616,20 +625,23 @@ def _string_function(self, func, *args, **kwargs): raise ValueError("{} is an unknown string function".format(func)) - def align(self, - other, - join='outer', - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None): + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def all(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs): """Return whether all elements are True over requested axis @@ -644,33 +656,25 @@ def all(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs): axis = None result = self._data_manager.all( - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs) + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) if axis is not None: return result else: return result.all() - def any(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): + def any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): """Return whether any elements are True over requested axis Note: If axis=None or axis=0, this call applies on the column partitions, otherwise operates on row partitions """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.any( - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs) + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) def append(self, other, ignore_index=False, verify_integrity=False): """Append another DataFrame/list/Series to this one. @@ -687,8 +691,10 @@ def append(self, other, ignore_index=False, verify_integrity=False): if isinstance(other, dict): other = pandas.Series(other) if other.name is None and not ignore_index: - raise TypeError('Can only append a Series if ignore_index=True' - ' or if the Series has a name') + raise TypeError( + "Can only append a Series if ignore_index=True" + " or if the Series has a name" + ) if other.name is None: index = None @@ -698,8 +704,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): index = pandas.Index([other.name], name=self.index.name) # Create a Modin DataFrame from this Series for ease of development - other = DataFrame( - pandas.DataFrame(other).T, index=index)._data_manager + other = DataFrame(pandas.DataFrame(other).T, index=index)._data_manager elif isinstance(other, list): if not isinstance(other[0], DataFrame): other = pandas.DataFrame(other) @@ -716,24 +721,20 @@ def append(self, other, ignore_index=False, verify_integrity=False): # We also do this first to ensure that we don't waste compute/memory. if verify_integrity and not ignore_index: appended_index = self.index.append(other.index) - is_valid = next( - (False for idx in appended_index.duplicated() if idx), True) + is_valid = next((False for idx in appended_index.duplicated() if idx), True) if not is_valid: - raise ValueError("Indexes have overlapping values: {}".format( - appended_index[appended_index.duplicated()])) + raise ValueError( + "Indexes have overlapping values: {}".format( + appended_index[appended_index.duplicated()] + ) + ) - data_manager = self._data_manager.concat( - 0, other, ignore_index=ignore_index) + data_manager = self._data_manager.concat(0, other, ignore_index=ignore_index) return DataFrame(data_manager=data_manager) - def apply(self, - func, - axis=0, - broadcast=False, - raw=False, - reduce=None, - args=(), - **kwds): + def apply( + self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds + ): """Apply a function along input axis of DataFrame. Args: @@ -750,23 +751,26 @@ def apply(self, if isinstance(func, string_types): if axis == 1: - kwds['axis'] = axis + kwds["axis"] = axis return getattr(self, func)(*args, **kwds) elif isinstance(func, dict): if axis == 1: - raise TypeError("(\"'dict' object is not callable\", " - "'occurred at index {0}'".format( - self.index[0])) + raise TypeError( + "(\"'dict' object is not callable\", " + "'occurred at index {0}'".format(self.index[0]) + ) if len(self.columns) != len(set(self.columns)): warnings.warn( - 'duplicate column names not supported with apply().', + "duplicate column names not supported with apply().", FutureWarning, - stacklevel=2) + stacklevel=2, + ) elif is_list_like(func): if axis == 1: - raise TypeError("(\"'list' object is not callable\", " - "'occurred at index {0}'".format( - self.index[0])) + raise TypeError( + "(\"'list' object is not callable\", " + "'occurred at index {0}'".format(self.index[0]) + ) elif not callable(func): return @@ -778,7 +782,8 @@ def apply(self, def as_blocks(self, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def as_matrix(self, columns=None): """Convert the frame to its Numpy-array representation. @@ -793,33 +798,32 @@ def as_matrix(self, columns=None): # TODO this is very inefficient, also see __array__ return to_pandas(self).as_matrix(columns) - def asfreq(self, - freq, - method=None, - how=None, - normalize=False, - fill_value=None): + def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def asof(self, where, subset=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def assign(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def astype(self, dtype, copy=True, errors='raise', **kwargs): + def astype(self, dtype, copy=True, errors="raise", **kwargs): col_dtypes = {} if isinstance(dtype, dict): - if (not set(dtype.keys()).issubset(set(self.columns)) - and errors == 'raise'): - raise KeyError("Only a column name can be used for the key in" - "a dtype mappings argument.") + if not set(dtype.keys()).issubset(set(self.columns)) and errors == "raise": + raise KeyError( + "Only a column name can be used for the key in" + "a dtype mappings argument." + ) col_dtypes = dtype else: @@ -835,25 +839,20 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): def at_time(self, time, asof=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def between_time(self, - start_time, - end_time, - include_start=True, - include_end=True): + def between_time(self, start_time, end_time, include_start=True, include_end=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def bfill(self, axis=None, inplace=False, limit=None, downcast=None): """Synonym for DataFrame.fillna(method='bfill')""" new_df = self.fillna( - method='bfill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) + method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace + ) if not inplace: return new_df @@ -865,89 +864,100 @@ def bool(self): element is not boolean """ shape = self.shape - if shape != (1, ) and shape != (1, 1): - raise ValueError("""The PandasObject does not have exactly + if shape != (1,) and shape != (1, 1): + raise ValueError( + """The PandasObject does not have exactly 1 element. Return the bool of a single element PandasObject. The truth value is ambiguous. Use a.empty, a.item(), a.any() - or a.all().""") + or a.all().""" + ) else: return to_pandas(self).bool() - def boxplot(self, - column=None, - by=None, - ax=None, - fontsize=None, - rot=0, - grid=True, - figsize=None, - layout=None, - return_type=None, - **kwds): + def boxplot( + self, + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, + **kwds + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def clip(self, - lower=None, - upper=None, - axis=None, - inplace=False, - *args, - **kwargs): + "github.com/modin-project/modin." + ) + + def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def clip_lower(self, threshold, axis=None, inplace=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def clip_upper(self, threshold, axis=None, inplace=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def combine(self, other, func, fill_value=None, overwrite=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def combine_first(self, other): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def compound(self, axis=None, skipna=None, level=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def consolidate(self, inplace=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def convert_objects(self, - convert_dates=True, - convert_numeric=False, - convert_timedeltas=True, - copy=True): + "github.com/modin-project/modin." + ) + + def convert_objects( + self, + convert_dates=True, + convert_numeric=False, + convert_timedeltas=True, + copy=True, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def corr(self, method='pearson', min_periods=1): + def corr(self, method="pearson", min_periods=1): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def corrwith(self, other, axis=0, drop=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def count(self, axis=0, level=None, numeric_only=False): """Get the count of non-null objects in the DataFrame. @@ -961,15 +971,16 @@ def count(self, axis=0, level=None, numeric_only=False): Returns: The count, in a Series (or DataFrame if level is specified). """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.count( - axis=axis, level=level, numeric_only=numeric_only) + axis=axis, level=level, numeric_only=numeric_only + ) def cov(self, min_periods=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def cummax(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative maximum across the DataFrame. @@ -981,11 +992,10 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative maximum of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return DataFrame( - data_manager=self._data_manager.cummax( - axis=axis, skipna=skipna, **kwargs)) + data_manager=self._data_manager.cummax(axis=axis, skipna=skipna, **kwargs) + ) def cummin(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative minimum across the DataFrame. @@ -997,11 +1007,10 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative minimum of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return DataFrame( - data_manager=self._data_manager.cummin( - axis=axis, skipna=skipna, **kwargs)) + data_manager=self._data_manager.cummin(axis=axis, skipna=skipna, **kwargs) + ) def cumprod(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative product across the DataFrame. @@ -1013,11 +1022,10 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative product of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return DataFrame( - data_manager=self._data_manager.cumprod( - axis=axis, skipna=skipna, **kwargs)) + data_manager=self._data_manager.cumprod(axis=axis, skipna=skipna, **kwargs) + ) def cumsum(self, axis=None, skipna=True, *args, **kwargs): """Perform a cumulative sum across the DataFrame. @@ -1029,11 +1037,10 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): Returns: The cumulative sum of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return DataFrame( - data_manager=self._data_manager.cumsum( - axis=axis, skipna=skipna, **kwargs)) + data_manager=self._data_manager.cumsum(axis=axis, skipna=skipna, **kwargs) + ) def describe(self, percentiles=None, include=None, exclude=None): """ @@ -1056,15 +1063,20 @@ def describe(self, percentiles=None, include=None, exclude=None): if exclude is None: exclude = "object" elif "object" not in include: - exclude = ([exclude] + "object") if isinstance( - exclude, str) else list(exclude) + "object" + exclude = ( + ([exclude] + "object") + if isinstance(exclude, str) + else list(exclude) + "object" + ) if percentiles is not None: pandas.DataFrame()._check_percentile(percentiles) return DataFrame( data_manager=self._data_manager.describe( - percentiles=percentiles, include=include, exclude=exclude)) + percentiles=percentiles, include=include, exclude=exclude + ) + ) def diff(self, periods=1, axis=0): """Finds the difference between elements on the axis requested @@ -1077,9 +1089,10 @@ def diff(self, periods=1, axis=0): DataFrame with the diff applied """ return DataFrame( - data_manager=self._data_manager.diff(periods=periods, axis=axis)) + data_manager=self._data_manager.diff(periods=periods, axis=axis) + ) - def div(self, other, axis='columns', level=None, fill_value=None): + def div(self, other, axis="columns", level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. Args: @@ -1092,15 +1105,17 @@ def div(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Divide applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.div( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def divide(self, other, axis='columns', level=None, fill_value=None): + def divide(self, other, axis="columns", level=None, fill_value=None): """Synonym for div. Args: @@ -1117,16 +1132,19 @@ def divide(self, other, axis='columns', level=None, fill_value=None): def dot(self, other): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def drop(self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors='raise'): + "github.com/modin-project/modin." + ) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): """Return new object with labels in requested axis removed. Args: labels: Index or column labels to drop. @@ -1148,16 +1166,19 @@ def drop(self, inplace = validate_bool_kwarg(inplace, "inplace") if labels is not None: if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and " - "'index'/'columns'") + raise ValueError( + "Cannot specify both 'labels' and " "'index'/'columns'" + ) axis = pandas.DataFrame()._get_axis_name(axis) axes = {axis: labels} elif index is not None or columns is not None: - axes, _ = pandas.DataFrame() \ - ._construct_axes_from_arguments((index, columns), {}) + axes, _ = pandas.DataFrame()._construct_axes_from_arguments( + (index, columns), {} + ) else: - raise ValueError("Need to specify at least one of 'labels', " - "'index' or 'columns'") + raise ValueError( + "Need to specify at least one of 'labels', " "'index' or 'columns'" + ) # TODO Clean up this error checking if "index" not in axes: @@ -1165,17 +1186,14 @@ def drop(self, elif axes["index"] is not None: if not is_list_like(axes["index"]): axes["index"] = [axes["index"]] - if errors == 'raise': - non_existant = [ - obj for obj in axes["index"] if obj not in self.index - ] + if errors == "raise": + non_existant = [obj for obj in axes["index"] if obj not in self.index] if len(non_existant): raise ValueError( - "labels {} not contained in axis".format(non_existant)) + "labels {} not contained in axis".format(non_existant) + ) else: - axes["index"] = [ - obj for obj in axes["index"] if obj in self.index - ] + axes["index"] = [obj for obj in axes["index"] if obj in self.index] # If the length is zero, we will just do nothing if not len(axes["index"]): axes["index"] = None @@ -1185,13 +1203,14 @@ def drop(self, elif axes["columns"] is not None: if not is_list_like(axes["columns"]): axes["columns"] = [axes["columns"]] - if errors == 'raise': + if errors == "raise": non_existant = [ obj for obj in axes["columns"] if obj not in self.columns ] if len(non_existant): raise ValueError( - "labels {} not contained in axis".format(non_existant)) + "labels {} not contained in axis".format(non_existant) + ) else: axes["columns"] = [ obj for obj in axes["columns"] if obj in self.columns @@ -1201,24 +1220,27 @@ def drop(self, axes["columns"] = None new_manager = self._data_manager.drop( - index=axes["index"], columns=axes["columns"]) + index=axes["index"], columns=axes["columns"] + ) if inplace: self._update_inplace(new_manager=new_manager) return DataFrame(data_manager=new_manager) - def drop_duplicates(self, subset=None, keep='first', inplace=False): + def drop_duplicates(self, subset=None, keep="first", inplace=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep="first"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def eq(self, other, axis='columns', level=None): + def eq(self, other, axis="columns", level=None): """Checks element-wise that this is equal to other. Args: @@ -1230,12 +1252,12 @@ def eq(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.eq( - other=other, axis=axis, level=level) + new_manager = self._data_manager.eq(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) def equals(self, other): @@ -1249,8 +1271,7 @@ def equals(self, other): # Copy into a Ray DataFrame to simplify logic below other = DataFrame(other) - if not self.index.equals(other.index) or not \ - self.columns.equals(other.columns): + if not self.index.equals(other.index) or not self.columns.equals(other.columns): return False return all(self.eq(other).all()) @@ -1313,45 +1334,48 @@ def eval(self, expr, inplace=False, **kwargs): else: return DataFrame(data_manager=result) - def ewm(self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - freq=None, - adjust=True, - ignore_na=False, - axis=0): + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + freq=None, + adjust=True, + ignore_na=False, + axis=0, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def expanding(self, min_periods=1, freq=None, center=False, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def ffill(self, axis=None, inplace=False, limit=None, downcast=None): """Synonym for DataFrame.fillna(method='ffill') """ new_df = self.fillna( - method='ffill', - axis=axis, - limit=limit, - downcast=downcast, - inplace=inplace) + method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace + ) if not inplace: return new_df - def fillna(self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - **kwargs): + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): """Fill NA/NaN values using the specified method. Args: @@ -1384,28 +1408,28 @@ def fillna(self, """ # TODO implement value passed as DataFrame if isinstance(value, pandas.DataFrame): - raise NotImplementedError("Passing a DataFrame as the value for " - "fillna is not yet supported.") + raise NotImplementedError( + "Passing a DataFrame as the value for " "fillna is not yet supported." + ) - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") - axis = pandas.DataFrame()._get_axis_number(axis) \ - if axis is not None \ - else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 if isinstance(value, (list, tuple)): - raise TypeError('"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__)) + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__) + ) if value is None and method is None: - raise ValueError('must specify a fill method or value') + raise ValueError("must specify a fill method or value") if value is not None and method is not None: - raise ValueError('cannot specify both a fill method and value') - if method is not None and method not in [ - 'backfill', 'bfill', 'pad', 'ffill' - ]: - expecting = 'pad (ffill) or backfill (bfill)' - msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\ - .format(expecting=expecting, method=method) + raise ValueError("cannot specify both a fill method and value") + if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]: + expecting = "pad (ffill) or backfill (bfill)" + msg = "Invalid fill method. Expecting {expecting}. Got {method}".format( + expecting=expecting, method=method + ) raise ValueError(msg) if isinstance(value, pandas.Series): @@ -1418,7 +1442,8 @@ def fillna(self, inplace=False, limit=limit, downcast=downcast, - **kwargs) + **kwargs + ) if inplace: self._update_inplace(new_manager=new_manager) @@ -1439,13 +1464,15 @@ def filter(self, items=None, like=None, regex=None, axis=None): """ nkw = com._count_not_none(items, like, regex) if nkw > 1: - raise TypeError('Keyword arguments `items`, `like`, or `regex` ' - 'are mutually exclusive') + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` " + "are mutually exclusive" + ) if nkw == 0: - raise TypeError('Must pass either `items`, `like`, or `regex`') + raise TypeError("Must pass either `items`, `like`, or `regex`") if axis is None: - axis = 'columns' # This is the default info axis for dataframes + axis = "columns" # This is the default info axis for dataframes axis = pandas.DataFrame()._get_axis_number(axis) labels = self.columns if axis else self.index @@ -1473,7 +1500,8 @@ def f(x): def first(self, offset): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def first_valid_index(self): """Return index for first non-NA/null value. @@ -1483,7 +1511,7 @@ def first_valid_index(self): """ return self._data_manager.first_valid_index() - def floordiv(self, other, axis='columns', level=None, fill_value=None): + def floordiv(self, other, axis="columns", level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. Args: @@ -1496,53 +1524,63 @@ def floordiv(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Divide applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.floordiv( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) @classmethod - def from_csv(self, - path, - header=0, - sep=', ', - index_col=0, - parse_dates=True, - encoding=None, - tupleize_cols=None, - infer_datetime_format=False): + def from_csv( + self, + path, + header=0, + sep=", ", + index_col=0, + parse_dates=True, + encoding=None, + tupleize_cols=None, + infer_datetime_format=False, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @classmethod - def from_dict(self, data, orient='columns', dtype=None): + def from_dict(self, data, orient="columns", dtype=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @classmethod - def from_items(self, items, columns=None, orient='columns'): + def from_items(self, items, columns=None, orient="columns"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @classmethod - def from_records(self, - data, - index=None, - exclude=None, - columns=None, - coerce_float=False, - nrows=None): + def from_records( + self, + data, + index=None, + exclude=None, + columns=None, + coerce_float=False, + nrows=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def ge(self, other, axis='columns', level=None): + def ge(self, other, axis="columns", level=None): """Checks element-wise that this is greater than or equal to other. Args: @@ -1554,12 +1592,12 @@ def ge(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.ge( - other=other, axis=axis, level=level) + new_manager = self._data_manager.ge(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) def get(self, key, default=None): @@ -1602,14 +1640,16 @@ def get_ftype_counts(self): def get_value(self, index, col, takeable=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def get_values(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def gt(self, other, axis='columns', level=None): + def gt(self, other, axis="columns", level=None): """Checks element-wise that this is greater than other. Args: @@ -1621,12 +1661,12 @@ def gt(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.gt( - other=other, axis=axis, level=level) + new_manager = self._data_manager.gt(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) def head(self, n=5): @@ -1643,25 +1683,28 @@ def head(self, n=5): return DataFrame(data_manager=self._data_manager.head(n)) - def hist(self, - data, - column=None, - by=None, - grid=True, - xlabelsize=None, - xrot=None, - ylabelsize=None, - yrot=None, - ax=None, - sharex=False, - sharey=False, - figsize=None, - layout=None, - bins=10, - **kwds): + def hist( + self, + data, + column=None, + by=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + ax=None, + sharex=False, + sharey=False, + figsize=None, + layout=None, + bins=10, + **kwds + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def idxmax(self, axis=0, skipna=True): """Get the index of the first occurrence of the max value of the axis. @@ -1674,9 +1717,8 @@ def idxmax(self, axis=0, skipna=True): A Series with the index for each maximum value for the axis specified. """ - if not all(d != np.dtype('O') for d in self.dtypes): - raise TypeError( - "reduction operation 'argmax' not allowed for this dtype") + if not all(d != np.dtype("O") for d in self.dtypes): + raise TypeError("reduction operation 'argmax' not allowed for this dtype") return self._data_manager.idxmax(axis=axis, skipna=skipna) @@ -1691,23 +1733,20 @@ def idxmin(self, axis=0, skipna=True): A Series with the index for each minimum value for the axis specified. """ - if not all(d != np.dtype('O') for d in self.dtypes): - raise TypeError( - "reduction operation 'argmax' not allowed for this dtype") + if not all(d != np.dtype("O") for d in self.dtypes): + raise TypeError("reduction operation 'argmax' not allowed for this dtype") return self._data_manager.idxmin(axis=axis, skipna=skipna) def infer_objects(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def info(self, - verbose=None, - buf=None, - max_cols=None, - memory_usage=None, - null_counts=None): + "github.com/modin-project/modin." + ) + + def info( + self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None + ): """Print a concise summary of a DataFrame, which includes the index dtype and column dtypes, non-null values and memory usage. @@ -1755,62 +1794,59 @@ def info(self, null_counts = False # Determine if actually verbose - actually_verbose = True if verbose and max_cols > len( - columns) else False + actually_verbose = True if verbose and max_cols > len(columns) else False - if type(memory_usage) == str and memory_usage == 'deep': + if type(memory_usage) == str and memory_usage == "deep": memory_usage_deep = True else: memory_usage_deep = False # Start putting together output # Class denoted in info() output - class_string = '\n' + class_string = "\n" # Create the Index info() string by parsing self.index - index_string = index.summary() + '\n' + index_string = index.summary() + "\n" if null_counts: counts = self._data_manager.count() if memory_usage: memory_usage_data = self._data_manager.memory_usage( - deep=memory_usage_deep, index=True) + deep=memory_usage_deep, index=True + ) if actually_verbose: # Create string for verbose output - col_string = 'Data columns (total {0} columns):\n' \ - .format(len(columns)) + col_string = "Data columns (total {0} columns):\n".format(len(columns)) for col, dtype in zip(columns, dtypes): - col_string += '{0}\t'.format(col) + col_string += "{0}\t".format(col) if null_counts: - col_string += '{0} not-null '.format(counts[col]) - col_string += '{0}\n'.format(dtype) + col_string += "{0} not-null ".format(counts[col]) + col_string += "{0}\n".format(dtype) else: # Create string for not verbose output - col_string = 'Columns: {0} entries, {1} to {2}\n'\ - .format(len(columns), columns[0], columns[-1]) + col_string = "Columns: {0} entries, {1} to {2}\n".format( + len(columns), columns[0], columns[-1] + ) # A summary of the dtypes in the dataframe dtypes_string = "dtypes: " for dtype, count in dtypes.value_counts().iteritems(): dtypes_string += "{0}({1}),".format(dtype, count) - dtypes_string = dtypes_string[:-1] + '\n' + dtypes_string = dtypes_string[:-1] + "\n" # Create memory usage string - memory_string = '' + memory_string = "" if memory_usage: if memory_usage_deep: - memory_string = 'memory usage: {0} bytes'.format( - memory_usage_data) + memory_string = "memory usage: {0} bytes".format(memory_usage_data) else: - memory_string = 'memory usage: {0}+ bytes'.format( - memory_usage_data) + memory_string = "memory usage: {0}+ bytes".format(memory_usage_data) # Combine all the components of the info() output - result = ''.join([ - class_string, index_string, col_string, dtypes_string, - memory_string - ]) + result = "".join( + [class_string, index_string, col_string, dtypes_string, memory_string] + ) # Write to specified output buffer buf.write(result) @@ -1830,29 +1866,33 @@ def insert(self, loc, column, value, allow_duplicates=False): if len(value) != len(self.index): raise ValueError("Length of values does not match length of index") if not allow_duplicates and column in self.columns: - raise ValueError( - "cannot insert {0}, already exists".format(column)) + raise ValueError("cannot insert {0}, already exists".format(column)) if loc > len(self.columns): raise IndexError( "index {0} is out of bounds for axis 0 with size {1}".format( - loc, len(self.columns))) + loc, len(self.columns) + ) + ) if loc < 0: raise ValueError("unbounded slice") new_manager = self._data_manager.insert(loc, column, value) self._update_inplace(new_manager=new_manager) - def interpolate(self, - method='linear', - axis=0, - limit=None, - inplace=False, - limit_direction='forward', - downcast=None, - **kwargs): + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + downcast=None, + **kwargs + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def iterrows(self): """Iterate over DataFrame rows as (index, Series) pairs. @@ -1872,8 +1912,7 @@ def iterrow_builder(df): df.index = [next(index_iter)] return df.iterrows() - partition_iterator = PartitionIterator(self._data_manager, 0, - iterrow_builder) + partition_iterator = PartitionIterator(self._data_manager, 0, iterrow_builder) for v in partition_iterator: yield v @@ -1896,8 +1935,7 @@ def items_builder(df): df.index = self.index return df.items() - partition_iterator = PartitionIterator(self._data_manager, 1, - items_builder) + partition_iterator = PartitionIterator(self._data_manager, 1, items_builder) for v in partition_iterator: yield v @@ -1913,7 +1951,7 @@ def iteritems(self): """ return self.items() - def itertuples(self, index=True, name='Pandas'): + def itertuples(self, index=True, name="Pandas"): """Iterate over DataFrame rows as namedtuples. Args: @@ -1936,19 +1974,14 @@ def itertuples_builder(df): df.index = [next(index_iter)] return df.itertuples(index=index, name=name) - partition_iterator = PartitionIterator(self._data_manager, 0, - itertuples_builder) + partition_iterator = PartitionIterator( + self._data_manager, 0, itertuples_builder + ) for v in partition_iterator: yield v - def join(self, - other, - on=None, - how='left', - lsuffix='', - rsuffix='', - sort=False): + def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): """Join two or more DataFrames, or a DataFrame with a collection. Args: @@ -1978,7 +2011,8 @@ def join(self, pandas.DataFrame(columns=self.columns).join( pandas.DataFrame(columns=other.columns), lsuffix=lsuffix, - rsuffix=rsuffix).columns + rsuffix=rsuffix, + ).columns return DataFrame( data_manager=self._data_manager.join( @@ -1986,18 +2020,22 @@ def join(self, how=how, lsuffix=lsuffix, rsuffix=rsuffix, - sort=sort)) + sort=sort, + ) + ) else: # This constraint carried over from Pandas. if on is not None: - raise ValueError("Joining multiple DataFrames only supported" - " for joining on index") + raise ValueError( + "Joining multiple DataFrames only supported" " for joining on index" + ) # See note above about error checking with an empty join. pandas.DataFrame(columns=self.columns).join( [pandas.DataFrame(columns=obj.columns) for obj in other], lsuffix=lsuffix, - rsuffix=rsuffix).columns + rsuffix=rsuffix, + ).columns return DataFrame( data_manager=self._data_manager.join( @@ -2005,32 +2043,27 @@ def join(self, how=how, lsuffix=lsuffix, rsuffix=rsuffix, - sort=sort)) - - def kurt(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + sort=sort, + ) + ) + + def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def kurtosis(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + "github.com/modin-project/modin." + ) + + def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def last(self, offset): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def last_valid_index(self): """Return index for last non-NA/null value. @@ -2040,7 +2073,7 @@ def last_valid_index(self): """ return self._data_manager.last_valid_index() - def le(self, other, axis='columns', level=None): + def le(self, other, axis="columns", level=None): """Checks element-wise that this is less than or equal to other. Args: @@ -2052,20 +2085,21 @@ def le(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.le( - other=other, axis=axis, level=level) + new_manager = self._data_manager.le(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) def lookup(self, row_labels, col_labels): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def lt(self, other, axis='columns', level=None): + def lt(self, other, axis="columns", level=None): """Checks element-wise that this is less than other. Args: @@ -2077,38 +2111,37 @@ def lt(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.lt( - other=other, axis=axis, level=level) + new_manager = self._data_manager.lt(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) def mad(self, axis=None, skipna=None, level=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def mask(self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - errors='raise', - try_cast=False, - raise_on_error=None): + "github.com/modin-project/modin." + ) + + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + raise_on_error=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def max(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + "github.com/modin-project/modin." + ) + + def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): """Perform max across the DataFrame. Args: @@ -2118,22 +2151,13 @@ def max(self, Returns: The max of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.max( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs) - - def mean(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs + ) + + def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): """Computes mean across the DataFrame. Args: @@ -2143,22 +2167,13 @@ def mean(self, Returns: The mean of the DataFrame. (Pandas series) """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.mean( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs) - - def median(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs + ) + + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): """Computes median across the DataFrame. Args: @@ -2168,24 +2183,23 @@ def median(self, Returns: The median of the DataFrame. (Pandas series) """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.median( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs) - - def melt(self, - id_vars=None, - value_vars=None, - var_name=None, - value_name='value', - col_level=None): + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs + ) + + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def memory_usage(self, index=True, deep=False): """Returns the memory usage of each column in bytes @@ -2206,23 +2220,25 @@ def memory_usage(self, index=True, deep=False): result.index = self.columns if index: index_value = self.index.memory_usage(deep=deep) - return pandas.Series(index_value, index=['Index']).append(result) + return pandas.Series(index_value, index=["Index"]).append(result) return result - def merge(self, - right, - how='inner', - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - sort=False, - suffixes=('_x', '_y'), - copy=True, - indicator=False, - validate=None): + def merge( + self, + right, + how="inner", + on=None, + left_on=None, + right_on=None, + left_index=False, + right_index=False, + sort=False, + suffixes=("_x", "_y"), + copy=True, + indicator=False, + validate=None, + ): """Database style join, where common columns in "on" are merged. Args: @@ -2247,28 +2263,23 @@ def merge(self, """ if not isinstance(right, DataFrame): - raise ValueError("can not merge DataFrame with instance of type " - "{}".format(type(right))) + raise ValueError( + "can not merge DataFrame with instance of type " + "{}".format(type(right)) + ) if left_index is False or right_index is False: raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) if left_index and right_index: return self.join( - right, - how=how, - lsuffix=suffixes[0], - rsuffix=suffixes[1], - sort=sort) - - def min(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + right, how=how, lsuffix=suffixes[0], rsuffix=suffixes[1], sort=sort + ) + + def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): """Perform min across the DataFrame. Args: @@ -2278,17 +2289,13 @@ def min(self, Returns: The min of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.min( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs) + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs + ) - def mod(self, other, axis='columns', level=None, fill_value=None): + def mod(self, other, axis="columns", level=None, fill_value=None): """Mods this DataFrame against another DataFrame/Series/scalar. Args: @@ -2301,12 +2308,14 @@ def mod(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Mod applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.mod( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) def mode(self, axis=0, numeric_only=False): @@ -2322,10 +2331,10 @@ def mode(self, axis=0, numeric_only=False): axis = pandas.DataFrame()._get_axis_number(axis) return DataFrame( - data_manager=self._data_manager.mode( - axis=axis, numeric_only=numeric_only)) + data_manager=self._data_manager.mode(axis=axis, numeric_only=numeric_only) + ) - def mul(self, other, axis='columns', level=None, fill_value=None): + def mul(self, other, axis="columns", level=None, fill_value=None): """Multiplies this DataFrame against another DataFrame/Series/scalar. Args: @@ -2338,15 +2347,17 @@ def mul(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Multiply applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.mul( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def multiply(self, other, axis='columns', level=None, fill_value=None): + def multiply(self, other, axis="columns", level=None, fill_value=None): """Synonym for mul. Args: @@ -2360,7 +2371,7 @@ def multiply(self, other, axis='columns', level=None, fill_value=None): """ return self.mul(other, axis, level, fill_value) - def ne(self, other, axis='columns', level=None): + def ne(self, other, axis="columns", level=None): """Checks element-wise that this is not equal to other. Args: @@ -2372,18 +2383,19 @@ def ne(self, other, axis='columns', level=None): A new DataFrame filled with Booleans. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) - new_manager = self._data_manager.ne( - other=other, axis=axis, level=level) + new_manager = self._data_manager.ne(other=other, axis=axis, level=level) return self._create_dataframe_from_manager(new_manager) - def nlargest(self, n, columns, keep='first'): + def nlargest(self, n, columns, keep="first"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def notna(self): """Perform notna across the DataFrame. @@ -2403,10 +2415,11 @@ def notnull(self): """ return DataFrame(data_manager=self._data_manager.notnull()) - def nsmallest(self, n, columns, keep='first'): + def nsmallest(self, n, columns, keep="first"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def nunique(self, axis=0, dropna=True): """Return Series with number of distinct @@ -2421,15 +2434,11 @@ def nunique(self, axis=0, dropna=True): """ return self._data_manager.nunique(axis=axis, dropna=dropna) - def pct_change(self, - periods=1, - fill_method='pad', - limit=None, - freq=None, - **kwargs): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def pipe(self, func, *args, **kwargs): """Apply func(self, *args, **kwargs) @@ -2447,55 +2456,62 @@ def pipe(self, func, *args, **kwargs): def pivot(self, index=None, columns=None, values=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def pivot_table(self, - values=None, - index=None, - columns=None, - aggfunc='mean', - fill_value=None, - margins=False, - dropna=True, - margins_name='All'): + "github.com/modin-project/modin." + ) + + def pivot_table( + self, + values=None, + index=None, + columns=None, + aggfunc="mean", + fill_value=None, + margins=False, + dropna=True, + margins_name="All", + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def plot(self, - x=None, - y=None, - kind='line', - ax=None, - subplots=False, - sharex=None, - sharey=False, - layout=None, - figsize=None, - use_index=True, - title=None, - grid=None, - legend=True, - style=None, - logx=False, - logy=False, - loglog=False, - xticks=None, - yticks=None, - xlim=None, - ylim=None, - rot=None, - fontsize=None, - colormap=None, - table=False, - yerr=None, - xerr=None, - secondary_y=False, - sort_columns=False, - **kwds): + "github.com/modin-project/modin." + ) + + def plot( + self, + x=None, + y=None, + kind="line", + ax=None, + subplots=False, + sharex=None, + sharey=False, + layout=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=True, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + secondary_y=False, + sort_columns=False, + **kwds + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def pop(self, item): """Pops an item from this DataFrame and returns it. @@ -2511,7 +2527,7 @@ def pop(self, item): del self[item] return result - def pow(self, other, axis='columns', level=None, fill_value=None): + def pow(self, other, axis="columns", level=None, fill_value=None): """Pow this DataFrame against another DataFrame/Series/scalar. Args: @@ -2524,21 +2540,25 @@ def pow(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Pow applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.pow( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def prod(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=1, - **kwargs): + def prod( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=1, + **kwargs + ): """Return the product of the values for the requested axis Args: @@ -2551,8 +2571,7 @@ def prod(self, Returns: prod : Series or DataFrame (if level specified) """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.prod( axis=axis, @@ -2560,15 +2579,18 @@ def prod(self, level=level, numeric_only=numeric_only, min_count=min_count, - **kwargs) - - def product(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=1, - **kwargs): + **kwargs + ) + + def product( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=1, + **kwargs + ): """Return the product of the values for the requested axis Args: @@ -2587,13 +2609,10 @@ def product(self, level=level, numeric_only=numeric_only, min_count=min_count, - **kwargs) + **kwargs + ) - def quantile(self, - q=0.5, - axis=0, - numeric_only=True, - interpolation='linear'): + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): """Return values at the given quantile over requested axis, a la numpy.percentile. @@ -2616,14 +2635,13 @@ def quantile(self, """ def check_dtype(t): - return (is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t)) + return is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t) if not numeric_only: # If not numeric_only and columns, then check all columns are either # numeric, timestamp, or timedelta if not axis and not all(check_dtype(t) for t in self.dtypes): - raise TypeError("can't multiply sequence by non-int of type " - "'float'") + raise TypeError("can't multiply sequence by non-int of type " "'float'") # If over rows, then make sure that all dtypes are equal for not # numeric_only @@ -2634,7 +2652,9 @@ def check_dtype(t): if not is_dtype_equal(pre_dtype, curr_dtype): raise TypeError( "Cannot compare type '{0}' with type '{1}'".format( - pre_dtype, curr_dtype)) + pre_dtype, curr_dtype + ) + ) else: # Normally pandas returns this near the end of the quantile, but we # can't afford the overhead of running the entire operation before @@ -2653,14 +2673,14 @@ def check_dtype(t): q=q, axis=axis, numeric_only=numeric_only, - interpolation=interpolation)) + interpolation=interpolation, + ) + ) else: return self._data_manager.quantile_for_single_value( - q=q, - axis=axis, - numeric_only=numeric_only, - interpolation=interpolation) + q=q, axis=axis, numeric_only=numeric_only, interpolation=interpolation + ) def query(self, expr, inplace=False, **kwargs): """Queries the Dataframe with a boolean expression @@ -2678,16 +2698,18 @@ def query(self, expr, inplace=False, **kwargs): else: return DataFrame(data_manager=new_manager) - def radd(self, other, axis='columns', level=None, fill_value=None): + def radd(self, other, axis="columns", level=None, fill_value=None): return self.add(other, axis, level, fill_value) - def rank(self, - axis=0, - method='average', - numeric_only=None, - na_option='keep', - ascending=True, - pct=False): + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): """ Compute numerical data ranks (1 through n) along axis. Equal values are assigned a rank that is the [method] of @@ -2718,9 +2740,11 @@ def rank(self, numeric_only=numeric_only, na_option=na_option, ascending=ascending, - pct=pct)) + pct=pct, + ) + ) - def rdiv(self, other, axis='columns', level=None, fill_value=None): + def rdiv(self, other, axis="columns", level=None, fill_value=None): """Div this DataFrame against another DataFrame/Series/scalar. Args: @@ -2733,33 +2757,37 @@ def rdiv(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the rdiv applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.rdiv( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def reindex(self, - labels=None, - index=None, - columns=None, - axis=None, - method=None, - copy=True, - level=None, - fill_value=np.nan, - limit=None, - tolerance=None): + def reindex( + self, + labels=None, + index=None, + columns=None, + axis=None, + method=None, + copy=True, + level=None, + fill_value=np.nan, + limit=None, + tolerance=None, + ): if level is not None: raise NotImplementedError( "Multilevel Index not Implemented. " "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 if axis == 0 and labels is not None: index = labels elif labels is not None: @@ -2772,7 +2800,8 @@ def reindex(self, method=method, fill_value=fill_value, limit=limit, - tolerance=tolerance) + tolerance=tolerance, + ) else: new_manager = self._data_manager @@ -2783,7 +2812,8 @@ def reindex(self, method=method, fill_value=fill_value, limit=limit, - tolerance=tolerance) + tolerance=tolerance, + ) else: final_manager = new_manager @@ -2792,36 +2822,37 @@ def reindex(self, self._update_inplace(new_manager=final_manager) - def reindex_axis(self, - labels, - axis=0, - method=None, - level=None, - copy=True, - limit=None, - fill_value=np.nan): + def reindex_axis( + self, + labels, + axis=0, + method=None, + level=None, + copy=True, + limit=None, + fill_value=np.nan, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def reindex_like(self, - other, - method=None, - copy=True, - limit=None, - tolerance=None): + "github.com/modin-project/modin." + ) + + def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def rename(self, - mapper=None, - index=None, - columns=None, - axis=None, - copy=True, - inplace=False, - level=None): + "github.com/modin-project/modin." + ) + + def rename( + self, + mapper=None, + index=None, + columns=None, + axis=None, + copy=True, + inplace=False, + level=None, + ): """Alters axes labels. Args: @@ -2835,19 +2866,16 @@ def rename(self, Returns: If inplace is False, a new DataFrame with the updated axes. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # We have to do this with the args because of how rename handles # kwargs. It doesn't ignore None values passed in, so we have to filter # them ourselves. args = locals() - kwargs = { - k: v - for k, v in args.items() if v is not None and k != "self" - } + kwargs = {k: v for k, v in args.items() if v is not None and k != "self"} # inplace should always be true because this is just a copy, and we # will use the results after. - kwargs['inplace'] = True + kwargs["inplace"] = True df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns) df_to_rename.rename(**kwargs) @@ -2897,44 +2925,48 @@ def _set_axis_name(self, name, axis=0, inplace=False): def reorder_levels(self, order, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def replace(self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method='pad', - axis=None): + "github.com/modin-project/modin." + ) + + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + axis=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def resample(self, - rule, - how=None, - axis=0, - fill_method=None, - closed=None, - label=None, - convention='start', - kind=None, - loffset=None, - limit=None, - base=0, - on=None, - level=None): + "github.com/modin-project/modin." + ) + + def resample( + self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention="start", + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def reset_index(self, - level=None, - drop=False, - inplace=False, - col_level=0, - col_fill=''): + "github.com/modin-project/modin." + ) + + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): """Reset this index to default and create column from current index. Args: @@ -2956,7 +2988,7 @@ def reset_index(self, # TODO Implement level if level is not None: raise NotImplementedError("Level not yet supported!") - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") # Error checking for matching Pandas. Pandas does not allow you to # insert a dropped index into a DataFrame if these columns already @@ -2970,27 +3002,30 @@ def reset_index(self, else: return DataFrame(data_manager=new_manager) - def rfloordiv(self, other, axis='columns', level=None, fill_value=None): + def rfloordiv(self, other, axis="columns", level=None, fill_value=None): return self.floordiv(other, axis, level, fill_value) - def rmod(self, other, axis='columns', level=None, fill_value=None): + def rmod(self, other, axis="columns", level=None, fill_value=None): return self.mod(other, axis, level, fill_value) - def rmul(self, other, axis='columns', level=None, fill_value=None): + def rmul(self, other, axis="columns", level=None, fill_value=None): return self.mul(other, axis, level, fill_value) - def rolling(self, - window, - min_periods=None, - freq=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None): + def rolling( + self, + window, + min_periods=None, + freq=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def round(self, decimals=0, *args, **kwargs): """Round each element in the DataFrame. @@ -3002,9 +3037,10 @@ def round(self, decimals=0, *args, **kwargs): A new DataFrame. """ return DataFrame( - data_manager=self._data_manager.round(decimals=decimals, **kwargs)) + data_manager=self._data_manager.round(decimals=decimals, **kwargs) + ) - def rpow(self, other, axis='columns', level=None, fill_value=None): + def rpow(self, other, axis="columns", level=None, fill_value=None): """Pow this DataFrame against another DataFrame/Series/scalar. Args: @@ -3017,16 +3053,18 @@ def rpow(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Pow applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.rpow( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def rsub(self, other, axis='columns', level=None, fill_value=None): + def rsub(self, other, axis="columns", level=None, fill_value=None): """Subtract a DataFrame/Series/scalar from this DataFrame. Args: @@ -3039,24 +3077,28 @@ def rsub(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the subtraciont applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.rsub( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def rtruediv(self, other, axis='columns', level=None, fill_value=None): + def rtruediv(self, other, axis="columns", level=None, fill_value=None): return self.truediv(other, axis, level, fill_value) - def sample(self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None): + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ): """Returns a random sample of items from an axis of object. Args: @@ -3082,8 +3124,7 @@ def sample(self, A new Dataframe """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 if axis == 0: axis_labels = self.index @@ -3106,25 +3147,26 @@ def sample(self, try: weights = self[weights] except KeyError: - raise KeyError("String passed to weights not a " - "valid column") + raise KeyError("String passed to weights not a " "valid column") else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) - weights = pandas.Series(weights, dtype='float64') + weights = pandas.Series(weights, dtype="float64") if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") + raise ValueError( + "Weights and axis to be sampled must be of " "same length" + ) if (weights == np.inf).any() or (weights == -np.inf).any(): raise ValueError("weight vector may not include `inf` values") if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") + raise ValueError("weight vector many not include negative " "values") # weights cannot be NaN when sampling, so we must set all nan # values to 0 @@ -3154,11 +3196,11 @@ def sample(self, elif n is not None and frac is not None: # Pandas specification does not allow both n and frac to be passed # in - raise ValueError('Please enter a value for `frac` OR `n`, not ' - 'both') + raise ValueError("Please enter a value for `frac` OR `n`, not " "both") if n < 0: - raise ValueError("A negative number of rows requested. Please " - "provide positive value.") + raise ValueError( + "A negative number of rows requested. Please " "provide positive value." + ) if n == 0: # An Empty DataFrame is returned if the number of samples is 0. @@ -3166,7 +3208,8 @@ def sample(self, # depending on which axis is passed in. return DataFrame( columns=[] if axis == 1 else self.columns, - index=self.index if axis == 1 else []) + index=self.index if axis == 1 else [], + ) if random_state is not None: # Get a random number generator depending on the type of @@ -3177,18 +3220,22 @@ def sample(self, random_num_gen = random_state else: # random_state must be an int or a numpy RandomState object - raise ValueError("Please enter an `int` OR a " - "np.random.RandomState for random_state") + raise ValueError( + "Please enter an `int` OR a " + "np.random.RandomState for random_state" + ) # choose random numbers and then get corresponding labels from # chosen axis sample_indices = random_num_gen.choice( - np.arange(0, axis_length), size=n, replace=replace) + np.arange(0, axis_length), size=n, replace=replace + ) samples = axis_labels[sample_indices] else: # randomly select labels from chosen axis samples = np.random.choice( - a=axis_labels, size=n, replace=replace, p=weights) + a=axis_labels, size=n, replace=replace, p=weights + ) if axis == 1: data_manager = self._data_manager.getitem_col_array(samples) @@ -3200,7 +3247,8 @@ def sample(self, def select(self, crit, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def select_dtypes(self, include=None, exclude=None): # Validates arguments for whether both include and exclude are None or @@ -3219,8 +3267,7 @@ def select_dtypes(self, include=None, exclude=None): sel = tuple(map(set, (include, exclude))) - include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)), - sel) + include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)), sel) include_these = pandas.Series(not bool(include), index=self.columns) exclude_these = pandas.Series(not bool(exclude), index=self.columns) @@ -3228,8 +3275,9 @@ def select_dtypes(self, include=None, exclude=None): def is_dtype_instance_mapper(column, dtype): return column, functools.partial(issubclass, dtype.type) - for column, f in itertools.starmap(is_dtype_instance_mapper, - self.dtypes.iteritems()): + for column, f in itertools.starmap( + is_dtype_instance_mapper, self.dtypes.iteritems() + ): if include: # checks for the case of empty include or exclude include_these[column] = any(map(f, include)) if exclude: @@ -3237,21 +3285,17 @@ def is_dtype_instance_mapper(column, dtype): dtype_indexer = include_these & exclude_these indicate = [ - i for i in range(len(dtype_indexer.values)) - if not dtype_indexer.values[i] + i for i in range(len(dtype_indexer.values)) if not dtype_indexer.values[i] ] return self.drop(columns=self.columns[indicate], inplace=False) - def sem(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def set_axis(self, labels, axis=0, inplace=None): """Assign desired index to given axis. @@ -3268,19 +3312,21 @@ def set_axis(self, labels, axis=0, inplace=None): warnings.warn( 'set_axis now takes "labels" as first argument, and ' '"axis" as named parameter. The old form, with "axis" as ' - 'first parameter and \"labels\" as second, is still supported ' - 'but will be deprecated in a future version of pandas.', + 'first parameter and "labels" as second, is still supported ' + "but will be deprecated in a future version of pandas.", FutureWarning, - stacklevel=2) + stacklevel=2, + ) labels, axis = axis, labels if inplace is None: warnings.warn( - 'set_axis currently defaults to operating inplace.\nThis ' - 'will change in a future version of pandas, use ' - 'inplace=True to avoid this warning.', + "set_axis currently defaults to operating inplace.\nThis " + "will change in a future version of pandas, use " + "inplace=True to avoid this warning.", FutureWarning, - stacklevel=2) + stacklevel=2, + ) inplace = True if inplace: setattr(self, pandas.DataFrame()._get_axis_name(axis), labels) @@ -3289,12 +3335,9 @@ def set_axis(self, labels, axis=0, inplace=None): obj.set_axis(labels, axis=axis, inplace=True) return obj - def set_index(self, - keys, - drop=True, - append=False, - inplace=False, - verify_integrity=False): + def set_index( + self, keys, drop=True, append=False, inplace=False, verify_integrity=False + ): """Set the DataFrame index using one or more existing columns. Args: @@ -3309,7 +3352,7 @@ def set_index(self, Returns: If inplace is set to false returns a new DataFrame, otherwise None. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if not isinstance(keys, list): keys = [keys] @@ -3358,7 +3401,7 @@ def set_index(self, if verify_integrity and not index.is_unique: duplicates = index.get_duplicates() - raise ValueError('Index has duplicate keys: %s' % duplicates) + raise ValueError("Index has duplicate keys: %s" % duplicates) for c in to_remove: del frame[c] @@ -3374,19 +3417,16 @@ def set_index(self, def set_value(self, index, col, value, takeable=False): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def shift(self, periods=1, freq=None, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def skew(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + "github.com/modin-project/modin." + ) + + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): """Return unbiased skew over requested axis Normalized by N-1 Args: @@ -3400,26 +3440,26 @@ def skew(self, skew : Series or DataFrame (if level specified) """ return self._data_manager.skew( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs) + axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs + ) def slice_shift(self, periods=1, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def sort_index(self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind='quicksort', - na_position='last', - sort_remaining=True, - by=None): + "github.com/modin-project/modin." + ) + + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + by=None, + ): """Sort a DataFrame by one of the indices (columns or index). Args: @@ -3443,11 +3483,11 @@ def sort_index(self, "by argument to sort_index is deprecated, " "please use .sort_values(by=...)", FutureWarning, - stacklevel=2) + stacklevel=2, + ) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values( - by, axis=axis, ascending=ascending, inplace=inplace) + return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) axis = pandas.DataFrame()._get_axis_number(axis) @@ -3460,13 +3500,15 @@ def sort_index(self, return self.reindex(index=new_index, columns=new_columns) - def sort_values(self, - by, - axis=0, - ascending=True, - inplace=False, - kind='quicksort', - na_position='last'): + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): """Sorts by a column/row or list of columns/rows. Args: @@ -3490,55 +3532,53 @@ def sort_values(self, # TODO create a more efficient way to sort if axis == 0: broadcast_value_dict = {col: self[col] for col in by} - broadcast_values = pandas.DataFrame( - broadcast_value_dict, index=self.index) + broadcast_values = pandas.DataFrame(broadcast_value_dict, index=self.index) new_index = broadcast_values.sort_values( - by=by, axis=axis, ascending=ascending, kind=kind).index + by=by, axis=axis, ascending=ascending, kind=kind + ).index return self.reindex(index=new_index) else: broadcast_value_list = [ - to_pandas(self[row::len(self.index)]) for row in by + to_pandas(self[row :: len(self.index)]) for row in by ] index_builder = list(zip(broadcast_value_list, by)) - broadcast_values = \ - pandas.concat([row for row, idx in index_builder], copy=False) + broadcast_values = pandas.concat( + [row for row, idx in index_builder], copy=False + ) broadcast_values.columns = self.columns new_columns = broadcast_values.sort_values( - by=by, axis=axis, ascending=ascending, kind=kind).columns + by=by, axis=axis, ascending=ascending, kind=kind + ).columns return self.reindex(columns=new_columns) - def sortlevel(self, - level=0, - axis=0, - ascending=True, - inplace=False, - sort_remaining=True): + def sortlevel( + self, level=0, axis=0, ascending=True, inplace=False, sort_remaining=True + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def squeeze(self, axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def stack(self, level=-1, dropna=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def std(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + "github.com/modin-project/modin." + ) + + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): """Computes standard deviation across the DataFrame. Args: @@ -3549,8 +3589,7 @@ def std(self, Returns: The std of the DataFrame (Pandas Series) """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.std( axis=axis, @@ -3558,9 +3597,10 @@ def std(self, level=level, ddof=ddof, numeric_only=numeric_only, - **kwargs) + **kwargs + ) - def sub(self, other, axis='columns', level=None, fill_value=None): + def sub(self, other, axis="columns", level=None, fill_value=None): """Subtract a DataFrame/Series/scalar from this DataFrame. Args: @@ -3573,15 +3613,17 @@ def sub(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the subtraciont applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.sub( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) - def subtract(self, other, axis='columns', level=None, fill_value=None): + def subtract(self, other, axis="columns", level=None, fill_value=None): """Alias for sub. Args: @@ -3598,12 +3640,14 @@ def subtract(self, other, axis='columns', level=None, fill_value=None): def swapaxes(self, axis1, axis2, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def swaplevel(self, i=-2, j=-1, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def tail(self, n=5): """Get the last n rows of the DataFrame. @@ -3622,209 +3666,257 @@ def tail(self, n=5): def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def to_clipboard(self, excel=None, sep=None, **kwargs): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_clipboard(excel, sep, **kwargs) - def to_csv(self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression=None, - quoting=None, - quotechar='"', - line_terminator="\n", - chunksize=None, - tupleize_cols=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal="."): + def to_csv( + self, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression=None, + quoting=None, + quotechar='"', + line_terminator="\n", + chunksize=None, + tupleize_cols=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + ): kwargs = { - 'path_or_buf': path_or_buf, - 'sep': sep, - 'na_rep': na_rep, - 'float_format': float_format, - 'columns': columns, - 'header': header, - 'index': index, - 'index_label': index_label, - 'mode': mode, - 'encoding': encoding, - 'compression': compression, - 'quoting': quoting, - 'quotechar': quotechar, - 'line_terminator': line_terminator, - 'chunksize': chunksize, - 'tupleize_cols': tupleize_cols, - 'date_format': date_format, - 'doublequote': doublequote, - 'escapechar': escapechar, - 'decimal': decimal + "path_or_buf": path_or_buf, + "sep": sep, + "na_rep": na_rep, + "float_format": float_format, + "columns": columns, + "header": header, + "index": index, + "index_label": index_label, + "mode": mode, + "encoding": encoding, + "compression": compression, + "quoting": quoting, + "quotechar": quotechar, + "line_terminator": line_terminator, + "chunksize": chunksize, + "tupleize_cols": tupleize_cols, + "date_format": date_format, + "doublequote": doublequote, + "escapechar": escapechar, + "decimal": decimal, } - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) return to_pandas(self).to_csv(**kwargs) def to_dense(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_dict(self, orient='dict', into=dict): + def to_dict(self, orient="dict", into=dict): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def to_excel(self, - excel_writer, - sheet_name='Sheet1', - na_rep='', - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep='inf', - verbose=True, - freeze_panes=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + "github.com/modin-project/modin." + ) + + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_excel(excel_writer, sheet_name, na_rep, float_format, - columns, header, index, index_label, startrow, - startcol, engine, merge_cells, encoding, inf_rep, - verbose, freeze_panes) + port_frame.to_excel( + excel_writer, + sheet_name, + na_rep, + float_format, + columns, + header, + index, + index_label, + startrow, + startcol, + engine, + merge_cells, + encoding, + inf_rep, + verbose, + freeze_panes, + ) def to_feather(self, fname): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_feather(fname) - def to_gbq(self, - destination_table, - project_id, - chunksize=10000, - verbose=True, - reauth=False, - if_exists='fail', - private_key=None): + def to_gbq( + self, + destination_table, + project_id, + chunksize=10000, + verbose=True, + reauth=False, + if_exists="fail", + private_key=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def to_hdf(self, path_or_buf, key, **kwargs): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_hdf(path_or_buf, key, **kwargs) - def to_html(self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep='np.NaN', - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - justify=None, - bold_rows=True, - classes=None, - escape=True, - max_rows=None, - max_cols=None, - show_dimensions=False, - notebook=False, - decimal='.', - border=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + def to_html( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="np.NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + bold_rows=True, + classes=None, + escape=True, + max_rows=None, + max_cols=None, + show_dimensions=False, + notebook=False, + decimal=".", + border=None, + ): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_html(buf, columns, col_space, header, index, na_rep, - formatters, float_format, sparsify, index_names, - justify, bold_rows, classes, escape, max_rows, - max_cols, show_dimensions, notebook, decimal, - border) - - def to_json(self, - path_or_buf=None, - orient=None, - date_format=None, - double_precision=10, - force_ascii=True, - date_unit='ms', - default_handler=None, - lines=False, - compression=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + port_frame.to_html( + buf, + columns, + col_space, + header, + index, + na_rep, + formatters, + float_format, + sparsify, + index_names, + justify, + bold_rows, + classes, + escape, + max_rows, + max_cols, + show_dimensions, + notebook, + decimal, + border, + ) + + def to_json( + self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression=None, + ): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_json(path_or_buf, orient, date_format, double_precision, - force_ascii, date_unit, default_handler, lines, - compression) - - def to_latex(self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep='np.NaN', - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - bold_rows=False, - column_format=None, - longtable=None, - escape=None, - encoding=None, - decimal='.', - multicolumn=None, - multicolumn_format=None, - multirow=None): + port_frame.to_json( + path_or_buf, + orient, + date_format, + double_precision, + force_ascii, + date_unit, + default_handler, + lines, + compression, + ) + + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="np.NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): + def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_msgpack(path_or_buf, encoding, **kwargs) @@ -3832,12 +3924,12 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): def to_panel(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_parquet(self, fname, engine='auto', compression='snappy', **kwargs): + def to_parquet(self, fname, engine="auto", compression="snappy", **kwargs): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_parquet(fname, engine, compression, **kwargs) @@ -3845,15 +3937,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy', **kwargs): def to_period(self, freq=None, axis=0, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_pickle(self, - path, - compression='infer', - protocol=pkl.HIGHEST_PROTOCOL): + def to_pickle(self, path, compression="infer", protocol=pkl.HIGHEST_PROTOCOL): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) port_frame.to_pickle(path, compression, protocol) @@ -3861,77 +3950,95 @@ def to_pickle(self, def to_records(self, index=True, convert_datetime64=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_sparse(self, fill_value=None, kind='block'): + def to_sparse(self, fill_value=None, kind="block"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def to_sql(self, - name, - con, - flavor=None, - schema=None, - if_exists='fail', - index=True, - index_label=None, - chunksize=None, - dtype=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + "github.com/modin-project/modin." + ) + + def to_sql( + self, + name, + con, + flavor=None, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + ): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_sql(name, con, flavor, schema, if_exists, index, - index_label, chunksize, dtype) - - def to_stata(self, - fname, - convert_dates=None, - write_index=True, - encoding='latin-1', - byteorder=None, - time_stamp=None, - data_label=None, - variable_labels=None): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + port_frame.to_sql( + name, con, flavor, schema, if_exists, index, index_label, chunksize, dtype + ) + + def to_stata( + self, + fname, + convert_dates=None, + write_index=True, + encoding="latin-1", + byteorder=None, + time_stamp=None, + data_label=None, + variable_labels=None, + ): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = to_pandas(self) - port_frame.to_stata(fname, convert_dates, write_index, encoding, - byteorder, time_stamp, data_label, variable_labels) - - def to_string(self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep='np.NaN', - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - justify=None, - line_width=None, - max_rows=None, - max_cols=None, - show_dimensions=False): + port_frame.to_stata( + fname, + convert_dates, + write_index, + encoding, + byteorder, + time_stamp, + data_label, + variable_labels, + ) + + def to_string( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="np.NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + line_width=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def to_timestamp(self, freq=None, how='start', axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def to_xarray(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def transform(self, func, *args, **kwargs): kwargs["is_transform"] = True @@ -3943,7 +4050,7 @@ def transform(self, func, *args, **kwargs): raise ValueError("transforms cannot produce aggregated results") return result - def truediv(self, other, axis='columns', level=None, fill_value=None): + def truediv(self, other, axis="columns", level=None, fill_value=None): """Divides this DataFrame against another DataFrame/Series/scalar. Args: @@ -3956,46 +4063,49 @@ def truediv(self, other, axis='columns', level=None, fill_value=None): A new DataFrame with the Divide applied. """ if level is not None: - raise NotImplementedError("Mutlilevel index not yet supported " - "in Pandas on Ray") + raise NotImplementedError( + "Mutlilevel index not yet supported " "in Pandas on Ray" + ) other = self._validate_other(other, axis) new_manager = self._data_manager.truediv( - other=other, axis=axis, level=level, fill_value=fill_value) + other=other, axis=axis, level=level, fill_value=fill_value + ) return self._create_dataframe_from_manager(new_manager) def truncate(self, before=None, after=None, axis=None, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def tshift(self, periods=1, freq=None, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def tz_convert(self, tz, axis=0, level=None, copy=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise'): + def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def unstack(self, level=-1, fill_value=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - - def update(self, - other, - join='left', - overwrite=True, - filter_func=None, - raise_conflict=False): + "github.com/modin-project/modin." + ) + + def update( + self, other, join="left", overwrite=True, filter_func=None, raise_conflict=False + ): """Modify DataFrame in place using non-NA values from other. Args: @@ -4013,7 +4123,8 @@ def update(self, raise NotImplementedError( "raise_conflict parameter not yet supported. " "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) if not isinstance(other, DataFrame): other = DataFrame(other) @@ -4023,16 +4134,13 @@ def update(self, join=join, overwrite=overwrite, filter_func=filter_func, - raise_conflict=raise_conflict) + raise_conflict=raise_conflict, + ) self._update_inplace(new_manager=data_manager) - def var(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): """Computes variance across the DataFrame. Args: @@ -4043,8 +4151,7 @@ def var(self, Returns: The variance of the DataFrame. """ - axis = pandas.DataFrame()._get_axis_number( - axis) if axis is not None else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 return self._data_manager.var( axis=axis, @@ -4052,17 +4159,20 @@ def var(self, level=level, ddof=ddof, numeric_only=numeric_only, - **kwargs) - - def where(self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - errors='raise', - try_cast=False, - raise_on_error=None): + **kwargs + ) + + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + raise_on_error=None, + ): """Replaces values not meeting condition with values in other. Args: @@ -4081,26 +4191,25 @@ def where(self, A new DataFrame with the replaced values. """ - inplace = validate_bool_kwarg(inplace, 'inplace') + inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(other, pandas.Series) and axis is None: raise ValueError("Must specify axis=0 or 1") if level is not None: - raise NotImplementedError("Multilevel Index not yet supported on " - "Pandas on Ray.") + raise NotImplementedError( + "Multilevel Index not yet supported on " "Pandas on Ray." + ) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \ - else 0 + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 cond = cond(self) if callable(cond) else cond if not isinstance(cond, DataFrame): - if not hasattr(cond, 'shape'): + if not hasattr(cond, "shape"): cond = np.asanyarray(cond) if cond.shape != self.shape: - raise ValueError("Array conditional must be same shape as " - "self") + raise ValueError("Array conditional must be same shape as " "self") cond = DataFrame(cond, index=self.index, columns=self.columns) if isinstance(other, DataFrame): @@ -4113,7 +4222,8 @@ def where(self, other = pandas.Series(other, index=index) data_manager = self._data_manager.where( - cond._data_manager, other, axis=axis, level=level) + cond._data_manager, other, axis=axis, level=level + ) if inplace: self._update_inplace(new_manager=data_manager) else: @@ -4122,7 +4232,8 @@ def where(self, def xs(self, key, axis=0, level=None, drop_level=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __getitem__(self, key): """Get the column specified by key for this DataFrame. @@ -4145,20 +4256,23 @@ def __getitem__(self, key): # see if we can slice the rows # This lets us reuse code in Pandas to error check - indexer = convert_to_index_sliceable( - pandas.DataFrame(index=self.index), key) + indexer = convert_to_index_sliceable(pandas.DataFrame(index=self.index), key) if indexer is not None: return self._getitem_slice(indexer) if isinstance(key, (pandas.Series, np.ndarray, pandas.Index, list)): return self._getitem_array(key) elif isinstance(key, DataFrame): - raise NotImplementedError("To contribute to Pandas on Ray, please" - "visit github.com/modin-project/modin.") + raise NotImplementedError( + "To contribute to Pandas on Ray, please" + "visit github.com/modin-project/modin." + ) # return self._getitem_frame(key) elif is_mi_columns: - raise NotImplementedError("To contribute to Pandas on Ray, please" - "visit github.com/modin-project/modin.") + raise NotImplementedError( + "To contribute to Pandas on Ray, please" + "visit github.com/modin-project/modin." + ) # return self._getitem_multilevel(key) else: return self._getitem_column(key) @@ -4168,33 +4282,32 @@ def _getitem_column(self, key): def _getitem_array(self, key): if com.is_bool_indexer(key): - if isinstance(key, pandas.Series) and \ - not key.index.equals(self.index): + if isinstance(key, pandas.Series) and not key.index.equals(self.index): warnings.warn( - "Boolean Series key will be reindexed to match " - "DataFrame index.", + "Boolean Series key will be reindexed to match " "DataFrame index.", UserWarning, - stacklevel=3) + stacklevel=3, + ) elif len(key) != len(self.index): - raise ValueError('Item wrong length {} instead of {}.'.format( - len(key), len(self.index))) + raise ValueError( + "Item wrong length {} instead of {}.".format( + len(key), len(self.index) + ) + ) key = check_bool_indexer(self.index, key) # We convert here because the data_manager assumes it is a list of # indices. This greatly decreases the complexity of the code. key = self.index[key] - return DataFrame( - data_manager=self._data_manager.getitem_row_array(key)) + return DataFrame(data_manager=self._data_manager.getitem_row_array(key)) else: - return DataFrame( - data_manager=self._data_manager.getitem_column_array(key)) + return DataFrame(data_manager=self._data_manager.getitem_column_array(key)) def _getitem_slice(self, key): # We convert here because the data_manager assumes it is a list of # indices. This greatly decreases the complexity of the code. key = self.index[key] - return DataFrame( - data_manager=self._data_manager.getitem_row_array(key)) + return DataFrame(data_manager=self._data_manager.getitem_row_array(key)) def __getattr__(self, key): """After regular attribute access, looks up the name in the columns @@ -4216,7 +4329,8 @@ def __setitem__(self, key, value): if not isinstance(key, str): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) if key not in self.columns: self.insert(loc=len(self.columns), column=key, value=value) else: @@ -4235,17 +4349,20 @@ def __len__(self): def __unicode__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __invert__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __hash__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __iter__(self): """Iterate over the columns @@ -4269,12 +4386,14 @@ def __contains__(self, key): def __nonzero__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __bool__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __abs__(self): """Creates a modified DataFrame by taking the absolute value. @@ -4287,7 +4406,8 @@ def __abs__(self): def __round__(self, decimals=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __array__(self, dtype=None): # TODO: This is very inefficient and needs fix, also see as_matrix @@ -4300,12 +4420,14 @@ def __array_wrap__(self, result, context=None): def __getstate__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __setstate__(self, state): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __delitem__(self, key): """Delete a column by key. `del a[key]` for example. @@ -4324,7 +4446,8 @@ def __delitem__(self, key): def __finalize__(self, other, method=None, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __copy__(self, deep=True): """Make a copy using modin.DataFrame.copy method @@ -4353,17 +4476,20 @@ def __deepcopy__(self, memo=None): def __and__(self, other): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __or__(self, other): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __xor__(self, other): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __lt__(self, other): return self.lt(other) @@ -4425,8 +4551,7 @@ def __floordiv__(self, other): def __ifloordiv__(self, other): return self.floordiv(other) - def __rfloordiv__(self, other, axis="columns", level=None, - fill_value=None): + def __rfloordiv__(self, other, axis="columns", level=None, fill_value=None): return self.rfloordiv(other, axis, level, fill_value) def __truediv__(self, other): @@ -4460,40 +4585,49 @@ def __neg__(self): A modified DataFrame where every element is the negation of before """ for t in self.dtypes: - if not (is_bool_dtype(t) or is_numeric_dtype(t) - or is_datetime_or_timedelta_dtype(t)): + if not ( + is_bool_dtype(t) + or is_numeric_dtype(t) + or is_datetime_or_timedelta_dtype(t) + ): raise TypeError( - "Unary negative expects numeric dtype, not {}".format(t)) + "Unary negative expects numeric dtype, not {}".format(t) + ) return DataFrame(data_manager=self._data_manager.negative()) def __sizeof__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def __doc__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def blocks(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def style(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def iat(self, axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def loc(self): @@ -4503,23 +4637,27 @@ def loc(self): We do not support: boolean array, callable """ from .indexing import _LocIndexer + return _LocIndexer(self) @property def is_copy(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def at(self, axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def ix(self, axis=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def iloc(self): @@ -4529,6 +4667,7 @@ def iloc(self): We do not support: boolean array, callable """ from .indexing import _iLocIndexer + return _iLocIndexer(self) def _create_dataframe_from_manager(self, new_manager, inplace=False): @@ -4549,12 +4688,14 @@ def _validate_other(self, other, axis): if len(other) != len(self.index): raise ValueError( "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.index), len(other))) + "given {1}".format(len(self.index), len(other)) + ) else: if len(other) != len(self.columns): raise ValueError( "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.columns), len(other))) + "given {1}".format(len(self.columns), len(other)) + ) return other @@ -4570,6 +4711,8 @@ def _merge_columns(left_columns, right_columns, *args): Returns: The columns for the merge operation. """ - return pandas.DataFrame(columns=left_columns, index=[0], dtype='uint8') \ - .merge(pandas.DataFrame(columns=right_columns, index=[0], - dtype='uint8'), *args).columns + return ( + pandas.DataFrame(columns=left_columns, index=[0], dtype="uint8") + .merge(pandas.DataFrame(columns=right_columns, index=[0], dtype="uint8"), *args) + .columns + ) diff --git a/modin/pandas/datetimes.py b/modin/pandas/datetimes.py index 0ca2c0c0f5d..5e6ae78354d 100644 --- a/modin/pandas/datetimes.py +++ b/modin/pandas/datetimes.py @@ -7,17 +7,19 @@ from .dataframe import DataFrame -def to_datetime(arg, - errors='raise', - dayfirst=False, - yearfirst=False, - utc=None, - box=True, - format=None, - exact=True, - unit=None, - infer_datetime_format=False, - origin='unix'): +def to_datetime( + arg, + errors="raise", + dayfirst=False, + yearfirst=False, + utc=None, + box=True, + format=None, + exact=True, + unit=None, + infer_datetime_format=False, + origin="unix", +): """Convert the arg to datetime format. If not Ray DataFrame, this falls back on pandas. @@ -53,7 +55,8 @@ def to_datetime(arg, exact=exact, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) # Pandas seems to ignore this kwarg so we will too pandas.to_datetime( @@ -67,6 +70,7 @@ def to_datetime(arg, exact=exact, unit=unit, infer_datetime_format=infer_datetime_format, - origin=origin) + origin=origin, + ) return arg._data_manager.to_datetime() diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index e48f631b9e5..011a269121a 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -14,11 +14,13 @@ pandas.core.groupby.DataFrameGroupBy, excluded=[ pandas.core.groupby.DataFrameGroupBy, - pandas.core.groupby.DataFrameGroupBy.__init__ - ]) + pandas.core.groupby.DataFrameGroupBy.__init__, + ], +) class DataFrameGroupBy(object): - def __init__(self, df, by, axis, level, as_index, sort, group_keys, - squeeze, **kwargs): + def __init__( + self, df, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs + ): self._axis = axis self._data_manager = df._data_manager @@ -30,7 +32,7 @@ def __init__(self, df, by, axis, level, as_index, sort, group_keys, "sort": sort, "as_index": as_index, "group_keys": group_keys, - "squeeze": squeeze + "squeeze": squeeze, } def __getattr__(self, key): @@ -49,7 +51,8 @@ def __getattr__(self, key): raise NotImplementedError( "SeriesGroupBy is not implemented." "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) raise e _index_grouped_cache = None @@ -79,17 +82,29 @@ def _iter(self): from .dataframe import DataFrame if self._axis == 0: - return ((k, - DataFrame( - data_manager=self._data_manager.getitem_row_array( - self._index_grouped[k]))) - for k, _ in self._keys_and_values) + return ( + ( + k, + DataFrame( + data_manager=self._data_manager.getitem_row_array( + self._index_grouped[k] + ) + ), + ) + for k, _ in self._keys_and_values + ) else: - return ((k, - DataFrame( - data_manager=self._data_manager.getitem_column_array( - self._index_grouped[k]))) - for k, _ in self._keys_and_values) + return ( + ( + k, + DataFrame( + data_manager=self._data_manager.getitem_column_array( + self._index_grouped[k] + ) + ), + ) + for k, _ in self._keys_and_values + ) @property def ngroups(self): @@ -101,12 +116,14 @@ def skew(self, **kwargs): def ffill(self, limit=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def sem(self, ddof=1): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def mean(self, *args, **kwargs): return self._apply_agg_function(lambda df: df.mean(*args, **kwargs)) @@ -118,23 +135,27 @@ def any(self): def plot(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def ohlc(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __bytes__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def tshift(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def groups(self): @@ -146,7 +167,8 @@ def min(self, **kwargs): def idxmax(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) @property def ndim(self): @@ -155,16 +177,17 @@ def ndim(self): def shift(self, periods=1, freq=None, axis=0): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def nth(self, n, dropna=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def cumsum(self, axis=0, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.cumsum(axis, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.cumsum(axis, *args, **kwargs)) @property def indices(self): @@ -173,19 +196,20 @@ def indices(self): def pct_change(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def filter(self, func, dropna=True, *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def cummax(self, axis=0, **kwargs): return self._apply_agg_function(lambda df: df.cummax(axis, **kwargs)) def apply(self, func, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.apply(func, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.apply(func, *args, **kwargs)) @property def dtypes(self): @@ -196,7 +220,8 @@ def dtypes(self): def first(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def backfill(self, limit=None): return self.bfill(limit) @@ -205,28 +230,29 @@ def __getitem__(self, key): # This operation requires a SeriesGroupBy Object raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def cummin(self, axis=0, **kwargs): - return self._apply_agg_function( - lambda df: df.cummin(axis=axis, **kwargs)) + return self._apply_agg_function(lambda df: df.cummin(axis=axis, **kwargs)) def bfill(self, limit=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def idxmin(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def prod(self, **kwargs): return self._apply_agg_function(lambda df: df.prod(**kwargs)) def std(self, ddof=1, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.std(ddof, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.std(ddof, *args, **kwargs)) def aggregate(self, arg, *args, **kwargs): if self._axis != 0: @@ -238,19 +264,21 @@ def aggregate(self, arg, *args, **kwargs): raise NotImplementedError( "This requires Multi-level index to be implemented. " "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") - return self._apply_agg_function( - lambda df: df.aggregate(arg, *args, **kwargs)) + "github.com/modin-project/modin." + ) + return self._apply_agg_function(lambda df: df.aggregate(arg, *args, **kwargs)) def last(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def mad(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def rank(self): return self._apply_agg_function(lambda df: df.rank()) @@ -259,24 +287,26 @@ def rank(self): def corrwith(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def pad(self, limit=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def max(self, **kwargs): return self._apply_agg_function(lambda df: df.max(**kwargs)) def var(self, ddof=1, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.var(ddof, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.var(ddof, *args, **kwargs)) def get_group(self, name, obj=None): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def __len__(self): return len(self._index_grouped) @@ -293,32 +323,40 @@ def sum(self, **kwargs): def __unicode__(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def describe(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) - def boxplot(self, - grouped, - subplots=True, - column=None, - fontsize=None, - rot=0, - grid=True, - ax=None, - figsize=None, - layout=None, - **kwds): + def boxplot( + self, + grouped, + subplots=True, + column=None, + fontsize=None, + rot=0, + grid=True, + ax=None, + figsize=None, + layout=None, + **kwds + ): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def ngroup(self, ascending=True): index = self._index if not self._axis else self._columns - return pandas.Series(index=index).groupby( - by=self._by, **self._kwargs).ngroup(ascending) + return ( + pandas.Series(index=index) + .groupby(by=self._by, **self._kwargs) + .ngroup(ascending) + ) def nunique(self, dropna=True): return self._apply_agg_function(lambda df: df.nunique(dropna)) @@ -326,7 +364,8 @@ def nunique(self, dropna=True): def resample(self, rule, *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def median(self, **kwargs): return self._apply_agg_function(lambda df: df.median(**kwargs)) @@ -334,11 +373,11 @@ def median(self, **kwargs): def head(self, n=5): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def cumprod(self, axis=0, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.cumprod(axis, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.cumprod(axis, *args, **kwargs)) def __iter__(self): return self._iter.__iter__() @@ -349,16 +388,17 @@ def agg(self, arg, *args, **kwargs): def cov(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def transform(self, func, *args, **kwargs): - return self._apply_agg_function( - lambda df: df.transform(func, *args, **kwargs)) + return self._apply_agg_function(lambda df: df.transform(func, *args, **kwargs)) def corr(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def fillna(self, **kwargs): return self._apply_agg_function(lambda df: df.fillna(**kwargs)) @@ -372,48 +412,56 @@ def pipe(self, func, *args, **kwargs): def cumcount(self, ascending=True): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def tail(self, n=5): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) # expanding and rolling are unique cases and need to likely be handled # separately. They do not appear to be commonly used. def expanding(self, *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def rolling(self, *args, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def hist(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def quantile(self, q=0.5, **kwargs): if is_list_like(q): raise NotImplementedError( "This requires Multi-level index to be implemented. " "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) return self._apply_agg_function(lambda df: df.quantile(q, **kwargs)) def diff(self): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def take(self, **kwargs): raise NotImplementedError( "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) def _apply_agg_function(self, f, **kwargs): """Perform aggregation and combine stages based on a given function. @@ -424,8 +472,10 @@ def _apply_agg_function(self, f, **kwargs): Returns: A new combined DataFrame with the result of all groups. """ - assert callable(f), "\'{0}\' object is not callable".format(type(f)) + assert callable(f), "'{0}' object is not callable".format(type(f)) from .dataframe import DataFrame - new_manager = self._data_manager.groupby_agg(self._by, self._axis, f, - self._kwargs, kwargs) + + new_manager = self._data_manager.groupby_agg( + self._by, self._axis, f, self._kwargs, kwargs + ) return DataFrame(data_manager=new_manager) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index cdac309b7cc..f76bae3a5c8 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -4,13 +4,14 @@ import numpy as np import pandas -from pandas.api.types import (is_scalar, is_list_like, is_bool) +from pandas.api.types import is_scalar, is_list_like, is_bool from pandas.core.dtypes.common import is_integer from pandas.core.indexing import IndexingError from typing import Tuple from warnings import warn from .dataframe import DataFrame + """Indexing Helper Class works as follows: _LocationIndexerBase provide methods framework for __getitem__ @@ -86,7 +87,7 @@ def _parse_tuple(tup): if len(tup) == 2: col_loc = tup[1] if len(tup) > 2: - raise IndexingError('Too many indexers') + raise IndexingError("Too many indexers") else: row_loc = tup @@ -105,8 +106,12 @@ def _is_enlargement(locator, global_index): Enlargement happens when you trying to locate using labels isn't in the original index. In other words, enlargement == adding NaNs ! """ - if is_list_like(locator) and not is_slice( - locator) and len(locator) > 0 and not is_boolean_array(locator): + if ( + is_list_like(locator) + and not is_slice(locator) + and len(locator) > 0 + and not is_boolean_array(locator) + ): n_diff_elems = len(pandas.Index(locator).difference(global_index)) is_enlargement_boolean = n_diff_elems > 0 return is_enlargement_boolean @@ -144,11 +149,11 @@ def __init__(self, ray_df: DataFrame): self.row_scaler = False self.col_scaler = False - def __getitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, - ndim: int): + def __getitem__( + self, row_lookup: pandas.Index, col_lookup: pandas.Index, ndim: int + ): if self.is_view: - dm_view = self.dm.__constructor__(self.dm.data, row_lookup, - col_lookup) + dm_view = self.dm.__constructor__(self.dm.data, row_lookup, col_lookup) else: dm_view = self.dm.view(row_lookup, col_lookup) @@ -160,8 +165,7 @@ def __getitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, single_axis = 1 if self.col_scaler else 0 return dm_view.squeeze(ndim=1, axis=single_axis) - def __setitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, - item): + def __setitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, item): """ Args: row_lookup: the global row index to write item to @@ -187,15 +191,18 @@ def _broadcast_item(self, item, to_shape): return np.broadcast_to(item, to_shape) except ValueError: from_shape = np.array(item).shape - raise ValueError("could not broadcast input array from \ + raise ValueError( + "could not broadcast input array from \ shape {from_shape} into shape {to_shape}".format( - from_shape=from_shape, to_shape=to_shape)) + from_shape=from_shape, to_shape=to_shape + ) + ) def _write_items(self, row_lookup, col_lookup, item): """Perform remote write and replace blocks. """ - row_numeric_idx = self.dm.global_idx_to_numeric_idx('row', row_lookup) - col_numeric_idx = self.dm.global_idx_to_numeric_idx('col', col_lookup) + row_numeric_idx = self.dm.global_idx_to_numeric_idx("row", row_lookup) + col_numeric_idx = self.dm.global_idx_to_numeric_idx("col", col_lookup) self.dm.write_items(row_numeric_idx, col_numeric_idx, item) @@ -203,13 +210,11 @@ class _LocIndexer(_LocationIndexerBase): """A indexer for ray_df.loc[] functionality""" def __getitem__(self, key): - row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple( - key) + row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key) self._handle_enlargement(row_loc, col_loc) row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) ndim = self._expand_dim(row_lookup, col_lookup, ndim) - result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup, - ndim) + result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup, ndim) return result def __setitem__(self, key, item): @@ -224,13 +229,13 @@ def _handle_enlargement(self, row_loc, col_loc): None """ if _is_enlargement(row_loc, self.dm.index) or _is_enlargement( - col_loc, self.dm.columns): + col_loc, self.dm.columns + ): _warn_enlargement() self.dm.enlarge_partitions( - new_row_labels=self._compute_enlarge_labels( - row_loc, self.dm.index), - new_col_labels=self._compute_enlarge_labels( - col_loc, self.dm.columns)) + new_row_labels=self._compute_enlarge_labels(row_loc, self.dm.index), + new_col_labels=self._compute_enlarge_labels(col_loc, self.dm.columns), + ) def _compute_enlarge_labels(self, locator, base_index): """Helper for _enlarge_axis, compute common labels and extra labels. @@ -249,8 +254,10 @@ def _compute_enlarge_labels(self, locator, base_index): if len(common_labels) == 0: raise KeyError( - 'None of [{labels}] are in the [{base_index_name}]'.format( - labels=list(locator_as_index), base_index_name=base_index)) + "None of [{labels}] are in the [{base_index_name}]".format( + labels=list(locator_as_index), base_index_name=base_index + ) + ) return nan_labels @@ -268,8 +275,7 @@ def _expand_dim(self, row_lookup, col_lookup, ndim): return ndim - def _compute_lookup(self, row_loc, - col_loc) -> Tuple[pandas.Index, pandas.Index]: + def _compute_lookup(self, row_loc, col_loc) -> Tuple[pandas.Index, pandas.Index]: row_lookup = self.dm.index.to_series().loc[row_loc].index col_lookup = self.dm.columns.to_series().loc[col_loc].index return row_lookup, col_lookup @@ -279,15 +285,13 @@ class _iLocIndexer(_LocationIndexerBase): """A indexer for ray_df.iloc[] functionality""" def __getitem__(self, key): - row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple( - key) + row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key) self._check_dtypes(row_loc) self._check_dtypes(col_loc) row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup, - ndim) + result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup, ndim) return result def __setitem__(self, key, item): @@ -299,8 +303,7 @@ def __setitem__(self, key, item): row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) super(_iLocIndexer, self).__setitem__(row_lookup, col_lookup, item) - def _compute_lookup(self, row_loc, - col_loc) -> Tuple[pandas.Index, pandas.Index]: + def _compute_lookup(self, row_loc, col_loc) -> Tuple[pandas.Index, pandas.Index]: row_lookup = self.dm.index.to_series().iloc[row_loc].index col_lookup = self.dm.columns.to_series().iloc[col_loc].index return row_lookup, col_lookup diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 2551c0bae59..caabc740cfe 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -18,14 +18,15 @@ from ..data_management.partitioning.partition_collections import RayBlockPartitions from ..data_management.partitioning.remote_partition import RayRemotePartition from ..data_management.partitioning.axis_partition import ( - split_result_of_axis_func_pandas) + split_result_of_axis_func_pandas +) from ..data_management.data_manager import PandasDataManager -PQ_INDEX_REGEX = re.compile('__index_level_\d+__') +PQ_INDEX_REGEX = re.compile("__index_level_\d+__") # Parquet -def read_parquet(path, engine='auto', columns=None, **kwargs): +def read_parquet(path, engine="auto", columns=None, **kwargs): """Load a parquet object from the file path, returning a DataFrame. Ray DataFrame only supports pyarrow engine for now. @@ -49,25 +50,28 @@ def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs): if not columns: pf = ParquetFile(path) columns = [ - name for name in pf.metadata.schema.names - if not PQ_INDEX_REGEX.match(name) + name for name in pf.metadata.schema.names if not PQ_INDEX_REGEX.match(name) ] - num_splits = min( - len(columns), RayBlockPartitions._compute_num_partitions()) + num_splits = min(len(columns), RayBlockPartitions._compute_num_partitions()) # Each item in this list will be a column of original df # partitioned to smaller pieces along rows. # We need to transpose the oids array to fit our schema. - blk_partitions = np.array([ - _read_parquet_column._submit( - args=(path, col, num_splits, kwargs), - num_return_vals=num_splits + 1) for col in columns - ]).T - remote_partitions = np.array([[RayRemotePartition(obj) for obj in row] - for row in blk_partitions[:-1]]) + blk_partitions = np.array( + [ + _read_parquet_column._submit( + args=(path, col, num_splits, kwargs), num_return_vals=num_splits + 1 + ) + for col in columns + ] + ).T + remote_partitions = np.array( + [[RayRemotePartition(obj) for obj in row] for row in blk_partitions[:-1]] + ) index_len = ray.get(blk_partitions[-1][0]) index = pandas.RangeIndex(index_len) new_manager = PandasDataManager( - RayBlockPartitions(remote_partitions), index, columns) + RayBlockPartitions(remote_partitions), index, columns + ) df = DataFrame(data_manager=new_manager) return df @@ -134,14 +138,15 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}): DataFrame or Series constructed from CSV file. """ empty_pd_df = pandas.read_csv( - filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0)) + filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0) + ) column_names = empty_pd_df.columns - skipfooter = kwargs.get("skipfooter", None) or kwargs.get( - "skip_footer", None) + skipfooter = kwargs.get("skipfooter", None) or kwargs.get("skip_footer", None) partition_kwargs = dict( - kwargs, header=None, names=column_names, skipfooter=0, skip_footer=0) + kwargs, header=None, names=column_names, skipfooter=0, skip_footer=0 + ) with open(filepath, "rb") as f: # Get the BOM if necessary prefix = b"" @@ -173,11 +178,17 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}): f.readline() # Read a whole number of lines partition_id = _read_csv_with_offset_pandas_on_ray._submit( - args=(filepath, num_splits, start, f.tell(), - partition_kwargs_id, prefix_id), - num_return_vals=num_splits + 1) - partition_ids.append( - [RayRemotePartition(obj) for obj in partition_id[:-1]]) + args=( + filepath, + num_splits, + start, + f.tell(), + partition_kwargs_id, + prefix_id, + ), + num_return_vals=num_splits + 1, + ) + partition_ids.append([RayRemotePartition(obj) for obj in partition_id[:-1]]) index_ids.append(partition_id[-1]) index_col = kwargs.get("index_col", None) @@ -188,7 +199,8 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}): new_index = ray.get(new_index_ids) new_manager = PandasDataManager( - RayBlockPartitions(np.array(partition_ids)), new_index, column_names) + RayBlockPartitions(np.array(partition_ids)), new_index, column_names + ) df = DataFrame(data_manager=new_manager) if skipfooter: @@ -208,65 +220,66 @@ def _read_csv_from_pandas(filepath_or_buffer, kwargs): # Overwriting the read method should return a ray DataFrame for calls # to __next__ and get_chunk pd_read = pd_obj.read - pd_obj.read = lambda *args, **kwargs: \ - from_pandas(pd_read(*args, **kwargs)) + pd_obj.read = lambda *args, **kwargs: from_pandas(pd_read(*args, **kwargs)) return pd_obj -def read_csv(filepath_or_buffer, - sep=',', - delimiter=None, - header='infer', - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - nrows=None, - na_values=None, - keep_default_na=True, - na_filter=True, - verbose=False, - skip_blank_lines=True, - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - iterator=False, - chunksize=None, - compression='infer', - thousands=None, - decimal=b'.', - lineterminator=None, - quotechar='"', - quoting=0, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - tupleize_cols=None, - error_bad_lines=True, - warn_bad_lines=True, - skipfooter=0, - skip_footer=0, - doublequote=True, - delim_whitespace=False, - as_recarray=None, - compact_ints=None, - use_unsigned=None, - low_memory=True, - buffer_lines=None, - memory_map=False, - float_precision=None): +def read_csv( + filepath_or_buffer, + sep=",", + delimiter=None, + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + nrows=None, + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + iterator=False, + chunksize=None, + compression="infer", + thousands=None, + decimal=b".", + lineterminator=None, + quotechar='"', + quoting=0, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + tupleize_cols=None, + error_bad_lines=True, + warn_bad_lines=True, + skipfooter=0, + skip_footer=0, + doublequote=True, + delim_whitespace=False, + as_recarray=None, + compact_ints=None, + use_unsigned=None, + low_memory=True, + buffer_lines=None, + memory_map=False, + float_precision=None, +): """Read csv file from local disk. Args: filepath: @@ -287,7 +300,8 @@ def read_csv(filepath_or_buffer, defaults = dict(zip(args[1:], defaults)) kwargs = { kw: kwargs[kw] - for kw in kwargs if kw in defaults and kwargs[kw] != defaults[kw] + for kw in kwargs + if kw in defaults and kwargs[kw] != defaults[kw] } # This happens on Python2, we will just default to serializing the entire dictionary except AttributeError: @@ -297,9 +311,10 @@ def read_csv(filepath_or_buffer, if isinstance(filepath_or_buffer, str): if not os.path.exists(filepath_or_buffer): - warnings.warn(("File not found on disk. " - "Defaulting to Pandas implementation."), - PendingDeprecationWarning) + warnings.warn( + ("File not found on disk. " "Defaulting to Pandas implementation."), + PendingDeprecationWarning, + ) return _read_csv_from_pandas(filepath_or_buffer, kwargs) elif not isinstance(filepath_or_buffer, py.path.local): read_from_pandas = True @@ -307,109 +322,141 @@ def read_csv(filepath_or_buffer, # Pandas read_csv supports pathlib.Path try: import pathlib + if isinstance(filepath_or_buffer, pathlib.Path): read_from_pandas = False except ImportError: pass if read_from_pandas: - warnings.warn(("Reading from buffer. " - "Defaulting to Pandas implementation."), - PendingDeprecationWarning) + warnings.warn( + ("Reading from buffer. " "Defaulting to Pandas implementation."), + PendingDeprecationWarning, + ) return _read_csv_from_pandas(filepath_or_buffer, kwargs) if _infer_compression(filepath_or_buffer, compression) is not None: - warnings.warn(("Compression detected. " - "Defaulting to Pandas implementation."), - PendingDeprecationWarning) + warnings.warn( + ("Compression detected. " "Defaulting to Pandas implementation."), + PendingDeprecationWarning, + ) return _read_csv_from_pandas(filepath_or_buffer, kwargs) if as_recarray: - warnings.warn("Defaulting to Pandas implementation.", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation.", PendingDeprecationWarning) return _read_csv_from_pandas(filepath_or_buffer, kwargs) if chunksize is not None: - warnings.warn(("Reading chunks from a file. " - "Defaulting to Pandas implementation."), - PendingDeprecationWarning) + warnings.warn( + ("Reading chunks from a file. " "Defaulting to Pandas implementation."), + PendingDeprecationWarning, + ) return _read_csv_from_pandas(filepath_or_buffer, kwargs) if skiprows is not None and not isinstance(skiprows, int): - warnings.warn(("Defaulting to Pandas implementation. To speed up " - "read_csv through the Pandas on Ray implementation, " - "comment the rows to skip instead.")) + warnings.warn( + ( + "Defaulting to Pandas implementation. To speed up " + "read_csv through the Pandas on Ray implementation, " + "comment the rows to skip instead." + ) + ) return _read_csv_from_pandas(filepath_or_buffer, kwargs) # TODO: replace this by reading lines from file. if nrows is not None: - warnings.warn("Defaulting to Pandas implementation.", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation.", PendingDeprecationWarning) return _read_csv_from_pandas(filepath_or_buffer, kwargs) return _read_csv_from_file_pandas_on_ray(filepath_or_buffer, kwargs) -def read_json(path_or_buf=None, - orient=None, - typ='frame', - dtype=True, - convert_axes=True, - convert_dates=True, - keep_default_dates=True, - numpy=False, - precise_float=False, - date_unit=None, - encoding=None, - lines=False, - chunksize=None, - compression='infer'): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) +def read_json( + path_or_buf=None, + orient=None, + typ="frame", + dtype=True, + convert_axes=True, + convert_dates=True, + keep_default_dates=True, + numpy=False, + precise_float=False, + date_unit=None, + encoding=None, + lines=False, + chunksize=None, + compression="infer", +): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_json( - path_or_buf, orient, typ, dtype, convert_axes, convert_dates, - keep_default_dates, numpy, precise_float, date_unit, encoding, lines, - chunksize, compression) + path_or_buf, + orient, + typ, + dtype, + convert_axes, + convert_dates, + keep_default_dates, + numpy, + precise_float, + date_unit, + encoding, + lines, + chunksize, + compression, + ) ray_frame = from_pandas(port_frame) return ray_frame -def read_html(io, - match='.+', - flavor=None, - header=None, - index_col=None, - skiprows=None, - attrs=None, - parse_dates=False, - tupleize_cols=None, - thousands=',', - encoding=None, - decimal='.', - converters=None, - na_values=None, - keep_default_na=True): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_html(io, match, flavor, header, index_col, - skiprows, attrs, parse_dates, tupleize_cols, - thousands, encoding, decimal, converters, - na_values, keep_default_na) +def read_html( + io, + match=".+", + flavor=None, + header=None, + index_col=None, + skiprows=None, + attrs=None, + parse_dates=False, + tupleize_cols=None, + thousands=",", + encoding=None, + decimal=".", + converters=None, + na_values=None, + keep_default_na=True, +): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) + + port_frame = pandas.read_html( + io, + match, + flavor, + header, + index_col, + skiprows, + attrs, + parse_dates, + tupleize_cols, + thousands, + encoding, + decimal, + converters, + na_values, + keep_default_na, + ) ray_frame = from_pandas(port_frame[0]) return ray_frame -def read_clipboard(sep=r'\s+'): +def read_clipboard(sep=r"\s+"): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_clipboard(sep) ray_frame = from_pandas(port_frame) @@ -417,42 +464,59 @@ def read_clipboard(sep=r'\s+'): return ray_frame -def read_excel(io, - sheet_name=0, - header=0, - skiprows=None, - skip_footer=0, - index_col=None, - names=None, - usecols=None, - parse_dates=False, - date_parser=None, - na_values=None, - thousands=None, - convert_float=True, - converters=None, - dtype=None, - true_values=None, - false_values=None, - engine=None, - squeeze=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) +def read_excel( + io, + sheet_name=0, + header=0, + skiprows=None, + skip_footer=0, + index_col=None, + names=None, + usecols=None, + parse_dates=False, + date_parser=None, + na_values=None, + thousands=None, + convert_float=True, + converters=None, + dtype=None, + true_values=None, + false_values=None, + engine=None, + squeeze=False, +): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_excel( - io, sheet_name, header, skiprows, skip_footer, index_col, names, - usecols, parse_dates, date_parser, na_values, thousands, convert_float, - converters, dtype, true_values, false_values, engine, squeeze) + io, + sheet_name, + header, + skiprows, + skip_footer, + index_col, + names, + usecols, + parse_dates, + date_parser, + na_values, + thousands, + convert_float, + converters, + dtype, + true_values, + false_values, + engine, + squeeze, + ) ray_frame = from_pandas(port_frame) return ray_frame -def read_hdf(path_or_buf, key=None, mode='r'): +def read_hdf(path_or_buf, key=None, mode="r"): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_hdf(path_or_buf, key, mode) ray_frame = from_pandas(port_frame) @@ -462,8 +526,7 @@ def read_hdf(path_or_buf, key=None, mode='r'): def read_feather(path, nthreads=1): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_feather(path) ray_frame = from_pandas(port_frame) @@ -471,10 +534,9 @@ def read_feather(path, nthreads=1): return ray_frame -def read_msgpack(path_or_buf, encoding='utf-8', iterator=False): +def read_msgpack(path_or_buf, encoding="utf-8", iterator=False): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator) ray_frame = from_pandas(port_frame) @@ -482,51 +544,62 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False): return ray_frame -def read_stata(filepath_or_buffer, - convert_dates=True, - convert_categoricals=True, - encoding=None, - index_col=None, - convert_missing=False, - preserve_dtypes=True, - columns=None, - order_categoricals=True, - chunksize=None, - iterator=False): - - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) - - port_frame = pandas.read_stata(filepath_or_buffer, convert_dates, - convert_categoricals, encoding, index_col, - convert_missing, preserve_dtypes, columns, - order_categoricals, chunksize, iterator) +def read_stata( + filepath_or_buffer, + convert_dates=True, + convert_categoricals=True, + encoding=None, + index_col=None, + convert_missing=False, + preserve_dtypes=True, + columns=None, + order_categoricals=True, + chunksize=None, + iterator=False, +): + + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) + + port_frame = pandas.read_stata( + filepath_or_buffer, + convert_dates, + convert_categoricals, + encoding, + index_col, + convert_missing, + preserve_dtypes, + columns, + order_categoricals, + chunksize, + iterator, + ) ray_frame = from_pandas(port_frame) return ray_frame -def read_sas(filepath_or_buffer, - format=None, - index=None, - encoding=None, - chunksize=None, - iterator=False): +def read_sas( + filepath_or_buffer, + format=None, + index=None, + encoding=None, + chunksize=None, + iterator=False, +): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) - port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding, - chunksize, iterator) + port_frame = pandas.read_sas( + filepath_or_buffer, format, index, encoding, chunksize, iterator + ) ray_frame = from_pandas(port_frame) return ray_frame -def read_pickle(path, compression='infer'): +def read_pickle(path, compression="infer"): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) port_frame = pandas.read_pickle(path, compression) ray_frame = from_pandas(port_frame) @@ -534,20 +607,22 @@ def read_pickle(path, compression='infer'): return ray_frame -def read_sql(sql, - con, - index_col=None, - coerce_float=True, - params=None, - parse_dates=None, - columns=None, - chunksize=None): +def read_sql( + sql, + con, + index_col=None, + coerce_float=True, + params=None, + parse_dates=None, + columns=None, + chunksize=None, +): - warnings.warn("Defaulting to Pandas implementation", - PendingDeprecationWarning) + warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning) - port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params, - parse_dates, columns, chunksize) + port_frame = pandas.read_sql( + sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize + ) ray_frame = from_pandas(port_frame) return ray_frame @@ -561,8 +636,7 @@ def get_index(index_name, *partition_indices): @ray.remote -def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, - header): +def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, header): """Use a Ray task to read a chunk of a CSV into a Pandas DataFrame. Args: @@ -579,7 +653,7 @@ def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, This is used to determine the total length of the DataFrame to build a default Index. """ - bio = open(fname, 'rb') + bio = open(fname, "rb") bio.seek(start) to_read = header + bio.read(end - start) bio.close() @@ -612,7 +686,7 @@ def _read_parquet_column(path, column, num_splits, kwargs): default Index. """ import pyarrow.parquet as pq + df = pq.read_pandas(path, columns=[column], **kwargs).to_pandas() # Append the length of the index here to build it externally - return split_result_of_axis_func_pandas(0, num_splits, - df) + [len(df.index)] + return split_result_of_axis_func_pandas(0, num_splits, df) + [len(df.index)] diff --git a/modin/pandas/iterator.py b/modin/pandas/iterator.py index 35cb2b445db..08ecbabeaea 100644 --- a/modin/pandas/iterator.py +++ b/modin/pandas/iterator.py @@ -17,8 +17,9 @@ def __init__(self, data_manager, axis, func): """ self.data_manager = data_manager self.axis = axis - self.index_iter = iter(self.data_manager.columns) if axis else iter( - self.data_manager.index) + self.index_iter = ( + iter(self.data_manager.columns) if axis else iter(self.data_manager.index) + ) self.func = func def __iter__(self): diff --git a/modin/pandas/reshape.py b/modin/pandas/reshape.py index 882bbc90025..8fe9232c520 100644 --- a/modin/pandas/reshape.py +++ b/modin/pandas/reshape.py @@ -8,13 +8,15 @@ from .dataframe import DataFrame -def get_dummies(data, - prefix=None, - prefix_sep='_', - dummy_na=False, - columns=None, - sparse=False, - drop_first=False): +def get_dummies( + data, + prefix=None, + prefix_sep="_", + dummy_na=False, + columns=None, + sparse=False, + drop_first=False, +): """Convert categorical variable into indicator variables. Args: @@ -34,7 +36,8 @@ def get_dummies(data, raise NotImplementedError( "SparseDataFrame is not implemented. " "To contribute to Pandas on Ray, please visit " - "github.com/modin-project/modin.") + "github.com/modin-project/modin." + ) if not isinstance(data, DataFrame): return pandas.get_dummies( @@ -44,7 +47,8 @@ def get_dummies(data, dummy_na=dummy_na, columns=columns, sparse=sparse, - drop_first=drop_first) + drop_first=drop_first, + ) if isinstance(data, DataFrame): df = data @@ -56,6 +60,7 @@ def get_dummies(data, prefix=prefix, prefix_sep=prefix_sep, dummy_na=dummy_na, - drop_first=drop_first) + drop_first=drop_first, + ) return DataFrame(data_manager=new_manager) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 30fba343995..991bcbc7e56 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -15,8 +15,7 @@ def na_op(): raise NotImplementedError("Not Yet implemented.") -@_inherit_docstrings( - pandas.Series, excluded=[pandas.Series, pandas.Series.__init__]) +@_inherit_docstrings(pandas.Series, excluded=[pandas.Series, pandas.Series.__init__]) class Series(object): def __init__(self, series_oids): """Constructor for a Series object. @@ -33,7 +32,7 @@ def T(self): def __abs__(self): raise NotImplementedError("Not Yet implemented.") - def __add__(self, right, name='__add__', na_op=na_op): + def __add__(self, right, name="__add__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") def __and__(self, other): @@ -58,13 +57,9 @@ def __bool__(self): def __bytes__(self): raise NotImplementedError("Not Yet implemented.") - def __class__(self, - data=None, - index=None, - dtype=None, - name=None, - copy=False, - fastpath=False): + def __class__( + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + ): raise NotImplementedError("Not Yet implemented.") def __contains__(self, key): @@ -82,10 +77,10 @@ def __delitem__(self, key): def __dir__(self): return list(type(self).__dict__.keys()) - def __div__(self, right, name='__truediv__', na_op=na_op): + def __div__(self, right, name="__truediv__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") - def __divmod__(self, right, name='__divmod__', na_op=na_op): + def __divmod__(self, right, name="__divmod__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") @property @@ -101,7 +96,7 @@ def __finalize__(self, other, method=None, **kwargs): def __float__(self): raise NotImplementedError("Not Yet implemented.") - def __floordiv__(self, right, name='__floordiv__', na_op=na_op): + def __floordiv__(self, right, name="__floordiv__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") def __ge__(self, other, axis=None): @@ -152,10 +147,10 @@ def __long__(self): def __lt__(self, other, axis=None): raise NotImplementedError("Not Yet implemented.") - def __mod__(self, right, name='__mod__', na_op=na_op): + def __mod__(self, right, name="__mod__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") - def __mul__(self, right, name='__mul__', na_op=na_op): + def __mul__(self, right, name="__mul__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") def __ne__(self, other, axis=None): @@ -170,7 +165,7 @@ def __nonzero__(self): def __or__(self, other): raise NotImplementedError("Not Yet implemented.") - def __pow__(self, right, name='__pow__', na_op=na_op): + def __pow__(self, right, name="__pow__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") def __repr__(self): @@ -191,10 +186,10 @@ def __sizeof__(self): def __str__(self): raise NotImplementedError("Not Yet implemented.") - def __sub__(self, right, name='__sub__', na_op=na_op): + def __sub__(self, right, name="__sub__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") - def __truediv__(self, right, name='__truediv__', na_op=na_op): + def __truediv__(self, right, name="__truediv__", na_op=na_op): raise NotImplementedError("Not Yet implemented.") def __xor__(self, other): @@ -218,25 +213,25 @@ def agg(self, func, axis=0, *args, **kwargs): def aggregate(self, func, axis=0, *args, **kwargs): raise NotImplementedError("Not Yet implemented.") - def align(self, - other, - join='outer', - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None): + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): raise NotImplementedError("Not Yet implemented.") - def all(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): + def all(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def any(self, axis=None, bool_only=None, skipna=None, level=None, - **kwargs): + def any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def append(self, to_append, ignore_index=False, verify_integrity=False): @@ -251,7 +246,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): def argmin(self, axis=None, skipna=True, *args, **kwargs): raise NotImplementedError("Not Yet implemented.") - def argsort(self, axis=0, kind='quicksort', order=None): + def argsort(self, axis=0, kind="quicksort", order=None): raise NotImplementedError("Not Yet implemented.") def as_blocks(self, copy=True): @@ -260,18 +255,13 @@ def as_blocks(self, copy=True): def as_matrix(self, columns=None): raise NotImplementedError("Not Yet implemented.") - def asfreq(self, - freq, - method=None, - how=None, - normalize=False, - fill_value=None): + def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): raise NotImplementedError("Not Yet implemented.") def asof(self, where, subset=None): raise NotImplementedError("Not Yet implemented.") - def astype(self, dtype, copy=True, errors='raise', **kwargs): + def astype(self, dtype, copy=True, errors="raise", **kwargs): raise NotImplementedError("Not Yet implemented.") def at(self, axis=None): @@ -286,11 +276,7 @@ def autocorr(self, lag=1): def between(self, left, right, inclusive=True): raise NotImplementedError("Not Yet implemented.") - def between_time(self, - start_time, - end_time, - include_start=True, - include_end=True): + def between_time(self, start_time, end_time, include_start=True, include_end=True): raise NotImplementedError("Not Yet implemented.") def bfill(self, axis=None, inplace=False, limit=None, downcast=None): @@ -323,17 +309,19 @@ def compress(self, condition, *args, **kwargs): def consolidate(self, inplace=False): raise NotImplementedError("Not Yet implemented.") - def convert_objects(self, - convert_dates=True, - convert_numeric=False, - convert_timedeltas=True, - copy=True): + def convert_objects( + self, + convert_dates=True, + convert_numeric=False, + convert_timedeltas=True, + copy=True, + ): raise NotImplementedError("Not Yet implemented.") def copy(self, deep=True): raise NotImplementedError("Not Yet implemented.") - def corr(self, other, method='pearson', min_periods=None): + def corr(self, other, method="pearson", min_periods=None): raise NotImplementedError("Not Yet implemented.") def count(self, level=None): @@ -369,16 +357,16 @@ def divide(self, other, level=None, fill_value=None, axis=0): def dot(self, other): raise NotImplementedError("Not Yet implemented.") - def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): + def drop(self, labels, axis=0, level=None, inplace=False, errors="raise"): raise NotImplementedError("Not Yet implemented.") - def drop_duplicates(self, keep='first', inplace=False): + def drop_duplicates(self, keep="first", inplace=False): raise NotImplementedError("Not Yet implemented.") def dropna(self, axis=0, inplace=False, **kwargs): raise NotImplementedError("Not Yet implemented.") - def duplicated(self, keep='first'): + def duplicated(self, keep="first"): raise NotImplementedError("Not Yet implemented.") def eq(self, other, level=None, fill_value=None, axis=0): @@ -387,16 +375,18 @@ def eq(self, other, level=None, fill_value=None, axis=0): def equals(self, other): raise NotImplementedError("Not Yet implemented.") - def ewm(self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - freq=None, - adjust=True, - ignore_na=False, - axis=0): + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + freq=None, + adjust=True, + ignore_na=False, + axis=0, + ): raise NotImplementedError("Not Yet implemented.") def expanding(self, min_periods=1, freq=None, center=False, axis=0): @@ -408,14 +398,16 @@ def factorize(self, sort=False, na_sentinel=-1): def ffill(self, axis=None, inplace=False, limit=None, downcast=None): raise NotImplementedError("Not Yet implemented.") - def fillna(self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - **kwargs): + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): raise NotImplementedError("Not Yet implemented.") def filter(self, items=None, like=None, regex=None, axis=None): @@ -430,23 +422,21 @@ def first_valid_index(self): def floordiv(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def from_array(self, - arr, - index=None, - name=None, - dtype=None, - copy=False, - fastpath=False): + def from_array( + self, arr, index=None, name=None, dtype=None, copy=False, fastpath=False + ): raise NotImplementedError("Not Yet implemented.") - def from_csv(self, - path, - sep=',', - parse_dates=True, - header=None, - index_col=0, - encoding=None, - infer_datetime_format=False): + def from_csv( + self, + path, + sep=",", + parse_dates=True, + header=None, + index_col=0, + encoding=None, + infer_datetime_format=False, + ): raise NotImplementedError("Not Yet implemented.") def ge(self, other, level=None, fill_value=None, axis=0): @@ -467,15 +457,17 @@ def get_value(self, label, takeable=False): def get_values(self): raise NotImplementedError("Not Yet implemented.") - def groupby(self, - by=None, - axis=0, - level=None, - as_index=True, - sort=True, - group_keys=True, - squeeze=False, - **kwargs): + def groupby( + self, + by=None, + axis=0, + level=None, + as_index=True, + sort=True, + group_keys=True, + squeeze=False, + **kwargs + ): raise NotImplementedError("Not Yet implemented.") def gt(self, other, level=None, fill_value=None, axis=0): @@ -484,17 +476,19 @@ def gt(self, other, level=None, fill_value=None, axis=0): def head(self, n=5): raise NotImplementedError("Not Yet implemented.") - def hist(self, - by=None, - ax=None, - grid=True, - xlabelsize=None, - xrot=None, - ylabelsize=None, - yrot=None, - figsize=None, - bins=10, - **kwds): + def hist( + self, + by=None, + ax=None, + grid=True, + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, + figsize=None, + bins=10, + **kwds + ): raise NotImplementedError("Not Yet implemented.") def iat(self, axis=None): @@ -509,14 +503,16 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs): def iloc(self, axis=None): raise NotImplementedError("Not Yet implemented.") - def interpolate(self, - method='linear', - axis=0, - limit=None, - inplace=False, - limit_direction='forward', - downcast=None, - **kwargs): + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction="forward", + downcast=None, + **kwargs + ): raise NotImplementedError("Not Yet implemented.") def isin(self, values): @@ -540,20 +536,10 @@ def ix(self, axis=None): def keys(self): raise NotImplementedError("Not Yet implemented.") - def kurt(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def kurtosis(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def last(self, offset): @@ -577,49 +563,31 @@ def mad(self, axis=None, skipna=None, level=None): def map(self, arg, na_action=None): raise NotImplementedError("Not Yet implemented.") - def mask(self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - try_cast=False, - raise_on_error=True): + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + try_cast=False, + raise_on_error=True, + ): raise NotImplementedError("Not Yet implemented.") - def max(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def mean(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def median(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def memory_usage(self, index=True, deep=False): raise NotImplementedError("Not Yet implemented.") - def min(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def mod(self, other, level=None, fill_value=None, axis=0): @@ -637,7 +605,7 @@ def multiply(self, other, level=None, fill_value=None, axis=0): def ne(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def nlargest(self, n=5, keep='first'): + def nlargest(self, n=5, keep="first"): raise NotImplementedError("Not Yet implemented.") def nonzero(self): @@ -646,48 +614,45 @@ def nonzero(self): def notnull(self): raise NotImplementedError("Not Yet implemented.") - def nsmallest(self, n=5, keep='first'): + def nsmallest(self, n=5, keep="first"): raise NotImplementedError("Not Yet implemented.") def nunique(self, dropna=True): raise NotImplementedError("Not Yet implemented.") - def pct_change(self, - periods=1, - fill_method='pad', - limit=None, - freq=None, - **kwargs): + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def pipe(self, func, *args, **kwargs): raise NotImplementedError("Not Yet implemented.") - def plot(self, - kind='line', - ax=None, - figsize=None, - use_index=True, - title=None, - grid=None, - legend=False, - style=None, - logx=False, - logy=False, - loglog=False, - xticks=None, - yticks=None, - xlim=None, - ylim=None, - rot=None, - fontsize=None, - colormap=None, - table=False, - yerr=None, - xerr=None, - label=None, - secondary_y=False, - **kwds): + def plot( + self, + kind="line", + ax=None, + figsize=None, + use_index=True, + title=None, + grid=None, + legend=False, + style=None, + logx=False, + logy=False, + loglog=False, + xticks=None, + yticks=None, + xlim=None, + ylim=None, + rot=None, + fontsize=None, + colormap=None, + table=False, + yerr=None, + xerr=None, + label=None, + secondary_y=False, + **kwds + ): raise NotImplementedError("Not Yet implemented.") def pop(self, item): @@ -696,49 +661,36 @@ def pop(self, item): def pow(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def prod(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def prod(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def product(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def product(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def ptp(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def ptp(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def put(self, *args, **kwargs): raise NotImplementedError("Not Yet implemented.") - def quantile(self, q=0.5, interpolation='linear'): + def quantile(self, q=0.5, interpolation="linear"): raise NotImplementedError("Not Yet implemented.") def radd(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def rank(self, - axis=0, - method='average', - numeric_only=None, - na_option='keep', - ascending=True, - pct=False): + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): raise NotImplementedError("Not Yet implemented.") - def ravel(self, order='C'): + def ravel(self, order="C"): raise NotImplementedError("Not Yet implemented.") def rdiv(self, other, level=None, fill_value=None, axis=0): @@ -750,12 +702,7 @@ def reindex(self, index=None, **kwargs): def reindex_axis(self, labels, axis=0, **kwargs): raise NotImplementedError("Not Yet implemented.") - def reindex_like(self, - other, - method=None, - copy=True, - limit=None, - tolerance=None): + def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): raise NotImplementedError("Not Yet implemented.") def rename(self, index=None, **kwargs): @@ -770,30 +717,34 @@ def reorder_levels(self, order): def repeat(self, repeats, *args, **kwargs): raise NotImplementedError("Not Yet implemented.") - def replace(self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method='pad', - axis=None): - raise NotImplementedError("Not Yet implemented.") - - def resample(self, - rule, - how=None, - axis=0, - fill_method=None, - closed=None, - label=None, - convention='start', - kind=None, - loffset=None, - limit=None, - base=0, - on=None, - level=None): + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + axis=None, + ): + raise NotImplementedError("Not Yet implemented.") + + def resample( + self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention="start", + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None, + ): raise NotImplementedError("Not Yet implemented.") def reset_index(self, level=None, drop=False, name=None, inplace=False): @@ -811,15 +762,17 @@ def rmod(self, other, level=None, fill_value=None, axis=0): def rmul(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def rolling(self, - window, - min_periods=None, - freq=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None): + def rolling( + self, + window, + min_periods=None, + freq=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): raise NotImplementedError("Not Yet implemented.") def round(self, decimals=0, *args, **kwargs): @@ -834,28 +787,26 @@ def rsub(self, other, level=None, fill_value=None, axis=0): def rtruediv(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def sample(self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None): + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ): raise NotImplementedError("Not Yet implemented.") - def searchsorted(self, value, side='left', sorter=None): + def searchsorted(self, value, side="left", sorter=None): raise NotImplementedError("Not Yet implemented.") def select(self, crit, axis=0): raise NotImplementedError("Not Yet implemented.") - def sem(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): raise NotImplementedError("Not Yet implemented.") def set_axis(self, axis, labels): @@ -867,33 +818,32 @@ def set_value(self, label, value, takeable=False): def shift(self, periods=1, freq=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def skew(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def slice_shift(self, periods=1, axis=0): raise NotImplementedError("Not Yet implemented.") - def sort_index(self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind='quicksort', - na_position='last', - sort_remaining=True): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + ): raise NotImplementedError("Not Yet implemented.") - def sort_values(self, - axis=0, - ascending=True, - inplace=False, - kind='quicksort', - na_position='last'): + def sort_values( + self, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): raise NotImplementedError("Not Yet implemented.") def sortlevel(self, level=0, ascending=True, sort_remaining=True): @@ -902,13 +852,9 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): def squeeze(self, axis=None): raise NotImplementedError("Not Yet implemented.") - def std(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): raise NotImplementedError("Not Yet implemented.") def sub(self, other, level=None, fill_value=None, axis=0): @@ -917,12 +863,7 @@ def sub(self, other, level=None, fill_value=None, axis=0): def subtract(self, other, level=None, fill_value=None, axis=0): raise NotImplementedError("Not Yet implemented.") - def sum(self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs): + def sum(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): raise NotImplementedError("Not Yet implemented.") def swapaxes(self, axis1, axis2, copy=True): @@ -940,18 +881,20 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): def to_clipboard(self, excel=None, sep=None, **kwargs): raise NotImplementedError("Not Yet implemented.") - def to_csv(self, - path=None, - index=True, - sep=',', - na_rep='', - float_format=None, - header=False, - index_label=None, - mode='w', - encoding=None, - date_format=None, - decimal='.'): + def to_csv( + self, + path=None, + index=True, + sep=",", + na_rep="", + float_format=None, + header=False, + index_label=None, + mode="w", + encoding=None, + date_format=None, + decimal=".", + ): raise NotImplementedError("Not Yet implemented.") def to_dense(self): @@ -960,22 +903,24 @@ def to_dense(self): def to_dict(self): raise NotImplementedError("Not Yet implemented.") - def to_excel(self, - excel_writer, - sheet_name='Sheet1', - na_rep='', - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep='inf', - verbose=True): + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + ): raise NotImplementedError("Not Yet implemented.") def to_frame(self, name=None): @@ -984,76 +929,84 @@ def to_frame(self, name=None): def to_hdf(self, path_or_buf, key, **kwargs): raise NotImplementedError("Not Yet implemented.") - def to_json(self, - path_or_buf=None, - orient=None, - date_format=None, - double_precision=10, - force_ascii=True, - date_unit='ms', - default_handler=None, - lines=False): - raise NotImplementedError("Not Yet implemented.") - - def to_latex(self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep='NaN', - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - bold_rows=False, - column_format=None, - longtable=None, - escape=None, - encoding=None, - decimal='.', - multicolumn=None, - multicolumn_format=None, - multirow=None): - raise NotImplementedError("Not Yet implemented.") - - def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): + def to_json( + self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + ): + raise NotImplementedError("Not Yet implemented.") + + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): + raise NotImplementedError("Not Yet implemented.") + + def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): raise NotImplementedError("Not Yet implemented.") def to_period(self, freq=None, copy=True): raise NotImplementedError("Not Yet implemented.") - def to_pickle(self, path, compression='infer'): + def to_pickle(self, path, compression="infer"): raise NotImplementedError("Not Yet implemented.") - def to_sparse(self, kind='block', fill_value=None): + def to_sparse(self, kind="block", fill_value=None): raise NotImplementedError("Not Yet implemented.") - def to_sql(self, - name, - con, - flavor=None, - schema=None, - if_exists='fail', - index=True, - index_label=None, - chunksize=None, - dtype=None): + def to_sql( + self, + name, + con, + flavor=None, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + ): raise NotImplementedError("Not Yet implemented.") - def to_string(self, - buf=None, - na_rep='NaN', - float_format=None, - header=True, - index=True, - length=False, - dtype=False, - name=False, - max_rows=None): + def to_string( + self, + buf=None, + na_rep="NaN", + float_format=None, + header=True, + index=True, + length=False, + dtype=False, + name=False, + max_rows=None, + ): raise NotImplementedError("Not Yet implemented.") - def to_timestamp(self, freq=None, how='start', copy=True): + def to_timestamp(self, freq=None, how="start", copy=True): raise NotImplementedError("Not Yet implemented.") def to_xarray(self): @@ -1080,8 +1033,7 @@ def tshift(self, periods=1, freq=None, axis=0): def tz_convert(self, tz, axis=0, level=None, copy=True): raise NotImplementedError("Not Yet implemented.") - def tz_localize(self, tz, axis=0, level=None, copy=True, - ambiguous='raise'): + def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"): raise NotImplementedError("Not Yet implemented.") def unique(self): @@ -1096,34 +1048,29 @@ def upandasate(self, other): def valid(self, inplace=False, **kwargs): raise NotImplementedError("Not Yet implemented.") - def value_counts(self, - normalize=False, - sort=True, - ascending=False, - bins=None, - dropna=True): + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): raise NotImplementedError("Not Yet implemented.") - def var(self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs): + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): raise NotImplementedError("Not Yet implemented.") def view(self, dtype=None): raise NotImplementedError("Not Yet implemented.") - def where(self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - try_cast=False, - raise_on_error=True): + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + try_cast=False, + raise_on_error=True, + ): raise NotImplementedError("Not Yet implemented.") def xs(key, axis=0, level=None, drop_level=True): diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py index e6963f4b78c..dcdf8e56aa1 100644 --- a/modin/pandas/test/test_concat.py +++ b/modin/pandas/test/test_concat.py @@ -15,41 +15,49 @@ def ray_df_equals_pandas(ray_df, pandas_df): @pytest.fixture def generate_dfs(): - df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] - }) - - df2 = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col6': [12, 13, 14, 15], - 'col7': [0, 0, 0, 0] - }) + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) + + df2 = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col6": [12, 13, 14, 15], + "col7": [0, 0, 0, 0], + } + ) return df, df2 @pytest.fixture def generate_none_dfs(): - df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, None, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [None, None, None, None] - }) - - df2 = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col6': [12, 13, 14, 15], - 'col7': [0, 0, 0, 0] - }) + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, None, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [None, None, None, None], + } + ) + + df2 = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col6": [12, 13, 14, 15], + "col7": [0, 0, 0, 0], + } + ) return df, df2 @@ -57,16 +65,14 @@ def generate_none_dfs(): def test_df_concat(): df, df2 = generate_dfs() - assert (ray_df_equals_pandas( - pd.concat([df, df2]), pandas.concat([df, df2]))) + assert ray_df_equals_pandas(pd.concat([df, df2]), pandas.concat([df, df2])) def test_ray_concat(): df, df2 = generate_dfs() ray_df, ray_df2 = from_pandas(df), from_pandas(df2) - assert ray_df_equals_pandas( - pd.concat([ray_df, ray_df2]), pandas.concat([df, df2])) + assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]), pandas.concat([df, df2])) def test_ray_concat_with_series(): @@ -76,11 +82,13 @@ def test_ray_concat_with_series(): assert ray_df_equals_pandas( pd.concat([ray_df, ray_df2, pandas_series], axis=0), - pandas.concat([df, df2, pandas_series], axis=0)) + pandas.concat([df, df2, pandas_series], axis=0), + ) assert ray_df_equals_pandas( pd.concat([ray_df, ray_df2, pandas_series], axis=1), - pandas.concat([df, df2, pandas_series], axis=1)) + pandas.concat([df, df2, pandas_series], axis=1), + ) def test_ray_concat_on_index(): @@ -88,15 +96,17 @@ def test_ray_concat_on_index(): ray_df, ray_df2 = from_pandas(df), from_pandas(df2) assert ray_df_equals_pandas( - pd.concat([ray_df, ray_df2], axis='index'), - pandas.concat([df, df2], axis='index')) + pd.concat([ray_df, ray_df2], axis="index"), + pandas.concat([df, df2], axis="index"), + ) assert ray_df_equals_pandas( - pd.concat([ray_df, ray_df2], axis='rows'), - pandas.concat([df, df2], axis='rows')) + pd.concat([ray_df, ray_df2], axis="rows"), pandas.concat([df, df2], axis="rows") + ) assert ray_df_equals_pandas( - pd.concat([ray_df, ray_df2], axis=0), pandas.concat([df, df2], axis=0)) + pd.concat([ray_df, ray_df2], axis=0), pandas.concat([df, df2], axis=0) + ) def test_ray_concat_on_column(): @@ -104,11 +114,13 @@ def test_ray_concat_on_column(): ray_df, ray_df2 = from_pandas(df), from_pandas(df2) assert ray_df_equals_pandas( - pd.concat([ray_df, ray_df2], axis=1), pandas.concat([df, df2], axis=1)) + pd.concat([ray_df, ray_df2], axis=1), pandas.concat([df, df2], axis=1) + ) assert ray_df_equals_pandas( pd.concat([ray_df, ray_df2], axis="columns"), - pandas.concat([df, df2], axis="columns")) + pandas.concat([df, df2], axis="columns"), + ) def test_invalid_axis_errors(): @@ -125,8 +137,7 @@ def test_mixed_concat(): mixed_dfs = [from_pandas(df), from_pandas(df2), df3] - assert (ray_df_equals_pandas( - pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))) + assert ray_df_equals_pandas(pd.concat(mixed_dfs), pandas.concat([df, df2, df3])) def test_mixed_inner_concat(): @@ -135,9 +146,9 @@ def test_mixed_inner_concat(): mixed_dfs = [from_pandas(df), from_pandas(df2), df3] - assert (ray_df_equals_pandas( - pd.concat(mixed_dfs, join='inner'), - pandas.concat([df, df2, df3], join='inner'))) + assert ray_df_equals_pandas( + pd.concat(mixed_dfs, join="inner"), pandas.concat([df, df2, df3], join="inner") + ) def test_mixed_none_concat(): @@ -146,5 +157,4 @@ def test_mixed_none_concat(): mixed_dfs = [from_pandas(df), from_pandas(df2), df3] - assert (ray_df_equals_pandas( - pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))) + assert ray_df_equals_pandas(pd.concat(mixed_dfs), pandas.concat([df, df2, df3])) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 9d1886d72be..b64fac33eea 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -31,39 +31,46 @@ def ray_df_equals(ray_df1, ray_df2): @pytest.fixture def create_test_dataframe(): - return pd.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] - }) + return pd.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) def test_int_dataframe(): frame_data = { - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) testfuncs = [ - lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x, - lambda x: False + lambda x: x + 1, + lambda x: str(x), + lambda x: x * x, + lambda x: x, + lambda x: False, ] query_funcs = [ - 'col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)' + "col1 < col2", + "col3 > col4", + "col1 == col2", + "(col2 > col1) and (col1 < col3)", ] - keys = ['col1', 'col2', 'col3', 'col4'] + keys = ["col1", "col2", "col3", "col4"] - filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} + filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} test_sample(ray_df, pandas_df) test_filter(ray_df, pandas_df, filter_by) @@ -150,12 +157,12 @@ def test_int_dataframe(): test_loc(ray_df, pandas_df) test_iloc(ray_df, pandas_df) - labels = ['a', 'b', 'c', 'd'] + labels = ["a", "b", "c", "d"] test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - labels.append('e') + test_set_axis(ray_df, pandas_df, labels, "rows") + labels.append("e") test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') + test_set_axis(ray_df, pandas_df, labels, "columns") for key in keys: test_set_index(ray_df, pandas_df, key) @@ -175,9 +182,7 @@ def test_int_dataframe(): test___array__(ray_df, pandas_df) - apply_agg_functions = [ - 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] - ] + apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -194,7 +199,7 @@ def test_int_dataframe(): with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - func = ['sum', lambda df: df.sum()] + func = ["sum", lambda df: df.sum()] test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) test_agg(ray_df, pandas_df, func, 0) @@ -205,36 +210,41 @@ def test_int_dataframe(): with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) + test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1) test_apply(ray_df, pandas_df, lambda df: -df, 0) test_transform(ray_df, pandas_df) def test_float_dataframe(): frame_data = { - 'col1': [0.0, 1.0, 2.0, 3.0], - 'col2': [4.0, 5.0, 6.0, 7.0], - 'col3': [8.0, 9.0, 10.0, 11.0], - 'col4': [12.0, 13.0, 14.0, 15.0], - 'col5': [0.0, 0.0, 0.0, 0.0] + "col1": [0.0, 1.0, 2.0, 3.0], + "col2": [4.0, 5.0, 6.0, 7.0], + "col3": [8.0, 9.0, 10.0, 11.0], + "col4": [12.0, 13.0, 14.0, 15.0], + "col5": [0.0, 0.0, 0.0, 0.0], } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) testfuncs = [ - lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x, - lambda x: False + lambda x: x + 1, + lambda x: str(x), + lambda x: x * x, + lambda x: x, + lambda x: False, ] query_funcs = [ - 'col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)' + "col1 < col2", + "col3 > col4", + "col1 == col2", + "(col2 > col1) and (col1 < col3)", ] - keys = ['col1', 'col2', 'col3', 'col4'] + keys = ["col1", "col2", "col3", "col4"] - filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} + filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} test_sample(ray_df, pandas_df) test_filter(ray_df, pandas_df, filter_by) @@ -320,12 +330,12 @@ def test_float_dataframe(): test_loc(ray_df, pandas_df) test_iloc(ray_df, pandas_df) - labels = ['a', 'b', 'c', 'd'] + labels = ["a", "b", "c", "d"] test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') - labels.append('e') + test_set_axis(ray_df, pandas_df, labels, "rows") + labels.append("e") test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') + test_set_axis(ray_df, pandas_df, labels, "columns") for key in keys: test_set_index(ray_df, pandas_df, key) @@ -346,9 +356,7 @@ def test_float_dataframe(): test___array__(ray_df, pandas_df) - apply_agg_functions = [ - 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] - ] + apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -365,7 +373,7 @@ def test_float_dataframe(): with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - func = ['sum', lambda df: df.sum()] + func = ["sum", lambda df: df.sum()] test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) test_agg(ray_df, pandas_df, func, 0) @@ -376,33 +384,29 @@ def test_float_dataframe(): with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) + test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1) test_apply(ray_df, pandas_df, lambda df: -df, 0) test_transform(ray_df, pandas_df) def test_mixed_dtype_dataframe(): frame_data = { - 'col1': [1, 2, 3, 4], - 'col2': [4, 5, 6, 7], - 'col3': [8.0, 9.4, 10.1, 11.3], - 'col4': ['a', 'b', 'c', 'd'] + "col1": [1, 2, 3, 4], + "col2": [4, 5, 6, 7], + "col3": [8.0, 9.4, 10.1, 11.3], + "col4": ["a", "b", "c", "d"], } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - testfuncs = [ - lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False - ] + testfuncs = [lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False] - query_funcs = [ - 'col1 < col2', 'col1 == col2', '(col2 > col1) and (col1 < col3)' - ] + query_funcs = ["col1 < col2", "col1 == col2", "(col2 > col1) and (col1 < col3)"] - keys = ['col1', 'col2', 'col3', 'col4'] + keys = ["col1", "col2", "col3", "col4"] - filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} + filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} test_sample(ray_df, pandas_df) test_filter(ray_df, pandas_df, filter_by) @@ -501,11 +505,11 @@ def test_mixed_dtype_dataframe(): test_loc(ray_df, pandas_df) test_iloc(ray_df, pandas_df) - labels = ['a', 'b', 'c', 'd'] + labels = ["a", "b", "c", "d"] test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') + test_set_axis(ray_df, pandas_df, labels, "rows") test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') + test_set_axis(ray_df, pandas_df, labels, "columns") for key in keys: test_set_index(ray_df, pandas_df, key) @@ -526,13 +530,13 @@ def test_mixed_dtype_dataframe(): test___array__(ray_df, pandas_df) - apply_agg_functions = ['sum', lambda df: df.sum()] + apply_agg_functions = ["sum", lambda df: df.sum()] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) test_agg(ray_df, pandas_df, func, 0) - func = ['sum', lambda df: df.sum()] + func = ["sum", lambda df: df.sum()] test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) test_agg(ray_df, pandas_df, func, 0) @@ -544,32 +548,32 @@ def test_mixed_dtype_dataframe(): test_agg(ray_df, pandas_df, func, 1) test_transform(ray_df, pandas_df) - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) + test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1) def test_nan_dataframe(): frame_data = { - 'col1': [1, 2, 3, np.nan], - 'col2': [4, 5, np.nan, 7], - 'col3': [8, np.nan, 10, 11], - 'col4': [np.nan, 13, 14, 15] + "col1": [1, 2, 3, np.nan], + "col2": [4, 5, np.nan, 7], + "col3": [8, np.nan, 10, 11], + "col4": [np.nan, 13, 14, 15], } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - testfuncs = [ - lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False - ] + testfuncs = [lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False] query_funcs = [ - 'col1 < col2', 'col3 > col4', 'col1 == col2', - '(col2 > col1) and (col1 < col3)' + "col1 < col2", + "col3 > col4", + "col1 == col2", + "(col2 > col1) and (col1 < col3)", ] - keys = ['col1', 'col2', 'col3', 'col4'] + keys = ["col1", "col2", "col3", "col4"] - filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'} + filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"} test_sample(ray_df, pandas_df) test_filter(ray_df, pandas_df, filter_by) @@ -653,11 +657,11 @@ def test_nan_dataframe(): test_loc(ray_df, pandas_df) test_iloc(ray_df, pandas_df) - labels = ['a', 'b', 'c', 'd'] + labels = ["a", "b", "c", "d"] test_set_axis(ray_df, pandas_df, labels, 0) - test_set_axis(ray_df, pandas_df, labels, 'rows') + test_set_axis(ray_df, pandas_df, labels, "rows") test_set_axis(ray_df, pandas_df, labels, 1) - test_set_axis(ray_df, pandas_df, labels, 'columns') + test_set_axis(ray_df, pandas_df, labels, "columns") for key in keys: test_set_index(ray_df, pandas_df, key) @@ -678,9 +682,7 @@ def test_nan_dataframe(): test___array__(ray_df, pandas_df) - apply_agg_functions = [ - 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum'] - ] + apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]] for func in apply_agg_functions: test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) @@ -697,7 +699,7 @@ def test_nan_dataframe(): with pytest.raises(TypeError): test_aggregate(ray_df, pandas_df, func, 1) - func = ['sum', lambda df: df.sum()] + func = ["sum", lambda df: df.sum()] test_apply(ray_df, pandas_df, func, 0) test_aggregate(ray_df, pandas_df, func, 0) test_agg(ray_df, pandas_df, func, 0) @@ -708,36 +710,36 @@ def test_nan_dataframe(): with pytest.raises(TypeError): test_agg(ray_df, pandas_df, func, 1) - test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1) + test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1) test_apply(ray_df, pandas_df, lambda df: -df, 0) test_transform(ray_df, pandas_df) def test_empty_df(): - df = pd.DataFrame(index=['a', 'b']) + df = pd.DataFrame(index=["a", "b"]) test_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(['a', 'b'])) + tm.assert_index_equal(df.index, pd.Index(["a", "b"])) assert len(df.columns) == 0 - df = pd.DataFrame(columns=['a', 'b']) + df = pd.DataFrame(columns=["a", "b"]) test_is_empty(df) assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(['a', 'b'])) + tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) df = pd.DataFrame() test_is_empty(df) assert len(df.index) == 0 assert len(df.columns) == 0 - df = pd.DataFrame(index=['a', 'b']) + df = pd.DataFrame(index=["a", "b"]) test_is_empty(df) - tm.assert_index_equal(df.index, pd.Index(['a', 'b'])) + tm.assert_index_equal(df.index, pd.Index(["a", "b"])) assert len(df.columns) == 0 - df = pd.DataFrame(columns=['a', 'b']) + df = pd.DataFrame(columns=["a", "b"]) test_is_empty(df) assert len(df.index) == 0 - tm.assert_index_equal(df.columns, pd.Index(['a', 'b'])) + tm.assert_index_equal(df.columns, pd.Index(["a", "b"])) df = pd.DataFrame() test_is_empty(df) @@ -752,13 +754,16 @@ def test_is_empty(df): def test_dense_nan_df(): - frame_data = [[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], - [np.nan, np.nan, np.nan, 5]] - ray_df = pd.DataFrame(frame_data, columns=list('ABCD')) + frame_data = [ + [np.nan, 2, np.nan, 0], + [3, 4, np.nan, 1], + [np.nan, np.nan, np.nan, 5], + ] + ray_df = pd.DataFrame(frame_data, columns=list("ABCD")) - pd_df = pandas.DataFrame(frame_data, columns=list('ABCD')) + pd_df = pandas.DataFrame(frame_data, columns=list("ABCD")) - column_subsets = [list('AD'), list('BC'), list('CD')] + column_subsets = [list("AD"), list("BC"), list("CD")] row_subsets = [[0, 1], [0, 1, 2], [2, 0]] test_dropna(ray_df, pd_df) @@ -775,40 +780,38 @@ def test_inter_df_math(op, simple=False): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) assert ray_df_equals_pandas( - getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df) + ) + assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0)) frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} ray_df2 = pd.DataFrame(frame_data) pandas_df2 = pandas.DataFrame(frame_data) assert ray_df_equals_pandas( - getattr(ray_df, op)(ray_df2), - getattr(pandas_df, op)(pandas_df2)) + getattr(ray_df, op)(ray_df2), getattr(pandas_df, op)(pandas_df2) + ) list_test = [0, 1, 2, 4] if not simple: assert ray_df_equals_pandas( getattr(ray_df, op)(list_test, axis=1), - getattr(pandas_df, op)(list_test, axis=1)) + getattr(pandas_df, op)(list_test, axis=1), + ) assert ray_df_equals_pandas( getattr(ray_df, op)(list_test, axis=0), - getattr(pandas_df, op)(list_test, axis=0)) + getattr(pandas_df, op)(list_test, axis=0), + ) @pytest.fixture @@ -817,21 +820,17 @@ def test_comparison_inter_ops(op): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) assert ray_df_equals_pandas( - getattr(ray_df, op)(ray_df), - getattr(pandas_df, op)(pandas_df)) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df) + ) + assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0)) frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]} @@ -839,8 +838,8 @@ def test_comparison_inter_ops(op): pandas_df2 = pandas.DataFrame(frame_data) assert ray_df_equals_pandas( - getattr(ray_df2, op)(ray_df2), - getattr(pandas_df2, op)(pandas_df2)) + getattr(ray_df2, op)(ray_df2), getattr(pandas_df2, op)(pandas_df2) + ) @pytest.fixture @@ -849,18 +848,14 @@ def test_inter_df_math_right_ops(op): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4), - getattr(pandas_df, op)(4)) - assert ray_df_equals_pandas( - getattr(ray_df, op)(4.0), - getattr(pandas_df, op)(4.0)) + assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4)) + assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0)) @pytest.fixture @@ -902,7 +897,7 @@ def test_values(ray_df, pandas_df): @pytest.fixture def test_axes(ray_df, pandas_df): for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes): - assert (np.array_equal(ray_axis, pd_axis)) + assert np.array_equal(ray_axis, pd_axis) @pytest.fixture @@ -940,8 +935,9 @@ def test_copy(ray_df): new_ray_df = ray_df.copy() assert new_ray_df is not ray_df - assert np.array_equal(new_ray_df._data_manager.data.partitions, - ray_df._data_manager.data.partitions) + assert np.array_equal( + new_ray_df._data_manager.data.partitions, ray_df._data_manager.data.partitions + ) @pytest.fixture @@ -968,8 +964,9 @@ def test_transpose(ray_df, pandas_df): @pytest.fixture def test_get(ray_df, pandas_df, key): assert ray_df.get(key).equals(pandas_df.get(key)) - assert ray_df.get( - key, default='default').equals(pandas_df.get(key, default='default')) + assert ray_df.get(key, default="default").equals( + pandas_df.get(key, default="default") + ) @pytest.fixture @@ -1030,7 +1027,7 @@ def test_append(): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) @@ -1041,8 +1038,7 @@ def test_append(): ray_df2 = pd.DataFrame(frame_data2) pandas_df2 = pandas.DataFrame(frame_data2) - assert ray_df_equals_pandas( - ray_df.append(ray_df2), pandas_df.append(pandas_df2)) + assert ray_df_equals_pandas(ray_df.append(ray_df2), pandas_df.append(pandas_df2)) with pytest.raises(ValueError): ray_df.append(ray_df2, verify_integrity=True) @@ -1080,16 +1076,16 @@ def test_as_matrix(): assert value == frame[col][i] # mixed type - mat = pd.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A']) - assert mat[0, 0] == 'bar' + mat = pd.DataFrame(test_data.mixed_frame).as_matrix(["foo", "A"]) + assert mat[0, 0] == "bar" - df = pd.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]}) + df = pd.DataFrame({"real": [1, 2, 3], "complex": [1j, 2j, 3j]}) mat = df.as_matrix() assert mat[0, 0] == 1j # single block corner case - mat = pd.DataFrame(test_data.frame).as_matrix(['A', 'B']) - expected = test_data.frame.reindex(columns=['A', 'B']).values + mat = pd.DataFrame(test_data.frame).as_matrix(["A", "B"]) + expected = test_data.frame.reindex(columns=["A", "B"]).values tm.assert_almost_equal(mat, expected) @@ -1117,9 +1113,11 @@ def test_assign(): def test_astype(): td = TestData() ray_df = pd.DataFrame( - td.frame.values, index=td.frame.index, columns=td.frame.columns) + td.frame.values, index=td.frame.index, columns=td.frame.columns + ) expected_df = pandas.DataFrame( - td.frame.values, index=td.frame.index, columns=td.frame.columns) + td.frame.values, index=td.frame.index, columns=td.frame.columns + ) ray_df_casted = ray_df.astype(np.int32) expected_df_casted = expected_df.astype(np.int32) @@ -1154,8 +1152,8 @@ def test_between_time(): @pytest.fixture def test_bfill(): test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan + test_data.tsframe["A"][:5] = np.nan + test_data.tsframe["A"][-5:] = np.nan ray_df = pd.DataFrame(test_data.tsframe) assert ray_df_equals_pandas(ray_df.bfill(), test_data.tsframe.bfill()) @@ -1291,8 +1289,7 @@ def test_describe(ray_df, pandas_df): def test_diff(ray_df, pandas_df): assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff()) assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1)) - assert ray_df_equals_pandas( - ray_df.diff(periods=1), pandas_df.diff(periods=1)) + assert ray_df_equals_pandas(ray_df.diff(periods=1), pandas_df.diff(periods=1)) def test_div(): @@ -1314,50 +1311,47 @@ def test_drop(): frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} simple = pandas.DataFrame(frame_data) ray_simple = pd.DataFrame(frame_data) - assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']]) - assert ray_df_equals_pandas( - ray_simple.drop(["A", "B"], axis='columns'), simple[[]]) + assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[["B"]]) + assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis="columns"), simple[[]]) + assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) assert ray_df_equals_pandas( - ray_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) - assert ray_df_equals_pandas( - ray_simple.drop([0, 3], axis='index'), simple.loc[[1, 2], :]) + ray_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :] + ) pytest.raises(ValueError, ray_simple.drop, 5) - pytest.raises(ValueError, ray_simple.drop, 'C', 1) + pytest.raises(ValueError, ray_simple.drop, "C", 1) pytest.raises(ValueError, ray_simple.drop, [1, 5]) - pytest.raises(ValueError, ray_simple.drop, ['A', 'C'], 1) + pytest.raises(ValueError, ray_simple.drop, ["A", "C"], 1) # errors = 'ignore' - assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple) - assert ray_df_equals_pandas( - ray_simple.drop([0, 5], errors='ignore'), simple.loc[[1, 2, 3], :]) + assert ray_df_equals_pandas(ray_simple.drop(5, errors="ignore"), simple) assert ray_df_equals_pandas( - ray_simple.drop('C', axis=1, errors='ignore'), simple) + ray_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :] + ) + assert ray_df_equals_pandas(ray_simple.drop("C", axis=1, errors="ignore"), simple) assert ray_df_equals_pandas( - ray_simple.drop(['A', 'C'], axis=1, errors='ignore'), simple[['B']]) + ray_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]] + ) # non-unique nu_df = pandas.DataFrame( - pandas.compat.lzip(range(3), range(-3, 1), list('abc')), - columns=['a', 'a', 'b']) + pandas.compat.lzip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"] + ) ray_nu_df = pd.DataFrame(nu_df) - assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']]) - assert ray_df_equals_pandas( - ray_nu_df.drop('b', axis='columns'), nu_df['a']) + assert ray_df_equals_pandas(ray_nu_df.drop("a", axis=1), nu_df[["b"]]) + assert ray_df_equals_pandas(ray_nu_df.drop("b", axis="columns"), nu_df["a"]) assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df) - nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X'])) - nu_df.columns = list('abc') + nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"])) + nu_df.columns = list("abc") ray_nu_df = pd.DataFrame(nu_df) - assert ray_df_equals_pandas( - ray_nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :]) - assert ray_df_equals_pandas( - ray_nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :]) + assert ray_df_equals_pandas(ray_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) + assert ray_df_equals_pandas(ray_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) # inplace cache issue frame_data = np.random.randn(10, 3) - df = pandas.DataFrame(frame_data, columns=list('abc')) - ray_df = pd.DataFrame(frame_data, columns=list('abc')) + df = pandas.DataFrame(frame_data, columns=list("abc")) + ray_df = pd.DataFrame(frame_data, columns=list("abc")) expected = df[~(df.b > 0)] ray_df.drop(labels=df[df.b > 0].index, inplace=True) assert ray_df_equals_pandas(ray_df, expected) @@ -1367,34 +1361,33 @@ def test_drop_api_equivalence(): # equivalence of the labels/axis and index/columns API's frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]] - ray_df = pd.DataFrame( - frame_data, index=['a', 'b', 'c'], columns=['d', 'e', 'f']) + ray_df = pd.DataFrame(frame_data, index=["a", "b", "c"], columns=["d", "e", "f"]) - ray_df1 = ray_df.drop('a') - ray_df2 = ray_df.drop(index='a') + ray_df1 = ray_df.drop("a") + ray_df2 = ray_df.drop(index="a") assert ray_df_equals(ray_df1, ray_df2) - ray_df1 = ray_df.drop('d', 1) - ray_df2 = ray_df.drop(columns='d') + ray_df1 = ray_df.drop("d", 1) + ray_df2 = ray_df.drop(columns="d") assert ray_df_equals(ray_df1, ray_df2) - ray_df1 = ray_df.drop(labels='e', axis=1) - ray_df2 = ray_df.drop(columns='e') + ray_df1 = ray_df.drop(labels="e", axis=1) + ray_df2 = ray_df.drop(columns="e") assert ray_df_equals(ray_df1, ray_df2) - ray_df1 = ray_df.drop(['a'], axis=0) - ray_df2 = ray_df.drop(index=['a']) + ray_df1 = ray_df.drop(["a"], axis=0) + ray_df2 = ray_df.drop(index=["a"]) assert ray_df_equals(ray_df1, ray_df2) - ray_df1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1) - ray_df2 = ray_df.drop(index=['a'], columns=['d']) + ray_df1 = ray_df.drop(["a"], axis=0).drop(["d"], axis=1) + ray_df2 = ray_df.drop(index=["a"], columns=["d"]) assert ray_df_equals(ray_df1, ray_df2) with pytest.raises(ValueError): - ray_df.drop(labels='a', index='b') + ray_df.drop(labels="a", index="b") with pytest.raises(ValueError): - ray_df.drop(labels='a', columns='b') + ray_df.drop(labels="a", columns="b") with pytest.raises(ValueError): ray_df.drop(axis=1) @@ -1410,16 +1403,18 @@ def test_drop_duplicates(): @pytest.fixture def test_dropna(ray_df, pd_df): assert ray_df_equals_pandas( - ray_df.dropna(axis=1, how='all'), pd_df.dropna(axis=1, how='all')) + ray_df.dropna(axis=1, how="all"), pd_df.dropna(axis=1, how="all") + ) assert ray_df_equals_pandas( - ray_df.dropna(axis=1, how='any'), pd_df.dropna(axis=1, how='any')) + ray_df.dropna(axis=1, how="any"), pd_df.dropna(axis=1, how="any") + ) assert ray_df_equals_pandas( - ray_df.dropna(axis=0, how='all'), pd_df.dropna(axis=0, how='all')) + ray_df.dropna(axis=0, how="all"), pd_df.dropna(axis=0, how="all") + ) - assert ray_df_equals_pandas( - ray_df.dropna(thresh=2), pd_df.dropna(thresh=2)) + assert ray_df_equals_pandas(ray_df.dropna(thresh=2), pd_df.dropna(thresh=2)) @pytest.fixture @@ -1432,8 +1427,8 @@ def test_dropna_inplace(ray_df, pd_df): assert ray_df_equals_pandas(ray_df, pd_df) - ray_df.dropna(axis=1, how='any', inplace=True) - pd_df.dropna(axis=1, how='any', inplace=True) + ray_df.dropna(axis=1, how="any", inplace=True) + pd_df.dropna(axis=1, how="any", inplace=True) assert ray_df_equals_pandas(ray_df, pd_df) @@ -1441,11 +1436,11 @@ def test_dropna_inplace(ray_df, pd_df): @pytest.fixture def test_dropna_multiple_axes(ray_df, pd_df): assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=[0, 1]), - pd_df.dropna(how='all', axis=[0, 1])) + ray_df.dropna(how="all", axis=[0, 1]), pd_df.dropna(how="all", axis=[0, 1]) + ) assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=(0, 1)), - pd_df.dropna(how='all', axis=(0, 1))) + ray_df.dropna(how="all", axis=(0, 1)), pd_df.dropna(how="all", axis=(0, 1)) + ) @pytest.fixture @@ -1453,16 +1448,16 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df): ray_df_copy = ray_df.copy() pd_df_copy = pd_df.copy() - ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True) - pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True) + ray_df_copy.dropna(how="all", axis=[0, 1], inplace=True) + pd_df_copy.dropna(how="all", axis=[0, 1], inplace=True) assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) ray_df_copy = ray_df.copy() pd_df_copy = pd_df.copy() - ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True) - pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True) + ray_df_copy.dropna(how="all", axis=(0, 1), inplace=True) + pd_df_copy.dropna(how="all", axis=(0, 1), inplace=True) assert ray_df_equals_pandas(ray_df_copy, pd_df_copy) @@ -1471,27 +1466,31 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df): def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets): for subset in column_subsets: assert ray_df_equals_pandas( - ray_df.dropna(how='all', subset=subset), - pd_df.dropna(how='all', subset=subset)) + ray_df.dropna(how="all", subset=subset), + pd_df.dropna(how="all", subset=subset), + ) assert ray_df_equals_pandas( - ray_df.dropna(how='any', subset=subset), - pd_df.dropna(how='any', subset=subset)) + ray_df.dropna(how="any", subset=subset), + pd_df.dropna(how="any", subset=subset), + ) for subset in row_subsets: assert ray_df_equals_pandas( - ray_df.dropna(how='all', axis=1, subset=subset), - pd_df.dropna(how='all', axis=1, subset=subset)) + ray_df.dropna(how="all", axis=1, subset=subset), + pd_df.dropna(how="all", axis=1, subset=subset), + ) assert ray_df_equals_pandas( - ray_df.dropna(how='any', axis=1, subset=subset), - pd_df.dropna(how='any', axis=1, subset=subset)) + ray_df.dropna(how="any", axis=1, subset=subset), + pd_df.dropna(how="any", axis=1, subset=subset), + ) @pytest.fixture def test_dropna_subset_error(ray_df): with pytest.raises(KeyError): - ray_df.dropna(subset=list('EF')) + ray_df.dropna(subset=list("EF")) with pytest.raises(KeyError): ray_df.dropna(axis=1, subset=[4, 5]) @@ -1509,13 +1508,13 @@ def test_eq(): def test_equals(): - frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 4, 1]} + frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]} ray_df1 = pd.DataFrame(frame_data) ray_df2 = pd.DataFrame(frame_data) assert ray_df1.equals(ray_df2) - frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 5, 1]} + frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]} ray_df3 = pd.DataFrame(frame_data) assert not ray_df3.equals(ray_df1) @@ -1523,49 +1522,37 @@ def test_equals(): def test_eval_df_use_case(): - frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} + frame_data = {"a": np.random.randn(10), "b": np.random.randn(10)} df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) # test eval for series results - tmp_pandas = df.eval( - "arctan2(sin(a), b)", engine='python', parser='pandas') - tmp_ray = ray_df.eval( - "arctan2(sin(a), b)", engine='python', parser='pandas') + tmp_pandas = df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") + tmp_ray = ray_df.eval("arctan2(sin(a), b)", engine="python", parser="pandas") assert isinstance(tmp_ray, pandas.Series) assert ray_series_equals_pandas(tmp_ray, tmp_pandas) # Test not inplace assignments - tmp_pandas = df.eval( - "e = arctan2(sin(a), b)", engine='python', parser='pandas') - tmp_ray = ray_df.eval( - "e = arctan2(sin(a), b)", engine='python', parser='pandas') + tmp_pandas = df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas") + tmp_ray = ray_df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas") assert ray_df_equals_pandas(tmp_ray, tmp_pandas) # Test inplace assignments - df.eval( - "e = arctan2(sin(a), b)", - engine='python', - parser='pandas', - inplace=True) + df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True) ray_df.eval( - "e = arctan2(sin(a), b)", - engine='python', - parser='pandas', - inplace=True) + "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True + ) # TODO: Use a series equality validator. assert ray_df_equals_pandas(ray_df, df) def test_eval_df_arithmetic_subexpression(): - frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)} + frame_data = {"a": np.random.randn(10), "b": np.random.randn(10)} df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - df.eval( - "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True) - ray_df.eval( - "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True) + df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) + ray_df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True) # TODO: Use a series equality validator. assert ray_df_equals_pandas(ray_df, df) @@ -1587,8 +1574,8 @@ def test_expanding(): @pytest.fixture def test_ffill(): test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan + test_data.tsframe["A"][:5] = np.nan + test_data.tsframe["A"][-5:] = np.nan ray_df = pd.DataFrame(test_data.tsframe) assert ray_df_equals_pandas(ray_df.ffill(), test_data.tsframe.ffill()) @@ -1621,44 +1608,44 @@ def test_fillna(): def test_fillna_sanity(): test_data = TestData() tf = test_data.tsframe - tf.loc[tf.index[:5], 'A'] = np.nan - tf.loc[tf.index[-5:], 'A'] = np.nan + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan zero_filled = test_data.tsframe.fillna(0) ray_df = pd.DataFrame(test_data.tsframe).fillna(0) assert ray_df_equals_pandas(ray_df, zero_filled) - padded = test_data.tsframe.fillna(method='pad') - ray_df = pd.DataFrame(test_data.tsframe).fillna(method='pad') + padded = test_data.tsframe.fillna(method="pad") + ray_df = pd.DataFrame(test_data.tsframe).fillna(method="pad") assert ray_df_equals_pandas(ray_df, padded) # mixed type mf = test_data.mixed_frame - mf.loc[mf.index[5:20], 'foo'] = np.nan - mf.loc[mf.index[-10:], 'A'] = np.nan + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan result = test_data.mixed_frame.fillna(value=0) ray_df = pd.DataFrame(test_data.mixed_frame).fillna(value=0) assert ray_df_equals_pandas(ray_df, result) - result = test_data.mixed_frame.fillna(method='pad') - ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method='pad') + result = test_data.mixed_frame.fillna(method="pad") + ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method="pad") assert ray_df_equals_pandas(ray_df, result) pytest.raises(ValueError, test_data.tsframe.fillna) pytest.raises(ValueError, pd.DataFrame(test_data.tsframe).fillna) with pytest.raises(ValueError): - pd.DataFrame(test_data.tsframe).fillna(5, method='ffill') + pd.DataFrame(test_data.tsframe).fillna(5, method="ffill") # mixed numeric (but no float16) - mf = test_data.mixed_float.reindex(columns=['A', 'B', 'D']) - mf.loc[mf.index[-10:], 'A'] = np.nan + mf = test_data.mixed_float.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan result = mf.fillna(value=0) ray_df = pd.DataFrame(mf).fillna(value=0) assert ray_df_equals_pandas(ray_df, result) - result = mf.fillna(method='pad') - ray_df = pd.DataFrame(mf).fillna(method='pad') + result = mf.fillna(method="pad") + ray_df = pd.DataFrame(mf).fillna(method="pad") assert ray_df_equals_pandas(ray_df, result) # TODO: Use this when Arrow issue resolves: @@ -1670,27 +1657,30 @@ def test_fillna_sanity(): # df.x.fillna(method=m) # with different dtype - frame_data = [['a', 'a', np.nan, 'a'], ['b', 'b', np.nan, 'b'], - ['c', 'c', np.nan, 'c']] + frame_data = [ + ["a", "a", np.nan, "a"], + ["b", "b", np.nan, "b"], + ["c", "c", np.nan, "c"], + ] df = pandas.DataFrame(frame_data) - result = df.fillna({2: 'foo'}) - ray_df = pd.DataFrame(frame_data).fillna({2: 'foo'}) + result = df.fillna({2: "foo"}) + ray_df = pd.DataFrame(frame_data).fillna({2: "foo"}) assert ray_df_equals_pandas(ray_df, result) ray_df = pd.DataFrame(df) - df.fillna({2: 'foo'}, inplace=True) - ray_df.fillna({2: 'foo'}, inplace=True) + df.fillna({2: "foo"}, inplace=True) + ray_df.fillna({2: "foo"}, inplace=True) assert ray_df_equals_pandas(ray_df, result) frame_data = { - 'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")], - 'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT] + "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")], + "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT], } df = pandas.DataFrame(frame_data) - result = df.fillna(value={'Date': df['Date2']}) - ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']}) + result = df.fillna(value={"Date": df["Date2"]}) + ray_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]}) assert ray_df_equals_pandas(ray_df, result) # TODO: Use this when Arrow issue resolves: @@ -1715,39 +1705,39 @@ def test_fillna_sanity(): @pytest.fixture def test_fillna_downcast(): # infer int64 from float64 - frame_data = {'a': [1., np.nan]} + frame_data = {"a": [1., np.nan]} df = pandas.DataFrame(frame_data) - result = df.fillna(0, downcast='infer') - ray_df = pd.DataFrame(frame_data).fillna(0, downcast='infer') + result = df.fillna(0, downcast="infer") + ray_df = pd.DataFrame(frame_data).fillna(0, downcast="infer") assert ray_df_equals_pandas(ray_df, result) # infer int64 from float64 when fillna value is a dict df = pandas.DataFrame(frame_data) - result = df.fillna({'a': 0}, downcast='infer') - ray_df = pd.DataFrame(frame_data).fillna({'a': 0}, downcast='infer') + result = df.fillna({"a": 0}, downcast="infer") + ray_df = pd.DataFrame(frame_data).fillna({"a": 0}, downcast="infer") assert ray_df_equals_pandas(ray_df, result) @pytest.fixture def test_ffill2(): test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan + test_data.tsframe["A"][:5] = np.nan + test_data.tsframe["A"][-5:] = np.nan ray_df = pd.DataFrame(test_data.tsframe) assert ray_df_equals_pandas( - ray_df.fillna(method='ffill'), - test_data.tsframe.fillna(method='ffill')) + ray_df.fillna(method="ffill"), test_data.tsframe.fillna(method="ffill") + ) @pytest.fixture def test_bfill2(): test_data = TestData() - test_data.tsframe['A'][:5] = np.nan - test_data.tsframe['A'][-5:] = np.nan + test_data.tsframe["A"][:5] = np.nan + test_data.tsframe["A"][-5:] = np.nan ray_df = pd.DataFrame(test_data.tsframe) assert ray_df_equals_pandas( - ray_df.fillna(method='bfill'), - test_data.tsframe.fillna(method='bfill')) + ray_df.fillna(method="bfill"), test_data.tsframe.fillna(method="bfill") + ) @pytest.fixture @@ -1770,11 +1760,11 @@ def test_fillna_inplace(): df[1][:4] = np.nan df[3][-4:] = np.nan ray_df = pd.DataFrame(df) - df.fillna(method='ffill', inplace=True) + df.fillna(method="ffill", inplace=True) assert not ray_df_equals_pandas(ray_df, df) - ray_df.fillna(method='ffill', inplace=True) + ray_df.fillna(method="ffill", inplace=True) assert ray_df_equals_pandas(ray_df, df) @@ -1785,15 +1775,14 @@ def test_frame_fillna_limit(): df = pandas.DataFrame(frame_data, index=index) expected = df[:2].reindex(index) - expected = expected.fillna(method='pad', limit=5) + expected = expected.fillna(method="pad", limit=5) - ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method='pad', limit=5) + ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method="pad", limit=5) assert ray_df_equals_pandas(ray_df, expected) expected = df[-2:].reindex(index) - expected = expected.fillna(method='backfill', limit=5) - ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna( - method='backfill', limit=5) + expected = expected.fillna(method="backfill", limit=5) + ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method="backfill", limit=5) assert ray_df_equals_pandas(ray_df, expected) @@ -1806,27 +1795,28 @@ def test_frame_pad_backfill_limit(): result = df[:2].reindex(index) ray_df = pd.DataFrame(result) assert ray_df_equals_pandas( - ray_df.fillna(method='pad', limit=5), - result.fillna(method='pad', limit=5)) + ray_df.fillna(method="pad", limit=5), result.fillna(method="pad", limit=5) + ) result = df[-2:].reindex(index) ray_df = pd.DataFrame(result) assert ray_df_equals_pandas( - ray_df.fillna(method='backfill', limit=5), - result.fillna(method='backfill', limit=5)) + ray_df.fillna(method="backfill", limit=5), + result.fillna(method="backfill", limit=5), + ) @pytest.fixture def test_fillna_dtype_conversion(): # make sure that fillna on an empty frame works - df = pandas.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - ray_df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64') - assert ray_df_equals_pandas(ray_df.fillna('nan'), df.fillna('nan')) + df = pandas.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + ray_df = pd.DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + assert ray_df_equals_pandas(ray_df.fillna("nan"), df.fillna("nan")) - frame_data = {'A': [1, np.nan], 'B': [1., 2.]} + frame_data = {"A": [1, np.nan], "B": [1., 2.]} df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - for v in ['', 1, np.nan, 1.0]: + for v in ["", 1, np.nan, 1.0]: assert ray_df_equals_pandas(ray_df.fillna(v), df.fillna(v)) @@ -1844,32 +1834,20 @@ def test_fillna_skip_certain_blocks(): @pytest.fixture def test_fillna_dict_series(): frame_data = { - 'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4] + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], } df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) assert ray_df_equals_pandas( - ray_df.fillna({ - 'a': 0, - 'b': 5 - }), df.fillna({ - 'a': 0, - 'b': 5 - })) + ray_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5}) + ) assert ray_df_equals_pandas( - ray_df.fillna({ - 'a': 0, - 'b': 5, - 'd': 7 - }), df.fillna({ - 'a': 0, - 'b': 5, - 'd': 7 - })) + ray_df.fillna({"a": 0, "b": 5, "d": 7}), df.fillna({"a": 0, "b": 5, "d": 7}) + ) # Series treated same as dict assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max())) @@ -1878,20 +1856,18 @@ def test_fillna_dict_series(): @pytest.fixture def test_fillna_dataframe(): frame_data = { - 'a': [np.nan, 1, 2, np.nan, np.nan], - 'b': [1, 2, 3, np.nan, np.nan], - 'c': [np.nan, 1, 2, 3, 4] + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], } - df = pandas.DataFrame(frame_data, index=list('VWXYZ')) - ray_df = pd.DataFrame(frame_data, index=list('VWXYZ')) + df = pandas.DataFrame(frame_data, index=list("VWXYZ")) + ray_df = pd.DataFrame(frame_data, index=list("VWXYZ")) # df2 may have different index and columns - df2 = pandas.DataFrame({ - 'a': [np.nan, 10, 20, 30, 40], - 'b': [50, 60, 70, 80, 90], - 'foo': ['bar'] * 5 - }, - index=list('VWXuZ')) + df2 = pandas.DataFrame( + {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5}, + index=list("VWXuZ"), + ) # only those columns and indices which are shared get filled assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2)) @@ -1905,22 +1881,22 @@ def test_fillna_columns(): ray_df = pd.DataFrame(df) assert ray_df_equals_pandas( - ray_df.fillna(method='ffill', axis=1), df.fillna( - method='ffill', axis=1)) + ray_df.fillna(method="ffill", axis=1), df.fillna(method="ffill", axis=1) + ) - df.insert(6, 'foo', 5) + df.insert(6, "foo", 5) ray_df = pd.DataFrame(df) assert ray_df_equals_pandas( - ray_df.fillna(method='ffill', axis=1), df.fillna( - method='ffill', axis=1)) + ray_df.fillna(method="ffill", axis=1), df.fillna(method="ffill", axis=1) + ) @pytest.fixture def test_fillna_invalid_method(): test_data = TestData() ray_df = pd.DataFrame(test_data.frame) - with tm.assert_raises_regex(ValueError, 'ffil'): - ray_df.fillna(method='ffil') + with tm.assert_raises_regex(ValueError, "ffil"): + ray_df.fillna(method="ffil") @pytest.fixture @@ -1942,7 +1918,8 @@ def test_fillna_col_reordering(): df = pandas.DataFrame(index=range(20), columns=cols, data=data) ray_df = pd.DataFrame(index=range(20), columns=cols, data=data) assert ray_df_equals_pandas( - ray_df.fillna(method='ffill'), df.fillna(method='ffill')) + ray_df.fillna(method="ffill"), df.fillna(method="ffill") + ) """ @@ -1972,13 +1949,16 @@ def test_fillna_datetime_columns(): @pytest.fixture def test_filter(ray_df, pandas_df, by): assert ray_df_equals_pandas( - ray_df.filter(items=by['items']), pandas_df.filter(items=by['items'])) + ray_df.filter(items=by["items"]), pandas_df.filter(items=by["items"]) + ) assert ray_df_equals_pandas( - ray_df.filter(regex=by['regex']), pandas_df.filter(regex=by['regex'])) + ray_df.filter(regex=by["regex"]), pandas_df.filter(regex=by["regex"]) + ) assert ray_df_equals_pandas( - ray_df.filter(like=by['like']), pandas_df.filter(like=by['like'])) + ray_df.filter(like=by["like"]), pandas_df.filter(like=by["like"]) + ) def test_first(): @@ -2053,14 +2033,12 @@ def test_hist(): @pytest.fixture def test_idxmax(ray_df, pandas_df): - assert \ - ray_df.idxmax().equals(pandas_df.idxmax()) + assert ray_df.idxmax().equals(pandas_df.idxmax()) @pytest.fixture def test_idxmin(ray_df, pandas_df): - assert \ - ray_df.idxmin().equals(pandas_df.idxmin()) + assert ray_df.idxmin().equals(pandas_df.idxmin()) def test_infer_objects(): @@ -2071,24 +2049,26 @@ def test_infer_objects(): def test_info(): - ray_df = pd.DataFrame({ - 'col1': [1, 2, 3, np.nan], - 'col2': [4, 5, np.nan, 7], - 'col3': [8, np.nan, 10, 11], - 'col4': [np.nan, 13, 14, 15] - }) - ray_df.info(memory_usage='deep') + ray_df = pd.DataFrame( + { + "col1": [1, 2, 3, np.nan], + "col2": [4, 5, np.nan, 7], + "col3": [8, np.nan, 10, 11], + "col4": [np.nan, 13, 14, 15], + } + ) + ray_df.info(memory_usage="deep") with io.StringIO() as buf: ray_df.info(buf=buf) info_string = buf.getvalue() - assert '\n' in info_string - assert 'memory usage: ' in info_string - assert 'Data columns (total 4 columns):' in info_string + assert "\n" in info_string + assert "memory usage: " in info_string + assert "Data columns (total 4 columns):" in info_string with io.StringIO() as buf: ray_df.info(buf=buf, verbose=False, memory_usage=False) info_string = buf.getvalue() - assert 'memory usage: ' not in info_string - assert 'Columns: 4 entries, col1 to col4' in info_string + assert "memory usage: " not in info_string + assert "Columns: 4 entries, col1 to col4" in info_string @pytest.fixture @@ -2150,7 +2130,7 @@ def test_itertuples(ray_df, pandas_df): # test all combinations of custom params indices = [True, False] - names = [None, 'NotPandas', 'Pandas'] + names = [None, "NotPandas", "Pandas"] for index in indices: for name in names: @@ -2165,7 +2145,7 @@ def test_join(): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) @@ -2255,8 +2235,8 @@ def test_max(ray_df, pandas_df): # We pass in numeric_only because # https://github.com/modin-project/modin/issues/83 assert ray_series_equals_pandas( - ray_df.max(axis=1, numeric_only=True), - pandas_df.max(axis=1, numeric_only=True)) + ray_df.max(axis=1, numeric_only=True), pandas_df.max(axis=1, numeric_only=True) + ) @pytest.fixture @@ -2279,9 +2259,8 @@ def test_melt(): def test_memory_usage(): ray_df = create_test_dataframe() assert type(ray_df.memory_usage()) is pandas.core.series.Series - assert ray_df.memory_usage(index=True).at['Index'] is not None - assert ray_df.memory_usage(deep=True).sum() >= \ - ray_df.memory_usage(deep=False).sum() + assert ray_df.memory_usage(index=True).at["Index"] is not None + assert ray_df.memory_usage(deep=True).sum() >= ray_df.memory_usage(deep=False).sum() def test_merge(): @@ -2289,7 +2268,7 @@ def test_merge(): "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], - "col4": [2, 4, 5, 6] + "col4": [2, 4, 5, 6], } ray_df = pd.DataFrame(frame_data) @@ -2309,37 +2288,41 @@ def test_merge(): # left_on and right_index ray_result = ray_df.merge( - ray_df2, how=how, left_on='col1', right_index=True) + ray_df2, how=how, left_on="col1", right_index=True + ) pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on='col1', right_index=True) + pandas_df2, how=how, left_on="col1", right_index=True + ) ray_df_equals_pandas(ray_result, pandas_result) # left_index and right_on ray_result = ray_df.merge( - ray_df2, how=how, left_index=True, right_on='col1') + ray_df2, how=how, left_index=True, right_on="col1" + ) pandas_result = pandas_df.merge( - pandas_df2, how=how, left_index=True, right_on='col1') + pandas_df2, how=how, left_index=True, right_on="col1" + ) ray_df_equals_pandas(ray_result, pandas_result) # left_on and right_on col1 - ray_result = ray_df.merge( - ray_df2, how=how, left_on='col1', right_on='col1') + ray_result = ray_df.merge(ray_df2, how=how, left_on="col1", right_on="col1") pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on='col1', right_on='col1') + pandas_df2, how=how, left_on="col1", right_on="col1" + ) ray_df_equals_pandas(ray_result, pandas_result) # left_on and right_on col2 - ray_result = ray_df.merge( - ray_df2, how=how, left_on='col2', right_on='col2') + ray_result = ray_df.merge(ray_df2, how=how, left_on="col2", right_on="col2") pandas_result = pandas_df.merge( - pandas_df2, how=how, left_on='col2', right_on='col2') + pandas_df2, how=how, left_on="col2", right_on="col2" + ) ray_df_equals_pandas(ray_result, pandas_result) # left_index and right_index - ray_result = ray_df.merge( - ray_df2, how=how, left_index=True, right_index=True) + ray_result = ray_df.merge(ray_df2, how=how, left_index=True, right_index=True) pandas_result = pandas_df.merge( - pandas_df2, how=how, left_index=True, right_index=True) + pandas_df2, how=how, left_index=True, right_index=True + ) ray_df_equals_pandas(ray_result, pandas_result) @@ -2356,8 +2339,7 @@ def test_mod(): @pytest.fixture def test_mode(ray_df, pandas_df): assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode()) - assert ray_series_equals_pandas( - ray_df.mode(axis=1), pandas_df.mode(axis=1)) + assert ray_series_equals_pandas(ray_df.mode(axis=1), pandas_df.mode(axis=1)) def test_mul(): @@ -2399,8 +2381,7 @@ def test_nsmallest(): @pytest.fixture def test_nunique(ray_df, pandas_df): assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique()) - assert ray_df_equals_pandas( - ray_df.nunique(axis=1), pandas_df.nunique(axis=1)) + assert ray_df_equals_pandas(ray_df.nunique(axis=1), pandas_df.nunique(axis=1)) def test_pct_change(): @@ -2429,11 +2410,13 @@ def f(x, arg2=0, arg3=0): assert ray_df_equals( f(g(h(ray_df), arg1=a), arg2=b, arg3=c), - (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c))) + (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) assert ray_df_equals_pandas( (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), - (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c))) + (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) def test_pivot(): @@ -2461,8 +2444,8 @@ def test_plot(): def test_pop(ray_df, pandas_df): temp_ray_df = ray_df.copy() temp_pandas_df = pandas_df.copy() - ray_popped = temp_ray_df.pop('col2') - pandas_popped = temp_pandas_df.pop('col2') + ray_popped = temp_ray_df.pop("col2") + pandas_popped = temp_pandas_df.pop("col2") assert ray_popped.equals(pandas_popped) assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df) @@ -2509,34 +2492,35 @@ def test_rdiv(): def test_reindex(): frame_data = { - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], } pandas_df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) assert ray_df_equals_pandas( - ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])) + ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1]) + ) - assert ray_df_equals_pandas( - ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2])) + assert ray_df_equals_pandas(ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2])) assert ray_df_equals_pandas( - ray_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1), - pandas_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1)) + ray_df.reindex(["col1", "col3", "col4", "col2"], axis=1), + pandas_df.reindex(["col1", "col3", "col4", "col2"], axis=1), + ) assert ray_df_equals_pandas( - ray_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1), - pandas_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1)) + ray_df.reindex(["col1", "col7", "col4", "col8"], axis=1), + pandas_df.reindex(["col1", "col7", "col4", "col8"], axis=1), + ) assert ray_df_equals_pandas( - ray_df.reindex( - index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8']), - pandas_df.reindex( - index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8'])) + ray_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]), + pandas_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]), + ) def test_reindex_axis(): @@ -2568,55 +2552,46 @@ def test_rename(): @pytest.fixture def test_rename_sanity(): test_data = TestData() - mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'} + mapping = {"A": "a", "B": "b", "C": "c", "D": "d"} ray_df = pd.DataFrame(test_data.frame) assert ray_df_equals_pandas( - ray_df.rename(columns=mapping), - test_data.frame.rename(columns=mapping)) + ray_df.rename(columns=mapping), test_data.frame.rename(columns=mapping) + ) renamed2 = test_data.frame.rename(columns=str.lower) assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2) ray_df = pd.DataFrame(renamed2) assert ray_df_equals_pandas( - ray_df.rename(columns=str.upper), renamed2.rename(columns=str.upper)) + ray_df.rename(columns=str.upper), renamed2.rename(columns=str.upper) + ) # index - data = {'A': {'foo': 0, 'bar': 1}} + data = {"A": {"foo": 0, "bar": 1}} # gets sorted alphabetical df = pandas.DataFrame(data) ray_df = pd.DataFrame(data) tm.assert_index_equal( - ray_df.rename(index={ - 'foo': 'bar', - 'bar': 'foo' - }).index, - df.rename(index={ - 'foo': 'bar', - 'bar': 'foo' - }).index) + ray_df.rename(index={"foo": "bar", "bar": "foo"}).index, + df.rename(index={"foo": "bar", "bar": "foo"}).index, + ) tm.assert_index_equal( - ray_df.rename(index=str.upper).index, - df.rename(index=str.upper).index) + ray_df.rename(index=str.upper).index, df.rename(index=str.upper).index + ) # have to pass something pytest.raises(TypeError, ray_df.rename) # partial columns - renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'}) + renamed = test_data.frame.rename(columns={"C": "foo", "D": "bar"}) ray_df = pd.DataFrame(test_data.frame) tm.assert_index_equal( - ray_df.rename(columns={ - 'C': 'foo', - 'D': 'bar' - }).index, - test_data.frame.rename(columns={ - 'C': 'foo', - 'D': 'bar' - }).index) + ray_df.rename(columns={"C": "foo", "D": "bar"}).index, + test_data.frame.rename(columns={"C": "foo", "D": "bar"}).index, + ) # TODO: Uncomment when transpose works # other axis @@ -2626,12 +2601,12 @@ def test_rename_sanity(): # ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index) # index with name - index = pandas.Index(['foo', 'bar'], name='name') + index = pandas.Index(["foo", "bar"], name="name") renamer = pandas.DataFrame(data, index=index) ray_df = pd.DataFrame(data, index=index) - renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) - ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'}) + renamed = renamer.rename(index={"foo": "bar", "bar": "foo"}) + ray_renamed = ray_df.rename(index={"foo": "bar", "bar": "foo"}) tm.assert_index_equal(renamed.index, ray_renamed.index) assert renamed.index.name == ray_renamed.index.name @@ -2639,11 +2614,10 @@ def test_rename_sanity(): @pytest.fixture def test_rename_multiindex(): - tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')] - tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')] - index = pandas.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar']) - columns = pandas.MultiIndex.from_tuples( - tuples_columns, names=['fizz', 'buzz']) + tuples_index = [("foo1", "bar1"), ("foo2", "bar2")] + tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")] + index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"]) + columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"]) frame_data = [(0, 0), (1, 1)] df = pandas.DataFrame(frame_data, index=index, columns=columns) @@ -2652,34 +2626,19 @@ def test_rename_multiindex(): # # without specifying level -> accross all levels renamed = df.rename( - index={ - 'foo1': 'foo3', - 'bar2': 'bar3' - }, - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }) + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) ray_renamed = ray_df.rename( - index={ - 'foo1': 'foo3', - 'bar2': 'bar3' - }, - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }) + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) tm.assert_index_equal(renamed.index, ray_renamed.index) renamed = df.rename( - index={ - 'foo1': 'foo3', - 'bar2': 'bar3' - }, - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }) + index={"foo1": "foo3", "bar2": "bar3"}, + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, + ) tm.assert_index_equal(renamed.columns, ray_renamed.columns) assert renamed.index.names == ray_renamed.index.names assert renamed.columns.names == ray_renamed.columns.names @@ -2688,42 +2647,22 @@ def test_rename_multiindex(): # with specifying a level # dict - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=0) - ray_renamed = ray_df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level=0) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) + ray_renamed = ray_df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0) tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level='fizz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz") ray_renamed = ray_df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level='fizz') + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz" + ) tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1) - ray_renamed = ray_df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level=1) + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) + ray_renamed = ray_df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1) tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level='buzz') + renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz") ray_renamed = ray_df.rename( - columns={ - 'fizz1': 'fizz3', - 'buzz2': 'buzz3' - }, level='buzz') + columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz" + ) tm.assert_index_equal(renamed.columns, ray_renamed.columns) # function @@ -2731,24 +2670,20 @@ def test_rename_multiindex(): renamed = df.rename(columns=func, level=0) ray_renamed = ray_df.rename(columns=func, level=0) tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns=func, level='fizz') - ray_renamed = ray_df.rename(columns=func, level='fizz') + renamed = df.rename(columns=func, level="fizz") + ray_renamed = ray_df.rename(columns=func, level="fizz") tm.assert_index_equal(renamed.columns, ray_renamed.columns) renamed = df.rename(columns=func, level=1) ray_renamed = ray_df.rename(columns=func, level=1) tm.assert_index_equal(renamed.columns, ray_renamed.columns) - renamed = df.rename(columns=func, level='buzz') - ray_renamed = ray_df.rename(columns=func, level='buzz') + renamed = df.rename(columns=func, level="buzz") + ray_renamed = ray_df.rename(columns=func, level="buzz") tm.assert_index_equal(renamed.columns, ray_renamed.columns) # index - renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0) - ray_renamed = ray_df.rename( - index={ - 'foo1': 'foo3', - 'bar2': 'bar3' - }, level=0) + renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) + ray_renamed = ray_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0) tm.assert_index_equal(ray_renamed.index, renamed.index) @@ -2756,9 +2691,9 @@ def test_rename_multiindex(): def test_rename_nocopy(): test_data = TestData().frame ray_df = pd.DataFrame(test_data) - ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False) - ray_renamed['foo'] = 1 - assert (ray_df['C'] == 1).all() + ray_renamed = ray_df.rename(columns={"C": "foo"}, copy=False) + ray_renamed["foo"] = 1 + assert (ray_df["C"] == 1).all() @pytest.fixture @@ -2767,13 +2702,13 @@ def test_rename_inplace(): ray_df = pd.DataFrame(test_data) assert ray_df_equals_pandas( - ray_df.rename(columns={'C': 'foo'}), - test_data.rename(columns={'C': 'foo'})) + ray_df.rename(columns={"C": "foo"}), test_data.rename(columns={"C": "foo"}) + ) frame = test_data.copy() ray_frame = ray_df.copy() - frame.rename(columns={'C': 'foo'}, inplace=True) - ray_frame.rename(columns={'C': 'foo'}, inplace=True) + frame.rename(columns={"C": "foo"}, inplace=True) + ray_frame.rename(columns={"C": "foo"}, inplace=True) assert ray_df_equals_pandas(ray_frame, frame) @@ -2781,17 +2716,17 @@ def test_rename_inplace(): @pytest.fixture def test_rename_bug(): # rename set ref_locs, and set_index was not resetting - frame_data = {0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]} + frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]} df = pandas.DataFrame(frame_data) ray_df = pd.DataFrame(frame_data) - df = df.rename(columns={0: 'a'}) - df = df.rename(columns={1: 'b'}) + df = df.rename(columns={0: "a"}) + df = df.rename(columns={1: "b"}) # TODO: Uncomment when set_index is implemented # df = df.set_index(['a', 'b']) # df.columns = ['2001-01-01'] - ray_df = ray_df.rename(columns={0: 'a'}) - ray_df = ray_df.rename(columns={1: 'b'}) + ray_df = ray_df.rename(columns={0: "a"}) + ray_df = ray_df.rename(columns={1: "b"}) # TODO: Uncomment when set_index is implemented # ray_df = ray_df.set_index(['a', 'b']) # ray_df.columns = ['2001-01-01'] @@ -2805,16 +2740,16 @@ def test_rename_axis_inplace(): result = test_frame.copy() ray_result = ray_df.copy() - no_return = result.rename_axis('foo', inplace=True) - ray_no_return = ray_result.rename_axis('foo', inplace=True) + no_return = result.rename_axis("foo", inplace=True) + ray_no_return = ray_result.rename_axis("foo", inplace=True) assert no_return is ray_no_return assert ray_df_equals_pandas(ray_result, result) result = test_frame.copy() ray_result = ray_df.copy() - no_return = result.rename_axis('bar', axis=1, inplace=True) - ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True) + no_return = result.rename_axis("bar", axis=1, inplace=True) + ray_no_return = ray_result.rename_axis("bar", axis=1, inplace=True) assert no_return is ray_no_return assert ray_df_equals_pandas(ray_result, result) @@ -2845,7 +2780,8 @@ def test_resample(): def test_reset_index(ray_df, pandas_df, inplace=False): if not inplace: assert to_pandas(ray_df.reset_index(inplace=inplace)).equals( - pandas_df.reset_index(inplace=inplace)) + pandas_df.reset_index(inplace=inplace) + ) else: ray_df_cp = ray_df.copy() pd_df_cp = pandas_df.copy() @@ -2854,14 +2790,16 @@ def test_reset_index(ray_df, pandas_df, inplace=False): assert to_pandas(ray_df_cp).equals(pd_df_cp) -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") +@pytest.mark.skip( + reason="dtypes on different partitions may not match up, " "no fix for this yet" +) def test_rfloordiv(): test_inter_df_math_right_ops("rfloordiv") -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") +@pytest.mark.skip( + reason="dtypes on different partitions may not match up, " "no fix for this yet" +) def test_rmod(): test_inter_df_math_right_ops("rmod") @@ -2891,8 +2829,9 @@ def test_rsub(): test_inter_df_math_right_ops("rsub") -@pytest.mark.skip(reason="dtypes on different partitions may not match up, " - "no fix for this yet") +@pytest.mark.skip( + reason="dtypes on different partitions may not match up, " "no fix for this yet" +) def test_rtruediv(): test_inter_df_math_right_ops("rtruediv") @@ -2904,10 +2843,11 @@ def test_sample(ray_df, pd_df): assert ray_df_equals_pandas( ray_df.sample(frac=0.5, random_state=42), - pd_df.sample(frac=0.5, random_state=42)) + pd_df.sample(frac=0.5, random_state=42), + ) assert ray_df_equals_pandas( - ray_df.sample(n=2, random_state=42), pd_df.sample( - n=2, random_state=42)) + ray_df.sample(n=2, random_state=42), pd_df.sample(n=2, random_state=42) + ) def test_select(): @@ -2919,18 +2859,18 @@ def test_select(): def test_select_dtypes(): frame_data = { - 'test1': list('abc'), - 'test2': np.arange(3, 6).astype('u1'), - 'test3': np.arange(8.0, 11.0, dtype='float64'), - 'test4': [True, False, True], - 'test5': pandas.date_range('now', periods=3).values, - 'test6': list(range(5, 8)) + "test1": list("abc"), + "test2": np.arange(3, 6).astype("u1"), + "test3": np.arange(8.0, 11.0, dtype="float64"), + "test4": [True, False, True], + "test5": pandas.date_range("now", periods=3).values, + "test6": list(range(5, 8)), } df = pandas.DataFrame(frame_data) rd = pd.DataFrame(frame_data) - include = np.float, 'integer' - exclude = np.bool_, + include = np.float, "integer" + exclude = (np.bool_,) r = rd.select_dtypes(include=include, exclude=exclude) e = df[["test2", "test3", "test6"]] @@ -2953,14 +2893,14 @@ def test_sem(): @pytest.fixture def test_set_axis(ray_df, pandas_df, label, axis): assert to_pandas(ray_df.set_axis(label, axis, inplace=False)).equals( - pandas_df.set_axis(label, axis, inplace=False)) + pandas_df.set_axis(label, axis, inplace=False) + ) @pytest.fixture def test_set_index(ray_df, pandas_df, keys, inplace=False): if not inplace: - assert to_pandas(ray_df.set_index(keys)).equals( - pandas_df.set_index(keys)) + assert to_pandas(ray_df.set_index(keys)).equals(pandas_df.set_index(keys)) else: ray_df_cp = ray_df.copy() pd_df_cp = pandas_df.copy() @@ -3137,9 +3077,9 @@ def test_to_xarray(): def test_transform(ray_df, pandas_df): assert ray_df_equals_pandas( ray_df.transform(lambda df: df.isna()), - pandas_df.transform(lambda df: df.isna())) - assert ray_df_equals_pandas( - ray_df.transform('isna'), pandas_df.transform('isna')) + pandas_df.transform(lambda df: df.isna()), + ) + assert ray_df_equals_pandas(ray_df.transform("isna"), pandas_df.transform("isna")) def test_truediv(): @@ -3182,14 +3122,15 @@ def test_unstack(): def test_update(): - df = pd.DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3], - [1.5, np.nan, 3]]) - other = pd.DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]], - index=[1, 3]) + df = pd.DataFrame( + [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3], [1.5, np.nan, 3]] + ) + other = pd.DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]], index=[1, 3]) df.update(other) - expected = pd.DataFrame([[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], - [1.5, np.nan, 7.]]) + expected = pd.DataFrame( + [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.]] + ) assert ray_df_equals(df, expected) @@ -3197,14 +3138,15 @@ def test_update(): def test_var(ray_df, pandas_df): # Because of some differences in floating point arithmetic, we need to check that # they are almost equal if they are not identically equal. - assert (ray_df.var() == pandas_df.var()).all() or \ - ((ray_df.var() - pandas_df.var()).abs() < 10**-10).all() + assert (ray_df.var() == pandas_df.var()).all() or ( + (ray_df.var() - pandas_df.var()).abs() < 10 ** -10 + ).all() def test_where(): frame_data = np.random.randn(100, 10) - pandas_df = pandas.DataFrame(frame_data, columns=list('abcdefghij')) - ray_df = pd.DataFrame(frame_data, columns=list('abcdefghij')) + pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij")) + ray_df = pd.DataFrame(frame_data, columns=list("abcdefghij")) pandas_cond_df = pandas_df % 5 < 2 ray_cond_df = ray_df % 5 < 2 @@ -3217,7 +3159,7 @@ def test_where(): ray_result = ray_df.where(ray_cond_df, other, axis=1) assert all((to_pandas(ray_result) == pandas_result).all()) - other = pandas_df['e'] + other = pandas_df["e"] pandas_result = pandas_df.where(pandas_cond_df, other, axis=0) ray_result = ray_df.where(ray_cond_df, other, axis=0) assert all((to_pandas(ray_result) == pandas_result).all()) @@ -3236,10 +3178,10 @@ def test_xs(): @pytest.fixture def test___getitem__(ray_df, pd_df): - ray_col = ray_df.__getitem__('col1') + ray_col = ray_df.__getitem__("col1") assert isinstance(ray_col, pandas.Series) - pd_col = pd_df['col1'] + pd_col = pd_df["col1"] assert pd_col.equals(ray_col) @@ -3304,8 +3246,8 @@ def test___iter__(ray_df, pd_df): ray_iterator = ray_df.__iter__() # Check that ray_iterator implements the iterator interface - assert hasattr(ray_iterator, '__iter__') - assert hasattr(ray_iterator, 'next') or hasattr(ray_iterator, '__next__') + assert hasattr(ray_iterator, "__iter__") + assert hasattr(ray_iterator, "next") or hasattr(ray_iterator, "__next__") pd_iterator = pd_df.__iter__() assert list(ray_iterator) == list(pd_iterator) @@ -3366,8 +3308,8 @@ def test___setstate__(): def test___delitem__(ray_df, pd_df): ray_df = ray_df.copy() pd_df = pd_df.copy() - ray_df.__delitem__('col1') - pd_df.__delitem__('col1') + ray_df.__delitem__("col1") + pd_df.__delitem__("col1") assert ray_df_equals_pandas(ray_df, pd_df) # Issue 2027 @@ -3473,12 +3415,12 @@ def test___repr__(): @pytest.fixture def test_loc(ray_df, pd_df): # Scaler - assert ray_df.loc[0, 'col1'] == pd_df.loc[0, 'col1'] + assert ray_df.loc[0, "col1"] == pd_df.loc[0, "col1"] # Series assert ray_df.loc[0].equals(pd_df.loc[0]) - assert ray_df.loc[1:, 'col1'].equals(pd_df.loc[1:, 'col1']) - assert ray_df.loc[1:2, 'col1'].equals(pd_df.loc[1:2, 'col1']) + assert ray_df.loc[1:, "col1"].equals(pd_df.loc[1:, "col1"]) + assert ray_df.loc[1:2, "col1"].equals(pd_df.loc[1:2, "col1"]) # DataFrame assert ray_df_equals_pandas(ray_df.loc[[1, 2]], pd_df.loc[[1, 2]]) @@ -3486,8 +3428,9 @@ def test_loc(ray_df, pd_df): # See issue #80 # assert ray_df_equals_pandas(ray_df.loc[[1, 2], ['col1']], # pd_df.loc[[1, 2], ['col1']]) - assert ray_df_equals_pandas(ray_df.loc[1:2, 'col1':'col2'], - pd_df.loc[1:2, 'col1':'col2']) + assert ray_df_equals_pandas( + ray_df.loc[1:2, "col1":"col2"], pd_df.loc[1:2, "col1":"col2"] + ) # Write Item ray_df_copy = ray_df.copy() @@ -3554,15 +3497,14 @@ def test__doc__(): assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__ assert pd.DataFrame.__init__ != pandas.DataFrame.__init__ for attr, obj in pd.DataFrame.__dict__.items(): - if (callable(obj) or isinstance(obj, property)) \ - and attr != "__init__": + if (callable(obj) or isinstance(obj, property)) and attr != "__init__": pd_obj = getattr(pandas.DataFrame, attr, None) if callable(pd_obj) or isinstance(pd_obj, property): assert obj.__doc__ == pd_obj.__doc__ def test_to_datetime(): - frame_data = {'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]} + frame_data = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]} ray_df = pd.DataFrame(frame_data) pd_df = pandas.DataFrame(frame_data) @@ -3570,20 +3512,17 @@ def test_to_datetime(): def test_get_dummies(): - frame_data = {'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]} + frame_data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]} ray_df = pd.DataFrame(frame_data) pd_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas( - pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) + assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) - frame_data = {'A': ['a'], 'B': ['b']} + frame_data = {"A": ["a"], "B": ["b"]} ray_df = pd.DataFrame(frame_data) pd_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas( - pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) + assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) - frame_data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [1, 2, 3]} + frame_data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 3]} ray_df = pd.DataFrame(frame_data) pd_df = pandas.DataFrame(frame_data) - assert ray_df_equals_pandas( - pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) + assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df)) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 436ad434ba7..98210e59a18 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -17,8 +17,9 @@ @pytest.fixture def ray_df_equals_pandas(ray_df, pandas_df): assert isinstance(ray_df, pd.DataFrame) - assert to_pandas(ray_df).equals(pandas_df) or (all(ray_df.isna().all()) and - all(pandas_df.isna().all())) + assert to_pandas(ray_df).equals(pandas_df) or ( + all(ray_df.isna().all()) and all(pandas_df.isna().all()) + ) @pytest.fixture @@ -26,8 +27,11 @@ def ray_df_almost_equals_pandas(ray_df, pandas_df): assert isinstance(ray_df, pd.DataFrame) difference = to_pandas(ray_df) - pandas_df diff_max = difference.max().max() - assert to_pandas(ray_df).equals(pandas_df) or diff_max < 0.0001 or (all( - ray_df.isna().all()) and all(pandas_df.isna().all())) + assert ( + to_pandas(ray_df).equals(pandas_df) + or diff_max < 0.0001 + or (all(ray_df.isna().all()) and all(pandas_df.isna().all())) + ) @pytest.fixture @@ -48,13 +52,15 @@ def ray_groupby_equals_pandas(ray_groupby, pandas_groupby): def test_simple_row_groupby(): - pandas_df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [3, 8, 12, 10], - 'col4': [17, 13, 16, 15], - 'col5': [-4, -5, -6, -7] - }) + pandas_df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [3, 8, 12, 10], + "col4": [17, 13, 16, 15], + "col5": [-4, -5, -6, -7], + } + ) ray_df = from_pandas(pandas_df) @@ -91,7 +97,7 @@ def test_simple_row_groupby(): test_prod(ray_groupby, pandas_groupby) test_std(ray_groupby, pandas_groupby) - agg_functions = ['min', 'max'] + agg_functions = ["min", "max"] for func in agg_functions: test_agg(ray_groupby, pandas_groupby, func) test_aggregate(ray_groupby, pandas_groupby, func) @@ -127,13 +133,15 @@ def test_simple_row_groupby(): def test_single_group_row_groupby(): - pandas_df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 36, 7], - 'col3': [3, 8, 12, 10], - 'col4': [17, 3, 16, 15], - 'col5': [-4, 5, -6, -7] - }) + pandas_df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 36, 7], + "col3": [3, 8, 12, 10], + "col4": [17, 3, 16, 15], + "col5": [-4, 5, -6, -7], + } + ) ray_df = from_pandas(pandas_df) @@ -170,7 +178,7 @@ def test_single_group_row_groupby(): test_prod(ray_groupby, pandas_groupby) test_std(ray_groupby, pandas_groupby) - agg_functions = ['min', 'max'] + agg_functions = ["min", "max"] for func in agg_functions: test_agg(ray_groupby, pandas_groupby, func) test_aggregate(ray_groupby, pandas_groupby, func) @@ -208,11 +216,12 @@ def test_single_group_row_groupby(): @pytest.mark.skip(reason="See Modin issue #21.") def test_large_row_groupby(): pandas_df = pandas.DataFrame( - np.random.randint(0, 8, size=(100, 4)), columns=list('ABCD')) + np.random.randint(0, 8, size=(100, 4)), columns=list("ABCD") + ) ray_df = from_pandas(pandas_df) - by = [str(i) for i in pandas_df['A'].tolist()] + by = [str(i) for i in pandas_df["A"].tolist()] n = 4 ray_groupby = ray_df.groupby(by=by) @@ -245,7 +254,7 @@ def test_large_row_groupby(): # test_prod(ray_groupby, pandas_groupby) causes overflows test_std(ray_groupby, pandas_groupby) - agg_functions = ['min', 'max'] + agg_functions = ["min", "max"] for func in agg_functions: test_agg(ray_groupby, pandas_groupby, func) test_aggregate(ray_groupby, pandas_groupby, func) @@ -281,13 +290,15 @@ def test_large_row_groupby(): def test_simple_col_groupby(): - pandas_df = pandas.DataFrame({ - 'col1': [0, 3, 2, 3], - 'col2': [4, 1, 6, 7], - 'col3': [3, 8, 2, 10], - 'col4': [1, 13, 6, 15], - 'col5': [-4, 5, 6, -7] - }) + pandas_df = pandas.DataFrame( + { + "col1": [0, 3, 2, 3], + "col2": [4, 1, 6, 7], + "col3": [3, 8, 2, 10], + "col4": [1, 13, 6, 15], + "col5": [-4, 5, 6, -7], + } + ) ray_df = from_pandas(pandas_df) @@ -407,7 +418,8 @@ def test_ndim(ray_groupby, pandas_groupby): @pytest.fixture def test_cumsum(ray_groupby, pandas_groupby, axis=0): ray_df_equals_pandas( - ray_groupby.cumsum(axis=axis), pandas_groupby.cumsum(axis=axis)) + ray_groupby.cumsum(axis=axis), pandas_groupby.cumsum(axis=axis) + ) @pytest.fixture @@ -419,7 +431,8 @@ def test_pct_change(ray_groupby, pandas_groupby): @pytest.fixture def test_cummax(ray_groupby, pandas_groupby, axis=0): ray_df_equals_pandas( - ray_groupby.cummax(axis=axis), pandas_groupby.cummax(axis=axis)) + ray_groupby.cummax(axis=axis), pandas_groupby.cummax(axis=axis) + ) @pytest.fixture @@ -447,7 +460,8 @@ def test_backfill(ray_groupby, pandas_groupby): @pytest.fixture def test_cummin(ray_groupby, pandas_groupby, axis=0): ray_df_equals_pandas( - ray_groupby.cummin(axis=axis), pandas_groupby.cummin(axis=axis)) + ray_groupby.cummin(axis=axis), pandas_groupby.cummin(axis=axis) + ) @pytest.fixture @@ -474,8 +488,7 @@ def test_std(ray_groupby, pandas_groupby): @pytest.fixture def test_aggregate(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas( - ray_groupby.aggregate(func), pandas_groupby.aggregate(func)) + ray_df_equals_pandas(ray_groupby.aggregate(func), pandas_groupby.aggregate(func)) @pytest.fixture @@ -545,7 +558,8 @@ def test_head(ray_groupby, pandas_groupby, n): def test_cumprod(ray_groupby, pandas_groupby, axis=0): ray_df_equals_pandas(ray_groupby.cumprod(), pandas_groupby.cumprod()) ray_df_equals_pandas( - ray_groupby.cumprod(axis=axis), pandas_groupby.cumprod(axis=axis)) + ray_groupby.cumprod(axis=axis), pandas_groupby.cumprod(axis=axis) + ) @pytest.fixture @@ -556,8 +570,7 @@ def test_cov(ray_groupby, pandas_groupby): @pytest.fixture def test_transform(ray_groupby, pandas_groupby, func): - ray_df_equals_pandas( - ray_groupby.transform(func), pandas_groupby.transform(func)) + ray_df_equals_pandas(ray_groupby.transform(func), pandas_groupby.transform(func)) @pytest.fixture @@ -569,8 +582,8 @@ def test_corr(ray_groupby, pandas_groupby): @pytest.fixture def test_fillna(ray_groupby, pandas_groupby): ray_df_equals_pandas( - ray_groupby.fillna(method="ffill"), - pandas_groupby.fillna(method="ffill")) + ray_groupby.fillna(method="ffill"), pandas_groupby.fillna(method="ffill") + ) @pytest.fixture @@ -591,8 +604,7 @@ def test_tail(ray_groupby, pandas_groupby, n): @pytest.fixture def test_quantile(ray_groupby, pandas_groupby): - ray_df_equals_pandas( - ray_groupby.quantile(q=0.4), pandas_groupby.quantile(q=0.4)) + ray_df_equals_pandas(ray_groupby.quantile(q=0.4), pandas_groupby.quantile(q=0.4)) @pytest.fixture diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 0def3731a3a..58eda9bbb8f 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -10,18 +10,18 @@ import os import sqlite3 -TEST_PARQUET_FILENAME = 'test.parquet' -TEST_CSV_FILENAME = 'test.csv' -TEST_JSON_FILENAME = 'test.json' -TEST_HTML_FILENAME = 'test.html' -TEST_EXCEL_FILENAME = 'test.xlsx' -TEST_FEATHER_FILENAME = 'test.feather' -TEST_HDF_FILENAME = 'test.hdf' -TEST_MSGPACK_FILENAME = 'test.msg' -TEST_STATA_FILENAME = 'test.dta' -TEST_PICKLE_FILENAME = 'test.pkl' -TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat' -TEST_SQL_FILENAME = 'test.db' +TEST_PARQUET_FILENAME = "test.parquet" +TEST_CSV_FILENAME = "test.csv" +TEST_JSON_FILENAME = "test.json" +TEST_HTML_FILENAME = "test.html" +TEST_EXCEL_FILENAME = "test.xlsx" +TEST_FEATHER_FILENAME = "test.feather" +TEST_HDF_FILENAME = "test.hdf" +TEST_MSGPACK_FILENAME = "test.msg" +TEST_STATA_FILENAME = "test.dta" +TEST_PICKLE_FILENAME = "test.pkl" +TEST_SAS_FILENAME = os.getcwd() + "/data/test1.sas7bdat" +TEST_SQL_FILENAME = "test.db" SMALL_ROW_SIZE = 2000 @@ -35,42 +35,45 @@ def setup_parquet_file(row_size, force=False): if os.path.exists(TEST_PARQUET_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_parquet(TEST_PARQUET_FILENAME) @pytest.fixture def create_test_ray_dataframe(): - df = pd.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] - }) + df = pd.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) return df @pytest.fixture def create_test_pandas_dataframe(): - df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] - }) + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) return df @pytest.fixture def test_files_eq(path1, path2): - with open(path1, 'rb') as file1, open(path2, 'rb') as file2: + with open(path1, "rb") as file1, open(path2, "rb") as file2: file1_content = file1.read() file2_content = file2.read() @@ -93,14 +96,13 @@ def teardown_parquet_file(): @pytest.fixture -def setup_csv_file(row_size, force=False, delimiter=','): +def setup_csv_file(row_size, force=False, delimiter=","): if os.path.exists(TEST_CSV_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_csv(TEST_CSV_FILENAME, sep=delimiter) @@ -115,10 +117,9 @@ def setup_json_file(row_size, force=False): if os.path.exists(TEST_JSON_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_json(TEST_JSON_FILENAME) @@ -133,10 +134,9 @@ def setup_html_file(row_size, force=False): if os.path.exists(TEST_HTML_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_html(TEST_HTML_FILENAME) @@ -148,10 +148,7 @@ def teardown_html_file(): @pytest.fixture def setup_clipboard(row_size, force=False): - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame({"col1": np.arange(row_size), "col2": np.arange(row_size)}) df.to_clipboard() @@ -160,10 +157,9 @@ def setup_excel_file(row_size, force=False): if os.path.exists(TEST_EXCEL_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_excel(TEST_EXCEL_FILENAME) @@ -178,10 +174,9 @@ def setup_feather_file(row_size, force=False): if os.path.exists(TEST_FEATHER_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_feather(TEST_FEATHER_FILENAME) @@ -196,11 +191,10 @@ def setup_hdf_file(row_size, force=False): if os.path.exists(TEST_HDF_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) - df.to_hdf(TEST_HDF_FILENAME, 'test') + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) + df.to_hdf(TEST_HDF_FILENAME, "test") @pytest.fixture @@ -214,10 +208,9 @@ def setup_msgpack_file(row_size, force=False): if os.path.exists(TEST_MSGPACK_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_msgpack(TEST_MSGPACK_FILENAME) @@ -232,10 +225,9 @@ def setup_stata_file(row_size, force=False): if os.path.exists(TEST_STATA_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_stata(TEST_STATA_FILENAME) @@ -250,10 +242,9 @@ def setup_pickle_file(row_size, force=False): if os.path.exists(TEST_PICKLE_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': np.arange(row_size), - 'col2': np.arange(row_size) - }) + df = pandas.DataFrame( + {"col1": np.arange(row_size), "col2": np.arange(row_size)} + ) df.to_pickle(TEST_PICKLE_FILENAME) @@ -268,13 +259,15 @@ def setup_sql_file(conn, force=False): if os.path.exists(TEST_SQL_FILENAME) and not force: pass else: - df = pandas.DataFrame({ - 'col1': [0, 1, 2, 3], - 'col2': [4, 5, 6, 7], - 'col3': [8, 9, 10, 11], - 'col4': [12, 13, 14, 15], - 'col5': [0, 0, 0, 0] - }) + df = pandas.DataFrame( + { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + ) df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn) @@ -297,8 +290,8 @@ def test_from_parquet(): def test_from_parquet_with_columns(): setup_parquet_file(SMALL_ROW_SIZE) - pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=['col1']) - ray_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=['col1']) + pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"]) + ray_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"]) assert ray_df_equals_pandas(ray_df, pandas_df) teardown_parquet_file() @@ -403,8 +396,8 @@ def test_from_feather(): def test_from_hdf(): setup_hdf_file(SMALL_ROW_SIZE) - pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test') - ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test') + pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key="test") + ray_df = pd.read_hdf(TEST_HDF_FILENAME, key="test") assert ray_df_equals_pandas(ray_df, pandas_df) @@ -465,7 +458,7 @@ def test_from_sas(): def test_from_csv_delimiter(): - setup_csv_file(SMALL_ROW_SIZE, delimiter='|') + setup_csv_file(SMALL_ROW_SIZE, delimiter="|") pandas_df = pandas.read_csv(TEST_CSV_FILENAME) ray_df = pd.read_csv(TEST_CSV_FILENAME) @@ -486,7 +479,7 @@ def test_to_clipboard(): pandas_df.to_clipboard() pandas_as_clip = pandas.read_clipboard() - assert (ray_as_clip.equals(pandas_as_clip)) + assert ray_as_clip.equals(pandas_as_clip) def test_to_csv(): @@ -499,7 +492,7 @@ def test_to_csv(): ray_df.to_csv(TEST_CSV_DF_FILENAME) pandas_df.to_csv(TEST_CSV_pandas_FILENAME) - assert (test_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME)) + assert test_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME) teardown_test_file(TEST_CSV_pandas_FILENAME) teardown_test_file(TEST_CSV_DF_FILENAME) @@ -535,7 +528,7 @@ def test_to_excel(): ray_writer.save() pandas_writer.save() - assert (test_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME)) + assert test_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME) teardown_test_file(TEST_EXCEL_DF_FILENAME) teardown_test_file(TEST_EXCEL_pandas_FILENAME) @@ -551,8 +544,7 @@ def test_to_feather(): ray_df.to_feather(TEST_FEATHER_DF_FILENAME) pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME) - assert (test_files_eq(TEST_FEATHER_DF_FILENAME, - TEST_FEATHER_pandas_FILENAME)) + assert test_files_eq(TEST_FEATHER_DF_FILENAME, TEST_FEATHER_pandas_FILENAME) teardown_test_file(TEST_FEATHER_pandas_FILENAME) teardown_test_file(TEST_FEATHER_DF_FILENAME) @@ -576,7 +568,7 @@ def test_to_html(): ray_df.to_html(TEST_HTML_DF_FILENAME) pandas_df.to_html(TEST_HTML_pandas_FILENAME) - assert (test_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME)) + assert test_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME) teardown_test_file(TEST_HTML_pandas_FILENAME) teardown_test_file(TEST_HTML_DF_FILENAME) @@ -592,7 +584,7 @@ def test_to_json(): ray_df.to_json(TEST_JSON_DF_FILENAME) pandas_df.to_json(TEST_JSON_pandas_FILENAME) - assert (test_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME)) + assert test_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME) teardown_test_file(TEST_JSON_pandas_FILENAME) teardown_test_file(TEST_JSON_DF_FILENAME) @@ -615,8 +607,7 @@ def test_to_msgpack(): ray_df.to_msgpack(TEST_MSGPACK_DF_FILENAME) pandas_df.to_msgpack(TEST_MSGPACK_pandas_FILENAME) - assert (test_files_eq(TEST_MSGPACK_DF_FILENAME, - TEST_MSGPACK_pandas_FILENAME)) + assert test_files_eq(TEST_MSGPACK_DF_FILENAME, TEST_MSGPACK_pandas_FILENAME) teardown_test_file(TEST_MSGPACK_pandas_FILENAME) teardown_test_file(TEST_MSGPACK_DF_FILENAME) @@ -639,8 +630,7 @@ def test_to_parquet(): ray_df.to_parquet(TEST_PARQUET_DF_FILENAME) pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME) - assert (test_files_eq(TEST_PARQUET_DF_FILENAME, - TEST_PARQUET_pandas_FILENAME)) + assert test_files_eq(TEST_PARQUET_DF_FILENAME, TEST_PARQUET_pandas_FILENAME) teardown_test_file(TEST_PARQUET_pandas_FILENAME) teardown_test_file(TEST_PARQUET_DF_FILENAME) @@ -663,8 +653,7 @@ def test_to_pickle(): ray_df.to_pickle(TEST_PICKLE_DF_FILENAME) pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME) - assert (test_files_eq(TEST_PICKLE_DF_FILENAME, - TEST_PICKLE_pandas_FILENAME)) + assert test_files_eq(TEST_PICKLE_DF_FILENAME, TEST_PICKLE_pandas_FILENAME) teardown_test_file(TEST_PICKLE_pandas_FILENAME) teardown_test_file(TEST_PICKLE_DF_FILENAME) @@ -680,7 +669,7 @@ def test_to_sql(): ray_df.to_pickle(TEST_SQL_DF_FILENAME) pandas_df.to_pickle(TEST_SQL_pandas_FILENAME) - assert (test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME)) + assert test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME) teardown_test_file(TEST_SQL_DF_FILENAME) teardown_test_file(TEST_SQL_pandas_FILENAME) @@ -696,7 +685,7 @@ def test_to_stata(): ray_df.to_stata(TEST_STATA_DF_FILENAME) pandas_df.to_stata(TEST_STATA_pandas_FILENAME) - assert (test_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME)) + assert test_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME) teardown_test_file(TEST_STATA_pandas_FILENAME) teardown_test_file(TEST_STATA_DF_FILENAME) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 1178056f9c3..3be3a303e41 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -1544,9 +1544,31 @@ def test_plot(): ray_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.plot(None, None, None, None, None, None, None, None, None, - None, None, None, None, None, None, None, None, None, - None, None, None, None, None) + ray_series.plot( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) @pytest.mark.skip(reason="Using pandas Series.") @@ -1714,8 +1736,9 @@ def test_resample(): ray_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.resample(None, None, None, None, None, None, None, None, - None, None, None, None) + ray_series.resample( + None, None, None, None, None, None, None, None, None, None, None, None + ) @pytest.mark.skip(reason="Using pandas Series.") @@ -2003,8 +2026,7 @@ def test_to_csv(): ray_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.to_csv(None, None, None, None, None, None, None, None, None, - None) + ray_series.to_csv(None, None, None, None, None, None, None, None, None, None) @pytest.mark.skip(reason="Using pandas Series.") @@ -2028,8 +2050,22 @@ def test_to_excel(): ray_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.to_excel(None, None, None, None, None, None, None, None, - None, None, None, None, None, None) + ray_series.to_excel( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) @pytest.mark.skip(reason="Using pandas Series.") @@ -2061,9 +2097,26 @@ def test_to_latex(): ray_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.to_latex(None, None, None, None, None, None, None, None, - None, None, None, None, None, None, None, None, - None, None) + ray_series.to_latex( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) @pytest.mark.skip(reason="Using pandas Series.") diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 45769fc9275..7ac6848bc1b 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -49,9 +49,9 @@ def decorator(cls): cls.__doc__ = parent.__doc__ for attr, obj in cls.__dict__.items(): parent_obj = getattr(parent, attr, None) - if parent_obj in excluded or \ - (not callable(parent_obj) and - not isinstance(parent_obj, property)): + if parent_obj in excluded or ( + not callable(parent_obj) and not isinstance(parent_obj, property) + ): continue if callable(obj): obj.__doc__ = parent_obj.__doc__ diff --git a/modin/sql/connection.py b/modin/sql/connection.py index 23c985f3658..7989683f535 100644 --- a/modin/sql/connection.py +++ b/modin/sql/connection.py @@ -33,22 +33,22 @@ def execute(self, query): elif " ".join(split_query[:2]) == "INSERT INTO": self._insert_into(split_query) else: - raise NotImplementedError("This API is for demonstration purposes " - "only. Coming Soon!") + raise NotImplementedError( + "This API is for demonstration purposes " "only. Coming Soon!" + ) def _create_table(self, split_query): - column_names = " ".join(split_query[3:]) \ - .replace("(", "").replace(")", "").split(", ") + column_names = ( + " ".join(split_query[3:]).replace("(", "").replace(")", "").split(", ") + ) columns = Series(column_names) self._tables[split_query[2]] = DataFrame(columns=columns) def _insert_into(self, split_query): table = self._tables[split_query[2]] - values = " ".join(split_query[4:]) \ - .replace("(", "").replace(")", "").split(", ") + values = " ".join(split_query[4:]).replace("(", "").replace(")", "").split(", ") to_append = Series([eval(i) for i in values], index=table.columns) - self._tables[split_query[2]] = \ - table.append(to_append, ignore_index=True) + self._tables[split_query[2]] = table.append(to_append, ignore_index=True) print(self._tables[split_query[2]])