diff --git a/.coveragerc b/.coveragerc index bf9e57e7664..24151ab9edd 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,8 +7,6 @@ omit = # Skip tests modin/pandas/test/* modin/experimental/pandas/test/* - # This is not distributed yet - modin/pandas/series.py # Plotting is not tested modin/pandas/plotting.py # Skip Dask until it is fully a part of the testing suite diff --git a/.travis.yml b/.travis.yml index 32de8a44507..725a3e82208 100644 --- a/.travis.yml +++ b/.travis.yml @@ -87,6 +87,7 @@ script: - if [[ "$MODIN_DF_TEST" == "ONE" ]]; then python -m pytest -n auto --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_dataframe.py::TestDFPartOne --cov-append; fi - if [[ "$MODIN_DF_TEST" == "TWO" ]]; then python -m pytest -n auto --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_dataframe.py::TestDFPartTwo --cov-append; fi - if [[ "$MODIN_DF_TEST" == "ALL" ]]; then python -m pytest -n auto --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_dataframe.py --cov-append; fi + - python -m pytest --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_series.py --cov-append - python -m pytest --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_concat.py --cov-append - python -m pytest --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_groupby.py --cov-append - python -m pytest --disable-pytest-warnings --cov-config=.coveragerc --cov=modin modin/pandas/test/test_reshape.py --cov-append diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 335032dc980..4a7a494111d 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -66,10 +66,10 @@ def _set_columns(self, new_columns): # END dtypes and indexing abstract methods # Metadata modification abstract methods - def add_prefix(self, prefix): + def 
add_prefix(self, prefix, axis=1): raise NotImplementedError("Must be implemented in children classes") - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=1): raise NotImplementedError("Must be implemented in children classes") # END Metadata modification abstract methods @@ -200,213 +200,37 @@ def inter_manager_operations(self, other, how_to_join, func): """ raise NotImplementedError("Must be implemented in children classes") - def add(self, other, **kwargs): - """Adds this manager with other object (manager or scalar). + def binary_op(self, op, other, **kwargs): + """Perform an operation between two objects. + Note: The list of operations is as follows: + - add + - eq + - floordiv + - ge + - gt + - le + - lt + - mod + - mul + - ne + - pow + - rfloordiv + - rmod + - rpow + - rsub + - rtruediv + - sub + - truediv + - __and__ + - __or__ + - __xor__ Args: - other: The other object (manager or scalar). + op: The operation. See list of operations above + other: The object to operate against. Returns: - New DataManager with added data and new index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def div(self, other, **kwargs): - """Divides this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def eq(self, other, **kwargs): - """Compares equality (==) with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def floordiv(self, other, **kwargs): - """Floordivs this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with floordiv-ed data and index. 
- """ - raise NotImplementedError("Must be implemented in children classes") - - def ge(self, other, **kwargs): - """Compares this manager >= than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def gt(self, other, **kwargs): - """Compares this manager > than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def le(self, other, **kwargs): - """Compares this manager < than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def lt(self, other, **kwargs): - """Compares this manager <= than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def mod(self, other, **kwargs): - """Mods this manager against other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with mod-ed data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def mul(self, other, **kwargs): - """Multiplies this manager against other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with multiplied data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def ne(self, other, **kwargs): - """Compares this manager != to other object (manager or scalar). 
- - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def pow(self, other, **kwargs): - """Exponential power of this manager to other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with pow-ed data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def rdiv(self, other, **kwargs): - """Divides other object (manager or scalar) with this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def rfloordiv(self, other, **kwargs): - """Floordivs this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with floordiv-ed data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def rmod(self, other, **kwargs): - """Mods this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with mod data and index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def rpow(self, other, **kwargs): - """Exponential power of other object (manager or scalar) to this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with pow-ed data and new index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def rsub(self, other, **kwargs): - """Subtracts other object (manager or scalar) from this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with subtracted data and new index. 
- """ - raise NotImplementedError("Must be implemented in children classes") - - def sub(self, other, **kwargs): - """Subtracts this manager from other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with subtracted data and new index. - """ - raise NotImplementedError("Must be implemented in children classes") - - def truediv(self, other, **kwargs): - """Divides this manager with other object (manager or scalar). - Functionally same as div - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. + A new QueryCompiler object. """ raise NotImplementedError("Must be implemented in children classes") @@ -868,17 +692,6 @@ def back(self, n): # END head/tail/front/back # Abstract __getitem__ methods - def getitem_single_key(self, key): - """Get item for a single target index. - - Args: - key: Target index by which to retrieve data. - - Returns: - A new Query Compiler. - """ - raise NotImplementedError("Must be implemented in children classes") - def getitem_column_array(self, key): """Get column data for target labels. 
diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index ac107013052..169c2539c5b 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -11,7 +11,6 @@ is_list_like, is_numeric_dtype, is_datetime_or_timedelta_dtype, - is_bool_dtype, ) from pandas.core.index import ensure_index from pandas.core.base import DataError @@ -57,13 +56,20 @@ def _get_dtype(self): elif not self._dtype_cache.equals(self.columns): self._dtype_cache.index = self.columns if calculate_dtype: - map_func = self._prepare_method(lambda df: df.dtypes) def dtype_builder(df): return df.apply(lambda row: find_common_type(row.values), axis=0) - self._dtype_cache = self._full_reduce(0, map_func, dtype_builder) - self._dtype_cache.index = self.columns + map_func = self._prepare_method( + self._build_mapreduce_func(lambda df: df.dtypes) + ) + reduce_func = self._build_mapreduce_func(dtype_builder) + # For now we will use a pandas Series for the dtypes. 
+ self._dtype_cache = ( + self._full_reduce(0, map_func, reduce_func).to_pandas().iloc[0] + ) + # reset name to None because we use "__reduced__" internally + self._dtype_cache.name = None return self._dtype_cache def _set_dtype(self, dtypes): @@ -221,23 +227,31 @@ def numeric_function_clean_dataframe(self, axis): # END Internal methods # Metadata modification methods - def add_prefix(self, prefix): - new_column_names = self.columns.map(lambda x: str(prefix) + str(x)) - new_dtype_cache = self._dtype_cache.copy() - if new_dtype_cache is not None: - new_dtype_cache.index = new_column_names - return self.__constructor__( - self.data, self.index, new_column_names, new_dtype_cache - ) + def add_prefix(self, prefix, axis=1): + if axis == 1: + new_columns = self.columns.map(lambda x: str(prefix) + str(x)) + new_dtype_cache = self._dtype_cache.copy() + if new_dtype_cache is not None: + new_dtype_cache.index = new_columns + new_index = self.index + else: + new_index = self.index.map(lambda x: str(prefix) + str(x)) + new_columns = self.columns + new_dtype_cache = self._dtype_cache.copy() + return self.__constructor__(self.data, new_index, new_columns, new_dtype_cache) - def add_suffix(self, suffix): - new_column_names = self.columns.map(lambda x: str(x) + str(suffix)) - new_dtype_cache = self._dtype_cache.copy() - if new_dtype_cache is not None: - new_dtype_cache.index = new_column_names - return self.__constructor__( - self.data, self.index, new_column_names, new_dtype_cache - ) + def add_suffix(self, suffix, axis=1): + if axis == 1: + new_columns = self.columns.map(lambda x: str(x) + str(suffix)) + new_dtype_cache = self._dtype_cache.copy() + if new_dtype_cache is not None: + new_dtype_cache.index = new_columns + new_index = self.index + else: + new_index = self.index.map(lambda x: str(x) + str(suffix)) + new_columns = self.columns + new_dtype_cache = self._dtype_cache.copy() + return self.__constructor__(self.data, new_index, new_columns, new_dtype_cache) # END Metadata 
modification methods @@ -246,9 +260,13 @@ def add_suffix(self, suffix): # copies if we end up modifying something here. We copy all of the metadata # to prevent that. def copy(self): - return self.__constructor__( + copied = self.__constructor__( self.data.copy(), self.index.copy(), self.columns.copy(), self._dtype_cache ) + # Copy metadata + if self._is_transposed: + copied._is_transposed = True + return copied # END Copy @@ -439,11 +457,13 @@ def reindex_partition(df): if i != 0 or (left_old_idx.equals(joined_index) and not force_repartition): reindex_left = None else: - reindex_left = compute_reindex(left_old_idx) + reindex_left = self._prepare_method(compute_reindex(left_old_idx)) if right_old_idxes[i].equals(joined_index) and not force_repartition: reindex_right = None else: - reindex_right = compute_reindex(right_old_idxes[i]) + reindex_right = other[i]._prepare_method( + compute_reindex(right_old_idxes[i]) + ) reindexed_self, reindexed_other = reindexed_self.copartition_datasets( axis, other[i].data, reindex_left, reindex_right @@ -470,13 +490,9 @@ def to_pandas(self): df = self.data.to_pandas(is_transposed=self._is_transposed) if df.empty: if len(self.columns) != 0: - data = [ - pandas.Series(dtype=self.dtypes[col_name], name=col_name) - for col_name in self.columns - ] - df = pandas.concat(data, axis=1) + df = pandas.DataFrame(columns=self.columns).astype(self.dtypes) else: - df = pandas.DataFrame(index=self.index) + df = pandas.DataFrame(columns=self.columns, index=self.index) else: ErrorMessage.catch_bugs_and_request_email( len(df.index) != len(self.index) or len(df.columns) != len(self.columns) @@ -563,7 +579,6 @@ def _inter_df_op_handler(self, func, other, **kwargs): """ axis = kwargs.get("axis", 0) axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if isinstance(other, type(self)): return self._inter_manager_operations( other, "outer", lambda x, y: func(x, y, **kwargs) @@ -573,233 +588,39 @@ def _inter_df_op_handler(self, 
func, other, **kwargs): axis, other, lambda df: func(df, other, **kwargs) ) - def add(self, other, **kwargs): - """Adds this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with added data and new index. - """ - func = pandas.DataFrame.add - return self._inter_df_op_handler(func, other, **kwargs) - - def div(self, other, **kwargs): - """Divides this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. - """ - func = pandas.DataFrame.div - return self._inter_df_op_handler(func, other, **kwargs) - - def eq(self, other, **kwargs): - """Compares equality (==) with other object (manager or scalar). - + def binary_op(self, op, other, **kwargs): + """Perform an operation between two objects. + + Note: The list of operations is as follows: + - add + - eq + - floordiv + - ge + - gt + - le + - lt + - mod + - mul + - ne + - pow + - rfloordiv + - rmod + - rpow + - rsub + - rtruediv + - sub + - truediv + - __and__ + - __or__ + - __xor__ Args: - other: The other object (manager or scalar). + op: The operation. See list of operations above + other: The object to operate against. Returns: - New DataManager with compared data and index. + A new QueryCompiler object. """ - func = pandas.DataFrame.eq - return self._inter_df_op_handler(func, other, **kwargs) - - def floordiv(self, other, **kwargs): - """Floordivs this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with floordiv-ed data and index. - """ - func = pandas.DataFrame.floordiv - return self._inter_df_op_handler(func, other, **kwargs) - - def ge(self, other, **kwargs): - """Compares this manager >= than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). 
- - Returns: - New DataManager with compared data and index. - """ - func = pandas.DataFrame.ge - return self._inter_df_op_handler(func, other, **kwargs) - - def gt(self, other, **kwargs): - """Compares this manager > than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - func = pandas.DataFrame.gt - return self._inter_df_op_handler(func, other, **kwargs) - - def le(self, other, **kwargs): - """Compares this manager < than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - func = pandas.DataFrame.le - return self._inter_df_op_handler(func, other, **kwargs) - - def lt(self, other, **kwargs): - """Compares this manager <= than other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. - """ - func = pandas.DataFrame.lt - return self._inter_df_op_handler(func, other, **kwargs) - - def mod(self, other, **kwargs): - """Mods this manager against other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with mod-ed data and index. - """ - func = pandas.DataFrame.mod - return self._inter_df_op_handler(func, other, **kwargs) - - def mul(self, other, **kwargs): - """Multiplies this manager against other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with multiplied data and index. - """ - func = pandas.DataFrame.mul - return self._inter_df_op_handler(func, other, **kwargs) - - def ne(self, other, **kwargs): - """Compares this manager != to other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with compared data and index. 
- """ - func = pandas.DataFrame.ne - return self._inter_df_op_handler(func, other, **kwargs) - - def pow(self, other, **kwargs): - """Exponential power of this manager to other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with pow-ed data and index. - """ - func = pandas.DataFrame.pow - return self._inter_df_op_handler(func, other, **kwargs) - - def rdiv(self, other, **kwargs): - """Divides other object (manager or scalar) with this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. - """ - func = pandas.DataFrame.rdiv - return self._inter_df_op_handler(func, other, **kwargs) - - def rfloordiv(self, other, **kwargs): - """Floordivs this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with floordiv-ed data and index. - """ - func = pandas.DataFrame.rfloordiv - return self._inter_df_op_handler(func, other, **kwargs) - - def rmod(self, other, **kwargs): - """Mods this manager with other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with mod data and index. - """ - func = pandas.DataFrame.rmod - return self._inter_df_op_handler(func, other, **kwargs) - - def rpow(self, other, **kwargs): - """Exponential power of other object (manager or scalar) to this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with pow-ed data and new index. - """ - func = pandas.DataFrame.rpow - return self._inter_df_op_handler(func, other, **kwargs) - - def rsub(self, other, **kwargs): - """Subtracts other object (manager or scalar) from this manager. - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with subtracted data and new index. 
- """ - func = pandas.DataFrame.rsub - return self._inter_df_op_handler(func, other, **kwargs) - - def sub(self, other, **kwargs): - """Subtracts this manager from other object (manager or scalar). - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with subtracted data and new index. - """ - func = pandas.DataFrame.sub - return self._inter_df_op_handler(func, other, **kwargs) - - def truediv(self, other, **kwargs): - """Divides this manager with other object (manager or scalar). - Functionally same as div - - Args: - other: The other object (manager or scalar). - - Returns: - New DataManager with divided data and new index. - """ - func = pandas.DataFrame.truediv + func = getattr(pandas.DataFrame, op) return self._inter_df_op_handler(func, other, **kwargs) def clip(self, lower, upper, **kwargs): @@ -903,10 +724,19 @@ def _scalar_operations(self, axis, scalar, func): func: The function to use on the Manager with the scalar. Returns: - New DataManager with updated data and new index. + A new QueryCompiler with updated data and new index. """ if isinstance(scalar, (list, np.ndarray, pandas.Series)): - new_data = self._map_across_full_axis(axis, func) + new_index = self.index if axis == 0 else self.columns + + def list_like_op(df): + if axis == 0: + df.index = new_index + else: + df.columns = new_index + return func(df) + + new_data = self._map_across_full_axis(axis, list_like_op) return self.__constructor__(new_data, self.index, self.columns) else: return self._map_partitions(func) @@ -922,7 +752,7 @@ def reindex(self, axis, labels, **kwargs): labels: New labels to conform 'axis' on to. Returns: - New DataManager with updated data and new index. + A new QueryCompiler with updated data and new index. """ # To reindex, we need a function that will be shipped to each of the @@ -964,7 +794,7 @@ def reset_index(self, **kwargs): """Removes all levels from index and sets a default level_0 index. 
Returns: - New DataManager with updated data and reset index. + A new QueryCompiler with updated data and reset index. """ drop = kwargs.get("drop", False) new_index = pandas.RangeIndex(len(self.index)) @@ -1021,10 +851,8 @@ def transpose(self, *args, **kwargs): # Full Reduce operations # # These operations result in a reduced dimensionality of data. - # Currently, this means a Pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. - def _full_reduce(self, axis, map_func, reduce_func=None, numeric_only=False): + # This will return a new QueryCompiler, which will be handled in the front end. + def _full_reduce(self, axis, map_func, reduce_func=None): """Apply function that will reduce the data to a Pandas Series. Args: @@ -1032,137 +860,102 @@ def _full_reduce(self, axis, map_func, reduce_func=None, numeric_only=False): map_func: Callable function to map the dataframe. reduce_func: Callable function to reduce the dataframe. If none, then apply map_func twice. - numeric_only: Apply only over the numeric rows. Return: - Returns Pandas Series containing the results from map_func and reduce_func. + A new QueryCompiler object containing the results from map_func and + reduce_func. """ - if numeric_only: - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - else: - query_compiler = self if reduce_func is None: reduce_func = map_func - # The XOR here will ensure that we reduce over the correct axis that - # exists on the internal partitions. We flip the axis - mapped_parts = query_compiler.data.map_across_blocks(map_func).partitions - if reduce_func is None: - reduce_func = map_func - # For now we return a pandas.Series until ours gets implemented. 
- # We have to build the intermediate frame based on the axis passed, - # thus axis=axis and axis=axis ^ 1 - # - # This currently requires special treatment because of the intermediate - # DataFrame. The individual partitions return Series objects, and those - # cannot be concatenated the correct way without casting them as - # DataFrames. - full_frame = pandas.concat( - [ - pandas.concat( - [pandas.DataFrame(part.to_pandas()).T for part in row_of_parts], - axis=axis ^ 1, - ) - for row_of_parts in mapped_parts - ], - axis=axis, - ) - - # Transpose because operations where axis == 1 assume that the - # operation is performed across the other axis - if axis == 1: - full_frame = full_frame.T - result = reduce_func(full_frame) - if result.shape == (0,): - return result - elif not axis: - result.index = query_compiler.columns + mapped_parts = self.data.map_across_blocks(map_func) + full_frame = mapped_parts.map_across_full_axis(axis, reduce_func) + if axis == 0: + columns = self.columns + return self.__constructor__( + full_frame, index=["__reduced__"], columns=columns + ) else: - result.index = query_compiler.index - return result - - def _process_min_max(self, func, **kwargs): - """Calculates the min or max of the DataFrame. - - Return: - Pandas series containing the min or max values from each column or - row. 
- """ - # Pandas default is 0 (though not mentioned in docs) - axis = kwargs.get("axis", 0) - numeric_only = True if axis else kwargs.get("numeric_only", False) - - def min_max_builder(df, **kwargs): - if not df.empty: - return func(df, **kwargs) + index = self.index + return self.__constructor__( + full_frame, index=index, columns=["__reduced__"] + ) - map_func = self._prepare_method(min_max_builder, **kwargs) - return self._full_reduce(axis, map_func, numeric_only=numeric_only) + def _build_mapreduce_func(self, func, **kwargs): + def _map_reduce_func(df): + series_result = func(df, **kwargs) + if kwargs.get("axis", 0) == 0 and isinstance(series_result, pandas.Series): + # In the case of axis=0, we need to keep the shape of the data + # consistent with what we have done. In the case of a reduction, the + # data for axis=0 should be a single value for each column. By + # transposing the data after we convert to a DataFrame, we ensure that + # the columns of the result line up with the columns from the data. + # axis=1 does not have this requirement because the index already will + # line up with the index of the data based on how pandas creates a + # DataFrame from a Series. + return pandas.DataFrame(series_result).T + return pandas.DataFrame(series_result) + + return _map_reduce_func def count(self, **kwargs): """Counts the number of non-NaN objects for each column or row. Return: - Pandas series containing counts of non-NaN objects from each column or row. + A new QueryCompiler object containing counts of non-NaN objects from each + column or row. 
""" + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().count(**kwargs) axis = kwargs.get("axis", 0) - numeric_only = kwargs.get("numeric_only", False) - map_func = self._prepare_method(pandas.DataFrame.count, **kwargs) - reduce_func = self._prepare_method(pandas.DataFrame.sum, **kwargs) - return self._full_reduce(axis, map_func, reduce_func, numeric_only) + map_func = self._build_mapreduce_func(pandas.DataFrame.count, **kwargs) + reduce_func = self._build_mapreduce_func(pandas.DataFrame.sum, **kwargs) + return self._full_reduce(axis, map_func, reduce_func) def max(self, **kwargs): """Returns the maximum value for each column or row. Return: - Pandas series with the maximum values from each column or row. + A new QueryCompiler object with the maximum values from each column or row. """ - return self._process_min_max(pandas.DataFrame.max, **kwargs) + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().max(**kwargs) + mapreduce_func = self._build_mapreduce_func(pandas.DataFrame.max, **kwargs) + return self._full_reduce(kwargs.get("axis", 0), mapreduce_func) def mean(self, **kwargs): """Returns the mean for each numerical column or row. Return: - Pandas series containing the mean from each numerical column or row. + A new QueryCompiler object containing the mean from each numerical column or + row. """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().mean(**kwargs) # Pandas default is 0 (though not mentioned in docs) axis = kwargs.get("axis", 0) sums = self.sum(**kwargs) counts = self.count(axis=axis, numeric_only=kwargs.get("numeric_only", None)) - try: - # If we need to drop any columns, it will throw a TypeError - return sums.divide(counts) - # In the case that a TypeError is thrown, we need to iterate through, similar to - # how pandas does and do the division only on things that can be divided. 
- # NOTE: We will only hit this condition if numeric_only is not True. - except TypeError: - - def can_divide(l, r): - try: - pandas.Series([l]).divide(r) - except TypeError: - return False - return True - - # Iterate through the sums to check that we can divide them. If not, then - # drop the record. This matches pandas behavior. - return pandas.Series( - { - idx: sums[idx] / counts[idx] - for idx in sums.index - if can_divide(sums[idx], counts[idx]) - } - ) + if sums._is_transposed and counts._is_transposed: + sums = sums.transpose() + counts = counts.transpose() + result = sums.binary_op("truediv", counts, axis=axis) + return result.transpose() if axis == 0 else result def min(self, **kwargs): """Returns the minimum from each column or row. Return: - Pandas series with the minimum value from each column or row. + A new QueryCompiler object with the minimum value from each column or row. """ - return self._process_min_max(pandas.DataFrame.min, **kwargs) + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().min(**kwargs) + mapreduce_func = self._build_mapreduce_func(pandas.DataFrame.min, **kwargs) + return self._full_reduce(kwargs.get("axis", 0), mapreduce_func) def _process_sum_prod(self, func, **kwargs): """Calculates the sum or product of the DataFrame. @@ -1171,51 +964,83 @@ def _process_sum_prod(self, func, **kwargs): func: Pandas func to apply to DataFrame. ignore_axis: Whether to ignore axis when raising TypeError Return: - Pandas Series with sum or prod of DataFrame. + A new QueryCompiler object with sum or prod of the object. 
""" axis = kwargs.get("axis", 0) - numeric_only = kwargs.get("numeric_only", None) if not axis else True min_count = kwargs.get("min_count", 0) - reduce_index = self.columns if axis else self.index - - if numeric_only: - result, query_compiler = self.numeric_function_clean_dataframe(axis) - else: - query_compiler = self - new_index = query_compiler.index if axis else query_compiler.columns def sum_prod_builder(df, **kwargs): - if not df.empty: - return func(df, **kwargs) - else: - return pandas.DataFrame([]) - - map_func = self._prepare_method(sum_prod_builder, **kwargs) + return func(df, **kwargs) if min_count <= 1: - return self._full_reduce(axis, map_func, numeric_only=numeric_only) - elif min_count > len(reduce_index): - return pandas.Series( - [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") - ) + return self._full_reduce(axis, sum_prod_builder) else: - return self._full_axis_reduce(map_func, axis) + return self._full_axis_reduce(axis, sum_prod_builder) def prod(self, **kwargs): """Returns the product of each numerical column or row. Return: - Pandas series with the product of each numerical column or row. + A new QueryCompiler object with the product of each numerical column or row. """ - return self._process_sum_prod(pandas.DataFrame.prod, **kwargs) + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().prod(**kwargs) + return self._process_sum_prod( + self._build_mapreduce_func(pandas.DataFrame.prod, **kwargs), **kwargs + ) def sum(self, **kwargs): """Returns the sum of each numerical column or row. Return: - Pandas series with the sum of each numerical column or row. + A new QueryCompiler object with the sum of each numerical column or row. 
+ """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().sum(**kwargs) + return self._process_sum_prod( + self._build_mapreduce_func(pandas.DataFrame.sum, **kwargs), **kwargs + ) + + def _process_all_any(self, func, **kwargs): + """Calculates if any or all the values are true. + + Return: + A new QueryCompiler object containing boolean values or boolean. + """ + axis = kwargs.get("axis", 0) + axis = 0 if axis is None else axis + kwargs["axis"] = axis + builder_func = self._build_mapreduce_func(func, **kwargs) + return self._full_reduce(axis, builder_func) + + def all(self, **kwargs): + """Returns whether all the elements are true, potentially over an axis. + + Return: + A new QueryCompiler object containing boolean values or boolean. + """ + if self._is_transposed: + # Pandas ignores on axis=1 + kwargs["bool_only"] = False + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().all(**kwargs) + return self._process_all_any(lambda df, **kwargs: df.all(**kwargs), **kwargs) + + def any(self, **kwargs): + """Returns whether any the elements are true, potentially over an axis. + + Return: + A new QueryCompiler object containing boolean values or boolean. 
""" - return self._process_sum_prod(pandas.DataFrame.sum, **kwargs) + if self._is_transposed: + if kwargs.get("axis", 0) == 1: + # Pandas ignores on axis=1 + kwargs["bool_only"] = False + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().any(**kwargs) + return self._process_all_any(lambda df, **kwargs: df.any(**kwargs), **kwargs) # END Full Reduce operations @@ -1234,6 +1059,10 @@ def applymap(self, func): remote_func = self._prepare_method(pandas.DataFrame.applymap, func=func) return self._map_partitions(remote_func) + def invert(self): + remote_func = self._prepare_method(pandas.DataFrame.__invert__) + return self._map_partitions(remote_func) + def isin(self, **kwargs): func = self._prepare_method(pandas.DataFrame.isin, **kwargs) new_dtypes = pandas.Series( @@ -1259,13 +1088,6 @@ def notna(self): ) return self._map_partitions(func, new_dtypes=new_dtypes) - def notnull(self): - func = self._prepare_method(pandas.DataFrame.notnull) - new_dtypes = pandas.Series( - [np.dtype("bool") for _ in self.columns], index=self.columns - ) - return self._map_partitions(func, new_dtypes=new_dtypes) - def round(self, **kwargs): func = self._prepare_method(pandas.DataFrame.round, **kwargs) return self._map_partitions(func, new_dtypes=self._dtype_cache) @@ -1317,7 +1139,6 @@ def astype(df, internal_indices=[]): new_data = new_data.apply_func_to_select_indices( 0, astype, dtype_indices[dtype], keep_remaining=True ) - return self.__constructor__(new_data, self.index, self.columns, new_dtypes) # END Map partitions across select indices @@ -1325,10 +1146,8 @@ def astype(df, internal_indices=[]): # Column/Row partitions reduce operations # # These operations result in a reduced dimensionality of data. - # Currently, this means a Pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. 
- def _full_axis_reduce(self, func, axis, alternate_index=None): + # This will return a new QueryCompiler object which the front end will handle. + def _full_axis_reduce(self, axis, func, alternate_index=None): """Applies map that reduce Manager to series but require knowledge of full axis. Args: @@ -1340,88 +1159,13 @@ def _full_axis_reduce(self, func, axis, alternate_index=None): Return: Pandas series containing the reduced data. """ - # We XOR with axis because if we are doing an operation over the columns - # (i.e. along the rows), we want to take the transpose so that the - # results from the same parition will be concated together first. - # We need this here because if the operations is over the columns, - # map_across_full_axis does not transpose the result before returning. - result = self.data.map_across_full_axis(axis, func).to_pandas( - self._is_transposed ^ axis - ) - if result.empty: - return result - if not axis: - result.index = ( - alternate_index if alternate_index is not None else self.columns - ) - else: - result.index = ( - alternate_index if alternate_index is not None else self.index - ) - return result - - def all(self, **kwargs): - """Returns whether all the elements are true, potentially over an axis. - - Return: - Pandas Series containing boolean values or boolean. - """ - return self._process_all_any(lambda df, **kwargs: df.all(**kwargs), **kwargs) - - def any(self, **kwargs): - """Returns whether any the elements are true, potentially over an axis. - - Return: - Pandas Series containing boolean values or boolean. - """ - return self._process_all_any(lambda df, **kwargs: df.any(**kwargs), **kwargs) - - def _process_all_any(self, func, **kwargs): - """Calculates if any or all the values are true. - - Return: - Pandas Series containing boolean values or boolean. 
- """ - axis = kwargs.get("axis", 0) - axis_none = True if axis is None else False - axis = 0 if axis is None else axis - kwargs["axis"] = axis - bool_only = kwargs.get("bool_only", None) - kwargs["bool_only"] = False if bool_only is None else bool_only - - not_bool_col = [] - numeric_col_count = 0 - for col, dtype in zip(self.columns, self.dtypes): - if not is_bool_dtype(dtype): - not_bool_col.append(col) - numeric_col_count += 1 if is_numeric_dtype(dtype) else 0 - - if bool_only: - if axis == 0 and not axis_none and len(not_bool_col) == len(self.columns): - return pandas.Series(dtype=bool) - if len(not_bool_col) == len(self.columns): - query_compiler = self - else: - query_compiler = self.drop(columns=not_bool_col) - else: - if ( - bool_only is False - and axis_none - and len(not_bool_col) == len(self.columns) - and numeric_col_count != len(self.columns) - ): - if func == pandas.DataFrame.all: - return self.getitem_single_key(self.columns[-1])[self.index[-1]] - elif func == pandas.DataFrame.any: - return self.getitem_single_key(self.columns[0])[self.index[0]] - query_compiler = self - - builder_func = query_compiler._prepare_method(func, **kwargs) - result = query_compiler._full_axis_reduce(builder_func, axis) - if axis_none: - return func(result) + result = self.data.map_across_full_axis(axis, func) + if axis == 0: + columns = alternate_index if alternate_index is not None else self.columns + return self.__constructor__(result, index=["__reduced__"], columns=columns) else: - return result + index = alternate_index if alternate_index is not None else self.index + return self.__constructor__(result, index=index, columns=["__reduced__"]) def first_valid_index(self): """Returns index of first non-NaN/NULL value. @@ -1429,88 +1173,64 @@ def first_valid_index(self): Return: Scalar of index name. """ - # It may be possible to incrementally check each partition, but this # computation is fairly cheap. 
def first_valid_index_builder(df): df.index = pandas.RangeIndex(len(df.index)) return df.apply(lambda df: df.first_valid_index()) - func = self._prepare_method(first_valid_index_builder) + func = self._build_mapreduce_func(first_valid_index_builder) # We get the minimum from each column, then take the min of that to get - # first_valid_index. - first_result = self._full_axis_reduce(func, 0) + # first_valid_index. The `to_pandas()` here is just for a single value and + # `squeeze` will convert it to a scalar. + first_result = self._full_axis_reduce(0, func).min(axis=1).to_pandas().squeeze() + return self.index[first_result] - return self.index[first_result.min()] - - def _post_process_idx_ops(self, axis, intermediate_result): - """Converts internal index to external index. - - Args: - axis: 0 for columns and 1 for rows. Defaults to 0. - intermediate_result: Internal index of self.data. + def idxmax(self, **kwargs): + """Returns the first occurrence of the maximum over requested axis. Returns: - External index of the intermediate_result. + A new QueryCompiler object containing the maximum of each column or axis. """ - index = self.index if not axis else self.columns - - def apply_index(x): - try: - return index[x] if x is not np.nan else x - # These can happen even if x is a nan because of how pandas classifies nans - # as of 0.24. - except (ValueError, IndexError): - return x - - result = intermediate_result.apply(apply_index) - return result + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().idxmax(**kwargs) - def idxmax(self, **kwargs): - """Returns the first occurance of the maximum over requested axis. + axis = kwargs.get("axis", 0) + index = self.index if axis == 0 else self.columns - Returns: - Series containing the maximum of each column or axis. - """ - # The reason for the special treatment with idxmax/min is because we - # need to communicate the row number back here. 
- def idxmax_builder(df, axis=0, **kwargs): + def idxmax_builder(df, **kwargs): if axis == 0: - df.index = pandas.RangeIndex(len(df.index)) + df.index = index else: - df.columns = pandas.RangeIndex(len(df.columns)) - return df.idxmax(axis=axis, **kwargs) + df.columns = index + return df.idxmax(**kwargs) - axis = kwargs.pop("axis", 0) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - func = self._prepare_method(idxmax_builder, axis=axis, **kwargs) - max_result = self._full_axis_reduce(func, axis) - # Because our internal partitions don't track the external index, we - # have to do a conversion. - return self._post_process_idx_ops(axis, max_result) + func = self._build_mapreduce_func(idxmax_builder, **kwargs) + return self._full_axis_reduce(axis, func) def idxmin(self, **kwargs): - """Returns the first occurance of the minimum over requested axis. + """Returns the first occurrence of the minimum over requested axis. Returns: - Series containing the minimum of each column or axis. + A new QueryCompiler object containing the minimum of each column or axis. """ - # The reason for the special treatment with idxmax/min is because we - # need to communicate the row number back here. 
- def idxmin_builder(df, axis=0, **kwargs): + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().idxmin(**kwargs) + + axis = kwargs.get("axis", 0) + index = self.index if axis == 0 else self.columns + + def idxmin_builder(df, **kwargs): if axis == 0: - df.index = pandas.RangeIndex(len(df.index)) + df.index = index else: - df.columns = pandas.RangeIndex(len(df.columns)) - return df.idxmin(axis=axis, **kwargs) + df.columns = index + return df.idxmin(**kwargs) - axis = kwargs.pop("axis", 0) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - func = self._prepare_method(idxmin_builder, axis=axis, **kwargs) - min_result = self._full_axis_reduce(func, axis) - # Because our internal partitions don't track the external index, we - # have to do a conversion. - return self._post_process_idx_ops(axis, min_result) + func = self._build_mapreduce_func(idxmin_builder, **kwargs) + return self._full_axis_reduce(axis, func) def last_valid_index(self): """Returns index of last non-NaN/NULL value. @@ -1523,67 +1243,65 @@ def last_valid_index_builder(df): df.index = pandas.RangeIndex(len(df.index)) return df.apply(lambda df: df.last_valid_index()) - func = self._prepare_method(last_valid_index_builder) + func = self._build_mapreduce_func(last_valid_index_builder) # We get the maximum from each column, then take the max of that to get - # last_valid_index. - first_result = self._full_axis_reduce(func, 0) - - return self.index[first_result.max()] + # last_valid_index. The `to_pandas()` here is just for a single value and + # `squeeze` will convert it to a scalar. + first_result = self._full_axis_reduce(0, func).max(axis=1).to_pandas().squeeze() + return self.index[first_result] def median(self, **kwargs): """Returns median of each column or row. Returns: - Series containing the median of each column or row. + A new QueryCompiler object containing the median of each column or row. 
""" + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().median(**kwargs) # Pandas default is 0 (though not mentioned in docs) axis = kwargs.get("axis", 0) - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - func = self._prepare_method(pandas.DataFrame.median, **kwargs) - return query_compiler._full_axis_reduce(func, axis) + func = self._build_mapreduce_func(pandas.DataFrame.median, **kwargs) + return self._full_axis_reduce(axis, func) def memory_usage(self, **kwargs): """Returns the memory usage of each column. Returns: - Series containing the memory usage of each column. + A new QueryCompiler object containing the memory usage of each column. """ def memory_usage_builder(df, **kwargs): - return df.memory_usage(index=False, deep=deep) + return df.memory_usage(**kwargs) - deep = kwargs.get("deep", False) - func = self._prepare_method(memory_usage_builder, **kwargs) - return self._full_axis_reduce(func, 0) + func = self._build_mapreduce_func(memory_usage_builder, **kwargs) + return self._full_axis_reduce(0, func) def nunique(self, **kwargs): """Returns the number of unique items over each column or row. Returns: - Series of ints indexed by column or index names. + A new QueryCompiler object of ints indexed by column or index names. """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().nunique(**kwargs) axis = kwargs.get("axis", 0) - func = self._prepare_method(pandas.DataFrame.nunique, **kwargs) - return self._full_axis_reduce(func, axis) + func = self._build_mapreduce_func(pandas.DataFrame.nunique, **kwargs) + return self._full_axis_reduce(axis, func) def quantile_for_single_value(self, **kwargs): """Returns quantile of each column or row. Returns: - Series containing the quantile of each column or row. + A new QueryCompiler object containing the quantile of each column or row. 
""" + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().quantile_for_single_value(**kwargs) axis = kwargs.get("axis", 0) q = kwargs.get("q", 0.5) - numeric_only = kwargs.get("numeric_only", True) assert type(q) is float - if numeric_only: - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - else: - query_compiler = self def quantile_builder(df, **kwargs): try: @@ -1591,90 +1309,74 @@ def quantile_builder(df, **kwargs): except ValueError: return pandas.Series() - func = self._prepare_method(quantile_builder, **kwargs) - result = query_compiler._full_axis_reduce(func, axis) - result.name = q + func = self._build_mapreduce_func(quantile_builder, **kwargs) + result = self._full_axis_reduce(axis, func) + if axis == 0: + result.index = [q] + else: + result.columns = [q] return result def skew(self, **kwargs): """Returns skew of each column or row. Returns: - Series containing the skew of each column or row. + A new QueryCompiler object containing the skew of each column or row. """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().skew(**kwargs) # Pandas default is 0 (though not mentioned in docs) axis = kwargs.get("axis", 0) - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - func = self._prepare_method(pandas.DataFrame.skew, **kwargs) - return query_compiler._full_axis_reduce(func, axis) + func = self._build_mapreduce_func(pandas.DataFrame.skew, **kwargs) + return self._full_axis_reduce(axis, func) def std(self, **kwargs): """Returns standard deviation of each column or row. Returns: - Series containing the standard deviation of each column or row. + A new QueryCompiler object containing the standard deviation of each column + or row. 
""" + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().std(**kwargs) # Pandas default is 0 (though not mentioned in docs) axis = kwargs.get("axis", 0) - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - func = self._prepare_method(pandas.DataFrame.std, **kwargs) - return query_compiler._full_axis_reduce(func, axis) - - def to_datetime(self, **kwargs): - """Converts the Manager to a Series of DateTime objects. - - Returns: - Series of DateTime objects. - """ - columns = self.columns - - def to_datetime_builder(df, **kwargs): - df.columns = columns - return pandas.to_datetime(df, **kwargs) - - func = self._prepare_method(to_datetime_builder, **kwargs) - return self._full_axis_reduce(func, 1) + func = self._build_mapreduce_func(pandas.DataFrame.std, **kwargs) + return self._full_axis_reduce(axis, func) def var(self, **kwargs): """Returns variance of each column or row. Returns: - Series containing the variance of each column or row. + A new QueryCompiler object containing the variance of each column or row. """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().var(**kwargs) # Pandas default is 0 (though not mentioned in docs) axis = kwargs.get("axis", 0) - result, query_compiler = self.numeric_function_clean_dataframe(axis) - if result is not None: - return result - func = query_compiler._prepare_method(pandas.DataFrame.var, **kwargs) - return query_compiler._full_axis_reduce(func, axis) + func = self._build_mapreduce_func(pandas.DataFrame.var, **kwargs) + return self._full_axis_reduce(axis, func) # END Column/Row partitions reduce operations # Column/Row partitions reduce operations over select indices # # These operations result in a reduced dimensionality of data. 
- # Currently, this means a Pandas Series will be returned, but in the future - # we will implement a Distributed Series, and this will be returned - # instead. - def _full_axis_reduce_along_select_indices( - self, func, axis, index, pandas_result=True - ): + # This will return a new QueryCompiler object which the front end will handle. + def _full_axis_reduce_along_select_indices(self, func, axis, index): """Reduce Manger along select indices using function that needs full axis. Args: - func: Callable that reduces Manager to Series using full knowledge of an - axis. + func: Callable that reduces the dimension of the object and requires full + knowledge of the entire axis. axis: 0 for columns and 1 for rows. Defaults to 0. - index: Index of the resulting series. - pandas_result: Return the result as a Pandas Series instead of raw data. + index: Index of the resulting QueryCompiler. Returns: - Either a Pandas Series with index or BaseFrameManager object. + A new QueryCompiler object with index or BaseFrameManager object. 
""" # Convert indices to numeric indices old_index = self.index if axis else self.columns @@ -1682,9 +1384,6 @@ def _full_axis_reduce_along_select_indices( result = self.data.apply_func_to_select_indices_along_full_axis( axis, func, numeric_indices ) - if pandas_result: - result = result.to_pandas(self._is_transposed) - result.index = index return result def describe(self, **kwargs): @@ -1706,9 +1405,7 @@ def describe_builder(df, internal_indices=[], **kwargs): # Apply describe and update indices, columns, and dtypes func = self._prepare_method(describe_builder, **kwargs) - new_data = self._full_axis_reduce_along_select_indices( - func, 0, new_columns, False - ) + new_data = self._full_axis_reduce_along_select_indices(func, 0, new_columns) new_index = self.compute_index(0, new_data, False) return self.__constructor__(new_data, new_index, new_columns) @@ -1729,33 +1426,47 @@ def _cumulative_builder(self, func, **kwargs): new_data, self.index, self.columns, self._dtype_cache ) - def cumsum(self, **kwargs): - return self._cumulative_builder(pandas.DataFrame.cumsum, **kwargs) - def cummax(self, **kwargs): + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().cummax(**kwargs).transpose() return self._cumulative_builder(pandas.DataFrame.cummax, **kwargs) def cummin(self, **kwargs): + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().cummin(**kwargs).transpose() return self._cumulative_builder(pandas.DataFrame.cummin, **kwargs) + def cumsum(self, **kwargs): + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().cumsum(**kwargs).transpose() + return self._cumulative_builder(pandas.DataFrame.cumsum, **kwargs) + def cumprod(self, **kwargs): + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().cumprod(**kwargs).transpose() return self._cumulative_builder(pandas.DataFrame.cumprod, **kwargs) def diff(self, 
**kwargs): - + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().diff(**kwargs).transpose() axis = kwargs.get("axis", 0) func = self._prepare_method(pandas.DataFrame.diff, **kwargs) new_data = self._map_across_full_axis(axis, func) return self.__constructor__(new_data, self.index, self.columns) def dropna(self, **kwargs): - """Returns a new DataManager with null values dropped along given axis. + """Returns a new QueryCompiler with null values dropped along given axis. Return: a new DataManager """ axis = kwargs.get("axis", 0) - subset = kwargs.get("subset") - thresh = kwargs.get("thresh") + subset = kwargs.get("subset", None) + thresh = kwargs.get("thresh", None) how = kwargs.get("how", "any") # We need to subset the axis that we care about with `subset`. This # will be used to determine the number of values that are NA. @@ -1776,11 +1487,15 @@ def dropna(self, **kwargs): # Count the number of NA values and specify which are higher than # thresh. drop_values = { - ax ^ 1: compute_na.isna().sum(axis=ax ^ 1) > thresh for ax in axis + ax ^ 1: compute_na.isna().sum(axis=ax ^ 1).to_pandas().squeeze() + > thresh + for ax in axis } else: drop_values = { - ax ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1) for ax in axis + ax + ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1).to_pandas().squeeze() + for ax in axis } if 0 not in drop_values: @@ -1812,16 +1527,14 @@ def dropna(self, **kwargs): return self.drop(index=rm_from_index, columns=rm_from_columns) def eval(self, expr, **kwargs): - """Returns a new DataManager with expr evaluated on columns. + """Returns a new QueryCompiler with expr evaluated on columns. Args: expr: The string expression to evaluate. Returns: - A new PandasDataManager with new columns after applying expr. + A new QueryCompiler with new columns after applying expr. 
""" - inplace = kwargs.get("inplace", False) - columns = self.index if self._is_transposed else self.columns index = self.columns if self._is_transposed else self.index @@ -1830,37 +1543,31 @@ def eval(self, expr, **kwargs): columns_copy = pandas.DataFrame(columns=self.columns) columns_copy = columns_copy.eval(expr, inplace=False, **kwargs) expect_series = isinstance(columns_copy, pandas.Series) - # if there is no assignment, then we simply save the results - # in the first column - if expect_series: - if inplace: - raise ValueError("Cannot operate inplace if there is no assignment") - else: - expr = "{0} = {1}".format(columns[0], expr) def eval_builder(df, **kwargs): + # pop the `axis` parameter because it was needed to build the mapreduce + # function but it is not a parameter used by `eval`. + kwargs.pop("axis", None) df.columns = columns result = df.eval(expr, inplace=False, **kwargs) - result.columns = pandas.RangeIndex(0, len(result.columns)) return result - func = self._prepare_method(eval_builder, **kwargs) + func = self._build_mapreduce_func(eval_builder, axis=1, **kwargs) new_data = self._map_across_full_axis(1, func) if expect_series: - result = new_data.to_pandas()[0] - result.name = columns_copy.name - result.index = index - return result + new_columns = [columns_copy.name] + new_index = index else: - columns = columns_copy.columns - return self.__constructor__(new_data, self.index, columns) + new_columns = columns_copy.columns + new_index = self.index + return self.__constructor__(new_data, new_index, new_columns) def mode(self, **kwargs): - """Returns a new DataManager with modes calculated for each label along given axis. + """Returns a new QueryCompiler with modes calculated for each label along given axis. Returns: - A new PandasDataManager with modes calculated. + A new QueryCompiler with modes calculated. 
""" axis = kwargs.get("axis", 0) @@ -1881,7 +1588,7 @@ def mode_builder(df, **kwargs): index=result.index, ) result = pandas.concat([result, append_vals], axis=1) - return result + return pandas.DataFrame(result) func = self._prepare_method(mode_builder, **kwargs) new_data = self._map_across_full_axis(axis, func) @@ -1896,7 +1603,7 @@ def fillna(self, **kwargs): """Replaces NaN values with the method provided. Returns: - A new PandasDataManager with null values filled. + A new QueryCompiler with null values filled. """ axis = kwargs.get("axis", 0) value = kwargs.get("value") @@ -2039,6 +1746,9 @@ def quantile_for_list_of_values(self, **kwargs): Returns: DataManager containing quantiles of original DataManager along an axis. """ + if self._is_transposed: + kwargs["axis"] = kwargs.get("axis", 0) ^ 1 + return self.transpose().quantile_for_list_of_values(**kwargs) axis = kwargs.get("axis", 0) q = kwargs.get("q") numeric_only = kwargs.get("numeric_only", True) @@ -2135,22 +1845,29 @@ def tail(self, n): # See head for an explanation of the transposed behavior if n < 0: n = max(0, len(self.index) + n) - if n == 0: - index = pandas.Index([]) - else: - index = self.index[-n:] if self._is_transposed: result = self.__constructor__( self.data.transpose().take(1, -n).transpose(), - index, + self.index[:-n], self.columns, self._dtype_cache, ) result._is_transposed = True else: - result = self.__constructor__( - self.data.take(0, -n), index, self.columns, self._dtype_cache - ) + if n == 0: + result = self.__constructor__( + self.data.take(0, 0), + self.index[:0], + self.columns, + self._dtype_cache, + ) + else: + result = self.__constructor__( + self.data.take(0, -n), + self.index[-n:], + self.columns, + self._dtype_cache, + ) return result @@ -2211,24 +1928,6 @@ def back(self, n): # End Head/Tail/Front/Back # __getitem__ methods - def getitem_single_key(self, key): - """Get item for a single target index. - - Args: - key: Target index by which to retrieve data. 
- - Returns: - A new PandasDataManager. - """ - new_data = self.getitem_column_array([key]) - if len(self.columns.get_indexer_for([key])) > 1: - return new_data - else: - # This is the case that we are returning a single Series. - # We do this post processing because everything is treated a a list - # from here on, and that will result in a DataFrame. - return new_data.to_pandas()[key] - def getitem_column_array(self, key): """Get column data for target labels. @@ -2236,7 +1935,7 @@ def getitem_column_array(self, key): key: Target labels by which to retrieve data. Returns: - A new PandasDataManager. + A new QueryCompiler. """ # Convert to list for type checking numeric_indices = list(self.columns.get_indexer_for(key)) @@ -2262,7 +1961,7 @@ def getitem_row_array(self, key): key: Target numeric indices by which to retrieve data. Returns: - A new PandasDataManager. + A new QueryCompiler. """ # Convert to list for type checking key = list(key) @@ -2278,7 +1977,7 @@ def getitem(df, internal_indices=[]): new_index = self.index[key] return self.__constructor__(result, new_index, self.columns, self._dtype_cache) - def setitem(self, key, value): + def setitem(self, axis, key, value): """Set the column defined by `key` to the `value` provided. Args: @@ -2286,26 +1985,35 @@ def setitem(self, key, value): value: The value to set the column to. 
Returns: - A new PandasDataManager + A new QueryCompiler """ def setitem(df, internal_indices=[]): if len(internal_indices) == 1: - df[df.columns[internal_indices[0]]] = value + if axis == 0: + df[df.columns[internal_indices[0]]] = value + else: + df.iloc[internal_indices[0]] = value else: - df[df.columns[internal_indices]] = value + if axis == 0: + df[df.columns[internal_indices]] = value + else: + df.iloc[internal_indices] = value return df - numeric_indices = list(self.columns.get_indexer_for([key])) + if axis == 0: + numeric_indices = list(self.columns.get_indexer_for([key])) + else: + numeric_indices = list(self.index.get_indexer_for([key])) prepared_func = self._prepare_method(setitem) if is_list_like(value): value = list(value) new_data = self.data.apply_func_to_select_indices_along_full_axis( - 0, prepared_func, numeric_indices, keep_remaining=True + axis, prepared_func, numeric_indices, keep_remaining=True ) else: new_data = self.data.apply_func_to_select_indices( - 0, prepared_func, numeric_indices, keep_remaining=True + axis, prepared_func, numeric_indices, keep_remaining=True ) return self.__constructor__(new_data, self.index, self.columns) @@ -2321,7 +2029,7 @@ def drop(self, index=None, columns=None): columns: Target columns to drop. Returns: - A new PandasDataManager. + A new QueryCompiler. """ if index is None: new_data = self.data @@ -2374,9 +2082,10 @@ def insert(self, loc, column, value): A new PandasQueryCompiler with new data inserted. """ if is_list_like(value): - from modin.pandas.series import SeriesView - - if isinstance(value, (pandas.Series, SeriesView)): + # TODO make work with another querycompiler object as `value`. + # This will require aligning the indices with a `reindex` and ensuring that + # the data is partitioned identically. 
+ if isinstance(value, pandas.Series): value = value.reindex(self.index) value = list(value) @@ -2393,7 +2102,6 @@ def insert(df, internal_indices=[]): 0, insert, loc, keep_remaining=True ) new_columns = self.columns.insert(loc, column) - return self.__constructor__(new_data, self.index, new_columns) # END Insert @@ -2459,17 +2167,6 @@ def _post_process_apply(self, result_data, axis, try_scale=True): index = internal_index else: index = self.index - # `apply` and `aggregate` can return a Series or a DataFrame object, - # and since we need to handle each of those differently, we have to add - # this logic here. - if len(columns) == 0: - series_result = result_data.to_pandas(False) - if not axis and len(series_result) == len(self.columns): - index = self.columns - elif axis and len(series_result) == len(self.index): - index = self.index - series_result.index = index - return series_result return self.__constructor__(result_data, index, columns) def _dict_func(self, func, axis, *args, **kwargs): @@ -2492,18 +2189,14 @@ def _dict_func(self, func, axis, *args, **kwargs): func = {idx: func[key] for key in func for idx in index.get_indexer_for([key])} def dict_apply_builder(df, func_dict={}): - return df.apply(func_dict, *args, **kwargs) + # Sometimes `apply` can return a `Series`, but we require that internally + # all objects are `DataFrame`s. + return pandas.DataFrame(df.apply(func_dict, *args, **kwargs)) result_data = self.data.apply_func_to_select_indices_along_full_axis( axis, dict_apply_builder, func, keep_remaining=False ) full_result = self._post_process_apply(result_data, axis) - # The columns can get weird because we did not broadcast them to the - # partitions and we do not have any guarantee that they are correct - # until here. Fortunately, the keys of the function will tell us what - # the columns are. 
- if isinstance(full_result, pandas.Series): - full_result.index = [self.columns[idx] for idx in func] return full_result def _list_like_func(self, func, axis, *args, **kwargs): @@ -2517,7 +2210,7 @@ def _list_like_func(self, func, axis, *args, **kwargs): A new PandasQueryCompiler. """ func_prepared = self._prepare_method( - lambda df: df.apply(func, axis, *args, **kwargs) + lambda df: pandas.DataFrame(df.apply(func, axis, *args, **kwargs)) ) new_data = self._map_across_full_axis(axis, func_prepared) # When the function is list-like, the function names become the index/columns @@ -2544,21 +2237,18 @@ def _callable_func(self, func, axis, *args, **kwargs): A new PandasQueryCompiler. """ - def callable_apply_builder(df, func, axis, index, *args, **kwargs): + def callable_apply_builder(df, axis=0): if not axis: df.index = index df.columns = pandas.RangeIndex(len(df.columns)) else: df.columns = index df.index = pandas.RangeIndex(len(df.index)) - result = df.apply(func, axis=axis, *args, **kwargs) return result index = self.index if not axis else self.columns - func_prepared = self._prepare_method( - lambda df: callable_apply_builder(df, func, axis, index, *args, **kwargs) - ) + func_prepared = self._build_mapreduce_func(callable_apply_builder, axis=axis) result_data = self._map_across_full_axis(axis, func_prepared) return self._post_process_apply(result_data, axis) @@ -2643,7 +2333,7 @@ def get_dummies(self, columns, **kwargs): columns: The columns to convert. Returns: - A new PandasDataManager. + A new QueryCompiler. 
""" cls = type(self) # `columns` as None does not mean all columns, by default it means only @@ -2717,6 +2407,8 @@ def get_dummies_builder(df, internal_indices=[]): # Indexing def view(self, index=None, columns=None): + if self._is_transposed: + return self.transpose().view(columns=index, index=columns) index_map_series = pandas.Series(np.arange(len(self.index)), index=self.index) column_map_series = pandas.Series( np.arange(len(self.columns)), index=self.columns @@ -2729,31 +2421,11 @@ def view(self, index=None, columns=None): self.data, index_map_series.index, column_map_series.index, - self.dtypes, + self._dtype_cache, index_map_series, column_map_series, ) - def squeeze(self, ndim=0, axis=None): - to_squeeze = self.to_pandas() - # This is the case for 1xN or Nx1 DF - Need to call squeeze - if ndim == 1: - if axis is None: - axis = 0 if self.data.shape[1] > 1 else 1 - squeezed = pandas.Series(to_squeeze.squeeze()) - # In the case of `MultiIndex`, we already have the correct index and naming - # because we are going from pandas above. This step is to correct the - # `Series` to have the correct name and index. 
- if not isinstance(squeezed.index, pandas.MultiIndex): - scaler_axis = self.columns if axis else self.index - non_scaler_axis = self.index if axis else self.columns - squeezed.name = scaler_axis[0] - squeezed.index = non_scaler_axis - return squeezed - # This is the case for a 1x1 DF - We don't need to squeeze - else: - return to_squeeze.values[0][0] - def write_items(self, row_numeric_index, col_numeric_index, broadcasted_items): def iloc_mut(partition, row_internal_indices, col_internal_indices, item): partition = partition.copy() diff --git a/modin/data_management/utils.py b/modin/data_management/utils.py index 45533b3ac84..3092560c0fc 100644 --- a/modin/data_management/utils.py +++ b/modin/data_management/utils.py @@ -100,7 +100,7 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None) return [result.iloc[:, sums[i] : sums[i + 1]] for i in range(len(sums) - 1)] # We do this to restore block partitioning chunksize = compute_chunksize(result, num_splits, axis=axis) - if axis == 0 or type(result) is pandas.Series: + if axis == 0: return [ result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits) ] @@ -112,13 +112,10 @@ def split_result_of_axis_func_pandas(axis, num_splits, result, length_list=None) def length_fn_pandas(df): - assert isinstance(df, (pandas.DataFrame, pandas.Series)), "{}".format(df) + assert isinstance(df, pandas.DataFrame) return len(df) def width_fn_pandas(df): - assert isinstance(df, (pandas.DataFrame, pandas.Series)), "{}".format((df)) - if isinstance(df, pandas.DataFrame): - return len(df.columns) - else: - return 1 + assert isinstance(df, pandas.DataFrame) + return len(df.columns) diff --git a/modin/engines/ray/generic/frame/partition_manager.py b/modin/engines/ray/generic/frame/partition_manager.py index e2b435dc741..ecd626f97d9 100644 --- a/modin/engines/ray/generic/frame/partition_manager.py +++ b/modin/engines/ray/generic/frame/partition_manager.py @@ -1,7 +1,9 @@ import ray import numpy as 
np +from ray.worker import RayTaskError from modin.engines.base.frame.partition_manager import BaseFrameManager +from modin.engines.ray.utils import handle_ray_task_error class RayFrameManager(BaseFrameManager): @@ -26,14 +28,17 @@ def block_lengths(self): having to recompute these values each time they are needed. """ if self._lengths_cache is None: - # The first column will have the correct lengths. We have an - # invariant that requires that all blocks be the same length in a - # row of blocks. - self._lengths_cache = np.array( - ray.get([obj.length().oid for obj in self._partitions_cache.T[0]]) - if len(self._partitions_cache.T) > 0 - else [] - ) + try: + # The first column will have the correct lengths. We have an + # invariant that requires that all blocks be the same length in a + # row of blocks. + self._lengths_cache = np.array( + ray.get([obj.length().oid for obj in self._partitions_cache.T[0]]) + if len(self._partitions_cache.T) > 0 + else [] + ) + except RayTaskError as e: + handle_ray_task_error(e) return self._lengths_cache @property @@ -44,12 +49,15 @@ def block_widths(self): having to recompute these values each time they are needed. """ if self._widths_cache is None: - # The first column will have the correct lengths. We have an - # invariant that requires that all blocks be the same width in a - # column of blocks. - self._widths_cache = np.array( - ray.get([obj.width().oid for obj in self._partitions_cache[0]]) - if len(self._partitions_cache) > 0 - else [] - ) + try: + # The first column will have the correct lengths. We have an + # invariant that requires that all blocks be the same width in a + # column of blocks. 
+ self._widths_cache = np.array( + ray.get([obj.width().oid for obj in self._partitions_cache[0]]) + if len(self._partitions_cache) > 0 + else [] + ) + except RayTaskError as e: + handle_ray_task_error(e) return self._widths_cache diff --git a/modin/engines/ray/pandas_on_ray/frame/partition.py b/modin/engines/ray/pandas_on_ray/frame/partition.py index b43e0e5b451..ce2de00f8a6 100644 --- a/modin/engines/ray/pandas_on_ray/frame/partition.py +++ b/modin/engines/ray/pandas_on_ray/frame/partition.py @@ -4,9 +4,11 @@ import pandas import ray +from ray.worker import RayTaskError from modin.engines.base.frame.partition import BaseFramePartition from modin.data_management.utils import length_fn_pandas, width_fn_pandas +from modin.engines.ray.utils import handle_ray_task_error class PandasOnRayFramePartition(BaseFramePartition): @@ -24,8 +26,10 @@ def get(self): """ if len(self.call_queue): return self.apply(lambda x: x).get() - - return ray.get(self.oid) + try: + return ray.get(self.oid) + except RayTaskError as e: + handle_ray_task_error(e) def apply(self, func, **kwargs): """Apply a function to the object stored in this partition. 
@@ -77,7 +81,6 @@ def to_pandas(self): """ dataframe = self.get() assert type(dataframe) is pandas.DataFrame or type(dataframe) is pandas.Series - return dataframe @classmethod diff --git a/modin/engines/ray/utils.py b/modin/engines/ray/utils.py new file mode 100644 index 00000000000..9bc60811b2d --- /dev/null +++ b/modin/engines/ray/utils.py @@ -0,0 +1,16 @@ +import sys + +if sys.version_info[0] == 3: + import builtins +else: + import __builtin__ as builtins + + +def handle_ray_task_error(e): + for s in e.traceback_str.split("\n")[::-1]: + if "Error" in s or "Exception" in s: + try: + raise getattr(builtins, s.split(":")[0])("".join(s.split(":")[1:])) + except AttributeError: + break + raise e diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index ca1983777aa..d177413a1be 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -30,7 +30,6 @@ Index, MultiIndex, CategoricalIndex, - Series, bdate_range, DatetimeIndex, Timedelta, @@ -109,6 +108,7 @@ HDFStore, ) from .reshape import get_dummies, melt, crosstab, lreshape, wide_to_long +from .series import Series from .general import ( isna, isnull, diff --git a/modin/pandas/base.py b/modin/pandas/base.py new file mode 100644 index 00000000000..9f3431d75ec --- /dev/null +++ b/modin/pandas/base.py @@ -0,0 +1,3288 @@ +import numpy as np +from numpy import nan +import pandas +from pandas.api.types import is_scalar +from pandas.compat import cPickle as pkl, numpy as numpy_compat, string_types, to_str +from pandas.core.common import count_not_none, _get_rename_function, _pipe +from pandas.core.dtypes.common import ( + is_list_like, + is_dict_like, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, + is_dtype_equal, + is_object_dtype, +) +from pandas.util._validators import validate_bool_kwarg +import re +import warnings + +from modin.error_message import ErrorMessage + +# Similar to pandas, sentinel value to use as kwarg in place of None when None has +# special meaning and needs to be 
distinguished from a user explicitly passing None. +sentinel = object() + + +class BasePandasDataset(object): + """This object is the base for most of the common code that exists in + DataFrame/Series. Since both objects share the same underlying representation, + and the algorithms are the same, we use this object to define the general + behavior of those objects and then use those objects to define the output type. + """ + + def _build_repr_df(self, num_rows, num_cols): + # Add one here so that pandas automatically adds the dots + # It turns out to be faster to extract 2 extra rows and columns than to + # build the dots ourselves. + num_rows_for_head = num_rows // 2 + 1 + num_cols_for_front = num_cols // 2 + 1 + + if len(self.index) <= num_rows: + head = self._query_compiler + tail = None + else: + head = self._query_compiler.head(num_rows_for_head) + tail = self._query_compiler.tail(num_rows_for_head) + + if not hasattr(self, "columns") or len(self.columns) <= num_cols: + head_front = head.to_pandas() + # Creating these empty to make the concat logic simpler + head_back = pandas.DataFrame() + tail_back = pandas.DataFrame() + + if tail is not None: + tail_front = tail.to_pandas() + else: + tail_front = pandas.DataFrame() + else: + head_front = head.front(num_cols_for_front).to_pandas() + head_back = head.back(num_cols_for_front).to_pandas() + + if tail is not None: + tail_front = tail.front(num_cols_for_front).to_pandas() + tail_back = tail.back(num_cols_for_front).to_pandas() + else: + tail_front = tail_back = pandas.DataFrame() + + head_for_repr = pandas.concat([head_front, head_back], axis=1) + tail_for_repr = pandas.concat([tail_front, tail_back], axis=1) + + return pandas.concat([head_for_repr, tail_for_repr]) + + def _update_inplace(self, new_query_compiler): + """Updates the current DataFrame inplace. 
+ + Args: + new_query_compiler: The new QueryCompiler to use to manage the data + """ + old_query_compiler = self._query_compiler + self._query_compiler = new_query_compiler + old_query_compiler.free() + + def _validate_other( + self, + other, + axis, + numeric_only=False, + numeric_or_time_only=False, + numeric_or_object_only=False, + comparison_dtypes_only=False, + ): + """Helper method to check validity of other in inter-df operations""" + axis = self._get_axis_number(axis) if axis is not None else 1 + result = other + if isinstance(other, BasePandasDataset): + return other._query_compiler + elif is_list_like(other): + if axis == 0: + if len(other) != len(self._query_compiler.index): + raise ValueError( + "Unable to coerce to Series, length must be {0}: " + "given {1}".format(len(self._query_compiler.index), len(other)) + ) + else: + if len(other) != len(self._query_compiler.columns): + raise ValueError( + "Unable to coerce to Series, length must be {0}: " + "given {1}".format( + len(self._query_compiler.columns), len(other) + ) + ) + if hasattr(other, "dtype"): + other_dtypes = [other.dtype] * len(other) + else: + other_dtypes = [type(x) for x in other] + else: + other_dtypes = [ + type(other) + for _ in range( + len(self._query_compiler.index) + if axis + else len(self._query_compiler.columns) + ) + ] + # Do dtype checking + if numeric_only: + if not all( + is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype) + for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes) + ): + raise TypeError("Cannot do operation on non-numeric dtypes") + elif numeric_or_object_only: + if not all( + (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) + or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype)) + for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes) + ): + raise TypeError("Cannot do operation non-numeric dtypes") + elif comparison_dtypes_only: + if not all( + (is_numeric_dtype(self_dtype) and 
is_numeric_dtype(other_dtype)) + or ( + is_datetime_or_timedelta_dtype(self_dtype) + and is_datetime_or_timedelta_dtype(other_dtype) + ) + or is_dtype_equal(self_dtype, other_dtype) + for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes) + ): + raise TypeError( + "Cannot do operation non-numeric objects with numeric objects" + ) + elif numeric_or_time_only: + if not all( + (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) + or ( + is_datetime_or_timedelta_dtype(self_dtype) + and is_datetime_or_timedelta_dtype(other_dtype) + ) + for self_dtype, other_dtype in zip(self._get_dtypes(), other_dtypes) + ): + raise TypeError( + "Cannot do operation non-numeric objects with numeric objects" + ) + return result + + def _binary_op(self, op, other, axis=None, **kwargs): + axis = self._get_axis_number(axis) if axis is not None else 1 + if kwargs.get("level", None) is not None: + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + getattr(getattr(pandas, self.__name__), op), other, axis=axis, **kwargs + ) + other = self._validate_other(other, axis, numeric_or_object_only=True) + new_query_compiler = self._query_compiler.binary_op( + op, other=other, axis=axis, **kwargs + ) + return self._create_or_update_from_compiler(new_query_compiler) + + def _default_to_pandas(self, op, *args, **kwargs): + """Helper method to use default pandas function""" + empty_self_str = "" if not self.empty else " for empty DataFrame" + ErrorMessage.default_to_pandas( + "`{}.{}`{}".format( + self.__name__, + op if isinstance(op, str) else op.__name__, + empty_self_str, + ) + ) + if callable(op): + result = op(self._to_pandas(), *args, **kwargs) + elif isinstance(op, str): + # The inner `getattr` is ensuring that we are treating this object (whether + # it is a DataFrame, Series, etc.) as a pandas object. 
The outer `getattr` + # will get the operation (`op`) from the pandas version of the class and run + # it on the object after we have converted it to pandas. + result = getattr(getattr(pandas, self.__name__), op)( + self._to_pandas(), *args, **kwargs + ) + # SparseDataFrames cannot be serialized by arrow and cause problems for Modin. + # For now we will use pandas. + if isinstance(result, type(self)) and not isinstance( + result, (pandas.SparseDataFrame, pandas.SparseSeries) + ): + return self._create_or_update_from_compiler( + result, inplace=kwargs.get("inplace", False) + ) + elif isinstance(result, pandas.DataFrame): + from .dataframe import DataFrame + + return DataFrame(result) + elif isinstance(result, pandas.Series): + from .series import Series + + return Series(result) + else: + try: + if ( + isinstance(result, (list, tuple)) + and len(result) == 2 + and isinstance(result[0], pandas.DataFrame) + ): + # Some operations split the DataFrame into two (e.g. align). We need to wrap + # both of the returned results + if isinstance(result[1], pandas.DataFrame): + second = self.__constructor__(result[1]) + else: + second = result[1] + return self.__constructor__(result[0]), second + else: + return result + except TypeError: + return result + + def _get_axis_number(self, axis): + return ( + getattr(pandas, self.__name__)()._get_axis_number(axis) + if axis is not None + else 0 + ) + + def __constructor__(self, *args, **kwargs): + return type(self)(*args, **kwargs) + + def abs(self): + """Apply an absolute value function to all numeric columns. + + Returns: + A new DataFrame with the applied absolute value. + """ + self._validate_dtypes(numeric_only=True) + return self.__constructor__(query_compiler=self._query_compiler.abs()) + + def _set_index(self, new_index): + """Set the index for this DataFrame. + + Args: + new_index: The new index to set for this DataFrame. + """ + self._query_compiler.index = new_index + + def _get_index(self): + """Get the index for this DataFrame. 
+ + Returns: + The union of all indexes across the partitions. + """ + return self._query_compiler.index + + index = property(_get_index, _set_index) + + def add(self, other, axis="columns", level=None, fill_value=None): + """Add this DataFrame to another or a scalar/list. + + Args: + other: What to add to this DataFrame. + axis: The axis to apply addition over. Only applicable to Series + or list 'other'. + level: A level in the multilevel axis to add over. + fill_value: The value to fill NaN. + + Returns: + A new DataFrame with the applied addition. + """ + return self._binary_op( + "add", other, axis=axis, level=level, fill_value=fill_value + ) + + def agg(self, func, axis=0, *args, **kwargs): + return self.aggregate(func, axis=axis, *args, **kwargs) + + def aggregate(self, func, axis=0, *args, **kwargs): + axis = self._get_axis_number(axis) + result = None + + if axis == 0: + try: + result = self._aggregate(func, _axis=axis, *args, **kwargs) + except TypeError: + pass + + if result is None: + kwargs.pop("is_transform", None) + return self.apply(func, axis=axis, args=args, **kwargs) + + return result + + def _aggregate(self, arg, *args, **kwargs): + _axis = kwargs.pop("_axis", 0) + kwargs.pop("_level", None) + + if isinstance(arg, string_types): + kwargs.pop("is_transform", None) + return self._string_function(arg, *args, **kwargs) + + # Dictionaries have complex behavior because they can be renamed here. 
+ elif isinstance(arg, dict): + return self._default_to_pandas("agg", arg, *args, **kwargs) + elif is_list_like(arg) or callable(arg): + kwargs.pop("is_transform", None) + return self.apply(arg, axis=_axis, args=args, **kwargs) + else: + raise TypeError("type {} is not callable".format(type(arg))) + + def _string_function(self, func, *args, **kwargs): + assert isinstance(func, string_types) + f = getattr(self, func, None) + if f is not None: + if callable(f): + return f(*args, **kwargs) + assert len(args) == 0 + assert ( + len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 + ) + return f + f = getattr(np, func, None) + if f is not None: + return self._default_to_pandas("agg", func, *args, **kwargs) + raise ValueError("{} is an unknown string function".format(func)) + + def _get_dtypes(self): + if hasattr(self, "dtype"): + return [self.dtype] + else: + return list(self.dtypes) + + def align( + self, + other, + join="outer", + axis=None, + level=None, + copy=True, + fill_value=None, + method=None, + limit=None, + fill_axis=0, + broadcast_axis=None, + ): + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + "align", + other, + join=join, + axis=axis, + level=level, + copy=copy, + fill_value=fill_value, + method=method, + limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis, + ) + + def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + """Return whether all elements are True over requested axis + + Note: + If axis=None or axis=0, this call applies df.all(axis=1) + to the transpose of df. 
+ """ + if axis is not None: + axis = self._get_axis_number(axis) + if bool_only and axis == 0: + if hasattr(self, "dtype"): + raise NotImplementedError( + "{}.{} does not implement numeric_only.".format( + self.__name__, "all" + ) + ) + data_for_compute = self[self.columns[self.dtypes == np.bool]] + return data_for_compute.all( + axis=axis, bool_only=False, skipna=skipna, level=level, **kwargs + ) + return self._reduce_dimension( + self._query_compiler.all( + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + ) + else: + if bool_only: + raise ValueError("Axis must be 0 or 1 (got {})".format(axis)) + # Reduce to a scalar if axis is None. + result = self._reduce_dimension( + self._query_compiler.all( + axis=0, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + ) + if isinstance(result, BasePandasDataset): + return result.all( + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + return result + + def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + """Return whether any elements are True over requested axis + + Note: + If axis=None or axis=0, this call applies on the column partitions, + otherwise operates on row partitions + """ + if axis is not None: + axis = self._get_axis_number(axis) + if bool_only and axis == 0: + if hasattr(self, "dtype"): + raise NotImplementedError( + "{}.{} does not implement numeric_only.".format( + self.__name__, "all" + ) + ) + data_for_compute = self[self.columns[self.dtypes == np.bool]] + return data_for_compute.all( + axis=axis, bool_only=None, skipna=skipna, level=level, **kwargs + ) + return self._reduce_dimension( + self._query_compiler.any( + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + ) + else: + if bool_only: + raise ValueError("Axis must be 0 or 1 (got {})".format(axis)) + # Reduce to a scalar if axis is None. 
+ result = self._reduce_dimension( + self._query_compiler.any( + axis=0, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + ) + if isinstance(result, BasePandasDataset): + return result.any( + axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + ) + return result + + def apply( + self, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + convert_dtype=True, + args=(), + **kwds + ): + """Apply a function along input axis of DataFrame. + + Args: + func: The function to apply + axis: The axis over which to apply the func. + broadcast: Whether or not to broadcast. + raw: Whether or not to convert to a Series. + reduce: Whether or not to try to apply reduction procedures. + + Returns: + Series or DataFrame, depending on func. + """ + axis = self._get_axis_number(axis) + ErrorMessage.non_verified_udf() + if isinstance(func, string_types): + if axis == 1: + kwds["axis"] = axis + result = self._string_function(func, *args, **kwds) + # Sometimes we can return a scalar here + if isinstance(result, BasePandasDataset): + return result._query_compiler + return result + elif isinstance(func, dict): + if axis == 1: + raise TypeError( + "(\"'dict' object is not callable\", " + "'occurred at index {0}'".format(self.index[0]) + ) + if len(self.columns) != len(set(self.columns)): + warnings.warn( + "duplicate column names not supported with apply().", + FutureWarning, + stacklevel=2, + ) + elif not callable(func) and not is_list_like(func): + raise TypeError("{} object is not callable".format(type(func))) + query_compiler = self._query_compiler.apply(func, axis, *args, **kwds) + return query_compiler + + def as_blocks(self, copy=True): + return self._default_to_pandas("as_blocks", copy=copy) + + def as_matrix(self, columns=None): + """Convert the frame to its Numpy-array representation. + + Args: + columns: If None, return all columns, otherwise, + returns specified columns. 
+ + Returns: + values: ndarray + """ + # TODO this is very inefficient, also see __array__ + return self._default_to_pandas("as_matrix", columns=columns) + + def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): + return self._default_to_pandas( + "asfreq", + freq, + method=method, + how=how, + normalize=normalize, + fill_value=fill_value, + ) + + def asof(self, where, subset=None): + return self._default_to_pandas("asof", where, subset=subset) + + def astype(self, dtype, copy=True, errors="raise", **kwargs): + col_dtypes = {} + if isinstance(dtype, dict): + if ( + not set(dtype.keys()).issubset(set(self._query_compiler.columns)) + and errors == "raise" + ): + raise KeyError( + "Only a column name can be used for the key in" + "a dtype mappings argument." + ) + col_dtypes = dtype + else: + for column in self._query_compiler.columns: + col_dtypes[column] = dtype + + new_query_compiler = self._query_compiler.astype(col_dtypes, **kwargs) + return self._create_or_update_from_compiler(new_query_compiler, not copy) + + @property + def at(self, axis=None): + from .indexing import _LocIndexer + + return _LocIndexer(self) + + def at_time(self, time, asof=False, axis=None): + return self._default_to_pandas("at_time", time, asof=asof, axis=axis) + + def between_time( + self, start_time, end_time, include_start=True, include_end=True, axis=None + ): + return self._default_to_pandas( + "between_time", + start_time, + end_time, + include_start=include_start, + include_end=include_end, + axis=axis, + ) + + def bfill(self, axis=None, inplace=False, limit=None, downcast=None): + """Synonym for DataFrame.fillna(method='bfill')""" + return self.fillna( + method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace + ) + + def bool(self): + """Return the bool of a single element PandasObject. + + This must be a boolean scalar value, either True or False. 
Raise a + ValueError if the PandasObject does not have exactly 1 element, or that + element is not boolean + """ + shape = self.shape + if shape != (1,) and shape != (1, 1): + raise ValueError( + """The PandasObject does not have exactly + 1 element. Return the bool of a single + element PandasObject. The truth value is + ambiguous. Use a.empty, a.item(), a.any() + or a.all().""" + ) + else: + return self._to_pandas().bool() + + def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): + # validate inputs + if axis is not None: + axis = self._get_axis_number(axis) + self._validate_dtypes(numeric_only=True) + if is_list_like(lower) or is_list_like(upper): + if axis is None: + raise ValueError("Must specify axis = 0 or 1") + self._validate_other(lower, axis) + self._validate_other(upper, axis) + inplace = validate_bool_kwarg(inplace, "inplace") + axis = numpy_compat.function.validate_clip_with_axis(axis, args, kwargs) + # any np.nan bounds are treated as None + if lower is not None and np.any(np.isnan(lower)): + lower = None + if upper is not None and np.any(np.isnan(upper)): + upper = None + new_query_compiler = self._query_compiler.clip( + lower=lower, upper=upper, axis=axis, inplace=inplace, *args, **kwargs + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + + def clip_lower(self, threshold, axis=None, inplace=False): + return self.clip(lower=threshold, axis=axis, inplace=inplace) + + def clip_upper(self, threshold, axis=None, inplace=False): + return self.clip(upper=threshold, axis=axis, inplace=inplace) + + def combine(self, other, func, fill_value=None, **kwargs): + if isinstance(other, type(self)): + other = other._to_pandas() + return self._default_to_pandas( + "combine", other, func, fill_value=fill_value, **kwargs + ) + + def combine_first(self, other): + if isinstance(other, type(self)): + other = other._to_pandas() + return self._default_to_pandas("combine_first", other=other) + + def compound(self, 
axis=None, skipna=None, level=None): + return self._default_to_pandas( + "compound", axis=axis, skipna=skipna, level=level + ) + + def copy(self, deep=True): + """Creates a shallow copy of the DataFrame. + + Returns: + A new DataFrame pointing to the same partitions as this one. + """ + return self.__constructor__(query_compiler=self._query_compiler.copy()) + + def count(self, axis=0, level=None, numeric_only=False): + """Get the count of non-null objects in the DataFrame. + + Arguments: + axis: 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + level: If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame. + numeric_only: Include only float, int, boolean data + + Returns: + The count, in a Series (or DataFrame if level is specified). + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + return self._reduce_dimension( + self._query_compiler.count( + axis=axis, level=level, numeric_only=numeric_only + ) + ) + + def cummax(self, axis=None, skipna=True, *args, **kwargs): + """Perform a cumulative maximum across the DataFrame. + + Args: + axis (int): The axis to take maximum on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The cumulative maximum of the DataFrame. + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if axis: + self._validate_dtypes() + return self.__constructor__( + query_compiler=self._query_compiler.cummax( + axis=axis, skipna=skipna, **kwargs + ) + ) + + def cummin(self, axis=None, skipna=True, *args, **kwargs): + """Perform a cumulative minimum across the DataFrame. + + Args: + axis (int): The axis to cummin on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The cumulative minimum of the DataFrame. 
+ """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if axis: + self._validate_dtypes() + return self.__constructor__( + query_compiler=self._query_compiler.cummin( + axis=axis, skipna=skipna, **kwargs + ) + ) + + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + """Perform a cumulative product across the DataFrame. + + Args: + axis (int): The axis to take product on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The cumulative product of the DataFrame. + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes(numeric_only=True) + return self.__constructor__( + query_compiler=self._query_compiler.cumprod( + axis=axis, skipna=skipna, **kwargs + ) + ) + + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + """Perform a cumulative sum across the DataFrame. + + Args: + axis (int): The axis to take sum on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The cumulative sum of the DataFrame. + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes(numeric_only=True) + return self.__constructor__( + query_compiler=self._query_compiler.cumsum( + axis=axis, skipna=skipna, **kwargs + ) + ) + + def describe(self, percentiles=None, include=None, exclude=None): + """ + Generates descriptive statistics that summarize the central tendency, + dispersion and shape of a dataset's distribution, excluding NaN values. + + Args: + percentiles (list-like of numbers, optional): + The percentiles to include in the output. 
+ include: White-list of data types to include in results + exclude: Black-list of data types to exclude in results + + Returns: Series/DataFrame of summary statistics + """ + if include is not None and (isinstance(include, np.dtype) or include != "all"): + if not is_list_like(include): + include = [include] + include = [ + np.dtype(i) + if not (isinstance(i, type) and i.__module__ == "numpy") + else i + for i in include + ] + if not any( + (isinstance(inc, np.dtype) and inc == d) + or ( + not isinstance(inc, np.dtype) + and inc.__subclasscheck__(getattr(np, d.__str__())) + ) + for d in self._get_dtypes() + for inc in include + ): + # This is the error that pandas throws. + raise ValueError("No objects to concatenate") + if exclude is not None: + if not is_list_like(exclude): + exclude = [exclude] + exclude = [np.dtype(e) for e in exclude] + if all( + (isinstance(exc, np.dtype) and exc == d) + or ( + not isinstance(exc, np.dtype) + and exc.__subclasscheck__(getattr(np, d.__str__())) + ) + for d in self._get_dtypes() + for exc in exclude + ): + # This is the error that pandas throws. 
+ raise ValueError("No objects to concatenate") + if percentiles is not None: + pandas.DataFrame()._check_percentile(percentiles) + return self.__constructor__( + query_compiler=self._query_compiler.describe( + percentiles=percentiles, include=include, exclude=exclude + ) + ) + + def diff(self, periods=1, axis=0): + """Finds the difference between elements on the axis requested + + Args: + periods: Periods to shift for forming difference + axis: Take difference over rows or columns + + Returns: + DataFrame with the diff applied + """ + axis = self._get_axis_number(axis) + return self.__constructor__( + query_compiler=self._query_compiler.diff(periods=periods, axis=axis) + ) + + def dot(self, other): + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas("dot", other) + + def drop( + self, + labels=None, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ): + """Return new object with labels in requested axis removed. + Args: + labels: Index or column labels to drop. + axis: Whether to drop labels from the index (0 / 'index') or + columns (1 / 'columns'). + index, columns: Alternative to specifying axis (labels, axis=1 is + equivalent to columns=labels). + level: For MultiIndex + inplace: If True, do operation inplace and return None. + errors: If 'ignore', suppress error and existing labels are + dropped. 
+ Returns: + dropped : type of caller + """ + # TODO implement level + if level is not None: + return self._default_to_pandas( + "drop", + labels=labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=inplace, + errors=errors, + ) + + inplace = validate_bool_kwarg(inplace, "inplace") + if labels is not None: + if index is not None or columns is not None: + raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") + axis = pandas.DataFrame()._get_axis_name(axis) + axes = {axis: labels} + elif index is not None or columns is not None: + axes, _ = pandas.DataFrame()._construct_axes_from_arguments( + (index, columns), {} + ) + else: + raise ValueError( + "Need to specify at least one of 'labels', 'index' or 'columns'" + ) + + # TODO Clean up this error checking + if "index" not in axes: + axes["index"] = None + elif axes["index"] is not None: + if not is_list_like(axes["index"]): + axes["index"] = [axes["index"]] + if errors == "raise": + non_existant = [obj for obj in axes["index"] if obj not in self.index] + if len(non_existant): + raise ValueError( + "labels {} not contained in axis".format(non_existant) + ) + else: + axes["index"] = [obj for obj in axes["index"] if obj in self.index] + # If the length is zero, we will just do nothing + if not len(axes["index"]): + axes["index"] = None + + if "columns" not in axes: + axes["columns"] = None + elif axes["columns"] is not None: + if not is_list_like(axes["columns"]): + axes["columns"] = [axes["columns"]] + if errors == "raise": + non_existant = [ + obj for obj in axes["columns"] if obj not in self.columns + ] + if len(non_existant): + raise ValueError( + "labels {} not contained in axis".format(non_existant) + ) + else: + axes["columns"] = [ + obj for obj in axes["columns"] if obj in self.columns + ] + # If the length is zero, we will just do nothing + if not len(axes["columns"]): + axes["columns"] = None + + new_query_compiler = self._query_compiler.drop( + index=axes["index"], 
columns=axes["columns"] + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + + def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): + """Create a new DataFrame from the removed NA values from this one. + + Args: + axis (int, tuple, or list): The axis to apply the drop. + how (str): How to drop the NA values. + 'all': drop the label if all values are NA. + 'any': drop the label if any values are NA. + thresh (int): The minimum number of NAs to require. + subset ([label]): Labels to consider from other axis. + inplace (bool): Change this DataFrame or return a new DataFrame. + True: Modify the data for this DataFrame, return None. + False: Create a new DataFrame and return it. + + Returns: + If inplace is set to True, returns None, otherwise returns a new + DataFrame with the dropna applied. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + if is_list_like(axis): + axis = [self._get_axis_number(ax) for ax in axis] + result = self + + for ax in axis: + result = result.dropna(axis=ax, how=how, thresh=thresh, subset=subset) + return self._create_or_update_from_compiler(result._query_compiler, inplace) + + axis = self._get_axis_number(axis) + if how is not None and how not in ["any", "all"]: + raise ValueError("invalid how option: %s" % how) + if how is None and thresh is None: + raise TypeError("must specify how or thresh") + if subset is not None: + if axis == 1: + indices = self.index.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + else: + indices = self.columns.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(list(np.compress(check, subset))) + new_query_compiler = self._query_compiler.dropna( + axis=axis, how=how, thresh=thresh, subset=subset + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + + def droplevel(self, level, axis=0): + """Return index with requested level(s) 
removed. + + Args: + level: The level to drop + + Returns: + Index or MultiIndex + """ + return self._default_to_pandas("droplevel", level, axis=axis) + + def drop_duplicates(self, keep="first", inplace=False, **kwargs): + """Return DataFrame with duplicate rows removed, optionally only considering certain columns + + Args: + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + keep : {'first', 'last', False}, default 'first' + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : boolean, default False + Whether to drop duplicates in place or to return a copy + + Returns: + deduplicated : DataFrame + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if kwargs.get("subset", None) is not None: + duplicates = self.duplicated(keep=keep, **kwargs) + else: + duplicates = self.duplicated(keep=keep, **kwargs) + indices, = duplicates.values.nonzero() + return self.drop(index=self.index[indices], inplace=inplace) + + def duplicated(self, keep="first", **kwargs): + return self._default_to_pandas("duplicated", keep=keep, **kwargs) + + def eq(self, other, axis="columns", level=None): + """Checks element-wise that this is equal to other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the eq over. + level: The Multilevel index level to apply eq over. + + Returns: + A new DataFrame filled with Booleans. 
+ """ + return self._binary_op("eq", other, axis=axis, level=level) + + def ewm( + self, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): + return self._default_to_pandas( + "ewm", + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + ) + + def expanding(self, min_periods=1, center=False, axis=0): + return self._default_to_pandas( + "expanding", min_periods=min_periods, center=center, axis=axis + ) + + def ffill(self, axis=None, inplace=False, limit=None, downcast=None): + """Synonym for fillna(method='ffill') + """ + return self.fillna( + method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace + ) + + def fillna( + self, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + **kwargs + ): + """Fill NA/NaN values using the specified method. + + Args: + value: Value to use to fill holes. This value cannot be a list. + + method: Method to use for filling holes in reindexed Series pad. + ffill: propagate last valid observation forward to next valid + backfill. + bfill: use NEXT valid observation to fill gap. + + axis: 0 or 'index', 1 or 'columns'. + + inplace: If True, fill in place. Note: this will modify any other + views on this object. + + limit: If method is specified, this is the maximum number of + consecutive NaN values to forward/backward fill. In other + words, if there is a gap with more than this number of + consecutive NaNs, it will only be partially filled. If method + is not specified, this is the maximum number of entries along + the entire axis where NaNs will be filled. Must be greater + than 0 if not None. + + downcast: A dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an + appropriate equal type. 
+ + Returns: + filled: DataFrame + """ + # TODO implement value passed as DataFrame/Series + if isinstance(value, BasePandasDataset): + new_query_compiler = self._default_to_pandas( + "fillna", + value=value._to_pandas(), + method=method, + axis=axis, + inplace=False, + limit=limit, + downcast=downcast, + **kwargs + )._query_compiler + return self._create_or_update_from_compiler(new_query_compiler, inplace) + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) if axis is not None else 0 + if isinstance(value, (list, tuple)): + raise TypeError( + '"value" parameter must be a scalar or dict, but ' + 'you passed a "{0}"'.format(type(value).__name__) + ) + if value is None and method is None: + raise ValueError("must specify a fill method or value") + if value is not None and method is not None: + raise ValueError("cannot specify both a fill method and value") + if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]: + expecting = "pad (ffill) or backfill (bfill)" + msg = "Invalid fill method. Expecting {expecting}. Got {method}".format( + expecting=expecting, method=method + ) + raise ValueError(msg) + + new_query_compiler = self._query_compiler.fillna( + value=value, + method=method, + axis=axis, + inplace=False, + limit=limit, + downcast=downcast, + **kwargs + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + + def filter(self, items=None, like=None, regex=None, axis=None): + """Subset rows or columns based on their labels + + Args: + items (list): list of labels to subset + like (string): retain labels where `arg in label == True` + regex (string): retain labels matching regex input + axis: axis to filter on + + Returns: + A new DataFrame with the filter applied. 
+ """ + nkw = count_not_none(items, like, regex) + if nkw > 1: + raise TypeError( + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" + ) + if nkw == 0: + raise TypeError("Must pass either `items`, `like`, or `regex`") + if axis is None: + axis = "columns" # This is the default info axis for dataframes + + axis = self._get_axis_number(axis) + labels = self.columns if axis else self.index + + if items is not None: + bool_arr = labels.isin(items) + elif like is not None: + + def f(x): + return like in to_str(x) + + bool_arr = labels.map(f).tolist() + else: + + def f(x): + return matcher.search(to_str(x)) is not None + + matcher = re.compile(regex) + bool_arr = labels.map(f).tolist() + if not axis: + return self[bool_arr] + return self[self.columns[bool_arr]] + + def first(self, offset): + return self._default_to_pandas("first", offset) + + def first_valid_index(self): + """Return index for first non-NA/null value. + + Returns: + scalar: type of index + """ + return self._query_compiler.first_valid_index() + + def floordiv(self, other, axis="columns", level=None, fill_value=None): + """Divides this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the divide against this. + axis: The axis to divide over. + level: The Multilevel index level to apply divide over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Divide applied. 
+ """ + return self._binary_op( + "floordiv", other, axis=axis, level=level, fill_value=fill_value + ) + + @classmethod + def from_csv( + cls, + path, + header=0, + sep=",", + index_col=0, + parse_dates=True, + encoding=None, + tupleize_cols=None, + infer_datetime_format=False, + ): + from .io import read_csv + + return read_csv( + path, + header=header, + sep=sep, + index_col=index_col, + parse_dates=parse_dates, + encoding=encoding, + tupleize_cols=tupleize_cols, + infer_datetime_format=infer_datetime_format, + ) + + def ge(self, other, axis="columns", level=None): + """Checks element-wise that this is greater than or equal to other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the gt over. + level: The Multilevel index level to apply gt over. + + Returns: + A new DataFrame filled with Booleans. + """ + return self._binary_op("ge", other, axis=axis, level=level) + + def get(self, key, default=None): + """Get item from object for given key (DataFrame column, Panel + slice, etc.). Returns default value if not found. + + Args: + key (DataFrame column, Panel slice) : the key for which value + to get + + Returns: + value (type of items contained in object) : A value that is + stored at the key + """ + if key in self.keys(): + return self.__getitem__(key) + else: + return default + + def get_dtype_counts(self): + """Get the counts of dtypes in this object. + + Returns: + The counts of dtypes in this object. + """ + if hasattr(self, "dtype"): + return pandas.Series({str(self.dtype): 1}) + result = self.dtypes.value_counts() + result.index = result.index.map(lambda x: str(x)) + return result + + def get_ftype_counts(self): + """Get the counts of ftypes in this object. + + Returns: + The counts of ftypes in this object. 
+ """ + if hasattr(self, "ftype"): + return pandas.Series({self.ftype: 1}) + return self.ftypes.value_counts().sort_index() + + def get_values(self): + return self._default_to_pandas("get_values") + + def gt(self, other, axis="columns", level=None): + """Checks element-wise that this is greater than other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the gt over. + level: The Multilevel index level to apply gt over. + + Returns: + A new DataFrame filled with Booleans. + """ + return self._binary_op("gt", other, axis=axis, level=level) + + def head(self, n=5): + """Get the first n rows of the DataFrame. + + Args: + n (int): The number of rows to return. + + Returns: + A new DataFrame with the first n rows of the DataFrame. + """ + if n >= len(self.index): + return self.copy() + return self.__constructor__(query_compiler=self._query_compiler.head(n)) + + @property + def iat(self, axis=None): + from .indexing import _iLocIndexer + + return _iLocIndexer(self) + + def idxmax(self, axis=0, skipna=True, *args, **kwargs): + """Get the index of the first occurrence of the max value of the axis. + + Args: + axis (int): Identify the max over the rows (1) or columns (0). + skipna (bool): Whether or not to skip NA values. + + Returns: + A Series with the index for each maximum value for the axis + specified. + """ + if not all(d != np.dtype("O") for d in self._get_dtypes()): + raise TypeError("reduction operation 'argmax' not allowed for this dtype") + axis = self._get_axis_number(axis) + return self._reduce_dimension( + self._query_compiler.idxmax(axis=axis, skipna=skipna) + ) + + def idxmin(self, axis=0, skipna=True, *args, **kwargs): + """Get the index of the first occurrence of the min value of the axis. + + Args: + axis (int): Identify the min over the rows (1) or columns (0). + skipna (bool): Whether or not to skip NA values. + + Returns: + A Series with the index for each minimum value for the axis + specified. 
+ """ + if not all(d != np.dtype("O") for d in self._get_dtypes()): + raise TypeError("reduction operation 'argmin' not allowed for this dtype") + axis = self._get_axis_number(axis) + return self._reduce_dimension( + self._query_compiler.idxmin(axis=axis, skipna=skipna) + ) + + def infer_objects(self): + return self._default_to_pandas("infer_objects") + + def isin(self, values): + """Fill a DataFrame with booleans for cells contained in values. + + Args: + values (iterable, DataFrame, Series, or dict): The values to find. + + Returns: + A new DataFrame with booleans representing whether or not a cell + is in values. + True: cell is contained in values. + False: otherwise + """ + return self.__constructor__( + query_compiler=self._query_compiler.isin(values=values) + ) + + def isna(self): + """Fill a DataFrame with booleans for cells containing NA. + + Returns: + A new DataFrame with booleans representing whether or not a cell + is NA. + True: cell contains NA. + False: otherwise. + """ + return self.__constructor__(query_compiler=self._query_compiler.isna()) + + isnull = isna + + @property + def ix(self, axis=None): + raise ErrorMessage.not_implemented("ix is not implemented.") + + @property + def iloc(self): + """Purely integer-location based indexing for selection by position. 
+ + We currently support: single label, list array, slice object + We do not support: boolean array, callable + """ + from .indexing import _iLocIndexer + + return _iLocIndexer(self) + + def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._default_to_pandas( + "kurt", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + + def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + return self._default_to_pandas( + "kurtosis", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + + def last(self, offset): + return self._default_to_pandas("last", offset) + + def last_valid_index(self): + """Return index for last non-NA/null value. + + Returns: + scalar: type of index + """ + return self._query_compiler.last_valid_index() + + def le(self, other, axis="columns", level=None): + """Checks element-wise that this is less than or equal to other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the le over. + level: The Multilevel index level to apply le over. + + Returns: + A new DataFrame filled with Booleans. + """ + return self._binary_op("le", other, axis=axis, level=level) + + def lt(self, other, axis="columns", level=None): + """Checks element-wise that this is less than other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the lt over. + level: The Multilevel index level to apply lt over. + + Returns: + A new DataFrame filled with Booleans. + """ + return self._binary_op("lt", other, axis=axis, level=level) + + @property + def loc(self): + """Purely label-location based indexer for selection by label. 
+ + We currently support: single label, list array, slice object + We do not support: boolean array, callable + """ + from .indexing import _LocIndexer + + return _LocIndexer(self) + + def mad(self, axis=None, skipna=None, level=None): + return self._default_to_pandas("mad", axis=axis, skipna=skipna, level=level) + + def mask( + self, + cond, + other=nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + raise_on_error=None, + ): + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + "mask", + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + raise_on_error=raise_on_error, + ) + + def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """Perform max across the DataFrame. + + Args: + axis (int): The axis to take the max on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The max of the DataFrame. + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes_min_max(axis, numeric_only) + + return self._reduce_dimension( + self._query_compiler.max( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + ) + + def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """Computes mean across the DataFrame. + + Args: + axis (int): The axis to take the mean on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The mean of the DataFrame. 
(Pandas series) + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=False) + return self._reduce_dimension( + self._query_compiler.mean( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + ) + + def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """Computes median across the DataFrame. + + Args: + axis (int): The axis to take the median on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The median of the DataFrame. (Pandas series) + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + + return self._reduce_dimension( + self._query_compiler.median( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + ) + + def memory_usage(self, index=True, deep=False): + """Returns the memory usage of each column in bytes + + Args: + index (bool): Whether to include the memory usage of the DataFrame's + index in returned Series. Defaults to True + deep (bool): If True, introspect the data deeply by interrogating + objects dtypes for system-level memory consumption. Defaults to False + + Returns: + A Series where the index are the column names and the values are + the memory usage of each of the columns in bytes. If `index=true`, + then the first value of the Series will be 'Index' with its memory usage. + """ + assert not index, "Internal Error. Index must be evaluated in child class" + return self._reduce_dimension( + self._query_compiler.memory_usage(index=index, deep=deep) + ) + + def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """Perform min across the DataFrame. + + Args: + axis (int): The axis to take the min on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The min of the DataFrame. 
+ """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes_min_max(axis, numeric_only) + + return self._reduce_dimension( + self._query_compiler.min( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + ) + + def mod(self, other, axis="columns", level=None, fill_value=None): + """Mods this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the mod against this. + axis: The axis to mod over. + level: The Multilevel index level to apply mod over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Mod applied. + """ + return self._binary_op( + "mod", other, axis=axis, level=level, fill_value=fill_value + ) + + def mode(self, axis=0, numeric_only=False, dropna=True): + """Perform mode across the DataFrame. + + Args: + axis (int): The axis to take the mode on. + numeric_only (bool): if True, only apply to numeric columns. + + Returns: + DataFrame: The mode of the DataFrame. + """ + axis = self._get_axis_number(axis) + return self.__constructor__( + query_compiler=self._query_compiler.mode( + axis=axis, numeric_only=numeric_only, dropna=dropna + ) + ) + + def mul(self, other, axis="columns", level=None, fill_value=None): + """Multiplies this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the multiply against this. + axis: The axis to multiply over. + level: The Multilevel index level to apply multiply over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Multiply applied. + """ + return self._binary_op( + "mul", other, axis=axis, level=level, fill_value=fill_value + ) + + multiply = mul + + def ne(self, other, axis="columns", level=None): + """Checks element-wise that this is not equal to other. + + Args: + other: A DataFrame or Series or scalar to compare to. + axis: The axis to perform the ne over. 
+ level: The Multilevel index level to apply ne over. + + Returns: + A new DataFrame filled with Booleans. + """ + return self._binary_op("ne", other, axis=axis, level=level) + + def notna(self): + """Perform notna across the DataFrame. + + Returns: + Boolean DataFrame where value is False if corresponding + value is NaN, True otherwise + """ + return self.__constructor__(query_compiler=self._query_compiler.notna()) + + notnull = notna + + def nunique(self, axis=0, dropna=True): + """Return Series with number of distinct + observations over requested axis. + + Args: + axis : {0 or 'index', 1 or 'columns'}, default 0 + dropna : boolean, default True + + Returns: + nunique : Series + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + return self._reduce_dimension( + self._query_compiler.nunique(axis=axis, dropna=dropna) + ) + + def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): + return self._default_to_pandas( + "pct_change", + periods=periods, + fill_method=fill_method, + limit=limit, + freq=freq, + **kwargs + ) + + def pipe(self, func, *args, **kwargs): + """Apply func(self, *args, **kwargs) + + Args: + func: function to apply to the df. + args: positional arguments passed into ``func``. + kwargs: a dictionary of keyword arguments passed into ``func``. + + Returns: + object: the return type of ``func``. + """ + return _pipe(self, func, *args, **kwargs) + + def pop(self, item): + """Pops an item from this DataFrame and returns it. + + Args: + item (str): Column label to be popped + + Returns: + A Series containing the popped values. Also modifies this + DataFrame. + """ + result = self[item] + del self[item] + return result + + def pow(self, other, axis="columns", level=None, fill_value=None): + """Pow this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the pow against this. + axis: The axis to pow over. + level: The Multilevel index level to apply pow over. 
+ fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Pow applied. + """ + return self._binary_op( + "pow", other, axis=axis, level=level, fill_value=fill_value + ) + + def prod( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + """Return the product of the values for the requested axis + + Args: + axis : {index (0), columns (1)} + skipna : boolean, default True + level : int or level name, default None + numeric_only : boolean, default None + min_count : int, default 0 + + Returns: + prod : Series or DataFrame (if level specified) + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) + return self._reduce_dimension( + self._query_compiler.prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs + ) + ) + + product = prod + radd = add + + def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): + """Return values at the given quantile over requested axis, + a la numpy.percentile. + + Args: + q (float): 0 <= q <= 1, the quantile(s) to compute + axis (int): 0 or 'index' for row-wise, + 1 or 'columns' for column-wise + interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + Specifies which interpolation method to use + + Returns: + quantiles : Series or DataFrame + If q is an array, a DataFrame will be returned where the + index is q, the columns are the columns of self, and the + values are the quantiles. + + If q is a float, a Series will be returned where the + index is the columns of self and the values + are the quantiles. 
+ """ + axis = self._get_axis_number(axis) if axis is not None else 0 + + def check_dtype(t): + return is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t) + + if not numeric_only: + # If not numeric_only and columns, then check all columns are either + # numeric, timestamp, or timedelta + if not axis and not all(check_dtype(t) for t in self._get_dtypes()): + raise TypeError("can't multiply sequence by non-int of type 'float'") + # If over rows, then make sure that all dtypes are equal for not + # numeric_only + elif axis: + for i in range(1, len(self._get_dtypes())): + pre_dtype = self._get_dtypes()[i - 1] + curr_dtype = self._get_dtypes()[i] + if not is_dtype_equal(pre_dtype, curr_dtype): + raise TypeError( + "Cannot compare type '{0}' with type '{1}'".format( + pre_dtype, curr_dtype + ) + ) + else: + # Normally pandas returns this near the end of the quantile, but we + # can't afford the overhead of running the entire operation before + # we error. + if not any(is_numeric_dtype(t) for t in self._get_dtypes()): + raise ValueError("need at least one array to concatenate") + + # check that all qs are between 0 and 1 + pandas.DataFrame()._check_percentile(q) + axis = self._get_axis_number(axis) + + if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list)): + return self.__constructor__( + query_compiler=self._query_compiler.quantile_for_list_of_values( + q=q, + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation, + ) + ) + else: + return self._reduce_dimension( + self._query_compiler.quantile_for_single_value( + q=q, + axis=axis, + numeric_only=numeric_only, + interpolation=interpolation, + ) + ) + + def rank( + self, + axis=0, + method="average", + numeric_only=None, + na_option="keep", + ascending=True, + pct=False, + ): + """ + Compute numerical data ranks (1 through n) along axis. + Equal values are assigned a rank that is the [method] of + the ranks of those values. 
+
+        Args:
+            axis (int): 0 or 'index' for row-wise,
+                1 or 'columns' for column-wise
+            method: {'average', 'min', 'max', 'first', 'dense'}
+                Specifies which method to use for equal vals
+            numeric_only (boolean)
+                Include only float, int, boolean data.
+            na_option: {'keep', 'top', 'bottom'}
+                Specifies how to handle NA options
+            ascending (boolean):
+                Decides ranking order
+            pct (boolean):
+                Computes percentage ranking of data
+        Returns:
+            A new DataFrame
+        """
+        axis = self._get_axis_number(axis)
+        return self.__constructor__(
+            query_compiler=self._query_compiler.rank(
+                axis=axis,
+                method=method,
+                numeric_only=numeric_only,
+                na_option=na_option,
+                ascending=ascending,
+                pct=pct,
+            )
+        )
+
+    def reindex(
+        self,
+        labels=None,
+        index=None,
+        columns=None,
+        axis=None,
+        method=None,
+        copy=True,
+        level=None,
+        fill_value=np.nan,
+        limit=None,
+        tolerance=None,
+    ):
+        axis = self._get_axis_number(axis) if axis is not None else 0
+        if (
+            level is not None
+            or (
+                (columns is not None or axis == 1)
+                and isinstance(self.columns, pandas.MultiIndex)
+            )
+            or (
+                (index is not None or axis == 0)
+                and isinstance(self.index, pandas.MultiIndex)
+            )
+        ):
+            return self._default_to_pandas(
+                "reindex",
+                labels=labels,
+                index=index,
+                columns=columns,
+                axis=axis,
+                method=method,
+                copy=copy,
+                level=level,
+                fill_value=fill_value,
+                limit=limit,
+                tolerance=tolerance,
+            )
+        if axis == 0 and labels is not None:
+            index = labels
+        elif labels is not None:
+            columns = labels
+        if index is not None:
+            new_query_compiler = self._query_compiler.reindex(
+                0,
+                index,
+                method=method,
+                fill_value=fill_value,
+                limit=limit,
+                tolerance=tolerance,
+            )
+        else:
+            new_query_compiler = self._query_compiler
+        if columns is not None:
+            final_query_compiler = new_query_compiler.reindex(
+                1,
+                columns,
+                method=method,
+                fill_value=fill_value,
+                limit=limit,
+                tolerance=tolerance,
+            )
+        else:
+            final_query_compiler = new_query_compiler
+        return
self._create_or_update_from_compiler(final_query_compiler, not copy) + + def reindex_axis( + self, + labels, + axis=0, + method=None, + level=None, + copy=True, + limit=None, + fill_value=nan, + ): + return self._default_to_pandas( + "reindex_axis", + labels, + axis=axis, + method=method, + level=level, + copy=copy, + limit=limit, + fill_value=fill_value, + ) + + def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + "reindex_like", + other, + method=method, + copy=copy, + limit=limit, + tolerance=tolerance, + ) + + def rename_axis( + self, mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False + ): + kwargs = { + "index": index, + "columns": columns, + "axis": axis, + "copy": copy, + "inplace": inplace, + } + axes, kwargs = getattr(pandas, self.__name__)()._construct_axes_from_arguments( + (), kwargs, sentinel=sentinel + ) + if axis is not None: + axis = self._get_axis_number(axis) + else: + axis = 0 + inplace = validate_bool_kwarg(inplace, "inplace") + + if mapper is not None: + # Use v0.23 behavior if a scalar or list + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) + if non_mapper: + return self._set_axis_name(mapper, axis=axis, inplace=inplace) + else: + # Deprecated (v0.21) behavior is if mapper is specified, + # and not a list or scalar, then call rename + msg = ( + "Using 'rename_axis' to alter labels is deprecated. " + "Use '.rename' instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=3) + axis = pandas.DataFrame()._get_axis_name(axis) + d = {"copy": copy, "inplace": inplace, axis: mapper} + return self.rename(**d) + else: + # Use new behavior. 
Means that index and/or columns is specified + result = self if inplace else self.copy(deep=copy) + + for axis in axes: + if axes[axis] is None: + continue + v = axes[axis] + axis = self._get_axis_number(axis) + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) + if non_mapper: + newnames = v + else: + f = _get_rename_function(v) + curnames = self.index.names if axis == 0 else self.columns.names + newnames = [f(name) for name in curnames] + result._set_axis_name(newnames, axis=axis, inplace=True) + if not inplace: + return result + + def replace( + self, + to_replace=None, + value=None, + inplace=False, + limit=None, + regex=False, + method="pad", + ): + return self._default_to_pandas( + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + + def resample( + self, + rule, + how=None, + axis=0, + fill_method=None, + closed=None, + label=None, + convention="start", + kind=None, + loffset=None, + limit=None, + base=0, + on=None, + level=None, + ): + return self._default_to_pandas( + "resample", + rule, + how=how, + axis=axis, + fill_method=fill_method, + closed=closed, + label=label, + convention=convention, + kind=kind, + loffset=loffset, + limit=limit, + base=base, + on=on, + level=level, + ) + + def reset_index( + self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + ): + """Reset this index to default and create column from current index. + + Args: + level: Only remove the given levels from the index. Removes all + levels by default + drop: Do not try to insert index into DataFrame columns. This + resets the index to the default integer index. + inplace: Modify the DataFrame in place (do not create a new object) + col_level : If the columns have multiple levels, determines which + level the labels are inserted into. By default it is inserted + into the first level. + col_fill: If the columns have multiple levels, determines how the + other levels are named. 
If None then the index name is + repeated. + + Returns: + A new DataFrame if inplace is False, None otherwise. + """ + inplace = validate_bool_kwarg(inplace, "inplace") + # TODO Implement level + if level is not None: + new_query_compiler = self._default_to_pandas( + "reset_index", + level=level, + drop=drop, + inplace=inplace, + col_level=col_level, + col_fill=col_fill, + ) + # Error checking for matching Pandas. Pandas does not allow you to + # insert a dropped index into a DataFrame if these columns already + # exist. + elif ( + not drop + and not isinstance(self.index, pandas.MultiIndex) + and all(n in self.columns for n in ["level_0", "index"]) + ): + raise ValueError("cannot insert level_0, already exists") + else: + new_query_compiler = self._query_compiler.reset_index( + drop=drop, level=level + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + + def rfloordiv(self, other, axis="columns", level=None, fill_value=None): + return self._binary_op( + "rfloordiv", other, axis=axis, level=level, fill_value=fill_value + ) + + def rmod(self, other, axis="columns", level=None, fill_value=None): + """Mod this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the div against this. + axis: The axis to div over. + level: The Multilevel index level to apply div over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the rdiv applied. + """ + return self._binary_op( + "rmod", other, axis=axis, level=level, fill_value=fill_value + ) + + rmul = mul + + def rolling( + self, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): + return self._default_to_pandas( + "rolling", + window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + def round(self, decimals=0, *args, **kwargs): + """Round each element in the DataFrame. 
+ + Args: + decimals: The number of decimals to round to. + + Returns: + A new DataFrame. + """ + return self.__constructor__( + query_compiler=self._query_compiler.round(decimals=decimals, **kwargs) + ) + + def rpow(self, other, axis="columns", level=None, fill_value=None): + """Pow this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the pow against this. + axis: The axis to pow over. + level: The Multilevel index level to apply pow over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Pow applied. + """ + return self._binary_op( + "rpow", other, axis=axis, level=level, fill_value=fill_value + ) + + def rsub(self, other, axis="columns", level=None, fill_value=None): + """Subtract a DataFrame/Series/scalar from this DataFrame. + + Args: + other: The object to use to apply the subtraction to this. + axis: The axis to apply the subtraction over. + level: Mutlilevel index level to subtract over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the subtraciont applied. + """ + return self._binary_op( + "rsub", other, axis=axis, level=level, fill_value=fill_value + ) + + def rtruediv(self, other, axis="columns", level=None, fill_value=None): + """Div this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the div against this. + axis: The axis to div over. + level: The Multilevel index level to apply div over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the rdiv applied. + """ + return self._binary_op( + "rtruediv", other, axis=axis, level=level, fill_value=fill_value + ) + + rdiv = rtruediv + + def sample( + self, + n=None, + frac=None, + replace=False, + weights=None, + random_state=None, + axis=None, + ): + """Returns a random sample of items from an axis of object. + + Args: + n: Number of items from axis to return. Cannot be used with frac. + Default = 1 if frac = None. 
+ frac: Fraction of axis items to return. Cannot be used with n. + replace: Sample with or without replacement. Default = False. + weights: Default 'None' results in equal probability weighting. + If passed a Series, will align with target object on index. + Index values in weights not found in sampled object will be + ignored and index values in sampled object not in weights will + be assigned weights of zero. If called on a DataFrame, will + accept the name of a column when axis = 0. Unless weights are + a Series, weights must be same length as axis being sampled. + If weights do not sum to 1, they will be normalized to sum + to 1. Missing values in the weights column will be treated as + zero. inf and -inf values not allowed. + random_state: Seed for the random number generator (if int), or + numpy RandomState object. + axis: Axis to sample. Accepts axis number or name. + + Returns: + A new Dataframe + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if axis: + axis_labels = self.columns + axis_length = len(axis_labels) + else: + # Getting rows requires indices instead of labels. RangeIndex provides this. 
+ axis_labels = pandas.RangeIndex(len(self.index)) + axis_length = len(axis_labels) + if weights is not None: + # Index of the weights Series should correspond to the index of the + # Dataframe in order to sample + if isinstance(weights, BasePandasDataset): + weights = weights.reindex(self.axes[axis]) + # If weights arg is a string, the weights used for sampling will + # the be values in the column corresponding to that string + if isinstance(weights, string_types): + if axis == 0: + try: + weights = self[weights] + except KeyError: + raise KeyError("String passed to weights not a valid column") + else: + raise ValueError( + "Strings can only be passed to " + "weights when sampling from rows on " + "a DataFrame" + ) + weights = pandas.Series(weights, dtype="float64") + + if len(weights) != axis_length: + raise ValueError( + "Weights and axis to be sampled must be of same length" + ) + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + if (weights < 0).any(): + raise ValueError("weight vector many not include negative values") + # weights cannot be NaN when sampling, so we must set all nan + # values to 0 + weights = weights.fillna(0) + # If passed in weights are not equal to 1, renormalize them + # otherwise numpy sampling function will error + weights_sum = weights.sum() + if weights_sum != 1: + if weights_sum != 0: + weights = weights / weights_sum + else: + raise ValueError("Invalid weights: weights sum to zero") + weights = weights.values + + if n is None and frac is None: + # default to n = 1 if n and frac are both None (in accordance with + # Pandas specification) + n = 1 + elif n is not None and frac is None and n % 1 != 0: + # n must be an integer + raise ValueError("Only integers accepted as `n` values") + elif n is None and frac is not None: + # compute the number of samples based on frac + n = int(round(frac * axis_length)) + elif n is not None and frac is not None: + # Pandas 
specification does not allow both n and frac to be passed + # in + raise ValueError("Please enter a value for `frac` OR `n`, not both") + if n < 0: + raise ValueError( + "A negative number of rows requested. Please provide positive value." + ) + if n == 0: + # This returns an empty object, and since it is a weird edge case that + # doesn't need to be distributed, we default to pandas for n=0. + return self._default_to_pandas( + "sample", + n=n, + frac=frac, + replace=replace, + weights=weights, + random_state=random_state, + axis=axis, + ) + if random_state is not None: + # Get a random number generator depending on the type of + # random_state that is passed in + if isinstance(random_state, int): + random_num_gen = np.random.RandomState(random_state) + elif isinstance(random_state, np.random.RandomState): + random_num_gen = random_state + else: + # random_state must be an int or a numpy RandomState object + raise ValueError( + "Please enter an `int` OR a " + "np.random.RandomState for random_state" + ) + # choose random numbers and then get corresponding labels from + # chosen axis + sample_indices = random_num_gen.choice( + np.arange(0, axis_length), size=n, replace=replace, p=weights + ) + samples = axis_labels[sample_indices] + else: + # randomly select labels from chosen axis + samples = np.random.choice( + a=axis_labels, size=n, replace=replace, p=weights + ) + if axis: + query_compiler = self._query_compiler.getitem_column_array(samples) + return self.__constructor__(query_compiler=query_compiler) + else: + query_compiler = self._query_compiler.getitem_row_array(samples) + return self.__constructor__(query_compiler=query_compiler) + + def sem( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + return self._default_to_pandas( + "sem", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs + ) + + def select(self, crit, axis=0): + return self._default_to_pandas("select", crit, 
axis=axis) + + def set_axis(self, labels, axis=0, inplace=None): + """Assign desired index to given axis. + + Args: + labels (pandas.Index or list-like): The Index to assign. + axis (string or int): The axis to reassign. + inplace (bool): Whether to make these modifications inplace. + + Returns: + If inplace is False, returns a new DataFrame, otherwise None. + """ + if is_scalar(labels): + warnings.warn( + 'set_axis now takes "labels" as first argument, and ' + '"axis" as named parameter. The old form, with "axis" as ' + 'first parameter and "labels" as second, is still supported ' + "but will be deprecated in a future version of pandas.", + FutureWarning, + stacklevel=2, + ) + labels, axis = axis, labels + if inplace is None: + warnings.warn( + "set_axis currently defaults to operating inplace.\nThis " + "will change in a future version of pandas, use " + "inplace=True to avoid this warning.", + FutureWarning, + stacklevel=2, + ) + inplace = True + if inplace: + setattr(self, pandas.DataFrame()._get_axis_name(axis), labels) + else: + obj = self.copy() + obj.set_axis(labels, axis=axis, inplace=True) + return obj + + def set_value(self, index, col, value, takeable=False): + return self._default_to_pandas( + "set_value", index, col, value, takeable=takeable + ) + + def shift(self, periods=1, freq=None, axis=0, fill_value=None): + return self._default_to_pandas( + "shift", periods=periods, freq=freq, axis=axis, fill_value=fill_value + ) + + def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): + """Return unbiased skew over requested axis Normalized by N-1 + + Args: + axis : {index (0), columns (1)} + skipna : boolean, default True + Exclude NA/null values when computing the result. 
+ level : int or level name, default None + numeric_only : boolean, default None + + Returns: + skew : Series or DataFrame (if level specified) + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + + return self._reduce_dimension( + self._query_compiler.skew( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) + ) + + def slice_shift(self, periods=1, axis=0): + return self._default_to_pandas("slice_shift", periods=periods, axis=axis) + + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + sort_remaining=True, + by=None, + ): + """Sort a DataFrame by one of the indices (columns or index). + + Args: + axis: The axis to sort over. + level: The MultiIndex level to sort over. + ascending: Ascending or descending + inplace: Whether or not to update this DataFrame inplace. + kind: How to perform the sort. + na_position: Where to position NA on the sort. + sort_remaining: On Multilevel Index sort based on all levels. + by: (Deprecated) argument to pass to sort_values. 
+ + Returns: + A sorted DataFrame + """ + axis = self._get_axis_number(axis) + if level is not None: + new_query_compiler = self._default_to_pandas( + "sort_index", + axis=axis, + level=level, + ascending=ascending, + inplace=False, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) + if by is not None: + warnings.warn( + "by argument to sort_index is deprecated, " + "please use .sort_values(by=...)", + FutureWarning, + stacklevel=2, + ) + if level is not None: + raise ValueError("unable to simultaneously sort by and level") + return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) + new_query_compiler = self._query_compiler.sort_index( + axis=axis, ascending=ascending, kind=kind, na_position=na_position + ) + if inplace: + self._update_inplace(new_query_compiler=new_query_compiler) + else: + return self.__constructor__(query_compiler=new_query_compiler) + + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ): + """Sorts by a column/row or list of columns/rows. + + Args: + by: A list of labels for the axis to sort over. + axis: The axis to sort. + ascending: Sort in ascending or descending order. + inplace: If true, do the operation inplace. + kind: How to sort. + na_position: Where to put np.nan values. + + Returns: + A sorted DataFrame. + """ + axis = self._get_axis_number(axis) + if not is_list_like(by): + by = [by] + # Currently, sort_values will just reindex based on the sorted values. 
+ # TODO create a more efficient way to sort + if axis == 0: + broadcast_value_dict = {col: self[col] for col in by} + broadcast_values = pandas.DataFrame(broadcast_value_dict, index=self.index) + new_index = broadcast_values.sort_values( + by=by, + axis=axis, + ascending=ascending, + kind=kind, + na_position=na_position, + ).index + return self.reindex(index=new_index, copy=not inplace) + else: + broadcast_value_list = [ + self[row :: len(self.index)]._to_pandas() for row in by + ] + index_builder = list(zip(broadcast_value_list, by)) + broadcast_values = pandas.concat( + [row for row, idx in index_builder], copy=False + ) + broadcast_values.columns = self.columns + new_columns = broadcast_values.sort_values( + by=by, + axis=axis, + ascending=ascending, + kind=kind, + na_position=na_position, + ).columns + return self.reindex(columns=new_columns, copy=not inplace) + + def std( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """Computes standard deviation across the DataFrame. + + Args: + axis (int): The axis to take the std on. + skipna (bool): True to skip NA values, false otherwise. + ddof (int): degrees of freedom + + Returns: + The std of the DataFrame (Pandas Series) + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + + return self._reduce_dimension( + self._query_compiler.std( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs + ) + ) + + def sub(self, other, axis="columns", level=None, fill_value=None): + """Subtract a DataFrame/Series/scalar from this DataFrame. + + Args: + other: The object to use to apply the subtraction to this. + axis: The axis to apply the subtraction over. + level: Mutlilevel index level to subtract over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the subtraciont applied. 
+ """ + return self._binary_op( + "sub", other, axis=axis, level=level, fill_value=fill_value + ) + + subtract = sub + + def sum( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + """Perform a sum across the DataFrame. + + Args: + axis (int): The axis to sum on. + skipna (bool): True to skip NA values, false otherwise. + + Returns: + The sum of the DataFrame. + """ + axis = self._get_axis_number(axis) if axis is not None else 0 + self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=False) + return self._reduce_dimension( + self._query_compiler.sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs + ) + ) + + def swapaxes(self, axis1, axis2, copy=True): + return self._default_to_pandas("swapaxes", axis1, axis2, copy=copy) + + def swaplevel(self, i=-2, j=-1, axis=0): + return self._default_to_pandas("swaplevel", i=i, j=j, axis=axis) + + def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): + return self._default_to_pandas( + "take", indices, axis=axis, convert=convert, is_copy=is_copy, **kwargs + ) + + def tail(self, n=5): + """Get the last n rows of the DataFrame. + + Args: + n (int): The number of rows to return. + + Returns: + A new DataFrame with the last n rows of this DataFrame. 
+ """ + if n >= len(self.index): + return self.copy() + return self.__constructor__(query_compiler=self._query_compiler.tail(n)) + + def to_clipboard(self, excel=True, sep=None, **kwargs): # pragma: no cover + return self._default_to_pandas("to_clipboard", excel=excel, sep=sep, **kwargs) + + def to_csv( + self, + path_or_buf=None, + sep=",", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + mode="w", + encoding=None, + compression="infer", + quoting=None, + quotechar='"', + line_terminator=None, + chunksize=None, + tupleize_cols=None, + date_format=None, + doublequote=True, + escapechar=None, + decimal=".", + *args, + **kwargs + ): # pragma: no cover + + kwargs = { + "path_or_buf": path_or_buf, + "sep": sep, + "na_rep": na_rep, + "float_format": float_format, + "columns": columns, + "header": header, + "index": index, + "index_label": index_label, + "mode": mode, + "encoding": encoding, + "compression": compression, + "quoting": quoting, + "quotechar": quotechar, + "line_terminator": line_terminator, + "chunksize": chunksize, + "tupleize_cols": tupleize_cols, + "date_format": date_format, + "doublequote": doublequote, + "escapechar": escapechar, + "decimal": decimal, + } + return self._default_to_pandas("to_csv", **kwargs) + + def to_dense(self): # pragma: no cover + return self._default_to_pandas("to_dense") + + def to_dict(self, orient="dict", into=dict): # pragma: no cover + return self._default_to_pandas("to_dict", orient=orient, into=into) + + def to_excel( + self, + excel_writer, + sheet_name="Sheet1", + na_rep="", + float_format=None, + columns=None, + header=True, + index=True, + index_label=None, + startrow=0, + startcol=0, + engine=None, + merge_cells=True, + encoding=None, + inf_rep="inf", + verbose=True, + freeze_panes=None, + ): # pragma: no cover + return self._default_to_pandas( + "to_excel", + excel_writer, + sheet_name, + na_rep, + float_format, + columns, + header, + index, + index_label, + 
startrow, + startcol, + engine, + merge_cells, + encoding, + inf_rep, + verbose, + freeze_panes, + ) + + def to_hdf(self, path_or_buf, key, format="table", **kwargs): # pragma: no cover + return self._default_to_pandas( + "to_hdf", path_or_buf, key, format=format, **kwargs + ) + + def to_json( + self, + path_or_buf=None, + orient=None, + date_format=None, + double_precision=10, + force_ascii=True, + date_unit="ms", + default_handler=None, + lines=False, + compression="infer", + index=True, + ): # pragma: no cover + return self._default_to_pandas( + "to_json", + path_or_buf, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, + ) + + def to_latex( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + bold_rows=False, + column_format=None, + longtable=None, + escape=None, + encoding=None, + decimal=".", + multicolumn=None, + multicolumn_format=None, + multirow=None, + ): # pragma: no cover + return self._default_to_pandas( + "to_latex", + buf=buf, + columns=columns, + col_space=col_space, + header=header, + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + bold_rows=bold_rows, + column_format=column_format, + longtable=longtable, + escape=escape, + encoding=encoding, + decimal=decimal, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + ) + + def to_msgpack( + self, path_or_buf=None, encoding="utf-8", **kwargs + ): # pragma: no cover + return self._default_to_pandas( + "to_msgpack", path_or_buf=path_or_buf, encoding=encoding, **kwargs + ) + + def to_numpy(self, dtype=None, copy=False): + """Convert the DataFrame to a NumPy array. 
+ + Args: + dtype: The dtype to pass to numpy.asarray() + copy: Whether to ensure that the returned value is a not a view on another + array. + + Returns: + A numpy array. + """ + return self._default_to_pandas("to_numpy", dtype=dtype, copy=copy) + + # TODO(williamma12): When this gets implemented, have the series one call this. + def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover + return self._default_to_pandas("to_period", freq=freq, axis=axis, copy=copy) + + def to_pickle( + self, path, compression="infer", protocol=pkl.HIGHEST_PROTOCOL + ): # pragma: no cover + return self._default_to_pandas( + "to_pickle", path, compression=compression, protocol=protocol + ) + + def to_sparse(self, fill_value=None, kind="block"): + return self._default_to_pandas("to_sparse", fill_value=fill_value, kind=kind) + + def to_string( + self, + buf=None, + columns=None, + col_space=None, + header=True, + index=True, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + index_names=True, + justify=None, + max_rows=None, + max_cols=None, + show_dimensions=False, + decimal=".", + line_width=None, + ): + return self._default_to_pandas( + "to_string", + buf=buf, + columns=columns, + col_space=col_space, + header=header, + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + ) + + def to_sql( + self, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ): + new_query_compiler = self._query_compiler + # writing the index to the database by inserting it to the DF + if index: + if not index_label: + index_label = "index" + new_query_compiler = new_query_compiler.insert(0, index_label, self.index) + # so pandas._to_sql will not write the index to the database 
as well + index = False + + from modin.data_management.factories import BaseFactory + + BaseFactory.to_sql( + new_query_compiler, + name=name, + con=con, + schema=schema, + if_exists=if_exists, + index=index, + index_label=index_label, + chunksize=chunksize, + dtype=dtype, + method=method, + ) + + # TODO(williamma12): When this gets implemented, have the series one call this. + def to_timestamp(self, freq=None, how="start", axis=0, copy=True): + return self._default_to_pandas( + "to_timestamp", freq=freq, how=how, axis=axis, copy=copy + ) + + def to_xarray(self): + return self._default_to_pandas("to_xarray") + + def truediv(self, other, axis="columns", level=None, fill_value=None): + """Divides this DataFrame against another DataFrame/Series/scalar. + + Args: + other: The object to use to apply the divide against this. + axis: The axis to divide over. + level: The Multilevel index level to apply divide over. + fill_value: The value to fill NaNs with. + + Returns: + A new DataFrame with the Divide applied. 
+ """ + return self._binary_op( + "truediv", other, axis=axis, level=level, fill_value=fill_value + ) + + div = divide = truediv + + def truncate(self, before=None, after=None, axis=None, copy=True): + return self._default_to_pandas( + "truncate", before=before, after=after, axis=axis, copy=copy + ) + + def tshift(self, periods=1, freq=None, axis=0): + return self._default_to_pandas("tshift", periods=periods, freq=freq, axis=axis) + + def transform(self, func, axis=0, *args, **kwargs): + kwargs["is_transform"] = True + result = self.agg(func, axis=axis, *args, **kwargs) + try: + assert len(result) == len(self) + except Exception: + raise ValueError("transforms cannot produce aggregated results") + return result + + def tz_convert(self, tz, axis=0, level=None, copy=True): + return self._default_to_pandas( + "tz_convert", tz, axis=axis, level=level, copy=copy + ) + + def tz_localize( + self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" + ): + return self._default_to_pandas( + "tz_localize", + tz, + axis=axis, + level=level, + copy=copy, + ambiguous=ambiguous, + nonexistent=nonexistent, + ) + + def unstack(self, level=-1, fill_value=None): + return self._default_to_pandas("unstack", level=level, fill_value=fill_value) + + def var( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + """Computes variance across the DataFrame. + + Args: + axis (int): The axis to take the variance on. + skipna (bool): True to skip NA values, false otherwise. + ddof (int): degrees of freedom + + Returns: + The variance of the DataFrame. 
+ """ + axis = self._get_axis_number(axis) if axis is not None else 0 + if numeric_only is not None and not numeric_only: + self._validate_dtypes(numeric_only=True) + + return self._reduce_dimension( + self._query_compiler.var( + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs + ) + ) + + def __abs__(self): + """Creates a modified DataFrame by taking the absolute value. + + Returns: + A modified DataFrame + """ + return self.abs() + + def __and__(self, other): + return self._binary_op("__and__", other, axis=0) + + def __array__(self, dtype=None): + # TODO: This is very inefficient and needs fix, also see as_matrix + return self._default_to_pandas("__array__", dtype=dtype) + + def __array_wrap__(self, result, context=None): + # TODO: This is very inefficient, see also __array__ and as_matrix + return self._default_to_pandas("__array_wrap__", result, context=context) + + def __copy__(self, deep=True): + """Make a copy of this object. + + Args: + deep: Boolean, deep copy or not. + Currently we do not support deep copy. + + Returns: + A Modin Series/DataFrame object. + """ + return self.copy(deep=deep) + + def __deepcopy__(self, memo=None): + """Make a -deep- copy of this object. + + Note: This is equivalent to copy(deep=True). + + Args: + memo: No effect. Just to comply with Pandas API. + + Returns: + A Modin Series/DataFrame object. 
+ """ + return self.copy(deep=True) + + def __eq__(self, other): + return self.eq(other) + + def __finalize__(self, other, method=None, **kwargs): + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas("__finalize__", other, method=method, **kwargs) + + def __ge__(self, other): + return self.ge(other) + + def __getstate__(self): + return self._default_to_pandas("__getstate__") + + def __gt__(self, other): + return self.gt(other) + + def __invert__(self): + if not all(is_numeric_dtype(d) for d in self._get_dtypes()): + raise TypeError( + "bad operand type for unary ~: '{}'".format( + next(d for d in self._get_dtypes() if not is_numeric_dtype(d)) + ) + ) + return self.__constructor__(query_compiler=self._query_compiler.invert()) + + def __le__(self, other): + return self.le(other) + + def __len__(self): + """Gets the length of the DataFrame. + + Returns: + Returns an integer length of the DataFrame object. + """ + return len(self.index) + + def __lt__(self, other): + return self.lt(other) + + def __ne__(self, other): + return self.ne(other) + + def __neg__(self): + """Computes an element wise negative DataFrame + + Returns: + A modified DataFrame where every element is the negation of before + """ + self._validate_dtypes(numeric_only=True) + return self.__constructor__(query_compiler=self._query_compiler.negative()) + + def __nonzero__(self): + raise ValueError( + "The truth value of a {0} is ambiguous. 
" + "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( + self.__class__.__name__ + ) + ) + + __bool__ = __nonzero__ + + def __or__(self, other): + return self._binary_op("__or__", other, axis=0) + + def __sizeof__(self): + return self._default_to_pandas("__sizeof__") + + def __str__(self): # pragma: no cover + return repr(self) + + def __xor__(self, other): + return self._binary_op("__xor__", other, axis=0) + + @property + def blocks(self): + def blocks(df): + """Defined because properties do not have a __name__""" + return df.blocks + + return self._default_to_pandas(blocks) + + @property + def is_copy(self): + warnings.warn( + "Attribute `is_copy` is deprecated and will be removed in a " + "future version.", + FutureWarning, + ) + # Pandas doesn't do anything so neither do we. + return + + @property + def size(self): + """Get the number of elements in the DataFrame. + + Returns: + The number of elements in the DataFrame. + """ + return len(self._query_compiler.index) * len(self._query_compiler.columns) + + @property + def values(self): + """Create a numpy array with the values from this object. + + Returns: + The numpy representation of this object. + """ + return self._to_pandas().values + + @property + def __name__(self): + return type(self).__name__ + + def __getattribute__(self, item): + default_behaviors = [ + "__init__", + "__class__", + "index", + "_get_index", + "_set_index", + "empty", + "index", + "columns", + "name", + "_get_name", + "_set_name", + "dtypes", + "dtype", + "_default_to_pandas", + "_query_compiler", + "_to_pandas", + "_build_repr_df", + "_reduce_dimension", + "__repr__", + "__len__", + ] + if item not in default_behaviors: + method = object.__getattribute__(self, item) + is_callable = callable(method) + # We default to pandas on empty DataFrames. This avoids a large amount of + # pain in underlying implementation and returns a result immediately rather + # than dealing with the edge cases that empty DataFrames have. 
+ if self.empty and is_callable: + + def default_handler(*args, **kwargs): + return self._default_to_pandas(item, *args, **kwargs) + + return default_handler + return object.__getattribute__(self, item) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 2825dbcd3b7..35b7ae2275a 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -3,24 +3,12 @@ from __future__ import print_function import pandas -from pandas.api.types import is_scalar -from pandas.compat import to_str, string_types, numpy as numpy_compat, cPickle as pkl -from pandas.core.common import ( - count_not_none, - _pipe, - apply_if_callable, - is_bool_indexer, - _get_rename_function, -) +from pandas.compat import string_types +from pandas.core.common import apply_if_callable, is_bool_indexer from pandas.core.dtypes.common import ( infer_dtype_from_object, is_list_like, - is_dict_like, is_numeric_dtype, - is_datetime_or_timedelta_dtype, - is_dtype_equal, - is_object_dtype, - is_integer_dtype, ) from pandas.core.index import ensure_index_from_sequences from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable @@ -29,25 +17,20 @@ import itertools import functools import numpy as np -from numpy import nan -import re import sys import warnings from modin.error_message import ErrorMessage from .utils import from_pandas, to_pandas, _inherit_docstrings from .iterator import PartitionIterator -from .series import SeriesView - -# Similar to pandas, sentinel value to use as kwarg in place of None when None has -# special meaning and needs to be distinguished from a user explicitly passing None. -sentinel = object() +from .series import Series +from .base import BasePandasDataset @_inherit_docstrings( pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__] ) -class DataFrame(object): +class DataFrame(BasePandasDataset): def __init__( self, data=None, @@ -73,63 +56,19 @@ def __init__( Only affects DataFrame / 2d ndarray input. 
query_compiler: A query compiler object to manage distributed computation. """ - if isinstance(data, DataFrame): - self._query_compiler = data._query_compiler - return - + if isinstance(data, (DataFrame, Series)): + self._query_compiler = data._query_compiler.copy() + if isinstance(data, Series) and data.name is None: + self.columns = [0] # Check type of data and use appropriate constructor - if data is not None or query_compiler is None: - + elif data is not None or query_compiler is None: pandas_df = pandas.DataFrame( data=data, index=index, columns=columns, dtype=dtype, copy=copy ) - self._query_compiler = from_pandas(pandas_df)._query_compiler else: self._query_compiler = query_compiler - def __str__(self): # pragma: no cover - return repr(self) - - def _build_repr_df(self, num_rows, num_cols): - # Add one here so that pandas automatically adds the dots - # It turns out to be faster to extract 2 extra rows and columns than to - # build the dots ourselves. - num_rows_for_head = num_rows // 2 + 1 - num_cols_for_front = num_cols // 2 + 1 - - if len(self.index) <= num_rows: - head = self._query_compiler - tail = None - else: - head = self._query_compiler.head(num_rows_for_head) - tail = self._query_compiler.tail(num_rows_for_head) - - if len(self.columns) <= num_cols: - head_front = head.to_pandas() - # Creating these empty to make the concat logic simpler - head_back = pandas.DataFrame() - tail_back = pandas.DataFrame() - - if tail is not None: - tail_front = tail.to_pandas() - else: - tail_front = pandas.DataFrame() - else: - head_front = head.front(num_cols_for_front).to_pandas() - head_back = head.back(num_cols_for_front).to_pandas() - - if tail is not None: - tail_front = tail.front(num_cols_for_front).to_pandas() - tail_back = tail.back(num_cols_for_front).to_pandas() - else: - tail_front = tail_back = pandas.DataFrame() - - head_for_repr = pandas.concat([head_front, head_back], axis=1) - tail_for_repr = pandas.concat([tail_front, tail_back], axis=1) - - 
return pandas.concat([head_for_repr, tail_for_repr]) - def __repr__(self): # In the future, we can have this be configurable, just like Pandas. num_rows = 60 @@ -168,14 +107,6 @@ def _repr_html_(self): # pragma: no cover else: return result - def _get_index(self): - """Get the index for this DataFrame. - - Returns: - The union of all indexes across the partitions. - """ - return self._query_compiler.index - def _get_columns(self): """Get the columns for this DataFrame. @@ -184,23 +115,14 @@ def _get_columns(self): """ return self._query_compiler.columns - def _set_index(self, new_index): - """Set the index for this DataFrame. - - Args: - new_index: The new index to set this - """ - self._query_compiler.index = new_index - def _set_columns(self, new_columns): """Set the columns for this DataFrame. Args: - new_index: The new index to set this + new_columns: The new index to set this """ self._query_compiler.columns = new_columns - index = property(_get_index, _set_index) columns = property(_get_columns, _set_columns) def _validate_eval_query(self, expr, **kwargs): @@ -222,15 +144,6 @@ def _validate_eval_query(self, expr, **kwargs): "'Not' nodes are not implemented." ) # pragma: no cover - @property - def size(self): - """Get the number of elements in the DataFrame. - - Returns: - The number of elements in the DataFrame. - """ - return len(self.index) * len(self.columns) - @property def ndim(self): """Get the number of dimensions for this DataFrame. @@ -255,6 +168,11 @@ def ftypes(self): result = pandas.Series(ftypes, index=self.columns) return result + def drop_duplicates(self, subset=None, keep="first", inplace=False): + return super(DataFrame, self).drop_duplicates( + subset=subset, keep=keep, inplace=inplace + ) + @property def dtypes(self): """Get the dtypes for this DataFrame. 
@@ -264,6 +182,9 @@ def dtypes(self): """ return self._query_compiler.dtypes + def duplicated(self, subset=None, keep="first"): + return super(DataFrame, self).duplicated(subset=subset, keep=keep) + @property def empty(self): """Determines if the DataFrame is empty. @@ -274,15 +195,6 @@ def empty(self): """ return len(self.columns) == 0 or len(self.index) == 0 - @property - def values(self): - """Create a numpy array with the values from this DataFrame. - - Returns: - The numpy representation of this DataFrame. - """ - return to_pandas(self).values - @property def axes(self): """Get the axes for the DataFrame. @@ -301,16 +213,6 @@ def shape(self): """ return len(self.index), len(self.columns) - def _update_inplace(self, new_query_compiler): - """Updates the current DataFrame inplace. - - Args: - new_query_compiler: The new QueryCompiler to use to manage the data - """ - old_query_compiler = self._query_compiler - self._query_compiler = new_query_compiler - old_query_compiler.free() - def add_prefix(self, prefix): """Add a prefix to each of the column names. @@ -338,13 +240,72 @@ def applymap(self, func): ErrorMessage.non_verified_udf() return DataFrame(query_compiler=self._query_compiler.applymap(func)) - def copy(self, deep=True): - """Creates a shallow copy of the DataFrame. + def apply( + self, + func, + axis=0, + broadcast=None, + raw=False, + reduce=None, + result_type=None, + convert_dtype=True, + args=(), + **kwds + ): + axis = self._get_axis_number(axis) + query_compiler = super(DataFrame, self).apply( + func, + axis=axis, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + convert_dtype=convert_dtype, + args=args, + **kwds + ) + if not isinstance(query_compiler, type(self._query_compiler)): + return query_compiler + # This is the simplest way to determine the return type, but there are checks + # in pandas that verify that some results are created. 
This is a challenge for + # empty DataFrames, but fortunately they only happen when the `func` type is + # a list or a dictionary, which means that the return type won't change from + # type(self), so we catch that error and use `self.__name__` for the return + # type. + try: + if axis == 0: + init_kwargs = {"index": self.index} + else: + init_kwargs = {"columns": self.columns} + return_type = type( + getattr(pandas, self.__name__)(**init_kwargs).apply( + func, + axis=axis, + broadcast=broadcast, + raw=raw, + reduce=reduce, + result_type=result_type, + ) + ).__name__ + except ValueError: + return_type = self.__name__ + if return_type not in ["DataFrame", "Series"]: + return query_compiler.to_pandas().squeeze() + else: + result = getattr(sys.modules[self.__module__], return_type)( + query_compiler=query_compiler + ) + if hasattr(result, "name"): + if axis == 0 and result.name == self.index[0]: + result.name = None + elif axis == 1 and result.name == self.columns[0]: + result.name = None + return result - Returns: - A new DataFrame pointing to the same partitions as this one. - """ - return DataFrame(query_compiler=self._query_compiler.copy()) + def get_value(self, index, col, takeable=False): + return self._default_to_pandas( + pandas.DataFrame.get_value, index, col, takeable=takeable + ) def groupby( self, @@ -370,7 +331,7 @@ def groupby( Returns: A new DataFrame resulting from the groupby. """ - axis = pandas.DataFrame()._get_axis_number(axis) + axis = self._get_axis_number(axis) idx_name = None if callable(by): by = by(self.index) @@ -409,72 +370,8 @@ def groupby( **kwargs ) - def sum( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs - ): - """Perform a sum across the DataFrame. - - Args: - axis (int): The axis to sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The sum of the DataFrame. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=False) - - return self._query_compiler.sum( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs - ) - - def abs(self): - """Apply an absolute value function to all numeric columns. - - Returns: - A new DataFrame with the applied absolute value. - """ - self._validate_dtypes(numeric_only=True) - - return DataFrame(query_compiler=self._query_compiler.abs()) - - def isin(self, values): - """Fill a DataFrame with booleans for cells contained in values. - - Args: - values (iterable, DataFrame, Series, or dict): The values to find. - - Returns: - A new DataFrame with booleans representing whether or not a cell - is in values. - True: cell is contained in values. - False: otherwise - """ - return DataFrame(query_compiler=self._query_compiler.isin(values=values)) - - def isna(self): - """Fill a DataFrame with booleans for cells containing NA. - - Returns: - A new DataFrame with booleans representing whether or not a cell - is NA. - True: cell contains NA. - False: otherwise. - """ - return DataFrame(query_compiler=self._query_compiler.isna()) - - isnull = isna + def _reduce_dimension(self, query_compiler): + return Series(query_compiler=query_compiler) def keys(self): """Get the info axis for the DataFrame. @@ -494,202 +391,11 @@ def transpose(self, *args, **kwargs): T = property(transpose) - def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): - """Create a new DataFrame from the removed NA values from this one. - - Args: - axis (int, tuple, or list): The axis to apply the drop. - how (str): How to drop the NA values. - 'all': drop the label if all values are NA. - 'any': drop the label if any values are NA. - thresh (int): The minimum number of NAs to require. - subset ([label]): Labels to consider from other axis. 
- inplace (bool): Change this DataFrame or return a new DataFrame. - True: Modify the data for this DataFrame, return None. - False: Create a new DataFrame and return it. - - Returns: - If inplace is set to True, returns None, otherwise returns a new - DataFrame with the dropna applied. - """ - inplace = validate_bool_kwarg(inplace, "inplace") - - if is_list_like(axis): - axis = [pandas.DataFrame()._get_axis_number(ax) for ax in axis] - result = self - - for ax in axis: - result = result.dropna(axis=ax, how=how, thresh=thresh, subset=subset) - return self._create_dataframe_from_compiler(result._query_compiler, inplace) - - axis = pandas.DataFrame()._get_axis_number(axis) - if how is not None and how not in ["any", "all"]: - raise ValueError("invalid how option: %s" % how) - if how is None and thresh is None: - raise TypeError("must specify how or thresh") - if subset is not None: - if axis == 1: - indices = self.index.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - else: - indices = self.columns.get_indexer_for(subset) - check = indices == -1 - if check.any(): - raise KeyError(list(np.compress(check, subset))) - new_query_compiler = self._query_compiler.dropna( - axis=axis, how=how, thresh=thresh, subset=subset - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - def add(self, other, axis="columns", level=None, fill_value=None): - """Add this DataFrame to another or a scalar/list. - - Args: - other: What to add this this DataFrame. - axis: The axis to apply addition over. Only applicaable to Series - or list 'other'. - level: A level in the multilevel axis to add over. - fill_value: The value to fill NaN. - - Returns: - A new DataFrame with the applied addition. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.add, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_or_object_only=True) - new_query_compiler = self._query_compiler.add( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def agg(self, func, axis=0, *args, **kwargs): - return self.aggregate(func, axis, *args, **kwargs) - - def aggregate(self, func, axis=0, *args, **kwargs): - axis = pandas.DataFrame()._get_axis_number(axis) - - result = None - - if axis == 0: - try: - result = self._aggregate(func, _axis=axis, *args, **kwargs) - except TypeError: - pass - - if result is None: - kwargs.pop("is_transform", None) - return self.apply(func, axis=axis, args=args, **kwargs) - - return result - - def _aggregate(self, arg, *args, **kwargs): - _axis = kwargs.pop("_axis", 0) - kwargs.pop("_level", None) - - if isinstance(arg, string_types): - kwargs.pop("is_transform", None) - return self._string_function(arg, *args, **kwargs) - - # Dictionaries have complex behavior because they can be renamed here. 
- elif isinstance(arg, dict): - return self._default_to_pandas(pandas.DataFrame.agg, arg, *args, **kwargs) - elif is_list_like(arg) or callable(arg): - kwargs.pop("is_transform", None) - return self.apply(arg, axis=_axis, args=args, **kwargs) - else: - raise TypeError("type {} is not callable".format(type(arg))) - - def _string_function(self, func, *args, **kwargs): - assert isinstance(func, string_types) - - f = getattr(self, func, None) - - if f is not None: - if callable(f): - return f(*args, **kwargs) - - assert len(args) == 0 - assert ( - len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0 - ) - return f - - f = getattr(np, func, None) - if f is not None: - return self._default_to_pandas(pandas.DataFrame.agg, func, *args, **kwargs) - - raise ValueError("{} is an unknown string function".format(func)) - - def align( - self, - other, - join="outer", - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None, - ): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.align, - other, - join=join, - axis=axis, - level=level, - copy=copy, - fill_value=fill_value, - method=method, - limit=limit, - fill_axis=fill_axis, - broadcast_axis=broadcast_axis, - ) - - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """Return whether all elements are True over requested axis - - Note: - If axis=None or axis=0, this call applies df.all(axis=1) - to the transpose of df. 
- """ - if axis is not None: - axis = pandas.DataFrame()._get_axis_number(axis) - else: - if bool_only: - raise ValueError("Axis must be 0 or 1 (got {})".format(axis)) - return self._query_compiler.all( - axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs - ) - - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """Return whether any elements are True over requested axis - - Note: - If axis=None or axis=0, this call applies on the column partitions, - otherwise operates on row partitions - """ - if axis is not None: - axis = pandas.DataFrame()._get_axis_number(axis) - else: - if bool_only: - raise ValueError("Axis must be 0 or 1 (got {})".format(axis)) - return self._query_compiler.any( - axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).add( + other, axis=axis, level=level, fill_value=fill_value ) def append(self, other, ignore_index=False, verify_integrity=False, sort=None): @@ -703,26 +409,27 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): Returns: A new DataFrame containing the concatenated values. 
""" - if isinstance(other, (pandas.Series, dict)): + if isinstance(other, (Series, dict)): if isinstance(other, dict): - other = pandas.Series(other) + other = Series(other) if other.name is None and not ignore_index: raise TypeError( "Can only append a Series if ignore_index=True" " or if the Series has a name" ) - - if other.name is None: - index = None - else: + if other.name is not None: # other must have the same index name as self, otherwise # index name will be reset - index = pandas.Index([other.name], name=self.index.name) - - # Create a Modin DataFrame from this Series for ease of development - other = DataFrame(pandas.DataFrame(other).T, index=index)._query_compiler + name = other.name + # We must transpose here because a Series becomes a new row, and the + # structure of the query compiler is currently columnar + other = other._query_compiler.transpose() + other.index = pandas.Index([name], name=self.index.name) + else: + # See note above about transpose + other = other._query_compiler.transpose() elif isinstance(other, list): - if not isinstance(other[0], DataFrame): + if not all(isinstance(o, BasePandasDataset) for o in other): other = DataFrame(pandas.DataFrame(other))._query_compiler else: other = [obj._query_compiler for obj in other] @@ -750,237 +457,38 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None): ) return DataFrame(query_compiler=query_compiler) - def apply( + def assign(self, **kwargs): + return self._default_to_pandas(pandas.DataFrame.assign, **kwargs) + + def boxplot( self, - func, - axis=0, - broadcast=None, - raw=False, - reduce=None, - result_type=None, - args=(), + column=None, + by=None, + ax=None, + fontsize=None, + rot=0, + grid=True, + figsize=None, + layout=None, + return_type=None, **kwds ): - """Apply a function along input axis of DataFrame. - - Args: - func: The function to apply - axis: The axis over which to apply the func. - broadcast: Whether or not to broadcast. 
- raw: Whether or not to convert to a Series. - reduce: Whether or not to try to apply reduction procedures. - - Returns: - Series or DataFrame, depending on func. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - ErrorMessage.non_verified_udf() - - if isinstance(func, string_types): - if axis == 1: - kwds["axis"] = axis - return getattr(self, func)(*args, **kwds) - elif isinstance(func, dict): - if axis == 1: - raise TypeError( - "(\"'dict' object is not callable\", " - "'occurred at index {0}'".format(self.index[0]) - ) - if len(self.columns) != len(set(self.columns)): - warnings.warn( - "duplicate column names not supported with apply().", - FutureWarning, - stacklevel=2, - ) - elif not callable(func) and not is_list_like(func): - raise TypeError("{} object is not callable".format(type(func))) - - query_compiler = self._query_compiler.apply(func, axis, *args, **kwds) - if isinstance(query_compiler, pandas.Series): - return query_compiler - return DataFrame(query_compiler=query_compiler) - - def as_blocks(self, copy=True): - return self._default_to_pandas(pandas.DataFrame.as_blocks, copy=copy) - - def as_matrix(self, columns=None): - """Convert the frame to its Numpy-array representation. - - Args: - columns: If None, return all columns, otherwise, - returns specified columns. - - Returns: - values: ndarray - """ - # TODO this is very inefficient, also see __array__ - return to_pandas(self).as_matrix(columns) - - def to_numpy(self, dtype=None, copy=False): - """Convert the DataFrame to a NumPy array. - - Args: - dtype: The dtype to pass to numpy.asarray() - copy: Whether to ensure that the returned value is a not a view on another - array. - - Returns: - A numpy array. 
- """ - return self._default_to_pandas( - pandas.DataFrame.to_numpy, dtype=dtype, copy=copy + return to_pandas(self).boxplot( + column=column, + by=by, + ax=ax, + fontsize=fontsize, + rot=rot, + grid=grid, + figsize=figsize, + layout=layout, + return_type=return_type, + **kwds ) - def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): - return self._default_to_pandas( - pandas.DataFrame.asfreq, - freq, - method=method, - how=how, - normalize=normalize, - fill_value=fill_value, - ) - - def asof(self, where, subset=None): - return self._default_to_pandas(pandas.DataFrame.asof, where, subset=subset) - - def assign(self, **kwargs): - return self._default_to_pandas(pandas.DataFrame.assign, **kwargs) - - def astype(self, dtype, copy=True, errors="raise", **kwargs): - col_dtypes = {} - if isinstance(dtype, dict): - if not set(dtype.keys()).issubset(set(self.columns)) and errors == "raise": - raise KeyError( - "Only a column name can be used for the key in" - "a dtype mappings argument." 
- ) - col_dtypes = dtype - - else: - for column in self.columns: - col_dtypes[column] = dtype - - new_query_compiler = self._query_compiler.astype(col_dtypes, **kwargs) - return self._create_dataframe_from_compiler(new_query_compiler, not copy) - - def at_time(self, time, asof=False, axis=None): - return self._default_to_pandas( - pandas.DataFrame.at_time, time, asof=asof, axis=axis - ) - - def between_time( - self, start_time, end_time, include_start=True, include_end=True, axis=None - ): - return self._default_to_pandas( - pandas.DataFrame.between_time, - start_time, - end_time, - include_start=include_start, - include_end=include_end, - axis=axis, - ) - - def bfill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for DataFrame.fillna(method='bfill')""" - new_df = self.fillna( - method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - if not inplace: - return new_df - - def bool(self): - """Return the bool of a single element PandasObject. - - This must be a boolean scalar value, either True or False. Raise a - ValueError if the PandasObject does not have exactly 1 element, or that - element is not boolean - """ - shape = self.shape - if shape != (1,) and shape != (1, 1): - raise ValueError( - """The PandasObject does not have exactly - 1 element. Return the bool of a single - element PandasObject. The truth value is - ambiguous. 
Use a.empty, a.item(), a.any() - or a.all().""" - ) - else: - return to_pandas(self).bool() - - def boxplot( - self, - column=None, - by=None, - ax=None, - fontsize=None, - rot=0, - grid=True, - figsize=None, - layout=None, - return_type=None, - **kwds - ): - return to_pandas(self).boxplot( - column=column, - by=by, - ax=ax, - fontsize=fontsize, - rot=rot, - grid=grid, - figsize=figsize, - layout=layout, - return_type=return_type, - **kwds - ) - - def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs): - # validate inputs - if axis is not None: - axis = pandas.DataFrame()._get_axis_number(axis) - self._validate_dtypes(numeric_only=True) - if is_list_like(lower) or is_list_like(upper): - if axis is None: - raise ValueError("Must specify axis = 0 or 1") - self._validate_other(lower, axis) - self._validate_other(upper, axis) - inplace = validate_bool_kwarg(inplace, "inplace") - axis = numpy_compat.function.validate_clip_with_axis(axis, args, kwargs) - # any np.nan bounds are treated as None - if lower is not None and np.any(np.isnan(lower)): - lower = None - if upper is not None and np.any(np.isnan(upper)): - upper = None - new_query_compiler = self._query_compiler.clip( - lower=lower, upper=upper, axis=axis, inplace=inplace, *args, **kwargs - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def clip_lower(self, threshold, axis=None, inplace=False): - return self.clip(lower=threshold, axis=axis, inplace=inplace) - - def clip_upper(self, threshold, axis=None, inplace=False): - return self.clip(upper=threshold, axis=axis, inplace=inplace) - - def combine(self, other, func, fill_value=None, overwrite=True): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.combine, - other, - func, - fill_value=fill_value, - overwrite=overwrite, - ) - - def combine_first(self, other): - if isinstance(other, DataFrame): - other = 
other._query_compiler.to_pandas() - return self._default_to_pandas(pandas.DataFrame.combine_first, other=other) - - def compound(self, axis=None, skipna=None, level=None): - return self._default_to_pandas( - pandas.DataFrame.compound, axis=axis, skipna=skipna, level=level + def combine(self, other, func, fill_value=None, overwrite=True): + return super(DataFrame, self).combine( + other, func, fill_value=fill_value, overwrite=overwrite ) def convert_objects( @@ -1010,375 +518,13 @@ def corrwith(self, other, axis=0, drop=False, method="pearson"): pandas.DataFrame.corrwith, other, axis=axis, drop=drop, method=method ) - def count(self, axis=0, level=None, numeric_only=False): - """Get the count of non-null objects in the DataFrame. - - Arguments: - axis: 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - level: If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a DataFrame. - numeric_only: Include only float, int, boolean data - - Returns: - The count, in a Series (or DataFrame if level is specified). - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - return self._query_compiler.count( - axis=axis, level=level, numeric_only=numeric_only - ) - def cov(self, min_periods=None): return self._default_to_pandas(pandas.DataFrame.cov, min_periods=min_periods) - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative maximum across the DataFrame. - - Args: - axis (int): The axis to take maximum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative maximum of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if axis: - self._validate_dtypes() - return DataFrame( - query_compiler=self._query_compiler.cummax( - axis=axis, skipna=skipna, **kwargs - ) - ) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative minimum across the DataFrame. 
- - Args: - axis (int): The axis to cummin on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative minimum of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if axis: - self._validate_dtypes() - return DataFrame( - query_compiler=self._query_compiler.cummin( - axis=axis, skipna=skipna, **kwargs - ) - ) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative product across the DataFrame. - - Args: - axis (int): The axis to take product on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative product of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes(numeric_only=True) - return DataFrame( - query_compiler=self._query_compiler.cumprod( - axis=axis, skipna=skipna, **kwargs - ) - ) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """Perform a cumulative sum across the DataFrame. - - Args: - axis (int): The axis to take sum on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The cumulative sum of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes(numeric_only=True) - return DataFrame( - query_compiler=self._query_compiler.cumsum( - axis=axis, skipna=skipna, **kwargs - ) - ) - - def describe(self, percentiles=None, include=None, exclude=None): - """ - Generates descriptive statistics that summarize the central tendency, - dispersion and shape of a dataset's distribution, excluding NaN values. - - Args: - percentiles (list-like of numbers, optional): - The percentiles to include in the output. 
- include: White-list of data types to include in results - exclude: Black-list of data types to exclude in results - - Returns: Series/DataFrame of summary statistics - """ - if include is not None and (isinstance(include, np.dtype) or include != "all"): - if not is_list_like(include): - include = [include] - include = [ - np.dtype(i) - if not (isinstance(i, type) and i.__module__ == "numpy") - else i - for i in include - ] - if not any( - (isinstance(inc, np.dtype) and inc == d) - or ( - not isinstance(inc, np.dtype) - and inc.__subclasscheck__(getattr(np, d.__str__())) - ) - for d in self.dtypes.values - for inc in include - ): - # This is the error that pandas throws. - raise ValueError("No objects to concatenate") - if exclude is not None: - if not is_list_like(exclude): - exclude = [exclude] - exclude = [np.dtype(e) for e in exclude] - if all( - (isinstance(exc, np.dtype) and exc == d) - or ( - not isinstance(exc, np.dtype) - and exc.__subclasscheck__(getattr(np, d.__str__())) - ) - for d in self.dtypes.values - for exc in exclude - ): - # This is the error that pandas throws. - raise ValueError("No objects to concatenate") - if percentiles is not None: - pandas.DataFrame()._check_percentile(percentiles) - return DataFrame( - query_compiler=self._query_compiler.describe( - percentiles=percentiles, include=include, exclude=exclude - ) - ) - - def diff(self, periods=1, axis=0): - """Finds the difference between elements on the axis requested - - Args: - periods: Periods to shift for forming difference - axis: Take difference over rows or columns - - Returns: - DataFrame with the diff applied - """ - axis = pandas.DataFrame()._get_axis_number(axis) - return DataFrame( - query_compiler=self._query_compiler.diff(periods=periods, axis=axis) - ) - - def div(self, other, axis="columns", level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. 
- axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.div, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.div( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def divide(self, other, axis="columns", level=None, fill_value=None): - """Synonym for div. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - return self.div(other, axis, level, fill_value) - - def dot(self, other): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas(pandas.DataFrame.dot, other) - - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - """Return new object with labels in requested axis removed. - Args: - labels: Index or column labels to drop. - axis: Whether to drop labels from the index (0 / 'index') or - columns (1 / 'columns'). - index, columns: Alternative to specifying axis (labels, axis=1 is - equivalent to columns=labels). - level: For MultiIndex - inplace: If True, do operation inplace and return None. - errors: If 'ignore', suppress error and existing labels are - dropped. 
- Returns: - dropped : type of caller - """ - # TODO implement level - if level is not None: - return self._default_to_pandas( - pandas.DataFrame.drop, - labels=labels, - axis=axis, - index=index, - columns=columns, - level=level, - inplace=inplace, - errors=errors, - ) - - inplace = validate_bool_kwarg(inplace, "inplace") - if labels is not None: - if index is not None or columns is not None: - raise ValueError("Cannot specify both 'labels' and 'index'/'columns'") - axis = pandas.DataFrame()._get_axis_name(axis) - axes = {axis: labels} - elif index is not None or columns is not None: - axes, _ = pandas.DataFrame()._construct_axes_from_arguments( - (index, columns), {} - ) - else: - raise ValueError( - "Need to specify at least one of 'labels', 'index' or 'columns'" - ) - - # TODO Clean up this error checking - if "index" not in axes: - axes["index"] = None - elif axes["index"] is not None: - if not is_list_like(axes["index"]): - axes["index"] = [axes["index"]] - if errors == "raise": - non_existant = [obj for obj in axes["index"] if obj not in self.index] - if len(non_existant): - raise ValueError( - "labels {} not contained in axis".format(non_existant) - ) - else: - axes["index"] = [obj for obj in axes["index"] if obj in self.index] - # If the length is zero, we will just do nothing - if not len(axes["index"]): - axes["index"] = None - - if "columns" not in axes: - axes["columns"] = None - elif axes["columns"] is not None: - if not is_list_like(axes["columns"]): - axes["columns"] = [axes["columns"]] - if errors == "raise": - non_existant = [ - obj for obj in axes["columns"] if obj not in self.columns - ] - if len(non_existant): - raise ValueError( - "labels {} not contained in axis".format(non_existant) - ) - else: - axes["columns"] = [ - obj for obj in axes["columns"] if obj in self.columns - ] - # If the length is zero, we will just do nothing - if not len(axes["columns"]): - axes["columns"] = None - - new_query_compiler = self._query_compiler.drop( - 
index=axes["index"], columns=axes["columns"] - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def droplevel(self, level, axis=0): - """Return index with requested level(s) removed. - - Args: - level: The level to drop - - Returns: - Index or MultiIndex - """ - return self._default_to_pandas(pandas.DataFrame.droplevel, level, axis=axis) - - def drop_duplicates(self, subset=None, keep="first", inplace=False): - """Return DataFrame with duplicate rows removed, optionally only considering certain columns - - Args: - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns - keep : {'first', 'last', False}, default 'first' - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. - inplace : boolean, default False - Whether to drop duplicates in place or to return a copy - - Returns: - deduplicated : DataFrame - """ - inplace = validate_bool_kwarg(inplace, "inplace") - duplicates = self.duplicated(subset=subset, keep=keep) - indices, = duplicates.values.nonzero() - return self.drop(index=self.index[indices], inplace=inplace) - - def duplicated(self, subset=None, keep="first"): - return self._default_to_pandas( - pandas.DataFrame.duplicated, subset=subset, keep=keep - ) - def eq(self, other, axis="columns", level=None): - """Checks element-wise that this is equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the eq over. - level: The Multilevel index level to apply eq over. - - Returns: - A new DataFrame filled with Booleans. 
- """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.eq, other, axis=axis, level=level - ) - other = self._validate_other(other, axis) - new_query_compiler = self._query_compiler.eq( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).eq(other, axis=axis, level=level) def equals(self, other): """ @@ -1388,12 +534,13 @@ def equals(self, other): Boolean: True if equal, otherwise False """ if isinstance(other, pandas.DataFrame): - # Copy into a Ray DataFrame to simplify logic below + # Copy into a Modin DataFrame to simplify logic below other = DataFrame(other) - if not self.index.equals(other.index) or not self.columns.equals(other.columns): - return False - - return all(self.eq(other).all()) + return ( + self.index.equals(other.index) + and self.columns.equals(other.columns) + and self.eq(other).all().all() + ) def eval(self, expr, inplace=False, **kwargs): """Evaluate a Python expression as a string using various backends. 
@@ -1441,244 +588,27 @@ def eval(self, expr, inplace=False, **kwargs): ndarray, numeric scalar, DataFrame, Series """ self._validate_eval_query(expr, **kwargs) - inplace = validate_bool_kwarg(inplace, "inplace") - new_query_compiler = self._query_compiler.eval(expr, **kwargs) - - if isinstance(new_query_compiler, pandas.Series): - return new_query_compiler - else: - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def ewm( - self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - ): - return self._default_to_pandas( - pandas.DataFrame.ewm, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - ) - - def expanding(self, min_periods=1, center=False, axis=0): - return self._default_to_pandas( - pandas.DataFrame.expanding, - min_periods=min_periods, - center=center, - axis=axis, - ) - - def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - """Synonym for DataFrame.fillna(method='ffill') - """ - new_df = self.fillna( - method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace - ) - if not inplace: - return new_df - - def fillna( - self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - **kwargs - ): - """Fill NA/NaN values using the specified method. - - Args: - value: Value to use to fill holes. This value cannot be a list. - - method: Method to use for filling holes in reindexed Series pad. - ffill: propagate last valid observation forward to next valid - backfill. - bfill: use NEXT valid observation to fill gap. - - axis: 0 or 'index', 1 or 'columns'. - - inplace: If True, fill in place. Note: this will modify any other - views on this object. - - limit: If method is specified, this is the maximum number of - consecutive NaN values to forward/backward fill. 
In other - words, if there is a gap with more than this number of - consecutive NaNs, it will only be partially filled. If method - is not specified, this is the maximum number of entries along - the entire axis where NaNs will be filled. Must be greater - than 0 if not None. - - downcast: A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an - appropriate equal type. - - Returns: - filled: DataFrame - """ - # TODO implement value passed as DataFrame - if isinstance(value, pandas.DataFrame) or isinstance(value, pandas.Series): - new_query_compiler = self._default_to_pandas( - pandas.DataFrame.fillna, - value=value, - method=method, - axis=axis, - inplace=False, - limit=limit, - downcast=downcast, - **kwargs - )._query_compiler - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - inplace = validate_bool_kwarg(inplace, "inplace") - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - - if isinstance(value, (list, tuple)): - raise TypeError( - '"value" parameter must be a scalar or dict, but ' - 'you passed a "{0}"'.format(type(value).__name__) - ) - if value is None and method is None: - raise ValueError("must specify a fill method or value") - if value is not None and method is not None: - raise ValueError("cannot specify both a fill method and value") - if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]: - expecting = "pad (ffill) or backfill (bfill)" - msg = "Invalid fill method. Expecting {expecting}. 
Got {method}".format( - expecting=expecting, method=method - ) - raise ValueError(msg) - - new_query_compiler = self._query_compiler.fillna( - value=value, - method=method, - axis=axis, - inplace=False, - limit=limit, - downcast=downcast, - **kwargs - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def filter(self, items=None, like=None, regex=None, axis=None): - """Subset rows or columns based on their labels - - Args: - items (list): list of labels to subset - like (string): retain labels where `arg in label == True` - regex (string): retain labels matching regex input - axis: axis to filter on - - Returns: - A new DataFrame with the filter applied. - """ - nkw = count_not_none(items, like, regex) - if nkw > 1: - raise TypeError( - "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" - ) - if nkw == 0: - raise TypeError("Must pass either `items`, `like`, or `regex`") - if axis is None: - axis = "columns" # This is the default info axis for dataframes - - axis = pandas.DataFrame()._get_axis_number(axis) - labels = self.columns if axis else self.index - - if items is not None: - bool_arr = labels.isin(items) - elif like is not None: - - def f(x): - return like in to_str(x) - - bool_arr = labels.map(f).tolist() - else: - - def f(x): - return matcher.search(to_str(x)) is not None - - matcher = re.compile(regex) - bool_arr = labels.map(f).tolist() - if not axis: - return self[bool_arr] - return self[self.columns[bool_arr]] - - def first(self, offset): - return self._default_to_pandas(pandas.DataFrame.first, offset) - - def first_valid_index(self): - """Return index for first non-NA/null value. - - Returns: - scalar: type of index - """ - return self._query_compiler.first_valid_index() - - def floordiv(self, other, axis="columns", level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. 
- axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.floordiv, - other, - axis=axis, - level=level, - fill_value=fill_value, + inplace = validate_bool_kwarg(inplace, "inplace") + new_query_compiler = self._query_compiler.eval(expr, **kwargs) + return_type = type( + pandas.DataFrame(columns=self.columns) + .astype(self.dtypes) + .eval(expr, **kwargs) + ).__name__ + if return_type == self.__name__: + return self._create_or_update_from_compiler(new_query_compiler, inplace) + else: + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + return getattr(sys.modules[self.__module__], return_type)( + query_compiler=new_query_compiler ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.floordiv( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - @classmethod - def from_csv( - cls, - path, - header=0, - sep=",", - index_col=0, - parse_dates=True, - encoding=None, - tupleize_cols=None, - infer_datetime_format=False, - ): - from .io import read_csv - return read_csv( - path, - header=header, - sep=sep, - index_col=index_col, - parse_dates=parse_dates, - encoding=encoding, - tupleize_cols=tupleize_cols, - infer_datetime_format=infer_datetime_format, + def floordiv(self, other, axis="columns", level=None, fill_value=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).floordiv( + other, axis=axis, level=level, fill_value=None ) @classmethod @@ -1722,106 +652,14 @@ def from_records( ) def 
ge(self, other, axis="columns", level=None): - """Checks element-wise that this is greater than or equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the gt over. - level: The Multilevel index level to apply gt over. - - Returns: - A new DataFrame filled with Booleans. - """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.ge, other, axis=axis, level=level - ) - other = self._validate_other(other, axis, comparison_dtypes_only=True) - new_query_compiler = self._query_compiler.ge( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def get(self, key, default=None): - """Get item from object for given key (DataFrame column, Panel - slice, etc.). Returns default value if not found. - - Args: - key (DataFrame column, Panel slice) : the key for which value - to get - - Returns: - value (type of items contained in object) : A value that is - stored at the key - """ - try: - return self[key] - except (KeyError, ValueError, IndexError): - return default - - def get_dtype_counts(self): - """Get the counts of dtypes in this object. - - Returns: - The counts of dtypes in this object. - """ - result = self.dtypes.value_counts() - result.index = result.index.map(lambda x: str(x)) - return result - - def get_ftype_counts(self): - """Get the counts of ftypes in this object. - - Returns: - The counts of ftypes in this object. 
- """ - return self.ftypes.value_counts().sort_index() - - def get_value(self, index, col, takeable=False): - return self._default_to_pandas( - pandas.DataFrame.get_value, index, col, takeable=takeable - ) - - def get_values(self): - return self._default_to_pandas(pandas.DataFrame.get_values) + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).ge(other, axis=axis, level=level) def gt(self, other, axis="columns", level=None): - """Checks element-wise that this is greater than other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the gt over. - level: The Multilevel index level to apply gt over. - - Returns: - A new DataFrame filled with Booleans. - """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.gt, other, axis=axis, level=level - ) - other = self._validate_other(other, axis, comparison_dtypes_only=True) - new_query_compiler = self._query_compiler.gt( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def head(self, n=5): - """Get the first n rows of the DataFrame. - - Args: - n (int): The number of rows to return. - - Returns: - A new DataFrame with the first n rows of the DataFrame. - """ - if n >= len(self.index): - return self.copy() - return DataFrame(query_compiler=self._query_compiler.head(n)) + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).gt(other, axis=axis, level=level) def hist( self, @@ -1858,39 +696,6 @@ def hist( **kwds ) - def idxmax(self, axis=0, skipna=True): - """Get the index of the first occurrence of the max value of the axis. - - Args: - axis (int): Identify the max over the rows (1) or columns (0). - skipna (bool): Whether or not to skip NA values. - - Returns: - A Series with the index for each maximum value for the axis - specified. 
- """ - if not all(d != np.dtype("O") for d in self.dtypes): - raise TypeError("reduction operation 'argmax' not allowed for this dtype") - return self._query_compiler.idxmax(axis=axis, skipna=skipna) - - def idxmin(self, axis=0, skipna=True): - """Get the index of the first occurrence of the min value of the axis. - - Args: - axis (int): Identify the min over the rows (1) or columns (0). - skipna (bool): Whether or not to skip NA values. - - Returns: - A Series with the index for each minimum value for the axis - specified. - """ - if not all(d != np.dtype("O") for d in self.dtypes): - raise TypeError("reduction operation 'argmax' not allowed for this dtype") - return self._query_compiler.idxmin(axis=axis, skipna=skipna) - - def infer_objects(self): - return self._default_to_pandas(pandas.DataFrame.infer_objects) - def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None ): @@ -1959,6 +764,9 @@ def insert(self, loc, column, value, allow_duplicates=False): raise ValueError("Wrong number of items passed 2, placement implies 1") value = value.iloc[:, 0] if len(self.index) == 0: + if isinstance(value, Series): + # TODO: Remove broadcast of Series + value = value._to_pandas() try: value = pandas.Series(value) except (TypeError, ValueError, IndexError): @@ -2175,167 +983,18 @@ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): ) ) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._default_to_pandas( - pandas.DataFrame.kurt, - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs - ) - - def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._default_to_pandas( - pandas.DataFrame.kurtosis, - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs - ) - - def last(self, offset): - return self._default_to_pandas(pandas.DataFrame.last, offset) - - def 
last_valid_index(self): - """Return index for last non-NA/null value. - - Returns: - scalar: type of index - """ - return self._query_compiler.last_valid_index() - def le(self, other, axis="columns", level=None): - """Checks element-wise that this is less than or equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the le over. - level: The Multilevel index level to apply le over. - - Returns: - A new DataFrame filled with Booleans. - """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.le, other, axis=axis, level=level - ) - other = self._validate_other(other, axis, comparison_dtypes_only=True) - new_query_compiler = self._query_compiler.le( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).le(other, axis=axis, level=level) def lookup(self, row_labels, col_labels): return self._default_to_pandas(pandas.DataFrame.lookup, row_labels, col_labels) def lt(self, other, axis="columns", level=None): - """Checks element-wise that this is less than other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the lt over. - level: The Multilevel index level to apply lt over. - - Returns: - A new DataFrame filled with Booleans. 
- """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.lt, other, axis=axis, level=level - ) - other = self._validate_other(other, axis, comparison_dtypes_only=True) - new_query_compiler = self._query_compiler.lt( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def mad(self, axis=None, skipna=None, level=None): - return self._default_to_pandas( - pandas.DataFrame.mad, axis=axis, skipna=skipna, level=level - ) - - def mask( - self, - cond, - other=nan, - inplace=False, - axis=None, - level=None, - errors="raise", - try_cast=False, - raise_on_error=None, - ): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.mask, - cond, - other=other, - inplace=inplace, - axis=axis, - level=level, - errors=errors, - try_cast=try_cast, - raise_on_error=raise_on_error, - ) - - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Perform max across the DataFrame. - - Args: - axis (int): The axis to take the max on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The max of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes_min_max(axis, numeric_only) - - return self._query_compiler.max( - axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs - ) - - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Computes mean across the DataFrame. - - Args: - axis (int): The axis to take the mean on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The mean of the DataFrame. 
(Pandas series) - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=False) - - return self._query_compiler.mean( - axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs - ) - - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Computes median across the DataFrame. - - Args: - axis (int): The axis to take the median on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The median of the DataFrame. (Pandas series) - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - - return self._query_compiler.median( - axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs - ) + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).lt(other, axis=axis, level=level) def melt( self, @@ -2368,13 +1027,13 @@ def memory_usage(self, index=True, deep=False): the memory usage of each of the columns in bytes. If `index=true`, then the first value of the Series will be 'Index' with its memory usage. """ - result = self._query_compiler.memory_usage(index=index, deep=deep) - result.index = self.columns if index: + result = self._reduce_dimension( + self._query_compiler.memory_usage(index=False, deep=deep) + ) index_value = self.index.memory_usage(deep=deep) - return pandas.Series(index_value, index=["Index"]).append(result) - - return result + return Series(index_value, index=["Index"]).append(result) + return super(DataFrame, self).memory_usage(index=index, deep=deep) def merge( self, @@ -2442,197 +1101,34 @@ def merge( right, how=how, lsuffix=suffixes[0], rsuffix=suffixes[1], sort=sort ) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Perform min across the DataFrame. 
- - Args: - axis (int): The axis to take the min on. - skipna (bool): True to skip NA values, false otherwise. - - Returns: - The min of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes_min_max(axis, numeric_only) - - return self._query_compiler.min( - axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs - ) - def mod(self, other, axis="columns", level=None, fill_value=None): - """Mods this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the mod against this. - axis: The axis to mod over. - level: The Multilevel index level to apply mod over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Mod applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.mod, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.mod( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def mode(self, axis=0, numeric_only=False, dropna=True): - """Perform mode across the DataFrame. - - Args: - axis (int): The axis to take the mode on. - numeric_only (bool): if True, only apply to numeric columns. - - Returns: - DataFrame: The mode of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - return DataFrame( - query_compiler=self._query_compiler.mode( - axis=axis, numeric_only=numeric_only, dropna=dropna - ) - ) - - def mul(self, other, axis="columns", level=None, fill_value=None): - """Multiplies this DataFrame against another DataFrame/Series/scalar. 
- - Args: - other: The object to use to apply the multiply against this. - axis: The axis to multiply over. - level: The Multilevel index level to apply multiply over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Multiply applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.mul, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.mul( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def multiply(self, other, axis="columns", level=None, fill_value=None): - """Synonym for mul. - - Args: - other: The object to use to apply the multiply against this. - axis: The axis to multiply over. - level: The Multilevel index level to apply multiply over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Multiply applied. - """ - return self.mul(other, axis, level, fill_value) - - def ne(self, other, axis="columns", level=None): - """Checks element-wise that this is not equal to other. - - Args: - other: A DataFrame or Series or scalar to compare to. - axis: The axis to perform the ne over. - level: The Multilevel index level to apply ne over. - - Returns: - A new DataFrame filled with Booleans. 
- """ - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.ne, other, axis=axis, level=level - ) - other = self._validate_other(other, axis) - new_query_compiler = self._query_compiler.ne( - other=other, axis=axis, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def nlargest(self, n, columns, keep="first"): - return self._default_to_pandas(pandas.DataFrame.nlargest, n, columns, keep=keep) - - def notna(self): - """Perform notna across the DataFrame. - - Returns: - Boolean DataFrame where value is False if corresponding - value is NaN, True otherwise - """ - return DataFrame(query_compiler=self._query_compiler.notna()) - - def notnull(self): - """Perform notnull across the DataFrame. - - Returns: - Boolean DataFrame where value is False if corresponding - value is NaN, True otherwise - """ - return DataFrame(query_compiler=self._query_compiler.notnull()) - - def nsmallest(self, n, columns, keep="first"): - return self._default_to_pandas( - pandas.DataFrame.nsmallest, n, columns, keep=keep + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).mod( + other, axis=axis, level=level, fill_value=None ) - - def nunique(self, axis=0, dropna=True): - """Return Series with number of distinct - observations over requested axis. 
- - Args: - axis : {0 or 'index', 1 or 'columns'}, default 0 - dropna : boolean, default True - - Returns: - nunique : Series - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - return self._query_compiler.nunique(axis=axis, dropna=dropna) - - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): - return self._default_to_pandas( - pandas.DataFrame.pct_change, - periods=periods, - fill_method=fill_method, - limit=limit, - freq=freq, - **kwargs + + def mul(self, other, axis="columns", level=None, fill_value=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).mul( + other, axis=axis, level=level, fill_value=None ) - def pipe(self, func, *args, **kwargs): - """Apply func(self, *args, **kwargs) + rmul = multiply = mul - Args: - func: function to apply to the df. - args: positional arguments passed into ``func``. - kwargs: a dictionary of keyword arguments passed into ``func``. + def ne(self, other, axis="columns", level=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).ne(other, axis=axis, level=level) - Returns: - object: the return type of ``func``. - """ - return _pipe(self, func, *args, **kwargs) + def nlargest(self, n, columns, keep="first"): + return self._default_to_pandas(pandas.DataFrame.nlargest, n, columns, keep=keep) + + def nsmallest(self, n, columns, keep="first"): + return self._default_to_pandas( + pandas.DataFrame.nsmallest, n, columns, keep=keep + ) def pivot(self, index=None, columns=None, values=None): return self._default_to_pandas( @@ -2696,50 +1192,14 @@ def plot( sort_columns=False, **kwargs ): - return to_pandas(self).plot - - def pop(self, item): - """Pops an item from this DataFrame and returns it. - - Args: - item (str): Column label to be popped - - Returns: - A Series containing the popped values. Also modifies this - DataFrame. 
- """ - result = self[item] - del self[item] - return result + return self._to_pandas().plot def pow(self, other, axis="columns", level=None, fill_value=None): - """Pow this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the pow against this. - axis: The axis to pow over. - level: The Multilevel index level to apply pow over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Pow applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.pow, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.pow( - other=other, axis=axis, level=level, fill_value=fill_value + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).pow( + other, axis=axis, level=level, fill_value=None ) - return self._create_dataframe_from_compiler(new_query_compiler) def prod( self, @@ -2750,51 +1210,13 @@ def prod( min_count=0, **kwargs ): - """Return the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 0 - - Returns: - prod : Series or DataFrame (if level specified) - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - self._validate_dtypes_sum_prod_mean(axis, numeric_only, ignore_axis=True) - return self._query_compiler.prod( - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs - ) - - def product( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs - ): - """Return 
the product of the values for the requested axis - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - level : int or level name, default None - numeric_only : boolean, default None - min_count : int, default 0 - - Returns: - product : Series or DataFrame (if level specified) - """ - return self.prod( + axis = self._get_axis_number(axis) + new_index = self.columns if axis else self.index + if min_count > len(new_index): + return Series( + [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") + ) + return super(DataFrame, self).prod( axis=axis, skipna=skipna, level=level, @@ -2803,73 +1225,8 @@ def product( **kwargs ) - def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): - """Return values at the given quantile over requested axis, - a la numpy.percentile. - - Args: - q (float): 0 <= q <= 1, the quantile(s) to compute - axis (int): 0 or 'index' for row-wise, - 1 or 'columns' for column-wise - interpolation: {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - Specifies which interpolation method to use - - Returns: - quantiles : Series or DataFrame - If q is an array, a DataFrame will be returned where the - index is q, the columns are the columns of self, and the - values are the quantiles. - - If q is a float, a Series will be returned where the - index is the columns of self and the values - are the quantiles. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - - def check_dtype(t): - return is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t) - - if not numeric_only: - # If not numeric_only and columns, then check all columns are either - # numeric, timestamp, or timedelta - if not axis and not all(check_dtype(t) for t in self.dtypes): - raise TypeError("can't multiply sequence by non-int of type 'float'") - # If over rows, then make sure that all dtypes are equal for not - # numeric_only - elif axis: - for i in range(1, len(self.dtypes)): - pre_dtype = self.dtypes[i - 1] - curr_dtype = self.dtypes[i] - if not is_dtype_equal(pre_dtype, curr_dtype): - raise TypeError( - "Cannot compare type '{0}' with type '{1}'".format( - pre_dtype, curr_dtype - ) - ) - else: - # Normally pandas returns this near the end of the quantile, but we - # can't afford the overhead of running the entire operation before - # we error. - if not any(is_numeric_dtype(t) for t in self.dtypes): - raise ValueError("need at least one array to concatenate") - - # check that all qs are between 0 and 1 - pandas.DataFrame()._check_percentile(q) - axis = pandas.DataFrame()._get_axis_number(axis) - - if isinstance(q, (pandas.Series, np.ndarray, pandas.Index, list)): - return DataFrame( - query_compiler=self._query_compiler.quantile_for_list_of_values( - q=q, - axis=axis, - numeric_only=numeric_only, - interpolation=interpolation, - ) - ) - else: - return self._query_compiler.quantile_for_single_value( - q=q, axis=axis, numeric_only=numeric_only, interpolation=interpolation - ) + product = prod + radd = add def query(self, expr, inplace=False, **kwargs): """Queries the Dataframe with a boolean expression @@ -2881,180 +1238,7 @@ def query(self, expr, inplace=False, **kwargs): self._validate_eval_query(expr, **kwargs) inplace = validate_bool_kwarg(inplace, "inplace") new_query_compiler = self._query_compiler.query(expr, **kwargs) - return 
self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def radd(self, other, axis="columns", level=None, fill_value=None): - return self.add(other, axis, level, fill_value) - - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - Equal values are assigned a rank that is the [method] of - the ranks of those values. - - Args: - axis (int): 0 or 'index' for row-wise, - 1 or 'columns' for column-wise - method: {'average', 'min', 'max', 'first', 'dense'} - Specifies which method to use for equal vals - numeric_only (boolean) - Include only float, int, boolean data. - na_option: {'keep', 'top', 'bottom'} - Specifies how to handle NA options - ascending (boolean): - Decedes ranking order - pct (boolean): - Computes percentage ranking of data - Returns: - A new DataFrame - """ - axis = pandas.DataFrame()._get_axis_number(axis) - return DataFrame( - query_compiler=self._query_compiler.rank( - axis=axis, - method=method, - numeric_only=numeric_only, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - - def rdiv(self, other, axis="columns", level=None, fill_value=None): - """Div this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the div against this. - axis: The axis to div over. - level: The Multilevel index level to apply div over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the rdiv applied. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.rdiv, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.rdiv( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def reindex( - self, - labels=None, - index=None, - columns=None, - axis=None, - method=None, - copy=True, - level=None, - fill_value=np.nan, - limit=None, - tolerance=None, - ): - if ( - level is not None - or ( - isinstance(self.columns, pandas.MultiIndex) - and (columns is not None or axis == 1) - ) - or ( - isinstance(self.index, pandas.MultiIndex) - and (index is not None or axis == 0) - ) - ): - return self._default_to_pandas( - pandas.DataFrame.reindex, - labels=labels, - index=index, - columns=columns, - axis=axis, - method=method, - copy=copy, - level=level, - fill_value=fill_value, - limit=limit, - tolerance=tolerance, - ) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if axis == 0 and labels is not None: - index = labels - elif labels is not None: - columns = labels - if index is not None: - new_query_compiler = self._query_compiler.reindex( - 0, - index, - method=method, - fill_value=fill_value, - limit=limit, - tolerance=tolerance, - ) - else: - new_query_compiler = self._query_compiler - if columns is not None: - final_query_compiler = new_query_compiler.reindex( - 1, - columns, - method=method, - fill_value=fill_value, - limit=limit, - tolerance=tolerance, - ) - else: - final_query_compiler = new_query_compiler - return self._create_dataframe_from_compiler(final_query_compiler, not copy) - - def reindex_axis( - self, - labels, - axis=0, - method=None, - level=None, - 
copy=True, - limit=None, - fill_value=np.nan, - ): - return self._default_to_pandas( - pandas.DataFrame.reindex_axis, - labels, - axis=axis, - method=method, - level=level, - copy=copy, - limit=limit, - fill_value=fill_value, - ) - - def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.reindex_like, - other, - method=method, - copy=copy, - limit=limit, - tolerance=tolerance, - ) + return self._create_or_update_from_compiler(new_query_compiler, inplace) def rename( self, @@ -3086,496 +1270,89 @@ def rename( kwargs = {k: v for k, v in args.items() if v is not None and k != "self"} # inplace should always be true because this is just a copy, and we will use the # results after. - kwargs["inplace"] = True - df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns) - df_to_rename.rename(**kwargs) - - if inplace: - obj = self - else: - obj = self.copy() - obj.index = df_to_rename.index - obj.columns = df_to_rename.columns - - if not inplace: - return obj - - def rename_axis( - self, mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False - ): - kwargs = { - "index": index, - "columns": columns, - "axis": axis, - "copy": copy, - "inplace": inplace, - } - axes, kwargs = pandas.DataFrame()._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel - ) - if axis is not None: - axis = pandas.DataFrame()._get_axis_number(axis) - else: - axis = 0 - inplace = validate_bool_kwarg(inplace, "inplace") - - if mapper is not None: - # Use v0.23 behavior if a scalar or list - non_mapper = is_scalar(mapper) or ( - is_list_like(mapper) and not is_dict_like(mapper) - ) - if non_mapper: - return self._set_axis_name(mapper, axis=axis, inplace=inplace) - else: - # Deprecated (v0.21) behavior is if mapper is specified, - # and not a list or scalar, then call rename - msg = ( - "Using 
'rename_axis' to alter labels is deprecated. " - "Use '.rename' instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - axis = pandas.DataFrame()._get_axis_name(axis) - d = {"copy": copy, "inplace": inplace, axis: mapper} - return self.rename(**d) - else: - # Use new behavior. Means that index and/or columns is specified - result = self if inplace else self.copy(deep=copy) - - for axis in axes: - if axes[axis] is None: - continue - v = axes[axis] - axis = pandas.DataFrame()._get_axis_number(axis) - non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) - if non_mapper: - newnames = v - else: - f = _get_rename_function(v) - curnames = self.index.names if axis == 0 else self.columns.names - newnames = [f(name) for name in curnames] - result._set_axis_name(newnames, axis=axis, inplace=True) - if not inplace: - return result - - def _set_axis_name(self, name, axis=0, inplace=False): - """Alter the name or names of the axis. - - Args: - name: Name for the Index, or list of names for the MultiIndex - axis: 0 or 'index' for the index; 1 or 'columns' for the columns - inplace: Whether to modify `self` directly or return a copy - - Returns: - Type of caller or None if inplace=True. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - renamed = self if inplace else self.copy() - if axis == 0: - renamed.index = renamed.index.set_names(name) - else: - renamed.columns = renamed.columns.set_names(name) - if not inplace: - return renamed - - def reorder_levels(self, order, axis=0): - return self._default_to_pandas( - pandas.DataFrame.reorder_levels, order, axis=axis - ) - - def replace( - self, - to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method="pad", - ): - return self._default_to_pandas( - pandas.DataFrame.replace, - to_replace=to_replace, - value=value, - inplace=inplace, - limit=limit, - regex=regex, - method=method, - ) - - def resample( - self, - rule, - how=None, - axis=0, - fill_method=None, - closed=None, - label=None, - convention="start", - kind=None, - loffset=None, - limit=None, - base=0, - on=None, - level=None, - ): - return self._default_to_pandas( - pandas.DataFrame.resample, - rule, - how=how, - axis=axis, - fill_method=fill_method, - closed=closed, - label=label, - convention=convention, - kind=kind, - loffset=loffset, - limit=limit, - base=base, - on=on, - level=level, - ) - - def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" - ): - """Reset this index to default and create column from current index. - - Args: - level: Only remove the given levels from the index. Removes all - levels by default - drop: Do not try to insert index into DataFrame columns. This - resets the index to the default integer index. - inplace: Modify the DataFrame in place (do not create a new object) - col_level : If the columns have multiple levels, determines which - level the labels are inserted into. By default it is inserted - into the first level. - col_fill: If the columns have multiple levels, determines how the - other levels are named. If None then the index name is - repeated. 
- - Returns: - A new DataFrame if inplace is False, None otherwise. - """ - inplace = validate_bool_kwarg(inplace, "inplace") - # TODO Implement level - if level is not None: - new_query_compiler = self._default_to_pandas( - pandas.DataFrame.reset_index, - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - # Error checking for matching Pandas. Pandas does not allow you to - # insert a dropped index into a DataFrame if these columns already - # exist. - elif ( - not drop - and not isinstance(self.index, pandas.MultiIndex) - and all(n in self.columns for n in ["level_0", "index"]) - ): - raise ValueError("cannot insert level_0, already exists") + kwargs["inplace"] = False + if index is not None: + new_index = pandas.DataFrame(index=self.index).rename(**kwargs).index else: - new_query_compiler = self._query_compiler.reset_index( - drop=drop, level=level - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - - def rfloordiv(self, other, axis="columns", level=None, fill_value=None): - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.rfloordiv, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.rfloordiv( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def rmod(self, other, axis="columns", level=None, fill_value=None): - """Mod this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the div against this. - axis: The axis to div over. - level: The Multilevel index level to apply div over. - fill_value: The value to fill NaNs with. 
- - Returns: - A new DataFrame with the rdiv applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.rmod, - other, - axis=axis, - level=level, - fill_value=fill_value, + new_index = self.index + if columns is not None: + new_columns = ( + pandas.DataFrame(columns=self.columns).rename(**kwargs).columns ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.rmod( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) + else: + new_columns = self.columns - def rmul(self, other, axis="columns", level=None, fill_value=None): - return self.mul(other, axis, level, fill_value) + if inplace: + obj = self + else: + obj = self.copy() + obj.index = new_index + obj.columns = new_columns - def rolling( - self, - window, - min_periods=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None, - ): - return self._default_to_pandas( - pandas.DataFrame.rolling, - window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - ) + if not inplace: + return obj - def round(self, decimals=0, *args, **kwargs): - """Round each element in the DataFrame. + def _set_axis_name(self, name, axis=0, inplace=False): + """Alter the name or names of the axis. Args: - decimals: The number of decimals to round to. + name: Name for the Index, or list of names for the MultiIndex + axis: 0 or 'index' for the index; 1 or 'columns' for the columns + inplace: Whether to modify `self` directly or return a copy Returns: - A new DataFrame. + Type of caller or None if inplace=True. 
""" - return DataFrame( - query_compiler=self._query_compiler.round(decimals=decimals, **kwargs) - ) + axis = self._get_axis_number(axis) if axis is not None else 0 + renamed = self if inplace else self.copy() + if axis == 0: + renamed.index = renamed.index.set_names(name) + else: + renamed.columns = renamed.columns.set_names(name) + if not inplace: + return renamed - def rpow(self, other, axis="columns", level=None, fill_value=None): - """Pow this DataFrame against another DataFrame/Series/scalar. + def reorder_levels(self, order, axis=0): + return self._default_to_pandas( + pandas.DataFrame.reorder_levels, order, axis=axis + ) - Args: - other: The object to use to apply the pow against this. - axis: The axis to pow over. - level: The Multilevel index level to apply pow over. - fill_value: The value to fill NaNs with. + def rfloordiv(self, other, axis="columns", level=None, fill_value=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).rfloordiv( + other, axis=axis, level=level, fill_value=None + ) - Returns: - A new DataFrame with the Pow applied. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.rpow, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - # Check to make sure integers are not raised to negative integer powers - if ( - is_integer_dtype(type(other)) - and other < 0 - and all(is_integer_dtype(t) for t in self.dtypes) - ): - raise ValueError("Integers to negative integer powers are not allowed.") - new_query_compiler = self._query_compiler.rpow( - other=other, axis=axis, level=level, fill_value=fill_value + def rmod(self, other, axis="columns", level=None, fill_value=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).rmod( + other, axis=axis, level=level, fill_value=None ) - return self._create_dataframe_from_compiler(new_query_compiler) + def rpow(self, other, axis="columns", level=None, fill_value=None): + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).rpow( + other, axis=axis, level=level, fill_value=None + ) def rsub(self, other, axis="columns", level=None, fill_value=None): - """Subtract a DataFrame/Series/scalar from this DataFrame. - - Args: - other: The object to use to apply the subtraction to this. - axis: The axis to apply the subtraction over. - level: Mutlilevel index level to subtract over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the subtraciont applied. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.rsub, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_or_time_only=True) - new_query_compiler = self._query_compiler.rsub( - other=other, axis=axis, level=level, fill_value=fill_value + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).rsub( + other, axis=axis, level=level, fill_value=None ) - return self._create_dataframe_from_compiler(new_query_compiler) def rtruediv(self, other, axis="columns", level=None, fill_value=None): - return self.rdiv(other, axis, level, fill_value) - - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - ): - """Returns a random sample of items from an axis of object. - - Args: - n: Number of items from axis to return. Cannot be used with frac. - Default = 1 if frac = None. - frac: Fraction of axis items to return. Cannot be used with n. - replace: Sample with or without replacement. Default = False. - weights: Default 'None' results in equal probability weighting. - If passed a Series, will align with target object on index. - Index values in weights not found in sampled object will be - ignored and index values in sampled object not in weights will - be assigned weights of zero. If called on a DataFrame, will - accept the name of a column when axis = 0. Unless weights are - a Series, weights must be same length as axis being sampled. - If weights do not sum to 1, they will be normalized to sum - to 1. Missing values in the weights column will be treated as - zero. inf and -inf values not allowed. - random_state: Seed for the random number generator (if int), or - numpy RandomState object. - axis: Axis to sample. 
Accepts axis number or name. - - Returns: - A new Dataframe - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if axis: - axis_labels = self.columns - axis_length = len(axis_labels) - else: - # Getting rows requires indices instead of labels. RangeIndex provides this. - axis_labels = pandas.RangeIndex(len(self.index)) - axis_length = len(axis_labels) - if weights is not None: - # Index of the weights Series should correspond to the index of the - # Dataframe in order to sample - if isinstance(weights, pandas.Series): - weights = weights.reindex(self.axes[axis]) - # If weights arg is a string, the weights used for sampling will - # the be values in the column corresponding to that string - if isinstance(weights, string_types): - if axis == 0: - try: - weights = self[weights] - except KeyError: - raise KeyError("String passed to weights not a valid column") - else: - raise ValueError( - "Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame" - ) - weights = pandas.Series(weights, dtype="float64") - - if len(weights) != axis_length: - raise ValueError( - "Weights and axis to be sampled must be of same length" - ) - if (weights == np.inf).any() or (weights == -np.inf).any(): - raise ValueError("weight vector may not include `inf` values") - if (weights < 0).any(): - raise ValueError("weight vector many not include negative values") - # weights cannot be NaN when sampling, so we must set all nan - # values to 0 - weights = weights.fillna(0) - # If passed in weights are not equal to 1, renormalize them - # otherwise numpy sampling function will error - weights_sum = weights.sum() - if weights_sum != 1: - if weights_sum != 0: - weights = weights / weights_sum - else: - raise ValueError("Invalid weights: weights sum to zero") - weights = weights.values - - if n is None and frac is None: - # default to n = 1 if n and frac are both None (in accordance with - # Pandas specification) - n = 1 - elif n is not 
None and frac is None and n % 1 != 0: - # n must be an integer - raise ValueError("Only integers accepted as `n` values") - elif n is None and frac is not None: - # compute the number of samples based on frac - n = int(round(frac * axis_length)) - elif n is not None and frac is not None: - # Pandas specification does not allow both n and frac to be passed - # in - raise ValueError("Please enter a value for `frac` OR `n`, not both") - if n < 0: - raise ValueError( - "A negative number of rows requested. Please provide positive value." - ) - if n == 0: - # An Empty DataFrame is returned if the number of samples is 0. - # The Empty Dataframe should have either columns or index specified - # depending on which axis is passed in. - return DataFrame( - columns=[] if axis == 1 else self.columns, - index=self.index if axis == 1 else [], - ) - if random_state is not None: - # Get a random number generator depending on the type of - # random_state that is passed in - if isinstance(random_state, int): - random_num_gen = np.random.RandomState(random_state) - elif isinstance(random_state, np.random.randomState): - random_num_gen = random_state - else: - # random_state must be an int or a numpy RandomState object - raise ValueError( - "Please enter an `int` OR a " - "np.random.RandomState for random_state" - ) - # choose random numbers and then get corresponding labels from - # chosen axis - sample_indices = random_num_gen.choice( - np.arange(0, axis_length), size=n, replace=replace, p=weights - ) - samples = axis_labels[sample_indices] - else: - # randomly select labels from chosen axis - samples = np.random.choice( - a=axis_labels, size=n, replace=replace, p=weights - ) - if axis: - query_compiler = self._query_compiler.getitem_column_array(samples) - return DataFrame(query_compiler=query_compiler) - else: - query_compiler = self._query_compiler.getitem_row_array(samples) - return DataFrame(query_compiler=query_compiler) + if isinstance(other, Series): + other = 
other._to_pandas() + return super(DataFrame, self).rtruediv( + other, axis=axis, level=level, fill_value=None + ) - def select(self, crit, axis=0): - return self._default_to_pandas(pandas.DataFrame.select, crit, axis=axis) + rdiv = rtruediv def select_dtypes(self, include=None, exclude=None): # Validates arguments for whether both include and exclude are None or @@ -3613,56 +1390,6 @@ def is_dtype_instance_mapper(column, dtype): ] return self.drop(columns=self.columns[indicate], inplace=False) - def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._default_to_pandas( - pandas.DataFrame.sem, - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs - ) - - def set_axis(self, labels, axis=0, inplace=None): - """Assign desired index to given axis. - - Args: - labels (pandas.Index or list-like): The Index to assign. - axis (string or int): The axis to reassign. - inplace (bool): Whether to make these modifications inplace. - - Returns: - If inplace is False, returns a new DataFrame, otherwise None. - """ - if is_scalar(labels): - warnings.warn( - 'set_axis now takes "labels" as first argument, and ' - '"axis" as named parameter. 
The old form, with "axis" as ' - 'first parameter and "labels" as second, is still supported ' - "but will be deprecated in a future version of pandas.", - FutureWarning, - stacklevel=2, - ) - labels, axis = axis, labels - if inplace is None: - warnings.warn( - "set_axis currently defaults to operating inplace.\nThis " - "will change in a future version of pandas, use " - "inplace=True to avoid this warning.", - FutureWarning, - stacklevel=2, - ) - inplace = True - if inplace: - setattr(self, pandas.DataFrame()._get_axis_name(axis), labels) - else: - obj = self.copy() - obj.set_axis(labels, axis=axis, inplace=True) - return obj - def set_index( self, keys, drop=True, append=False, inplace=False, verify_integrity=False ): @@ -3717,7 +1444,7 @@ def set_index( level = col names.append(None) else: - level = frame[col]._values + level = frame[col]._to_pandas()._values names.append(col) if drop: to_remove.append(col) @@ -3737,170 +1464,14 @@ def set_index( if not inplace: return frame - def set_value(self, index, col, value, takeable=False): - return self._default_to_pandas( - pandas.DataFrame.set_value, index, col, value, takeable=takeable - ) - - def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return self._default_to_pandas( - pandas.DataFrame.shift, - periods=periods, - freq=freq, - axis=axis, - fill_value=fill_value, - ) - - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - """Return unbiased skew over requested axis Normalized by N-1 - - Args: - axis : {index (0), columns (1)} - skipna : boolean, default True - Exclude NA/null values when computing the result. 
- level : int or level name, default None - numeric_only : boolean, default None - - Returns: - skew : Series or DataFrame (if level specified) - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - - return self._query_compiler.skew( - axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs - ) - - def slice_shift(self, periods=1, axis=0): - return self._default_to_pandas( - pandas.DataFrame.slice_shift, periods=periods, axis=axis - ) - - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, - by=None, - ): - """Sort a DataFrame by one of the indices (columns or index). - - Args: - axis: The axis to sort over. - level: The MultiIndex level to sort over. - ascending: Ascending or descending - inplace: Whether or not to update this DataFrame inplace. - kind: How to perform the sort. - na_position: Where to position NA on the sort. - sort_remaining: On Multilevel Index sort based on all levels. - by: (Deprecated) argument to pass to sort_values. 
- - Returns: - A sorted DataFrame - """ - axis = pandas.DataFrame()._get_axis_number(axis) - if level is not None: - new_query_compiler = self._default_to_pandas( - pandas.DataFrame.sort_index, - axis=axis, - level=level, - ascending=ascending, - inplace=False, - kind=kind, - na_position=na_position, - sort_remaining=sort_remaining, - ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - if by is not None: - warnings.warn( - "by argument to sort_index is deprecated, " - "please use .sort_values(by=...)", - FutureWarning, - stacklevel=2, - ) - if level is not None: - raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) - new_query_compiler = self._query_compiler.sort_index( - axis=axis, ascending=ascending, kind=kind, na_position=na_position - ) - if inplace: - self._update_inplace(new_query_compiler=new_query_compiler) - else: - return DataFrame(query_compiler=new_query_compiler) - - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ): - """Sorts by a column/row or list of columns/rows. - - Args: - by: A list of labels for the axis to sort over. - axis: The axis to sort. - ascending: Sort in ascending or descending order. - inplace: If true, do the operation inplace. - kind: How to sort. - na_position: Where to put np.nan values. - - Returns: - A sorted DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) - if not is_list_like(by): - by = [by] - # Currently, sort_values will just reindex based on the sorted values. 
- # TODO create a more efficient way to sort - if axis == 0: - broadcast_value_dict = {col: self[col] for col in by} - broadcast_values = pandas.DataFrame(broadcast_value_dict, index=self.index) - new_index = broadcast_values.sort_values( - by=by, - axis=axis, - ascending=ascending, - kind=kind, - na_position=na_position, - ).index - return self.reindex(index=new_index, copy=not inplace) - else: - broadcast_value_list = [ - to_pandas(self[row :: len(self.index)]) for row in by - ] - index_builder = list(zip(broadcast_value_list, by)) - broadcast_values = pandas.concat( - [row for row, idx in index_builder], copy=False - ) - broadcast_values.columns = self.columns - new_columns = broadcast_values.sort_values( - by=by, - axis=axis, - ascending=ascending, - kind=kind, - na_position=na_position, - ).columns - return self.reindex(columns=new_columns, copy=not inplace) - def squeeze(self, axis=None): - # Checks for 1x1 DF, passes into squeeze with approproate ndim - if ( - self._query_compiler.data.shape[0] == 1 - and self._query_compiler.data.shape[1] == 1 - ): - return self._query_compiler.squeeze(0, axis) - # Checks for 1xN or Nx1 DF, passes into squeeze with appropriate ndim - elif 1 in self._query_compiler.data.shape: - return self._query_compiler.squeeze(1, axis) - # NxN DF, don't need to pass into squeeze + axis = self._get_axis_number(axis) if axis is not None else None + if axis is None and (len(self.columns) == 1 or len(self.index) == 1): + return Series(query_compiler=self._query_compiler).squeeze() + if axis == 1 and len(self.columns) == 1: + return Series(query_compiler=self._query_compiler) + if axis == 0 and len(self.index) == 1: + return Series(query_compiler=self._query_compiler) else: return self.copy() @@ -3909,204 +1480,37 @@ def stack(self, level=-1, dropna=True): pandas.DataFrame.stack, level=level, dropna=dropna ) - def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes standard deviation across 
the DataFrame. - - Args: - axis (int): The axis to take the std on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The std of the DataFrame (Pandas Series) - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - - return self._query_compiler.std( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs - ) - def sub(self, other, axis="columns", level=None, fill_value=None): - """Subtract a DataFrame/Series/scalar from this DataFrame. - - Args: - other: The object to use to apply the subtraction to this. - axis: The axis to apply the subtraction over. - level: Mutlilevel index level to subtract over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the subtraciont applied. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.sub, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_or_time_only=True) - new_query_compiler = self._query_compiler.sub( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def subtract(self, other, axis="columns", level=None, fill_value=None): - """Alias for sub. - - Args: - other: The object to use to apply the subtraction to this. - axis: THe axis to apply the subtraction over. - level: Mutlilevel index level to subtract over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the subtraciont applied. 
- """ - return self.sub(other, axis, level, fill_value) - - def swapaxes(self, axis1, axis2, copy=True): - return self._default_to_pandas( - pandas.DataFrame.swapaxes, axis1, axis2, copy=copy - ) - - def swaplevel(self, i=-2, j=-1, axis=0): - return self._default_to_pandas(pandas.DataFrame.swaplevel, i=i, j=j, axis=axis) - - def tail(self, n=5): - """Get the last n rows of the DataFrame. - - Args: - n (int): The number of rows to return. - - Returns: - A new DataFrame with the last n rows of this DataFrame. - """ - if n >= len(self.index): - return self.copy() - return DataFrame(query_compiler=self._query_compiler.tail(n)) - - def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): - return self._default_to_pandas( - pandas.DataFrame.take, - indices, - axis=axis, - convert=convert, - is_copy=is_copy, - **kwargs - ) - - def to_clipboard(self, excel=True, sep=None, **kwargs): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_clipboard, excel=excel, sep=sep, **kwargs + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).sub( + other, axis=axis, level=level, fill_value=None ) - def to_csv( - self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, - quotechar='"', - line_terminator=None, - chunksize=None, - tupleize_cols=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - ): # pragma: no cover - - kwargs = { - "path_or_buf": path_or_buf, - "sep": sep, - "na_rep": na_rep, - "float_format": float_format, - "columns": columns, - "header": header, - "index": index, - "index_label": index_label, - "mode": mode, - "encoding": encoding, - "compression": compression, - "quoting": quoting, - "quotechar": quotechar, - "line_terminator": line_terminator, - "chunksize": chunksize, - "tupleize_cols": tupleize_cols, - 
"date_format": date_format, - "doublequote": doublequote, - "escapechar": escapechar, - "decimal": decimal, - } - return self._default_to_pandas(pandas.DataFrame.to_csv, **kwargs) - - def to_dense(self): # pragma: no cover - return self._default_to_pandas(pandas.DataFrame.to_dense) - - def to_dict(self, orient="dict", into=dict): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_dict, orient=orient, into=into - ) + subtract = sub - def to_excel( + def sum( self, - excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep="inf", - verbose=True, - freeze_panes=None, - ): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_excel, - excel_writer, - sheet_name, - na_rep, - float_format, - columns, - header, - index, - index_label, - startrow, - startcol, - engine, - merge_cells, - encoding, - inf_rep, - verbose, - freeze_panes, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + axis = self._get_axis_number(axis) + new_index = self.columns if axis else self.index + if min_count > len(new_index): + return Series( + [np.nan] * len(new_index), index=new_index, dtype=np.dtype("object") + ) + return super(DataFrame, self).sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs ) def to_feather(self, fname): # pragma: no cover @@ -4143,11 +1547,6 @@ def to_gbq( private_key=private_key, ) - def to_hdf(self, path_or_buf, key, format="table", **kwargs): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_hdf, path_or_buf, key, format=format, **kwargs - ) - def to_html( self, buf=None, @@ -4199,88 +1598,6 @@ def to_html( render_links=render_links, ) - def to_json( - self, - path_or_buf=None, - orient=None, - date_format=None, - 
double_precision=10, - force_ascii=True, - date_unit="ms", - default_handler=None, - lines=False, - compression="infer", - index=True, - ): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_json, - path_or_buf, - orient=orient, - date_format=date_format, - double_precision=double_precision, - force_ascii=force_ascii, - date_unit=date_unit, - default_handler=default_handler, - lines=lines, - compression=compression, - index=index, - ) - - def to_latex( - self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep="NaN", - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - bold_rows=False, - column_format=None, - longtable=None, - escape=None, - encoding=None, - decimal=".", - multicolumn=None, - multicolumn_format=None, - multirow=None, - ): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_latex, - buf=buf, - columns=columns, - col_space=col_space, - header=header, - index=index, - na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, - index_names=index_names, - bold_rows=bold_rows, - column_format=column_format, - longtable=longtable, - escape=escape, - encoding=encoding, - decimal=decimal, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - ) - - def to_msgpack( - self, path_or_buf=None, encoding="utf-8", **kwargs - ): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_msgpack, - path_or_buf=path_or_buf, - encoding=encoding, - **kwargs - ) - def to_panel(self): # pragma: no cover return self._default_to_pandas(pandas.DataFrame.to_panel) @@ -4304,16 +1621,7 @@ def to_parquet( ) def to_period(self, freq=None, axis=0, copy=True): # pragma: no cover - return self._default_to_pandas( - pandas.DataFrame.to_period, freq=freq, axis=axis, copy=copy - ) - - def to_pickle( - self, path, compression="infer", protocol=pkl.HIGHEST_PROTOCOL - ): # pragma: no cover - return 
self._default_to_pandas( - pandas.DataFrame.to_pickle, path, compression=compression, protocol=protocol - ) + return super(DataFrame, self).to_period(freq=freq, axis=axis, copy=copy) def to_records( self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None @@ -4326,47 +1634,6 @@ def to_records( index_dtypes=index_dtypes, ) - def to_sparse(self, fill_value=None, kind="block"): - return self._default_to_pandas( - pandas.DataFrame.to_sparse, fill_value=fill_value, kind=kind - ) - - def to_sql( - self, - name, - con, - schema=None, - if_exists="fail", - index=True, - index_label=None, - chunksize=None, - dtype=None, - method=None, - ): - new_query_compiler = self._query_compiler - # writing the index to the database by inserting it to the DF - if index: - if not index_label: - index_label = "index" - new_query_compiler = new_query_compiler.insert(0, index_label, self.index) - # so pandas._to_sql will not write the index to the database as well - index = False - - from modin.data_management.factories import BaseFactory - - BaseFactory.to_sql( - new_query_compiler, - name=name, - con=con, - schema=schema, - if_exists=if_exists, - index=index, - index_label=index_label, - chunksize=chunksize, - dtype=dtype, - method=method, - ) - def to_stata( self, fname, @@ -4394,121 +1661,19 @@ def to_stata( convert_strl=convert_strl, ) - def to_string( - self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep="NaN", - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - justify=None, - max_rows=None, - max_cols=None, - show_dimensions=False, - decimal=".", - line_width=None, - ): - return self._default_to_pandas( - pandas.DataFrame.to_string, - buf=buf, - columns=columns, - col_space=col_space, - header=header, - index=index, - na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, - index_names=index_names, - justify=justify, - max_rows=max_rows, - max_cols=max_cols, - 
show_dimensions=show_dimensions, - decimal=decimal, - line_width=line_width, - ) - def to_timestamp(self, freq=None, how="start", axis=0, copy=True): - return self._default_to_pandas( - pandas.DataFrame.to_timestamp, freq=freq, how=how, axis=axis, copy=copy + return super(DataFrame, self).to_timestamp( + freq=freq, how=how, axis=axis, copy=copy ) - def to_xarray(self): - return self._default_to_pandas(pandas.DataFrame.to_xarray) - - def transform(self, func, axis=0, *args, **kwargs): - kwargs["is_transform"] = True - result = self.agg(func, axis=axis, *args, **kwargs) - if len(result) != len(self): - raise ValueError("transforms cannot produce aggregated results") - return result - def truediv(self, other, axis="columns", level=None, fill_value=None): - """Divides this DataFrame against another DataFrame/Series/scalar. - - Args: - other: The object to use to apply the divide against this. - axis: The axis to divide over. - level: The Multilevel index level to apply divide over. - fill_value: The value to fill NaNs with. - - Returns: - A new DataFrame with the Divide applied. 
- """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - if level is not None: - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.truediv, - other, - axis=axis, - level=level, - fill_value=fill_value, - ) - other = self._validate_other(other, axis, numeric_only=True) - new_query_compiler = self._query_compiler.truediv( - other=other, axis=axis, level=level, fill_value=fill_value - ) - return self._create_dataframe_from_compiler(new_query_compiler) - - def truncate(self, before=None, after=None, axis=None, copy=True): - return self._default_to_pandas( - pandas.DataFrame.truncate, before=before, after=after, axis=axis, copy=copy - ) - - def tshift(self, periods=1, freq=None, axis=0): - return self._default_to_pandas( - pandas.DataFrame.tshift, periods=periods, freq=freq, axis=axis - ) - - def tz_convert(self, tz, axis=0, level=None, copy=True): - return self._default_to_pandas( - pandas.DataFrame.tz_convert, tz, axis=axis, level=level, copy=copy - ) - - def tz_localize( - self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" - ): - return self._default_to_pandas( - pandas.DataFrame.tz_localize, - tz, - axis=axis, - level=level, - copy=copy, - ambiguous=ambiguous, - nonexistent=nonexistent, + if isinstance(other, Series): + other = other._to_pandas() + return super(DataFrame, self).truediv( + other, axis=axis, level=level, fill_value=None ) - def unstack(self, level=-1, fill_value=None): - return self._default_to_pandas( - pandas.DataFrame.unstack, level=level, fill_value=fill_value - ) + div = divide = truediv def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" @@ -4546,32 +1711,6 @@ def update( ) self._update_inplace(new_query_compiler=query_compiler) - def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - """Computes variance across the DataFrame. 
- - Args: - axis (int): The axis to take the variance on. - skipna (bool): True to skip NA values, false otherwise. - ddof (int): degrees of freedom - - Returns: - The variance of the DataFrame. - """ - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 - if numeric_only is not None and not numeric_only: - self._validate_dtypes(numeric_only=True) - - return self._query_compiler.var( - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs - ) - def where( self, cond, @@ -4619,8 +1758,8 @@ def where( try_cast=try_cast, raise_on_error=raise_on_error, ) - return self._create_dataframe_from_compiler(new_query_compiler, inplace) - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 + return self._create_or_update_from_compiler(new_query_compiler, inplace) + axis = self._get_axis_number(axis) if axis is not None else 0 cond = cond(self) if callable(cond) else cond if not isinstance(cond, DataFrame): @@ -4639,7 +1778,7 @@ def where( query_compiler = self._query_compiler.where( cond._query_compiler, other, axis=axis, level=level ) - return self._create_dataframe_from_compiler(query_compiler, inplace) + return self._create_or_update_from_compiler(query_compiler, inplace) def xs(self, key, axis=0, level=None, drop_level=True): return self._default_to_pandas( @@ -4668,7 +1807,7 @@ def __getitem__(self, key): indexer = convert_to_index_sliceable(pandas.DataFrame(index=self.index), key) if indexer is not None: return self._getitem_slice(indexer) - if isinstance(key, (pandas.Series, np.ndarray, pandas.Index, list)): + if isinstance(key, (Series, np.ndarray, pandas.Index, list)): return self._getitem_array(key) elif isinstance(key, DataFrame): return self.where(key) @@ -4679,11 +1818,16 @@ def __getitem__(self, key): return self._getitem_column(key) def _getitem_column(self, key): - return SeriesView( - self._query_compiler.getitem_single_key(key), self, (slice(None), key) - ) + if key not 
in self.keys(): + raise KeyError("{}".format(key)) + s = self._reduce_dimension(self._query_compiler.getitem_column_array([key])) + s._parent = self + return s def _getitem_array(self, key): + # TODO: dont convert to pandas for array indexing + if isinstance(key, Series): + key = key._to_pandas() if is_bool_indexer(key): if isinstance(key, pandas.Series) and not key.index.equals(self.index): warnings.warn( @@ -4768,22 +1912,11 @@ def setitem_without_string_columns(df): new_self = DataFrame({key: value}, columns=self.columns) self._update_inplace(new_self._query_compiler) else: - self._update_inplace(self._query_compiler.setitem(key, value)) - - def __len__(self): - """Gets the length of the DataFrame. - - Returns: - Returns an integer length of the DataFrame object. - """ - return len(self.index) + self._update_inplace(self._query_compiler.setitem(0, key, value)) def __unicode__(self): return self._default_to_pandas(pandas.DataFrame.__unicode__) - def __invert__(self): - return self._default_to_pandas(pandas.DataFrame.__invert__) - def __hash__(self): return self._default_to_pandas(pandas.DataFrame.__hash__) @@ -4806,38 +1939,9 @@ def __contains__(self, key): """ return self.columns.__contains__(key) - def __nonzero__(self): - raise ValueError( - "The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format( - self.__class__.__name__ - ) - ) - - __bool__ = __nonzero__ - - def __abs__(self): - """Creates a modified DataFrame by taking the absolute value. 
- - Returns: - A modified DataFrame - """ - return self.abs() - def __round__(self, decimals=0): return self._default_to_pandas(pandas.DataFrame.__round__, decimals=decimals) - def __array__(self, dtype=None): - # TODO: This is very inefficient and needs fix, also see as_matrix - return to_pandas(self).__array__(dtype=dtype) - - def __array_wrap__(self, result, context=None): - # TODO: This is very inefficient, see also __array__ and as_matrix - return to_pandas(self).__array_wrap__(result, context=context) - - def __getstate__(self): - return self._default_to_pandas(pandas.DataFrame.__getstate__) - def __setstate__(self, state): return self._default_to_pandas(pandas.DataFrame.__setstate__, state) @@ -4854,64 +1958,6 @@ def __delitem__(self, key): raise KeyError(key) self._update_inplace(new_query_compiler=self._query_compiler.delitem(key)) - def __finalize__(self, other, method=None, **kwargs): - if isinstance(other, DataFrame): - other = other._query_compiler.to_pandas() - return self._default_to_pandas( - pandas.DataFrame.__finalize__, other, method=method, **kwargs - ) - - def __copy__(self, deep=True): - """Make a copy using modin.DataFrame.copy method - - Args: - deep: Boolean, deep copy or not. - Currently we do not support deep copy. - - Returns: - A Ray DataFrame object. - """ - return self.copy(deep=deep) - - def __deepcopy__(self, memo=None): - """Make a -deep- copy using modin.DataFrame.copy method - This is equivalent to copy(deep=True). - - Args: - memo: No effect. Just to comply with Pandas API. - - Returns: - A Ray DataFrame object. 
- """ - return self.copy(deep=True) - - def __and__(self, other): - return self.__bool__() and other - - def __or__(self, other): - return self.__bool__() or other - - def __xor__(self, other): - return self.__bool__() ^ other - - def __lt__(self, other): - return self.lt(other) - - def __le__(self, other): - return self.le(other) - - def __gt__(self, other): - return self.gt(other) - - def __ge__(self, other): - return self.ge(other) - - def __eq__(self, other): - return self.eq(other) - - def __ne__(self, other): - return self.ne(other) - def __add__(self, other, axis=None, level=None, fill_value=None): return self.add(other, axis=axis, level=level, fill_value=fill_value) @@ -4995,18 +2041,6 @@ def __div__(self, other, axis=None, level=None, fill_value=None): def __rdiv__(self, other, axis=None, level=None, fill_value=None): return self.rdiv(other, axis=axis, level=level, fill_value=fill_value) - def __neg__(self): - """Computes an element wise negative DataFrame - - Returns: - A modified DataFrame where every element is the negation of before - """ - self._validate_dtypes(numeric_only=True) - return DataFrame(query_compiler=self._query_compiler.negative()) - - def __sizeof__(self): # pragma: no cover - return self._default_to_pandas(pandas.DataFrame.__sizeof__) - @property def __doc__(self): # pragma: no cover def __doc__(df): @@ -5015,14 +2049,6 @@ def __doc__(df): return self._default_to_pandas(__doc__) - @property - def blocks(self): - def blocks(df): - """Defined because properties do not have a __name__""" - return df.blocks - - return self._default_to_pandas(blocks) - @property def style(self): def style(df): @@ -5031,53 +2057,7 @@ def style(df): return self._default_to_pandas(style) - @property - def iat(self, axis=None): - from .indexing import _iLocIndexer - - return _iLocIndexer(self) - - @property - def loc(self): - """Purely label-location based indexer for selection by label. 
- - We currently support: single label, list array, slice object - We do not support: boolean array, callable - """ - from .indexing import _LocIndexer - - return _LocIndexer(self) - - @property - def is_copy(self): - def is_copy(df): - """Defined because properties do not have a __name__""" - return df.is_copy - - return self._default_to_pandas(is_copy) - - @property - def at(self, axis=None): - from .indexing import _LocIndexer - - return _LocIndexer(self) - - @property - def ix(self, axis=None): - raise ErrorMessage.not_implemented("ix is not implemented.") - - @property - def iloc(self): - """Purely integer-location based indexing for selection by position. - - We currently support: single label, list array, slice object - We do not support: boolean array, callable - """ - from .indexing import _iLocIndexer - - return _iLocIndexer(self) - - def _create_dataframe_from_compiler(self, new_query_compiler, inplace=False): + def _create_or_update_from_compiler(self, new_query_compiler, inplace=False): """Returns or updates a DataFrame given new query_compiler""" assert ( isinstance(new_query_compiler, type(self._query_compiler)) @@ -5088,82 +2068,6 @@ def _create_dataframe_from_compiler(self, new_query_compiler, inplace=False): else: self._update_inplace(new_query_compiler=new_query_compiler) - def _validate_other( - self, - other, - axis, - numeric_only=False, - numeric_or_time_only=False, - numeric_or_object_only=False, - comparison_dtypes_only=False, - ): - """Helper method to check validity of other in inter-df operations""" - axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 1 - result = other - if isinstance(other, DataFrame): - return other._query_compiler - elif is_list_like(other): - other_dtypes = [type(x) for x in other] - if axis == 0: - if len(other) != len(self.index): - raise ValueError( - "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.index), len(other)) - ) - else: - if len(other) != 
len(self.columns): - raise ValueError( - "Unable to coerce to Series, length must be {0}: " - "given {1}".format(len(self.columns), len(other)) - ) - else: - other_dtypes = [ - type(other) - for _ in range(len(self.index) if axis else len(self.columns)) - ] - - # Do dtype checking - if numeric_only: - if not all( - is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype) - for self_dtype, other_dtype in zip(self.dtypes, other_dtypes) - ): - raise TypeError("Cannot do operation on non-numeric dtypes") - elif numeric_or_object_only: - if not all( - (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) - or (is_object_dtype(self_dtype) and is_object_dtype(other_dtype)) - for self_dtype, other_dtype in zip(self.dtypes, other_dtypes) - ): - raise TypeError("Cannot do operation non-numeric dtypes") - elif comparison_dtypes_only: - if not all( - (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) - or ( - is_datetime_or_timedelta_dtype(self_dtype) - and is_datetime_or_timedelta_dtype(other_dtype) - ) - or is_dtype_equal(self_dtype, other_dtype) - for self_dtype, other_dtype in zip(self.dtypes, other_dtypes) - ): - raise TypeError( - "Cannot do operation non-numeric objects with numeric objects" - ) - elif numeric_or_time_only: - if not all( - (is_numeric_dtype(self_dtype) and is_numeric_dtype(other_dtype)) - or ( - is_datetime_or_timedelta_dtype(self_dtype) - and is_datetime_or_timedelta_dtype(other_dtype) - ) - for self_dtype, other_dtype in zip(self.dtypes, other_dtypes) - ): - raise TypeError( - "Cannot do operation non-numeric objects with numeric objects" - ) - - return result - def _validate_dtypes(self, numeric_only=False): """Helper method to check that all the dtypes are the same""" dtype = self.dtypes[0] @@ -5228,31 +2132,5 @@ def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False): ): raise TypeError("Cannot operate on Numeric and Non-Numeric Types") - def _default_to_pandas(self, op, *args, **kwargs): - 
"""Helper method to use default pandas function""" - ErrorMessage.default_to_pandas("`{}`".format(op.__name__)) - result = op(self._query_compiler.to_pandas(), *args, **kwargs) - # SparseDataFrames cannot be serialize by arrow and cause problems for Modin. - # For now we will use pandas. - if isinstance(result, pandas.DataFrame) and not isinstance( - result, pandas.SparseDataFrame - ): - return DataFrame(result) - else: - try: - if ( - isinstance(result, (list, tuple)) - and len(result) == 2 - and isinstance(result[0], pandas.DataFrame) - ): - # Some operations split the DataFrame into two (e.g. align). We need to wrap - # both of the returned results - if isinstance(result[1], pandas.DataFrame): - second = DataFrame(result[1]) - else: - second = result[1] - return DataFrame(result[0]), second - else: - return result - except TypeError: - return result + def _to_pandas(self): + return self._query_compiler.to_pandas() diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 6b9f7ba8a8a..ea67f260ece 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -5,6 +5,7 @@ import pandas from modin.error_message import ErrorMessage +from .base import BasePandasDataset from .dataframe import DataFrame from .utils import to_pandas @@ -18,7 +19,7 @@ def isna(obj): Returns: bool or array-like of bool """ - if isinstance(obj, DataFrame): + if isinstance(obj, BasePandasDataset): return obj.isna() else: return pandas.isna(obj) @@ -28,7 +29,7 @@ def isna(obj): def notna(obj): - if isinstance(obj, DataFrame): + if isinstance(obj, BasePandasDataset): return obj.notna() else: return pandas.notna(obj) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 3c45c9eef12..b23de93c1d8 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -11,7 +11,8 @@ from warnings import warn from .dataframe import DataFrame -from .series import SeriesView +from .base import BasePandasDataset +from .series import Series """Indexing Helper Class 
works as follows: @@ -144,7 +145,7 @@ class _LocationIndexerBase(object): """Base class for location indexer like loc and iloc """ - def __init__(self, ray_df: DataFrame): + def __init__(self, ray_df: BasePandasDataset): self.df = ray_df self.qc = ray_df._query_compiler self.row_scaler = False @@ -154,18 +155,23 @@ def __getitem__( self, row_lookup: pandas.Index, col_lookup: pandas.Index, ndim: int ): qc_view = self.qc.view(row_lookup, col_lookup) - if ndim == 2: - return DataFrame(query_compiler=qc_view) + return self.df.__constructor__(query_compiler=qc_view) + if isinstance(self.df, Series) and not self.row_scaler: + return self.df.__constructor__(query_compiler=qc_view) + if isinstance(self.df, Series): + axis = 0 elif ndim == 0: - return qc_view.squeeze(ndim=0) + axis = None else: - single_axis = 1 if self.col_scaler else 0 - return SeriesView( - qc_view.squeeze(ndim=1, axis=single_axis), - self.df, - (row_lookup, col_lookup), + axis = ( + None + if self.col_scaler and self.row_scaler + else 1 + if self.col_scaler + else 0 ) + return self.df.__constructor__(query_compiler=qc_view).squeeze(axis=axis) def __setitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, item): """ @@ -235,7 +241,7 @@ def __getitem__(self, key): # Pandas drops the levels that are in the `loc`, so we have to as well. 
if hasattr(result, "index") and isinstance(result.index, pandas.MultiIndex): if ( - isinstance(result, pandas.Series) + isinstance(result, Series) and not isinstance(col_loc, slice) and all( col_loc[i] in result.index.levels[i] for i in range(len(col_loc)) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 7cdf27bf3ee..ca74b778cfa 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -2,584 +2,410 @@ from __future__ import division from __future__ import print_function -import pandas -import inspect import numpy as np +import pandas +from pandas.core.dtypes.common import is_dict_like, is_list_like, is_scalar +import sys - -# from .utils import _inherit_docstrings - - -def na_op(): - """Pandas uses a similar function to handle na values. - """ - raise NotImplementedError("Not Yet implemented.") - - -class SeriesView(object): - """A wrapper class for pandas Series. - - Note: The main use of this class is to help us implement inplace operations that - propagate their changes back to the DataFrame that a Series belongs to. We are - only need to use this object when `__getitem__` returns a pandas Series, or when - `loc`/`iloc` return a Series as well. - - Important: This is not needed to replace every Series in Modin. For example, when an - operation on a Series returns a new Series, it does not need to return an object - of this class. It can return a Series because the new object does not have a - DataFrame that it is associated with. 
- - """ - - def __init__(self, series, parent_df, loc): - assert type(series) is pandas.Series - from .dataframe import DataFrame - - assert type(parent_df) is DataFrame - assert type(loc) is tuple - self.series = series - self.parent_df = parent_df - self._loc = loc - - def _get_index(self): - return self.series.index - - def _set_index(self, index): - self.series.index = index - - index = property(_get_index, _set_index) - - def __repr__(self): - return repr(self.series) - - def __str__(self): - return str(self.series) - - def __dir__(self): - return self.series.__dir__() - - def __comparisons__(self, func): - def compare_func(other): - if hasattr(other, "series"): - other = other.series - return getattr(self.series, func)(other) - - return compare_func - - def __eq__(self, other): - return self.__comparisons__("__eq__")(other) - - def __ge__(self, other): - return self.__comparisons__("__ge__")(other) - - def __gt__(self, other): - return self.__comparisons__("__gt__")(other) - - def __le__(self, other): - return self.__comparisons__("__le__")(other) - - def __lt__(self, other): - return self.__comparisons__("__lt__")(other) - - def __ne__(self, other): - return self.__comparisons__("__ne__")(other) - - def __arithmetic_op__(self, func): - def arithemtic_op(other): - if hasattr(other, "series"): - other = other.series - return getattr(self.series, func)(other) - - return arithemtic_op - - def __add__(self, other): - return self.__arithmetic_op__("__add__")(other) - - def __mul__(self, other): - return self.__arithmetic_op__("__mul__")(other) - - def __sub__(self, other): - return self.__arithmetic_op__("__sub__")(other) - - def __truediv__(self, other): - return self.__arithmetic_op__("__truediv__")(other) - - def __floordiv__(self, other): - return self.__arithmetic_op__("__floordiv__")(other) - - def __mod__(self, other): - return self.__arithmetic_op__("__mod__")(other) - - def __pow__(self, other): - return self.__arithmetic_op__("__pow__")(other) - - def 
__radd__(self, other): - return self.__arithmetic_op__("__radd__")(other) - - def __rmul__(self, other): - return self.__arithmetic_op__("__rmul__")(other) - - def __rsub__(self, other): - return self.__arithmetic_op__("__rsub__")(other) - - def __rtruediv__(self, other): - return self.__arithmetic_op__("__rtruediv__")(other) - - def __rfloordiv__(self, other): - return self.__arithmetic_op__("__rfloordiv__")(other) - - def __rmod__(self, other): - return self.__arithmetic_op__("__rmod__")(other) - - def __rpow__(self, other): - return self.__arithmetic_op__("__rpow__")(other) - - def __iadd__(self, other): - return self.__arithmetic_op__("__iadd__")(other) - - def __imul__(self, other): - return self.__arithmetic_op__("__imul__")(other) - - def __isub__(self, other): - return self.__arithmetic_op__("__isub__")(other) - - def __itruediv__(self, other): - return self.__arithmetic_op__("__itruediv__")(other) - - def __ifloordiv__(self, other): - return self.__arithmetic_op__("__ifloordiv__")(other) - - def __imod__(self, other): - return self.__arithmetic_op__("__imod__")(other) - - def __ipow__(self, other): - return self.__arithmetic_op__("__ipow__")(other) - - def __neg__(self, other): - return self.__arithmetic_op__("__neg__")(other) - - def __abs__(self): - return self.series.abs() - - def __iter__(self): - return self.series.__iter__() - - def __len__(self): - return self.series.__len__() - - def __getitem__(self, item): - return self.series.__getitem__(item) - - def __setitem__(self, key, value): - return_val = self.series.__setitem__(key, value) - self.parent_df.loc[self._loc] = self.series - return return_val - - def __getattribute__(self, item): - default_behaviors = [ - "__init__", - "series", - "parent_df", - "_loc", - "__arithmetic_op__", - "__comparisons__", - "__class__", - "index", - "_get_index", - "_set_index", - ] - if item not in default_behaviors: - method = self.series.__getattribute__(item) - # Certain operations like `at`, `loc`, `iloc`, etc. 
are callable because in - # pandas they are equivalent to classes. They are verified here because they - # cannot be overridden with the functions below. This generally solves the - # problem where the instance property is callable, but the class property is - # not. - # The isclass check is to ensure that we return the correct type. Some of - # the objects that are called result in classes being returned, and we don't - # want to override with our own function. - is_callable = ( - callable(method) - and callable(getattr(type(self.series), item)) - and not inspect.isclass(getattr(type(self.series), item)) - ) - try: - has_inplace_param = is_callable and "inplace" in str( - inspect.signature(method) - ) - # This will occur on Python2 - except AttributeError: - has_inplace_param = is_callable and "inplace" in str( - inspect.getargspec(method) - ) - - if is_callable and has_inplace_param and self.parent_df is not None: - - def inplace_handler(*args, **kwargs): - """Replaces the default behavior of methods with inplace kwarg. - - Note: This method will modify the DataFrame this Series is attached - to when `inplace` is True. Instead of rewriting or overriding - every method that uses `inplace`, we use this handler. - - This handler will first check that the keyword argument passed - for `inplace` is True, if not then it will just return the - result of the operation requested. - - If `inplace` is True, do the operation, keeping track of the - previous length. This is because operations like `dropna` still - propagate back to the DataFrame that holds the Series. - - If the length did not change, we propagate the inplace changes - of the operation back to the original DataFrame with - `__setitem__`. - - If the length changed, we just need to do a `reindex` on the - parent DataFrame. This will propagate the inplace operation - (e.g. `dropna`) back to the parent DataFrame. - - See notes in SeriesView class about when it is okay to return a - pandas Series vs a SeriesView. 
- - Returns: - If `inplace` is True: None, else: A new Series. - """ - if kwargs.get("inplace", False): - prev_len = len(self.series) - self.series.__getattribute__(item)(*args, **kwargs) - if prev_len == len(self.series): - self.parent_df.loc[self._loc] = self.series - else: - self.parent_df.reindex(index=self.series.index, copy=False) - return None - else: - return self.series.__getattribute__(item)(*args, **kwargs) - - # We replace the method with `inplace_handler` for inplace operations - method = inplace_handler - elif is_callable: - - def other_handler(*args, **kwargs): - """Replaces the method's args and kwargs with the Series object. - - Note: This method is needed because sometimes operations like - `df['col0'].equals(df['col1'])` do not return the correct value. - This mostly has occurred in Python2, but overriding of the - method will make the behavior more deterministic for all calls. - - Returns the result of `__getattribute__` from the Series this wraps. - """ - args = tuple( - arg if not isinstance(arg, SeriesView) else arg.series - for arg in args - ) - kwargs = { - kw: arg if not isinstance(arg, SeriesView) else arg.series - for kw, arg in kwargs.items() - } - return self.series.__getattribute__(item)(*args, **kwargs) - - method = other_handler - return method - # We need to do this hack for equality checking. - elif item == "__class__": - return self.series.__class__ - else: - return object.__getattribute__(self, item) +from .base import BasePandasDataset +from .iterator import PartitionIterator +from .utils import _inherit_docstrings +from .utils import from_pandas, to_pandas -class Series(object): - def __init__(self, series_oids): +@_inherit_docstrings(pandas.Series, excluded=[pandas.Series, pandas.Series.__init__]) +class Series(BasePandasDataset): + def __init__( + self, + data=None, + index=None, + dtype=None, + name=None, + copy=False, + fastpath=False, + query_compiler=None, + ): """Constructor for a Series object. 
Args: series_oids ([ObjectID]): The list of remote Series objects. """ - self.series_oids = series_oids - - @property - def T(self): - raise NotImplementedError("Not Yet implemented.") - - def __abs__(self): - raise NotImplementedError("Not Yet implemented.") + if query_compiler is None: + if name is None: + name = "__reduced__" + query_compiler = from_pandas( + pandas.DataFrame( + pandas.Series( + data=data, + index=index, + dtype=dtype, + name=name, + copy=copy, + fastpath=fastpath, + ) + ) + )._query_compiler + if len(query_compiler.columns) != 1: + query_compiler = query_compiler.transpose() + self._query_compiler = query_compiler + + def _get_name(self): + name = self._query_compiler.columns[0] + if name == "__reduced__": + return None + return name + + def _set_name(self, name): + if name is None: + name = "__reduced__" + self._query_compiler.columns = [name] + + name = property(_get_name, _set_name) + _parent = None + + def _reduce_dimension(self, query_compiler): + return query_compiler.to_pandas().squeeze() + + def _validate_dtypes_sum_prod_mean(self, axis, numeric_only, ignore_axis=False): + pass + + def _validate_dtypes_min_max(self, axis, numeric_only): + pass + + def _validate_dtypes(self, numeric_only=False): + pass + + def _create_or_update_from_compiler(self, new_query_compiler, inplace=False): + """Returns or updates a DataFrame given new query_compiler""" + assert ( + isinstance(new_query_compiler, type(self._query_compiler)) + or type(new_query_compiler) in self._query_compiler.__class__.__bases__ + ), "Invalid Query Compiler object: {}".format(type(new_query_compiler)) + if not inplace and len(new_query_compiler.columns) == 1: + return Series(query_compiler=new_query_compiler) + elif not inplace: + # This can happen with things like `reset_index` where we can add columns. 
+ from .dataframe import DataFrame + + return DataFrame(query_compiler=new_query_compiler) + else: + self._update_inplace(new_query_compiler=new_query_compiler) + + def _prepare_inter_op(self, other): + if isinstance(other, Series): + new_self = self.copy() + new_self.name = "__reduced__" + new_other = other.copy() + new_other.name = "__reduced__" + else: + new_self = self + new_other = other + return new_self, new_other - def __add__(self, right, name="__add__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") + def __add__(self, right): + return self.add(right) def __and__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __array__(self, result=None): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).__and__(new_other) - def __array_prepare__(self, result, context=None): - raise NotImplementedError("Not Yet implemented.") + def __array_prepare__(self, result, context=None): # pragma: no cover + return self._default_to_pandas( + pandas.Series.__array_prepare__, result, context=context + ) @property - def __array_priority__(self): - raise NotImplementedError("Not Yet implemented.") - - def __array_wrap__(self, result, context=None): - raise NotImplementedError("Not Yet implemented.") - - def __bool__(self): - raise NotImplementedError("Not Yet implemented.") + def __array_priority__(self): # pragma: no cover + return self._to_pandas().__array_priority__ def __bytes__(self): - raise NotImplementedError("Not Yet implemented.") - - def __class__( - self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False - ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.__bytes__) def __contains__(self, key): - raise NotImplementedError("Not Yet implemented.") + return key in self.index def __copy__(self, deep=True): - raise NotImplementedError("Not Yet implemented.") + return 
self.copy(deep=deep) def __deepcopy__(self, memo=None): - raise NotImplementedError("Not Yet implemented.") + return self.copy(deep=True) def __delitem__(self, key): - raise NotImplementedError("Not Yet implemented.") + if key not in self.keys(): + raise KeyError(key) + self.drop(labels=key, inplace=True) - def __dir__(self): - return list(type(self).__dict__.keys()) + def __div__(self, right): + return self.div(right) - def __div__(self, right, name="__truediv__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __divmod__(self, right, name="__divmod__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - @property - def __doc__(self): - raise NotImplementedError("Not Yet implemented.") - - def __eq__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __finalize__(self, other, method=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + def __divmod__(self, right): + return self.divmod(right) def __float__(self): - raise NotImplementedError("Not Yet implemented.") - - def __floordiv__(self, right, name="__floordiv__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") + return float(self.squeeze()) - def __ge__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") + def __floordiv__(self, right): + return self.floordiv(right) def __getitem__(self, key): - raise NotImplementedError("Not Yet implemented.") - - def __getstate__(self): - raise NotImplementedError("Not Yet implemented.") - - def __gt__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __iadd__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __imul__(self, other): - raise NotImplementedError("Not Yet implemented.") + if ( + key in self.keys() + or is_list_like(key) + and all(k in self.keys() for k in key) + ): + return self.loc[key] + else: + return self.iloc[key] def __int__(self): - raise 
NotImplementedError("Not Yet implemented.") - - def __invert__(self): - raise NotImplementedError("Not Yet implemented.") - - def __ipow__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __isub__(self, other): - raise NotImplementedError("Not Yet implemented.") + return int(self.squeeze()) def __iter__(self): - raise NotImplementedError("Not Yet implemented.") - - def __itruediv__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def __le__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __len__(self): - raise NotImplementedError("Not Yet implemented.") - - def __long__(self): - raise NotImplementedError("Not Yet implemented.") + return self._to_pandas().__iter__() - def __lt__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __mod__(self, right, name="__mod__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __mul__(self, right, name="__mul__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") - - def __ne__(self, other, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def __neg__(self): - raise NotImplementedError("Not Yet implemented.") + def __mod__(self, right): + return self.mod(right) - def __nonzero__(self): - raise NotImplementedError("Not Yet implemented.") + def __mul__(self, right): + return self.mul(right) def __or__(self, other): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).__or__(new_other) - def __pow__(self, right, name="__pow__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") + def __pow__(self, right): + return self.pow(right) def __repr__(self): - raise NotImplementedError("Not Yet implemented.") + # In the future, we can have this be configurable, just like Pandas. 
+ num_rows = 60 + num_cols = 30 + temp_df = self._build_repr_df(num_rows, num_cols) + if isinstance(temp_df, pandas.DataFrame): + temp_df = temp_df.iloc[:, 0] + temp_str = repr(temp_df) + if self.name is not None: + name_str = "Name: {}, ".format(str(self.name)) + else: + name_str = "" + if len(self.index) > num_rows: + len_str = "Length: {}, ".format(len(self.index)) + else: + len_str = "" + dtype_str = "dtype: {}".format(temp_str.rsplit("dtype: ", 1)[-1]) + if len(self) == 0: + return "Series([], {}{}".format(name_str, dtype_str) + return temp_str.rsplit("\nName:", 1)[0] + "\n{}{}{}".format( + name_str, len_str, dtype_str + ) def __round__(self, decimals=0): - raise NotImplementedError("Not Yet implemented.") + return self._create_or_update_from_compiler( + self._query_compiler.round(decimals=decimals) + ) def __setitem__(self, key, value): - raise NotImplementedError("Not Yet implemented.") - - def __setstate__(self, state): - raise NotImplementedError("Not Yet implemented.") - - def __sizeof__(self): - raise NotImplementedError("Not Yet implemented.") + if key not in self.keys(): + raise KeyError(key) + self._create_or_update_from_compiler( + self._query_compiler.setitem(1, key, value), inplace=True + ) - def __str__(self): - raise NotImplementedError("Not Yet implemented.") + def __sub__(self, right): + return self.sub(right) - def __sub__(self, right, name="__sub__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") + def __truediv__(self, right): + return self.truediv(right) - def __truediv__(self, right, name="__truediv__", na_op=na_op): - raise NotImplementedError("Not Yet implemented.") + __iadd__ = __add__ + __imul__ = __add__ + __ipow__ = __pow__ + __isub__ = __sub__ + __itruediv__ = __truediv__ def __xor__(self, other): - raise NotImplementedError("Not Yet implemented.") - - def abs(self): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, 
new_self).__xor__(new_other) def add(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).add( + new_other, level=level, fill_value=fill_value, axis=axis + ) def add_prefix(self, prefix): - raise NotImplementedError("Not Yet implemented.") + """Add a prefix to each of the column names. - def add_suffix(self, suffix): - raise NotImplementedError("Not Yet implemented.") + Returns: + A new Series containing the new column names. + """ + return Series(query_compiler=self._query_compiler.add_prefix(prefix, axis=0)) - def agg(self, func, axis=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + def add_suffix(self, suffix): + """Add a suffix to each of the column names. - def aggregate(self, func, axis=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + Returns: + A new DataFrame containing the new column names. + """ + return Series(query_compiler=self._query_compiler.add_suffix(suffix, axis=0)) - def align( - self, - other, - join="outer", - axis=None, - level=None, - copy=True, - fill_value=None, - method=None, - limit=None, - fill_axis=0, - broadcast_axis=None, - ): - raise NotImplementedError("Not Yet implemented.") + def append(self, to_append, ignore_index=False, verify_integrity=False): + """Append another DataFrame/list/Series to this one. - def all(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + Args: + to_append: The object to append to this. + ignore_index: Ignore the index on appending. + verify_integrity: Verify the integrity of the index on completion. - def any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + Returns: + A new DataFrame containing the concatenated values. 
+ """ + from .dataframe import DataFrame - def append(self, to_append, ignore_index=False, verify_integrity=False): - raise NotImplementedError("Not Yet implemented.") + bad_type_msg = ( + 'cannot concatenate object of type "{}"; only pd.Series, ' + "pd.DataFrame, and pd.Panel (deprecated) objs are valid" + ) + if isinstance(to_append, list): + if not all(isinstance(o, BasePandasDataset) for o in to_append): + raise TypeError( + bad_type_msg.format( + type( + next( + o + for o in to_append + if not isinstance(o, BasePandasDataset) + ) + ) + ) + ) + elif all(isinstance(o, Series) for o in to_append): + self.name = None + for i in range(len(to_append)): + to_append[i].name = None + to_append[i] = to_append[i]._query_compiler + else: + # Matching pandas behavior of naming the Series columns 0 + self.name = 0 + for i in range(len(to_append)): + if isinstance(to_append[i], Series): + to_append[i].name = 0 + to_append[i] = DataFrame(to_append[i]) + return DataFrame(self).append( + to_append, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + ) + elif isinstance(to_append, Series): + self.name = None + to_append.name = None + to_append = [to_append._query_compiler] + elif isinstance(to_append, DataFrame): + self.name = 0 + return DataFrame(self).append( + to_append, ignore_index=ignore_index, verify_integrity=verify_integrity + ) + else: + raise TypeError(bad_type_msg.format(type(to_append))) + # If ignore_index is False, by definition the Index will be correct. + # We also do this first to ensure that we don't waste compute/memory. 
+ if verify_integrity and not ignore_index: + appended_index = ( + self.index.append(to_append.index) + if not isinstance(to_append, list) + else self.index.append([o.index for o in to_append]) + ) + is_valid = next((False for idx in appended_index.duplicated() if idx), True) + if not is_valid: + raise ValueError( + "Indexes have overlapping values: {}".format( + appended_index[appended_index.duplicated()] + ) + ) + query_compiler = self._query_compiler.concat( + 0, to_append, ignore_index=ignore_index, sort=None + ) + if len(query_compiler.columns) > 1: + return DataFrame(query_compiler=query_compiler) + else: + return Series(query_compiler=query_compiler) def apply(self, func, convert_dtype=True, args=(), **kwds): - raise NotImplementedError("Not Yet implemented.") - - def argmax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def argmin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + # apply and aggregate have slightly different behaviors, so we have to use + # each one separately to determine the correct return type. In the case of + # `agg`, the axis is set, but it is not required for the computation, so we use + # it to determine which function to run. + if kwds.pop("axis", None) is not None: + apply_func = "agg" + else: + apply_func = "apply" + # Add this because it only applies for `apply` specifically. + kwds["convert_dtype"] = convert_dtype + query_compiler = super(Series, self).apply(func, *args, **kwds) + # Sometimes we can return a scalar here + if not isinstance(query_compiler, type(self._query_compiler)): + return query_compiler + # This is the simplest way to determine the return type, but there are checks + # in pandas that verify that some results are created. 
This is a challenge for + # empty DataFrames, but fortunately they only happen when the `func` type is + # a list or a dictionary, which means that the return type won't change from + # type(self), so we catch that error and use `self.__name__` for the return + # type. + return_type = type( + getattr(getattr(pandas, self.__name__)(index=self.index), apply_func)( + func, *args, **kwds + ) + ).__name__ + if return_type not in ["DataFrame", "Series"]: + return query_compiler.to_pandas().squeeze() + else: + result = getattr(sys.modules[self.__module__], return_type)( + query_compiler=query_compiler + ) + if result.name == self.index[0]: + result.name = None + return result + + def argmax(self, axis=0, skipna=True, *args, **kwargs): + # Series and DataFrame have a different behavior for `skipna` + if skipna is None: + skipna = True + return self.idxmax(axis=axis, skipna=skipna, *args, **kwargs) + + def argmin(self, axis=0, skipna=True, *args, **kwargs): + # Series and DataFrame have a different behavior for `skipna` + if skipna is None: + skipna = True + return self.idxmin(axis=axis, skipna=skipna, *args, **kwargs) def argsort(self, axis=0, kind="quicksort", order=None): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.argsort, axis=axis, kind=kind, order=order + ) - def as_blocks(self, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def as_matrix(self, columns=None): - raise NotImplementedError("Not Yet implemented.") - - def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): - raise NotImplementedError("Not Yet implemented.") - - def asof(self, where, subset=None): - raise NotImplementedError("Not Yet implemented.") - - def astype(self, dtype, copy=True, errors="raise", **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def at(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def at_time(self, time, asof=False): - raise 
NotImplementedError("Not Yet implemented.") + def array(self): + return self._default_to_pandas(pandas.Series.array) def autocorr(self, lag=1): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.autocorr, lag=lag) def between(self, left, right, inclusive=True): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.between, left, right, inclusive=inclusive + ) - def between_time(self, start_time, end_time, include_start=True, include_end=True): - raise NotImplementedError("Not Yet implemented.") - - def bfill(self, axis=None, inplace=False, limit=None, downcast=None): - raise NotImplementedError("Not Yet implemented.") - - def bool(self): - raise NotImplementedError("Not Yet implemented.") - - def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def clip_lower(self, threshold, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def clip_upper(self, threshold, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def combine(self, other, func, fill_value=np.nan): - raise NotImplementedError("Not Yet implemented.") - - def combine_first(self, other): - raise NotImplementedError("Not Yet implemented.") + def combine(self, other, func, fill_value=None): + return super(Series, self).combine(other, func, fill_value=fill_value) def compound(self, axis=None, skipna=None, level=None): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.compound, axis=axis, skipna=skipna, level=level + ) def compress(self, condition, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def consolidate(self, inplace=False): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.compress, condition, *args, **kwargs + ) def convert_objects( self, @@ -588,111 +414,79 @@ def 
convert_objects( convert_timedeltas=True, copy=True, ): - raise NotImplementedError("Not Yet implemented.") - - def copy(self, deep=True): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.convert_objects, + convert_dates=convert_dates, + convert_numeric=convert_numeric, + convert_timedeltas=convert_timedeltas, + copy=copy, + ) def corr(self, other, method="pearson", min_periods=None): - raise NotImplementedError("Not Yet implemented.") + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + pandas.Series.corr, other, method=method, min_periods=min_periods + ) def count(self, level=None): - raise NotImplementedError("Not Yet implemented.") + return super(Series, self).count(level=level) def cov(self, other, min_periods=None): - raise NotImplementedError("Not Yet implemented.") - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + if isinstance(other, BasePandasDataset): + other = other._to_pandas() + return self._default_to_pandas( + pandas.Series.cov, other, min_periods=min_periods + ) def describe(self, percentiles=None, include=None, exclude=None): - raise NotImplementedError("Not Yet implemented.") + # Pandas ignores the `include` and `exclude` for Series for some reason. 
+ return super(Series, self).describe(percentiles=percentiles) def diff(self, periods=1): - raise NotImplementedError("Not Yet implemented.") - - def div(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def divide(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + return super(Series, self).diff(periods=periods, axis=0) - def dot(self, other): - raise NotImplementedError("Not Yet implemented.") - - def drop(self, labels, axis=0, level=None, inplace=False, errors="raise"): - raise NotImplementedError("Not Yet implemented.") + def divmod(self, other, level=None, fill_value=None, axis=0): + return self._default_to_pandas( + pandas.Series.divmod, other, level=level, fill_value=fill_value, axis=axis + ) def drop_duplicates(self, keep="first", inplace=False): - raise NotImplementedError("Not Yet implemented.") + return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) def dropna(self, axis=0, inplace=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") + kwargs.pop("how", None) + if kwargs: + raise TypeError( + "dropna() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) + return super(Series, self).dropna(axis=axis, inplace=inplace) def duplicated(self, keep="first"): - raise NotImplementedError("Not Yet implemented.") + return super(Series, self).duplicated(keep=keep) def eq(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).eq(new_other, level=level, axis=axis) def equals(self, other): - raise NotImplementedError("Not Yet implemented.") - - def ewm( - self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - freq=None, - adjust=True, - ignore_na=False, - axis=0, - ): - raise NotImplementedError("Not Yet implemented.") - - def 
expanding(self, min_periods=1, freq=None, center=False, axis=0): - raise NotImplementedError("Not Yet implemented.") + return ( + self.name == other.name + and self.index.equals(other.index) + and self.eq(other).all() + ) def factorize(self, sort=False, na_sentinel=-1): - raise NotImplementedError("Not Yet implemented.") - - def ffill(self, axis=None, inplace=False, limit=None, downcast=None): - raise NotImplementedError("Not Yet implemented.") - - def fillna( - self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - **kwargs - ): - raise NotImplementedError("Not Yet implemented.") - - def filter(self, items=None, like=None, regex=None, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def first(self, offset): - raise NotImplementedError("Not Yet implemented.") - - def first_valid_index(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.factorize, sort=sort, na_sentinel=na_sentinel + ) def floordiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).floordiv( + new_other, level=level, fill_value=None, axis=axis + ) def from_array( self, arr, index=None, name=None, dtype=None, copy=False, fastpath=False @@ -709,25 +503,24 @@ def from_csv( encoding=None, infer_datetime_format=False, ): - raise NotImplementedError("Not Yet implemented.") + return super(Series, self).from_csv( + path, + sep=sep, + parse_dates=parse_dates, + header=header, + index_col=index_col, + encoding=encoding, + infer_datetime_format=infer_datetime_format, + ) def ge(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def get(self, key, default=None): - raise NotImplementedError("Not Yet implemented.") - - def get_dtype_counts(self): - raise NotImplementedError("Not Yet implemented.") - - def 
get_ftype_counts(self): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).ge(new_other, level=level, axis=axis) def get_value(self, label, takeable=False): - raise NotImplementedError("Not Yet implemented.") - - def get_values(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.get_value, label, takeable=takeable + ) def groupby( self, @@ -738,15 +531,25 @@ def groupby( sort=True, group_keys=True, squeeze=False, + observed=False, **kwargs ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.groupby, + by=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + **kwargs + ) def gt(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def head(self, n=5): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).gt(new_other, level=level, axis=axis) def hist( self, @@ -761,19 +564,29 @@ def hist( bins=10, **kwds ): - raise NotImplementedError("Not Yet implemented.") - - def iat(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def idxmax(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def idxmin(self, axis=None, skipna=True, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def iloc(self, axis=None): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.hist, + by=by, + ax=ax, + grid=grid, + xlabelsize=xlabelsize, + xrot=xrot, + ylabelsize=ylabelsize, + yrot=yrot, + figsize=figsize, + bins=bins, + **kwds + ) + + def idxmax(self, axis=0, skipna=True, *args, **kwargs): + if skipna is None: + skipna = True + return 
super(Series, self).idxmax(axis=axis, skipna=skipna, *args, **kwargs) + + def idxmin(self, axis=0, skipna=True, *args, **kwargs): + if skipna is None: + skipna = True + return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs) def interpolate( self, @@ -782,122 +595,96 @@ def interpolate( limit=None, inplace=False, limit_direction="forward", + limit_area=None, downcast=None, **kwargs ): - raise NotImplementedError("Not Yet implemented.") - - def isin(self, values): - raise NotImplementedError("Not Yet implemented.") - - def isnull(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.interpolate, + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs + ) def item(self): - raise NotImplementedError("Not Yet implemented.") + return self[0] def items(self): - raise NotImplementedError("Not Yet implemented.") - - def iteritems(self): - raise NotImplementedError("Not Yet implemented.") - - def ix(self, axis=None): - raise NotImplementedError("Not Yet implemented.") - - def keys(self): - raise NotImplementedError("Not Yet implemented.") + index_iter = iter(self.index) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + def item_builder(df): + s = df.iloc[:, 0] + s.index = [next(index_iter)] + s.name = self.name + return s.items() - def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + partition_iterator = PartitionIterator(self._query_compiler, 0, item_builder) + for v in partition_iterator: + yield v - def last(self, offset): - raise NotImplementedError("Not Yet implemented.") + def iteritems(self): + return self.items() - def last_valid_index(self): - raise NotImplementedError("Not Yet implemented.") + def keys(self): + return 
self.index def le(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def loc(self, axis=None): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).le(new_other, level=level, axis=axis) def lt(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def mad(self, axis=None, skipna=None, level=None): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).lt(new_other, level=level, axis=axis) def map(self, arg, na_action=None): - raise NotImplementedError("Not Yet implemented.") - - def mask( - self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - try_cast=False, - raise_on_error=True, - ): - raise NotImplementedError("Not Yet implemented.") - - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return self.__constructor__( + query_compiler=self._query_compiler._map_partitions( + lambda df: pandas.DataFrame(df.iloc[:, 0].map(arg, na_action=na_action)) + ) + ) def memory_usage(self, index=True, deep=False): - raise NotImplementedError("Not Yet implemented.") - - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + if index: + result = self._reduce_dimension( + self._query_compiler.memory_usage(index=False, deep=deep) + ) + index_value = self.index.memory_usage(deep=deep) + return result + index_value + return super(Series, 
self).memory_usage(index=index, deep=deep) def mod(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def mode(self): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).mod( + new_other, level=level, fill_value=None, axis=axis + ) def mul(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).mul( + new_other, level=level, fill_value=None, axis=axis + ) - def multiply(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + multiply = rmul = mul def ne(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).ne(new_other, level=level, axis=axis) def nlargest(self, n=5, keep="first"): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.nlargest, n=n, keep=keep) def nonzero(self): - raise NotImplementedError("Not Yet implemented.") - - def notnull(self): - raise NotImplementedError("Not Yet implemented.") + return self.to_numpy().nonzero() def nsmallest(self, n=5, keep="first"): - raise NotImplementedError("Not Yet implemented.") - - def nunique(self, dropna=True): - raise NotImplementedError("Not Yet implemented.") - - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def pipe(self, func, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.nsmallest, n=n, keep=keep) + @property def plot( self, kind="line", @@ -924,189 +711,175 @@ def plot( label=None, secondary_y=False, **kwds - ): - raise NotImplementedError("Not 
Yet implemented.") - - def pop(self, item): - raise NotImplementedError("Not Yet implemented.") + ): + return self._to_pandas().plot def pow(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def prod(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).pow( + new_other, level=level, fill_value=None, axis=axis + ) - def product(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + def prod( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs + ): + axis = self._get_axis_number(axis) + new_index = self.columns if axis else self.index + if min_count > len(new_index): + return np.nan + return super(Series, self).prod( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs + ) + + product = prod def ptp(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.ptp, + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs + ) def put(self, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def quantile(self, q=0.5, interpolation="linear"): - raise NotImplementedError("Not Yet implemented.") - - def radd(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.put, *args, **kwargs) - def rank( - self, - axis=0, - method="average", - numeric_only=None, - na_option="keep", - ascending=True, - pct=False, - ): - raise NotImplementedError("Not Yet implemented.") + radd = add def ravel(self, order="C"): - raise NotImplementedError("Not Yet 
implemented.") - - def rdiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.ravel, order=order) def reindex(self, index=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + method = kwargs.pop("method", None) + level = kwargs.pop("level", None) + copy = kwargs.pop("copy", True) + limit = kwargs.pop("limit", None) + tolerance = kwargs.pop("tolerance", None) + fill_value = kwargs.pop("fill_value", None) + if kwargs: + raise TypeError( + "reindex() got an unexpected keyword " + 'argument "{0}"'.format(list(kwargs.keys())[0]) + ) + return super(Series, self).reindex( + index=index, + method=method, + level=level, + copy=copy, + limit=limit, + tolerance=tolerance, + fill_value=fill_value, + ) def reindex_axis(self, labels, axis=0, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): - raise NotImplementedError("Not Yet implemented.") + if axis != 0: + raise ValueError("cannot reindex series on non-zero axis!") + return self.reindex(index=labels, **kwargs) def rename(self, index=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") + non_mapping = is_scalar(index) or ( + is_list_like(index) and not is_dict_like(index) + ) + if non_mapping: + if kwargs.get("inplace", False): + self.name = index + else: + self_cp = self.copy() + self_cp.name = index + return self_cp + else: + from .dataframe import DataFrame - def rename_axis(self, mapper, axis=0, copy=True, inplace=False): - raise NotImplementedError("Not Yet implemented.") + result = DataFrame(self).rename(index=index, **kwargs).squeeze() + result.name = self.name + return result def reorder_levels(self, order): - raise NotImplementedError("Not Yet implemented.") - - def repeat(self, repeats, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def replace( - self, - 
to_replace=None, - value=None, - inplace=False, - limit=None, - regex=False, - method="pad", - axis=None, - ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.reorder_levels, order) - def resample( - self, - rule, - how=None, - axis=0, - fill_method=None, - closed=None, - label=None, - convention="start", - kind=None, - loffset=None, - limit=None, - base=0, - on=None, - level=None, - ): - raise NotImplementedError("Not Yet implemented.") + def repeat(self, repeats, axis=None): + return self._default_to_pandas(pandas.Series.repeat, repeats, axis=axis) def reset_index(self, level=None, drop=False, name=None, inplace=False): - raise NotImplementedError("Not Yet implemented.") + if drop and level is None: + new_idx = pandas.RangeIndex(len(self.index)) + if inplace: + self.index = new_idx + self.name = name or self.name + else: + result = self.copy() + result.index = new_idx + result.name = name or self.name + return result + elif not drop and inplace: + raise TypeError( + "Cannot reset_index inplace on a Series to create a DataFrame" + ) + else: + obj = self.copy() + if name is not None: + obj.name = name + from .dataframe import DataFrame - def reshape(self, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return DataFrame(self).reset_index(level=level, drop=drop, inplace=inplace) + + def rdivmod(self, other, level=None, fill_value=None, axis=0): + return self._default_to_pandas( + pandas.Series.rdivmod, other, level=level, fill_value=fill_value, axis=axis + ) def rfloordiv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).rfloordiv( + new_other, level=level, fill_value=None, axis=axis + ) def rmod(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rmul(self, other, level=None, fill_value=None, 
axis=0): - raise NotImplementedError("Not Yet implemented.") - - def rolling( - self, - window, - min_periods=None, - freq=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None, - ): - raise NotImplementedError("Not Yet implemented.") - - def round(self, decimals=0, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).rmod( + new_other, level=level, fill_value=None, axis=axis + ) def rpow(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).rpow( + new_other, level=level, fill_value=None, axis=axis + ) def rsub(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).rsub( + new_other, level=level, fill_value=None, axis=axis + ) def rtruediv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).rtruediv( + new_other, level=level, fill_value=None, axis=axis + ) - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - ): - raise NotImplementedError("Not Yet implemented.") + rdiv = rtruediv def searchsorted(self, value, side="left", sorter=None): - raise NotImplementedError("Not Yet implemented.") - - def select(self, crit, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - raise NotImplementedError("Not Yet implemented.") - - def set_axis(self, axis, labels): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + pandas.Series.searchsorted, value, 
side=side, sorter=sorter + ) def set_value(self, label, value, takeable=False): - raise NotImplementedError("Not Yet implemented.") - - def shift(self, periods=1, freq=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def slice_shift(self, periods=1, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, - ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas("set_value", label, value, takeable=takeable) def sort_values( self, @@ -1116,153 +889,89 @@ def sort_values( kind="quicksort", na_position="last", ): - raise NotImplementedError("Not Yet implemented.") + from .dataframe import DataFrame - def sortlevel(self, level=0, ascending=True, sort_remaining=True): - raise NotImplementedError("Not Yet implemented.") + # When we convert to a DataFrame, the name is automatically converted to 0 if it + # is None, so we do this to avoid a KeyError. 
+ by = self.name if self.name is not None else 0 + result = ( + DataFrame(self) + .sort_values( + by=by, + ascending=ascending, + inplace=False, + kind=kind, + na_position=na_position, + ) + .squeeze(axis=1) + ) + result.name = self.name + return self._create_or_update_from_compiler( + result._query_compiler, inplace=inplace + ) - def squeeze(self, axis=None): - raise NotImplementedError("Not Yet implemented.") + def sparse(self, data=None): + return self._default_to_pandas(pandas.Series.sparse, data=data) - def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - raise NotImplementedError("Not Yet implemented.") + def squeeze(self, axis=None): + if axis is not None: + # Validate `axis` + pandas.Series._get_axis_number(axis) + if len(self.index) == 1: + return self._reduce_dimension(self._query_compiler) + else: + return self.copy() def sub(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") - - def subtract(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).sub( + new_other, level=level, fill_value=None, axis=axis + ) - def sum(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def swapaxes(self, axis1, axis2, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def swaplevel(self, i=-2, j=-1, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def tail(self, n=5): - raise NotImplementedError("Not Yet implemented.") + subtract = sub - def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_clipboard(self, excel=None, sep=None, **kwargs): - raise NotImplementedError("Not Yet implemented.") - - def to_csv( + def sum( self, - path=None, - index=True, - 
sep=",", - na_rep="", - float_format=None, - header=False, - index_label=None, - mode="w", - encoding=None, - date_format=None, - decimal=".", + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs ): - raise NotImplementedError("Not Yet implemented.") - - def to_dense(self): - raise NotImplementedError("Not Yet implemented.") - - def to_dict(self): - raise NotImplementedError("Not Yet implemented.") + axis = self._get_axis_number(axis) + new_index = self.columns if axis else self.index + if min_count > len(new_index): + return np.nan + return super(Series, self).sum( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs + ) - def to_excel( - self, - excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep="inf", - verbose=True, - ): - raise NotImplementedError("Not Yet implemented.") + def swaplevel(self, i=-2, j=-1, copy=True): + return self._default_to_pandas("swaplevel", i=i, j=j, copy=copy) def to_frame(self, name=None): - raise NotImplementedError("Not Yet implemented.") - - def to_hdf(self, path_or_buf, key, **kwargs): - raise NotImplementedError("Not Yet implemented.") + from .dataframe import DataFrame - def to_json( - self, - path_or_buf=None, - orient=None, - date_format=None, - double_precision=10, - force_ascii=True, - date_unit="ms", - default_handler=None, - lines=False, - ): - raise NotImplementedError("Not Yet implemented.") + self_cp = self.copy() + if name is not None: + self_cp.name = name + return DataFrame(self) - def to_latex( - self, - buf=None, - columns=None, - col_space=None, - header=True, - index=True, - na_rep="NaN", - formatters=None, - float_format=None, - sparsify=None, - index_names=True, - bold_rows=False, - column_format=None, - longtable=None, - escape=None, - encoding=None, - 
decimal=".", - multicolumn=None, - multicolumn_format=None, - multirow=None, - ): - raise NotImplementedError("Not Yet implemented.") + def to_list(self): + return self._default_to_pandas(pandas.Series.to_list) - def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): - raise NotImplementedError("Not Yet implemented.") + tolist = to_list + # TODO(williamma12): When we implement to_timestamp, have this call the version + # in base.py def to_period(self, freq=None, copy=True): - raise NotImplementedError("Not Yet implemented.") - - def to_pickle(self, path, compression="infer"): - raise NotImplementedError("Not Yet implemented.") - - def to_sparse(self, kind="block", fill_value=None): - raise NotImplementedError("Not Yet implemented.") - - def to_sql( - self, - name, - con, - flavor=None, - schema=None, - if_exists="fail", - index=True, - index_label=None, - chunksize=None, - dtype=None, - ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas("to_period", freq=freq, copy=copy) def to_string( self, @@ -1276,62 +985,65 @@ def to_string( name=False, max_rows=None, ): - raise NotImplementedError("Not Yet implemented.") - + return self._default_to_pandas( + pandas.Series.to_string, + buf=buf, + na_rep=na_rep, + float_format=float_format, + header=header, + index=index, + length=length, + dtype=dtype, + name=name, + max_rows=max_rows, + ) + + # TODO(williamma12): When we implement to_timestamp, have this call the version + # in base.py def to_timestamp(self, freq=None, how="start", copy=True): - raise NotImplementedError("Not Yet implemented.") - - def to_xarray(self): - raise NotImplementedError("Not Yet implemented.") - - def tolist(self): - raise NotImplementedError("Not Yet implemented.") - - def transform(self, func, *args, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas("to_timestamp", freq=freq, how=how, copy=copy) def transpose(self, *args, **kwargs): - raise 
NotImplementedError("Not Yet implemented.") - - def truediv(self, other, level=None, fill_value=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + return self - def truncate(self, before=None, after=None, axis=None, copy=True): - raise NotImplementedError("Not Yet implemented.") + T = property(transpose) - def tshift(self, periods=1, freq=None, axis=0): - raise NotImplementedError("Not Yet implemented.") + def truediv(self, other, level=None, fill_value=None, axis=0): + new_self, new_other = self._prepare_inter_op(other) + return super(Series, new_self).truediv( + new_other, level=level, fill_value=None, axis=axis + ) - def tz_convert(self, tz, axis=0, level=None, copy=True): - raise NotImplementedError("Not Yet implemented.") + div = divide = truediv - def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"): - raise NotImplementedError("Not Yet implemented.") + def truncate(self, before=None, after=None, axis=None, copy=True): + return self._default_to_pandas( + pandas.Series.truncate, before=before, after=after, axis=axis, copy=copy + ) def unique(self): - raise NotImplementedError("Not Yet implemented.") - - def unstack(self, level=-1, fill_value=None): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.unique) - def upandasate(self, other): - raise NotImplementedError("Not Yet implemented.") + def update(self, other): + return self._default_to_pandas(pandas.Series.update, other) def valid(self, inplace=False, **kwargs): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.valid, inplace=inplace, **kwargs) def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - raise NotImplementedError("Not Yet implemented.") - - def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas( + 
pandas.Series.value_counts, + normalize=normalize, + sort=sort, + ascending=ascending, + bins=bins, + dropna=dropna, + ) def view(self, dtype=None): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(pandas.Series.view, dtype=dtype) def where( self, @@ -1340,122 +1052,184 @@ def where( inplace=False, axis=None, level=None, + errors="raise", try_cast=False, - raise_on_error=True, + raise_on_error=None, ): - raise NotImplementedError("Not Yet implemented.") - - def xs(key, axis=0, level=None, drop_level=True): + if isinstance(other, Series): + other = to_pandas(other) + return self._default_to_pandas( + pandas.Series.where, + cond, + other=other, + inplace=inplace, + axis=axis, + level=level, + errors=errors, + try_cast=try_cast, + raise_on_error=raise_on_error, + ) + + def xs(self, key, axis=0, level=None, drop_level=True): # pragma: no cover raise NotImplementedError("Not Yet implemented.") @property def asobject(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def asobject(df): + return df.asobject + + return self._default_to_pandas(asobject) @property def axes(self): - raise NotImplementedError("Not Yet implemented.") + return [self.index] @property def base(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def base(df): + return df.base + + return self._default_to_pandas(base) @property - def blocks(self): - raise NotImplementedError("Not Yet implemented.") + def cat(self): + return self._default_to_pandas(pandas.Series.cat) @property def data(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. 
+ def data(df): + return df.data + + return self._default_to_pandas(data) @property - def dtype(self): - raise NotImplementedError("Not Yet implemented.") + def dt(self): + return self._default_to_pandas(pandas.Series.dt) @property - def dtypes(self): - raise NotImplementedError("Not Yet implemented.") + def dtype(self): + return self._query_compiler.dtypes.squeeze() + + dtypes = dtype @property def empty(self): - raise NotImplementedError("Not Yet implemented.") + return len(self.index) == 0 @property def flags(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def flags(df): + return df.flags + + return self._default_to_pandas(flags) @property def ftype(self): - raise NotImplementedError("Not Yet implemented.") + return "{}:dense".format(self.dtype) - @property - def ftypes(self): - raise NotImplementedError("Not Yet implemented.") + ftypes = ftype @property def hasnans(self): - raise NotImplementedError("Not Yet implemented.") + return self.isna().sum() > 0 @property def imag(self): - raise NotImplementedError("Not Yet implemented.") - - @property - def index(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def imag(df): + return df.imag - @property - def is_copy(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(imag) @property def is_monotonic(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def is_monotonic(df): + return df.is_monotonic + + return self._default_to_pandas(is_monotonic) @property def is_monotonic_decreasing(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. 
+ def is_monotonic_decreasing(df): + return df.is_monotonic + + return self._default_to_pandas(is_monotonic_decreasing) @property def is_monotonic_increasing(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def is_monotonic_increasing(df): + return df.is_monotonic + + return self._default_to_pandas(is_monotonic_increasing) @property def is_unique(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def is_unique(df): + return df.is_unique + + return self._default_to_pandas(is_unique) @property def itemsize(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def itemsize(df): + return df.itemsize - @property - def name(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(itemsize) @property def nbytes(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def nbytes(df): + return df.nbytes + + return self._default_to_pandas(nbytes) @property def ndim(self): - raise NotImplementedError("Not Yet implemented.") + """Get the number of dimensions for this DataFrame. + + Returns: + The number of dimensions for this Series. + """ + # Series have an invariant that requires they be 1 dimension. + return 1 @property def real(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. 
+ def real(df): + return df.real - @property - def shape(self): - raise NotImplementedError("Not Yet implemented.") + return self._default_to_pandas(real) @property - def size(self): - raise NotImplementedError("Not Yet implemented.") + def shape(self): + return (len(self),) @property def strides(self): - raise NotImplementedError("Not Yet implemented.") + # We cannot default to pandas without a named function to call. + def strides(df): + return df.strides + + return self._default_to_pandas(strides) @property - def values(self): - raise NotImplementedError("Not Yet implemented.") + def str(self): + return self._default_to_pandas(pandas.Series.str) + + def _to_pandas(self): + df = self._query_compiler.to_pandas() + series = df[df.columns[0]] + if series.name == "__reduced__": + series.name = None + return series diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index 4a229852685..625b2810fc1 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -128,3 +128,55 @@ def test_dataframe_api_equality(): pass assert not len(difference), "Differences found in API: {}".format(difference) + + +def test_series_api_equality(): + modin_dir = [obj for obj in dir(pd.Series) if obj[0] != "_"] + pandas_dir = [obj for obj in dir(pandas.Series) if obj[0] != "_"] + + ignore = ["timetuple"] + missing_from_modin = set(pandas_dir) - set(modin_dir) + assert not len(missing_from_modin - set(ignore)), missing_from_modin + + assert not len(set(modin_dir) - set(pandas_dir)), set(modin_dir) - set(pandas_dir) + + # These have to be checked manually + allowed_different = ["to_hdf", "hist"] + difference = [] + + for m in modin_dir: + print(m) + if m in allowed_different: + continue + try: + pandas_sig = dict(inspect.signature(getattr(pandas.Series, m)).parameters) + except TypeError: + continue + try: + modin_sig = dict(inspect.signature(getattr(pd.Series, m)).parameters) + except TypeError: + continue + + if not pandas_sig == modin_sig: + 
append_val = ( + m, + { + i: pandas_sig[i] + for i in pandas_sig.keys() + if pandas_sig[i] != modin_sig[i] + and not ( + pandas_sig[i].default is np.nan + and modin_sig[i].default is np.nan + ) + }, + ) + try: + # This validates that there are actually values to add to the difference + # based on the condition above. + if len(list(append_val[-1])[-1]) > 0: + difference.append(append_val) + except IndexError: + pass + + print(difference) + assert not len(difference), "Differences found in API: {}".format(difference) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 36e178e9dc3..31c95726be0 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -10,7 +10,6 @@ import matplotlib import modin.pandas as pd from modin.pandas.utils import to_pandas -from modin.pandas.series import SeriesView from numpy.testing import assert_array_equal import sys @@ -113,14 +112,15 @@ def inter_df_math_helper(self, modin_df, pandas_df, op): modin_result = getattr(modin_df, op)(list_test, axis=1) df_equals(modin_result, pandas_result) - list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_df.shape[0])) + series_test_modin = modin_df[modin_df.columns[0]] + series_test_pandas = pandas_df[pandas_df.columns[0]] try: - pandas_result = getattr(pandas_df, op)(list_test, axis=0) + pandas_result = getattr(pandas_df, op)(series_test_pandas, axis=0) except Exception as e: with pytest.raises(type(e)): - getattr(modin_df, op)(list_test, axis=0) + getattr(modin_df, op)(series_test_modin, axis=0) else: - modin_result = getattr(modin_df, op)(list_test, axis=0) + modin_result = getattr(modin_df, op)(series_test_modin, axis=0) df_equals(modin_result, pandas_result) # Level test @@ -129,7 +129,6 @@ def inter_df_math_helper(self, modin_df, pandas_df, op): ) modin_df_multi_level = modin_df.copy() modin_df_multi_level.index = new_idx - # Defaults to pandas with pytest.warns(UserWarning): # Operation against self for 
sanity check @@ -789,6 +788,29 @@ def test_all(self, data, axis, skipna, bool_only): modin_result = modin_df.all(axis=None, skipna=skipna, bool_only=bool_only) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.all( + axis=axis, skipna=skipna, bool_only=bool_only + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only) + else: + modin_result = modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only) + df_equals(modin_result, pandas_result) + + # Test when axis is None. This will get repeated but easier than using list in parameterize decorator + try: + pandas_result = pandas_df.T.all( + axis=None, skipna=skipna, bool_only=bool_only + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only) + else: + modin_result = modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -819,6 +841,28 @@ def test_any(self, data, axis, skipna, bool_only): modin_result = modin_df.any(axis=None, skipna=skipna, bool_only=bool_only) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.any( + axis=axis, skipna=skipna, bool_only=bool_only + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only) + else: + modin_result = modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_df.T.any( + axis=None, skipna=skipna, bool_only=bool_only + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only) + else: + modin_result = modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only) + 
df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_append(self, data): modin_df = pd.DataFrame(data) @@ -1221,6 +1265,10 @@ def test_count(self, request, data, axis, numeric_only): pandas_result = pandas_df.count(axis=axis, numeric_only=numeric_only) df_equals(modin_result, pandas_result) + modin_result = modin_df.T.count(axis=axis, numeric_only=numeric_only) + pandas_result = pandas_df.T.count(axis=axis, numeric_only=numeric_only) + df_equals(modin_result, pandas_result) + def test_cov(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -1244,6 +1292,15 @@ def test_cummax(self, request, data, axis, skipna): modin_result = modin_df.cummax(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.cummax(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cummax(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cummax(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -1262,6 +1319,15 @@ def test_cummin(self, request, data, axis, skipna): modin_result = modin_df.cummin(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.cummin(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cummin(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cummin(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -1280,6 +1346,15 @@ def test_cumprod(self, request, data, axis, skipna): modin_result = modin_df.cumprod(axis=axis, skipna=skipna) 
df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.cumprod(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cumprod(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cumprod(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -1306,6 +1381,21 @@ def test_cumsum(self, request, data, axis, skipna): modin_result = modin_df.cumsum(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + if name_contains(request.node.name, ["datetime_timedelta_data"]) and ( + axis == 0 or axis == "rows" + ): + with pytest.raises(TypeError): + modin_df.T.cumsum(axis=axis, skipna=skipna) + else: + try: + pandas_result = pandas_df.T.cumsum(axis=axis, skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.cumsum(axis=axis, skipna=skipna) + else: + modin_result = modin_df.T.cumsum(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_describe(self, data): modin_df = pd.DataFrame(data) @@ -1416,6 +1506,15 @@ def test_diff(self, request, data, axis, periods): modin_result = modin_df.diff(axis=axis, periods=periods) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.diff(axis=axis, periods=periods) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.diff(axis=axis, periods=periods) + else: + modin_result = modin_df.T.diff(axis=axis, periods=periods) + df_equals(modin_result, pandas_result) + def test_drop(self): frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} simple = pandas.DataFrame(frame_data) @@ -1864,7 +1963,7 @@ def test_eval_df_use_case(self): "arctan2(sin(a), b)", engine="python", parser="pandas" ) - assert isinstance(tmp_modin, (pandas.Series, 
SeriesView)) + assert isinstance(tmp_modin, pd.Series) df_equals(tmp_modin, tmp_pandas) # Test not inplace assignments @@ -2150,7 +2249,7 @@ def test_fillna_dict_series(self): ) # Series treated same as dict - df_equals(modin_df.fillna(df.max()), df.fillna(df.max())) + df_equals(modin_df.fillna(modin_df.max()), df.fillna(df.max())) def test_fillna_dataframe(self): frame_data = { @@ -2170,9 +2269,10 @@ def test_fillna_dataframe(self): }, index=list("VWXuZ"), ) + modin_df2 = pd.DataFrame(df2) # only those columns and indices which are shared get filled - df_equals(modin_df.fillna(df2), df.fillna(df2)) + df_equals(modin_df.fillna(modin_df2), df.fillna(df2)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_fillna_columns(self, data): @@ -2354,6 +2454,10 @@ def test_idxmax(self, data, axis, skipna): modin_result = modin_df.idxmax(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + pandas_result = pandas_df.T.idxmax(axis=axis, skipna=skipna) + modin_result = modin_df.T.idxmax(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -2367,6 +2471,10 @@ def test_idxmin(self, data, axis, skipna): pandas_result = pandas_df.idxmin(axis=axis, skipna=skipna) df_equals(modin_result, pandas_result) + modin_result = modin_df.T.idxmin(axis=axis, skipna=skipna) + pandas_result = pandas_df.T.idxmin(axis=axis, skipna=skipna) + df_equals(modin_result, pandas_result) + def test_infer_objects(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -2507,8 +2615,8 @@ def test_interpolate(self): def test_is_copy(self): data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).is_copy + with pytest.warns(FutureWarning): + assert pd.DataFrame(data).is_copy == pandas.DataFrame(data).is_copy @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) def test_items(self, data): @@ -2628,8 +2736,6 @@ def test_kurtosis(self): with pytest.warns(UserWarning): pd.DataFrame(data).kurtosis() - -class TestDFPartTwo: def test_last(self): i = pd.date_range("2018-04-09", periods=4, freq="2D") ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i) @@ -2687,9 +2793,10 @@ def test_loc_multi_index(self): ) df_equals(modin_df.loc[1], pandas_df.loc[1]) - assert modin_df.loc[1, "Presidents"].equals(pandas_df.loc[1, "Presidents"]) - assert modin_df.loc[1, ("Presidents", "Pure mentions")].equals( - pandas_df.loc[1, ("Presidents", "Pure mentions")] + df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"]) + df_equals( + modin_df.loc[1, ("Presidents", "Pure mentions")], + pandas_df.loc[1, ("Presidents", "Pure mentions")], ) assert ( modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")] @@ -2731,7 +2838,7 @@ def test_loc_multi_index(self): index=pandas_index, columns=["col{}".format(i) for i in range(100)], ) - assert modin_df.loc["bar", "col1"].equals(pandas_df.loc["bar", "col1"]) + df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"]) assert ( modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"] @@ -2778,15 +2885,26 @@ def test_max(self, request, data, axis, skipna, numeric_only): ) except Exception: with pytest.raises(TypeError): - modin_result = modin_df.max( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) + modin_df.max(axis=axis, skipna=skipna, numeric_only=numeric_only) else: modin_result = modin_df.max( axis=axis, skipna=skipna, numeric_only=numeric_only ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.max( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.max(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.max( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, 
pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -2812,6 +2930,19 @@ def test_mean(self, request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.mean( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.mean(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.mean( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -2837,6 +2968,21 @@ def test_median(self, request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.median(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.median( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + + +class TestDFPartTwo: def test_melt(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -2948,6 +3094,19 @@ def test_min(self, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.min( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.min(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.min( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -3064,6 +3223,10 @@ def test_nunique(self, data, axis, dropna): pandas_result = pandas_df.nunique(axis=axis, dropna=dropna) df_equals(modin_result, pandas_result) + modin_result = modin_df.T.nunique(axis=axis, dropna=dropna) + pandas_result = pandas_df.T.nunique(axis=axis, dropna=dropna) + df_equals(modin_result, pandas_result) + def test_pct_change(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -3203,6 +3366,24 @@ def test_prod(self, request, data, axis, skipna, numeric_only, min_count): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.prod( + axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.prod( + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + else: + modin_result = modin_df.T.prod( + axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count + ) + df_equals(modin_result, pandas_result) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize( @@ -3258,6 +3439,22 @@ def test_quantile(self, request, data, q): with pytest.raises(ValueError): modin_df.quantile(q) + if not name_contains(request.node.name, no_numeric_dfs): + df_equals(modin_df.T.quantile(q), pandas_df.T.quantile(q)) + df_equals(modin_df.T.quantile(q, axis=1), pandas_df.T.quantile(q, axis=1)) + + try: + pandas_result = pandas_df.T.quantile(q, axis=1, numeric_only=False) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.quantile(q, axis=1, numeric_only=False) + else: + modin_result = modin_df.T.quantile(q, axis=1, numeric_only=False) + df_equals(modin_result, pandas_result) + else: + with pytest.raises(ValueError): + modin_df.T.quantile(q) + @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) @pytest.mark.parametrize("funcs", query_func_values, ids=query_func_keys) def test_query(self, data, funcs): @@ -3395,7 +3592,7 @@ def test_rename_sanity(self): ) # have to pass something - pytest.raises(TypeError, modin_df.rename) + pytest.raises(TypeError, modin_df.rename()) # partial columns renamed = test_data.frame.rename(columns={"C": "foo", "D": "bar"}) @@ -3901,6 +4098,19 @@ def test_skew(self, request, data, axis, skipna, numeric_only): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.skew(axis=axis, skipna=skipna, numeric_only=numeric_only) + else: + modin_result = modin_df.T.skew( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + df_equals(modin_result, pandas_result) + def test_slice_shift(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -4119,6 +4329,21 @@ def test_std(self, request, data, axis, skipna, numeric_only, ddof): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception as e: + with pytest.raises(type(e)): + modin_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + else: + modin_result = modin_df.T.std( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + df_equals(modin_result, pandas_result) + def test_style(self): data = test_data_values[0] with pytest.warns(UserWarning): @@ -4156,6 +4381,23 @@ def test_sum(self, request, data, axis, skipna, numeric_only, min_count): axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.sum( + axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.sum( + 
axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + else: + modin_result = modin_df.T.sum( + axis=axis, skipna=skipna, numeric_only=numeric_only, min_count=min_count + ) + df_equals(modin_result, pandas_result) def test_swapaxes(self): data = test_data_values[0] @@ -4201,13 +4443,6 @@ def test_take(self): with pytest.warns(UserWarning): df.take([0, 3]) - def test_to_datetime(self): - frame_data = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]} - modin_df = pd.DataFrame(frame_data) - pd_df = pandas.DataFrame(frame_data) - - df_equals(pd.to_datetime(modin_df), pandas.to_datetime(pd_df)) - @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_to_records(self, request, data): modin_df = pd.DataFrame(data) @@ -4369,6 +4604,21 @@ def test_var(self, request, data, axis, skipna, numeric_only, ddof): ) df_equals(modin_result, pandas_result) + try: + pandas_result = pandas_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + except Exception: + with pytest.raises(TypeError): + modin_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + else: + modin_result = modin_df.T.var( + axis=axis, skipna=skipna, numeric_only=numeric_only, ddof=ddof + ) + df_equals(modin_result, pandas_result) + def test_where(self): frame_data = random_state.randn(100, 10) pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij")) @@ -4407,15 +4657,6 @@ def test_xs(self): with pytest.warns(UserWarning): df.xs("mammal") - def test__doc__(self): - assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__ - assert pd.DataFrame.__init__ != pandas.DataFrame.__init__ - for attr, obj in pd.DataFrame.__dict__.items(): - if (callable(obj) or isinstance(obj, property)) and attr != "__init__": - pd_obj = getattr(pandas.DataFrame, attr, None) - if callable(pd_obj) or isinstance(pd_obj, property): - assert obj.__doc__ == pd_obj.__doc__ - @pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) def test___getitem__(self, request, data): modin_df = pd.DataFrame(data) @@ -4424,7 +4665,7 @@ def test___getitem__(self, request, data): if "empty_data" not in request.node.name: key = modin_df.columns[0] modin_col = modin_df.__getitem__(key) - assert isinstance(modin_col, (pandas.Series, SeriesView)) + assert isinstance(modin_col, pd.Series) pd_col = pandas_df[key] df_equals(pd_col, modin_col) @@ -4439,10 +4680,10 @@ def test___getattr__(self, request, data): col = modin_df.__getattr__(key) col = modin_df.__getattr__("col1") - assert isinstance(col, (pandas.Series, SeriesView)) + assert isinstance(col, pd.Series) col = getattr(modin_df, "col1") - assert isinstance(col, (pandas.Series, SeriesView)) + assert isinstance(col, pd.Series) # Check that lookup in column doesn't override other attributes df2 = modin_df.rename(index=str, columns={key: "columns"}) @@ -4524,13 +4765,18 @@ def test___neg__(self, request, data): modin_result = modin_df.__neg__() df_equals(modin_result, pandas_result) - def test___invert__(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - try: - pd.DataFrame(data).__invert__() - except TypeError: - pass + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test___invert__(self, data): + modin_df = pd.DataFrame(data) + pandas_df = pandas.DataFrame(data) + try: + pandas_result = ~pandas_df + except Exception as e: + with pytest.raises(type(e)): + repr(~modin_df) + else: + modin_result = ~modin_df + df_equals(modin_result, pandas_result) def test___hash__(self): data = test_data_values[0] diff --git a/modin/pandas/test/test_general.py b/modin/pandas/test/test_general.py index 677f7ff95eb..49d6685b924 100644 --- a/modin/pandas/test/test_general.py +++ b/modin/pandas/test/test_general.py @@ -13,13 +13,13 @@ def test_isna(data): pandas_result = pandas.isna(pandas_df) modin_result = pd.isna(modin_df) - df_equals(modin_result, pandas_result) modin_result = pd.isna(pd.Series([1, np.nan, 
2])) pandas_result = pandas.isna(pandas.Series([1, np.nan, 2])) + df_equals(modin_result, pandas_result) - assert modin_result.equals(pandas_result) + assert pd.isna(np.nan) == pandas.isna(np.nan) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -29,13 +29,13 @@ def test_isnull(data): pandas_result = pandas.isnull(pandas_df) modin_result = pd.isnull(modin_df) - df_equals(modin_result, pandas_result) modin_result = pd.isnull(pd.Series([1, np.nan, 2])) pandas_result = pandas.isnull(pandas.Series([1, np.nan, 2])) + df_equals(modin_result, pandas_result) - assert modin_result.equals(pandas_result) + assert pd.isna(np.nan) == pandas.isna(np.nan) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -45,13 +45,13 @@ def test_notna(data): pandas_result = pandas.notna(pandas_df) modin_result = pd.notna(modin_df) - df_equals(modin_result, pandas_result) modin_result = pd.notna(pd.Series([1, np.nan, 2])) pandas_result = pandas.notna(pandas.Series([1, np.nan, 2])) + df_equals(modin_result, pandas_result) - assert modin_result.equals(pandas_result) + assert pd.isna(np.nan) == pandas.isna(np.nan) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -61,13 +61,13 @@ def test_notnull(data): pandas_result = pandas.notnull(pandas_df) modin_result = pd.notnull(modin_df) - df_equals(modin_result, pandas_result) modin_result = pd.notnull(pd.Series([1, np.nan, 2])) pandas_result = pandas.notnull(pandas.Series([1, np.nan, 2])) + df_equals(modin_result, pandas_result) - assert modin_result.equals(pandas_result) + assert pd.isna(np.nan) == pandas.isna(np.nan) def test_merge(): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index ddc7e06aba8..aed3883b791 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3,2327 +3,2622 @@ from __future__ import print_function import pytest +import numpy as np +import pandas +import matplotlib import modin.pandas as pd 
+from numpy.testing import assert_array_equal +import sys + +from modin.pandas.utils import to_pandas +from .utils import ( + random_state, + RAND_LOW, + RAND_HIGH, + df_equals, + arg_keys, + name_contains, + test_data_values, + test_data_keys, + numeric_dfs, + no_numeric_dfs, + agg_func_keys, + agg_func_values, + numeric_agg_funcs, + quantiles_keys, + quantiles_values, + bool_arg_keys, + bool_arg_values, + int_arg_keys, + int_arg_values, +) pd.DEFAULT_NPARTITIONS = 4 +# Force matplotlib to not use any Xwindows backend. +matplotlib.use("Agg") + +if sys.version_info[0] < 3: + PY2 = True +else: + PY2 = False + + +def inter_df_math_helper(modin_series, pandas_series, op): + try: + pandas_result = getattr(pandas_series, op)(4) + except Exception as e: + with pytest.raises(type(e)): + repr(getattr(modin_series, op)(4)) # repr to force materialization + else: + modin_result = getattr(modin_series, op)(4) + df_equals(modin_result, pandas_result) + + try: + pandas_result = getattr(pandas_series, op)(4.0) + except Exception as e: + with pytest.raises(type(e)): + repr(getattr(modin_series, op)(4.0)) # repr to force materialization + else: + modin_result = getattr(modin_series, op)(4.0) + df_equals(modin_result, pandas_result) + + # These operations don't support non-scalar `other` or have a strange behavior in + # the testing environment + if op in [ + "__divmod__", + "divmod", + "rdivmod", + "floordiv", + "__floordiv__", + "rfloordiv", + "__rfloordiv__", + "mod", + "__mod__", + "rmod", + "__rmod__", + ]: + return + + try: + pandas_result = getattr(pandas_series, op)(pandas_series) + except Exception as e: + with pytest.raises(type(e)): + repr( + getattr(modin_series, op)(modin_series) + ) # repr to force materialization + else: + modin_result = getattr(modin_series, op)(modin_series) + df_equals(modin_result, pandas_result) + + list_test = random_state.randint(RAND_LOW, RAND_HIGH, size=(modin_series.shape[0])) + try: + pandas_result = getattr(pandas_series, op)(list_test) + 
except Exception as e: + with pytest.raises(type(e)): + repr(getattr(modin_series, op)(list_test)) # repr to force materialization + else: + modin_result = getattr(modin_series, op)(list_test) + df_equals(modin_result, pandas_result) + + series_test_modin = pd.Series(list_test, index=modin_series.index) + series_test_pandas = pandas.Series(list_test, index=pandas_series.index) + try: + pandas_result = getattr(pandas_series, op)(series_test_pandas) + except Exception as e: + with pytest.raises(type(e)): + repr( + getattr(modin_series, op)(series_test_modin) + ) # repr to force materialization + else: + modin_result = getattr(modin_series, op)(series_test_modin) + df_equals(modin_result, pandas_result) + + # Level test + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_series.index] + ) + modin_df_multi_level = modin_series.copy() + modin_df_multi_level.index = new_idx + + try: + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, level=1) + except TypeError: + # Some operations don't support multilevel `level` parameter + pass + + +def create_test_series(dict_vals): + modin_series = pd.Series(dict_vals[next(iter(dict_vals.keys()))]) + pandas_series = pandas.Series(dict_vals[next(iter(dict_vals.keys()))]) + return modin_series, pandas_series + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_T(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.T, pandas_series.T) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___abs__(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.__abs__(), pandas_series.__abs__()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___add__(data): + modin_series, pandas_series = create_test_series(data) + 
inter_df_math_helper(modin_series, pandas_series, "__add__") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___and__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__and__") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___array__(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_result = modin_series.__array__() + assert_array_equal(modin_result, pandas_series.__array__()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___bool__(data): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.__bool__() + except Exception as e: + with pytest.raises(type(e)): + modin_series.__bool__() + else: + modin_result = modin_series.__bool__() + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___contains__(request, data): + modin_series, pandas_series = create_test_series(data) + + result = False + key = "Not Exist" + assert result == modin_series.__contains__(key) + assert result == (key in modin_series) + + if "empty_data" not in request.node.name: + result = True + key = pandas_series.keys()[0] + assert result == modin_series.__contains__(key) + assert result == (key in modin_series) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___copy__(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.copy(), modin_series) + df_equals(modin_series.copy(), pandas_series.copy()) + df_equals(modin_series.copy(), pandas_series) -@pytest.fixture -def create_test_series(): - return pd.Series(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___deepcopy__(data): + modin_series, pandas_series = create_test_series(data) + 
df_equals(modin_series.__deepcopy__(), modin_series) + df_equals(modin_series.__deepcopy__(), pandas_series.__deepcopy__()) + df_equals(modin_series.__deepcopy__(), pandas_series) -@pytest.mark.skip(reason="Using pandas Series.") -def test_T(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.T +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___delitem__(data): + modin_series, pandas_series = create_test_series(data) + del modin_series[modin_series.index[0]] + del pandas_series[pandas_series.index[0]] + df_equals(modin_series, pandas_series) -@pytest.mark.skip(reason="Using pandas Series.") -def test___abs__(): - ray_series = create_test_series() + del modin_series[modin_series.index[-1]] + del pandas_series[pandas_series.index[-1]] + df_equals(modin_series, pandas_series) - with pytest.raises(NotImplementedError): - ray_series.__abs__() + del modin_series[modin_series.index[0]] + del pandas_series[pandas_series.index[0]] + df_equals(modin_series, pandas_series) -@pytest.mark.skip(reason="Using pandas Series.") -def test___add__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___div__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__div__") - with pytest.raises(NotImplementedError): - ray_series.__add__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_divmod(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "divmod") -@pytest.mark.skip(reason="Using pandas Series.") -def test___and__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__and__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rdivmod(data): + modin_series, pandas_series = 
create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rdivmod") -@pytest.mark.skip(reason="Using pandas Series.") -def test___array__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___eq__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__eq__") - with pytest.raises(NotImplementedError): - ray_series.__array__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___floordiv__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__floordiv__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___array_prepare__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__array_prepare__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___ge__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__ge__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___array_priority__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___getitem__(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series[0], pandas_series[0]) + df_equals( + modin_series[modin_series.index[-1]], pandas_series[pandas_series.index[-1]] + ) - with pytest.raises(NotImplementedError): - ray_series.__array_priority__ +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___gt__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__gt__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___array_wrap__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - 
ray_series.__array_wrap__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___int__(data): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = int(pandas_series[0]) + except Exception as e: + with pytest.raises(type(e)): + int(modin_series[0]) + else: + assert int(modin_series[0]) == pandas_result -@pytest.mark.skip(reason="Using pandas Series.") -def test___bool__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___invert__(data): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.__invert__() + except Exception as e: + with pytest.raises(type(e)): + repr(modin_series.__invert__()) + else: + df_equals(modin_series.__invert__(), pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__bool__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___iter__(data): + modin_series, pandas_series = create_test_series(data) + for m, p in zip(modin_series.__iter__(), pandas_series.__iter__()): + np.testing.assert_equal(m, p) -@pytest.mark.skip(reason="Using pandas Series.") -def test___bytes__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__bytes__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___le__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__le__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___class__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___len__(data): + modin_series, pandas_series = create_test_series(data) + assert len(modin_series) == len(pandas_series) - with pytest.raises(NotImplementedError): - ray_series.__class__(None, None, None, None, None) +@pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) +def test___long__(data): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series[0].__long__() + except Exception as e: + with pytest.raises(type(e)): + modin_series[0].__long__() + else: + assert modin_series[0].__long__() == pandas_result -@pytest.mark.skip(reason="Using pandas Series.") -def test___contains__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__contains__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___lt__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__lt__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___copy__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___mod__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__mod__") - with pytest.raises(NotImplementedError): - ray_series.__copy__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___mul__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__mul__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___deepcopy__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__deepcopy__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___ne__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__ne__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___delitem__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___neg__(request, data): + modin_series, pandas_series = 
create_test_series(data) + try: + pandas_result = pandas_series.__neg__() + except Exception as e: + with pytest.raises(type(e)): + repr(modin_series.__neg__()) + else: + df_equals(modin_series.__neg__(), pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__delitem__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___or__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__or__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___div__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__div__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___pow__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__pow__") -@pytest.mark.skip(reason="Using pandas Series.") -def test___divmod__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___repr__(data): + modin_series, pandas_series = create_test_series(data) + assert repr(modin_series) == repr(pandas_series) - with pytest.raises(NotImplementedError): - ray_series.__divmod__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___round__(data): + modin_series, pandas_series = create_test_series(data) + if not PY2: + df_equals(round(modin_series), round(pandas_series)) -@pytest.mark.skip(reason="Using pandas Series.") -def test___doc__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__doc__ +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___setitem__(data): + modin_series, pandas_series = create_test_series(data) + for key in modin_series.keys(): + modin_series[key] = 0 + pandas_series[key] = 0 + df_equals(modin_series, pandas_series) 
-@pytest.mark.skip(reason="Using pandas Series.") -def test___eq__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___sizeof__(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.__sizeof__() - with pytest.raises(NotImplementedError): - ray_series.__eq__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___str__(data): + modin_series, pandas_series = create_test_series(data) + assert str(modin_series) == str(pandas_series) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___sub__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__sub__") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___truediv__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__truediv__") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test___xor__(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "__xor__") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_abs(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.abs(), pandas_series.abs()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_add(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "add") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_add_prefix(data): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.add_prefix("PREFIX_ADD_"), pandas_series.add_prefix("PREFIX_ADD_") + ) + + +@pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) +def test_add_suffix(data): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.add_suffix("SUFFIX_ADD_"), pandas_series.add_suffix("SUFFIX_ADD_") + ) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_agg(data, func): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.agg(func) + except Exception as e: + with pytest.raises(type(e)): + modin_series.agg(func) + else: + modin_result = modin_series.agg(func) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_agg_numeric(request, data, func): + if name_contains(request.node.name, numeric_agg_funcs) and name_contains( + request.node.name, numeric_dfs + ): + axis = 0 + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.agg(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_series.agg(func, axis) + else: + modin_result = modin_series.agg(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_aggregate(request, data, func): + axis = 0 + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.aggregate(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_series.aggregate(func, axis) + else: + modin_result = modin_series.aggregate(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_aggregate_numeric(request, data, func): + if 
name_contains(request.node.name, numeric_agg_funcs) and name_contains( + request.node.name, numeric_dfs + ): + axis = 0 + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.agg(func, axis) + except Exception as e: + with pytest.raises(type(e)): + modin_series.agg(func, axis) + else: + modin_result = modin_series.agg(func, axis) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_aggregate_error_checking(data): + modin_series, _ = create_test_series(data) # noqa: F841 + + assert modin_series.aggregate("ndim") == 1 + with pytest.warns(UserWarning): + modin_series.aggregate("cumproduct") + with pytest.raises(ValueError): + modin_series.aggregate("NOT_EXISTS") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_align(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.align(modin_series) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_all(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_any(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_append(data): + modin_series, pandas_series = create_test_series(data) + + data_to_append = {"append_a": 2, "append_b": 1000} + + ignore_idx_values = [True, False] + + for ignore in ignore_idx_values: + try: + pandas_result = 
pandas_series.append(data_to_append, ignore_index=ignore) + except Exception as e: + with pytest.raises(type(e)): + modin_series.append(data_to_append, ignore_index=ignore) + else: + modin_result = modin_series.append(data_to_append, ignore_index=ignore) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_series.append(pandas_series.iloc[-1]) + except Exception as e: + with pytest.raises(type(e)): + modin_series.append(modin_series.iloc[-1]) + else: + modin_result = modin_series.append(modin_series.iloc[-1]) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_series.append([pandas_series.iloc[-1]]) + except Exception as e: + with pytest.raises(type(e)): + modin_series.append([modin_series.iloc[-1]]) + else: + modin_result = modin_series.append([modin_series.iloc[-1]]) + df_equals(modin_result, pandas_result) + + verify_integrity_values = [True, False] + + for verify_integrity in verify_integrity_values: + try: + pandas_result = pandas_series.append( + [pandas_series, pandas_series], verify_integrity=verify_integrity + ) + except Exception as e: + with pytest.raises(type(e)): + modin_series.append( + [modin_series, modin_series], verify_integrity=verify_integrity + ) + else: + modin_result = modin_series.append( + [modin_series, modin_series], verify_integrity=verify_integrity + ) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_series.append( + pandas_series, verify_integrity=verify_integrity + ) + except Exception as e: + with pytest.raises(type(e)): + modin_series.append(modin_series, verify_integrity=verify_integrity) + else: + modin_result = modin_series.append( + modin_series, verify_integrity=verify_integrity + ) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_apply(request, data, func): + modin_series, pandas_series = 
create_test_series(data) + + try: + pandas_result = pandas_series.apply(func) + except Exception as e: + with pytest.raises(type(e)): + modin_series.apply(func) + else: + modin_result = modin_series.apply(func) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_apply_numeric(request, data, func): + modin_series, pandas_series = create_test_series(data) + + if name_contains(request.node.name, numeric_dfs): + try: + pandas_result = pandas_series.apply(func) + except Exception as e: + with pytest.raises(type(e)): + modin_series.apply(func) + else: + modin_result = modin_series.apply(func) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_argmax(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.argmax(skipna=skipna), pandas_series.argmax(skipna=skipna)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_argmin(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.argmin(skipna=skipna), pandas_series.argmin(skipna=skipna)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_argsort(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_result = modin_series.argsort() + df_equals(modin_result, pandas_series.argsort()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_as_blocks(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.as_blocks() + + 
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_as_matrix(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.as_matrix() -@pytest.mark.skip(reason="Using pandas Series.") -def test___finalize__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__finalize__(None, None) +def test_asfreq(): + index = pd.date_range("1/1/2000", periods=4, freq="T") + series = pd.Series([0.0, None, 2.0, 3.0], index=index) + with pytest.warns(UserWarning): + # We are only testing that this defaults to pandas, so we will just check for + # the warning + series.asfreq(freq="30S") -@pytest.mark.skip(reason="Using pandas Series.") -def test___float__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_asobject(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + _ = modin_series.asobject - with pytest.raises(NotImplementedError): - ray_series.__float__() +def test_asof(): + series = pd.Series( + [10, 20, 30, 40, 50], + index=pd.DatetimeIndex( + [ + "2018-02-27 09:01:00", + "2018-02-27 09:02:00", + "2018-02-27 09:03:00", + "2018-02-27 09:04:00", + "2018-02-27 09:05:00", + ] + ), + ) + with pytest.warns(UserWarning): + series.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"])) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_astype(data): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.astype(str) + except Exception as e: + with pytest.raises(type(e)): + repr(modin_series.astype(str)) # repr to force materialization + else: + df_equals(modin_series.astype(str), pandas_result) + + try: + pandas_result = pandas_series.astype(np.int64) + except Exception as e: + with pytest.raises(type(e)): + repr(modin_series.astype(np.int64)) # repr to 
force materialization + else: + df_equals(modin_series.astype(np.int64), pandas_result) + + try: + pandas_result = pandas_series.astype(np.float64) + except Exception as e: + with pytest.raises(type(e)): + repr(modin_series.astype(np.float64)) # repr to force materialization + else: + df_equals(modin_series.astype(np.float64), pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_at(data): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.at[modin_series.index[0]], pandas_series.at[pandas_series.index[0]] + ) + df_equals( + modin_series.at[modin_series.index[-1]], pandas_series[pandas_series.index[-1]] + ) -@pytest.mark.skip(reason="Using pandas Series.") -def test___floordiv__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__floordiv__(None, None) +def test_at_time(): + i = pd.date_range("2018-04-09", periods=4, freq="12H") + ts = pd.Series([1, 2, 3, 4], index=i) + with pytest.warns(UserWarning): + ts.at_time("12:00") -@pytest.mark.skip(reason="Using pandas Series.") -def test___ge__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_autocorr(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.autocorr() - with pytest.raises(NotImplementedError): - ray_series.__ge__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_axes(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.axes[0].equals(pandas_series.axes[0]) + assert len(modin_series.axes) == len(pandas_series.axes) -@pytest.mark.skip(reason="Using pandas Series.") -def test___getitem__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__getitem__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def 
test_base(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + _ = modin_series.base @pytest.mark.skip(reason="Using pandas Series.") -def test___getstate__(): - ray_series = create_test_series() +def test_between(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.__getstate__() + modin_series.between(None, None) -@pytest.mark.skip(reason="Using pandas Series.") -def test___gt__(): - ray_series = create_test_series() +def test_between_time(): + i = pd.date_range("2018-04-09", periods=4, freq="12H") + ts = pd.Series([1, 2, 3, 4], index=i) + with pytest.warns(UserWarning): + ts.between_time("0:15", "0:45") - with pytest.raises(NotImplementedError): - ray_series.__gt__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_bfill(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.bfill(), pandas_series.bfill()) + # inplace + modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + modin_series_cp.bfill(inplace=True) + pandas_series_cp.bfill(inplace=True) + df_equals(modin_series_cp, pandas_series_cp) -@pytest.mark.skip(reason="Using pandas Series.") -def test___iadd__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__iadd__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_blocks(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + _ = modin_series.blocks -@pytest.mark.skip(reason="Using pandas Series.") -def test___imul__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_bool(data): + modin_series, pandas_series = create_test_series(data) - with pytest.raises(NotImplementedError): - ray_series.__imul__(None) + with pytest.raises(ValueError): + modin_series.bool() + with 
pytest.raises(ValueError): + modin_series.__bool__() -@pytest.mark.skip(reason="Using pandas Series.") -def test___int__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_clip(request, data): + modin_series, pandas_series = create_test_series(data) - with pytest.raises(NotImplementedError): - ray_series.__int__() + if name_contains(request.node.name, numeric_dfs): + # set bounds + lower, upper = np.sort(random_state.random_integers(RAND_LOW, RAND_HIGH, 2)) + # test only upper scalar bound + modin_result = modin_series.clip(None, upper) + pandas_result = pandas_series.clip(None, upper) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test___invert__(): - ray_series = create_test_series() + # test lower and upper scalar bound + modin_result = modin_series.clip(lower, upper) + pandas_result = pandas_series.clip(lower, upper) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__invert__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_clip_lower(request, data): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas Series.") -def test___ipow__(): - ray_series = create_test_series() + if name_contains(request.node.name, numeric_dfs): + # set bounds + lower = random_state.random_integers(RAND_LOW, RAND_HIGH, 1)[0] - with pytest.raises(NotImplementedError): - ray_series.__ipow__(None) + # test lower scalar bound + pandas_result = pandas_series.clip_lower(lower) + modin_result = modin_series.clip_lower(lower) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test___isub__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_clip_upper(request, data): + modin_series, pandas_series = create_test_series(data) + + if 
name_contains(request.node.name, numeric_dfs): + # set bounds + upper = random_state.random_integers(RAND_LOW, RAND_HIGH, 1)[0] + + # test upper scalar bound + modin_result = modin_series.clip_upper(upper) + pandas_result = pandas_series.clip_upper(upper) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_combine(data): + modin_series, _ = create_test_series(data) # noqa: F841 + modin_series2 = modin_series % (max(modin_series) // 2) + with pytest.warns(UserWarning): + modin_series.combine(modin_series2, lambda s1, s2: s1 if s1 < s2 else s2) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_combine_first(data): + modin_series, _ = create_test_series(data) # noqa: F841 + modin_series2 = modin_series % (max(modin_series) // 2) + with pytest.warns(UserWarning): + modin_series.combine_first(modin_series2) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_compound(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.compound() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_compress(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.compress(modin_series > 30) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_convert_objects(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.convert_objects() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_copy(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series, modin_series.copy()) + df_equals(modin_series.copy(), pandas_series) + df_equals(modin_series.copy(), pandas_series.copy()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
+def test_corr(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.corr(modin_series) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_count(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.count(), pandas_series.count()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_cov(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.cov(modin_series) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cummax(data, skipna): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.cummax(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.cummax(skipna=skipna) + else: + df_equals(modin_series.cummax(skipna=skipna), pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cummin(data, skipna): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.cummin(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.cummin(skipna=skipna) + else: + df_equals(modin_series.cummin(skipna=skipna), pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cumprod(data, skipna): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.cumprod(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.cumprod(skipna=skipna) + else: + 
df_equals(modin_series.cumprod(skipna=skipna), pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_cumsum(data, skipna): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.cumsum(skipna=skipna) + except Exception as e: + with pytest.raises(type(e)): + modin_series.cumsum(skipna=skipna) + else: + df_equals(modin_series.cumsum(skipna=skipna), pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_data(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + _ = modin_series.data + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_describe(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.describe(), pandas_series.describe()) + percentiles = [0.10, 0.11, 0.44, 0.78, 0.99] + df_equals( + modin_series.describe(percentiles=percentiles), + pandas_series.describe(percentiles=percentiles), + ) + + try: + pandas_result = pandas_series.describe(exclude=[np.float64]) + except Exception as e: + with pytest.raises(type(e)): + modin_series.describe(exclude=[np.float64]) + else: + modin_result = modin_series.describe(exclude=[np.float64]) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_series.describe(exclude=np.float64) + except Exception as e: + with pytest.raises(type(e)): + modin_series.describe(exclude=np.float64) + else: + modin_result = modin_series.describe(exclude=np.float64) + df_equals(modin_result, pandas_result) + + try: + pandas_result = pandas_series.describe( + include=[np.timedelta64, np.datetime64, np.object, np.bool] + ) + except Exception as e: + with pytest.raises(type(e)): + modin_series.describe( + include=[np.timedelta64, np.datetime64, np.object, np.bool] + ) + else: + modin_result 
= modin_series.describe( + include=[np.timedelta64, np.datetime64, np.object, np.bool] + ) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__isub__(None) + modin_result = modin_series.describe(include=str(modin_series.dtypes)) + pandas_result = pandas_series.describe(include=str(pandas_series.dtypes)) + df_equals(modin_result, pandas_result) + modin_result = modin_series.describe(include=[np.number]) + pandas_result = pandas_series.describe(include=[np.number]) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test___iter__(): - ray_series = create_test_series() + df_equals( + modin_series.describe(include="all"), pandas_series.describe(include="all") + ) - with pytest.raises(NotImplementedError): - ray_series.__iter__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "periods", int_arg_values, ids=arg_keys("periods", int_arg_keys) +) +def test_diff(data, periods): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas Series.") -def test___itruediv__(): - ray_series = create_test_series() + try: + pandas_result = pandas_series.diff(periods=periods) + except Exception as e: + with pytest.raises(type(e)): + modin_series.diff(periods=periods) + else: + modin_result = modin_series.diff(periods=periods) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__itruediv__(None) + try: + pandas_result = pandas_series.T.diff(periods=periods) + except Exception as e: + with pytest.raises(type(e)): + modin_series.T.diff(periods=periods) + else: + modin_result = modin_series.T.diff(periods=periods) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test___le__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_div(data): + 
modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "div") - with pytest.raises(NotImplementedError): - ray_series.__le__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_divide(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "divide") -@pytest.mark.skip(reason="Using pandas Series.") -def test___len__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__len__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dot(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.dot(modin_series) @pytest.mark.skip(reason="Using pandas Series.") -def test___long__(): - ray_series = create_test_series() +def test_drop(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.__long__() + modin_series.drop(None, None, None, None) -@pytest.mark.skip(reason="Using pandas Series.") -def test___lt__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_drop_duplicates(data): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.drop_duplicates(keep="first", inplace=False), + pandas_series.drop_duplicates(keep="first", inplace=False), + ) + df_equals( + modin_series.drop_duplicates(keep="last", inplace=False), + pandas_series.drop_duplicates(keep="last", inplace=False), + ) + df_equals( + modin_series.drop_duplicates(keep=False, inplace=False), + pandas_series.drop_duplicates(keep=False, inplace=False), + ) + df_equals( + modin_series.drop_duplicates(inplace=False), + pandas_series.drop_duplicates(inplace=False), + ) + modin_series.drop_duplicates(inplace=True) + df_equals(modin_series, pandas_series.drop_duplicates(inplace=False)) - with 
pytest.raises(NotImplementedError): - ray_series.__lt__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"]) +def test_dropna(data, how): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas Series.") -def test___mod__(): - ray_series = create_test_series() + with pytest.raises(TypeError): + modin_series.dropna(how=None, thresh=None) - with pytest.raises(NotImplementedError): - ray_series.__mod__(None, None) + modin_result = modin_series.dropna(how=how) + pandas_result = pandas_series.dropna(how=how) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test___mul__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dropna_inplace(data): + modin_series, pandas_series = create_test_series(data) + pandas_result = pandas_series.dropna() + modin_series.dropna(inplace=True) + df_equals(modin_series, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.__mul__(None, None) + modin_series, pandas_series = create_test_series(data) + with pytest.raises(TypeError): + modin_series.dropna(thresh=2, inplace=True) + modin_series, pandas_series = create_test_series(data) + pandas_series.dropna(how="any", inplace=True) + modin_series.dropna(how="any", inplace=True) + df_equals(modin_series, pandas_series) -@pytest.mark.skip(reason="Using pandas Series.") -def test___ne__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__ne__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_dtype(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.dtype, modin_series.dtypes) + df_equals(modin_series.dtype, pandas_series.dtype) + df_equals(modin_series.dtype, pandas_series.dtypes) -@pytest.mark.skip(reason="Using 
pandas Series.") -def test___neg__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("keep", ["last", "first"], ids=["last", "first"]) +def test_duplicated(data, keep): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_result = modin_series.duplicated(keep=keep) + df_equals(modin_result, pandas_series.duplicated(keep=keep)) - with pytest.raises(NotImplementedError): - ray_series.__neg__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_empty(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.empty == pandas_series.empty -@pytest.mark.skip(reason="Using pandas Series.") -def test___nonzero__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__nonzero__() +def test_empty_series(): + modin_series = pd.Series() + assert modin_series.empty -@pytest.mark.skip(reason="Using pandas Series.") -def test___or__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_eq(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "eq") - with pytest.raises(NotImplementedError): - ray_series.__or__(None) +def test_equals(): + series_data = [2.9, 3, 3, 3] + modin_df1 = pd.Series(series_data) + modin_df2 = pd.Series(series_data) -@pytest.mark.skip(reason="Using pandas Series.") -def test___pow__(): - ray_series = create_test_series() + assert modin_df1.equals(modin_df2) + assert modin_df1.equals(pd.Series(modin_df1)) + df_equals(modin_df1, modin_df2) + df_equals(modin_df1, pd.Series(modin_df1)) - with pytest.raises(NotImplementedError): - ray_series.__pow__(None, None) + series_data = [2, 3, 5, 1] + modin_df3 = pd.Series(series_data, index=list("abcd")) + assert not modin_df1.equals(modin_df3) 
-@pytest.mark.skip(reason="Using pandas Series.") -def test___repr__(): - ray_series = create_test_series() + with pytest.raises(AssertionError): + df_equals(modin_df3, modin_df1) - with pytest.raises(NotImplementedError): - ray_series.__repr__() + with pytest.raises(AssertionError): + df_equals(modin_df3, modin_df2) -@pytest.mark.skip(reason="Using pandas Series.") -def test___round__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ewm(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.ewm(halflife=6) - with pytest.raises(NotImplementedError): - ray_series.__round__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_expanding(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.expanding() -@pytest.mark.skip(reason="Using pandas Series.") -def test___setitem__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__setitem__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_factorize(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.factorize() -@pytest.mark.skip(reason="Using pandas Series.") -def test___setstate__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ffill(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.ffill(), pandas_series.ffill()) + # inplace + modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + modin_series_cp.ffill(inplace=True) + pandas_series_cp.ffill(inplace=True) + df_equals(modin_series_cp, pandas_series_cp) - with pytest.raises(NotImplementedError): - ray_series.__setstate__(None) + +@pytest.mark.parametrize("data", 
test_data_values, ids=test_data_keys) +def test_fillna(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.fillna(0), pandas_series.fillna(0)) + df_equals(modin_series.fillna(method="bfill"), pandas_series.fillna(method="bfill")) + df_equals(modin_series.fillna(method="ffill"), pandas_series.fillna(method="ffill")) + df_equals(modin_series.fillna(0, limit=1), pandas_series.fillna(0, limit=1)) @pytest.mark.skip(reason="Using pandas Series.") -def test___sizeof__(): - ray_series = create_test_series() +def test_filter(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.__sizeof__() + modin_series.filter(None, None, None) -@pytest.mark.skip(reason="Using pandas Series.") -def test___str__(): - ray_series = create_test_series() +def test_first(): + i = pd.date_range("2018-04-09", periods=4, freq="2D") + ts = pd.Series([1, 2, 3, 4], index=i) + with pytest.warns(UserWarning): + ts.first("3D") - with pytest.raises(NotImplementedError): - ray_series.__str__() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_first_valid_index(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.first_valid_index(), pandas_series.first_valid_index()) -@pytest.mark.skip(reason="Using pandas Series.") -def test___sub__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__sub__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_flags(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + _ = modin_series.flags -@pytest.mark.skip(reason="Using pandas Series.") -def test___truediv__(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_floordiv(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, 
pandas_series, "floordiv") - with pytest.raises(NotImplementedError): - ray_series.__truediv__(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ftype(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.ftype, modin_series.ftypes) + df_equals(modin_series.ftype, pandas_series.ftype) + df_equals(modin_series.ftype, pandas_series.ftypes) -@pytest.mark.skip(reason="Using pandas Series.") -def test___xor__(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.__xor__(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ge(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "ge") -@pytest.mark.skip(reason="Using pandas Series.") -def test_abs(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_get(data): + modin_series, pandas_series = create_test_series(data) + for key in modin_series.keys(): + df_equals(modin_series.get(key), pandas_series.get(key)) + df_equals( + modin_series.get("NO_EXIST", "DEFAULT"), + pandas_series.get("NO_EXIST", "DEFAULT"), + ) - with pytest.raises(NotImplementedError): - ray_series.abs() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_get_dtype_counts(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.get_dtype_counts(), pandas_series.get_dtype_counts()) -@pytest.mark.skip(reason="Using pandas Series.") -def test_add(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.add(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_get_ftype_counts(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.get_ftype_counts(), pandas_series.get_ftype_counts()) 
-@pytest.mark.skip(reason="Using pandas Series.") -def test_add_prefix(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_get_value(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.get_value(0) - with pytest.raises(NotImplementedError): - ray_series.add_prefix(None) + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_get_values(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.get_values() @pytest.mark.skip(reason="Using pandas Series.") -def test_add_suffix(): - ray_series = create_test_series() +def test_groupby(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.add_suffix(None) - + modin_series.groupby(None, None, None, None, None, None, None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_agg(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.agg(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_gt(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "gt") -@pytest.mark.skip(reason="Using pandas Series.") -def test_aggregate(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_hasnans(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.hasnans == pandas_series.hasnans - with pytest.raises(NotImplementedError): - ray_series.aggregate(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) +def test_head(data, n): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas 
Series.") -def test_align(): - ray_series = create_test_series() + df_equals(modin_series.head(n), pandas_series.head(n)) + df_equals( + modin_series.head(len(modin_series)), pandas_series.head(len(pandas_series)) + ) - with pytest.raises(NotImplementedError): - ray_series.align(None, None, None, None, None, None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_hist(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.hist(None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_all(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.all(None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iat(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.iat[0], pandas_series.iat[0]) -@pytest.mark.skip(reason="Using pandas Series.") -def test_any(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_idxmax(data, skipna): + modin_series, pandas_series = create_test_series(data) + pandas_result = pandas_series.idxmax(skipna=skipna) + modin_result = modin_series.idxmax(skipna=skipna) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.any(None, None, None, None) + pandas_result = pandas_series.T.idxmax(skipna=skipna) + modin_result = modin_series.T.idxmax(skipna=skipna) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test_append(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_idxmin(data, skipna): + 
modin_series, pandas_series = create_test_series(data) + pandas_result = pandas_series.idxmin(skipna=skipna) + modin_result = modin_series.idxmin(skipna=skipna) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.append(None, None) + pandas_result = pandas_series.T.idxmin(skipna=skipna) + modin_result = modin_series.T.idxmin(skipna=skipna) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test_apply(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iloc(request, data): + modin_series, pandas_series = create_test_series(data) - with pytest.raises(NotImplementedError): - ray_series.apply(None, None, None) + if not name_contains(request.node.name, ["empty_data"]): + # Scaler + np.testing.assert_equal(modin_series.iloc[0], pandas_series.iloc[0]) + # Series + df_equals(modin_series.iloc[1:], pandas_series.iloc[1:]) + df_equals(modin_series.iloc[1:2], pandas_series.iloc[1:2]) + df_equals(modin_series.iloc[[1, 2]], pandas_series.iloc[[1, 2]]) -@pytest.mark.skip(reason="Using pandas Series.") -def test_argmax(): - ray_series = create_test_series() + # Write Item + modin_series.iloc[[1, 2]] = 42 + pandas_series.iloc[[1, 2]] = 42 + df_equals(modin_series, pandas_series) - with pytest.raises(NotImplementedError): - ray_series.argmax(None, None, None) + with pytest.raises(IndexError): + modin_series.iloc[1:, 1] + else: + with pytest.raises(IndexError): + modin_series.iloc[0] -@pytest.mark.skip(reason="Using pandas Series.") -def test_argmin(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_imag(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.imag - with pytest.raises(NotImplementedError): - ray_series.argmin(None, None, None) +@pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) +def test_index(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.index, pandas_series.index) + with pytest.raises(ValueError): + modin_series.index = list(modin_series.index) + [999] -@pytest.mark.skip(reason="Using pandas Series.") -def test_argsort(): - ray_series = create_test_series() + modin_series.index = modin_series.index.map(str) + pandas_series.index = pandas_series.index.map(str) + df_equals(modin_series.index, pandas_series.index) - with pytest.raises(NotImplementedError): - ray_series.argsort(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_interpolate(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.interpolate() -@pytest.mark.skip(reason="Using pandas Series.") -def test_as_blocks(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.as_blocks(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_is_copy(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(FutureWarning): + assert modin_series.is_copy is pandas_series.is_copy + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_is_monotonic(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert modin_series.is_monotonic == pandas_series.is_monotonic + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_is_monotonic_decreasing(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert ( + modin_series.is_monotonic_decreasing + == pandas_series.is_monotonic_decreasing + ) -@pytest.mark.skip(reason="Using pandas Series.") -def test_as_matrix(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def 
test_is_monotonic_increasing(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert ( + modin_series.is_monotonic_increasing + == pandas_series.is_monotonic_increasing + ) - with pytest.raises(NotImplementedError): - ray_series.as_matrix(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_is_unique(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert modin_series.is_unique == pandas_series.is_unique -@pytest.mark.skip(reason="Using pandas Series.") -def test_asfreq(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.asfreq(None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_isin(data): + modin_series, pandas_series = create_test_series(data) + val = [1, 2, 3, 4] + pandas_result = pandas_series.isin(val) + modin_result = modin_series.isin(val) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test_asobject(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_isnull(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.isnull(), pandas_series.isnull()) - with pytest.raises(NotImplementedError): - ray_series.asobject +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_items(data): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas Series.") -def test_asof(): - ray_series = create_test_series() + modin_items = modin_series.items() + pandas_items = pandas_series.items() + for modin_item, pandas_item in zip(modin_items, pandas_items): + modin_index, modin_scalar = modin_item + pandas_index, pandas_scalar = pandas_item + df_equals(modin_scalar, pandas_scalar) + assert pandas_index == modin_index - with 
pytest.raises(NotImplementedError): - ray_series.asof(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_itemsize(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert modin_series.itemsize == pandas_series.itemsize -@pytest.mark.skip(reason="Using pandas Series.") -def test_astype(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.astype(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_iteritems(data): + modin_series, pandas_series = create_test_series(data) + modin_items = modin_series.iteritems() + pandas_items = pandas_series.iteritems() + for modin_item, pandas_item in zip(modin_items, pandas_items): + modin_index, modin_scalar = modin_item + pandas_index, pandas_scalar = pandas_item + df_equals(modin_scalar, pandas_scalar) + assert pandas_index == modin_index -@pytest.mark.skip(reason="Using pandas Series.") -def test_at(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ix(data): + modin_series, _ = create_test_series(data) # noqa: F841 with pytest.raises(NotImplementedError): - ray_series.at(None) + modin_series.ix[0] -@pytest.mark.skip(reason="Using pandas Series.") -def test_at_time(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_keys(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.keys(), pandas_series.keys()) - with pytest.raises(NotImplementedError): - ray_series.at_time(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_kurt(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.kurt() -@pytest.mark.skip(reason="Using pandas Series.") -def test_autocorr(): - ray_series = create_test_series() - 
with pytest.raises(NotImplementedError): - ray_series.autocorr(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_kurtosis(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.kurtosis() -@pytest.mark.skip(reason="Using pandas Series.") -def test_axes(): - ray_series = create_test_series() +def test_last(): + i = pd.date_range("2018-04-09", periods=4, freq="2D") + ts = pd.Series([1, 2, 3, 4], index=i) + with pytest.warns(UserWarning): + ts.last("3D") - with pytest.raises(NotImplementedError): - ray_series.axes +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_last_valid_index(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.last_valid_index() == (pandas_series.last_valid_index()) -@pytest.mark.skip(reason="Using pandas Series.") -def test_base(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.base +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_le(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "le") -@pytest.mark.skip(reason="Using pandas Series.") -def test_between(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_loc(data): + modin_series, pandas_series = create_test_series(data) + for v in modin_series.index: + df_equals(modin_series.loc[v], pandas_series.loc[v]) + df_equals(modin_series.loc[v:], pandas_series.loc[v:]) - with pytest.raises(NotImplementedError): - ray_series.between(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_lt(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "lt") -@pytest.mark.skip(reason="Using pandas Series.") -def test_between_time(): - ray_series 
= create_test_series() - with pytest.raises(NotImplementedError): - ray_series.between_time(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_mad(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.mad() -@pytest.mark.skip(reason="Using pandas Series.") -def test_bfill(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_map(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.map(str), pandas_series.map(str)) - with pytest.raises(NotImplementedError): - ray_series.bfill(None, None, None) +def test_mask(): + modin_series = pd.Series(np.arange(10)) + m = modin_series % 3 == 0 + with pytest.warns(UserWarning): + try: + modin_series.mask(~m, -modin_series) + except ValueError: + pass -@pytest.mark.skip(reason="Using pandas Series.") -def test_blocks(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.blocks +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_max(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) -@pytest.mark.skip(reason="Using pandas Series.") -def test_bool(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_mean(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) - with pytest.raises(NotImplementedError): - ray_series.bool() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) 
+@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_median(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) -@pytest.mark.skip(reason="Using pandas Series.") -def test_clip(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.clip(None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("index", [True, False], ids=["True", "False"]) +def test_memory_usage(data, index): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.memory_usage(index=index), pandas_series.memory_usage(index=index) + ) -@pytest.mark.skip(reason="Using pandas Series.") -def test_clip_lower(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_min(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) - with pytest.raises(NotImplementedError): - ray_series.clip_lower(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_mod(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "mod") -@pytest.mark.skip(reason="Using pandas Series.") -def test_clip_upper(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.clip_upper(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_mode(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.mode(), pandas_series.mode()) -@pytest.mark.skip(reason="Using pandas Series.") -def test_combine(): - ray_series = 
create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_mul(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "mul") - with pytest.raises(NotImplementedError): - ray_series.combine(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_multiply(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "multiply") -@pytest.mark.skip(reason="Using pandas Series.") -def test_combine_first(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.combine_first(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_name(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.name == pandas_series.name + modin_series.name = pandas_series.name = "New_name" + assert modin_series.name == pandas_series.name + assert modin_series._query_compiler.columns == ["New_name"] -@pytest.mark.skip(reason="Using pandas Series.") -def test_compound(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_nbytes(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert modin_series.nbytes == pandas_series.nbytes - with pytest.raises(NotImplementedError): - ray_series.compound(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ndim(data): + modin_series, _ = create_test_series(data) # noqa: F841 + assert modin_series.ndim == 1 -@pytest.mark.skip(reason="Using pandas Series.") -def test_compress(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.compress(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ne(data): + modin_series, pandas_series = 
create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "ne") @pytest.mark.skip(reason="Using pandas Series.") -def test_consolidate(): - ray_series = create_test_series() +def test_nlargest(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.consolidate(None) - + modin_series.nlargest(None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_convert_objects(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.convert_objects(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_notnull(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.notnull(), pandas_series.notnull()) @pytest.mark.skip(reason="Using pandas Series.") -def test_copy(): - ray_series = create_test_series() +def test_nsmallest(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.copy(None) - + modin_series.nsmallest(None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_corr(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.corr(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("dropna", [True, False], ids=["True", "False"]) +def test_nunique(data, dropna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.nunique(dropna=dropna), pandas_series.nunique(dropna=dropna)) -@pytest.mark.skip(reason="Using pandas Series.") -def test_count(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_pct_change(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.pct_change() - with pytest.raises(NotImplementedError): - ray_series.count(None) +@pytest.mark.parametrize("data", test_data_values, 
ids=test_data_keys) +def test_pipe(data): + modin_series, pandas_series = create_test_series(data) + n = len(modin_series.index) + a, b, c = 2 % n, 0, 3 % n -@pytest.mark.skip(reason="Using pandas Series.") -def test_cov(): - ray_series = create_test_series() + def h(x): + return x.dropna() - with pytest.raises(NotImplementedError): - ray_series.cov(None) + def g(x, arg1=0): + for _ in range(arg1): + x = x.append(x) + return x + def f(x, arg2=0, arg3=0): + return x.drop(x.index[[arg2, arg3]]) -@pytest.mark.skip(reason="Using pandas Series.") -def test_cummax(): - ray_series = create_test_series() + df_equals( + f(g(h(modin_series), arg1=a), arg2=b, arg3=c), + (modin_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) + df_equals( + (modin_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + (pandas_series.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)), + ) - with pytest.raises(NotImplementedError): - ray_series.cummax(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_plot(request, data): + modin_series, pandas_series = create_test_series(data) -@pytest.mark.skip(reason="Using pandas Series.") -def test_cummin(): - ray_series = create_test_series() + if name_contains(request.node.name, numeric_dfs): + # We have to test this way because equality in plots means same object. 
+        zipped_plot_lines = zip(modin_series.plot().lines, pandas_series.plot().lines)
+        for l, r in zipped_plot_lines:
+            if isinstance(l.get_xdata(), np.ma.core.MaskedArray) and isinstance(
+                r.get_xdata(), np.ma.core.MaskedArray
+            ):
+                assert all((l.get_xdata() == r.get_xdata()).data)
+            else:
+                assert np.array_equal(l.get_xdata(), r.get_xdata())
+            if isinstance(l.get_ydata(), np.ma.core.MaskedArray) and isinstance(
+                r.get_ydata(), np.ma.core.MaskedArray
+            ):
+                assert all((l.get_ydata() == r.get_ydata()).data)
+            else:
+                assert np.array_equal(l.get_ydata(), r.get_ydata())
-    with pytest.raises(NotImplementedError):
-        ray_series.cummin(None, None, None)
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_pop(data):
+    modin_series, pandas_series = create_test_series(data)
+
+    for key in modin_series.keys():
+        df_equals(modin_series.pop(key), pandas_series.pop(key))
+        df_equals(modin_series, pandas_series)
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_pow(data):
+    modin_series, pandas_series = create_test_series(data)
+    inter_df_math_helper(modin_series, pandas_series, "pow")
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_prod(data):
+    modin_series, pandas_series = create_test_series(data)
+    # Wrap in Series to test almost_equal because of overflow
+    df_equals(pd.Series([modin_series.prod()]), pandas.Series([pandas_series.prod()]))
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_product(data):
+    modin_series, pandas_series = create_test_series(data)
+    # Wrap in Series to test almost_equal because of overflow
+    df_equals(
+        pd.Series([modin_series.product()]), pandas.Series([pandas_series.product()])
+    )
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_ptp(data):
+    modin_series, pandas_series = create_test_series(data)
+    with pytest.warns(UserWarning):
+        modin_series.ptp()
+
+
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_put(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + assert modin_series.put(0, 3) == pandas_series.put(0, 3) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("q", quantiles_values, ids=quantiles_keys) +def test_quantile(request, data, q): + modin_series, pandas_series = create_test_series(data) + if not name_contains(request.node.name, no_numeric_dfs): + df_equals(modin_series.quantile(q), pandas_series.quantile(q)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_radd(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "radd") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "na_option", ["keep", "top", "bottom"], ids=["keep", "top", "bottom"] +) +def test_rank(data, na_option): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.rank(na_option=na_option) + except Exception as e: + with pytest.raises(type(e)): + modin_series.rank(na_option=na_option) + else: + modin_result = modin_series.rank(na_option=na_option) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_ravel(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + np.testing.assert_equal(modin_series.ravel(), pandas_series.ravel()) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rdiv(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rdiv") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_real(data): + modin_series, pandas_series = create_test_series(data) + with 
pytest.warns(UserWarning): + np.testing.assert_equal(modin_series.real, pandas_series.real) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_reindex(data): + modin_series, pandas_series = create_test_series(data) + pandas_result = pandas_series.reindex( + list(pandas_series.index) + ["_A_NEW_ROW"], fill_value=0 + ) + modin_result = modin_series.reindex( + list(modin_series.index) + ["_A_NEW_ROW"], fill_value=0 + ) + df_equals(pandas_result, modin_result) + + frame_data = { + "col1": [0, 1, 2, 3], + "col2": [4, 5, 6, 7], + "col3": [8, 9, 10, 11], + "col4": [12, 13, 14, 15], + "col5": [0, 0, 0, 0], + } + pandas_df = pandas.DataFrame(frame_data) + modin_df = pd.DataFrame(frame_data) + + for col in pandas_df.columns: + modin_series = modin_df[col] + pandas_series = pandas_df[col] + df_equals( + modin_series.reindex([0, 3, 2, 1]), pandas_series.reindex([0, 3, 2, 1]) + ) + df_equals(modin_series.reindex([0, 6, 2]), pandas_series.reindex([0, 6, 2])) + df_equals( + modin_series.reindex(index=[0, 1, 5]), + pandas_series.reindex(index=[0, 1, 5]), + ) -@pytest.mark.skip(reason="Using pandas Series.") -def test_cumprod(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.cumprod(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_reindex_axis(data): + modin_series, pandas_series = create_test_series(data) + modin_series.reindex_axis( + [i for i in modin_series.index[: len(modin_series.index) // 2]] + ) -@pytest.mark.skip(reason="Using pandas Series.") -def test_cumsum(): - ray_series = create_test_series() +def test_reindex_like(): + df1 = pd.DataFrame( + [ + [24.3, 75.7, "high"], + [31, 87.8, "high"], + [22, 71.6, "medium"], + [35, 95, "medium"], + ], + columns=["temp_celsius", "temp_fahrenheit", "windspeed"], + index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"), + ) + df2 = pd.DataFrame( + [[28, "low"], [30, "low"], [35.1, 
"medium"]], + columns=["temp_celsius", "windspeed"], + index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]), + ) + + series1 = df1["windspeed"] + series2 = df2["windspeed"] + with pytest.warns(UserWarning): + series2.reindex_like(series1) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rename(data): + modin_series, pandas_series = create_test_series(data) + new_name = "NEW_NAME" + df_equals(modin_series.rename(new_name), pandas_series.rename(new_name)) + + modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + modin_series_cp.rename(new_name, inplace=True) + pandas_series_cp.rename(new_name, inplace=True) + df_equals(modin_series_cp, pandas_series_cp) + + modin_result = modin_series.rename("{}__".format) + pandas_result = pandas_series.rename("{}__".format) + df_equals(modin_result, pandas_result) - with pytest.raises(NotImplementedError): - ray_series.cumsum(None, None, None) +def test_reorder_levels(): + series = pd.Series( + np.random.randint(1, 100, 12), + index=pd.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + with pytest.warns(UserWarning): + series.reorder_levels(["Letter", "Color", "Number"]) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "repeats", [2, 3, 4], ids=["repeats_{}".format(i) for i in [2, 3, 4]] +) +def test_repeat(data, repeats): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + df_equals(modin_series.repeat(repeats), pandas_series.repeat(repeats)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_replace(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.replace(0, 5) -@pytest.mark.skip(reason="Using pandas Series.") -def 
test_data(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.data +def test_resample(): + modin_series = pd.Series( + [10, 11, 9, 13, 14, 18, 17, 19], + index=pd.date_range("01/01/2018", periods=8, freq="W"), + ) + with pytest.warns(UserWarning): + modin_series.resample("M") -@pytest.mark.skip(reason="Using pandas Series.") -def test_describe(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("drop", [True, False], ids=["True", "False"]) +def test_reset_index(data, drop): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.reset_index(drop=drop), pandas_series.reset_index(drop=drop)) - with pytest.raises(NotImplementedError): - ray_series.describe(None, None) + modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + try: + pandas_result = pandas_series_cp.reset_index(drop=drop, inplace=True) + except Exception as e: + with pytest.raises(type(e)): + modin_series_cp.reset_index(drop=drop, inplace=True) + else: + modin_result = modin_series_cp.reset_index(drop=drop, inplace=True) + df_equals(pandas_result, modin_result) @pytest.mark.skip(reason="Using pandas Series.") -def test_diff(): - ray_series = create_test_series() +def test_reshape(): + modin_series = create_test_series() with pytest.raises(NotImplementedError): - ray_series.diff(None) + modin_series.reshape(None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_div(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rfloordiv(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rfloordiv") - with pytest.raises(NotImplementedError): - ray_series.div(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rmod(data): + modin_series, 
pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rmod") -@pytest.mark.skip(reason="Using pandas Series.") -def test_divide(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.divide(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rmul(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rmul") -@pytest.mark.skip(reason="Using pandas Series.") -def test_dot(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rolling(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.rolling(10) - with pytest.raises(NotImplementedError): - ray_series.dot(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_round(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.round(), pandas_series.round()) -@pytest.mark.skip(reason="Using pandas Series.") -def test_drop(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.drop(None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rpow(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rpow") -@pytest.mark.skip(reason="Using pandas Series.") -def test_drop_duplicates(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rsub(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rsub") - with pytest.raises(NotImplementedError): - ray_series.drop_duplicates(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_rtruediv(data): + 
modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "rtruediv") -@pytest.mark.skip(reason="Using pandas Series.") -def test_dropna(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.dropna(None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_sample(data): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.sample(frac=0.5, random_state=21019), + pandas_series.sample(frac=0.5, random_state=21019), + ) + df_equals( + modin_series.sample(n=12, random_state=21019), + pandas_series.sample(n=12, random_state=21019), + ) + with pytest.warns(UserWarning): + df_equals( + modin_series.sample(n=0, random_state=21019), + pandas_series.sample(n=0, random_state=21019), + ) + with pytest.raises(ValueError): + modin_series.sample(n=-3) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_searchsorted(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.searchsorted(3) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_select(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.select(lambda x: x == 4) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_sem(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.sem() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_set_axis(data): + modin_series, _ = create_test_series(data) # noqa: F841 + modin_series.set_axis(labels=["{}_{}".format(i, i + 1) for i in modin_series.index]) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_set_value(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with 
pytest.warns(UserWarning): + modin_series.set_value(5, 6) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_shape(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.shape == pandas_series.shape + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_shift(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.shift() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_size(data): + modin_series, pandas_series = create_test_series(data) + assert modin_series.size == pandas_series.size + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +def test_skew(data, skipna): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_slice_shift(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.slice_shift() + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "ascending", bool_arg_values, ids=arg_keys("ascending", bool_arg_keys) +) +@pytest.mark.parametrize( + "sort_remaining", bool_arg_values, ids=arg_keys("sort_remaining", bool_arg_keys) +) +@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) +def test_sort_index(data, ascending, sort_remaining, na_position): + modin_series, pandas_series = create_test_series(data) + df_equals( + modin_series.sort_index( + ascending=ascending, sort_remaining=sort_remaining, na_position=na_position + ), + pandas_series.sort_index( + ascending=ascending, sort_remaining=sort_remaining, na_position=na_position + ), + ) + + 
modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + modin_series_cp.sort_index( + ascending=ascending, + sort_remaining=sort_remaining, + na_position=na_position, + inplace=True, + ) + pandas_series_cp.sort_index( + ascending=ascending, + sort_remaining=sort_remaining, + na_position=na_position, + inplace=True, + ) + df_equals(modin_series_cp, pandas_series_cp) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("ascending", [True, False], ids=["True", "False"]) +@pytest.mark.parametrize("na_position", ["first", "last"], ids=["first", "last"]) +def test_sort_values(data, ascending, na_position): + modin_series, pandas_series = create_test_series(data) + modin_result = modin_series.sort_values( + ascending=ascending, na_position=na_position + ) + pandas_result = pandas_series.sort_values( + ascending=ascending, na_position=na_position + ) + # Note: For `ascending=False` only + # For some reason, the indexing of Series and DataFrame differ in the underlying + # algorithm. The order of values is the same, but the index values are shuffled. + # Since we use `DataFrame.sort_values` even for Series, the index can be different + # between `pandas.Series.sort_values`. For this reason, we check that the values are + # identical instead of the index as well. 
+ if ascending: + df_equals(modin_result, pandas_result) + else: + np.testing.assert_equal(modin_result.values, pandas_result.values) + + modin_series_cp = modin_series.copy() + pandas_series_cp = pandas_series.copy() + modin_series_cp.sort_values( + ascending=ascending, na_position=na_position, inplace=True + ) + pandas_series_cp.sort_values( + ascending=ascending, na_position=na_position, inplace=True + ) + # See above about `ascending=False` + if ascending: + df_equals(modin_series_cp, pandas_series_cp) + else: + np.testing.assert_equal(modin_series_cp.values, pandas_series_cp.values) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_squeeze(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.squeeze(None), pandas_series.squeeze(None)) + df_equals(modin_series.squeeze(0), pandas_series.squeeze(0)) + with pytest.raises(ValueError): + modin_series.squeeze(1) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_std(request, data, skipna, ddof): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.std(skipna=skipna, ddof=ddof) + except Exception as e: + with pytest.raises(type(e)): + modin_series.std(skipna=skipna, ddof=ddof) + else: + modin_result = modin_series.std(skipna=skipna, ddof=ddof) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_strides(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.strides + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_sub(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, 
pandas_series, "sub") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_subtract(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "subtract") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize( + "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) +) +def test_sum(data, skipna, min_count): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.sum(skipna=skipna, min_count=min_count) + except Exception: + with pytest.raises(TypeError): + modin_series.sum(skipna=skipna, min_count=min_count) + else: + modin_result = modin_series.sum(skipna=skipna, min_count=min_count) + df_equals(modin_result, pandas_result) + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_swapaxes(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.swapaxes(0, 0) -@pytest.mark.skip(reason="Using pandas Series.") -def test_dtype(): - ray_series = create_test_series() +def test_swaplevel(): + s = pd.Series( + np.random.randint(1, 100, 12), + index=pd.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + with pytest.warns(UserWarning): + s.swaplevel("Number", "Color") + + +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("n", int_arg_values, ids=arg_keys("n", int_arg_keys)) +def test_tail(data, n): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.tail(n), pandas_series.tail(n)) + df_equals( + modin_series.tail(len(modin_series)), pandas_series.tail(len(pandas_series)) + ) - with 
pytest.raises(NotImplementedError): - ray_series.dtype +def test_take(): + series = pd.Series([1, 2, 3, 4], index=[0, 2, 3, 1]) + with pytest.warns(UserWarning): + series.take([0, 3]) -@pytest.mark.skip(reason="Using pandas Series.") -def test_dtypes(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.dtypes +def test_to_period(): + idx = pd.date_range("1/1/2012", periods=5, freq="M") + series = pd.Series(np.random.randint(0, 100, size=(len(idx))), index=idx) + with pytest.warns(UserWarning): + series.to_period() -@pytest.mark.skip(reason="Using pandas Series.") -def test_duplicated(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_sparse(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.to_sparse() - with pytest.raises(NotImplementedError): - ray_series.duplicated(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_string(request, data): + modin_series, pandas_series = create_test_series(data) + # Skips nan because only difference is nan instead of NaN + if not name_contains(request.node.name, ["nan"]): + assert modin_series.to_string() == pandas_series.to_string() -@pytest.mark.skip(reason="Using pandas Series.") -def test_empty(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.empty +def test_to_timestamp(): + idx = pd.date_range("1/1/2012", periods=5, freq="M") + series = pd.Series(np.random.randint(0, 100, size=(len(idx))), index=idx) + with pytest.warns(UserWarning): + series.to_period().to_timestamp() -@pytest.mark.skip(reason="Using pandas Series.") -def test_eq(): - ray_series = create_test_series() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_to_xarray(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + 
modin_series.to_xarray() - with pytest.raises(NotImplementedError): - ray_series.eq(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_tolist(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.tolist() -@pytest.mark.skip(reason="Using pandas Series.") -def test_equals(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.equals(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ewm(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ewm(None, None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_expanding(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.expanding(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_factorize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.factorize(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ffill(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ffill(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_fillna(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.fillna(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_filter(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.filter(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_first(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.first(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_first_valid_index(): - ray_series = create_test_series() - - with 
pytest.raises(NotImplementedError): - ray_series.first_valid_index() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_flags(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.flags - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_floordiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.floordiv(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_from_array(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.from_array(None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_from_csv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.from_csv(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ftype(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ftype - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ftypes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ftypes - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ge(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ge(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_get(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_get_dtype_counts(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_dtype_counts() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_get_ftype_counts(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_ftype_counts() - - 
-@pytest.mark.skip(reason="Using pandas Series.") -def test_get_value(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_value(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_get_values(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.get_values() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_groupby(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.groupby(None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_gt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.gt(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_hasnans(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.hasnans - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_head(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.head(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_hist(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.hist(None, None, None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_iat(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iat(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_idxmax(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.idxmax(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_idxmin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.idxmin(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_iloc(): - ray_series = 
create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iloc(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_imag(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.imag - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.index - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_interpolate(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.interpolate(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_is_copy(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_copy - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_is_monotonic(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_is_monotonic_decreasing(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic_decreasing - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_is_monotonic_increasing(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_monotonic_increasing - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_is_unique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.is_unique - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_isin(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.isin(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_isnull(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.isnull() - - 
-@pytest.mark.skip(reason="Using pandas Series.") -def test_item(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.item() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_items(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.items() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_itemsize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.itemsize - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_iteritems(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.iteritems() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ix(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ix(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_keys(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.keys() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_kurt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.kurt(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_kurtosis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.kurtosis(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_last(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.last(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_last_valid_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.last_valid_index() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_le(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.le(None, 
None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_loc(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.loc(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_lt(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.lt(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mad(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mad(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_map(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.map(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mask(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mask(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_max(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.max(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mean(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mean(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_median(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.median(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_memory_usage(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.memory_usage(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_min(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.min(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mod(): - ray_series = 
create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mod(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mode(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mode() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_mul(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.mul(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_multiply(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.multiply(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_name(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.name - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_nbytes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nbytes - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ndim(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ndim - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ne(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ne(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_nlargest(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nlargest(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_nonzero(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nonzero() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_notnull(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.notnull() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_nsmallest(): - ray_series = 
create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nsmallest(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_nunique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.nunique(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_pct_change(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pct_change(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_pipe(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pipe(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_plot(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.plot( - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - ) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_pop(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pop(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_pow(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.pow(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_prod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.prod(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_product(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.product(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ptp(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ptp(None, None, None, None) - - 
-@pytest.mark.skip(reason="Using pandas Series.") -def test_put(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.put(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_quantile(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.quantile(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_radd(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.radd(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rank(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rank(None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_ravel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.ravel(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rdiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rdiv(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_real(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.real - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reindex(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reindex_axis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex_axis(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reindex_like(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reindex_like(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rename(): - ray_series = create_test_series() - - with 
pytest.raises(NotImplementedError): - ray_series.rename(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rename_axis(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rename_axis(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reorder_levels(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reorder_levels(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_repeat(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.repeat(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_replace(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.replace(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_resample(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.resample( - None, None, None, None, None, None, None, None, None, None, None, None - ) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reset_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reset_index(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_reshape(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.reshape(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rfloordiv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rfloordiv(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rmod(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rmod(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rmul(): - ray_series = 
create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rmul(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rolling(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rolling(None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_round(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.round(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rpow(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rpow(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rsub(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rsub(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_rtruediv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.rtruediv(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sample(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sample(None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_searchsorted(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.searchsorted(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_select(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.select(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sem(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sem(None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_set_axis(): - ray_series = create_test_series() - - with 
pytest.raises(NotImplementedError): - ray_series.set_axis(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_set_value(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.set_value(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_shape(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.shape - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_shift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.shift(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_size(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.size - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_skew(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.skew(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_slice_shift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.slice_shift(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sort_index(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sort_index(None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sort_values(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sort_values(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sortlevel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sortlevel(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_squeeze(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.squeeze(None) - - 
-@pytest.mark.skip(reason="Using pandas Series.") -def test_std(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.std(None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_strides(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.strides - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sub(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sub(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_subtract(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.subtract(None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_sum(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.sum(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_swapaxes(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.swapaxes(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_swaplevel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.swaplevel(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_tail(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tail(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_take(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.take(None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_clipboard(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_clipboard(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_csv(): - ray_series = 
create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_csv(None, None, None, None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_dense(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.to_dense() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_dict(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_dict() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_excel(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_excel( - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - ) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_frame(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_frame(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_hdf(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_hdf(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_json(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_json(None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_latex(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_latex( - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - None, - ) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_msgpack(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_msgpack(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_period(): - 
ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_period(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_pickle(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_pickle(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_sparse(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_sparse(None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_sql(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_sql(None, None, None, None, None, None, None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_string(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_string(None, None, None, None, None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) +def test_transform(data, func): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.transform(func) + except Exception as e: + with pytest.raises(type(e)): + modin_series.transform(func) + else: + df_equals(modin_series.transform(func), pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_timestamp(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_timestamp(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_to_xarray(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.to_xarray() - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_tolist(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tolist() - - -@pytest.mark.skip(reason="Using pandas Series.") -def 
test_transform(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.transform(None, None) - - -@pytest.mark.skip(reason="Using pandas Series.") -def test_transpose(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.transpose(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_transpose(data): + modin_series, pandas_series = create_test_series(data) + df_equals(modin_series.transpose(), modin_series) + df_equals(modin_series.transpose(), pandas_series.transpose()) + df_equals(modin_series.transpose(), pandas_series) -@pytest.mark.skip(reason="Using pandas Series.") -def test_truediv(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.truediv(None, None, None) - +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_truediv(data): + modin_series, pandas_series = create_test_series(data) + inter_df_math_helper(modin_series, pandas_series, "truediv") -@pytest.mark.skip(reason="Using pandas Series.") -def test_truncate(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.truncate(None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_truncate(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.truncate() -@pytest.mark.skip(reason="Using pandas Series.") def test_tshift(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tshift(None, None) + idx = pd.date_range("1/1/2012", periods=5, freq="M") + modin_series = pd.Series(np.random.randint(0, 100, size=len(idx)), index=idx) + with pytest.warns(UserWarning): + modin_series.to_period().tshift() -@pytest.mark.skip(reason="Using pandas Series.") def test_tz_convert(): - ray_series = create_test_series() - - with 
pytest.raises(NotImplementedError): - ray_series.tz_convert(None, None, None) + idx = pd.date_range("1/1/2012", periods=5, freq="M") + modin_series = pd.Series(np.random.randint(0, 100, size=len(idx)), index=idx) + with pytest.warns(UserWarning): + modin_series.tz_localize("America/Los_Angeles").tz_convert( + "America/Los_Angeles" + ) -@pytest.mark.skip(reason="Using pandas Series.") def test_tz_localize(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.tz_localize(None, None, None, None) + idx = pd.date_range("1/1/2012", periods=5, freq="M") + modin_series = pd.Series(np.random.randint(0, 100, size=len(idx)), index=idx) + with pytest.warns(UserWarning): + modin_series.tz_localize("America/Los_Angeles") -@pytest.mark.skip(reason="Using pandas Series.") -def test_unique(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.unique() +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_unique(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + modin_series.unique() -@pytest.mark.skip(reason="Using pandas Series.") def test_unstack(): - ray_series = create_test_series() + s = pd.Series( + np.random.randint(1, 100, 12), + index=pd.MultiIndex.from_tuples( + [ + (num, letter, color) + for num in range(1, 3) + for letter in ["a", "b", "c"] + for color in ["Red", "Green"] + ], + names=["Number", "Letter", "Color"], + ), + ) + with pytest.warns(UserWarning): + s.unstack() - with pytest.raises(NotImplementedError): - ray_series.unstack(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_update(data): + modin_series, _ = create_test_series(data) # noqa: F841 + with pytest.warns(UserWarning): + try: + modin_series.update(pd.Series([4.1 for _ in modin_series])) + except Exception: + pass -@pytest.mark.skip(reason="Using pandas Series.") -def test_update(): - ray_series = 
create_test_series() - with pytest.raises(NotImplementedError): - ray_series.update(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_valid(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.valid() -@pytest.mark.skip(reason="Using pandas Series.") -def test_valid(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.valid(None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_value_counts(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.value_counts() -@pytest.mark.skip(reason="Using pandas Series.") -def test_value_counts(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.value_counts(None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_values(data): + modin_series, pandas_series = create_test_series(data) + np.testing.assert_equal(modin_series.values, pandas_series.values) -@pytest.mark.skip(reason="Using pandas Series.") -def test_values(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.values +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.parametrize( + "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) +) +@pytest.mark.parametrize("ddof", int_arg_values, ids=arg_keys("ddof", int_arg_keys)) +def test_var(data, skipna, ddof): + modin_series, pandas_series = create_test_series(data) + try: + pandas_result = pandas_series.var(skipna=skipna, ddof=ddof) + except Exception: + with pytest.raises(TypeError): + modin_series.var(skipna=skipna, ddof=ddof) + else: + modin_result = modin_series.var(skipna=skipna, ddof=ddof) + df_equals(modin_result, pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") -def test_var(): - ray_series = 
create_test_series() - with pytest.raises(NotImplementedError): - ray_series.var(None, None, None, None, None) +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_view(data): + modin_series, pandas_series = create_test_series(data) + with pytest.warns(UserWarning): + modin_series.view(None) -@pytest.mark.skip(reason="Using pandas Series.") -def test_view(): - ray_series = create_test_series() - with pytest.raises(NotImplementedError): - ray_series.view(None) +def test_where(): + frame_data = random_state.randn(100) + pandas_series = pandas.Series(frame_data) + modin_series = pd.Series(frame_data) + pandas_cond_series = pandas_series % 5 < 2 + modin_cond_series = modin_series % 5 < 2 + pandas_result = pandas_series.where(pandas_cond_series, -pandas_series) + modin_result = modin_series.where(modin_cond_series, -modin_series) + assert all((to_pandas(modin_result) == pandas_result)) -@pytest.mark.skip(reason="Using pandas Series.") -def test_where(): - ray_series = create_test_series() + other = pandas.Series(random_state.randn(100)) + pandas_result = pandas_series.where(pandas_cond_series, other, axis=0) + modin_result = modin_series.where(modin_cond_series, other, axis=0) + assert all(to_pandas(modin_result) == pandas_result) - with pytest.raises(NotImplementedError): - ray_series.where(None, None, None, None, None, None) + pandas_result = pandas_series.where(pandas_series < 2, True) + modin_result = modin_series.where(modin_series < 2, True) + assert all(to_pandas(modin_result) == pandas_result) -@pytest.mark.skip(reason="Using pandas Series.") +@pytest.mark.skip("Deprecated in pandas.") def test_xs(): - ray_series = create_test_series() - - with pytest.raises(NotImplementedError): - ray_series.xs(None, None, None) + series = pd.Series([4, 0, "mammal", "cat", "walks"]) + with pytest.warns(UserWarning): + series.xs("mammal") diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 9663a29986c..725e711c032 100644 --- 
a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -237,11 +237,35 @@ def df_equals(df1, df2): groupby_types = (pandas.core.groupby.DataFrameGroupBy, DataFrameGroupBy) + # The typing behavior of how pandas treats its index is not consistent when the + # length of the DataFrame or Series is 0, so we just verify that the contents are + # the same. + if ( + hasattr(df1, "index") + and hasattr(df2, "index") + and len(df1) == 0 + and len(df2) == 0 + ): + if type(df1).__name__ == type(df2).__name__: + if hasattr(df1, "name") and hasattr(df2, "name") and df1.name == df2.name: + return + if ( + hasattr(df1, "columns") + and hasattr(df2, "columns") + and df1.columns.equals(df2.columns) + ): + return + assert False + # Convert to pandas if isinstance(df1, pd.DataFrame): df1 = to_pandas(df1) if isinstance(df2, pd.DataFrame): df2 = to_pandas(df2) + if isinstance(df1, pd.Series): + df1 = to_pandas(df1) + if isinstance(df2, pd.Series): + df2 = to_pandas(df2) if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame): if (df1.empty and not df2.empty) or (df2.empty and not df1.empty): @@ -285,7 +309,10 @@ def df_equals(df1, df2): assert all(df1.index == df2.index) assert df1.dtypes == df2.dtypes else: - assert df1 == df2 + if df1 != df2: + print(df1) + print(df2) + np.testing.assert_almost_equal(df1, df2) def df_is_empty(df): diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py index 36f47b9eb9f..f94a134f7b2 100644 --- a/modin/pandas/utils.py +++ b/modin/pandas/utils.py @@ -18,14 +18,16 @@ def from_pandas(df): return DataFrame(query_compiler=BaseFactory.from_pandas(df)) -def to_pandas(df): - """Converts a Ray DataFrame to a pandas DataFrame/Series. +def to_pandas(modin_obj): + """Converts a Modin DataFrame/Series to a pandas DataFrame/Series. + Args: - df (modin.DataFrame): The Ray DataFrame to convert. + obj {modin.DataFrame, modin.Series}: The Ray DataFrame/Series to convert. + Returns: - A new pandas DataFrame. 
+ A new pandas DataFrame or Series. """ - return df._query_compiler.to_pandas() + return modin_obj._to_pandas() def _inherit_docstrings(parent, excluded=[]):