diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000000..508150ef2f5
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,6 @@
+# Adapted from *Black*'s config
+
+[flake8]
+ignore = E203, E266, E501, W503
+max-line-length = 88
+select = B,C,E,F,W,T4,B9
diff --git a/.travis.yml b/.travis.yml
index b966cc6a3cb..db1bf229863 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,8 +25,8 @@ matrix:
env: LINT=1
script:
- export PATH="$HOME/miniconda/bin:$PATH"
- - yapf -dr modin/pandas
- - flake8 --max-line-length=88 .
+ - black --check modin/
+ - flake8 .
install:
- ./.travis/install-dependencies.sh
diff --git a/.travis/install-dependencies.sh b/.travis/install-dependencies.sh
index d91af1d9887..73059a2b007 100755
--- a/.travis/install-dependencies.sh
+++ b/.travis/install-dependencies.sh
@@ -46,6 +46,7 @@ elif [[ "$LINT" == "1" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
conda install -y python==3.6.5
+ pip install black flake8 flake8-comprehensions
else
echo "Unrecognized environment."
@@ -53,5 +54,5 @@ else
fi
pip install -r requirements.txt
-pip install -q pytest flake8 flake8-comprehensions yapf feather-format lxml openpyxl xlrd numpy
+pip install -q pytest feather-format lxml openpyxl xlrd numpy
diff --git a/README.rst b/README.rst
index 0525ef1e051..9d4b176ce61 100644
--- a/README.rst
+++ b/README.rst
@@ -7,6 +7,8 @@ Modin
.. image:: https://readthedocs.org/projects/modin/badge/?version=latest
:target: https://modin.readthedocs.io/en/latest/?badge=latest
+.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
+ :target: https://github.com/ambv/black
|
*Modin is a library for unifying the way you interact with your data*
diff --git a/modin/__init__.py b/modin/__init__.py
index 40a6bd47928..a0f845e1209 100644
--- a/modin/__init__.py
+++ b/modin/__init__.py
@@ -6,19 +6,18 @@ def git_version():
def _execute_cmd_in_temp_env(cmd):
# construct environment
env = {}
- for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+ for k in ["SYSTEMROOT", "PATH", "HOME"]:
v = os.environ.get(k)
if v is not None:
env[k] = v
# LANGUAGE is used on win32
- env['LANGUAGE'] = 'C'
- env['LANG'] = 'C'
- env['LC_ALL'] = 'C'
- return subprocess.Popen(
- cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
+ env["LANGUAGE"] = "C"
+ env["LANG"] = "C"
+ env["LC_ALL"] = "C"
+ return subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
try:
- git_revision = _execute_cmd_in_temp_env(['git', 'rev-parse', 'HEAD'])
+ git_revision = _execute_cmd_in_temp_env(["git", "rev-parse", "HEAD"])
return git_revision.strip().decode()
except OSError:
return "Unknown"
diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py
index 8aab730f22f..11e67012658 100644
--- a/modin/data_management/data_manager.py
+++ b/modin/data_management/data_manager.py
@@ -7,9 +7,12 @@
from pandas.compat import string_types
from pandas.core.dtypes.cast import find_common_type
-from pandas.core.dtypes.common import (_get_dtype_from_object, is_list_like,
- is_numeric_dtype,
- is_datetime_or_timedelta_dtype)
+from pandas.core.dtypes.common import (
+ _get_dtype_from_object,
+ is_list_like,
+ is_numeric_dtype,
+ is_datetime_or_timedelta_dtype,
+)
from pandas.core.index import _ensure_index
from .partitioning.partition_collections import BlockPartitions
@@ -19,11 +22,13 @@ class PandasDataManager(object):
"""This class implements the logic necessary for operating on partitions
with a Pandas backend. This logic is specific to Pandas."""
- def __init__(self,
- block_partitions_object: BlockPartitions,
- index: pandas.Index,
- columns: pandas.Index,
- dtypes=None):
+ def __init__(
+ self,
+ block_partitions_object: BlockPartitions,
+ index: pandas.Index,
+ columns: pandas.Index,
+ dtypes=None,
+ ):
assert isinstance(block_partitions_object, BlockPartitions)
self.data = block_partitions_object
self.index = index
@@ -31,11 +36,7 @@ def __init__(self,
if dtypes is not None:
self._dtype_cache = dtypes
- def __constructor__(self,
- block_paritions_object,
- index,
- columns,
- dtypes=None):
+ def __constructor__(self, block_paritions_object, index, columns, dtypes=None):
"""By default, constructor method will invoke an init"""
return type(self)(block_paritions_object, index, columns, dtypes)
@@ -48,11 +49,9 @@ def _get_dtype(self):
map_func = self._prepare_method(lambda df: df.dtypes)
def dtype_builder(df):
- return df.apply(
- lambda row: find_common_type(row.values), axis=0)
+ return df.apply(lambda row: find_common_type(row.values), axis=0)
- self._dtype_cache = self.data.full_reduce(map_func, dtype_builder,
- 0)
+ self._dtype_cache = self.data.full_reduce(map_func, dtype_builder, 0)
self._dtype_cache.index = self.columns
return self._dtype_cache
@@ -77,8 +76,9 @@ def _validate_set_axis(self, new_labels, old_labels):
new_len = len(new_labels)
if old_len != new_len:
raise ValueError(
- 'Length mismatch: Expected axis has %d elements, '
- 'new values have %d elements' % (old_len, new_len))
+ "Length mismatch: Expected axis has %d elements, "
+ "new values have %d elements" % (old_len, new_len)
+ )
return new_labels
def _set_index(self, new_index):
@@ -92,8 +92,7 @@ def _set_columns(self, new_columns):
if self._columns_cache is None:
self._columns_cache = _ensure_index(new_columns)
else:
- new_columns = self._validate_set_axis(new_columns,
- self._columns_cache)
+ new_columns = self._validate_set_axis(new_columns, self._columns_cache)
self._columns_cache = new_columns
columns = property(_get_columns, _set_columns)
@@ -133,7 +132,8 @@ def pandas_index_extraction(df, axis):
new_indices = data_object.get_indices(
axis=axis,
index_func=lambda df: pandas_index_extraction(df, axis),
- old_blocks=old_blocks)
+ old_blocks=old_blocks,
+ )
return index_obj[new_indices] if compute_diff else new_indices
@@ -154,6 +154,7 @@ def _prepare_method(self, pandas_func, **kwargs):
def helper(df, internal_indices=[]):
return pandas_func(df.T, **kwargs)
+
else:
def helper(df, internal_indices=[]):
@@ -190,7 +191,8 @@ def numeric_function_clean_dataframe(self, axis):
result = pandas.Series(dtype=np.float64)
nonnumeric = [
- col for col, dtype in zip(self.columns, self.dtypes)
+ col
+ for col, dtype in zip(self.columns, self.dtypes)
if not is_numeric_dtype(dtype)
]
if len(nonnumeric) == len(self.columns):
@@ -206,13 +208,15 @@ def numeric_function_clean_dataframe(self, axis):
# Metadata modification methods
def add_prefix(self, prefix):
new_column_names = self.columns.map(lambda x: str(prefix) + str(x))
- return self.__constructor__(self.data, self.index, new_column_names,
- self._dtype_cache)
+ return self.__constructor__(
+ self.data, self.index, new_column_names, self._dtype_cache
+ )
def add_suffix(self, suffix):
new_column_names = self.columns.map(lambda x: str(x) + str(suffix))
- return self.__constructor__(self.data, self.index, new_column_names,
- self._dtype_cache)
+ return self.__constructor__(
+ self.data, self.index, new_column_names, self._dtype_cache
+ )
# END Metadata modification methods
@@ -221,8 +225,9 @@ def add_suffix(self, suffix):
# copies if we end up modifying something here. We copy all of the metadata
# to prevent that.
def copy(self):
- return self.__constructor__(self.data.copy(), self.index.copy(),
- self.columns.copy(), self._dtype_cache)
+ return self.__constructor__(
+ self.data.copy(), self.index.copy(), self.columns.copy(), self._dtype_cache
+ )
# Append/Concat/Join (Not Merge)
# The append/concat/join operations should ideally never trigger remote
@@ -287,8 +292,9 @@ def concat(self, axis, other, **kwargs):
def _append_list_of_managers(self, others, axis, **kwargs):
if not isinstance(others, list):
others = [others]
- assert all(isinstance(other, type(self)) for other in others), \
- "Different Manager objects are being used. This is not allowed"
+ assert all(
+ isinstance(other, type(self)) for other in others
+ ), "Different Manager objects are being used. This is not allowed"
sort = kwargs.get("sort", None)
join = kwargs.get("join", "outer")
@@ -301,13 +307,12 @@ def _append_list_of_managers(self, others, axis, **kwargs):
axis,
[other.columns if axis == 0 else other.index for other in others],
join,
- sort=sort)
+ sort=sort,
+ )
# Since we are concatenating a list of managers, we will align all of
# the indices based on the `joined_axis` computed above.
- to_append = [
- other.reindex(axis ^ 1, joined_axis).data for other in others
- ]
+ to_append = [other.reindex(axis ^ 1, joined_axis).data for other in others]
new_self = self.reindex(axis ^ 1, joined_axis).data
new_data = new_self.concat(axis, to_append)
@@ -316,20 +321,23 @@ def _append_list_of_managers(self, others, axis, **kwargs):
# If `ignore_index` is true, we create a RangeIndex that is the
# length of all of the index objects combined. This is the same
# behavior as pandas.
- new_index = self.index.append([
- other.index for other in others
- ]) if not ignore_index else pandas.RangeIndex(
- len(self.index) + sum(len(other.index) for other in others))
+ new_index = (
+ self.index.append([other.index for other in others])
+ if not ignore_index
+ else pandas.RangeIndex(
+ len(self.index) + sum(len(other.index) for other in others)
+ )
+ )
return self.__constructor__(new_data, new_index, joined_axis)
else:
# The columns will be appended to form the final columns.
- new_columns = self.columns.append(
- [other.columns for other in others])
+ new_columns = self.columns.append([other.columns for other in others])
return self.__constructor__(new_data, joined_axis, new_columns)
def _join_data_manager(self, other, **kwargs):
- assert isinstance(other, type(self)), \
- "This method is for data manager objects only"
+ assert isinstance(
+ other, type(self)
+ ), "This method is for data manager objects only"
# Uses join's default value (though should not revert to default)
how = kwargs.get("how", "left")
@@ -349,15 +357,18 @@ def _join_data_manager(self, other, **kwargs):
self_proxy = pandas.DataFrame(columns=self.columns)
other_proxy = pandas.DataFrame(columns=other.columns)
new_columns = self_proxy.join(
- other_proxy, lsuffix=lsuffix, rsuffix=rsuffix).columns
+ other_proxy, lsuffix=lsuffix, rsuffix=rsuffix
+ ).columns
return self.__constructor__(new_data, joined_index, new_columns)
def _join_list_of_managers(self, others, **kwargs):
- assert isinstance(others, list), \
- "This method is for lists of DataManager objects only"
- assert all(isinstance(other, type(self)) for other in others), \
- "Different Manager objects are being used. This is not allowed"
+ assert isinstance(
+ others, list
+ ), "This method is for lists of DataManager objects only"
+ assert all(
+ isinstance(other, type(self)) for other in others
+ ), "Different Manager objects are being used. This is not allowed"
# Uses join's default value (though should not revert to default)
how = kwargs.get("how", "left")
@@ -366,7 +377,8 @@ def _join_list_of_managers(self, others, **kwargs):
rsuffix = kwargs.get("rsuffix", "")
joined_index = self._join_index_objects(
- 1, [other.index for other in others], how, sort=sort)
+ 1, [other.index for other in others], how, sort=sort
+ )
to_join = [other.reindex(0, joined_index).data for other in others]
new_self = self.reindex(0, joined_index).data
@@ -376,11 +388,10 @@ def _join_list_of_managers(self, others, **kwargs):
# This stage is to efficiently get the resulting columns, including the
# suffixes.
self_proxy = pandas.DataFrame(columns=self.columns)
- others_proxy = [
- pandas.DataFrame(columns=other.columns) for other in others
- ]
+ others_proxy = [pandas.DataFrame(columns=other.columns) for other in others]
new_columns = self_proxy.join(
- others_proxy, lsuffix=lsuffix, rsuffix=rsuffix).columns
+ others_proxy, lsuffix=lsuffix, rsuffix=rsuffix
+ ).columns
return self.__constructor__(new_data, joined_index, new_columns)
@@ -401,13 +412,14 @@ def inter_manager_operations(self, other, how_to_join, func):
Returns:
New DataManager with new data and index.
"""
- assert isinstance(other, type(self)), \
- "Must have the same DataManager subclass to perform this operation"
+ assert isinstance(
+ other, type(self)
+ ), "Must have the same DataManager subclass to perform this operation"
- joined_index = self._join_index_objects(
- 1, other.index, how_to_join, sort=False)
+ joined_index = self._join_index_objects(1, other.index, how_to_join, sort=False)
new_columns = self._join_index_objects(
- 0, other.columns, how_to_join, sort=False)
+ 0, other.columns, how_to_join, sort=False
+ )
reindexed_other = other.reindex(0, joined_index).data
reindexed_self = self.reindex(0, joined_index).data
@@ -429,7 +441,8 @@ def inter_data_op_builder(left, right, self_cols, other_cols, func):
new_data = reindexed_self.inter_data_operation(
1,
lambda l, r: inter_data_op_builder(l, r, self_cols, other_cols, func),
- reindexed_other)
+ reindexed_other,
+ )
return self.__constructor__(new_data, joined_index, new_columns)
@@ -447,10 +460,12 @@ def _inter_df_op_handler(self, func, other, **kwargs):
if isinstance(other, type(self)):
return self.inter_manager_operations(
- other, "outer", lambda x, y: func(x, y, **kwargs))
+ other, "outer", lambda x, y: func(x, y, **kwargs)
+ )
else:
- return self.scalar_operations(axis, other,
- lambda df: func(df, other, **kwargs))
+ return self.scalar_operations(
+ axis, other, lambda df: func(df, other, **kwargs)
+ )
def add(self, other, **kwargs):
"""Adds this manager with other object (manager or scalar).
@@ -667,8 +682,9 @@ def update(self, other, **kwargs):
Returns:
New DataManager with updated data and index.
"""
- assert isinstance(other, type(self)), \
- "Must have the same DataManager subclass to perform this operation"
+ assert isinstance(
+ other, type(self)
+ ), "Must have the same DataManager subclass to perform this operation"
def update_builder(df, other, **kwargs):
df.update(other, **kwargs)
@@ -686,8 +702,9 @@ def where(self, cond, other, **kwargs):
New DataManager with updated data and index.
"""
- assert isinstance(cond, type(self)), \
- "Must have the same DataManager subclass to perform this operation"
+ assert isinstance(
+ cond, type(self)
+ ), "Must have the same DataManager subclass to perform this operation"
if isinstance(other, type(self)):
# Note: Currently we are doing this with two maps across the entire
@@ -708,11 +725,13 @@ def where_builder_second_pass(df, new_other, **kwargs):
reindexed_self = self.reindex(0, self.index).data
first_pass = reindexed_cond.inter_data_operation(
- 1, lambda l, r: where_builder_first_pass(l, r, **kwargs),
- reindexed_other)
+ 1,
+ lambda l, r: where_builder_first_pass(l, r, **kwargs),
+ reindexed_other,
+ )
final_pass = reindexed_self.inter_data_operation(
- 1, lambda l, r: where_builder_second_pass(l, r, **kwargs),
- first_pass)
+ 1, lambda l, r: where_builder_second_pass(l, r, **kwargs), first_pass
+ )
return self.__constructor__(final_pass, self.index, self.columns)
else:
axis = kwargs.get("axis", 0)
@@ -725,13 +744,17 @@ def where_builder_series(df, cond, other, **kwargs):
return df.where(cond, other, **kwargs)
reindexed_self = self.reindex(
- axis, self.index if not axis else self.columns).data
+ axis, self.index if not axis else self.columns
+ ).data
reindexed_cond = cond.reindex(
- axis, self.index if not axis else self.columns).data
+ axis, self.index if not axis else self.columns
+ ).data
new_data = reindexed_self.inter_data_operation(
- axis, lambda l, r: where_builder_series(l, r, other, **kwargs),
- reindexed_cond)
+ axis,
+ lambda l, r: where_builder_series(l, r, other, **kwargs),
+ reindexed_cond,
+ )
return self.__constructor__(new_data, self.index, self.columns)
# END Inter-Data operations
@@ -791,7 +814,8 @@ def reindex_builer(df, axis, old_labels, new_labels, **kwargs):
new_columns = labels if axis else self.columns
func = self._prepare_method(
- lambda df: reindex_builer(df, axis, old_labels, labels, **kwargs))
+ lambda df: reindex_builer(df, axis, old_labels, labels, **kwargs)
+ )
# The reindex can just be mapped over the axis we are modifying. This
# is for simplicity in implementation. We specify num_splits here
@@ -819,8 +843,9 @@ def reset_index(self, **kwargs):
else:
# The copies here are to ensure that we do not give references to
# this object for the purposes of updates.
- return self.__constructor__(self.data.copy(), new_index,
- self.columns.copy(), self._dtype_cache)
+ return self.__constructor__(
+ self.data.copy(), new_index, self.columns.copy(), self._dtype_cache
+ )
# END Reindex/reset_index
@@ -858,8 +883,7 @@ def transpose(self, *args, **kwargs):
# Currently, this means a Pandas Series will be returned, but in the future
# we will implement a Distributed Series, and this will be returned
# instead.
- def full_reduce(self, axis, map_func, reduce_func=None,
- numeric_only=False):
+ def full_reduce(self, axis, map_func, reduce_func=None, numeric_only=False):
"""Apply function that will reduce the data to a Pandas Series.
Args:
@@ -883,8 +907,9 @@ def full_reduce(self, axis, map_func, reduce_func=None,
# The XOR here will ensure that we reduce over the correct axis that
# exists on the internal partitions. We flip the axis
- result = data_manager.data.full_reduce(map_func, reduce_func,
- axis ^ self._is_transposed)
+ result = data_manager.data.full_reduce(
+ map_func, reduce_func, axis ^ self._is_transposed
+ )
if not axis:
result.index = data_manager.columns
else:
@@ -967,36 +992,39 @@ def sum(self, **kwargs):
# These operations are operations that apply a function to every partition.
def map_partitions(self, func, new_dtypes=None):
return self.__constructor__(
- self.data.map_across_blocks(func), self.index, self.columns,
- new_dtypes)
+ self.data.map_across_blocks(func), self.index, self.columns, new_dtypes
+ )
def abs(self):
func = self._prepare_method(pandas.DataFrame.abs)
- new_dtypes = pandas.Series([np.dtype('float64') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("float64") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def applymap(self, func):
- remote_func = self._prepare_method(
- pandas.DataFrame.applymap, func=func)
+ remote_func = self._prepare_method(pandas.DataFrame.applymap, func=func)
return self.map_partitions(remote_func)
def isin(self, **kwargs):
func = self._prepare_method(pandas.DataFrame.isin, **kwargs)
- new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("bool") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def isna(self):
func = self._prepare_method(pandas.DataFrame.isna)
- new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("bool") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def isnull(self):
func = self._prepare_method(pandas.DataFrame.isnull)
- new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("bool") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def negative(self, **kwargs):
@@ -1005,14 +1033,16 @@ def negative(self, **kwargs):
def notna(self):
func = self._prepare_method(pandas.DataFrame.notna)
- new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("bool") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def notnull(self):
func = self._prepare_method(pandas.DataFrame.notnull)
- new_dtypes = pandas.Series([np.dtype('bool') for _ in self.columns],
- index=self.columns)
+ new_dtypes = pandas.Series(
+ [np.dtype("bool") for _ in self.columns], index=self.columns
+ )
return self.map_partitions(func, new_dtypes=new_dtypes)
def round(self, **kwargs):
@@ -1053,9 +1083,9 @@ def astype(self, col_dtypes, **kwargs):
# Update the new dtype series to the proper pandas dtype
new_dtype = np.dtype(dtype)
if dtype != np.int32 and new_dtype == np.int32:
- new_dtype = np.dtype('int64')
+ new_dtype = np.dtype("int64")
elif dtype != np.float32 and new_dtype == np.float32:
- new_dtype = np.dtype('float64')
+ new_dtype = np.dtype("float64")
new_dtypes[column] = new_dtype
# Update partitions for each dtype that is updated
@@ -1069,10 +1099,10 @@ def astype(df, internal_indices=[]):
return df.astype(block_dtypes)
new_data = new_data.apply_func_to_select_indices(
- 0, astype, dtype_indices[dtype], keep_remaining=True)
+ 0, astype, dtype_indices[dtype], keep_remaining=True
+ )
- return self.__constructor__(new_data, self.index, self.columns,
- new_dtypes)
+ return self.__constructor__(new_data, self.index, self.columns, new_dtypes)
# END Map partitions across select indices
@@ -1092,7 +1122,8 @@ def full_axis_reduce(self, func, axis):
Pandas series containing the reduced data.
"""
result = self.data.map_across_full_axis(axis, func).to_pandas(
- self._is_transposed)
+ self._is_transposed
+ )
if not axis:
result.index = self.columns
@@ -1223,7 +1254,7 @@ def memory_usage(self, **kwargs):
def memory_usage_builder(df, **kwargs):
return df.memory_usage(index=False, deep=deep)
- deep = kwargs.get('deep', False)
+ deep = kwargs.get("deep", False)
func = self._prepare_method(memory_usage_builder, **kwargs)
return self.full_axis_reduce(func, 0)
@@ -1260,11 +1291,9 @@ def to_datetime_builder(df, **kwargs):
# Currently, this means a Pandas Series will be returned, but in the future
# we will implement a Distributed Series, and this will be returned
# instead.
- def full_axis_reduce_along_select_indices(self,
- func,
- axis,
- index,
- pandas_result=True):
+ def full_axis_reduce_along_select_indices(
+ self, func, axis, index, pandas_result=True
+ ):
"""Reduce Manger along select indices using function that needs full axis.
Args:
@@ -1279,11 +1308,10 @@ def full_axis_reduce_along_select_indices(self,
"""
# Convert indices to numeric indices
old_index = self.index if axis else self.columns
- numeric_indices = [
- i for i, name in enumerate(old_index) if name in index
- ]
+ numeric_indices = [i for i, name in enumerate(old_index) if name in index]
result = self.data.apply_func_to_select_indices_along_full_axis(
- axis, func, numeric_indices)
+ axis, func, numeric_indices
+ )
if pandas_result:
result = result.to_pandas(self._is_transposed)
@@ -1311,19 +1339,19 @@ def describe_builder(df, **kwargs):
# Apply describe and update indices, columns, and dtypes
func = self._prepare_method(describe_builder, **kwargs)
- new_data = self.full_axis_reduce_along_select_indices(
- func, 0, new_index, False)
+ new_data = self.full_axis_reduce_along_select_indices(func, 0, new_index, False)
new_index = self.compute_index(0, new_data, False)
new_columns = self.compute_index(1, new_data, True)
if numeric:
- new_dtypes = pandas.Series([np.float64 for _ in new_columns],
- index=new_columns)
+ new_dtypes = pandas.Series(
+ [np.float64 for _ in new_columns], index=new_columns
+ )
else:
- new_dtypes = pandas.Series([np.object for _ in new_columns],
- index=new_columns)
+ new_dtypes = pandas.Series(
+ [np.object for _ in new_columns], index=new_columns
+ )
- return self.__constructor__(new_data, new_index, new_columns,
- new_dtypes)
+ return self.__constructor__(new_data, new_index, new_columns, new_dtypes)
def median(self, **kwargs):
"""Returns median of each column or row.
@@ -1440,8 +1468,9 @@ def _cumulative_builder(self, func, **kwargs):
axis = kwargs.get("axis", 0)
func = self._prepare_method(func, **kwargs)
new_data = self.map_across_full_axis(axis, func)
- return self.__constructor__(new_data, self.index, self.columns,
- self._dtype_cache)
+ return self.__constructor__(
+ new_data, self.index, self.columns, self._dtype_cache
+ )
def cumsum(self, **kwargs):
return self._cumulative_builder(pandas.DataFrame.cumsum, **kwargs)
@@ -1493,13 +1522,11 @@ def dropna(self, **kwargs):
# Count the number of NA values and specify which are higher than
# thresh.
drop_values = {
- ax ^ 1: compute_na.isna().sum(axis=ax ^ 1) > thresh
- for ax in axis
+ ax ^ 1: compute_na.isna().sum(axis=ax ^ 1) > thresh for ax in axis
}
else:
drop_values = {
- ax ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1)
- for ax in axis
+ ax ^ 1: getattr(compute_na.isna(), how)(axis=ax ^ 1) for ax in axis
}
if 0 not in drop_values:
@@ -1508,16 +1535,25 @@ def dropna(self, **kwargs):
if 1 not in drop_values:
drop_values[1] = None
- rm_from_index = [obj for obj in compute_na.index[drop_values[1]]
- ] if drop_values[1] is not None else None
- rm_from_columns = [
- obj for obj in compute_na.columns[drop_values[0]]
- ] if drop_values[0] is not None else None
+ rm_from_index = (
+ [obj for obj in compute_na.index[drop_values[1]]]
+ if drop_values[1] is not None
+ else None
+ )
+ rm_from_columns = (
+ [obj for obj in compute_na.columns[drop_values[0]]]
+ if drop_values[0] is not None
+ else None
+ )
else:
- rm_from_index = compute_na.index[
- drop_values[1]] if drop_values[1] is not None else None
- rm_from_columns = compute_na.columns[
- drop_values[0]] if drop_values[0] is not None else None
+ rm_from_index = (
+ compute_na.index[drop_values[1]] if drop_values[1] is not None else None
+ )
+ rm_from_columns = (
+ compute_na.columns[drop_values[0]]
+ if drop_values[0] is not None
+ else None
+ )
return self.drop(index=rm_from_index, columns=rm_from_columns)
@@ -1545,8 +1581,7 @@ def eval(self, expr, **kwargs):
# in the first column
if expect_series:
if inplace:
- raise ValueError(
- "Cannot operate inplace if there is no assignment")
+ raise ValueError("Cannot operate inplace if there is no assignment")
else:
expr = "{0} = {1}".format(columns[0], expr)
@@ -1578,13 +1613,15 @@ def mode(self, **kwargs):
func = self._prepare_method(pandas.DataFrame.mode, **kwargs)
new_data = self.map_across_full_axis(axis, func)
- counts = self.__constructor__(new_data, self.index,
- self.columns).notnull().sum(axis=axis)
+ counts = (
+ self.__constructor__(new_data, self.index, self.columns)
+ .notnull()
+ .sum(axis=axis)
+ )
max_count = counts.max()
new_index = pandas.RangeIndex(max_count) if not axis else self.index
- new_columns = self.columns if not axis else pandas.RangeIndex(
- max_count)
+ new_columns = self.columns if not axis else pandas.RangeIndex(max_count)
# We have to reindex the DataFrame so that all of the partitions are
# matching in shape. The next steps ensure this happens.
@@ -1592,10 +1629,13 @@ def mode(self, **kwargs):
# We build these intermediate objects to avoid depending directly on
# the underlying implementation.
final_data = self.__constructor__(
- new_data, new_index, new_columns).map_across_full_axis(
- axis, lambda df: df.reindex(axis=axis, labels=final_labels))
- return self.__constructor__(final_data, new_index, new_columns,
- self._dtype_cache)
+ new_data, new_index, new_columns
+ ).map_across_full_axis(
+ axis, lambda df: df.reindex(axis=axis, labels=final_labels)
+ )
+ return self.__constructor__(
+ final_data, new_index, new_columns, self._dtype_cache
+ )
def fillna(self, **kwargs):
"""Replaces NaN values with the method provided.
@@ -1614,15 +1654,15 @@ def fillna(self, **kwargs):
else:
index = self.index
value = {
- idx: value[key]
- for key in value for idx in index.get_indexer_for([key])
+ idx: value[key] for key in value for idx in index.get_indexer_for([key])
}
def fillna_dict_builder(df, func_dict={}):
return df.fillna(value=func_dict, **kwargs)
new_data = self.data.apply_func_to_select_indices(
- axis, fillna_dict_builder, value, keep_remaining=True)
+ axis, fillna_dict_builder, value, keep_remaining=True
+ )
return self.__constructor__(new_data, self.index, self.columns)
else:
func = self._prepare_method(pandas.DataFrame.fillna, **kwargs)
@@ -1655,8 +1695,7 @@ def query_builder(df, **kwargs):
# Query removes rows, so we need to update the index
new_index = self.compute_index(0, new_data, True)
- return self.__constructor__(new_data, new_index, self.columns,
- self.dtypes)
+ return self.__constructor__(new_data, new_index, self.columns, self.dtypes)
def rank(self, **kwargs):
"""Computes numerical rank along axis. Equal values are set to the average.
@@ -1676,10 +1715,8 @@ def rank(self, **kwargs):
new_columns = self.compute_index(1, new_data, True)
else:
new_columns = self.columns
- new_dtypes = pandas.Series([np.float64 for _ in new_columns],
- index=new_columns)
- return self.__constructor__(new_data, self.index, new_columns,
- new_dtypes)
+ new_dtypes = pandas.Series([np.float64 for _ in new_columns], index=new_columns)
+ return self.__constructor__(new_data, self.index, new_columns, new_dtypes)
# END Map across rows/columns
@@ -1687,11 +1724,9 @@ def rank(self, **kwargs):
# These operations require some global knowledge of the full column/row
# that is being operated on. This means that we have to put all of that
# data in the same place.
- def map_across_full_axis_select_indices(self,
- axis,
- func,
- indices,
- keep_remaining=False):
+ def map_across_full_axis_select_indices(
+ self, axis, func, indices, keep_remaining=False
+ ):
"""Maps function to select indices along full axis.
Args:
@@ -1704,7 +1739,8 @@ def map_across_full_axis_select_indices(self,
BlockPartitions containing the result of mapping func over axis on indices.
"""
return self.data.apply_func_to_select_indices_along_full_axis(
- axis, func, indices, keep_remaining)
+ axis, func, indices, keep_remaining
+ )
def quantile_for_list_of_values(self, **kwargs):
"""Returns Manager containing quantiles along an axis for numeric columns.
@@ -1721,22 +1757,22 @@ def quantile_for_list_of_values(self, **kwargs):
new_columns = self.numeric_columns()
else:
new_columns = [
- col for col, dtype in zip(self.columns, self.dtypes)
- if (is_numeric_dtype(dtype)
- or is_datetime_or_timedelta_dtype(dtype))
+ col
+ for col, dtype in zip(self.columns, self.dtypes)
+ if (is_numeric_dtype(dtype) or is_datetime_or_timedelta_dtype(dtype))
]
if axis:
# If along rows, then drop the nonnumeric columns, record the index, and
# take transpose. We have to do this because if we don't, the result is all
# in one column for some reason.
nonnumeric = [
- col for col, dtype in zip(self.columns, self.dtypes)
+ col
+ for col, dtype in zip(self.columns, self.dtypes)
if not is_numeric_dtype(dtype)
]
data_manager = self.drop(columns=nonnumeric)
new_columns = data_manager.index
- numeric_indices = list(
- data_manager.index.get_indexer_for(new_columns))
+ numeric_indices = list(data_manager.index.get_indexer_for(new_columns))
data_manager = data_manager.transpose()
kwargs.pop("axis")
else:
@@ -1749,7 +1785,8 @@ def quantile_builder(df, internal_indices=[], **kwargs):
func = self._prepare_method(quantile_builder, **kwargs)
q_index = pandas.Float64Index(q)
new_data = data_manager.map_across_full_axis_select_indices(
- 0, func, numeric_indices)
+ 0, func, numeric_indices
+ )
return self.__constructor__(new_data, q_index, new_columns)
# END Map across rows/columns
@@ -1773,13 +1810,16 @@ def head(self, n):
# on a transposed manager is already set to the correct value, so
# we need to only take the head of that instead of re-transposing.
result = self.__constructor__(
- self.data.transpose().take(1, n).transpose(), self.index[:n],
- self.columns, self._dtype_cache)
+ self.data.transpose().take(1, n).transpose(),
+ self.index[:n],
+ self.columns,
+ self._dtype_cache,
+ )
result._is_transposed = True
else:
result = self.__constructor__(
- self.data.take(0, n), self.index[:n], self.columns,
- self._dtype_cache)
+ self.data.take(0, n), self.index[:n], self.columns, self._dtype_cache
+ )
return result
def tail(self, n):
@@ -1794,13 +1834,16 @@ def tail(self, n):
# See head for an explanation of the transposed behavior
if self._is_transposed:
result = self.__constructor__(
- self.data.transpose().take(1, -n).transpose(), self.index[-n:],
- self.columns, self._dtype_cache)
+ self.data.transpose().take(1, -n).transpose(),
+ self.index[-n:],
+ self.columns,
+ self._dtype_cache,
+ )
result._is_transposed = True
else:
result = self.__constructor__(
- self.data.take(0, -n), self.index[-n:], self.columns,
- self._dtype_cache)
+ self.data.take(0, -n), self.index[-n:], self.columns, self._dtype_cache
+ )
return result
@@ -1816,13 +1859,16 @@ def front(self, n):
# See head for an explanation of the transposed behavior
if self._is_transposed:
result = self.__constructor__(
- self.data.transpose().take(0, n).transpose(), self.index,
- self.columns[:n], self.dtypes[:n])
+ self.data.transpose().take(0, n).transpose(),
+ self.index,
+ self.columns[:n],
+ self.dtypes[:n],
+ )
result._is_transposed = True
else:
result = self.__constructor__(
- self.data.take(1, n), self.index, self.columns[:n],
- self.dtypes[:n])
+ self.data.take(1, n), self.index, self.columns[:n], self.dtypes[:n]
+ )
return result
def back(self, n):
@@ -1837,13 +1883,16 @@ def back(self, n):
# See head for an explanation of the transposed behavior
if self._is_transposed:
result = self.__constructor__(
- self.data.transpose().take(0, -n).transpose(), self.index,
- self.columns[-n:], self.dtypes[-n:])
+ self.data.transpose().take(0, -n).transpose(),
+ self.index,
+ self.columns[-n:],
+ self.dtypes[-n:],
+ )
result._is_transposed = True
else:
result = self.__constructor__(
- self.data.take(1, -n), self.index, self.columns[-n:],
- self.dtypes[-n:])
+ self.data.take(1, -n), self.index, self.columns[-n:], self.dtypes[-n:]
+ )
return result
# End Head/Tail/Front/Back
@@ -1928,14 +1977,14 @@ def getitem(df, internal_indices=[]):
return df.iloc[:, internal_indices]
result = self.data.apply_func_to_select_indices(
- 0, getitem, numeric_indices, keep_remaining=False)
+ 0, getitem, numeric_indices, keep_remaining=False
+ )
# We can't just set the columns to key here because there may be
# multiple instances of a key.
new_columns = self.columns[numeric_indices]
new_dtypes = self.dtypes[numeric_indices]
- return self.__constructor__(result, self.index, new_columns,
- new_dtypes)
+ return self.__constructor__(result, self.index, new_columns, new_dtypes)
def getitem_row_array(self, key):
"""Get row data for target labels.
@@ -1953,12 +2002,12 @@ def getitem(df, internal_indices=[]):
return df.iloc[internal_indices]
result = self.data.apply_func_to_select_indices(
- 1, getitem, numeric_indices, keep_remaining=False)
+ 1, getitem, numeric_indices, keep_remaining=False
+ )
# We can't just set the index to key here because there may be multiple
# instances of a key.
new_index = self.index[numeric_indices]
- return self.__constructor__(result, new_index, self.columns,
- self._dtype_cache)
+ return self.__constructor__(result, new_index, self.columns, self._dtype_cache)
# END __getitem__ methods
@@ -1987,11 +2036,13 @@ def delitem(df, internal_indices=[]):
numeric_indices = list(self.index.get_indexer_for(index))
new_data = self.data.apply_func_to_select_indices(
- 1, delitem, numeric_indices, keep_remaining=True)
+ 1, delitem, numeric_indices, keep_remaining=True
+ )
# We can't use self.index.drop with duplicate keys because in Pandas
# it throws an error.
new_index = [
- self.index[i] for i in range(len(self.index))
+ self.index[i]
+ for i in range(len(self.index))
if i not in numeric_indices
]
@@ -2005,16 +2056,17 @@ def delitem(df, internal_indices=[]):
numeric_indices = list(self.columns.get_indexer_for(columns))
new_data = new_data.apply_func_to_select_indices(
- 0, delitem, numeric_indices, keep_remaining=True)
+ 0, delitem, numeric_indices, keep_remaining=True
+ )
# We can't use self.columns.drop with duplicate keys because in Pandas
# it throws an error.
new_columns = [
- self.columns[i] for i in range(len(self.columns))
+ self.columns[i]
+ for i in range(len(self.columns))
if i not in numeric_indices
]
new_dtypes = self.dtypes.drop(columns)
- return self.__constructor__(new_data, new_index, new_columns,
- new_dtypes)
+ return self.__constructor__(new_data, new_index, new_columns, new_dtypes)
# END __delitem__ and drop
@@ -2041,7 +2093,8 @@ def insert(df, internal_indices=[]):
return df
new_data = self.data.apply_func_to_select_indices_along_full_axis(
- 0, insert, loc, keep_remaining=True)
+ 0, insert, loc, keep_remaining=True
+ )
new_columns = self.columns.insert(loc, column)
# Because a Pandas Series does not allow insert, we make a DataFrame
@@ -2050,8 +2103,7 @@ def insert(df, internal_indices=[]):
temp_dtypes.insert(loc, column, _get_dtype_from_object(value))
new_dtypes = temp_dtypes.iloc[0]
- return self.__constructor__(new_data, self.index, new_columns,
- new_dtypes)
+ return self.__constructor__(new_data, self.index, new_columns, new_dtypes)
# END Insert
@@ -2108,11 +2160,17 @@ def _post_process_apply(self, result_data, axis, try_scale=True):
# this logic here.
if len(columns) == 0:
series_result = result_data.to_pandas(False)
- if not axis and len(series_result) == len(
- self.columns) and len(index) != len(series_result):
+ if (
+ not axis
+ and len(series_result) == len(self.columns)
+ and len(index) != len(series_result)
+ ):
index = self.columns
- elif axis and len(series_result) == len(
- self.index) and len(index) != len(series_result):
+ elif (
+ axis
+ and len(series_result) == len(self.index)
+ and len(index) != len(series_result)
+ ):
index = self.index
series_result.index = index
@@ -2138,16 +2196,14 @@ def _dict_func(self, func, axis, *args, **kwargs):
else:
index = self.index
- func = {
- idx: func[key]
- for key in func for idx in index.get_indexer_for([key])
- }
+ func = {idx: func[key] for key in func for idx in index.get_indexer_for([key])}
def dict_apply_builder(df, func_dict={}):
return df.apply(func_dict, *args, **kwargs)
result_data = self.data.apply_func_to_select_indices_along_full_axis(
- axis, dict_apply_builder, func, keep_remaining=False)
+ axis, dict_apply_builder, func, keep_remaining=False
+ )
full_result = self._post_process_apply(result_data, axis)
@@ -2169,14 +2225,11 @@ def _list_like_func(self, func, axis, *args, **kwargs):
Returns:
A new PandasDataManager.
"""
- func_prepared = self._prepare_method(
- lambda df: df.apply(func, *args, **kwargs))
+ func_prepared = self._prepare_method(lambda df: df.apply(func, *args, **kwargs))
new_data = self.map_across_full_axis(axis, func_prepared)
# When the function is list-like, the function names become the index
- new_index = [
- f if isinstance(f, string_types) else f.__name__ for f in func
- ]
+ new_index = [f if isinstance(f, string_types) else f.__name__ for f in func]
return self.__constructor__(new_data, new_index, self.columns)
def _callable_func(self, func, axis, *args, **kwargs):
@@ -2236,11 +2289,9 @@ def groupby_agg_builder(df):
df.index = remote_index
else:
df.columns = remote_index
- return agg_func(
- df.groupby(by=by, axis=axis, **groupby_args), **agg_args)
+ return agg_func(df.groupby(by=by, axis=axis, **groupby_args), **agg_args)
- func_prepared = self._prepare_method(
- lambda df: groupby_agg_builder(df))
+ func_prepared = self._prepare_method(lambda df: groupby_agg_builder(df))
result_data = self.map_across_full_axis(axis, func_prepared)
return self._post_process_apply(result_data, axis, try_scale=False)
@@ -2260,9 +2311,7 @@ def get_dummies(self, columns, **kwargs):
# `columns` as None does not mean all columns, by default it means only
# non-numeric columns.
if columns is None:
- columns = [
- c for c in self.columns if not is_numeric_dtype(self.dtypes[c])
- ]
+ columns = [c for c in self.columns if not is_numeric_dtype(self.dtypes[c])]
# If we aren't computing any dummies, there is no need for any
# remote compute.
@@ -2286,7 +2335,8 @@ def set_columns(df, columns):
set_cols = self.columns
columns_applied = self.map_across_full_axis(
- 1, lambda df: set_columns(df, set_cols))
+ 1, lambda df: set_columns(df, set_cols)
+ )
# In some cases, we are mapping across all of the data. It is more
# efficient if we are mapping over all of the data to do it this way
@@ -2307,11 +2357,13 @@ def get_dummies_builder(df):
def get_dummies_builder(df, internal_indices=[]):
return pandas.get_dummies(
- df.iloc[:, internal_indices], columns=None, **kwargs)
+ df.iloc[:, internal_indices], columns=None, **kwargs
+ )
numeric_indices = list(self.columns.get_indexer_for(columns))
new_data = columns_applied.apply_func_to_select_indices_along_full_axis(
- 0, get_dummies_builder, numeric_indices, keep_remaining=False)
+ 0, get_dummies_builder, numeric_indices, keep_remaining=False
+ )
untouched_data = self.drop(columns=columns)
# Since we set the columns in the beginning, we can just extract them
@@ -2324,26 +2376,30 @@ def get_dummies_builder(df, internal_indices=[]):
# not selected.
if len(columns) != len(self.columns):
new_data = untouched_data.data.concat(1, new_data)
- final_columns = untouched_data.columns.append(
- pandas.Index(final_columns))
+ final_columns = untouched_data.columns.append(pandas.Index(final_columns))
return cls(new_data, self.index, final_columns)
# Indexing
def view(self, index=None, columns=None):
- index_map_series = pandas.Series(
- np.arange(len(self.index)), index=self.index)
+ index_map_series = pandas.Series(np.arange(len(self.index)), index=self.index)
column_map_series = pandas.Series(
- np.arange(len(self.columns)), index=self.columns)
+ np.arange(len(self.columns)), index=self.columns
+ )
if index is not None:
index_map_series = index_map_series.reindex(index)
if columns is not None:
column_map_series = column_map_series.reindex(columns)
- return PandasDataManagerView(self.data, index_map_series.index,
- column_map_series.index, self.dtypes,
- index_map_series, column_map_series)
+ return PandasDataManagerView(
+ self.data,
+ index_map_series.index,
+ column_map_series.index,
+ self.dtypes,
+ index_map_series,
+ column_map_series,
+ )
def squeeze(self, ndim=0, axis=None):
squeezed = self.data.to_pandas().squeeze()
@@ -2358,10 +2414,8 @@ def squeeze(self, ndim=0, axis=None):
return squeezed
- def write_items(self, row_numeric_index, col_numeric_index,
- broadcasted_items):
- def iloc_mut(partition, row_internal_indices, col_internal_indices,
- item):
+ def write_items(self, row_numeric_index, col_numeric_index, broadcasted_items):
+ def iloc_mut(partition, row_internal_indices, col_internal_indices, item):
partition = partition.copy()
partition.iloc[row_internal_indices, col_internal_indices] = item
return partition
@@ -2371,7 +2425,8 @@ def iloc_mut(partition, row_internal_indices, col_internal_indices,
row_indices=row_numeric_index,
col_indices=col_numeric_index,
mutate=True,
- item_to_distribute=broadcasted_items)
+ item_to_distribute=broadcasted_items,
+ )
self.data = mutated_blk_partitions
def global_idx_to_numeric_idx(self, axis, indices):
@@ -2385,25 +2440,34 @@ def global_idx_to_numeric_idx(self, axis, indices):
Returns:
An Index object.
"""
- assert axis in ['row', 'col', 'columns']
- if axis == 'row':
+ assert axis in ["row", "col", "columns"]
+ if axis == "row":
return pandas.Index(
- pandas.Series(np.arange(len(self.index)),
- index=self.index).loc[indices].values)
- elif axis in ['col', 'columns']:
+ pandas.Series(np.arange(len(self.index)), index=self.index)
+ .loc[indices]
+ .values
+ )
+ elif axis in ["col", "columns"]:
return pandas.Index(
- pandas.Series(
- np.arange(len(self.columns)),
- index=self.columns).loc[indices].values)
+ pandas.Series(np.arange(len(self.columns)), index=self.columns)
+ .loc[indices]
+ .values
+ )
def enlarge_partitions(self, new_row_labels=None, new_col_labels=None):
new_data = self.data.enlarge_partitions(
- len(new_row_labels), len(new_col_labels))
- concated_index = self.index.append(type(
- self.index)(new_row_labels)) if new_row_labels else self.index
- concated_columns = self.columns.append(
- type(self.columns)(
- new_col_labels)) if new_col_labels else self.columns
+ len(new_row_labels), len(new_col_labels)
+ )
+ concated_index = (
+ self.index.append(type(self.index)(new_row_labels))
+ if new_row_labels
+ else self.index
+ )
+ concated_columns = (
+ self.columns.append(type(self.columns)(new_col_labels))
+ if new_col_labels
+ else self.columns
+ )
return self.__constructor__(new_data, concated_index, concated_columns)
@@ -2415,13 +2479,15 @@ class PandasDataManagerView(PandasDataManager):
- (len(self.index), len(self.columns)) != self.data.shape
"""
- def __init__(self,
- block_partitions_object: BlockPartitions,
- index: pandas.Index,
- columns: pandas.Index,
- dtypes=None,
- index_map_series: pandas.Series = None,
- columns_map_series: pandas.Series = None):
+ def __init__(
+ self,
+ block_partitions_object: BlockPartitions,
+ index: pandas.Index,
+ columns: pandas.Index,
+ dtypes=None,
+ index_map_series: pandas.Series = None,
+ columns_map_series: pandas.Series = None,
+ ):
"""
Args:
index_map_series: a Pandas Series Object mapping user-facing index to
@@ -2438,19 +2504,28 @@ def __init__(self,
self.columns_map = columns_map_series
self.is_view = True
- PandasDataManager.__init__(self, block_partitions_object, index,
- columns, dtypes)
+ PandasDataManager.__init__(
+ self, block_partitions_object, index, columns, dtypes
+ )
- def __constructor__(self,
- block_partitions_object: BlockPartitions,
- index: pandas.Index,
- columns: pandas.Index,
- dtypes=None):
+ def __constructor__(
+ self,
+ block_partitions_object: BlockPartitions,
+ index: pandas.Index,
+ columns: pandas.Index,
+ dtypes=None,
+ ):
new_index_map = self.index_map.reindex(index)
new_columns_map = self.columns_map.reindex(columns)
- return type(self)(block_partitions_object, index, columns, dtypes,
- new_index_map, new_columns_map)
+ return type(self)(
+ block_partitions_object,
+ index,
+ columns,
+ dtypes,
+ new_index_map,
+ new_columns_map,
+ )
def _get_data(self) -> BlockPartitions:
"""Perform the map step
@@ -2467,7 +2542,8 @@ def iloc(partition, row_internal_indices, col_internal_indices):
row_indices=self.index_map.values,
col_indices=self.columns_map.values,
lazy=True,
- keep_remaining=False)
+ keep_remaining=False,
+ )
return masked_data
def _set_data(self, new_data):
@@ -2479,8 +2555,8 @@ def _set_data(self, new_data):
data = property(_get_data, _set_data)
def global_idx_to_numeric_idx(self, axis, indices):
- assert axis in ['row', 'col', 'columns']
- if axis == 'row':
+ assert axis in ["row", "col", "columns"]
+ if axis == "row":
return self.index_map.loc[indices].index
- elif axis in ['col', 'columns']:
+ elif axis in ["col", "columns"]:
return self.columns_map.loc[indices].index
diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py
index fea12f12b7b..9702a6634ec 100644
--- a/modin/data_management/factories.py
+++ b/modin/data_management/factories.py
@@ -13,8 +13,7 @@
class BaseFactory(object):
@classmethod
def _determine_engine(cls):
- factory_name = \
- partition_format + "Backed" + execution_engine + "Factory"
+ factory_name = partition_format + "Backed" + execution_engine + "Factory"
return getattr(sys.modules[__name__], factory_name)
diff --git a/modin/data_management/partitioning/axis_partition.py b/modin/data_management/partitioning/axis_partition.py
index dc8e760647c..007e92a9047 100644
--- a/modin/data_management/partitioning/axis_partition.py
+++ b/modin/data_management/partitioning/axis_partition.py
@@ -28,8 +28,7 @@ class AxisPartition(object):
The only abstract method needed to implement is the `apply` method.
"""
- def apply(self, func, num_splits=None, other_axis_partition=None,
- **kwargs):
+ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs):
"""Applies a function to a full axis.
Note: The procedures that invoke this method assume full axis
@@ -71,8 +70,7 @@ def __init__(self, list_of_blocks):
# Unwrap from RemotePartition object for ease of use
self.list_of_blocks = [obj.oid for obj in list_of_blocks]
- def apply(self, func, num_splits=None, other_axis_partition=None,
- **kwargs):
+ def apply(self, func, num_splits=None, other_axis_partition=None, **kwargs):
"""Applies func to the object in the plasma store.
See notes in Parent class about this method.
@@ -93,18 +91,17 @@ def apply(self, func, num_splits=None, other_axis_partition=None,
return [
RayRemotePartition(obj)
for obj in deploy_ray_func_between_two_axis_partitions._submit(
- args=(self.axis, func, num_splits,
- len(self.list_of_blocks), kwargs) +
- tuple(self.list_of_blocks +
- other_axis_partition.list_of_blocks),
- num_return_vals=num_splits)
+ args=(self.axis, func, num_splits, len(self.list_of_blocks), kwargs)
+ + tuple(self.list_of_blocks + other_axis_partition.list_of_blocks),
+ num_return_vals=num_splits,
+ )
]
args = [self.axis, func, num_splits, kwargs]
args.extend(self.list_of_blocks)
return [
- RayRemotePartition(obj) for obj in deploy_ray_axis_func._submit(
- args, num_return_vals=num_splits)
+ RayRemotePartition(obj)
+ for obj in deploy_ray_axis_func._submit(args, num_return_vals=num_splits)
]
def shuffle(self, func, num_splits=None, **kwargs):
@@ -123,8 +120,8 @@ def shuffle(self, func, num_splits=None, **kwargs):
args = [self.axis, func, num_splits, kwargs]
args.extend(self.list_of_blocks)
return [
- RayRemotePartition(obj) for obj in deploy_ray_axis_func._submit(
- args, num_return_vals=num_splits)
+ RayRemotePartition(obj)
+ for obj in deploy_ray_axis_func._submit(args, num_return_vals=num_splits)
]
@@ -133,6 +130,7 @@ class RayColumnPartition(RayAxisPartition):
for this class is in the parent class, and this class defines the axis
to perform the computation over.
"""
+
axis = 0
@@ -141,6 +139,7 @@ class RayRowPartition(RayAxisPartition):
for this class is in the parent class, and this class defines the axis
to perform the computation over.
"""
+
axis = 1
@@ -160,13 +159,12 @@ def split_result_of_axis_func_pandas(axis, num_splits, result):
if axis == 0 or type(result) is pandas.Series:
chunksize = compute_chunksize(len(result), num_splits)
return [
- result.iloc[chunksize * i:chunksize * (i + 1)]
- for i in range(num_splits)
+ result.iloc[chunksize * i : chunksize * (i + 1)] for i in range(num_splits)
]
else:
chunksize = compute_chunksize(len(result.columns), num_splits)
return [
- result.iloc[:, chunksize * i:chunksize * (i + 1)]
+ result.iloc[:, chunksize * i : chunksize * (i + 1)]
for i in range(num_splits)
]
@@ -193,7 +191,8 @@ def deploy_ray_axis_func(axis, func, num_splits, kwargs, *partitions):
@ray.remote
def deploy_ray_func_between_two_axis_partitions(
- axis, func, num_splits, len_of_left, kwargs, *partitions):
+ axis, func, num_splits, len_of_left, kwargs, *partitions
+):
"""Deploy a function along a full axis between two data sets in Ray.
Args:
@@ -210,10 +209,8 @@ def deploy_ray_func_between_two_axis_partitions(
Returns:
A list of Pandas DataFrames.
"""
- lt_frame = pandas.concat(
- list(partitions[:len_of_left]), axis=axis, copy=False)
- rt_frame = pandas.concat(
- list(partitions[len_of_left:]), axis=axis, copy=False)
+ lt_frame = pandas.concat(list(partitions[:len_of_left]), axis=axis, copy=False)
+ rt_frame = pandas.concat(list(partitions[len_of_left:]), axis=axis, copy=False)
result = func(lt_frame, rt_frame, **kwargs)
return split_result_of_axis_func_pandas(axis, num_splits, result)
diff --git a/modin/data_management/partitioning/partition_collections.py b/modin/data_management/partitioning/partition_collections.py
index 9a7ba70451c..43bfd893cc0 100644
--- a/modin/data_management/partitioning/partition_collections.py
+++ b/modin/data_management/partitioning/partition_collections.py
@@ -102,9 +102,7 @@ def block_lengths(self):
# The first column will have the correct lengths. We have an
# invariant that requires that all blocks be the same length in a
# row of blocks.
- self._lengths_cache = [
- obj.length().get() for obj in self.partitions.T[0]
- ]
+ self._lengths_cache = [obj.length().get() for obj in self.partitions.T[0]]
return self._lengths_cache
# Widths of the blocks
@@ -121,9 +119,7 @@ def block_widths(self):
# The first column will have the correct lengths. We have an
# invariant that requires that all blocks be the same width in a
# column of blocks.
- self._widths_cache = [
- obj.width().get() for obj in self.partitions[0]
- ]
+ self._widths_cache = [obj.width().get() for obj in self.partitions[0]]
return self._widths_cache
@property
@@ -158,12 +154,16 @@ def full_reduce(self, map_func, reduce_func, axis):
# DataFrame. The individual partitions return Series objects, and those
# cannot be concatenated the correct way without casting them as
# DataFrames.
- full_frame = pandas.concat([
- pandas.concat(
- [pandas.DataFrame(part.get()).T for part in row_of_parts],
- axis=axis ^ 1) for row_of_parts in mapped_parts
- ],
- axis=axis)
+ full_frame = pandas.concat(
+ [
+ pandas.concat(
+ [pandas.DataFrame(part.get()).T for part in row_of_parts],
+ axis=axis ^ 1,
+ )
+ for row_of_parts in mapped_parts
+ ],
+ axis=axis,
+ )
# Transpose because operations where axis == 1 assume that the
# operation is performed across the other axis
@@ -186,17 +186,25 @@ def map_across_blocks(self, map_func):
preprocessed_map_func = self.preprocess_func(map_func)
new_partitions = np.array(
- [[part.apply(preprocessed_map_func) for part in row_of_parts]
- for row_of_parts in self.partitions])
+ [
+ [part.apply(preprocessed_map_func) for part in row_of_parts]
+ for row_of_parts in self.partitions
+ ]
+ )
return cls(new_partitions)
def lazy_map_across_blocks(self, map_func, kwargs):
cls = type(self)
preprocessed_map_func = self.preprocess_func(map_func)
- new_partitions = np.array([[
- part.add_to_apply_calls(preprocessed_map_func, kwargs)
- for part in row_of_parts
- ] for row_of_parts in self.partitions])
+ new_partitions = np.array(
+ [
+ [
+ part.add_to_apply_calls(preprocessed_map_func, kwargs)
+ for part in row_of_parts
+ ]
+ for row_of_parts in self.partitions
+ ]
+ )
return cls(new_partitions)
def map_across_full_axis(self, axis, map_func):
@@ -220,10 +228,9 @@ def map_across_full_axis(self, axis, map_func):
preprocessed_map_func = self.preprocess_func(map_func)
partitions = self.column_partitions if not axis else self.row_partitions
- result_blocks = np.array([
- part.apply(preprocessed_map_func, num_splits)
- for part in partitions
- ])
+ result_blocks = np.array(
+ [part.apply(preprocessed_map_func, num_splits) for part in partitions]
+ )
# If we are mapping over columns, they are returned to use the same as
# rows, so we need to transpose the returned 2D numpy array to return
# the structure to the correct order.
@@ -272,16 +279,22 @@ def take(self, axis, n):
# We build this iloc to avoid creating a bunch of helper methods.
# This code creates slice objects to be passed to `iloc` to grab
# the last n rows or columns depending on axis.
- slice_obj = slice(-remaining, None) if axis == 0 else (
- slice(None), slice(-remaining, None))
+ slice_obj = (
+ slice(-remaining, None)
+ if axis == 0
+ else (slice(None), slice(-remaining, None))
+ )
func = self.preprocess_func(lambda df: df.iloc[slice_obj])
# We use idx + 1 here because the loop is not inclusive, and we
# need to iterate through idx.
- result = np.array([
- partitions[i] if i != idx else
- [obj.apply(func) for obj in partitions[i]]
- for i in range(idx + 1)
- ])[::-1]
+ result = np.array(
+ [
+ partitions[i]
+ if i != idx
+ else [obj.apply(func) for obj in partitions[i]]
+ for i in range(idx + 1)
+ ]
+ )[::-1]
else:
length_bins = np.cumsum(bin_lengths)
idx = int(np.digitize(n, length_bins))
@@ -296,15 +309,19 @@ def take(self, axis, n):
# We build this iloc to avoid creating a bunch of helper methods.
# This code creates slice objects to be passed to `iloc` to grab
# the first n rows or columns depending on axis.
- slice_obj = slice(remaining) if axis == 0 else (
- slice(None), slice(remaining))
+ slice_obj = (
+ slice(remaining) if axis == 0 else (slice(None), slice(remaining))
+ )
func = self.preprocess_func(lambda df: df.iloc[slice_obj])
# See note above about idx + 1
- result = np.array([
- partitions[i] if i != idx else
- [obj.apply(func) for obj in partitions[i]]
- for i in range(idx + 1)
- ])
+ result = np.array(
+ [
+ partitions[i]
+ if i != idx
+ else [obj.apply(func) for obj in partitions[i]]
+ for i in range(idx + 1)
+ ]
+ )
return cls(result.T) if axis else cls(result)
@@ -326,11 +343,9 @@ def concat(self, axis, other_blocks):
cls = type(self)
if type(other_blocks) is list:
other_blocks = [blocks.partitions for blocks in other_blocks]
- return cls(
- np.concatenate([self.partitions] + other_blocks, axis=axis))
+ return cls(np.concatenate([self.partitions] + other_blocks, axis=axis))
else:
- return cls(
- np.append(self.partitions, other_blocks.partitions, axis=axis))
+ return cls(np.append(self.partitions, other_blocks.partitions, axis=axis))
def copy(self):
"""Create a copy of this object.
@@ -367,16 +382,21 @@ def to_pandas(self, is_transposed=False):
if is_transposed:
return self.transpose().to_pandas(False).T
else:
- retrieved_objects = [[obj.to_pandas() for obj in part]
- for part in self.partitions]
+ retrieved_objects = [
+ [obj.to_pandas() for obj in part] for part in self.partitions
+ ]
if all(
- isinstance(part, pandas.Series)
- for row in retrieved_objects for part in row):
+ isinstance(part, pandas.Series)
+ for row in retrieved_objects
+ for part in row
+ ):
axis = 0
retrieved_objects = np.array(retrieved_objects).T
elif all(
- isinstance(part, pandas.DataFrame)
- for row in retrieved_objects for part in row):
+ isinstance(part, pandas.DataFrame)
+ for row in retrieved_objects
+ for part in row
+ ):
axis = 1
else:
raise ValueError(
@@ -403,15 +423,15 @@ def from_pandas(cls, df):
# Each chunk must have a RangeIndex that spans its length and width
# according to our invariant.
def chunk_builder(i, j):
- chunk = df.iloc[i:i + row_chunksize, j:j + col_chunksize]
+ chunk = df.iloc[i : i + row_chunksize, j : j + col_chunksize]
chunk.index = pandas.RangeIndex(len(chunk.index))
chunk.columns = pandas.RangeIndex(len(chunk.columns))
return put_func(chunk)
- parts = [[
- chunk_builder(i, j)
- for j in range(0, len(df.columns), col_chunksize)
- ] for i in range(0, len(df), row_chunksize)]
+ parts = [
+ [chunk_builder(i, j) for j in range(0, len(df.columns), col_chunksize)]
+ for i in range(0, len(df), row_chunksize)
+ ]
return cls(np.array(parts))
@@ -431,31 +451,25 @@ def get_indices(self, axis=0, index_func=None, old_blocks=None):
Returns:
A Pandas Index object.
"""
- assert callable(index_func), \
- "Must tell this function how to extract index"
+ assert callable(index_func), "Must tell this function how to extract index"
if axis == 0:
func = self.preprocess_func(index_func)
# We grab the first column of blocks and extract the indices
- new_indices = [
- idx.apply(func).get() for idx in self.partitions.T[0]
- ]
+ new_indices = [idx.apply(func).get() for idx in self.partitions.T[0]]
# This is important because sometimes we have resized the data. The new
# sizes will not be valid if we are trying to compute the index on a
# new object that has a different length.
if old_blocks is not None:
- cumulative_block_lengths = np.array(
- old_blocks.block_lengths).cumsum()
+ cumulative_block_lengths = np.array(old_blocks.block_lengths).cumsum()
else:
- cumulative_block_lengths = np.array(
- self.block_lengths).cumsum()
+ cumulative_block_lengths = np.array(self.block_lengths).cumsum()
else:
func = self.preprocess_func(index_func)
new_indices = [idx.apply(func).get() for idx in self.partitions[0]]
if old_blocks is not None:
- cumulative_block_lengths = np.array(
- old_blocks.block_widths).cumsum()
+ cumulative_block_lengths = np.array(old_blocks.block_widths).cumsum()
else:
cumulative_block_lengths = np.array(self.block_widths).cumsum()
@@ -469,8 +483,7 @@ def get_indices(self, axis=0, index_func=None, old_blocks=None):
# The try-except here is intended to catch issues where we are
# trying to get a string index out of the internal index.
try:
- append_val = new_indices[i] + cumulative_block_lengths[i -
- 1]
+ append_val = new_indices[i] + cumulative_block_lengths[i - 1]
except TypeError:
append_val = new_indices[i]
@@ -488,6 +501,7 @@ def _compute_num_partitions(cls):
:return:
"""
from ...pandas import DEFAULT_NPARTITIONS
+
return DEFAULT_NPARTITIONS
# Extracting rows/columns
@@ -511,15 +525,21 @@ def _get_blocks_containing_index(self, axis, index):
block_idx = int(np.digitize(index, cumulative_column_widths))
# Compute the internal index based on the previous lengths. This
# is a global index, so we must subtract the lengths first.
- internal_idx = index if not block_idx else index - cumulative_column_widths[
- block_idx - 1]
+ internal_idx = (
+ index
+ if not block_idx
+ else index - cumulative_column_widths[block_idx - 1]
+ )
return block_idx, internal_idx
else:
cumulative_row_lengths = np.array(self.block_lengths).cumsum()
block_idx = int(np.digitize(index, cumulative_row_lengths))
# See note above about internal index
- internal_idx = index if not block_idx else index - cumulative_row_lengths[
- block_idx - 1]
+ internal_idx = (
+ index
+ if not block_idx
+ else index - cumulative_row_lengths[block_idx - 1]
+ )
return block_idx, internal_idx
def _get_dict_of_block_index(self, axis, indices):
@@ -567,11 +587,7 @@ def _apply_func_to_list_of_partitions(self, func, partitions, **kwargs):
preprocessed_func = self.preprocess_func(func)
return [obj.apply(preprocessed_func, **kwargs) for obj in partitions]
- def apply_func_to_select_indices(self,
- axis,
- func,
- indices,
- keep_remaining=False):
+ def apply_func_to_select_indices(self, axis, func, indices, keep_remaining=False):
"""Applies a function to select indices.
Note: Your internal function must take a kwarg `internal_indices` for
@@ -617,58 +633,70 @@ def apply_func_to_select_indices(self,
# accept a keyword argument `func_dict`.
if dict_indices is not None:
if not keep_remaining:
- result = np.array([
- self._apply_func_to_list_of_partitions(
- func,
- partitions_for_apply[i],
- func_dict={
- idx: dict_indices[idx]
- for idx in partitions_dict[i]
- }) for i in partitions_dict
- ])
+ result = np.array(
+ [
+ self._apply_func_to_list_of_partitions(
+ func,
+ partitions_for_apply[i],
+ func_dict={
+ idx: dict_indices[idx] for idx in partitions_dict[i]
+ },
+ )
+ for i in partitions_dict
+ ]
+ )
else:
- result = np.array([
- partitions_for_apply[i] if i not in partitions_dict else
- self._apply_func_to_list_of_partitions(
- func,
- partitions_for_apply[i],
- func_dict={
- idx: dict_indices[i]
- for idx in partitions_dict[i]
- }) for i in range(len(partitions_for_apply))
- ])
+ result = np.array(
+ [
+ partitions_for_apply[i]
+ if i not in partitions_dict
+ else self._apply_func_to_list_of_partitions(
+ func,
+ partitions_for_apply[i],
+ func_dict={
+ idx: dict_indices[i] for idx in partitions_dict[i]
+ },
+ )
+ for i in range(len(partitions_for_apply))
+ ]
+ )
else:
if not keep_remaining:
# We are passing internal indices in here. In order for func to
# actually be able to use this information, it must be able to take in
# the internal indices. This might mean an iloc in the case of Pandas
# or some other way to index into the internal representation.
- result = np.array([
- self._apply_func_to_list_of_partitions(
- func,
- partitions_for_apply[i],
- internal_indices=partitions_dict[i])
- for i in partitions_dict
- ])
+ result = np.array(
+ [
+ self._apply_func_to_list_of_partitions(
+ func,
+ partitions_for_apply[i],
+ internal_indices=partitions_dict[i],
+ )
+ for i in partitions_dict
+ ]
+ )
else:
# The difference here is that we modify a subset and return the
# remaining (non-updated) blocks in their original position.
- result = np.array([
- partitions_for_apply[i] if i not in partitions_dict else
- self._apply_func_to_list_of_partitions(
- func,
- partitions_for_apply[i],
- internal_indices=partitions_dict[i])
- for i in range(len(partitions_for_apply))
- ])
+ result = np.array(
+ [
+ partitions_for_apply[i]
+ if i not in partitions_dict
+ else self._apply_func_to_list_of_partitions(
+ func,
+ partitions_for_apply[i],
+ internal_indices=partitions_dict[i],
+ )
+ for i in range(len(partitions_for_apply))
+ ]
+ )
return cls(result.T) if not axis else cls(result)
- def apply_func_to_select_indices_along_full_axis(self,
- axis,
- func,
- indices,
- keep_remaining=False):
+ def apply_func_to_select_indices_along_full_axis(
+ self, axis, func, indices, keep_remaining=False
+ ):
"""Applies a function to a select subset of full columns/rows.
Note: This should be used when you need to apply a function that relies
@@ -718,52 +746,68 @@ def apply_func_to_select_indices_along_full_axis(self,
# accept a keyword argument `func_dict`.
if dict_indices is not None:
if not keep_remaining:
- result = np.array([
- partitions_for_apply[i].apply(
- preprocessed_func,
- func_dict={
- idx: dict_indices[idx]
- for idx in partitions_dict[i]
- }) for i in partitions_dict
- ])
+ result = np.array(
+ [
+ partitions_for_apply[i].apply(
+ preprocessed_func,
+ func_dict={
+ idx: dict_indices[idx] for idx in partitions_dict[i]
+ },
+ )
+ for i in partitions_dict
+ ]
+ )
else:
- result = np.array([
- partitions_for_remaining[i] if i not in partitions_dict
- else self._apply_func_to_list_of_partitions(
- preprocessed_func,
- partitions_for_apply[i],
- func_dict={
- idx: dict_indices[idx]
- for idx in partitions_dict[i]
- }) for i in range(len(partitions_for_apply))
- ])
+ result = np.array(
+ [
+ partitions_for_remaining[i]
+ if i not in partitions_dict
+ else self._apply_func_to_list_of_partitions(
+ preprocessed_func,
+ partitions_for_apply[i],
+ func_dict={
+ idx: dict_indices[idx] for idx in partitions_dict[i]
+ },
+ )
+ for i in range(len(partitions_for_apply))
+ ]
+ )
else:
if not keep_remaining:
# See notes in `apply_func_to_select_indices`
- result = np.array([
- partitions_for_apply[i].apply(
- preprocessed_func, internal_indices=partitions_dict[i])
- for i in partitions_dict
- ])
+ result = np.array(
+ [
+ partitions_for_apply[i].apply(
+ preprocessed_func, internal_indices=partitions_dict[i]
+ )
+ for i in partitions_dict
+ ]
+ )
else:
# See notes in `apply_func_to_select_indices`
- result = np.array([
- partitions_for_remaining[i] if i not in partitions_dict
- else partitions_for_apply[i].apply(
- preprocessed_func, internal_indices=partitions_dict[i])
- for i in range(len(partitions_for_remaining))
- ])
+ result = np.array(
+ [
+ partitions_for_remaining[i]
+ if i not in partitions_dict
+ else partitions_for_apply[i].apply(
+ preprocessed_func, internal_indices=partitions_dict[i]
+ )
+ for i in range(len(partitions_for_remaining))
+ ]
+ )
return cls(result.T) if not axis else cls(result)
- def apply_func_to_indices_both_axis(self,
- func,
- row_indices,
- col_indices,
- lazy=False,
- keep_remaining=True,
- mutate=False,
- item_to_distribute=None):
+ def apply_func_to_indices_both_axis(
+ self,
+ func,
+ row_indices,
+ col_indices,
+ lazy=False,
+ keep_remaining=True,
+ mutate=False,
+ item_to_distribute=None,
+ ):
"""
Apply a function to along both axis
@@ -782,18 +826,22 @@ def apply_func_to_indices_both_axis(self,
row_position_counter = 0
for row_blk_idx, row_internal_idx in self._get_dict_of_block_index(
- 1, row_indices).items():
+ 1, row_indices
+ ).items():
col_position_counter = 0
for col_blk_idx, col_internal_idx in self._get_dict_of_block_index(
- 0, col_indices).items():
+ 0, col_indices
+ ).items():
remote_part = partition_copy[row_blk_idx, col_blk_idx]
if item_to_distribute is not None:
item = item_to_distribute[
- row_position_counter:row_position_counter +
- len(row_internal_idx), col_position_counter:
- col_position_counter + len(col_internal_idx)]
- item = {'item': item}
+ row_position_counter : row_position_counter
+ + len(row_internal_idx),
+ col_position_counter : col_position_counter
+ + len(col_internal_idx),
+ ]
+ item = {"item": item}
else:
item = {}
@@ -802,13 +850,15 @@ def apply_func_to_indices_both_axis(self,
func,
row_internal_indices=row_internal_idx,
col_internal_indices=col_internal_idx,
- **item)
+ **item
+ )
else:
result = remote_part.apply(
func,
row_internal_indices=row_internal_idx,
col_internal_indices=col_internal_idx,
- **item)
+ **item
+ )
partition_copy[row_blk_idx, col_blk_idx] = result
operation_mask[row_blk_idx, col_blk_idx] = True
@@ -845,13 +895,16 @@ def inter_data_operation(self, axis, func, other):
func = self.preprocess_func(func)
- result = np.array([
- partitions[i].apply(
- func,
- num_splits=cls._compute_num_partitions(),
- other_axis_partition=other_partitions[i])
- for i in range(len(partitions))
- ])
+ result = np.array(
+ [
+ partitions[i].apply(
+ func,
+ num_splits=cls._compute_num_partitions(),
+ other_axis_partition=other_partitions[i],
+ )
+ for i in range(len(partitions))
+ ]
+ )
return cls(result) if axis else cls(result.T)
def manual_shuffle(self, axis, shuffle_func):
@@ -872,10 +925,12 @@ def manual_shuffle(self, axis, shuffle_func):
partitions = self.column_partitions
func = self.preprocess_func(shuffle_func)
- result = np.array([
- part.shuffle(func, num_splits=cls._compute_num_partitions())
- for part in partitions
- ])
+ result = np.array(
+ [
+ part.shuffle(func, num_splits=cls._compute_num_partitions())
+ for part in partitions
+ ]
+ )
return cls(result) if axis else cls(result.T)
def __getitem__(self, key):
@@ -893,7 +948,8 @@ def enlarge_partitions(self, n_rows=None, n_cols=None):
n_cols_lst = self.block_widths
nan_oids_lst = [
self._partition_class(
- _get_nan_block_id(self._partition_class, n_rows, n_cols_))
+ _get_nan_block_id(self._partition_class, n_rows, n_cols_)
+ )
for n_cols_ in n_cols_lst
]
new_chunk = block_partitions_cls(np.array([nan_oids_lst]))
@@ -903,7 +959,8 @@ def enlarge_partitions(self, n_rows=None, n_cols=None):
n_rows_lst = self.block_lengths
nan_oids_lst = [
self._partition_class(
- _get_nan_block_id(self._partition_class, n_rows_, n_cols))
+ _get_nan_block_id(self._partition_class, n_rows_, n_cols)
+ )
for n_rows_ in n_rows_lst
]
new_chunk = block_partitions_cls(np.array([nan_oids_lst]).T)
@@ -939,7 +996,8 @@ def block_lengths(self):
# invariant that requires that all blocks be the same length in a
# row of blocks.
self._lengths_cache = ray.get(
- [obj.length().oid for obj in self.partitions.T[0]])
+ [obj.length().oid for obj in self.partitions.T[0]]
+ )
return self._lengths_cache
# Widths of the blocks
@@ -957,7 +1015,8 @@ def block_widths(self):
# invariant that requires that all blocks be the same width in a
# column of blocks.
self._widths_cache = ray.get(
- [obj.width().oid for obj in self.partitions[0]])
+ [obj.width().oid for obj in self.partitions[0]]
+ )
return self._widths_cache
@property
diff --git a/modin/data_management/partitioning/remote_partition.py b/modin/data_management/partitioning/remote_partition.py
index 9e94e26abfa..49493d28953 100644
--- a/modin/data_management/partitioning/remote_partition.py
+++ b/modin/data_management/partitioning/remote_partition.py
@@ -188,7 +188,8 @@ def call_queue_closure(oid_obj, call_queues):
return oid_obj
oid = deploy_ray_func.remote(
- call_queue_closure, oid, kwargs={'call_queues': self.call_queue})
+ call_queue_closure, oid, kwargs={"call_queues": self.call_queue}
+ )
self.call_queue = []
return RayRemotePartition(oid)
@@ -207,8 +208,7 @@ def to_pandas(self):
A Pandas DataFrame.
"""
dataframe = self.get()
- assert type(dataframe) is pandas.DataFrame or type(
- dataframe) is pandas.Series
+ assert type(dataframe) is pandas.DataFrame or type(dataframe) is pandas.Series
return dataframe
diff --git a/modin/data_management/partitioning/utils.py b/modin/data_management/partitioning/utils.py
index c47be187193..f63eeef7042 100644
--- a/modin/data_management/partitioning/utils.py
+++ b/modin/data_management/partitioning/utils.py
@@ -8,8 +8,9 @@
def compute_chunksize(length, num_splits):
# We do this to avoid zeros and having an extremely large last partition
- return length // num_splits if length % num_splits == 0 \
- else length // num_splits + 1
+ return (
+ length // num_splits if length % num_splits == 0 else length // num_splits + 1
+ )
def _get_nan_block_id(partition_class, n_row=1, n_col=1, transpose=False):
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index 8b1aebcb53a..d0c1eb46592 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -4,11 +4,33 @@
# TODO: In the future `set_option` or similar needs to run on every node
# in order to keep all pandas instances across nodes consistent
-from pandas import (eval, unique, value_counts, cut, to_numeric, factorize,
- test, qcut, match, Panel, date_range, Index, MultiIndex,
- CategoricalIndex, Series, bdate_range, DatetimeIndex,
- Timedelta, Timestamp, to_timedelta, set_eng_float_format,
- set_option, NaT, PeriodIndex, Categorical)
+from pandas import (
+ eval,
+ unique,
+ value_counts,
+ cut,
+ to_numeric,
+ factorize,
+ test,
+ qcut,
+ match,
+ Panel,
+ date_range,
+ Index,
+ MultiIndex,
+ CategoricalIndex,
+ Series,
+ bdate_range,
+ DatetimeIndex,
+ Timedelta,
+ Timestamp,
+ to_timedelta,
+ set_eng_float_format,
+ set_option,
+ NaT,
+ PeriodIndex,
+ Categorical,
+)
import threading
import os
import ray
@@ -17,34 +39,79 @@
from .concat import concat
from .dataframe import DataFrame
from .datetimes import to_datetime
-from .io import (read_csv, read_parquet, read_json, read_html, read_clipboard,
- read_excel, read_hdf, read_feather, read_msgpack, read_stata,
- read_sas, read_pickle, read_sql)
+from .io import (
+ read_csv,
+ read_parquet,
+ read_json,
+ read_html,
+ read_clipboard,
+ read_excel,
+ read_hdf,
+ read_feather,
+ read_msgpack,
+ read_stata,
+ read_sas,
+ read_pickle,
+ read_sql,
+)
from .reshape import get_dummies
# Set this so that Pandas doesn't try to multithread by itself
-os.environ['OMP_NUM_THREADS'] = "1"
+os.environ["OMP_NUM_THREADS"] = "1"
try:
if threading.current_thread().name == "MainThread":
- ray.init(
- redirect_output=True,
- include_webui=False,
- redirect_worker_output=True)
+ ray.init(redirect_output=True, include_webui=False, redirect_worker_output=True)
except AssertionError:
pass
-num_cpus = ray.global_state.cluster_resources()['CPU']
+num_cpus = ray.global_state.cluster_resources()["CPU"]
DEFAULT_NPARTITIONS = max(4, int(num_cpus))
__all__ = [
- "DataFrame", "Series", "read_csv", "read_parquet", "read_json",
- "read_html", "read_clipboard", "read_excel", "read_hdf", "read_feather",
- "read_msgpack", "read_stata", "read_sas", "read_pickle", "read_sql",
- "concat", "eval", "unique", "value_counts", "cut", "to_numeric",
- "factorize", "test", "qcut", "match", "to_datetime", "get_dummies",
- "Panel", "date_range", "Index", "MultiIndex", "Series", "bdate_range",
- "DatetimeIndex", "to_timedelta", "set_eng_float_format", "set_option",
- "CategoricalIndex", "Timedelta", "Timestamp", "NaT", "PeriodIndex",
- "Categorical", "__git_revision__", "__version__"
+ "DataFrame",
+ "Series",
+ "read_csv",
+ "read_parquet",
+ "read_json",
+ "read_html",
+ "read_clipboard",
+ "read_excel",
+ "read_hdf",
+ "read_feather",
+ "read_msgpack",
+ "read_stata",
+ "read_sas",
+ "read_pickle",
+ "read_sql",
+ "concat",
+ "eval",
+ "unique",
+ "value_counts",
+ "cut",
+ "to_numeric",
+ "factorize",
+ "test",
+ "qcut",
+ "match",
+ "to_datetime",
+ "get_dummies",
+ "Panel",
+ "date_range",
+ "Index",
+ "MultiIndex",
+ "Series",
+ "bdate_range",
+ "DatetimeIndex",
+ "to_timedelta",
+ "set_eng_float_format",
+ "set_option",
+ "CategoricalIndex",
+ "Timedelta",
+ "Timestamp",
+ "NaT",
+ "PeriodIndex",
+ "Categorical",
+ "__git_revision__",
+ "__version__",
]
diff --git a/modin/pandas/concat.py b/modin/pandas/concat.py
index d6bc52ab867..15cf62693cb 100644
--- a/modin/pandas/concat.py
+++ b/modin/pandas/concat.py
@@ -7,16 +7,18 @@
from .dataframe import DataFrame
-def concat(objs,
- axis=0,
- join='outer',
- join_axes=None,
- ignore_index=False,
- keys=None,
- levels=None,
- names=None,
- verify_integrity=False,
- copy=True):
+def concat(
+ objs,
+ axis=0,
+ join="outer",
+ join_axes=None,
+ ignore_index=False,
+ keys=None,
+ levels=None,
+ names=None,
+ verify_integrity=False,
+ copy=True,
+):
if keys is not None:
objs = [objs[k] for k in keys]
@@ -33,41 +35,60 @@ def concat(objs,
try:
type_check = next(
- obj for obj in objs
- if not isinstance(obj, (pandas.Series, pandas.DataFrame,
- DataFrame)))
+ obj
+ for obj in objs
+ if not isinstance(obj, (pandas.Series, pandas.DataFrame, DataFrame))
+ )
except StopIteration:
type_check = None
if type_check is not None:
raise ValueError(
- "cannot concatenate object of type \"{0}\"; only "
+ 'cannot concatenate object of type "{0}"; only '
"pandas.Series, pandas.DataFrame, "
"and modin.pandas.DataFrame objs are "
- "valid", type(type_check))
+ "valid",
+ type(type_check),
+ )
all_series = all(isinstance(obj, pandas.Series) for obj in objs)
if all_series:
return DataFrame(
- pandas.concat(objs, axis, join, join_axes, ignore_index, keys,
- levels, names, verify_integrity, copy))
+ pandas.concat(
+ objs,
+ axis,
+ join,
+ join_axes,
+ ignore_index,
+ keys,
+ levels,
+ names,
+ verify_integrity,
+ copy,
+ )
+ )
if isinstance(objs, dict):
raise NotImplementedError(
"Obj as dicts not implemented. To contribute to "
- "Pandas on Ray, please visit github.com/ray-project/ray.")
+ "Pandas on Ray, please visit github.com/ray-project/ray."
+ )
axis = pandas.DataFrame()._get_axis_number(axis)
- if join not in ['inner', 'outer']:
- raise ValueError("Only can inner (intersect) or outer (union) join the"
- " other axis")
+ if join not in ["inner", "outer"]:
+ raise ValueError(
+ "Only can inner (intersect) or outer (union) join the" " other axis"
+ )
# We have the weird Series and axis check because, when concatenating a
# dataframe to a series on axis=0, pandas ignores the name of the series,
# and this check aims to mirror that (possibly buggy) functionality
objs = [
- obj if isinstance(obj, DataFrame) else DataFrame(obj.rename())
- if isinstance(obj, pandas.Series) and axis == 0 else DataFrame(obj)
+ obj
+ if isinstance(obj, DataFrame)
+ else DataFrame(obj.rename())
+ if isinstance(obj, pandas.Series) and axis == 0
+ else DataFrame(obj)
for obj in objs
]
df = objs[0]
@@ -82,5 +103,6 @@ def concat(objs,
levels=None,
names=None,
verify_integrity=False,
- copy=True)
+ copy=True,
+ )
return DataFrame(data_manager=new_manager)
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 757679320ff..3c554b62cc0 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -7,11 +7,15 @@
from pandas.compat import to_str, string_types, cPickle as pkl
import pandas.core.common as com
from pandas.core.dtypes.common import (
- _get_dtype_from_object, is_bool_dtype, is_list_like, is_numeric_dtype,
- is_datetime_or_timedelta_dtype, is_dtype_equal)
+ _get_dtype_from_object,
+ is_bool_dtype,
+ is_list_like,
+ is_numeric_dtype,
+ is_datetime_or_timedelta_dtype,
+ is_dtype_equal,
+)
from pandas.core.index import _ensure_index_from_sequences
-from pandas.core.indexing import (check_bool_indexer,
- convert_to_index_sliceable)
+from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.util._validators import validate_bool_kwarg
import itertools
@@ -22,20 +26,23 @@
import sys
import warnings
-from .utils import (from_pandas, to_pandas, _inherit_docstrings)
+from .utils import from_pandas, to_pandas, _inherit_docstrings
from .iterator import PartitionIterator
@_inherit_docstrings(
- pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__])
+ pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__]
+)
class DataFrame(object):
- def __init__(self,
- data=None,
- index=None,
- columns=None,
- dtype=None,
- copy=False,
- data_manager=None):
+ def __init__(
+ self,
+ data=None,
+ index=None,
+ columns=None,
+ dtype=None,
+ copy=False,
+ data_manager=None,
+ ):
"""Distributed DataFrame object backed by Pandas dataframes.
Args:
@@ -60,11 +67,8 @@ def __init__(self,
if data is not None or data_manager is None:
pandas_df = pandas.DataFrame(
- data=data,
- index=index,
- columns=columns,
- dtype=dtype,
- copy=copy)
+ data=data, index=index, columns=columns, dtype=dtype, copy=copy
+ )
self._data_manager = from_pandas(pandas_df)._data_manager
else:
@@ -120,9 +124,9 @@ def __repr__(self):
result = repr(self._build_repr_df(num_rows, num_cols))
if len(self.index) > num_rows or len(self.columns) > num_cols:
# The split here is so that we don't repr pandas row lengths.
- return result.rsplit("\n\n",
- 1)[0] + "\n\n[{0} rows x {1} columns]".format(
- len(self.index), len(self.columns))
+ return result.rsplit("\n\n", 1)[0] + "\n\n[{0} rows x {1} columns]".format(
+ len(self.index), len(self.columns)
+ )
else:
return result
@@ -142,9 +146,11 @@ def _repr_html_(self):
result = self._build_repr_df(num_rows, num_cols)._repr_html_()
if len(self.index) > num_rows or len(self.columns) > num_cols:
# We split so that we insert our correct dataframe dimensions.
- return result.split(
- "
")[0] + "
{0} rows x {1} columns
\n".format(
- len(self.index), len(self.columns))
+ return result.split("")[
+ 0
+ ] + "
{0} rows x {1} columns
\n".format(
+ len(self.index), len(self.columns)
+ )
else:
return result
@@ -193,15 +199,14 @@ def _validate_eval_query(self, expr, **kwargs):
expr: The expression to evaluate. This string cannot contain any
Python statements, only Python expressions.
"""
- if isinstance(expr, str) and expr is '':
+ if isinstance(expr, str) and expr is "":
raise ValueError("expr cannot be an empty string")
- if isinstance(expr, str) and '@' in expr:
- raise NotImplementedError("Local variables not yet supported in "
- "eval.")
+ if isinstance(expr, str) and "@" in expr:
+ raise NotImplementedError("Local variables not yet supported in " "eval.")
- if isinstance(expr, str) and 'not' in expr:
- if 'parser' in kwargs and kwargs['parser'] == 'python':
+ if isinstance(expr, str) and "not" in expr:
+ if "parser" in kwargs and kwargs["parser"] == "python":
raise NotImplementedError("'Not' nodes are not implemented.")
@property
@@ -316,8 +321,7 @@ def applymap(self, func):
func (callable): The function to apply.
"""
if not callable(func):
- raise ValueError("\'{0}\' object is not callable".format(
- type(func)))
+ raise ValueError("'{0}' object is not callable".format(type(func)))
return DataFrame(data_manager=self._data_manager.applymap(func))
@@ -329,15 +333,17 @@ def copy(self, deep=True):
"""
return DataFrame(data_manager=self._data_manager.copy())
- def groupby(self,
- by=None,
- axis=0,
- level=None,
- as_index=True,
- sort=True,
- group_keys=True,
- squeeze=False,
- **kwargs):
+ def groupby(
+ self,
+ by=None,
+ axis=0,
+ level=None,
+ as_index=True,
+ sort=True,
+ group_keys=True,
+ squeeze=False,
+ **kwargs
+ ):
"""Apply a groupby to this DataFrame. See _groupby() remote task.
Args:
by: The value to groupby.
@@ -359,26 +365,32 @@ def groupby(self,
if isinstance(by, pandas.Series):
by = by.values.tolist()
- mismatch = len(by) != len(self) if axis == 0 \
- else len(by) != len(self.columns)
+ mismatch = (
+ len(by) != len(self) if axis == 0 else len(by) != len(self.columns)
+ )
if all(obj in self for obj in by) and mismatch:
raise NotImplementedError(
- "Groupby with lists of columns not yet supported.")
+ "Groupby with lists of columns not yet supported."
+ )
elif mismatch:
raise KeyError(next(x for x in by if x not in self))
from .groupby import DataFrameGroupBy
- return DataFrameGroupBy(self, by, axis, level, as_index, sort,
- group_keys, squeeze, **kwargs)
-
- def sum(self,
- axis=None,
- skipna=True,
- level=None,
- numeric_only=None,
- min_count=1,
- **kwargs):
+
+ return DataFrameGroupBy(
+ self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs
+ )
+
+ def sum(
+ self,
+ axis=None,
+ skipna=True,
+ level=None,
+ numeric_only=None,
+ min_count=1,
+ **kwargs
+ ):
"""Perform a sum across the DataFrame.
Args:
@@ -388,8 +400,7 @@ def sum(self,
Returns:
The sum of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.sum(
axis=axis,
@@ -397,7 +408,8 @@ def sum(self,
level=level,
numeric_only=numeric_only,
min_count=min_count,
- **kwargs)
+ **kwargs
+ )
def abs(self):
"""Apply an absolute value function to all numeric columns.
@@ -406,7 +418,7 @@ def abs(self):
A new DataFrame with the applied absolute value.
"""
for t in self.dtypes:
- if np.dtype('O') == t:
+ if np.dtype("O") == t:
# TODO Give a more accurate error to Pandas
raise TypeError("bad operand type for abs():", "str")
@@ -462,17 +474,11 @@ def transpose(self, *args, **kwargs):
Returns:
A new DataFrame transposed from this DataFrame.
"""
- return DataFrame(
- data_manager=self._data_manager.transpose(*args, **kwargs))
+ return DataFrame(data_manager=self._data_manager.transpose(*args, **kwargs))
T = property(transpose)
- def dropna(self,
- axis=0,
- how='any',
- thresh=None,
- subset=None,
- inplace=False):
+ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
"""Create a new DataFrame from the removed NA values from this one.
Args:
@@ -498,8 +504,7 @@ def dropna(self,
result = self
for ax in axis:
- result = result.dropna(
- axis=ax, how=how, thresh=thresh, subset=subset)
+ result = result.dropna(axis=ax, how=how, thresh=thresh, subset=subset)
if not inplace:
return result
@@ -508,10 +513,10 @@ def dropna(self,
axis = pandas.DataFrame()._get_axis_number(axis)
- if how is not None and how not in ['any', 'all']:
- raise ValueError('invalid how option: %s' % how)
+ if how is not None and how not in ["any", "all"]:
+ raise ValueError("invalid how option: %s" % how)
if how is None and thresh is None:
- raise TypeError('must specify how or thresh')
+ raise TypeError("must specify how or thresh")
if subset is not None:
if axis == 1:
@@ -526,14 +531,15 @@ def dropna(self,
raise KeyError(list(np.compress(check, subset)))
new_manager = self._data_manager.dropna(
- axis=axis, how=how, thresh=thresh, subset=subset)
+ axis=axis, how=how, thresh=thresh, subset=subset
+ )
if not inplace:
return DataFrame(data_manager=new_manager)
else:
self._update_inplace(new_manager=new_manager)
- def add(self, other, axis='columns', level=None, fill_value=None):
+ def add(self, other, axis="columns", level=None, fill_value=None):
"""Add this DataFrame to another or a scalar/list.
Args:
@@ -547,12 +553,14 @@ def add(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the applied addition.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.add(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
def agg(self, func, axis=0, *args, **kwargs):
@@ -570,16 +578,16 @@ def aggregate(self, func, axis=0, *args, **kwargs):
pass
if result is None:
- kwargs.pop('is_transform', None)
+ kwargs.pop("is_transform", None)
return self.apply(func, axis=axis, args=args, **kwargs)
return result
def _aggregate(self, arg, *args, **kwargs):
- _axis = kwargs.pop('_axis', None)
+ _axis = kwargs.pop("_axis", None)
if _axis is None:
- _axis = getattr(self, 'axis', 0)
- kwargs.pop('_level', None)
+ _axis = getattr(self, "axis", 0)
+ kwargs.pop("_level", None)
if isinstance(arg, string_types):
return self._string_function(arg, *args, **kwargs)
@@ -588,7 +596,8 @@ def _aggregate(self, arg, *args, **kwargs):
elif isinstance(arg, dict):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
elif is_list_like(arg) or callable(arg):
return self.apply(arg, axis=_axis, args=args, **kwargs)
else:
@@ -605,9 +614,9 @@ def _string_function(self, func, *args, **kwargs):
return f(*args, **kwargs)
assert len(args) == 0
- assert len([
- kwarg for kwarg in kwargs if kwarg not in ['axis', '_level']
- ]) == 0
+ assert (
+ len([kwarg for kwarg in kwargs if kwarg not in ["axis", "_level"]]) == 0
+ )
return f
f = getattr(np, func, None)
@@ -616,20 +625,23 @@ def _string_function(self, func, *args, **kwargs):
raise ValueError("{} is an unknown string function".format(func))
- def align(self,
- other,
- join='outer',
- axis=None,
- level=None,
- copy=True,
- fill_value=None,
- method=None,
- limit=None,
- fill_axis=0,
- broadcast_axis=None):
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def all(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs):
"""Return whether all elements are True over requested axis
@@ -644,33 +656,25 @@ def all(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs):
axis = None
result = self._data_manager.all(
- axis=axis,
- bool_only=bool_only,
- skipna=skipna,
- level=level,
- **kwargs)
+ axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
+ )
if axis is not None:
return result
else:
return result.all()
- def any(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
+ def any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs):
"""Return whether any elements are True over requested axis
Note:
If axis=None or axis=0, this call applies on the column partitions,
otherwise operates on row partitions
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.any(
- axis=axis,
- bool_only=bool_only,
- skipna=skipna,
- level=level,
- **kwargs)
+ axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
+ )
def append(self, other, ignore_index=False, verify_integrity=False):
"""Append another DataFrame/list/Series to this one.
@@ -687,8 +691,10 @@ def append(self, other, ignore_index=False, verify_integrity=False):
if isinstance(other, dict):
other = pandas.Series(other)
if other.name is None and not ignore_index:
- raise TypeError('Can only append a Series if ignore_index=True'
- ' or if the Series has a name')
+ raise TypeError(
+ "Can only append a Series if ignore_index=True"
+ " or if the Series has a name"
+ )
if other.name is None:
index = None
@@ -698,8 +704,7 @@ def append(self, other, ignore_index=False, verify_integrity=False):
index = pandas.Index([other.name], name=self.index.name)
# Create a Modin DataFrame from this Series for ease of development
- other = DataFrame(
- pandas.DataFrame(other).T, index=index)._data_manager
+ other = DataFrame(pandas.DataFrame(other).T, index=index)._data_manager
elif isinstance(other, list):
if not isinstance(other[0], DataFrame):
other = pandas.DataFrame(other)
@@ -716,24 +721,20 @@ def append(self, other, ignore_index=False, verify_integrity=False):
# We also do this first to ensure that we don't waste compute/memory.
if verify_integrity and not ignore_index:
appended_index = self.index.append(other.index)
- is_valid = next(
- (False for idx in appended_index.duplicated() if idx), True)
+ is_valid = next((False for idx in appended_index.duplicated() if idx), True)
if not is_valid:
- raise ValueError("Indexes have overlapping values: {}".format(
- appended_index[appended_index.duplicated()]))
+ raise ValueError(
+ "Indexes have overlapping values: {}".format(
+ appended_index[appended_index.duplicated()]
+ )
+ )
- data_manager = self._data_manager.concat(
- 0, other, ignore_index=ignore_index)
+ data_manager = self._data_manager.concat(0, other, ignore_index=ignore_index)
return DataFrame(data_manager=data_manager)
- def apply(self,
- func,
- axis=0,
- broadcast=False,
- raw=False,
- reduce=None,
- args=(),
- **kwds):
+ def apply(
+ self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds
+ ):
"""Apply a function along input axis of DataFrame.
Args:
@@ -750,23 +751,26 @@ def apply(self,
if isinstance(func, string_types):
if axis == 1:
- kwds['axis'] = axis
+ kwds["axis"] = axis
return getattr(self, func)(*args, **kwds)
elif isinstance(func, dict):
if axis == 1:
- raise TypeError("(\"'dict' object is not callable\", "
- "'occurred at index {0}'".format(
- self.index[0]))
+ raise TypeError(
+ "(\"'dict' object is not callable\", "
+ "'occurred at index {0}'".format(self.index[0])
+ )
if len(self.columns) != len(set(self.columns)):
warnings.warn(
- 'duplicate column names not supported with apply().',
+ "duplicate column names not supported with apply().",
FutureWarning,
- stacklevel=2)
+ stacklevel=2,
+ )
elif is_list_like(func):
if axis == 1:
- raise TypeError("(\"'list' object is not callable\", "
- "'occurred at index {0}'".format(
- self.index[0]))
+ raise TypeError(
+ "(\"'list' object is not callable\", "
+ "'occurred at index {0}'".format(self.index[0])
+ )
elif not callable(func):
return
@@ -778,7 +782,8 @@ def apply(self,
def as_blocks(self, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def as_matrix(self, columns=None):
"""Convert the frame to its Numpy-array representation.
@@ -793,33 +798,32 @@ def as_matrix(self, columns=None):
# TODO this is very inefficient, also see __array__
return to_pandas(self).as_matrix(columns)
- def asfreq(self,
- freq,
- method=None,
- how=None,
- normalize=False,
- fill_value=None):
+ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def asof(self, where, subset=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def assign(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def astype(self, dtype, copy=True, errors='raise', **kwargs):
+ def astype(self, dtype, copy=True, errors="raise", **kwargs):
col_dtypes = {}
if isinstance(dtype, dict):
- if (not set(dtype.keys()).issubset(set(self.columns))
- and errors == 'raise'):
- raise KeyError("Only a column name can be used for the key in"
- "a dtype mappings argument.")
+ if not set(dtype.keys()).issubset(set(self.columns)) and errors == "raise":
+ raise KeyError(
+ "Only a column name can be used for the key in"
+ "a dtype mappings argument."
+ )
col_dtypes = dtype
else:
@@ -835,25 +839,20 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
def at_time(self, time, asof=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def between_time(self,
- start_time,
- end_time,
- include_start=True,
- include_end=True):
+ def between_time(self, start_time, end_time, include_start=True, include_end=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
"""Synonym for DataFrame.fillna(method='bfill')"""
new_df = self.fillna(
- method='bfill',
- axis=axis,
- limit=limit,
- downcast=downcast,
- inplace=inplace)
+ method="bfill", axis=axis, limit=limit, downcast=downcast, inplace=inplace
+ )
if not inplace:
return new_df
@@ -865,89 +864,100 @@ def bool(self):
element is not boolean
"""
shape = self.shape
- if shape != (1, ) and shape != (1, 1):
- raise ValueError("""The PandasObject does not have exactly
+ if shape != (1,) and shape != (1, 1):
+ raise ValueError(
+ """The PandasObject does not have exactly
1 element. Return the bool of a single
element PandasObject. The truth value is
ambiguous. Use a.empty, a.item(), a.any()
- or a.all().""")
+ or a.all()."""
+ )
else:
return to_pandas(self).bool()
- def boxplot(self,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot=0,
- grid=True,
- figsize=None,
- layout=None,
- return_type=None,
- **kwds):
+ def boxplot(
+ self,
+ column=None,
+ by=None,
+ ax=None,
+ fontsize=None,
+ rot=0,
+ grid=True,
+ figsize=None,
+ layout=None,
+ return_type=None,
+ **kwds
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def clip(self,
- lower=None,
- upper=None,
- axis=None,
- inplace=False,
- *args,
- **kwargs):
+ "github.com/modin-project/modin."
+ )
+
+ def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def clip_lower(self, threshold, axis=None, inplace=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def clip_upper(self, threshold, axis=None, inplace=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def combine(self, other, func, fill_value=None, overwrite=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def combine_first(self, other):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def compound(self, axis=None, skipna=None, level=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def consolidate(self, inplace=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def convert_objects(self,
- convert_dates=True,
- convert_numeric=False,
- convert_timedeltas=True,
- copy=True):
+ "github.com/modin-project/modin."
+ )
+
+ def convert_objects(
+ self,
+ convert_dates=True,
+ convert_numeric=False,
+ convert_timedeltas=True,
+ copy=True,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def corr(self, method='pearson', min_periods=1):
+ def corr(self, method="pearson", min_periods=1):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def corrwith(self, other, axis=0, drop=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def count(self, axis=0, level=None, numeric_only=False):
"""Get the count of non-null objects in the DataFrame.
@@ -961,15 +971,16 @@ def count(self, axis=0, level=None, numeric_only=False):
Returns:
The count, in a Series (or DataFrame if level is specified).
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.count(
- axis=axis, level=level, numeric_only=numeric_only)
+ axis=axis, level=level, numeric_only=numeric_only
+ )
def cov(self, min_periods=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def cummax(self, axis=None, skipna=True, *args, **kwargs):
"""Perform a cumulative maximum across the DataFrame.
@@ -981,11 +992,10 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs):
Returns:
The cumulative maximum of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return DataFrame(
- data_manager=self._data_manager.cummax(
- axis=axis, skipna=skipna, **kwargs))
+ data_manager=self._data_manager.cummax(axis=axis, skipna=skipna, **kwargs)
+ )
def cummin(self, axis=None, skipna=True, *args, **kwargs):
"""Perform a cumulative minimum across the DataFrame.
@@ -997,11 +1007,10 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs):
Returns:
The cumulative minimum of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return DataFrame(
- data_manager=self._data_manager.cummin(
- axis=axis, skipna=skipna, **kwargs))
+ data_manager=self._data_manager.cummin(axis=axis, skipna=skipna, **kwargs)
+ )
def cumprod(self, axis=None, skipna=True, *args, **kwargs):
"""Perform a cumulative product across the DataFrame.
@@ -1013,11 +1022,10 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs):
Returns:
The cumulative product of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return DataFrame(
- data_manager=self._data_manager.cumprod(
- axis=axis, skipna=skipna, **kwargs))
+ data_manager=self._data_manager.cumprod(axis=axis, skipna=skipna, **kwargs)
+ )
def cumsum(self, axis=None, skipna=True, *args, **kwargs):
"""Perform a cumulative sum across the DataFrame.
@@ -1029,11 +1037,10 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs):
Returns:
The cumulative sum of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return DataFrame(
- data_manager=self._data_manager.cumsum(
- axis=axis, skipna=skipna, **kwargs))
+ data_manager=self._data_manager.cumsum(axis=axis, skipna=skipna, **kwargs)
+ )
def describe(self, percentiles=None, include=None, exclude=None):
"""
@@ -1056,15 +1063,20 @@ def describe(self, percentiles=None, include=None, exclude=None):
if exclude is None:
exclude = "object"
elif "object" not in include:
- exclude = ([exclude] + "object") if isinstance(
- exclude, str) else list(exclude) + "object"
+ exclude = (
+ ([exclude] + "object")
+ if isinstance(exclude, str)
+ else list(exclude) + "object"
+ )
if percentiles is not None:
pandas.DataFrame()._check_percentile(percentiles)
return DataFrame(
data_manager=self._data_manager.describe(
- percentiles=percentiles, include=include, exclude=exclude))
+ percentiles=percentiles, include=include, exclude=exclude
+ )
+ )
def diff(self, periods=1, axis=0):
"""Finds the difference between elements on the axis requested
@@ -1077,9 +1089,10 @@ def diff(self, periods=1, axis=0):
DataFrame with the diff applied
"""
return DataFrame(
- data_manager=self._data_manager.diff(periods=periods, axis=axis))
+ data_manager=self._data_manager.diff(periods=periods, axis=axis)
+ )
- def div(self, other, axis='columns', level=None, fill_value=None):
+ def div(self, other, axis="columns", level=None, fill_value=None):
"""Divides this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -1092,15 +1105,17 @@ def div(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Divide applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.div(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def divide(self, other, axis='columns', level=None, fill_value=None):
+ def divide(self, other, axis="columns", level=None, fill_value=None):
"""Synonym for div.
Args:
@@ -1117,16 +1132,19 @@ def divide(self, other, axis='columns', level=None, fill_value=None):
def dot(self, other):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def drop(self,
- labels=None,
- axis=0,
- index=None,
- columns=None,
- level=None,
- inplace=False,
- errors='raise'):
+ "github.com/modin-project/modin."
+ )
+
+ def drop(
+ self,
+ labels=None,
+ axis=0,
+ index=None,
+ columns=None,
+ level=None,
+ inplace=False,
+ errors="raise",
+ ):
"""Return new object with labels in requested axis removed.
Args:
labels: Index or column labels to drop.
@@ -1148,16 +1166,19 @@ def drop(self,
inplace = validate_bool_kwarg(inplace, "inplace")
if labels is not None:
if index is not None or columns is not None:
- raise ValueError("Cannot specify both 'labels' and "
- "'index'/'columns'")
+ raise ValueError(
+ "Cannot specify both 'labels' and " "'index'/'columns'"
+ )
axis = pandas.DataFrame()._get_axis_name(axis)
axes = {axis: labels}
elif index is not None or columns is not None:
- axes, _ = pandas.DataFrame() \
- ._construct_axes_from_arguments((index, columns), {})
+ axes, _ = pandas.DataFrame()._construct_axes_from_arguments(
+ (index, columns), {}
+ )
else:
- raise ValueError("Need to specify at least one of 'labels', "
- "'index' or 'columns'")
+ raise ValueError(
+ "Need to specify at least one of 'labels', " "'index' or 'columns'"
+ )
# TODO Clean up this error checking
if "index" not in axes:
@@ -1165,17 +1186,14 @@ def drop(self,
elif axes["index"] is not None:
if not is_list_like(axes["index"]):
axes["index"] = [axes["index"]]
- if errors == 'raise':
- non_existant = [
- obj for obj in axes["index"] if obj not in self.index
- ]
+ if errors == "raise":
+ non_existant = [obj for obj in axes["index"] if obj not in self.index]
if len(non_existant):
raise ValueError(
- "labels {} not contained in axis".format(non_existant))
+ "labels {} not contained in axis".format(non_existant)
+ )
else:
- axes["index"] = [
- obj for obj in axes["index"] if obj in self.index
- ]
+ axes["index"] = [obj for obj in axes["index"] if obj in self.index]
# If the length is zero, we will just do nothing
if not len(axes["index"]):
axes["index"] = None
@@ -1185,13 +1203,14 @@ def drop(self,
elif axes["columns"] is not None:
if not is_list_like(axes["columns"]):
axes["columns"] = [axes["columns"]]
- if errors == 'raise':
+ if errors == "raise":
non_existant = [
obj for obj in axes["columns"] if obj not in self.columns
]
if len(non_existant):
raise ValueError(
- "labels {} not contained in axis".format(non_existant))
+ "labels {} not contained in axis".format(non_existant)
+ )
else:
axes["columns"] = [
obj for obj in axes["columns"] if obj in self.columns
@@ -1201,24 +1220,27 @@ def drop(self,
axes["columns"] = None
new_manager = self._data_manager.drop(
- index=axes["index"], columns=axes["columns"])
+ index=axes["index"], columns=axes["columns"]
+ )
if inplace:
self._update_inplace(new_manager=new_manager)
return DataFrame(data_manager=new_manager)
- def drop_duplicates(self, subset=None, keep='first', inplace=False):
+ def drop_duplicates(self, subset=None, keep="first", inplace=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def duplicated(self, subset=None, keep='first'):
+ def duplicated(self, subset=None, keep="first"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def eq(self, other, axis='columns', level=None):
+ def eq(self, other, axis="columns", level=None):
"""Checks element-wise that this is equal to other.
Args:
@@ -1230,12 +1252,12 @@ def eq(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.eq(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.eq(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
def equals(self, other):
@@ -1249,8 +1271,7 @@ def equals(self, other):
# Copy into a Ray DataFrame to simplify logic below
other = DataFrame(other)
- if not self.index.equals(other.index) or not \
- self.columns.equals(other.columns):
+ if not self.index.equals(other.index) or not self.columns.equals(other.columns):
return False
return all(self.eq(other).all())
@@ -1313,45 +1334,48 @@ def eval(self, expr, inplace=False, **kwargs):
else:
return DataFrame(data_manager=result)
- def ewm(self,
- com=None,
- span=None,
- halflife=None,
- alpha=None,
- min_periods=0,
- freq=None,
- adjust=True,
- ignore_na=False,
- axis=0):
+ def ewm(
+ self,
+ com=None,
+ span=None,
+ halflife=None,
+ alpha=None,
+ min_periods=0,
+ freq=None,
+ adjust=True,
+ ignore_na=False,
+ axis=0,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def expanding(self, min_periods=1, freq=None, center=False, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
"""Synonym for DataFrame.fillna(method='ffill')
"""
new_df = self.fillna(
- method='ffill',
- axis=axis,
- limit=limit,
- downcast=downcast,
- inplace=inplace)
+ method="ffill", axis=axis, limit=limit, downcast=downcast, inplace=inplace
+ )
if not inplace:
return new_df
- def fillna(self,
- value=None,
- method=None,
- axis=None,
- inplace=False,
- limit=None,
- downcast=None,
- **kwargs):
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ **kwargs
+ ):
"""Fill NA/NaN values using the specified method.
Args:
@@ -1384,28 +1408,28 @@ def fillna(self,
"""
# TODO implement value passed as DataFrame
if isinstance(value, pandas.DataFrame):
- raise NotImplementedError("Passing a DataFrame as the value for "
- "fillna is not yet supported.")
+ raise NotImplementedError(
+ "Passing a DataFrame as the value for " "fillna is not yet supported."
+ )
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
- axis = pandas.DataFrame()._get_axis_number(axis) \
- if axis is not None \
- else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
if isinstance(value, (list, tuple)):
- raise TypeError('"value" parameter must be a scalar or dict, but '
- 'you passed a "{0}"'.format(type(value).__name__))
+ raise TypeError(
+ '"value" parameter must be a scalar or dict, but '
+ 'you passed a "{0}"'.format(type(value).__name__)
+ )
if value is None and method is None:
- raise ValueError('must specify a fill method or value')
+ raise ValueError("must specify a fill method or value")
if value is not None and method is not None:
- raise ValueError('cannot specify both a fill method and value')
- if method is not None and method not in [
- 'backfill', 'bfill', 'pad', 'ffill'
- ]:
- expecting = 'pad (ffill) or backfill (bfill)'
- msg = 'Invalid fill method. Expecting {expecting}. Got {method}'\
- .format(expecting=expecting, method=method)
+ raise ValueError("cannot specify both a fill method and value")
+ if method is not None and method not in ["backfill", "bfill", "pad", "ffill"]:
+ expecting = "pad (ffill) or backfill (bfill)"
+ msg = "Invalid fill method. Expecting {expecting}. Got {method}".format(
+ expecting=expecting, method=method
+ )
raise ValueError(msg)
if isinstance(value, pandas.Series):
@@ -1418,7 +1442,8 @@ def fillna(self,
inplace=False,
limit=limit,
downcast=downcast,
- **kwargs)
+ **kwargs
+ )
if inplace:
self._update_inplace(new_manager=new_manager)
@@ -1439,13 +1464,15 @@ def filter(self, items=None, like=None, regex=None, axis=None):
"""
nkw = com._count_not_none(items, like, regex)
if nkw > 1:
- raise TypeError('Keyword arguments `items`, `like`, or `regex` '
- 'are mutually exclusive')
+ raise TypeError(
+ "Keyword arguments `items`, `like`, or `regex` "
+ "are mutually exclusive"
+ )
if nkw == 0:
- raise TypeError('Must pass either `items`, `like`, or `regex`')
+ raise TypeError("Must pass either `items`, `like`, or `regex`")
if axis is None:
- axis = 'columns' # This is the default info axis for dataframes
+ axis = "columns" # This is the default info axis for dataframes
axis = pandas.DataFrame()._get_axis_number(axis)
labels = self.columns if axis else self.index
@@ -1473,7 +1500,8 @@ def f(x):
def first(self, offset):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def first_valid_index(self):
"""Return index for first non-NA/null value.
@@ -1483,7 +1511,7 @@ def first_valid_index(self):
"""
return self._data_manager.first_valid_index()
- def floordiv(self, other, axis='columns', level=None, fill_value=None):
+ def floordiv(self, other, axis="columns", level=None, fill_value=None):
"""Divides this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -1496,53 +1524,63 @@ def floordiv(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Divide applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.floordiv(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
@classmethod
- def from_csv(self,
- path,
- header=0,
- sep=', ',
- index_col=0,
- parse_dates=True,
- encoding=None,
- tupleize_cols=None,
- infer_datetime_format=False):
+ def from_csv(
+ self,
+ path,
+ header=0,
+ sep=", ",
+ index_col=0,
+ parse_dates=True,
+ encoding=None,
+ tupleize_cols=None,
+ infer_datetime_format=False,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@classmethod
- def from_dict(self, data, orient='columns', dtype=None):
+ def from_dict(self, data, orient="columns", dtype=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@classmethod
- def from_items(self, items, columns=None, orient='columns'):
+ def from_items(self, items, columns=None, orient="columns"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@classmethod
- def from_records(self,
- data,
- index=None,
- exclude=None,
- columns=None,
- coerce_float=False,
- nrows=None):
+ def from_records(
+ self,
+ data,
+ index=None,
+ exclude=None,
+ columns=None,
+ coerce_float=False,
+ nrows=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def ge(self, other, axis='columns', level=None):
+ def ge(self, other, axis="columns", level=None):
"""Checks element-wise that this is greater than or equal to other.
Args:
@@ -1554,12 +1592,12 @@ def ge(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.ge(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.ge(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
def get(self, key, default=None):
@@ -1602,14 +1640,16 @@ def get_ftype_counts(self):
def get_value(self, index, col, takeable=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def get_values(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def gt(self, other, axis='columns', level=None):
+ def gt(self, other, axis="columns", level=None):
"""Checks element-wise that this is greater than other.
Args:
@@ -1621,12 +1661,12 @@ def gt(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.gt(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.gt(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
def head(self, n=5):
@@ -1643,25 +1683,28 @@ def head(self, n=5):
return DataFrame(data_manager=self._data_manager.head(n))
- def hist(self,
- data,
- column=None,
- by=None,
- grid=True,
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
- ax=None,
- sharex=False,
- sharey=False,
- figsize=None,
- layout=None,
- bins=10,
- **kwds):
+ def hist(
+ self,
+ data,
+ column=None,
+ by=None,
+ grid=True,
+ xlabelsize=None,
+ xrot=None,
+ ylabelsize=None,
+ yrot=None,
+ ax=None,
+ sharex=False,
+ sharey=False,
+ figsize=None,
+ layout=None,
+ bins=10,
+ **kwds
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def idxmax(self, axis=0, skipna=True):
"""Get the index of the first occurrence of the max value of the axis.
@@ -1674,9 +1717,8 @@ def idxmax(self, axis=0, skipna=True):
A Series with the index for each maximum value for the axis
specified.
"""
- if not all(d != np.dtype('O') for d in self.dtypes):
- raise TypeError(
- "reduction operation 'argmax' not allowed for this dtype")
+ if not all(d != np.dtype("O") for d in self.dtypes):
+ raise TypeError("reduction operation 'argmax' not allowed for this dtype")
return self._data_manager.idxmax(axis=axis, skipna=skipna)
@@ -1691,23 +1733,20 @@ def idxmin(self, axis=0, skipna=True):
A Series with the index for each minimum value for the axis
specified.
"""
- if not all(d != np.dtype('O') for d in self.dtypes):
- raise TypeError(
- "reduction operation 'argmax' not allowed for this dtype")
+ if not all(d != np.dtype("O") for d in self.dtypes):
+ raise TypeError("reduction operation 'argmax' not allowed for this dtype")
return self._data_manager.idxmin(axis=axis, skipna=skipna)
def infer_objects(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def info(self,
- verbose=None,
- buf=None,
- max_cols=None,
- memory_usage=None,
- null_counts=None):
+ "github.com/modin-project/modin."
+ )
+
+ def info(
+ self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
+ ):
"""Print a concise summary of a DataFrame, which includes the index
dtype and column dtypes, non-null values and memory usage.
@@ -1755,62 +1794,59 @@ def info(self,
null_counts = False
# Determine if actually verbose
- actually_verbose = True if verbose and max_cols > len(
- columns) else False
+ actually_verbose = True if verbose and max_cols > len(columns) else False
- if type(memory_usage) == str and memory_usage == 'deep':
+ if type(memory_usage) == str and memory_usage == "deep":
memory_usage_deep = True
else:
memory_usage_deep = False
# Start putting together output
# Class denoted in info() output
- class_string = '\n'
+ class_string = "\n"
# Create the Index info() string by parsing self.index
- index_string = index.summary() + '\n'
+ index_string = index.summary() + "\n"
if null_counts:
counts = self._data_manager.count()
if memory_usage:
memory_usage_data = self._data_manager.memory_usage(
- deep=memory_usage_deep, index=True)
+ deep=memory_usage_deep, index=True
+ )
if actually_verbose:
# Create string for verbose output
- col_string = 'Data columns (total {0} columns):\n' \
- .format(len(columns))
+ col_string = "Data columns (total {0} columns):\n".format(len(columns))
for col, dtype in zip(columns, dtypes):
- col_string += '{0}\t'.format(col)
+ col_string += "{0}\t".format(col)
if null_counts:
- col_string += '{0} not-null '.format(counts[col])
- col_string += '{0}\n'.format(dtype)
+ col_string += "{0} not-null ".format(counts[col])
+ col_string += "{0}\n".format(dtype)
else:
# Create string for not verbose output
- col_string = 'Columns: {0} entries, {1} to {2}\n'\
- .format(len(columns), columns[0], columns[-1])
+ col_string = "Columns: {0} entries, {1} to {2}\n".format(
+ len(columns), columns[0], columns[-1]
+ )
# A summary of the dtypes in the dataframe
dtypes_string = "dtypes: "
for dtype, count in dtypes.value_counts().iteritems():
dtypes_string += "{0}({1}),".format(dtype, count)
- dtypes_string = dtypes_string[:-1] + '\n'
+ dtypes_string = dtypes_string[:-1] + "\n"
# Create memory usage string
- memory_string = ''
+ memory_string = ""
if memory_usage:
if memory_usage_deep:
- memory_string = 'memory usage: {0} bytes'.format(
- memory_usage_data)
+ memory_string = "memory usage: {0} bytes".format(memory_usage_data)
else:
- memory_string = 'memory usage: {0}+ bytes'.format(
- memory_usage_data)
+ memory_string = "memory usage: {0}+ bytes".format(memory_usage_data)
# Combine all the components of the info() output
- result = ''.join([
- class_string, index_string, col_string, dtypes_string,
- memory_string
- ])
+ result = "".join(
+ [class_string, index_string, col_string, dtypes_string, memory_string]
+ )
# Write to specified output buffer
buf.write(result)
@@ -1830,29 +1866,33 @@ def insert(self, loc, column, value, allow_duplicates=False):
if len(value) != len(self.index):
raise ValueError("Length of values does not match length of index")
if not allow_duplicates and column in self.columns:
- raise ValueError(
- "cannot insert {0}, already exists".format(column))
+ raise ValueError("cannot insert {0}, already exists".format(column))
if loc > len(self.columns):
raise IndexError(
"index {0} is out of bounds for axis 0 with size {1}".format(
- loc, len(self.columns)))
+ loc, len(self.columns)
+ )
+ )
if loc < 0:
raise ValueError("unbounded slice")
new_manager = self._data_manager.insert(loc, column, value)
self._update_inplace(new_manager=new_manager)
- def interpolate(self,
- method='linear',
- axis=0,
- limit=None,
- inplace=False,
- limit_direction='forward',
- downcast=None,
- **kwargs):
+ def interpolate(
+ self,
+ method="linear",
+ axis=0,
+ limit=None,
+ inplace=False,
+ limit_direction="forward",
+ downcast=None,
+ **kwargs
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def iterrows(self):
"""Iterate over DataFrame rows as (index, Series) pairs.
@@ -1872,8 +1912,7 @@ def iterrow_builder(df):
df.index = [next(index_iter)]
return df.iterrows()
- partition_iterator = PartitionIterator(self._data_manager, 0,
- iterrow_builder)
+ partition_iterator = PartitionIterator(self._data_manager, 0, iterrow_builder)
for v in partition_iterator:
yield v
@@ -1896,8 +1935,7 @@ def items_builder(df):
df.index = self.index
return df.items()
- partition_iterator = PartitionIterator(self._data_manager, 1,
- items_builder)
+ partition_iterator = PartitionIterator(self._data_manager, 1, items_builder)
for v in partition_iterator:
yield v
@@ -1913,7 +1951,7 @@ def iteritems(self):
"""
return self.items()
- def itertuples(self, index=True, name='Pandas'):
+ def itertuples(self, index=True, name="Pandas"):
"""Iterate over DataFrame rows as namedtuples.
Args:
@@ -1936,19 +1974,14 @@ def itertuples_builder(df):
df.index = [next(index_iter)]
return df.itertuples(index=index, name=name)
- partition_iterator = PartitionIterator(self._data_manager, 0,
- itertuples_builder)
+ partition_iterator = PartitionIterator(
+ self._data_manager, 0, itertuples_builder
+ )
for v in partition_iterator:
yield v
- def join(self,
- other,
- on=None,
- how='left',
- lsuffix='',
- rsuffix='',
- sort=False):
+ def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False):
"""Join two or more DataFrames, or a DataFrame with a collection.
Args:
@@ -1978,7 +2011,8 @@ def join(self,
pandas.DataFrame(columns=self.columns).join(
pandas.DataFrame(columns=other.columns),
lsuffix=lsuffix,
- rsuffix=rsuffix).columns
+ rsuffix=rsuffix,
+ ).columns
return DataFrame(
data_manager=self._data_manager.join(
@@ -1986,18 +2020,22 @@ def join(self,
how=how,
lsuffix=lsuffix,
rsuffix=rsuffix,
- sort=sort))
+ sort=sort,
+ )
+ )
else:
# This constraint carried over from Pandas.
if on is not None:
- raise ValueError("Joining multiple DataFrames only supported"
- " for joining on index")
+ raise ValueError(
+ "Joining multiple DataFrames only supported" " for joining on index"
+ )
# See note above about error checking with an empty join.
pandas.DataFrame(columns=self.columns).join(
[pandas.DataFrame(columns=obj.columns) for obj in other],
lsuffix=lsuffix,
- rsuffix=rsuffix).columns
+ rsuffix=rsuffix,
+ ).columns
return DataFrame(
data_manager=self._data_manager.join(
@@ -2005,32 +2043,27 @@ def join(self,
how=how,
lsuffix=lsuffix,
rsuffix=rsuffix,
- sort=sort))
-
- def kurt(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ sort=sort,
+ )
+ )
+
+ def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def kurtosis(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ "github.com/modin-project/modin."
+ )
+
+ def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def last(self, offset):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def last_valid_index(self):
"""Return index for last non-NA/null value.
@@ -2040,7 +2073,7 @@ def last_valid_index(self):
"""
return self._data_manager.last_valid_index()
- def le(self, other, axis='columns', level=None):
+ def le(self, other, axis="columns", level=None):
"""Checks element-wise that this is less than or equal to other.
Args:
@@ -2052,20 +2085,21 @@ def le(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.le(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.le(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
def lookup(self, row_labels, col_labels):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def lt(self, other, axis='columns', level=None):
+ def lt(self, other, axis="columns", level=None):
"""Checks element-wise that this is less than other.
Args:
@@ -2077,38 +2111,37 @@ def lt(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.lt(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.lt(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
def mad(self, axis=None, skipna=None, level=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def mask(self,
- cond,
- other=np.nan,
- inplace=False,
- axis=None,
- level=None,
- errors='raise',
- try_cast=False,
- raise_on_error=None):
+ "github.com/modin-project/modin."
+ )
+
+ def mask(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ errors="raise",
+ try_cast=False,
+ raise_on_error=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def max(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ "github.com/modin-project/modin."
+ )
+
+ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Perform max across the DataFrame.
Args:
@@ -2118,22 +2151,13 @@ def max(self,
Returns:
The max of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.max(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs)
-
- def mean(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs
+ )
+
+ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Computes mean across the DataFrame.
Args:
@@ -2143,22 +2167,13 @@ def mean(self,
Returns:
The mean of the DataFrame. (Pandas series)
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.mean(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs)
-
- def median(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs
+ )
+
+ def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Computes median across the DataFrame.
Args:
@@ -2168,24 +2183,23 @@ def median(self,
Returns:
The median of the DataFrame. (Pandas series)
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.median(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs)
-
- def melt(self,
- id_vars=None,
- value_vars=None,
- var_name=None,
- value_name='value',
- col_level=None):
+ axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs
+ )
+
+ def melt(
+ self,
+ id_vars=None,
+ value_vars=None,
+ var_name=None,
+ value_name="value",
+ col_level=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def memory_usage(self, index=True, deep=False):
"""Returns the memory usage of each column in bytes
@@ -2206,23 +2220,25 @@ def memory_usage(self, index=True, deep=False):
result.index = self.columns
if index:
index_value = self.index.memory_usage(deep=deep)
- return pandas.Series(index_value, index=['Index']).append(result)
+ return pandas.Series(index_value, index=["Index"]).append(result)
return result
- def merge(self,
- right,
- how='inner',
- on=None,
- left_on=None,
- right_on=None,
- left_index=False,
- right_index=False,
- sort=False,
- suffixes=('_x', '_y'),
- copy=True,
- indicator=False,
- validate=None):
+ def merge(
+ self,
+ right,
+ how="inner",
+ on=None,
+ left_on=None,
+ right_on=None,
+ left_index=False,
+ right_index=False,
+ sort=False,
+ suffixes=("_x", "_y"),
+ copy=True,
+ indicator=False,
+ validate=None,
+ ):
"""Database style join, where common columns in "on" are merged.
Args:
@@ -2247,28 +2263,23 @@ def merge(self,
"""
if not isinstance(right, DataFrame):
- raise ValueError("can not merge DataFrame with instance of type "
- "{}".format(type(right)))
+ raise ValueError(
+ "can not merge DataFrame with instance of type "
+ "{}".format(type(right))
+ )
if left_index is False or right_index is False:
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
if left_index and right_index:
return self.join(
- right,
- how=how,
- lsuffix=suffixes[0],
- rsuffix=suffixes[1],
- sort=sort)
-
- def min(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ right, how=how, lsuffix=suffixes[0], rsuffix=suffixes[1], sort=sort
+ )
+
+ def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Perform min across the DataFrame.
Args:
@@ -2278,17 +2289,13 @@ def min(self,
Returns:
The min of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.min(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs)
+ axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs
+ )
- def mod(self, other, axis='columns', level=None, fill_value=None):
+ def mod(self, other, axis="columns", level=None, fill_value=None):
"""Mods this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -2301,12 +2308,14 @@ def mod(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Mod applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.mod(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
def mode(self, axis=0, numeric_only=False):
@@ -2322,10 +2331,10 @@ def mode(self, axis=0, numeric_only=False):
axis = pandas.DataFrame()._get_axis_number(axis)
return DataFrame(
- data_manager=self._data_manager.mode(
- axis=axis, numeric_only=numeric_only))
+ data_manager=self._data_manager.mode(axis=axis, numeric_only=numeric_only)
+ )
- def mul(self, other, axis='columns', level=None, fill_value=None):
+ def mul(self, other, axis="columns", level=None, fill_value=None):
"""Multiplies this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -2338,15 +2347,17 @@ def mul(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Multiply applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.mul(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def multiply(self, other, axis='columns', level=None, fill_value=None):
+ def multiply(self, other, axis="columns", level=None, fill_value=None):
"""Synonym for mul.
Args:
@@ -2360,7 +2371,7 @@ def multiply(self, other, axis='columns', level=None, fill_value=None):
"""
return self.mul(other, axis, level, fill_value)
- def ne(self, other, axis='columns', level=None):
+ def ne(self, other, axis="columns", level=None):
"""Checks element-wise that this is not equal to other.
Args:
@@ -2372,18 +2383,19 @@ def ne(self, other, axis='columns', level=None):
A new DataFrame filled with Booleans.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
- new_manager = self._data_manager.ne(
- other=other, axis=axis, level=level)
+ new_manager = self._data_manager.ne(other=other, axis=axis, level=level)
return self._create_dataframe_from_manager(new_manager)
- def nlargest(self, n, columns, keep='first'):
+ def nlargest(self, n, columns, keep="first"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def notna(self):
"""Perform notna across the DataFrame.
@@ -2403,10 +2415,11 @@ def notnull(self):
"""
return DataFrame(data_manager=self._data_manager.notnull())
- def nsmallest(self, n, columns, keep='first'):
+ def nsmallest(self, n, columns, keep="first"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def nunique(self, axis=0, dropna=True):
"""Return Series with number of distinct
@@ -2421,15 +2434,11 @@ def nunique(self, axis=0, dropna=True):
"""
return self._data_manager.nunique(axis=axis, dropna=dropna)
- def pct_change(self,
- periods=1,
- fill_method='pad',
- limit=None,
- freq=None,
- **kwargs):
+ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def pipe(self, func, *args, **kwargs):
"""Apply func(self, *args, **kwargs)
@@ -2447,55 +2456,62 @@ def pipe(self, func, *args, **kwargs):
def pivot(self, index=None, columns=None, values=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def pivot_table(self,
- values=None,
- index=None,
- columns=None,
- aggfunc='mean',
- fill_value=None,
- margins=False,
- dropna=True,
- margins_name='All'):
+ "github.com/modin-project/modin."
+ )
+
+ def pivot_table(
+ self,
+ values=None,
+ index=None,
+ columns=None,
+ aggfunc="mean",
+ fill_value=None,
+ margins=False,
+ dropna=True,
+ margins_name="All",
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def plot(self,
- x=None,
- y=None,
- kind='line',
- ax=None,
- subplots=False,
- sharex=None,
- sharey=False,
- layout=None,
- figsize=None,
- use_index=True,
- title=None,
- grid=None,
- legend=True,
- style=None,
- logx=False,
- logy=False,
- loglog=False,
- xticks=None,
- yticks=None,
- xlim=None,
- ylim=None,
- rot=None,
- fontsize=None,
- colormap=None,
- table=False,
- yerr=None,
- xerr=None,
- secondary_y=False,
- sort_columns=False,
- **kwds):
+ "github.com/modin-project/modin."
+ )
+
+ def plot(
+ self,
+ x=None,
+ y=None,
+ kind="line",
+ ax=None,
+ subplots=False,
+ sharex=None,
+ sharey=False,
+ layout=None,
+ figsize=None,
+ use_index=True,
+ title=None,
+ grid=None,
+ legend=True,
+ style=None,
+ logx=False,
+ logy=False,
+ loglog=False,
+ xticks=None,
+ yticks=None,
+ xlim=None,
+ ylim=None,
+ rot=None,
+ fontsize=None,
+ colormap=None,
+ table=False,
+ yerr=None,
+ xerr=None,
+ secondary_y=False,
+ sort_columns=False,
+ **kwds
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def pop(self, item):
"""Pops an item from this DataFrame and returns it.
@@ -2511,7 +2527,7 @@ def pop(self, item):
del self[item]
return result
- def pow(self, other, axis='columns', level=None, fill_value=None):
+ def pow(self, other, axis="columns", level=None, fill_value=None):
"""Pow this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -2524,21 +2540,25 @@ def pow(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Pow applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.pow(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def prod(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- min_count=1,
- **kwargs):
+ def prod(
+ self,
+ axis=None,
+ skipna=None,
+ level=None,
+ numeric_only=None,
+ min_count=1,
+ **kwargs
+ ):
"""Return the product of the values for the requested axis
Args:
@@ -2551,8 +2571,7 @@ def prod(self,
Returns:
prod : Series or DataFrame (if level specified)
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.prod(
axis=axis,
@@ -2560,15 +2579,18 @@ def prod(self,
level=level,
numeric_only=numeric_only,
min_count=min_count,
- **kwargs)
-
- def product(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- min_count=1,
- **kwargs):
+ **kwargs
+ )
+
+ def product(
+ self,
+ axis=None,
+ skipna=None,
+ level=None,
+ numeric_only=None,
+ min_count=1,
+ **kwargs
+ ):
"""Return the product of the values for the requested axis
Args:
@@ -2587,13 +2609,10 @@ def product(self,
level=level,
numeric_only=numeric_only,
min_count=min_count,
- **kwargs)
+ **kwargs
+ )
- def quantile(self,
- q=0.5,
- axis=0,
- numeric_only=True,
- interpolation='linear'):
+ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
"""Return values at the given quantile over requested axis,
a la numpy.percentile.
@@ -2616,14 +2635,13 @@ def quantile(self,
"""
def check_dtype(t):
- return (is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t))
+ return is_numeric_dtype(t) or is_datetime_or_timedelta_dtype(t)
if not numeric_only:
# If not numeric_only and columns, then check all columns are either
# numeric, timestamp, or timedelta
if not axis and not all(check_dtype(t) for t in self.dtypes):
- raise TypeError("can't multiply sequence by non-int of type "
- "'float'")
+ raise TypeError("can't multiply sequence by non-int of type " "'float'")
# If over rows, then make sure that all dtypes are equal for not
# numeric_only
@@ -2634,7 +2652,9 @@ def check_dtype(t):
if not is_dtype_equal(pre_dtype, curr_dtype):
raise TypeError(
"Cannot compare type '{0}' with type '{1}'".format(
- pre_dtype, curr_dtype))
+ pre_dtype, curr_dtype
+ )
+ )
else:
# Normally pandas returns this near the end of the quantile, but we
# can't afford the overhead of running the entire operation before
@@ -2653,14 +2673,14 @@ def check_dtype(t):
q=q,
axis=axis,
numeric_only=numeric_only,
- interpolation=interpolation))
+ interpolation=interpolation,
+ )
+ )
else:
return self._data_manager.quantile_for_single_value(
- q=q,
- axis=axis,
- numeric_only=numeric_only,
- interpolation=interpolation)
+ q=q, axis=axis, numeric_only=numeric_only, interpolation=interpolation
+ )
def query(self, expr, inplace=False, **kwargs):
"""Queries the Dataframe with a boolean expression
@@ -2678,16 +2698,18 @@ def query(self, expr, inplace=False, **kwargs):
else:
return DataFrame(data_manager=new_manager)
- def radd(self, other, axis='columns', level=None, fill_value=None):
+ def radd(self, other, axis="columns", level=None, fill_value=None):
return self.add(other, axis, level, fill_value)
- def rank(self,
- axis=0,
- method='average',
- numeric_only=None,
- na_option='keep',
- ascending=True,
- pct=False):
+ def rank(
+ self,
+ axis=0,
+ method="average",
+ numeric_only=None,
+ na_option="keep",
+ ascending=True,
+ pct=False,
+ ):
"""
Compute numerical data ranks (1 through n) along axis.
Equal values are assigned a rank that is the [method] of
@@ -2718,9 +2740,11 @@ def rank(self,
numeric_only=numeric_only,
na_option=na_option,
ascending=ascending,
- pct=pct))
+ pct=pct,
+ )
+ )
- def rdiv(self, other, axis='columns', level=None, fill_value=None):
+ def rdiv(self, other, axis="columns", level=None, fill_value=None):
"""Div this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -2733,33 +2757,37 @@ def rdiv(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the rdiv applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.rdiv(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def reindex(self,
- labels=None,
- index=None,
- columns=None,
- axis=None,
- method=None,
- copy=True,
- level=None,
- fill_value=np.nan,
- limit=None,
- tolerance=None):
+ def reindex(
+ self,
+ labels=None,
+ index=None,
+ columns=None,
+ axis=None,
+ method=None,
+ copy=True,
+ level=None,
+ fill_value=np.nan,
+ limit=None,
+ tolerance=None,
+ ):
if level is not None:
raise NotImplementedError(
"Multilevel Index not Implemented. "
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
if axis == 0 and labels is not None:
index = labels
elif labels is not None:
@@ -2772,7 +2800,8 @@ def reindex(self,
method=method,
fill_value=fill_value,
limit=limit,
- tolerance=tolerance)
+ tolerance=tolerance,
+ )
else:
new_manager = self._data_manager
@@ -2783,7 +2812,8 @@ def reindex(self,
method=method,
fill_value=fill_value,
limit=limit,
- tolerance=tolerance)
+ tolerance=tolerance,
+ )
else:
final_manager = new_manager
@@ -2792,36 +2822,37 @@ def reindex(self,
self._update_inplace(new_manager=final_manager)
- def reindex_axis(self,
- labels,
- axis=0,
- method=None,
- level=None,
- copy=True,
- limit=None,
- fill_value=np.nan):
+ def reindex_axis(
+ self,
+ labels,
+ axis=0,
+ method=None,
+ level=None,
+ copy=True,
+ limit=None,
+ fill_value=np.nan,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def reindex_like(self,
- other,
- method=None,
- copy=True,
- limit=None,
- tolerance=None):
+ "github.com/modin-project/modin."
+ )
+
+ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def rename(self,
- mapper=None,
- index=None,
- columns=None,
- axis=None,
- copy=True,
- inplace=False,
- level=None):
+ "github.com/modin-project/modin."
+ )
+
+ def rename(
+ self,
+ mapper=None,
+ index=None,
+ columns=None,
+ axis=None,
+ copy=True,
+ inplace=False,
+ level=None,
+ ):
"""Alters axes labels.
Args:
@@ -2835,19 +2866,16 @@ def rename(self,
Returns:
If inplace is False, a new DataFrame with the updated axes.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# We have to do this with the args because of how rename handles
# kwargs. It doesn't ignore None values passed in, so we have to filter
# them ourselves.
args = locals()
- kwargs = {
- k: v
- for k, v in args.items() if v is not None and k != "self"
- }
+ kwargs = {k: v for k, v in args.items() if v is not None and k != "self"}
# inplace should always be true because this is just a copy, and we
# will use the results after.
- kwargs['inplace'] = True
+ kwargs["inplace"] = True
df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns)
df_to_rename.rename(**kwargs)
@@ -2897,44 +2925,48 @@ def _set_axis_name(self, name, axis=0, inplace=False):
def reorder_levels(self, order, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def replace(self,
- to_replace=None,
- value=None,
- inplace=False,
- limit=None,
- regex=False,
- method='pad',
- axis=None):
+ "github.com/modin-project/modin."
+ )
+
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ axis=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def resample(self,
- rule,
- how=None,
- axis=0,
- fill_method=None,
- closed=None,
- label=None,
- convention='start',
- kind=None,
- loffset=None,
- limit=None,
- base=0,
- on=None,
- level=None):
+ "github.com/modin-project/modin."
+ )
+
+ def resample(
+ self,
+ rule,
+ how=None,
+ axis=0,
+ fill_method=None,
+ closed=None,
+ label=None,
+ convention="start",
+ kind=None,
+ loffset=None,
+ limit=None,
+ base=0,
+ on=None,
+ level=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def reset_index(self,
- level=None,
- drop=False,
- inplace=False,
- col_level=0,
- col_fill=''):
+ "github.com/modin-project/modin."
+ )
+
+ def reset_index(
+ self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
+ ):
"""Reset this index to default and create column from current index.
Args:
@@ -2956,7 +2988,7 @@ def reset_index(self,
# TODO Implement level
if level is not None:
raise NotImplementedError("Level not yet supported!")
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
# Error checking for matching Pandas. Pandas does not allow you to
# insert a dropped index into a DataFrame if these columns already
@@ -2970,27 +3002,30 @@ def reset_index(self,
else:
return DataFrame(data_manager=new_manager)
- def rfloordiv(self, other, axis='columns', level=None, fill_value=None):
+ def rfloordiv(self, other, axis="columns", level=None, fill_value=None):
return self.floordiv(other, axis, level, fill_value)
- def rmod(self, other, axis='columns', level=None, fill_value=None):
+ def rmod(self, other, axis="columns", level=None, fill_value=None):
return self.mod(other, axis, level, fill_value)
- def rmul(self, other, axis='columns', level=None, fill_value=None):
+ def rmul(self, other, axis="columns", level=None, fill_value=None):
return self.mul(other, axis, level, fill_value)
- def rolling(self,
- window,
- min_periods=None,
- freq=None,
- center=False,
- win_type=None,
- on=None,
- axis=0,
- closed=None):
+ def rolling(
+ self,
+ window,
+ min_periods=None,
+ freq=None,
+ center=False,
+ win_type=None,
+ on=None,
+ axis=0,
+ closed=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def round(self, decimals=0, *args, **kwargs):
"""Round each element in the DataFrame.
@@ -3002,9 +3037,10 @@ def round(self, decimals=0, *args, **kwargs):
A new DataFrame.
"""
return DataFrame(
- data_manager=self._data_manager.round(decimals=decimals, **kwargs))
+ data_manager=self._data_manager.round(decimals=decimals, **kwargs)
+ )
- def rpow(self, other, axis='columns', level=None, fill_value=None):
+ def rpow(self, other, axis="columns", level=None, fill_value=None):
"""Pow this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -3017,16 +3053,18 @@ def rpow(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Pow applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.rpow(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def rsub(self, other, axis='columns', level=None, fill_value=None):
+ def rsub(self, other, axis="columns", level=None, fill_value=None):
"""Subtract a DataFrame/Series/scalar from this DataFrame.
Args:
@@ -3039,24 +3077,28 @@ def rsub(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the subtraciont applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+ "Mutlilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.rsub(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def rtruediv(self, other, axis='columns', level=None, fill_value=None):
+ def rtruediv(self, other, axis="columns", level=None, fill_value=None):
return self.truediv(other, axis, level, fill_value)
- def sample(self,
- n=None,
- frac=None,
- replace=False,
- weights=None,
- random_state=None,
- axis=None):
+ def sample(
+ self,
+ n=None,
+ frac=None,
+ replace=False,
+ weights=None,
+ random_state=None,
+ axis=None,
+ ):
"""Returns a random sample of items from an axis of object.
Args:
@@ -3082,8 +3124,7 @@ def sample(self,
A new Dataframe
"""
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
if axis == 0:
axis_labels = self.index
@@ -3106,25 +3147,26 @@ def sample(self,
try:
weights = self[weights]
except KeyError:
- raise KeyError("String passed to weights not a "
- "valid column")
+ raise KeyError("String passed to weights not a " "valid column")
else:
- raise ValueError("Strings can only be passed to "
- "weights when sampling from rows on "
- "a DataFrame")
+ raise ValueError(
+ "Strings can only be passed to "
+ "weights when sampling from rows on "
+ "a DataFrame"
+ )
- weights = pandas.Series(weights, dtype='float64')
+ weights = pandas.Series(weights, dtype="float64")
if len(weights) != axis_length:
- raise ValueError("Weights and axis to be sampled must be of "
- "same length")
+ raise ValueError(
+ "Weights and axis to be sampled must be of " "same length"
+ )
if (weights == np.inf).any() or (weights == -np.inf).any():
raise ValueError("weight vector may not include `inf` values")
if (weights < 0).any():
- raise ValueError("weight vector many not include negative "
- "values")
+ raise ValueError("weight vector many not include negative " "values")
# weights cannot be NaN when sampling, so we must set all nan
# values to 0
@@ -3154,11 +3196,11 @@ def sample(self,
elif n is not None and frac is not None:
# Pandas specification does not allow both n and frac to be passed
# in
- raise ValueError('Please enter a value for `frac` OR `n`, not '
- 'both')
+ raise ValueError("Please enter a value for `frac` OR `n`, not " "both")
if n < 0:
- raise ValueError("A negative number of rows requested. Please "
- "provide positive value.")
+ raise ValueError(
+ "A negative number of rows requested. Please " "provide positive value."
+ )
if n == 0:
# An Empty DataFrame is returned if the number of samples is 0.
@@ -3166,7 +3208,8 @@ def sample(self,
# depending on which axis is passed in.
return DataFrame(
columns=[] if axis == 1 else self.columns,
- index=self.index if axis == 1 else [])
+ index=self.index if axis == 1 else [],
+ )
if random_state is not None:
# Get a random number generator depending on the type of
@@ -3177,18 +3220,22 @@ def sample(self,
random_num_gen = random_state
else:
# random_state must be an int or a numpy RandomState object
- raise ValueError("Please enter an `int` OR a "
- "np.random.RandomState for random_state")
+ raise ValueError(
+ "Please enter an `int` OR a "
+ "np.random.RandomState for random_state"
+ )
# choose random numbers and then get corresponding labels from
# chosen axis
sample_indices = random_num_gen.choice(
- np.arange(0, axis_length), size=n, replace=replace)
+ np.arange(0, axis_length), size=n, replace=replace
+ )
samples = axis_labels[sample_indices]
else:
# randomly select labels from chosen axis
samples = np.random.choice(
- a=axis_labels, size=n, replace=replace, p=weights)
+ a=axis_labels, size=n, replace=replace, p=weights
+ )
if axis == 1:
data_manager = self._data_manager.getitem_col_array(samples)
@@ -3200,7 +3247,8 @@ def sample(self,
def select(self, crit, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def select_dtypes(self, include=None, exclude=None):
# Validates arguments for whether both include and exclude are None or
@@ -3219,8 +3267,7 @@ def select_dtypes(self, include=None, exclude=None):
sel = tuple(map(set, (include, exclude)))
- include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)),
- sel)
+ include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)), sel)
include_these = pandas.Series(not bool(include), index=self.columns)
exclude_these = pandas.Series(not bool(exclude), index=self.columns)
@@ -3228,8 +3275,9 @@ def select_dtypes(self, include=None, exclude=None):
def is_dtype_instance_mapper(column, dtype):
return column, functools.partial(issubclass, dtype.type)
- for column, f in itertools.starmap(is_dtype_instance_mapper,
- self.dtypes.iteritems()):
+ for column, f in itertools.starmap(
+ is_dtype_instance_mapper, self.dtypes.iteritems()
+ ):
if include: # checks for the case of empty include or exclude
include_these[column] = any(map(f, include))
if exclude:
@@ -3237,21 +3285,17 @@ def is_dtype_instance_mapper(column, dtype):
dtype_indexer = include_these & exclude_these
indicate = [
- i for i in range(len(dtype_indexer.values))
- if not dtype_indexer.values[i]
+ i for i in range(len(dtype_indexer.values)) if not dtype_indexer.values[i]
]
return self.drop(columns=self.columns[indicate], inplace=False)
- def sem(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ def sem(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def set_axis(self, labels, axis=0, inplace=None):
"""Assign desired index to given axis.
@@ -3268,19 +3312,21 @@ def set_axis(self, labels, axis=0, inplace=None):
warnings.warn(
'set_axis now takes "labels" as first argument, and '
'"axis" as named parameter. The old form, with "axis" as '
- 'first parameter and \"labels\" as second, is still supported '
- 'but will be deprecated in a future version of pandas.',
+ 'first parameter and "labels" as second, is still supported '
+ "but will be deprecated in a future version of pandas.",
FutureWarning,
- stacklevel=2)
+ stacklevel=2,
+ )
labels, axis = axis, labels
if inplace is None:
warnings.warn(
- 'set_axis currently defaults to operating inplace.\nThis '
- 'will change in a future version of pandas, use '
- 'inplace=True to avoid this warning.',
+ "set_axis currently defaults to operating inplace.\nThis "
+ "will change in a future version of pandas, use "
+ "inplace=True to avoid this warning.",
FutureWarning,
- stacklevel=2)
+ stacklevel=2,
+ )
inplace = True
if inplace:
setattr(self, pandas.DataFrame()._get_axis_name(axis), labels)
@@ -3289,12 +3335,9 @@ def set_axis(self, labels, axis=0, inplace=None):
obj.set_axis(labels, axis=axis, inplace=True)
return obj
- def set_index(self,
- keys,
- drop=True,
- append=False,
- inplace=False,
- verify_integrity=False):
+ def set_index(
+ self, keys, drop=True, append=False, inplace=False, verify_integrity=False
+ ):
"""Set the DataFrame index using one or more existing columns.
Args:
@@ -3309,7 +3352,7 @@ def set_index(self,
Returns:
If inplace is set to false returns a new DataFrame, otherwise None.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if not isinstance(keys, list):
keys = [keys]
@@ -3358,7 +3401,7 @@ def set_index(self,
if verify_integrity and not index.is_unique:
duplicates = index.get_duplicates()
- raise ValueError('Index has duplicate keys: %s' % duplicates)
+ raise ValueError("Index has duplicate keys: %s" % duplicates)
for c in to_remove:
del frame[c]
@@ -3374,19 +3417,16 @@ def set_index(self,
def set_value(self, index, col, value, takeable=False):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def shift(self, periods=1, freq=None, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def skew(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ "github.com/modin-project/modin."
+ )
+
+ def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
"""Return unbiased skew over requested axis Normalized by N-1
Args:
@@ -3400,26 +3440,26 @@ def skew(self,
skew : Series or DataFrame (if level specified)
"""
return self._data_manager.skew(
- axis=axis,
- skipna=skipna,
- level=level,
- numeric_only=numeric_only,
- **kwargs)
+ axis=axis, skipna=skipna, level=level, numeric_only=numeric_only, **kwargs
+ )
def slice_shift(self, periods=1, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def sort_index(self,
- axis=0,
- level=None,
- ascending=True,
- inplace=False,
- kind='quicksort',
- na_position='last',
- sort_remaining=True,
- by=None):
+ "github.com/modin-project/modin."
+ )
+
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ by=None,
+ ):
"""Sort a DataFrame by one of the indices (columns or index).
Args:
@@ -3443,11 +3483,11 @@ def sort_index(self,
"by argument to sort_index is deprecated, "
"please use .sort_values(by=...)",
FutureWarning,
- stacklevel=2)
+ stacklevel=2,
+ )
if level is not None:
raise ValueError("unable to simultaneously sort by and level")
- return self.sort_values(
- by, axis=axis, ascending=ascending, inplace=inplace)
+ return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace)
axis = pandas.DataFrame()._get_axis_number(axis)
@@ -3460,13 +3500,15 @@ def sort_index(self,
return self.reindex(index=new_index, columns=new_columns)
- def sort_values(self,
- by,
- axis=0,
- ascending=True,
- inplace=False,
- kind='quicksort',
- na_position='last'):
+ def sort_values(
+ self,
+ by,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
"""Sorts by a column/row or list of columns/rows.
Args:
@@ -3490,55 +3532,53 @@ def sort_values(self,
# TODO create a more efficient way to sort
if axis == 0:
broadcast_value_dict = {col: self[col] for col in by}
- broadcast_values = pandas.DataFrame(
- broadcast_value_dict, index=self.index)
+ broadcast_values = pandas.DataFrame(broadcast_value_dict, index=self.index)
new_index = broadcast_values.sort_values(
- by=by, axis=axis, ascending=ascending, kind=kind).index
+ by=by, axis=axis, ascending=ascending, kind=kind
+ ).index
return self.reindex(index=new_index)
else:
broadcast_value_list = [
- to_pandas(self[row::len(self.index)]) for row in by
+ to_pandas(self[row :: len(self.index)]) for row in by
]
index_builder = list(zip(broadcast_value_list, by))
- broadcast_values = \
- pandas.concat([row for row, idx in index_builder], copy=False)
+ broadcast_values = pandas.concat(
+ [row for row, idx in index_builder], copy=False
+ )
broadcast_values.columns = self.columns
new_columns = broadcast_values.sort_values(
- by=by, axis=axis, ascending=ascending, kind=kind).columns
+ by=by, axis=axis, ascending=ascending, kind=kind
+ ).columns
return self.reindex(columns=new_columns)
- def sortlevel(self,
- level=0,
- axis=0,
- ascending=True,
- inplace=False,
- sort_remaining=True):
+ def sortlevel(
+ self, level=0, axis=0, ascending=True, inplace=False, sort_remaining=True
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def squeeze(self, axis=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def stack(self, level=-1, dropna=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def std(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ "github.com/modin-project/modin."
+ )
+
+ def std(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
"""Computes standard deviation across the DataFrame.
Args:
@@ -3549,8 +3589,7 @@ def std(self,
Returns:
The std of the DataFrame (Pandas Series)
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.std(
axis=axis,
@@ -3558,9 +3597,10 @@ def std(self,
level=level,
ddof=ddof,
numeric_only=numeric_only,
- **kwargs)
+ **kwargs
+ )
- def sub(self, other, axis='columns', level=None, fill_value=None):
+ def sub(self, other, axis="columns", level=None, fill_value=None):
"""Subtract a DataFrame/Series/scalar from this DataFrame.
Args:
@@ -3573,15 +3613,17 @@ def sub(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the subtraciont applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+                "Multilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.sub(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
- def subtract(self, other, axis='columns', level=None, fill_value=None):
+ def subtract(self, other, axis="columns", level=None, fill_value=None):
"""Alias for sub.
Args:
@@ -3598,12 +3640,14 @@ def subtract(self, other, axis='columns', level=None, fill_value=None):
def swapaxes(self, axis1, axis2, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def swaplevel(self, i=-2, j=-1, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def tail(self, n=5):
"""Get the last n rows of the DataFrame.
@@ -3622,209 +3666,257 @@ def tail(self, n=5):
def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def to_clipboard(self, excel=None, sep=None, **kwargs):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_clipboard(excel, sep, **kwargs)
- def to_csv(self,
- path_or_buf=None,
- sep=",",
- na_rep="",
- float_format=None,
- columns=None,
- header=True,
- index=True,
- index_label=None,
- mode="w",
- encoding=None,
- compression=None,
- quoting=None,
- quotechar='"',
- line_terminator="\n",
- chunksize=None,
- tupleize_cols=None,
- date_format=None,
- doublequote=True,
- escapechar=None,
- decimal="."):
+ def to_csv(
+ self,
+ path_or_buf=None,
+ sep=",",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ mode="w",
+ encoding=None,
+ compression=None,
+ quoting=None,
+ quotechar='"',
+ line_terminator="\n",
+ chunksize=None,
+ tupleize_cols=None,
+ date_format=None,
+ doublequote=True,
+ escapechar=None,
+ decimal=".",
+ ):
kwargs = {
- 'path_or_buf': path_or_buf,
- 'sep': sep,
- 'na_rep': na_rep,
- 'float_format': float_format,
- 'columns': columns,
- 'header': header,
- 'index': index,
- 'index_label': index_label,
- 'mode': mode,
- 'encoding': encoding,
- 'compression': compression,
- 'quoting': quoting,
- 'quotechar': quotechar,
- 'line_terminator': line_terminator,
- 'chunksize': chunksize,
- 'tupleize_cols': tupleize_cols,
- 'date_format': date_format,
- 'doublequote': doublequote,
- 'escapechar': escapechar,
- 'decimal': decimal
+ "path_or_buf": path_or_buf,
+ "sep": sep,
+ "na_rep": na_rep,
+ "float_format": float_format,
+ "columns": columns,
+ "header": header,
+ "index": index,
+ "index_label": index_label,
+ "mode": mode,
+ "encoding": encoding,
+ "compression": compression,
+ "quoting": quoting,
+ "quotechar": quotechar,
+ "line_terminator": line_terminator,
+ "chunksize": chunksize,
+ "tupleize_cols": tupleize_cols,
+ "date_format": date_format,
+ "doublequote": doublequote,
+ "escapechar": escapechar,
+ "decimal": decimal,
}
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
return to_pandas(self).to_csv(**kwargs)
def to_dense(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_dict(self, orient='dict', into=dict):
+ def to_dict(self, orient="dict", into=dict):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def to_excel(self,
- excel_writer,
- sheet_name='Sheet1',
- na_rep='',
- float_format=None,
- columns=None,
- header=True,
- index=True,
- index_label=None,
- startrow=0,
- startcol=0,
- engine=None,
- merge_cells=True,
- encoding=None,
- inf_rep='inf',
- verbose=True,
- freeze_panes=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ "github.com/modin-project/modin."
+ )
+
+ def to_excel(
+ self,
+ excel_writer,
+ sheet_name="Sheet1",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ startrow=0,
+ startcol=0,
+ engine=None,
+ merge_cells=True,
+ encoding=None,
+ inf_rep="inf",
+ verbose=True,
+ freeze_panes=None,
+ ):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
- port_frame.to_excel(excel_writer, sheet_name, na_rep, float_format,
- columns, header, index, index_label, startrow,
- startcol, engine, merge_cells, encoding, inf_rep,
- verbose, freeze_panes)
+ port_frame.to_excel(
+ excel_writer,
+ sheet_name,
+ na_rep,
+ float_format,
+ columns,
+ header,
+ index,
+ index_label,
+ startrow,
+ startcol,
+ engine,
+ merge_cells,
+ encoding,
+ inf_rep,
+ verbose,
+ freeze_panes,
+ )
def to_feather(self, fname):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_feather(fname)
- def to_gbq(self,
- destination_table,
- project_id,
- chunksize=10000,
- verbose=True,
- reauth=False,
- if_exists='fail',
- private_key=None):
+ def to_gbq(
+ self,
+ destination_table,
+ project_id,
+ chunksize=10000,
+ verbose=True,
+ reauth=False,
+ if_exists="fail",
+ private_key=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def to_hdf(self, path_or_buf, key, **kwargs):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_hdf(path_or_buf, key, **kwargs)
- def to_html(self,
- buf=None,
- columns=None,
- col_space=None,
- header=True,
- index=True,
- na_rep='np.NaN',
- formatters=None,
- float_format=None,
- sparsify=None,
- index_names=True,
- justify=None,
- bold_rows=True,
- classes=None,
- escape=True,
- max_rows=None,
- max_cols=None,
- show_dimensions=False,
- notebook=False,
- decimal='.',
- border=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ def to_html(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="np.NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ bold_rows=True,
+ classes=None,
+ escape=True,
+ max_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ notebook=False,
+ decimal=".",
+ border=None,
+ ):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
- port_frame.to_html(buf, columns, col_space, header, index, na_rep,
- formatters, float_format, sparsify, index_names,
- justify, bold_rows, classes, escape, max_rows,
- max_cols, show_dimensions, notebook, decimal,
- border)
-
- def to_json(self,
- path_or_buf=None,
- orient=None,
- date_format=None,
- double_precision=10,
- force_ascii=True,
- date_unit='ms',
- default_handler=None,
- lines=False,
- compression=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ port_frame.to_html(
+ buf,
+ columns,
+ col_space,
+ header,
+ index,
+ na_rep,
+ formatters,
+ float_format,
+ sparsify,
+ index_names,
+ justify,
+ bold_rows,
+ classes,
+ escape,
+ max_rows,
+ max_cols,
+ show_dimensions,
+ notebook,
+ decimal,
+ border,
+ )
+
+ def to_json(
+ self,
+ path_or_buf=None,
+ orient=None,
+ date_format=None,
+ double_precision=10,
+ force_ascii=True,
+ date_unit="ms",
+ default_handler=None,
+ lines=False,
+ compression=None,
+ ):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
- port_frame.to_json(path_or_buf, orient, date_format, double_precision,
- force_ascii, date_unit, default_handler, lines,
- compression)
-
- def to_latex(self,
- buf=None,
- columns=None,
- col_space=None,
- header=True,
- index=True,
- na_rep='np.NaN',
- formatters=None,
- float_format=None,
- sparsify=None,
- index_names=True,
- bold_rows=False,
- column_format=None,
- longtable=None,
- escape=None,
- encoding=None,
- decimal='.',
- multicolumn=None,
- multicolumn_format=None,
- multirow=None):
+ port_frame.to_json(
+ path_or_buf,
+ orient,
+ date_format,
+ double_precision,
+ force_ascii,
+ date_unit,
+ default_handler,
+ lines,
+ compression,
+ )
+
+ def to_latex(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="np.NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ bold_rows=False,
+ column_format=None,
+ longtable=None,
+ escape=None,
+ encoding=None,
+ decimal=".",
+ multicolumn=None,
+ multicolumn_format=None,
+ multirow=None,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
+ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_msgpack(path_or_buf, encoding, **kwargs)
@@ -3832,12 +3924,12 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
def to_panel(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_parquet(self, fname, engine='auto', compression='snappy', **kwargs):
+ def to_parquet(self, fname, engine="auto", compression="snappy", **kwargs):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_parquet(fname, engine, compression, **kwargs)
@@ -3845,15 +3937,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy', **kwargs):
def to_period(self, freq=None, axis=0, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_pickle(self,
- path,
- compression='infer',
- protocol=pkl.HIGHEST_PROTOCOL):
+ def to_pickle(self, path, compression="infer", protocol=pkl.HIGHEST_PROTOCOL):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
port_frame.to_pickle(path, compression, protocol)
@@ -3861,77 +3950,95 @@ def to_pickle(self,
def to_records(self, index=True, convert_datetime64=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_sparse(self, fill_value=None, kind='block'):
+ def to_sparse(self, fill_value=None, kind="block"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def to_sql(self,
- name,
- con,
- flavor=None,
- schema=None,
- if_exists='fail',
- index=True,
- index_label=None,
- chunksize=None,
- dtype=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ "github.com/modin-project/modin."
+ )
+
+ def to_sql(
+ self,
+ name,
+ con,
+ flavor=None,
+ schema=None,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+ ):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
- port_frame.to_sql(name, con, flavor, schema, if_exists, index,
- index_label, chunksize, dtype)
-
- def to_stata(self,
- fname,
- convert_dates=None,
- write_index=True,
- encoding='latin-1',
- byteorder=None,
- time_stamp=None,
- data_label=None,
- variable_labels=None):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ port_frame.to_sql(
+ name, con, flavor, schema, if_exists, index, index_label, chunksize, dtype
+ )
+
+ def to_stata(
+ self,
+ fname,
+ convert_dates=None,
+ write_index=True,
+ encoding="latin-1",
+ byteorder=None,
+ time_stamp=None,
+ data_label=None,
+ variable_labels=None,
+ ):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = to_pandas(self)
- port_frame.to_stata(fname, convert_dates, write_index, encoding,
- byteorder, time_stamp, data_label, variable_labels)
-
- def to_string(self,
- buf=None,
- columns=None,
- col_space=None,
- header=True,
- index=True,
- na_rep='np.NaN',
- formatters=None,
- float_format=None,
- sparsify=None,
- index_names=True,
- justify=None,
- line_width=None,
- max_rows=None,
- max_cols=None,
- show_dimensions=False):
+ port_frame.to_stata(
+ fname,
+ convert_dates,
+ write_index,
+ encoding,
+ byteorder,
+ time_stamp,
+ data_label,
+ variable_labels,
+ )
+
+ def to_string(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="np.NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ justify=None,
+ line_width=None,
+ max_rows=None,
+ max_cols=None,
+ show_dimensions=False,
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
+ def to_timestamp(self, freq=None, how="start", axis=0, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def to_xarray(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def transform(self, func, *args, **kwargs):
kwargs["is_transform"] = True
@@ -3943,7 +4050,7 @@ def transform(self, func, *args, **kwargs):
raise ValueError("transforms cannot produce aggregated results")
return result
- def truediv(self, other, axis='columns', level=None, fill_value=None):
+ def truediv(self, other, axis="columns", level=None, fill_value=None):
"""Divides this DataFrame against another DataFrame/Series/scalar.
Args:
@@ -3956,46 +4063,49 @@ def truediv(self, other, axis='columns', level=None, fill_value=None):
A new DataFrame with the Divide applied.
"""
if level is not None:
- raise NotImplementedError("Mutlilevel index not yet supported "
- "in Pandas on Ray")
+ raise NotImplementedError(
+                "Multilevel index not yet supported " "in Pandas on Ray"
+ )
other = self._validate_other(other, axis)
new_manager = self._data_manager.truediv(
- other=other, axis=axis, level=level, fill_value=fill_value)
+ other=other, axis=axis, level=level, fill_value=fill_value
+ )
return self._create_dataframe_from_manager(new_manager)
def truncate(self, before=None, after=None, axis=None, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def tshift(self, periods=1, freq=None, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def tz_convert(self, tz, axis=0, level=None, copy=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def tz_localize(self, tz, axis=0, level=None, copy=True,
- ambiguous='raise'):
+ def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def unstack(self, level=-1, fill_value=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
-
- def update(self,
- other,
- join='left',
- overwrite=True,
- filter_func=None,
- raise_conflict=False):
+ "github.com/modin-project/modin."
+ )
+
+ def update(
+ self, other, join="left", overwrite=True, filter_func=None, raise_conflict=False
+ ):
"""Modify DataFrame in place using non-NA values from other.
Args:
@@ -4013,7 +4123,8 @@ def update(self,
raise NotImplementedError(
"raise_conflict parameter not yet supported. "
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
if not isinstance(other, DataFrame):
other = DataFrame(other)
@@ -4023,16 +4134,13 @@ def update(self,
join=join,
overwrite=overwrite,
filter_func=filter_func,
- raise_conflict=raise_conflict)
+ raise_conflict=raise_conflict,
+ )
self._update_inplace(new_manager=data_manager)
- def var(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ def var(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
"""Computes variance across the DataFrame.
Args:
@@ -4043,8 +4151,7 @@ def var(self,
Returns:
The variance of the DataFrame.
"""
- axis = pandas.DataFrame()._get_axis_number(
- axis) if axis is not None else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
return self._data_manager.var(
axis=axis,
@@ -4052,17 +4159,20 @@ def var(self,
level=level,
ddof=ddof,
numeric_only=numeric_only,
- **kwargs)
-
- def where(self,
- cond,
- other=np.nan,
- inplace=False,
- axis=None,
- level=None,
- errors='raise',
- try_cast=False,
- raise_on_error=None):
+ **kwargs
+ )
+
+ def where(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ errors="raise",
+ try_cast=False,
+ raise_on_error=None,
+ ):
"""Replaces values not meeting condition with values in other.
Args:
@@ -4081,26 +4191,25 @@ def where(self,
A new DataFrame with the replaced values.
"""
- inplace = validate_bool_kwarg(inplace, 'inplace')
+ inplace = validate_bool_kwarg(inplace, "inplace")
if isinstance(other, pandas.Series) and axis is None:
raise ValueError("Must specify axis=0 or 1")
if level is not None:
- raise NotImplementedError("Multilevel Index not yet supported on "
- "Pandas on Ray.")
+ raise NotImplementedError(
+ "Multilevel Index not yet supported on " "Pandas on Ray."
+ )
- axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None \
- else 0
+ axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0
cond = cond(self) if callable(cond) else cond
if not isinstance(cond, DataFrame):
- if not hasattr(cond, 'shape'):
+ if not hasattr(cond, "shape"):
cond = np.asanyarray(cond)
if cond.shape != self.shape:
- raise ValueError("Array conditional must be same shape as "
- "self")
+ raise ValueError("Array conditional must be same shape as " "self")
cond = DataFrame(cond, index=self.index, columns=self.columns)
if isinstance(other, DataFrame):
@@ -4113,7 +4222,8 @@ def where(self,
other = pandas.Series(other, index=index)
data_manager = self._data_manager.where(
- cond._data_manager, other, axis=axis, level=level)
+ cond._data_manager, other, axis=axis, level=level
+ )
if inplace:
self._update_inplace(new_manager=data_manager)
else:
@@ -4122,7 +4232,8 @@ def where(self,
def xs(self, key, axis=0, level=None, drop_level=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __getitem__(self, key):
"""Get the column specified by key for this DataFrame.
@@ -4145,20 +4256,23 @@ def __getitem__(self, key):
# see if we can slice the rows
# This lets us reuse code in Pandas to error check
- indexer = convert_to_index_sliceable(
- pandas.DataFrame(index=self.index), key)
+ indexer = convert_to_index_sliceable(pandas.DataFrame(index=self.index), key)
if indexer is not None:
return self._getitem_slice(indexer)
if isinstance(key, (pandas.Series, np.ndarray, pandas.Index, list)):
return self._getitem_array(key)
elif isinstance(key, DataFrame):
- raise NotImplementedError("To contribute to Pandas on Ray, please"
- "visit github.com/modin-project/modin.")
+ raise NotImplementedError(
+                "To contribute to Pandas on Ray, please "
+ "visit github.com/modin-project/modin."
+ )
# return self._getitem_frame(key)
elif is_mi_columns:
- raise NotImplementedError("To contribute to Pandas on Ray, please"
- "visit github.com/modin-project/modin.")
+ raise NotImplementedError(
+                "To contribute to Pandas on Ray, please "
+ "visit github.com/modin-project/modin."
+ )
# return self._getitem_multilevel(key)
else:
return self._getitem_column(key)
@@ -4168,33 +4282,32 @@ def _getitem_column(self, key):
def _getitem_array(self, key):
if com.is_bool_indexer(key):
- if isinstance(key, pandas.Series) and \
- not key.index.equals(self.index):
+ if isinstance(key, pandas.Series) and not key.index.equals(self.index):
warnings.warn(
- "Boolean Series key will be reindexed to match "
- "DataFrame index.",
+ "Boolean Series key will be reindexed to match " "DataFrame index.",
UserWarning,
- stacklevel=3)
+ stacklevel=3,
+ )
elif len(key) != len(self.index):
- raise ValueError('Item wrong length {} instead of {}.'.format(
- len(key), len(self.index)))
+ raise ValueError(
+ "Item wrong length {} instead of {}.".format(
+ len(key), len(self.index)
+ )
+ )
key = check_bool_indexer(self.index, key)
# We convert here because the data_manager assumes it is a list of
# indices. This greatly decreases the complexity of the code.
key = self.index[key]
- return DataFrame(
- data_manager=self._data_manager.getitem_row_array(key))
+ return DataFrame(data_manager=self._data_manager.getitem_row_array(key))
else:
- return DataFrame(
- data_manager=self._data_manager.getitem_column_array(key))
+ return DataFrame(data_manager=self._data_manager.getitem_column_array(key))
def _getitem_slice(self, key):
# We convert here because the data_manager assumes it is a list of
# indices. This greatly decreases the complexity of the code.
key = self.index[key]
- return DataFrame(
- data_manager=self._data_manager.getitem_row_array(key))
+ return DataFrame(data_manager=self._data_manager.getitem_row_array(key))
def __getattr__(self, key):
"""After regular attribute access, looks up the name in the columns
@@ -4216,7 +4329,8 @@ def __setitem__(self, key, value):
if not isinstance(key, str):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
if key not in self.columns:
self.insert(loc=len(self.columns), column=key, value=value)
else:
@@ -4235,17 +4349,20 @@ def __len__(self):
def __unicode__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __invert__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __hash__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __iter__(self):
"""Iterate over the columns
@@ -4269,12 +4386,14 @@ def __contains__(self, key):
def __nonzero__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __bool__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __abs__(self):
"""Creates a modified DataFrame by taking the absolute value.
@@ -4287,7 +4406,8 @@ def __abs__(self):
def __round__(self, decimals=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __array__(self, dtype=None):
# TODO: This is very inefficient and needs fix, also see as_matrix
@@ -4300,12 +4420,14 @@ def __array_wrap__(self, result, context=None):
def __getstate__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __setstate__(self, state):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __delitem__(self, key):
"""Delete a column by key. `del a[key]` for example.
@@ -4324,7 +4446,8 @@ def __delitem__(self, key):
def __finalize__(self, other, method=None, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __copy__(self, deep=True):
"""Make a copy using modin.DataFrame.copy method
@@ -4353,17 +4476,20 @@ def __deepcopy__(self, memo=None):
def __and__(self, other):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __or__(self, other):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __xor__(self, other):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __lt__(self, other):
return self.lt(other)
@@ -4425,8 +4551,7 @@ def __floordiv__(self, other):
def __ifloordiv__(self, other):
return self.floordiv(other)
- def __rfloordiv__(self, other, axis="columns", level=None,
- fill_value=None):
+ def __rfloordiv__(self, other, axis="columns", level=None, fill_value=None):
return self.rfloordiv(other, axis, level, fill_value)
def __truediv__(self, other):
@@ -4460,40 +4585,49 @@ def __neg__(self):
A modified DataFrame where every element is the negation of before
"""
for t in self.dtypes:
- if not (is_bool_dtype(t) or is_numeric_dtype(t)
- or is_datetime_or_timedelta_dtype(t)):
+ if not (
+ is_bool_dtype(t)
+ or is_numeric_dtype(t)
+ or is_datetime_or_timedelta_dtype(t)
+ ):
raise TypeError(
- "Unary negative expects numeric dtype, not {}".format(t))
+ "Unary negative expects numeric dtype, not {}".format(t)
+ )
return DataFrame(data_manager=self._data_manager.negative())
def __sizeof__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def __doc__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def blocks(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def style(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def iat(self, axis=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def loc(self):
@@ -4503,23 +4637,27 @@ def loc(self):
We do not support: boolean array, callable
"""
from .indexing import _LocIndexer
+
return _LocIndexer(self)
@property
def is_copy(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def at(self, axis=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def ix(self, axis=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def iloc(self):
@@ -4529,6 +4667,7 @@ def iloc(self):
We do not support: boolean array, callable
"""
from .indexing import _iLocIndexer
+
return _iLocIndexer(self)
def _create_dataframe_from_manager(self, new_manager, inplace=False):
@@ -4549,12 +4688,14 @@ def _validate_other(self, other, axis):
if len(other) != len(self.index):
raise ValueError(
"Unable to coerce to Series, length must be {0}: "
- "given {1}".format(len(self.index), len(other)))
+ "given {1}".format(len(self.index), len(other))
+ )
else:
if len(other) != len(self.columns):
raise ValueError(
"Unable to coerce to Series, length must be {0}: "
- "given {1}".format(len(self.columns), len(other)))
+ "given {1}".format(len(self.columns), len(other))
+ )
return other
@@ -4570,6 +4711,8 @@ def _merge_columns(left_columns, right_columns, *args):
Returns:
The columns for the merge operation.
"""
- return pandas.DataFrame(columns=left_columns, index=[0], dtype='uint8') \
- .merge(pandas.DataFrame(columns=right_columns, index=[0],
- dtype='uint8'), *args).columns
+ return (
+ pandas.DataFrame(columns=left_columns, index=[0], dtype="uint8")
+ .merge(pandas.DataFrame(columns=right_columns, index=[0], dtype="uint8"), *args)
+ .columns
+ )
diff --git a/modin/pandas/datetimes.py b/modin/pandas/datetimes.py
index 0ca2c0c0f5d..5e6ae78354d 100644
--- a/modin/pandas/datetimes.py
+++ b/modin/pandas/datetimes.py
@@ -7,17 +7,19 @@
from .dataframe import DataFrame
-def to_datetime(arg,
- errors='raise',
- dayfirst=False,
- yearfirst=False,
- utc=None,
- box=True,
- format=None,
- exact=True,
- unit=None,
- infer_datetime_format=False,
- origin='unix'):
+def to_datetime(
+ arg,
+ errors="raise",
+ dayfirst=False,
+ yearfirst=False,
+ utc=None,
+ box=True,
+ format=None,
+ exact=True,
+ unit=None,
+ infer_datetime_format=False,
+ origin="unix",
+):
"""Convert the arg to datetime format. If not Ray DataFrame, this falls
back on pandas.
@@ -53,7 +55,8 @@ def to_datetime(arg,
exact=exact,
unit=unit,
infer_datetime_format=infer_datetime_format,
- origin=origin)
+ origin=origin,
+ )
# Pandas seems to ignore this kwarg so we will too
pandas.to_datetime(
@@ -67,6 +70,7 @@ def to_datetime(arg,
exact=exact,
unit=unit,
infer_datetime_format=infer_datetime_format,
- origin=origin)
+ origin=origin,
+ )
return arg._data_manager.to_datetime()
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
index e48f631b9e5..011a269121a 100644
--- a/modin/pandas/groupby.py
+++ b/modin/pandas/groupby.py
@@ -14,11 +14,13 @@
pandas.core.groupby.DataFrameGroupBy,
excluded=[
pandas.core.groupby.DataFrameGroupBy,
- pandas.core.groupby.DataFrameGroupBy.__init__
- ])
+ pandas.core.groupby.DataFrameGroupBy.__init__,
+ ],
+)
class DataFrameGroupBy(object):
- def __init__(self, df, by, axis, level, as_index, sort, group_keys,
- squeeze, **kwargs):
+ def __init__(
+ self, df, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs
+ ):
self._axis = axis
self._data_manager = df._data_manager
@@ -30,7 +32,7 @@ def __init__(self, df, by, axis, level, as_index, sort, group_keys,
"sort": sort,
"as_index": as_index,
"group_keys": group_keys,
- "squeeze": squeeze
+ "squeeze": squeeze,
}
def __getattr__(self, key):
@@ -49,7 +51,8 @@ def __getattr__(self, key):
raise NotImplementedError(
"SeriesGroupBy is not implemented."
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
raise e
_index_grouped_cache = None
@@ -79,17 +82,29 @@ def _iter(self):
from .dataframe import DataFrame
if self._axis == 0:
- return ((k,
- DataFrame(
- data_manager=self._data_manager.getitem_row_array(
- self._index_grouped[k])))
- for k, _ in self._keys_and_values)
+ return (
+ (
+ k,
+ DataFrame(
+ data_manager=self._data_manager.getitem_row_array(
+ self._index_grouped[k]
+ )
+ ),
+ )
+ for k, _ in self._keys_and_values
+ )
else:
- return ((k,
- DataFrame(
- data_manager=self._data_manager.getitem_column_array(
- self._index_grouped[k])))
- for k, _ in self._keys_and_values)
+ return (
+ (
+ k,
+ DataFrame(
+ data_manager=self._data_manager.getitem_column_array(
+ self._index_grouped[k]
+ )
+ ),
+ )
+ for k, _ in self._keys_and_values
+ )
@property
def ngroups(self):
@@ -101,12 +116,14 @@ def skew(self, **kwargs):
def ffill(self, limit=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def sem(self, ddof=1):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def mean(self, *args, **kwargs):
return self._apply_agg_function(lambda df: df.mean(*args, **kwargs))
@@ -118,23 +135,27 @@ def any(self):
def plot(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def ohlc(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __bytes__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def tshift(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def groups(self):
@@ -146,7 +167,8 @@ def min(self, **kwargs):
def idxmax(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
@property
def ndim(self):
@@ -155,16 +177,17 @@ def ndim(self):
def shift(self, periods=1, freq=None, axis=0):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def nth(self, n, dropna=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def cumsum(self, axis=0, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.cumsum(axis, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.cumsum(axis, *args, **kwargs))
@property
def indices(self):
@@ -173,19 +196,20 @@ def indices(self):
def pct_change(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def filter(self, func, dropna=True, *args, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def cummax(self, axis=0, **kwargs):
return self._apply_agg_function(lambda df: df.cummax(axis, **kwargs))
def apply(self, func, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.apply(func, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.apply(func, *args, **kwargs))
@property
def dtypes(self):
@@ -196,7 +220,8 @@ def dtypes(self):
def first(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def backfill(self, limit=None):
return self.bfill(limit)
@@ -205,28 +230,29 @@ def __getitem__(self, key):
# This operation requires a SeriesGroupBy Object
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def cummin(self, axis=0, **kwargs):
- return self._apply_agg_function(
- lambda df: df.cummin(axis=axis, **kwargs))
+ return self._apply_agg_function(lambda df: df.cummin(axis=axis, **kwargs))
def bfill(self, limit=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def idxmin(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def prod(self, **kwargs):
return self._apply_agg_function(lambda df: df.prod(**kwargs))
def std(self, ddof=1, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.std(ddof, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.std(ddof, *args, **kwargs))
def aggregate(self, arg, *args, **kwargs):
if self._axis != 0:
@@ -238,19 +264,21 @@ def aggregate(self, arg, *args, **kwargs):
raise NotImplementedError(
"This requires Multi-level index to be implemented. "
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
- return self._apply_agg_function(
- lambda df: df.aggregate(arg, *args, **kwargs))
+ "github.com/modin-project/modin."
+ )
+ return self._apply_agg_function(lambda df: df.aggregate(arg, *args, **kwargs))
def last(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def mad(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def rank(self):
return self._apply_agg_function(lambda df: df.rank())
@@ -259,24 +287,26 @@ def rank(self):
def corrwith(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def pad(self, limit=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def max(self, **kwargs):
return self._apply_agg_function(lambda df: df.max(**kwargs))
def var(self, ddof=1, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.var(ddof, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.var(ddof, *args, **kwargs))
def get_group(self, name, obj=None):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def __len__(self):
return len(self._index_grouped)
@@ -293,32 +323,40 @@ def sum(self, **kwargs):
def __unicode__(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def describe(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
- def boxplot(self,
- grouped,
- subplots=True,
- column=None,
- fontsize=None,
- rot=0,
- grid=True,
- ax=None,
- figsize=None,
- layout=None,
- **kwds):
+ def boxplot(
+ self,
+ grouped,
+ subplots=True,
+ column=None,
+ fontsize=None,
+ rot=0,
+ grid=True,
+ ax=None,
+ figsize=None,
+ layout=None,
+ **kwds
+ ):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def ngroup(self, ascending=True):
index = self._index if not self._axis else self._columns
- return pandas.Series(index=index).groupby(
- by=self._by, **self._kwargs).ngroup(ascending)
+ return (
+ pandas.Series(index=index)
+ .groupby(by=self._by, **self._kwargs)
+ .ngroup(ascending)
+ )
def nunique(self, dropna=True):
return self._apply_agg_function(lambda df: df.nunique(dropna))
@@ -326,7 +364,8 @@ def nunique(self, dropna=True):
def resample(self, rule, *args, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def median(self, **kwargs):
return self._apply_agg_function(lambda df: df.median(**kwargs))
@@ -334,11 +373,11 @@ def median(self, **kwargs):
def head(self, n=5):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def cumprod(self, axis=0, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.cumprod(axis, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.cumprod(axis, *args, **kwargs))
def __iter__(self):
return self._iter.__iter__()
@@ -349,16 +388,17 @@ def agg(self, arg, *args, **kwargs):
def cov(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def transform(self, func, *args, **kwargs):
- return self._apply_agg_function(
- lambda df: df.transform(func, *args, **kwargs))
+ return self._apply_agg_function(lambda df: df.transform(func, *args, **kwargs))
def corr(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def fillna(self, **kwargs):
return self._apply_agg_function(lambda df: df.fillna(**kwargs))
@@ -372,48 +412,56 @@ def pipe(self, func, *args, **kwargs):
def cumcount(self, ascending=True):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def tail(self, n=5):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
# expanding and rolling are unique cases and need to likely be handled
# separately. They do not appear to be commonly used.
def expanding(self, *args, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def rolling(self, *args, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def hist(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def quantile(self, q=0.5, **kwargs):
if is_list_like(q):
raise NotImplementedError(
"This requires Multi-level index to be implemented. "
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
return self._apply_agg_function(lambda df: df.quantile(q, **kwargs))
def diff(self):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def take(self, **kwargs):
raise NotImplementedError(
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
def _apply_agg_function(self, f, **kwargs):
"""Perform aggregation and combine stages based on a given function.
@@ -424,8 +472,10 @@ def _apply_agg_function(self, f, **kwargs):
Returns:
A new combined DataFrame with the result of all groups.
"""
- assert callable(f), "\'{0}\' object is not callable".format(type(f))
+ assert callable(f), "'{0}' object is not callable".format(type(f))
from .dataframe import DataFrame
- new_manager = self._data_manager.groupby_agg(self._by, self._axis, f,
- self._kwargs, kwargs)
+
+ new_manager = self._data_manager.groupby_agg(
+ self._by, self._axis, f, self._kwargs, kwargs
+ )
return DataFrame(data_manager=new_manager)
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index cdac309b7cc..f76bae3a5c8 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -4,13 +4,14 @@
import numpy as np
import pandas
-from pandas.api.types import (is_scalar, is_list_like, is_bool)
+from pandas.api.types import is_scalar, is_list_like, is_bool
from pandas.core.dtypes.common import is_integer
from pandas.core.indexing import IndexingError
from typing import Tuple
from warnings import warn
from .dataframe import DataFrame
+
"""Indexing Helper Class works as follows:
_LocationIndexerBase provide methods framework for __getitem__
@@ -86,7 +87,7 @@ def _parse_tuple(tup):
if len(tup) == 2:
col_loc = tup[1]
if len(tup) > 2:
- raise IndexingError('Too many indexers')
+ raise IndexingError("Too many indexers")
else:
row_loc = tup
@@ -105,8 +106,12 @@ def _is_enlargement(locator, global_index):
Enlargement happens when you trying to locate using labels isn't in the
original index. In other words, enlargement == adding NaNs !
"""
- if is_list_like(locator) and not is_slice(
- locator) and len(locator) > 0 and not is_boolean_array(locator):
+ if (
+ is_list_like(locator)
+ and not is_slice(locator)
+ and len(locator) > 0
+ and not is_boolean_array(locator)
+ ):
n_diff_elems = len(pandas.Index(locator).difference(global_index))
is_enlargement_boolean = n_diff_elems > 0
return is_enlargement_boolean
@@ -144,11 +149,11 @@ def __init__(self, ray_df: DataFrame):
self.row_scaler = False
self.col_scaler = False
- def __getitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index,
- ndim: int):
+ def __getitem__(
+ self, row_lookup: pandas.Index, col_lookup: pandas.Index, ndim: int
+ ):
if self.is_view:
- dm_view = self.dm.__constructor__(self.dm.data, row_lookup,
- col_lookup)
+ dm_view = self.dm.__constructor__(self.dm.data, row_lookup, col_lookup)
else:
dm_view = self.dm.view(row_lookup, col_lookup)
@@ -160,8 +165,7 @@ def __getitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index,
single_axis = 1 if self.col_scaler else 0
return dm_view.squeeze(ndim=1, axis=single_axis)
- def __setitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index,
- item):
+ def __setitem__(self, row_lookup: pandas.Index, col_lookup: pandas.Index, item):
"""
Args:
row_lookup: the global row index to write item to
@@ -187,15 +191,18 @@ def _broadcast_item(self, item, to_shape):
return np.broadcast_to(item, to_shape)
except ValueError:
from_shape = np.array(item).shape
- raise ValueError("could not broadcast input array from \
+ raise ValueError(
+ "could not broadcast input array from \
shape {from_shape} into shape {to_shape}".format(
- from_shape=from_shape, to_shape=to_shape))
+ from_shape=from_shape, to_shape=to_shape
+ )
+ )
def _write_items(self, row_lookup, col_lookup, item):
"""Perform remote write and replace blocks.
"""
- row_numeric_idx = self.dm.global_idx_to_numeric_idx('row', row_lookup)
- col_numeric_idx = self.dm.global_idx_to_numeric_idx('col', col_lookup)
+ row_numeric_idx = self.dm.global_idx_to_numeric_idx("row", row_lookup)
+ col_numeric_idx = self.dm.global_idx_to_numeric_idx("col", col_lookup)
self.dm.write_items(row_numeric_idx, col_numeric_idx, item)
@@ -203,13 +210,11 @@ class _LocIndexer(_LocationIndexerBase):
"""A indexer for ray_df.loc[] functionality"""
def __getitem__(self, key):
- row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(
- key)
+ row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key)
self._handle_enlargement(row_loc, col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
ndim = self._expand_dim(row_lookup, col_lookup, ndim)
- result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup,
- ndim)
+ result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
return result
def __setitem__(self, key, item):
@@ -224,13 +229,13 @@ def _handle_enlargement(self, row_loc, col_loc):
None
"""
if _is_enlargement(row_loc, self.dm.index) or _is_enlargement(
- col_loc, self.dm.columns):
+ col_loc, self.dm.columns
+ ):
_warn_enlargement()
self.dm.enlarge_partitions(
- new_row_labels=self._compute_enlarge_labels(
- row_loc, self.dm.index),
- new_col_labels=self._compute_enlarge_labels(
- col_loc, self.dm.columns))
+ new_row_labels=self._compute_enlarge_labels(row_loc, self.dm.index),
+ new_col_labels=self._compute_enlarge_labels(col_loc, self.dm.columns),
+ )
def _compute_enlarge_labels(self, locator, base_index):
"""Helper for _enlarge_axis, compute common labels and extra labels.
@@ -249,8 +254,10 @@ def _compute_enlarge_labels(self, locator, base_index):
if len(common_labels) == 0:
raise KeyError(
- 'None of [{labels}] are in the [{base_index_name}]'.format(
- labels=list(locator_as_index), base_index_name=base_index))
+ "None of [{labels}] are in the [{base_index_name}]".format(
+ labels=list(locator_as_index), base_index_name=base_index
+ )
+ )
return nan_labels
@@ -268,8 +275,7 @@ def _expand_dim(self, row_lookup, col_lookup, ndim):
return ndim
- def _compute_lookup(self, row_loc,
- col_loc) -> Tuple[pandas.Index, pandas.Index]:
+ def _compute_lookup(self, row_loc, col_loc) -> Tuple[pandas.Index, pandas.Index]:
row_lookup = self.dm.index.to_series().loc[row_loc].index
col_lookup = self.dm.columns.to_series().loc[col_loc].index
return row_lookup, col_lookup
@@ -279,15 +285,13 @@ class _iLocIndexer(_LocationIndexerBase):
"""A indexer for ray_df.iloc[] functionality"""
def __getitem__(self, key):
- row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(
- key)
+ row_loc, col_loc, ndim, self.row_scaler, self.col_scaler = _parse_tuple(key)
self._check_dtypes(row_loc)
self._check_dtypes(col_loc)
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
- result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup,
- ndim)
+ result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
return result
def __setitem__(self, key, item):
@@ -299,8 +303,7 @@ def __setitem__(self, key, item):
row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
super(_iLocIndexer, self).__setitem__(row_lookup, col_lookup, item)
- def _compute_lookup(self, row_loc,
- col_loc) -> Tuple[pandas.Index, pandas.Index]:
+ def _compute_lookup(self, row_loc, col_loc) -> Tuple[pandas.Index, pandas.Index]:
row_lookup = self.dm.index.to_series().iloc[row_loc].index
col_lookup = self.dm.columns.to_series().iloc[col_loc].index
return row_lookup, col_lookup
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index 2551c0bae59..caabc740cfe 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -18,14 +18,15 @@
from ..data_management.partitioning.partition_collections import RayBlockPartitions
from ..data_management.partitioning.remote_partition import RayRemotePartition
from ..data_management.partitioning.axis_partition import (
- split_result_of_axis_func_pandas)
+ split_result_of_axis_func_pandas
+)
from ..data_management.data_manager import PandasDataManager
-PQ_INDEX_REGEX = re.compile('__index_level_\d+__')
+PQ_INDEX_REGEX = re.compile("__index_level_\d+__")
# Parquet
-def read_parquet(path, engine='auto', columns=None, **kwargs):
+def read_parquet(path, engine="auto", columns=None, **kwargs):
"""Load a parquet object from the file path, returning a DataFrame.
Ray DataFrame only supports pyarrow engine for now.
@@ -49,25 +50,28 @@ def _read_parquet_pandas_on_ray(path, engine, columns, **kwargs):
if not columns:
pf = ParquetFile(path)
columns = [
- name for name in pf.metadata.schema.names
- if not PQ_INDEX_REGEX.match(name)
+ name for name in pf.metadata.schema.names if not PQ_INDEX_REGEX.match(name)
]
- num_splits = min(
- len(columns), RayBlockPartitions._compute_num_partitions())
+ num_splits = min(len(columns), RayBlockPartitions._compute_num_partitions())
# Each item in this list will be a column of original df
# partitioned to smaller pieces along rows.
# We need to transpose the oids array to fit our schema.
- blk_partitions = np.array([
- _read_parquet_column._submit(
- args=(path, col, num_splits, kwargs),
- num_return_vals=num_splits + 1) for col in columns
- ]).T
- remote_partitions = np.array([[RayRemotePartition(obj) for obj in row]
- for row in blk_partitions[:-1]])
+ blk_partitions = np.array(
+ [
+ _read_parquet_column._submit(
+ args=(path, col, num_splits, kwargs), num_return_vals=num_splits + 1
+ )
+ for col in columns
+ ]
+ ).T
+ remote_partitions = np.array(
+ [[RayRemotePartition(obj) for obj in row] for row in blk_partitions[:-1]]
+ )
index_len = ray.get(blk_partitions[-1][0])
index = pandas.RangeIndex(index_len)
new_manager = PandasDataManager(
- RayBlockPartitions(remote_partitions), index, columns)
+ RayBlockPartitions(remote_partitions), index, columns
+ )
df = DataFrame(data_manager=new_manager)
return df
@@ -134,14 +138,15 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}):
DataFrame or Series constructed from CSV file.
"""
empty_pd_df = pandas.read_csv(
- filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0))
+ filepath, **dict(kwargs, nrows=0, skipfooter=0, skip_footer=0)
+ )
column_names = empty_pd_df.columns
- skipfooter = kwargs.get("skipfooter", None) or kwargs.get(
- "skip_footer", None)
+ skipfooter = kwargs.get("skipfooter", None) or kwargs.get("skip_footer", None)
partition_kwargs = dict(
- kwargs, header=None, names=column_names, skipfooter=0, skip_footer=0)
+ kwargs, header=None, names=column_names, skipfooter=0, skip_footer=0
+ )
with open(filepath, "rb") as f:
# Get the BOM if necessary
prefix = b""
@@ -173,11 +178,17 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}):
f.readline() # Read a whole number of lines
partition_id = _read_csv_with_offset_pandas_on_ray._submit(
- args=(filepath, num_splits, start, f.tell(),
- partition_kwargs_id, prefix_id),
- num_return_vals=num_splits + 1)
- partition_ids.append(
- [RayRemotePartition(obj) for obj in partition_id[:-1]])
+ args=(
+ filepath,
+ num_splits,
+ start,
+ f.tell(),
+ partition_kwargs_id,
+ prefix_id,
+ ),
+ num_return_vals=num_splits + 1,
+ )
+ partition_ids.append([RayRemotePartition(obj) for obj in partition_id[:-1]])
index_ids.append(partition_id[-1])
index_col = kwargs.get("index_col", None)
@@ -188,7 +199,8 @@ def _read_csv_from_file_pandas_on_ray(filepath, kwargs={}):
new_index = ray.get(new_index_ids)
new_manager = PandasDataManager(
- RayBlockPartitions(np.array(partition_ids)), new_index, column_names)
+ RayBlockPartitions(np.array(partition_ids)), new_index, column_names
+ )
df = DataFrame(data_manager=new_manager)
if skipfooter:
@@ -208,65 +220,66 @@ def _read_csv_from_pandas(filepath_or_buffer, kwargs):
# Overwriting the read method should return a ray DataFrame for calls
# to __next__ and get_chunk
pd_read = pd_obj.read
- pd_obj.read = lambda *args, **kwargs: \
- from_pandas(pd_read(*args, **kwargs))
+ pd_obj.read = lambda *args, **kwargs: from_pandas(pd_read(*args, **kwargs))
return pd_obj
-def read_csv(filepath_or_buffer,
- sep=',',
- delimiter=None,
- header='infer',
- names=None,
- index_col=None,
- usecols=None,
- squeeze=False,
- prefix=None,
- mangle_dupe_cols=True,
- dtype=None,
- engine=None,
- converters=None,
- true_values=None,
- false_values=None,
- skipinitialspace=False,
- skiprows=None,
- nrows=None,
- na_values=None,
- keep_default_na=True,
- na_filter=True,
- verbose=False,
- skip_blank_lines=True,
- parse_dates=False,
- infer_datetime_format=False,
- keep_date_col=False,
- date_parser=None,
- dayfirst=False,
- iterator=False,
- chunksize=None,
- compression='infer',
- thousands=None,
- decimal=b'.',
- lineterminator=None,
- quotechar='"',
- quoting=0,
- escapechar=None,
- comment=None,
- encoding=None,
- dialect=None,
- tupleize_cols=None,
- error_bad_lines=True,
- warn_bad_lines=True,
- skipfooter=0,
- skip_footer=0,
- doublequote=True,
- delim_whitespace=False,
- as_recarray=None,
- compact_ints=None,
- use_unsigned=None,
- low_memory=True,
- buffer_lines=None,
- memory_map=False,
- float_precision=None):
+def read_csv(
+ filepath_or_buffer,
+ sep=",",
+ delimiter=None,
+ header="infer",
+ names=None,
+ index_col=None,
+ usecols=None,
+ squeeze=False,
+ prefix=None,
+ mangle_dupe_cols=True,
+ dtype=None,
+ engine=None,
+ converters=None,
+ true_values=None,
+ false_values=None,
+ skipinitialspace=False,
+ skiprows=None,
+ nrows=None,
+ na_values=None,
+ keep_default_na=True,
+ na_filter=True,
+ verbose=False,
+ skip_blank_lines=True,
+ parse_dates=False,
+ infer_datetime_format=False,
+ keep_date_col=False,
+ date_parser=None,
+ dayfirst=False,
+ iterator=False,
+ chunksize=None,
+ compression="infer",
+ thousands=None,
+ decimal=b".",
+ lineterminator=None,
+ quotechar='"',
+ quoting=0,
+ escapechar=None,
+ comment=None,
+ encoding=None,
+ dialect=None,
+ tupleize_cols=None,
+ error_bad_lines=True,
+ warn_bad_lines=True,
+ skipfooter=0,
+ skip_footer=0,
+ doublequote=True,
+ delim_whitespace=False,
+ as_recarray=None,
+ compact_ints=None,
+ use_unsigned=None,
+ low_memory=True,
+ buffer_lines=None,
+ memory_map=False,
+ float_precision=None,
+):
"""Read csv file from local disk.
Args:
filepath:
@@ -287,7 +300,8 @@ def read_csv(filepath_or_buffer,
defaults = dict(zip(args[1:], defaults))
kwargs = {
kw: kwargs[kw]
- for kw in kwargs if kw in defaults and kwargs[kw] != defaults[kw]
+ for kw in kwargs
+ if kw in defaults and kwargs[kw] != defaults[kw]
}
# This happens on Python2, we will just default to serializing the entire dictionary
except AttributeError:
@@ -297,9 +311,10 @@ def read_csv(filepath_or_buffer,
if isinstance(filepath_or_buffer, str):
if not os.path.exists(filepath_or_buffer):
- warnings.warn(("File not found on disk. "
- "Defaulting to Pandas implementation."),
- PendingDeprecationWarning)
+ warnings.warn(
+ ("File not found on disk. " "Defaulting to Pandas implementation."),
+ PendingDeprecationWarning,
+ )
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
elif not isinstance(filepath_or_buffer, py.path.local):
read_from_pandas = True
@@ -307,109 +322,141 @@ def read_csv(filepath_or_buffer,
# Pandas read_csv supports pathlib.Path
try:
import pathlib
+
if isinstance(filepath_or_buffer, pathlib.Path):
read_from_pandas = False
except ImportError:
pass
if read_from_pandas:
- warnings.warn(("Reading from buffer. "
- "Defaulting to Pandas implementation."),
- PendingDeprecationWarning)
+ warnings.warn(
+ ("Reading from buffer. " "Defaulting to Pandas implementation."),
+ PendingDeprecationWarning,
+ )
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
if _infer_compression(filepath_or_buffer, compression) is not None:
- warnings.warn(("Compression detected. "
- "Defaulting to Pandas implementation."),
- PendingDeprecationWarning)
+ warnings.warn(
+ ("Compression detected. " "Defaulting to Pandas implementation."),
+ PendingDeprecationWarning,
+ )
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
if as_recarray:
- warnings.warn("Defaulting to Pandas implementation.",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation.", PendingDeprecationWarning)
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
if chunksize is not None:
- warnings.warn(("Reading chunks from a file. "
- "Defaulting to Pandas implementation."),
- PendingDeprecationWarning)
+ warnings.warn(
+ ("Reading chunks from a file. " "Defaulting to Pandas implementation."),
+ PendingDeprecationWarning,
+ )
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
if skiprows is not None and not isinstance(skiprows, int):
- warnings.warn(("Defaulting to Pandas implementation. To speed up "
- "read_csv through the Pandas on Ray implementation, "
- "comment the rows to skip instead."))
+ warnings.warn(
+ (
+ "Defaulting to Pandas implementation. To speed up "
+ "read_csv through the Pandas on Ray implementation, "
+ "comment the rows to skip instead."
+ )
+ )
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
# TODO: replace this by reading lines from file.
if nrows is not None:
- warnings.warn("Defaulting to Pandas implementation.",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation.", PendingDeprecationWarning)
return _read_csv_from_pandas(filepath_or_buffer, kwargs)
return _read_csv_from_file_pandas_on_ray(filepath_or_buffer, kwargs)
-def read_json(path_or_buf=None,
- orient=None,
- typ='frame',
- dtype=True,
- convert_axes=True,
- convert_dates=True,
- keep_default_dates=True,
- numpy=False,
- precise_float=False,
- date_unit=None,
- encoding=None,
- lines=False,
- chunksize=None,
- compression='infer'):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+def read_json(
+ path_or_buf=None,
+ orient=None,
+ typ="frame",
+ dtype=True,
+ convert_axes=True,
+ convert_dates=True,
+ keep_default_dates=True,
+ numpy=False,
+ precise_float=False,
+ date_unit=None,
+ encoding=None,
+ lines=False,
+ chunksize=None,
+ compression="infer",
+):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_json(
- path_or_buf, orient, typ, dtype, convert_axes, convert_dates,
- keep_default_dates, numpy, precise_float, date_unit, encoding, lines,
- chunksize, compression)
+ path_or_buf,
+ orient,
+ typ,
+ dtype,
+ convert_axes,
+ convert_dates,
+ keep_default_dates,
+ numpy,
+ precise_float,
+ date_unit,
+ encoding,
+ lines,
+ chunksize,
+ compression,
+ )
ray_frame = from_pandas(port_frame)
return ray_frame
-def read_html(io,
- match='.+',
- flavor=None,
- header=None,
- index_col=None,
- skiprows=None,
- attrs=None,
- parse_dates=False,
- tupleize_cols=None,
- thousands=',',
- encoding=None,
- decimal='.',
- converters=None,
- na_values=None,
- keep_default_na=True):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_html(io, match, flavor, header, index_col,
- skiprows, attrs, parse_dates, tupleize_cols,
- thousands, encoding, decimal, converters,
- na_values, keep_default_na)
+def read_html(
+ io,
+ match=".+",
+ flavor=None,
+ header=None,
+ index_col=None,
+ skiprows=None,
+ attrs=None,
+ parse_dates=False,
+ tupleize_cols=None,
+ thousands=",",
+ encoding=None,
+ decimal=".",
+ converters=None,
+ na_values=None,
+ keep_default_na=True,
+):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
+
+ port_frame = pandas.read_html(
+ io,
+ match,
+ flavor,
+ header,
+ index_col,
+ skiprows,
+ attrs,
+ parse_dates,
+ tupleize_cols,
+ thousands,
+ encoding,
+ decimal,
+ converters,
+ na_values,
+ keep_default_na,
+ )
ray_frame = from_pandas(port_frame[0])
return ray_frame
-def read_clipboard(sep=r'\s+'):
+def read_clipboard(sep=r"\s+"):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_clipboard(sep)
ray_frame = from_pandas(port_frame)
@@ -417,42 +464,59 @@ def read_clipboard(sep=r'\s+'):
return ray_frame
-def read_excel(io,
- sheet_name=0,
- header=0,
- skiprows=None,
- skip_footer=0,
- index_col=None,
- names=None,
- usecols=None,
- parse_dates=False,
- date_parser=None,
- na_values=None,
- thousands=None,
- convert_float=True,
- converters=None,
- dtype=None,
- true_values=None,
- false_values=None,
- engine=None,
- squeeze=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+def read_excel(
+ io,
+ sheet_name=0,
+ header=0,
+ skiprows=None,
+ skip_footer=0,
+ index_col=None,
+ names=None,
+ usecols=None,
+ parse_dates=False,
+ date_parser=None,
+ na_values=None,
+ thousands=None,
+ convert_float=True,
+ converters=None,
+ dtype=None,
+ true_values=None,
+ false_values=None,
+ engine=None,
+ squeeze=False,
+):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_excel(
- io, sheet_name, header, skiprows, skip_footer, index_col, names,
- usecols, parse_dates, date_parser, na_values, thousands, convert_float,
- converters, dtype, true_values, false_values, engine, squeeze)
+ io,
+ sheet_name,
+ header,
+ skiprows,
+ skip_footer,
+ index_col,
+ names,
+ usecols,
+ parse_dates,
+ date_parser,
+ na_values,
+ thousands,
+ convert_float,
+ converters,
+ dtype,
+ true_values,
+ false_values,
+ engine,
+ squeeze,
+ )
ray_frame = from_pandas(port_frame)
return ray_frame
-def read_hdf(path_or_buf, key=None, mode='r'):
+def read_hdf(path_or_buf, key=None, mode="r"):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_hdf(path_or_buf, key, mode)
ray_frame = from_pandas(port_frame)
@@ -462,8 +526,7 @@ def read_hdf(path_or_buf, key=None, mode='r'):
def read_feather(path, nthreads=1):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_feather(path)
ray_frame = from_pandas(port_frame)
@@ -471,10 +534,9 @@ def read_feather(path, nthreads=1):
return ray_frame
-def read_msgpack(path_or_buf, encoding='utf-8', iterator=False):
+def read_msgpack(path_or_buf, encoding="utf-8", iterator=False):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_msgpack(path_or_buf, encoding, iterator)
ray_frame = from_pandas(port_frame)
@@ -482,51 +544,62 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False):
return ray_frame
-def read_stata(filepath_or_buffer,
- convert_dates=True,
- convert_categoricals=True,
- encoding=None,
- index_col=None,
- convert_missing=False,
- preserve_dtypes=True,
- columns=None,
- order_categoricals=True,
- chunksize=None,
- iterator=False):
-
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
-
- port_frame = pandas.read_stata(filepath_or_buffer, convert_dates,
- convert_categoricals, encoding, index_col,
- convert_missing, preserve_dtypes, columns,
- order_categoricals, chunksize, iterator)
+def read_stata(
+ filepath_or_buffer,
+ convert_dates=True,
+ convert_categoricals=True,
+ encoding=None,
+ index_col=None,
+ convert_missing=False,
+ preserve_dtypes=True,
+ columns=None,
+ order_categoricals=True,
+ chunksize=None,
+ iterator=False,
+):
+
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
+
+ port_frame = pandas.read_stata(
+ filepath_or_buffer,
+ convert_dates,
+ convert_categoricals,
+ encoding,
+ index_col,
+ convert_missing,
+ preserve_dtypes,
+ columns,
+ order_categoricals,
+ chunksize,
+ iterator,
+ )
ray_frame = from_pandas(port_frame)
return ray_frame
-def read_sas(filepath_or_buffer,
- format=None,
- index=None,
- encoding=None,
- chunksize=None,
- iterator=False):
+def read_sas(
+ filepath_or_buffer,
+ format=None,
+ index=None,
+ encoding=None,
+ chunksize=None,
+ iterator=False,
+):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
- port_frame = pandas.read_sas(filepath_or_buffer, format, index, encoding,
- chunksize, iterator)
+ port_frame = pandas.read_sas(
+ filepath_or_buffer, format, index, encoding, chunksize, iterator
+ )
ray_frame = from_pandas(port_frame)
return ray_frame
-def read_pickle(path, compression='infer'):
+def read_pickle(path, compression="infer"):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
port_frame = pandas.read_pickle(path, compression)
ray_frame = from_pandas(port_frame)
@@ -534,20 +607,22 @@ def read_pickle(path, compression='infer'):
return ray_frame
-def read_sql(sql,
- con,
- index_col=None,
- coerce_float=True,
- params=None,
- parse_dates=None,
- columns=None,
- chunksize=None):
+def read_sql(
+ sql,
+ con,
+ index_col=None,
+ coerce_float=True,
+ params=None,
+ parse_dates=None,
+ columns=None,
+ chunksize=None,
+):
- warnings.warn("Defaulting to Pandas implementation",
- PendingDeprecationWarning)
+ warnings.warn("Defaulting to Pandas implementation", PendingDeprecationWarning)
- port_frame = pandas.read_sql(sql, con, index_col, coerce_float, params,
- parse_dates, columns, chunksize)
+ port_frame = pandas.read_sql(
+ sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize
+ )
ray_frame = from_pandas(port_frame)
return ray_frame
@@ -561,8 +636,7 @@ def get_index(index_name, *partition_indices):
@ray.remote
-def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs,
- header):
+def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, header):
"""Use a Ray task to read a chunk of a CSV into a Pandas DataFrame.
Args:
@@ -579,7 +653,7 @@ def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs,
This is used to determine the total length of the DataFrame to build a
default Index.
"""
- bio = open(fname, 'rb')
+ bio = open(fname, "rb")
bio.seek(start)
to_read = header + bio.read(end - start)
bio.close()
@@ -612,7 +686,7 @@ def _read_parquet_column(path, column, num_splits, kwargs):
default Index.
"""
import pyarrow.parquet as pq
+
df = pq.read_pandas(path, columns=[column], **kwargs).to_pandas()
# Append the length of the index here to build it externally
- return split_result_of_axis_func_pandas(0, num_splits,
- df) + [len(df.index)]
+ return split_result_of_axis_func_pandas(0, num_splits, df) + [len(df.index)]
diff --git a/modin/pandas/iterator.py b/modin/pandas/iterator.py
index 35cb2b445db..08ecbabeaea 100644
--- a/modin/pandas/iterator.py
+++ b/modin/pandas/iterator.py
@@ -17,8 +17,9 @@ def __init__(self, data_manager, axis, func):
"""
self.data_manager = data_manager
self.axis = axis
- self.index_iter = iter(self.data_manager.columns) if axis else iter(
- self.data_manager.index)
+ self.index_iter = (
+ iter(self.data_manager.columns) if axis else iter(self.data_manager.index)
+ )
self.func = func
def __iter__(self):
diff --git a/modin/pandas/reshape.py b/modin/pandas/reshape.py
index 882bbc90025..8fe9232c520 100644
--- a/modin/pandas/reshape.py
+++ b/modin/pandas/reshape.py
@@ -8,13 +8,15 @@
from .dataframe import DataFrame
-def get_dummies(data,
- prefix=None,
- prefix_sep='_',
- dummy_na=False,
- columns=None,
- sparse=False,
- drop_first=False):
+def get_dummies(
+ data,
+ prefix=None,
+ prefix_sep="_",
+ dummy_na=False,
+ columns=None,
+ sparse=False,
+ drop_first=False,
+):
"""Convert categorical variable into indicator variables.
Args:
@@ -34,7 +36,8 @@ def get_dummies(data,
raise NotImplementedError(
"SparseDataFrame is not implemented. "
"To contribute to Pandas on Ray, please visit "
- "github.com/modin-project/modin.")
+ "github.com/modin-project/modin."
+ )
if not isinstance(data, DataFrame):
return pandas.get_dummies(
@@ -44,7 +47,8 @@ def get_dummies(data,
dummy_na=dummy_na,
columns=columns,
sparse=sparse,
- drop_first=drop_first)
+ drop_first=drop_first,
+ )
if isinstance(data, DataFrame):
df = data
@@ -56,6 +60,7 @@ def get_dummies(data,
prefix=prefix,
prefix_sep=prefix_sep,
dummy_na=dummy_na,
- drop_first=drop_first)
+ drop_first=drop_first,
+ )
return DataFrame(data_manager=new_manager)
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index 30fba343995..991bcbc7e56 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -15,8 +15,7 @@ def na_op():
raise NotImplementedError("Not Yet implemented.")
-@_inherit_docstrings(
- pandas.Series, excluded=[pandas.Series, pandas.Series.__init__])
+@_inherit_docstrings(pandas.Series, excluded=[pandas.Series, pandas.Series.__init__])
class Series(object):
def __init__(self, series_oids):
"""Constructor for a Series object.
@@ -33,7 +32,7 @@ def T(self):
def __abs__(self):
raise NotImplementedError("Not Yet implemented.")
- def __add__(self, right, name='__add__', na_op=na_op):
+ def __add__(self, right, name="__add__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __and__(self, other):
@@ -58,13 +57,9 @@ def __bool__(self):
def __bytes__(self):
raise NotImplementedError("Not Yet implemented.")
- def __class__(self,
- data=None,
- index=None,
- dtype=None,
- name=None,
- copy=False,
- fastpath=False):
+ def __class__(
+ self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False
+ ):
raise NotImplementedError("Not Yet implemented.")
def __contains__(self, key):
@@ -82,10 +77,10 @@ def __delitem__(self, key):
def __dir__(self):
return list(type(self).__dict__.keys())
- def __div__(self, right, name='__truediv__', na_op=na_op):
+ def __div__(self, right, name="__truediv__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
- def __divmod__(self, right, name='__divmod__', na_op=na_op):
+ def __divmod__(self, right, name="__divmod__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
@property
@@ -101,7 +96,7 @@ def __finalize__(self, other, method=None, **kwargs):
def __float__(self):
raise NotImplementedError("Not Yet implemented.")
- def __floordiv__(self, right, name='__floordiv__', na_op=na_op):
+ def __floordiv__(self, right, name="__floordiv__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __ge__(self, other, axis=None):
@@ -152,10 +147,10 @@ def __long__(self):
def __lt__(self, other, axis=None):
raise NotImplementedError("Not Yet implemented.")
- def __mod__(self, right, name='__mod__', na_op=na_op):
+ def __mod__(self, right, name="__mod__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
- def __mul__(self, right, name='__mul__', na_op=na_op):
+ def __mul__(self, right, name="__mul__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __ne__(self, other, axis=None):
@@ -170,7 +165,7 @@ def __nonzero__(self):
def __or__(self, other):
raise NotImplementedError("Not Yet implemented.")
- def __pow__(self, right, name='__pow__', na_op=na_op):
+ def __pow__(self, right, name="__pow__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __repr__(self):
@@ -191,10 +186,10 @@ def __sizeof__(self):
def __str__(self):
raise NotImplementedError("Not Yet implemented.")
- def __sub__(self, right, name='__sub__', na_op=na_op):
+ def __sub__(self, right, name="__sub__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
- def __truediv__(self, right, name='__truediv__', na_op=na_op):
+ def __truediv__(self, right, name="__truediv__", na_op=na_op):
raise NotImplementedError("Not Yet implemented.")
def __xor__(self, other):
@@ -218,25 +213,25 @@ def agg(self, func, axis=0, *args, **kwargs):
def aggregate(self, func, axis=0, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def align(self,
- other,
- join='outer',
- axis=None,
- level=None,
- copy=True,
- fill_value=None,
- method=None,
- limit=None,
- fill_axis=0,
- broadcast_axis=None):
+ def align(
+ self,
+ other,
+ join="outer",
+ axis=None,
+ level=None,
+ copy=True,
+ fill_value=None,
+ method=None,
+ limit=None,
+ fill_axis=0,
+ broadcast_axis=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def all(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
+ def all(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def any(self, axis=None, bool_only=None, skipna=None, level=None,
- **kwargs):
+ def any(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def append(self, to_append, ignore_index=False, verify_integrity=False):
@@ -251,7 +246,7 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs):
def argmin(self, axis=None, skipna=True, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def argsort(self, axis=0, kind='quicksort', order=None):
+ def argsort(self, axis=0, kind="quicksort", order=None):
raise NotImplementedError("Not Yet implemented.")
def as_blocks(self, copy=True):
@@ -260,18 +255,13 @@ def as_blocks(self, copy=True):
def as_matrix(self, columns=None):
raise NotImplementedError("Not Yet implemented.")
- def asfreq(self,
- freq,
- method=None,
- how=None,
- normalize=False,
- fill_value=None):
+ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
raise NotImplementedError("Not Yet implemented.")
def asof(self, where, subset=None):
raise NotImplementedError("Not Yet implemented.")
- def astype(self, dtype, copy=True, errors='raise', **kwargs):
+ def astype(self, dtype, copy=True, errors="raise", **kwargs):
raise NotImplementedError("Not Yet implemented.")
def at(self, axis=None):
@@ -286,11 +276,7 @@ def autocorr(self, lag=1):
def between(self, left, right, inclusive=True):
raise NotImplementedError("Not Yet implemented.")
- def between_time(self,
- start_time,
- end_time,
- include_start=True,
- include_end=True):
+ def between_time(self, start_time, end_time, include_start=True, include_end=True):
raise NotImplementedError("Not Yet implemented.")
def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
@@ -323,17 +309,19 @@ def compress(self, condition, *args, **kwargs):
def consolidate(self, inplace=False):
raise NotImplementedError("Not Yet implemented.")
- def convert_objects(self,
- convert_dates=True,
- convert_numeric=False,
- convert_timedeltas=True,
- copy=True):
+ def convert_objects(
+ self,
+ convert_dates=True,
+ convert_numeric=False,
+ convert_timedeltas=True,
+ copy=True,
+ ):
raise NotImplementedError("Not Yet implemented.")
def copy(self, deep=True):
raise NotImplementedError("Not Yet implemented.")
- def corr(self, other, method='pearson', min_periods=None):
+ def corr(self, other, method="pearson", min_periods=None):
raise NotImplementedError("Not Yet implemented.")
def count(self, level=None):
@@ -369,16 +357,16 @@ def divide(self, other, level=None, fill_value=None, axis=0):
def dot(self, other):
raise NotImplementedError("Not Yet implemented.")
- def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
+ def drop(self, labels, axis=0, level=None, inplace=False, errors="raise"):
raise NotImplementedError("Not Yet implemented.")
- def drop_duplicates(self, keep='first', inplace=False):
+ def drop_duplicates(self, keep="first", inplace=False):
raise NotImplementedError("Not Yet implemented.")
def dropna(self, axis=0, inplace=False, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def duplicated(self, keep='first'):
+ def duplicated(self, keep="first"):
raise NotImplementedError("Not Yet implemented.")
def eq(self, other, level=None, fill_value=None, axis=0):
@@ -387,16 +375,18 @@ def eq(self, other, level=None, fill_value=None, axis=0):
def equals(self, other):
raise NotImplementedError("Not Yet implemented.")
- def ewm(self,
- com=None,
- span=None,
- halflife=None,
- alpha=None,
- min_periods=0,
- freq=None,
- adjust=True,
- ignore_na=False,
- axis=0):
+ def ewm(
+ self,
+ com=None,
+ span=None,
+ halflife=None,
+ alpha=None,
+ min_periods=0,
+ freq=None,
+ adjust=True,
+ ignore_na=False,
+ axis=0,
+ ):
raise NotImplementedError("Not Yet implemented.")
def expanding(self, min_periods=1, freq=None, center=False, axis=0):
@@ -408,14 +398,16 @@ def factorize(self, sort=False, na_sentinel=-1):
def ffill(self, axis=None, inplace=False, limit=None, downcast=None):
raise NotImplementedError("Not Yet implemented.")
- def fillna(self,
- value=None,
- method=None,
- axis=None,
- inplace=False,
- limit=None,
- downcast=None,
- **kwargs):
+ def fillna(
+ self,
+ value=None,
+ method=None,
+ axis=None,
+ inplace=False,
+ limit=None,
+ downcast=None,
+ **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def filter(self, items=None, like=None, regex=None, axis=None):
@@ -430,23 +422,21 @@ def first_valid_index(self):
def floordiv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def from_array(self,
- arr,
- index=None,
- name=None,
- dtype=None,
- copy=False,
- fastpath=False):
+ def from_array(
+ self, arr, index=None, name=None, dtype=None, copy=False, fastpath=False
+ ):
raise NotImplementedError("Not Yet implemented.")
- def from_csv(self,
- path,
- sep=',',
- parse_dates=True,
- header=None,
- index_col=0,
- encoding=None,
- infer_datetime_format=False):
+ def from_csv(
+ self,
+ path,
+ sep=",",
+ parse_dates=True,
+ header=None,
+ index_col=0,
+ encoding=None,
+ infer_datetime_format=False,
+ ):
raise NotImplementedError("Not Yet implemented.")
def ge(self, other, level=None, fill_value=None, axis=0):
@@ -467,15 +457,17 @@ def get_value(self, label, takeable=False):
def get_values(self):
raise NotImplementedError("Not Yet implemented.")
- def groupby(self,
- by=None,
- axis=0,
- level=None,
- as_index=True,
- sort=True,
- group_keys=True,
- squeeze=False,
- **kwargs):
+ def groupby(
+ self,
+ by=None,
+ axis=0,
+ level=None,
+ as_index=True,
+ sort=True,
+ group_keys=True,
+ squeeze=False,
+ **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def gt(self, other, level=None, fill_value=None, axis=0):
@@ -484,17 +476,19 @@ def gt(self, other, level=None, fill_value=None, axis=0):
def head(self, n=5):
raise NotImplementedError("Not Yet implemented.")
- def hist(self,
- by=None,
- ax=None,
- grid=True,
- xlabelsize=None,
- xrot=None,
- ylabelsize=None,
- yrot=None,
- figsize=None,
- bins=10,
- **kwds):
+ def hist(
+ self,
+ by=None,
+ ax=None,
+ grid=True,
+ xlabelsize=None,
+ xrot=None,
+ ylabelsize=None,
+ yrot=None,
+ figsize=None,
+ bins=10,
+ **kwds
+ ):
raise NotImplementedError("Not Yet implemented.")
def iat(self, axis=None):
@@ -509,14 +503,16 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs):
def iloc(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
- def interpolate(self,
- method='linear',
- axis=0,
- limit=None,
- inplace=False,
- limit_direction='forward',
- downcast=None,
- **kwargs):
+ def interpolate(
+ self,
+ method="linear",
+ axis=0,
+ limit=None,
+ inplace=False,
+ limit_direction="forward",
+ downcast=None,
+ **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def isin(self, values):
@@ -540,20 +536,10 @@ def ix(self, axis=None):
def keys(self):
raise NotImplementedError("Not Yet implemented.")
- def kurt(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def kurtosis(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def kurtosis(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def last(self, offset):
@@ -577,49 +563,31 @@ def mad(self, axis=None, skipna=None, level=None):
def map(self, arg, na_action=None):
raise NotImplementedError("Not Yet implemented.")
- def mask(self,
- cond,
- other=np.nan,
- inplace=False,
- axis=None,
- level=None,
- try_cast=False,
- raise_on_error=True):
+ def mask(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ try_cast=False,
+ raise_on_error=True,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def max(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def mean(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def median(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def memory_usage(self, index=True, deep=False):
raise NotImplementedError("Not Yet implemented.")
- def min(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def mod(self, other, level=None, fill_value=None, axis=0):
@@ -637,7 +605,7 @@ def multiply(self, other, level=None, fill_value=None, axis=0):
def ne(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def nlargest(self, n=5, keep='first'):
+ def nlargest(self, n=5, keep="first"):
raise NotImplementedError("Not Yet implemented.")
def nonzero(self):
@@ -646,48 +614,45 @@ def nonzero(self):
def notnull(self):
raise NotImplementedError("Not Yet implemented.")
- def nsmallest(self, n=5, keep='first'):
+ def nsmallest(self, n=5, keep="first"):
raise NotImplementedError("Not Yet implemented.")
def nunique(self, dropna=True):
raise NotImplementedError("Not Yet implemented.")
- def pct_change(self,
- periods=1,
- fill_method='pad',
- limit=None,
- freq=None,
- **kwargs):
+ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def pipe(self, func, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def plot(self,
- kind='line',
- ax=None,
- figsize=None,
- use_index=True,
- title=None,
- grid=None,
- legend=False,
- style=None,
- logx=False,
- logy=False,
- loglog=False,
- xticks=None,
- yticks=None,
- xlim=None,
- ylim=None,
- rot=None,
- fontsize=None,
- colormap=None,
- table=False,
- yerr=None,
- xerr=None,
- label=None,
- secondary_y=False,
- **kwds):
+ def plot(
+ self,
+ kind="line",
+ ax=None,
+ figsize=None,
+ use_index=True,
+ title=None,
+ grid=None,
+ legend=False,
+ style=None,
+ logx=False,
+ logy=False,
+ loglog=False,
+ xticks=None,
+ yticks=None,
+ xlim=None,
+ ylim=None,
+ rot=None,
+ fontsize=None,
+ colormap=None,
+ table=False,
+ yerr=None,
+ xerr=None,
+ label=None,
+ secondary_y=False,
+ **kwds
+ ):
raise NotImplementedError("Not Yet implemented.")
def pop(self, item):
@@ -696,49 +661,36 @@ def pop(self, item):
def pow(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def prod(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def prod(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def product(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def product(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def ptp(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def ptp(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def put(self, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def quantile(self, q=0.5, interpolation='linear'):
+ def quantile(self, q=0.5, interpolation="linear"):
raise NotImplementedError("Not Yet implemented.")
def radd(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def rank(self,
- axis=0,
- method='average',
- numeric_only=None,
- na_option='keep',
- ascending=True,
- pct=False):
+ def rank(
+ self,
+ axis=0,
+ method="average",
+ numeric_only=None,
+ na_option="keep",
+ ascending=True,
+ pct=False,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def ravel(self, order='C'):
+ def ravel(self, order="C"):
raise NotImplementedError("Not Yet implemented.")
def rdiv(self, other, level=None, fill_value=None, axis=0):
@@ -750,12 +702,7 @@ def reindex(self, index=None, **kwargs):
def reindex_axis(self, labels, axis=0, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def reindex_like(self,
- other,
- method=None,
- copy=True,
- limit=None,
- tolerance=None):
+ def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None):
raise NotImplementedError("Not Yet implemented.")
def rename(self, index=None, **kwargs):
@@ -770,30 +717,34 @@ def reorder_levels(self, order):
def repeat(self, repeats, *args, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def replace(self,
- to_replace=None,
- value=None,
- inplace=False,
- limit=None,
- regex=False,
- method='pad',
- axis=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def resample(self,
- rule,
- how=None,
- axis=0,
- fill_method=None,
- closed=None,
- label=None,
- convention='start',
- kind=None,
- loffset=None,
- limit=None,
- base=0,
- on=None,
- level=None):
+ def replace(
+ self,
+ to_replace=None,
+ value=None,
+ inplace=False,
+ limit=None,
+ regex=False,
+ method="pad",
+ axis=None,
+ ):
+ raise NotImplementedError("Not Yet implemented.")
+
+ def resample(
+ self,
+ rule,
+ how=None,
+ axis=0,
+ fill_method=None,
+ closed=None,
+ label=None,
+ convention="start",
+ kind=None,
+ loffset=None,
+ limit=None,
+ base=0,
+ on=None,
+ level=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
def reset_index(self, level=None, drop=False, name=None, inplace=False):
@@ -811,15 +762,17 @@ def rmod(self, other, level=None, fill_value=None, axis=0):
def rmul(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def rolling(self,
- window,
- min_periods=None,
- freq=None,
- center=False,
- win_type=None,
- on=None,
- axis=0,
- closed=None):
+ def rolling(
+ self,
+ window,
+ min_periods=None,
+ freq=None,
+ center=False,
+ win_type=None,
+ on=None,
+ axis=0,
+ closed=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
def round(self, decimals=0, *args, **kwargs):
@@ -834,28 +787,26 @@ def rsub(self, other, level=None, fill_value=None, axis=0):
def rtruediv(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def sample(self,
- n=None,
- frac=None,
- replace=False,
- weights=None,
- random_state=None,
- axis=None):
+ def sample(
+ self,
+ n=None,
+ frac=None,
+ replace=False,
+ weights=None,
+ random_state=None,
+ axis=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def searchsorted(self, value, side='left', sorter=None):
+ def searchsorted(self, value, side="left", sorter=None):
raise NotImplementedError("Not Yet implemented.")
def select(self, crit, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def sem(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ def sem(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def set_axis(self, axis, labels):
@@ -867,33 +818,32 @@ def set_value(self, label, value, takeable=False):
def shift(self, periods=1, freq=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def skew(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def slice_shift(self, periods=1, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def sort_index(self,
- axis=0,
- level=None,
- ascending=True,
- inplace=False,
- kind='quicksort',
- na_position='last',
- sort_remaining=True):
+ def sort_index(
+ self,
+ axis=0,
+ level=None,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ sort_remaining=True,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def sort_values(self,
- axis=0,
- ascending=True,
- inplace=False,
- kind='quicksort',
- na_position='last'):
+ def sort_values(
+ self,
+ axis=0,
+ ascending=True,
+ inplace=False,
+ kind="quicksort",
+ na_position="last",
+ ):
raise NotImplementedError("Not Yet implemented.")
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
@@ -902,13 +852,9 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True):
def squeeze(self, axis=None):
raise NotImplementedError("Not Yet implemented.")
- def std(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ def std(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def sub(self, other, level=None, fill_value=None, axis=0):
@@ -917,12 +863,7 @@ def sub(self, other, level=None, fill_value=None, axis=0):
def subtract(self, other, level=None, fill_value=None, axis=0):
raise NotImplementedError("Not Yet implemented.")
- def sum(self,
- axis=None,
- skipna=None,
- level=None,
- numeric_only=None,
- **kwargs):
+ def sum(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
def swapaxes(self, axis1, axis2, copy=True):
@@ -940,18 +881,20 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs):
def to_clipboard(self, excel=None, sep=None, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def to_csv(self,
- path=None,
- index=True,
- sep=',',
- na_rep='',
- float_format=None,
- header=False,
- index_label=None,
- mode='w',
- encoding=None,
- date_format=None,
- decimal='.'):
+ def to_csv(
+ self,
+ path=None,
+ index=True,
+ sep=",",
+ na_rep="",
+ float_format=None,
+ header=False,
+ index_label=None,
+ mode="w",
+ encoding=None,
+ date_format=None,
+ decimal=".",
+ ):
raise NotImplementedError("Not Yet implemented.")
def to_dense(self):
@@ -960,22 +903,24 @@ def to_dense(self):
def to_dict(self):
raise NotImplementedError("Not Yet implemented.")
- def to_excel(self,
- excel_writer,
- sheet_name='Sheet1',
- na_rep='',
- float_format=None,
- columns=None,
- header=True,
- index=True,
- index_label=None,
- startrow=0,
- startcol=0,
- engine=None,
- merge_cells=True,
- encoding=None,
- inf_rep='inf',
- verbose=True):
+ def to_excel(
+ self,
+ excel_writer,
+ sheet_name="Sheet1",
+ na_rep="",
+ float_format=None,
+ columns=None,
+ header=True,
+ index=True,
+ index_label=None,
+ startrow=0,
+ startcol=0,
+ engine=None,
+ merge_cells=True,
+ encoding=None,
+ inf_rep="inf",
+ verbose=True,
+ ):
raise NotImplementedError("Not Yet implemented.")
def to_frame(self, name=None):
@@ -984,76 +929,84 @@ def to_frame(self, name=None):
def to_hdf(self, path_or_buf, key, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def to_json(self,
- path_or_buf=None,
- orient=None,
- date_format=None,
- double_precision=10,
- force_ascii=True,
- date_unit='ms',
- default_handler=None,
- lines=False):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_latex(self,
- buf=None,
- columns=None,
- col_space=None,
- header=True,
- index=True,
- na_rep='NaN',
- formatters=None,
- float_format=None,
- sparsify=None,
- index_names=True,
- bold_rows=False,
- column_format=None,
- longtable=None,
- escape=None,
- encoding=None,
- decimal='.',
- multicolumn=None,
- multicolumn_format=None,
- multirow=None):
- raise NotImplementedError("Not Yet implemented.")
-
- def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
+ def to_json(
+ self,
+ path_or_buf=None,
+ orient=None,
+ date_format=None,
+ double_precision=10,
+ force_ascii=True,
+ date_unit="ms",
+ default_handler=None,
+ lines=False,
+ ):
+ raise NotImplementedError("Not Yet implemented.")
+
+ def to_latex(
+ self,
+ buf=None,
+ columns=None,
+ col_space=None,
+ header=True,
+ index=True,
+ na_rep="NaN",
+ formatters=None,
+ float_format=None,
+ sparsify=None,
+ index_names=True,
+ bold_rows=False,
+ column_format=None,
+ longtable=None,
+ escape=None,
+ encoding=None,
+ decimal=".",
+ multicolumn=None,
+ multicolumn_format=None,
+ multirow=None,
+ ):
+ raise NotImplementedError("Not Yet implemented.")
+
+ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs):
raise NotImplementedError("Not Yet implemented.")
def to_period(self, freq=None, copy=True):
raise NotImplementedError("Not Yet implemented.")
- def to_pickle(self, path, compression='infer'):
+ def to_pickle(self, path, compression="infer"):
raise NotImplementedError("Not Yet implemented.")
- def to_sparse(self, kind='block', fill_value=None):
+ def to_sparse(self, kind="block", fill_value=None):
raise NotImplementedError("Not Yet implemented.")
- def to_sql(self,
- name,
- con,
- flavor=None,
- schema=None,
- if_exists='fail',
- index=True,
- index_label=None,
- chunksize=None,
- dtype=None):
+ def to_sql(
+ self,
+ name,
+ con,
+ flavor=None,
+ schema=None,
+ if_exists="fail",
+ index=True,
+ index_label=None,
+ chunksize=None,
+ dtype=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def to_string(self,
- buf=None,
- na_rep='NaN',
- float_format=None,
- header=True,
- index=True,
- length=False,
- dtype=False,
- name=False,
- max_rows=None):
+ def to_string(
+ self,
+ buf=None,
+ na_rep="NaN",
+ float_format=None,
+ header=True,
+ index=True,
+ length=False,
+ dtype=False,
+ name=False,
+ max_rows=None,
+ ):
raise NotImplementedError("Not Yet implemented.")
- def to_timestamp(self, freq=None, how='start', copy=True):
+ def to_timestamp(self, freq=None, how="start", copy=True):
raise NotImplementedError("Not Yet implemented.")
def to_xarray(self):
@@ -1080,8 +1033,7 @@ def tshift(self, periods=1, freq=None, axis=0):
def tz_convert(self, tz, axis=0, level=None, copy=True):
raise NotImplementedError("Not Yet implemented.")
- def tz_localize(self, tz, axis=0, level=None, copy=True,
- ambiguous='raise'):
+ def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"):
raise NotImplementedError("Not Yet implemented.")
def unique(self):
@@ -1096,34 +1048,29 @@ def upandasate(self, other):
def valid(self, inplace=False, **kwargs):
raise NotImplementedError("Not Yet implemented.")
- def value_counts(self,
- normalize=False,
- sort=True,
- ascending=False,
- bins=None,
- dropna=True):
+ def value_counts(
+ self, normalize=False, sort=True, ascending=False, bins=None, dropna=True
+ ):
raise NotImplementedError("Not Yet implemented.")
- def var(self,
- axis=None,
- skipna=None,
- level=None,
- ddof=1,
- numeric_only=None,
- **kwargs):
+ def var(
+ self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+ ):
raise NotImplementedError("Not Yet implemented.")
def view(self, dtype=None):
raise NotImplementedError("Not Yet implemented.")
- def where(self,
- cond,
- other=np.nan,
- inplace=False,
- axis=None,
- level=None,
- try_cast=False,
- raise_on_error=True):
+ def where(
+ self,
+ cond,
+ other=np.nan,
+ inplace=False,
+ axis=None,
+ level=None,
+ try_cast=False,
+ raise_on_error=True,
+ ):
raise NotImplementedError("Not Yet implemented.")
def xs(key, axis=0, level=None, drop_level=True):
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
index e6963f4b78c..dcdf8e56aa1 100644
--- a/modin/pandas/test/test_concat.py
+++ b/modin/pandas/test/test_concat.py
@@ -15,41 +15,49 @@ def ray_df_equals_pandas(ray_df, pandas_df):
@pytest.fixture
def generate_dfs():
- df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
- })
-
- df2 = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col6': [12, 13, 14, 15],
- 'col7': [0, 0, 0, 0]
- })
+ df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+ }
+ )
+
+ df2 = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col6": [12, 13, 14, 15],
+ "col7": [0, 0, 0, 0],
+ }
+ )
return df, df2
@pytest.fixture
def generate_none_dfs():
- df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, None, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [None, None, None, None]
- })
-
- df2 = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col6': [12, 13, 14, 15],
- 'col7': [0, 0, 0, 0]
- })
+ df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, None, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [None, None, None, None],
+ }
+ )
+
+ df2 = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col6": [12, 13, 14, 15],
+ "col7": [0, 0, 0, 0],
+ }
+ )
return df, df2
@@ -57,16 +65,14 @@ def generate_none_dfs():
def test_df_concat():
df, df2 = generate_dfs()
- assert (ray_df_equals_pandas(
- pd.concat([df, df2]), pandas.concat([df, df2])))
+ assert ray_df_equals_pandas(pd.concat([df, df2]), pandas.concat([df, df2]))
def test_ray_concat():
df, df2 = generate_dfs()
ray_df, ray_df2 = from_pandas(df), from_pandas(df2)
- assert ray_df_equals_pandas(
- pd.concat([ray_df, ray_df2]), pandas.concat([df, df2]))
+ assert ray_df_equals_pandas(pd.concat([ray_df, ray_df2]), pandas.concat([df, df2]))
def test_ray_concat_with_series():
@@ -76,11 +82,13 @@ def test_ray_concat_with_series():
assert ray_df_equals_pandas(
pd.concat([ray_df, ray_df2, pandas_series], axis=0),
- pandas.concat([df, df2, pandas_series], axis=0))
+ pandas.concat([df, df2, pandas_series], axis=0),
+ )
assert ray_df_equals_pandas(
pd.concat([ray_df, ray_df2, pandas_series], axis=1),
- pandas.concat([df, df2, pandas_series], axis=1))
+ pandas.concat([df, df2, pandas_series], axis=1),
+ )
def test_ray_concat_on_index():
@@ -88,15 +96,17 @@ def test_ray_concat_on_index():
ray_df, ray_df2 = from_pandas(df), from_pandas(df2)
assert ray_df_equals_pandas(
- pd.concat([ray_df, ray_df2], axis='index'),
- pandas.concat([df, df2], axis='index'))
+ pd.concat([ray_df, ray_df2], axis="index"),
+ pandas.concat([df, df2], axis="index"),
+ )
assert ray_df_equals_pandas(
- pd.concat([ray_df, ray_df2], axis='rows'),
- pandas.concat([df, df2], axis='rows'))
+ pd.concat([ray_df, ray_df2], axis="rows"), pandas.concat([df, df2], axis="rows")
+ )
assert ray_df_equals_pandas(
- pd.concat([ray_df, ray_df2], axis=0), pandas.concat([df, df2], axis=0))
+ pd.concat([ray_df, ray_df2], axis=0), pandas.concat([df, df2], axis=0)
+ )
def test_ray_concat_on_column():
@@ -104,11 +114,13 @@ def test_ray_concat_on_column():
ray_df, ray_df2 = from_pandas(df), from_pandas(df2)
assert ray_df_equals_pandas(
- pd.concat([ray_df, ray_df2], axis=1), pandas.concat([df, df2], axis=1))
+ pd.concat([ray_df, ray_df2], axis=1), pandas.concat([df, df2], axis=1)
+ )
assert ray_df_equals_pandas(
pd.concat([ray_df, ray_df2], axis="columns"),
- pandas.concat([df, df2], axis="columns"))
+ pandas.concat([df, df2], axis="columns"),
+ )
def test_invalid_axis_errors():
@@ -125,8 +137,7 @@ def test_mixed_concat():
mixed_dfs = [from_pandas(df), from_pandas(df2), df3]
- assert (ray_df_equals_pandas(
- pd.concat(mixed_dfs), pandas.concat([df, df2, df3])))
+ assert ray_df_equals_pandas(pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))
def test_mixed_inner_concat():
@@ -135,9 +146,9 @@ def test_mixed_inner_concat():
mixed_dfs = [from_pandas(df), from_pandas(df2), df3]
- assert (ray_df_equals_pandas(
- pd.concat(mixed_dfs, join='inner'),
- pandas.concat([df, df2, df3], join='inner')))
+ assert ray_df_equals_pandas(
+ pd.concat(mixed_dfs, join="inner"), pandas.concat([df, df2, df3], join="inner")
+ )
def test_mixed_none_concat():
@@ -146,5 +157,4 @@ def test_mixed_none_concat():
mixed_dfs = [from_pandas(df), from_pandas(df2), df3]
- assert (ray_df_equals_pandas(
- pd.concat(mixed_dfs), pandas.concat([df, df2, df3])))
+ assert ray_df_equals_pandas(pd.concat(mixed_dfs), pandas.concat([df, df2, df3]))
diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py
index 9d1886d72be..b64fac33eea 100644
--- a/modin/pandas/test/test_dataframe.py
+++ b/modin/pandas/test/test_dataframe.py
@@ -31,39 +31,46 @@ def ray_df_equals(ray_df1, ray_df2):
@pytest.fixture
def create_test_dataframe():
- return pd.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
- })
+ return pd.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+ }
+ )
def test_int_dataframe():
frame_data = {
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
}
pandas_df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
testfuncs = [
- lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x,
- lambda x: False
+ lambda x: x + 1,
+ lambda x: str(x),
+ lambda x: x * x,
+ lambda x: x,
+ lambda x: False,
]
query_funcs = [
- 'col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)'
+ "col1 < col2",
+ "col3 > col4",
+ "col1 == col2",
+ "(col2 > col1) and (col1 < col3)",
]
- keys = ['col1', 'col2', 'col3', 'col4']
+ keys = ["col1", "col2", "col3", "col4"]
- filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'}
+ filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"}
test_sample(ray_df, pandas_df)
test_filter(ray_df, pandas_df, filter_by)
@@ -150,12 +157,12 @@ def test_int_dataframe():
test_loc(ray_df, pandas_df)
test_iloc(ray_df, pandas_df)
- labels = ['a', 'b', 'c', 'd']
+ labels = ["a", "b", "c", "d"]
test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- labels.append('e')
+ test_set_axis(ray_df, pandas_df, labels, "rows")
+ labels.append("e")
test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
+ test_set_axis(ray_df, pandas_df, labels, "columns")
for key in keys:
test_set_index(ray_df, pandas_df, key)
@@ -175,9 +182,7 @@ def test_int_dataframe():
test___array__(ray_df, pandas_df)
- apply_agg_functions = [
- 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum']
- ]
+ apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
@@ -194,7 +199,7 @@ def test_int_dataframe():
with pytest.raises(TypeError):
test_aggregate(ray_df, pandas_df, func, 1)
- func = ['sum', lambda df: df.sum()]
+ func = ["sum", lambda df: df.sum()]
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
@@ -205,36 +210,41 @@ def test_int_dataframe():
with pytest.raises(TypeError):
test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
+ test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1)
test_apply(ray_df, pandas_df, lambda df: -df, 0)
test_transform(ray_df, pandas_df)
def test_float_dataframe():
frame_data = {
- 'col1': [0.0, 1.0, 2.0, 3.0],
- 'col2': [4.0, 5.0, 6.0, 7.0],
- 'col3': [8.0, 9.0, 10.0, 11.0],
- 'col4': [12.0, 13.0, 14.0, 15.0],
- 'col5': [0.0, 0.0, 0.0, 0.0]
+ "col1": [0.0, 1.0, 2.0, 3.0],
+ "col2": [4.0, 5.0, 6.0, 7.0],
+ "col3": [8.0, 9.0, 10.0, 11.0],
+ "col4": [12.0, 13.0, 14.0, 15.0],
+ "col5": [0.0, 0.0, 0.0, 0.0],
}
pandas_df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
testfuncs = [
- lambda x: x + 1, lambda x: str(x), lambda x: x * x, lambda x: x,
- lambda x: False
+ lambda x: x + 1,
+ lambda x: str(x),
+ lambda x: x * x,
+ lambda x: x,
+ lambda x: False,
]
query_funcs = [
- 'col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)'
+ "col1 < col2",
+ "col3 > col4",
+ "col1 == col2",
+ "(col2 > col1) and (col1 < col3)",
]
- keys = ['col1', 'col2', 'col3', 'col4']
+ keys = ["col1", "col2", "col3", "col4"]
- filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'}
+ filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"}
test_sample(ray_df, pandas_df)
test_filter(ray_df, pandas_df, filter_by)
@@ -320,12 +330,12 @@ def test_float_dataframe():
test_loc(ray_df, pandas_df)
test_iloc(ray_df, pandas_df)
- labels = ['a', 'b', 'c', 'd']
+ labels = ["a", "b", "c", "d"]
test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
- labels.append('e')
+ test_set_axis(ray_df, pandas_df, labels, "rows")
+ labels.append("e")
test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
+ test_set_axis(ray_df, pandas_df, labels, "columns")
for key in keys:
test_set_index(ray_df, pandas_df, key)
@@ -346,9 +356,7 @@ def test_float_dataframe():
test___array__(ray_df, pandas_df)
- apply_agg_functions = [
- 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum']
- ]
+ apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
@@ -365,7 +373,7 @@ def test_float_dataframe():
with pytest.raises(TypeError):
test_aggregate(ray_df, pandas_df, func, 1)
- func = ['sum', lambda df: df.sum()]
+ func = ["sum", lambda df: df.sum()]
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
@@ -376,33 +384,29 @@ def test_float_dataframe():
with pytest.raises(TypeError):
test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
+ test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1)
test_apply(ray_df, pandas_df, lambda df: -df, 0)
test_transform(ray_df, pandas_df)
def test_mixed_dtype_dataframe():
frame_data = {
- 'col1': [1, 2, 3, 4],
- 'col2': [4, 5, 6, 7],
- 'col3': [8.0, 9.4, 10.1, 11.3],
- 'col4': ['a', 'b', 'c', 'd']
+ "col1": [1, 2, 3, 4],
+ "col2": [4, 5, 6, 7],
+ "col3": [8.0, 9.4, 10.1, 11.3],
+ "col4": ["a", "b", "c", "d"],
}
pandas_df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
- testfuncs = [
- lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False
- ]
+ testfuncs = [lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False]
- query_funcs = [
- 'col1 < col2', 'col1 == col2', '(col2 > col1) and (col1 < col3)'
- ]
+ query_funcs = ["col1 < col2", "col1 == col2", "(col2 > col1) and (col1 < col3)"]
- keys = ['col1', 'col2', 'col3', 'col4']
+ keys = ["col1", "col2", "col3", "col4"]
- filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'}
+ filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"}
test_sample(ray_df, pandas_df)
test_filter(ray_df, pandas_df, filter_by)
@@ -501,11 +505,11 @@ def test_mixed_dtype_dataframe():
test_loc(ray_df, pandas_df)
test_iloc(ray_df, pandas_df)
- labels = ['a', 'b', 'c', 'd']
+ labels = ["a", "b", "c", "d"]
test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
+ test_set_axis(ray_df, pandas_df, labels, "rows")
test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
+ test_set_axis(ray_df, pandas_df, labels, "columns")
for key in keys:
test_set_index(ray_df, pandas_df, key)
@@ -526,13 +530,13 @@ def test_mixed_dtype_dataframe():
test___array__(ray_df, pandas_df)
- apply_agg_functions = ['sum', lambda df: df.sum()]
+ apply_agg_functions = ["sum", lambda df: df.sum()]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
- func = ['sum', lambda df: df.sum()]
+ func = ["sum", lambda df: df.sum()]
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
@@ -544,32 +548,32 @@ def test_mixed_dtype_dataframe():
test_agg(ray_df, pandas_df, func, 1)
test_transform(ray_df, pandas_df)
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
+ test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1)
def test_nan_dataframe():
frame_data = {
- 'col1': [1, 2, 3, np.nan],
- 'col2': [4, 5, np.nan, 7],
- 'col3': [8, np.nan, 10, 11],
- 'col4': [np.nan, 13, 14, 15]
+ "col1": [1, 2, 3, np.nan],
+ "col2": [4, 5, np.nan, 7],
+ "col3": [8, np.nan, 10, 11],
+ "col4": [np.nan, 13, 14, 15],
}
pandas_df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
- testfuncs = [
- lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False
- ]
+ testfuncs = [lambda x: x + x, lambda x: str(x), lambda x: x, lambda x: False]
query_funcs = [
- 'col1 < col2', 'col3 > col4', 'col1 == col2',
- '(col2 > col1) and (col1 < col3)'
+ "col1 < col2",
+ "col3 > col4",
+ "col1 == col2",
+ "(col2 > col1) and (col1 < col3)",
]
- keys = ['col1', 'col2', 'col3', 'col4']
+ keys = ["col1", "col2", "col3", "col4"]
- filter_by = {'items': ['col1', 'col5'], 'regex': '4$|3$', 'like': 'col'}
+ filter_by = {"items": ["col1", "col5"], "regex": "4$|3$", "like": "col"}
test_sample(ray_df, pandas_df)
test_filter(ray_df, pandas_df, filter_by)
@@ -653,11 +657,11 @@ def test_nan_dataframe():
test_loc(ray_df, pandas_df)
test_iloc(ray_df, pandas_df)
- labels = ['a', 'b', 'c', 'd']
+ labels = ["a", "b", "c", "d"]
test_set_axis(ray_df, pandas_df, labels, 0)
- test_set_axis(ray_df, pandas_df, labels, 'rows')
+ test_set_axis(ray_df, pandas_df, labels, "rows")
test_set_axis(ray_df, pandas_df, labels, 1)
- test_set_axis(ray_df, pandas_df, labels, 'columns')
+ test_set_axis(ray_df, pandas_df, labels, "columns")
for key in keys:
test_set_index(ray_df, pandas_df, key)
@@ -678,9 +682,7 @@ def test_nan_dataframe():
test___array__(ray_df, pandas_df)
- apply_agg_functions = [
- 'sum', lambda df: df.sum(), ['sum', 'mean'], ['sum', 'sum']
- ]
+ apply_agg_functions = ["sum", lambda df: df.sum(), ["sum", "mean"], ["sum", "sum"]]
for func in apply_agg_functions:
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
@@ -697,7 +699,7 @@ def test_nan_dataframe():
with pytest.raises(TypeError):
test_aggregate(ray_df, pandas_df, func, 1)
- func = ['sum', lambda df: df.sum()]
+ func = ["sum", lambda df: df.sum()]
test_apply(ray_df, pandas_df, func, 0)
test_aggregate(ray_df, pandas_df, func, 0)
test_agg(ray_df, pandas_df, func, 0)
@@ -708,36 +710,36 @@ def test_nan_dataframe():
with pytest.raises(TypeError):
test_agg(ray_df, pandas_df, func, 1)
- test_apply(ray_df, pandas_df, lambda df: df.drop('col1'), 1)
+ test_apply(ray_df, pandas_df, lambda df: df.drop("col1"), 1)
test_apply(ray_df, pandas_df, lambda df: -df, 0)
test_transform(ray_df, pandas_df)
def test_empty_df():
- df = pd.DataFrame(index=['a', 'b'])
+ df = pd.DataFrame(index=["a", "b"])
test_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(['a', 'b']))
+ tm.assert_index_equal(df.index, pd.Index(["a", "b"]))
assert len(df.columns) == 0
- df = pd.DataFrame(columns=['a', 'b'])
+ df = pd.DataFrame(columns=["a", "b"])
test_is_empty(df)
assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
+ tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
df = pd.DataFrame()
test_is_empty(df)
assert len(df.index) == 0
assert len(df.columns) == 0
- df = pd.DataFrame(index=['a', 'b'])
+ df = pd.DataFrame(index=["a", "b"])
test_is_empty(df)
- tm.assert_index_equal(df.index, pd.Index(['a', 'b']))
+ tm.assert_index_equal(df.index, pd.Index(["a", "b"]))
assert len(df.columns) == 0
- df = pd.DataFrame(columns=['a', 'b'])
+ df = pd.DataFrame(columns=["a", "b"])
test_is_empty(df)
assert len(df.index) == 0
- tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
+ tm.assert_index_equal(df.columns, pd.Index(["a", "b"]))
df = pd.DataFrame()
test_is_empty(df)
@@ -752,13 +754,16 @@ def test_is_empty(df):
def test_dense_nan_df():
- frame_data = [[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],
- [np.nan, np.nan, np.nan, 5]]
- ray_df = pd.DataFrame(frame_data, columns=list('ABCD'))
+ frame_data = [
+ [np.nan, 2, np.nan, 0],
+ [3, 4, np.nan, 1],
+ [np.nan, np.nan, np.nan, 5],
+ ]
+ ray_df = pd.DataFrame(frame_data, columns=list("ABCD"))
- pd_df = pandas.DataFrame(frame_data, columns=list('ABCD'))
+ pd_df = pandas.DataFrame(frame_data, columns=list("ABCD"))
- column_subsets = [list('AD'), list('BC'), list('CD')]
+ column_subsets = [list("AD"), list("BC"), list("CD")]
row_subsets = [[0, 1], [0, 1, 2], [2, 0]]
test_dropna(ray_df, pd_df)
@@ -775,40 +780,38 @@ def test_inter_df_math(op, simple=False):
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
pandas_df = pandas.DataFrame(frame_data)
assert ray_df_equals_pandas(
- getattr(ray_df, op)(ray_df),
- getattr(pandas_df, op)(pandas_df))
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
+ getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df)
+ )
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4))
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0))
frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}
ray_df2 = pd.DataFrame(frame_data)
pandas_df2 = pandas.DataFrame(frame_data)
assert ray_df_equals_pandas(
- getattr(ray_df, op)(ray_df2),
- getattr(pandas_df, op)(pandas_df2))
+ getattr(ray_df, op)(ray_df2), getattr(pandas_df, op)(pandas_df2)
+ )
list_test = [0, 1, 2, 4]
if not simple:
assert ray_df_equals_pandas(
getattr(ray_df, op)(list_test, axis=1),
- getattr(pandas_df, op)(list_test, axis=1))
+ getattr(pandas_df, op)(list_test, axis=1),
+ )
assert ray_df_equals_pandas(
getattr(ray_df, op)(list_test, axis=0),
- getattr(pandas_df, op)(list_test, axis=0))
+ getattr(pandas_df, op)(list_test, axis=0),
+ )
@pytest.fixture
@@ -817,21 +820,17 @@ def test_comparison_inter_ops(op):
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
pandas_df = pandas.DataFrame(frame_data)
assert ray_df_equals_pandas(
- getattr(ray_df, op)(ray_df),
- getattr(pandas_df, op)(pandas_df))
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
+ getattr(ray_df, op)(ray_df), getattr(pandas_df, op)(pandas_df)
+ )
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4))
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0))
frame_data = {"A": [0, 2], "col1": [0, 19], "col2": [1, 1]}
@@ -839,8 +838,8 @@ def test_comparison_inter_ops(op):
pandas_df2 = pandas.DataFrame(frame_data)
assert ray_df_equals_pandas(
- getattr(ray_df2, op)(ray_df2),
- getattr(pandas_df2, op)(pandas_df2))
+ getattr(ray_df2, op)(ray_df2), getattr(pandas_df2, op)(pandas_df2)
+ )
@pytest.fixture
@@ -849,18 +848,14 @@ def test_inter_df_math_right_ops(op):
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
pandas_df = pandas.DataFrame(frame_data)
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4),
- getattr(pandas_df, op)(4))
- assert ray_df_equals_pandas(
- getattr(ray_df, op)(4.0),
- getattr(pandas_df, op)(4.0))
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4), getattr(pandas_df, op)(4))
+ assert ray_df_equals_pandas(getattr(ray_df, op)(4.0), getattr(pandas_df, op)(4.0))
@pytest.fixture
@@ -902,7 +897,7 @@ def test_values(ray_df, pandas_df):
@pytest.fixture
def test_axes(ray_df, pandas_df):
for ray_axis, pd_axis in zip(ray_df.axes, pandas_df.axes):
- assert (np.array_equal(ray_axis, pd_axis))
+ assert np.array_equal(ray_axis, pd_axis)
@pytest.fixture
@@ -940,8 +935,9 @@ def test_copy(ray_df):
new_ray_df = ray_df.copy()
assert new_ray_df is not ray_df
- assert np.array_equal(new_ray_df._data_manager.data.partitions,
- ray_df._data_manager.data.partitions)
+ assert np.array_equal(
+ new_ray_df._data_manager.data.partitions, ray_df._data_manager.data.partitions
+ )
@pytest.fixture
@@ -968,8 +964,9 @@ def test_transpose(ray_df, pandas_df):
@pytest.fixture
def test_get(ray_df, pandas_df, key):
assert ray_df.get(key).equals(pandas_df.get(key))
- assert ray_df.get(
- key, default='default').equals(pandas_df.get(key, default='default'))
+ assert ray_df.get(key, default="default").equals(
+ pandas_df.get(key, default="default")
+ )
@pytest.fixture
@@ -1030,7 +1027,7 @@ def test_append():
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
@@ -1041,8 +1038,7 @@ def test_append():
ray_df2 = pd.DataFrame(frame_data2)
pandas_df2 = pandas.DataFrame(frame_data2)
- assert ray_df_equals_pandas(
- ray_df.append(ray_df2), pandas_df.append(pandas_df2))
+ assert ray_df_equals_pandas(ray_df.append(ray_df2), pandas_df.append(pandas_df2))
with pytest.raises(ValueError):
ray_df.append(ray_df2, verify_integrity=True)
@@ -1080,16 +1076,16 @@ def test_as_matrix():
assert value == frame[col][i]
# mixed type
- mat = pd.DataFrame(test_data.mixed_frame).as_matrix(['foo', 'A'])
- assert mat[0, 0] == 'bar'
+ mat = pd.DataFrame(test_data.mixed_frame).as_matrix(["foo", "A"])
+ assert mat[0, 0] == "bar"
- df = pd.DataFrame({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
+ df = pd.DataFrame({"real": [1, 2, 3], "complex": [1j, 2j, 3j]})
mat = df.as_matrix()
assert mat[0, 0] == 1j
# single block corner case
- mat = pd.DataFrame(test_data.frame).as_matrix(['A', 'B'])
- expected = test_data.frame.reindex(columns=['A', 'B']).values
+ mat = pd.DataFrame(test_data.frame).as_matrix(["A", "B"])
+ expected = test_data.frame.reindex(columns=["A", "B"]).values
tm.assert_almost_equal(mat, expected)
@@ -1117,9 +1113,11 @@ def test_assign():
def test_astype():
td = TestData()
ray_df = pd.DataFrame(
- td.frame.values, index=td.frame.index, columns=td.frame.columns)
+ td.frame.values, index=td.frame.index, columns=td.frame.columns
+ )
expected_df = pandas.DataFrame(
- td.frame.values, index=td.frame.index, columns=td.frame.columns)
+ td.frame.values, index=td.frame.index, columns=td.frame.columns
+ )
ray_df_casted = ray_df.astype(np.int32)
expected_df_casted = expected_df.astype(np.int32)
@@ -1154,8 +1152,8 @@ def test_between_time():
@pytest.fixture
def test_bfill():
test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
+ test_data.tsframe["A"][:5] = np.nan
+ test_data.tsframe["A"][-5:] = np.nan
ray_df = pd.DataFrame(test_data.tsframe)
assert ray_df_equals_pandas(ray_df.bfill(), test_data.tsframe.bfill())
@@ -1291,8 +1289,7 @@ def test_describe(ray_df, pandas_df):
def test_diff(ray_df, pandas_df):
assert ray_df_equals_pandas(ray_df.diff(), pandas_df.diff())
assert ray_df_equals_pandas(ray_df.diff(axis=1), pandas_df.diff(axis=1))
- assert ray_df_equals_pandas(
- ray_df.diff(periods=1), pandas_df.diff(periods=1))
+ assert ray_df_equals_pandas(ray_df.diff(periods=1), pandas_df.diff(periods=1))
def test_div():
@@ -1314,50 +1311,47 @@ def test_drop():
frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
simple = pandas.DataFrame(frame_data)
ray_simple = pd.DataFrame(frame_data)
- assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[['B']])
- assert ray_df_equals_pandas(
- ray_simple.drop(["A", "B"], axis='columns'), simple[[]])
+ assert ray_df_equals_pandas(ray_simple.drop("A", axis=1), simple[["B"]])
+ assert ray_df_equals_pandas(ray_simple.drop(["A", "B"], axis="columns"), simple[[]])
+ assert ray_df_equals_pandas(ray_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
assert ray_df_equals_pandas(
- ray_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
- assert ray_df_equals_pandas(
- ray_simple.drop([0, 3], axis='index'), simple.loc[[1, 2], :])
+ ray_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]
+ )
pytest.raises(ValueError, ray_simple.drop, 5)
- pytest.raises(ValueError, ray_simple.drop, 'C', 1)
+ pytest.raises(ValueError, ray_simple.drop, "C", 1)
pytest.raises(ValueError, ray_simple.drop, [1, 5])
- pytest.raises(ValueError, ray_simple.drop, ['A', 'C'], 1)
+ pytest.raises(ValueError, ray_simple.drop, ["A", "C"], 1)
# errors = 'ignore'
- assert ray_df_equals_pandas(ray_simple.drop(5, errors='ignore'), simple)
- assert ray_df_equals_pandas(
- ray_simple.drop([0, 5], errors='ignore'), simple.loc[[1, 2, 3], :])
+ assert ray_df_equals_pandas(ray_simple.drop(5, errors="ignore"), simple)
assert ray_df_equals_pandas(
- ray_simple.drop('C', axis=1, errors='ignore'), simple)
+ ray_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]
+ )
+ assert ray_df_equals_pandas(ray_simple.drop("C", axis=1, errors="ignore"), simple)
assert ray_df_equals_pandas(
- ray_simple.drop(['A', 'C'], axis=1, errors='ignore'), simple[['B']])
+ ray_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]
+ )
# non-unique
nu_df = pandas.DataFrame(
- pandas.compat.lzip(range(3), range(-3, 1), list('abc')),
- columns=['a', 'a', 'b'])
+ pandas.compat.lzip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"]
+ )
ray_nu_df = pd.DataFrame(nu_df)
- assert ray_df_equals_pandas(ray_nu_df.drop('a', axis=1), nu_df[['b']])
- assert ray_df_equals_pandas(
- ray_nu_df.drop('b', axis='columns'), nu_df['a'])
+ assert ray_df_equals_pandas(ray_nu_df.drop("a", axis=1), nu_df[["b"]])
+ assert ray_df_equals_pandas(ray_nu_df.drop("b", axis="columns"), nu_df["a"])
assert ray_df_equals_pandas(ray_nu_df.drop([]), nu_df)
- nu_df = nu_df.set_index(pandas.Index(['X', 'Y', 'X']))
- nu_df.columns = list('abc')
+ nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"]))
+ nu_df.columns = list("abc")
ray_nu_df = pd.DataFrame(nu_df)
- assert ray_df_equals_pandas(
- ray_nu_df.drop('X', axis='rows'), nu_df.loc[["Y"], :])
- assert ray_df_equals_pandas(
- ray_nu_df.drop(['X', 'Y'], axis=0), nu_df.loc[[], :])
+ assert ray_df_equals_pandas(ray_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
+ assert ray_df_equals_pandas(ray_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])
# inplace cache issue
frame_data = np.random.randn(10, 3)
- df = pandas.DataFrame(frame_data, columns=list('abc'))
- ray_df = pd.DataFrame(frame_data, columns=list('abc'))
+ df = pandas.DataFrame(frame_data, columns=list("abc"))
+ ray_df = pd.DataFrame(frame_data, columns=list("abc"))
expected = df[~(df.b > 0)]
ray_df.drop(labels=df[df.b > 0].index, inplace=True)
assert ray_df_equals_pandas(ray_df, expected)
@@ -1367,34 +1361,33 @@ def test_drop_api_equivalence():
# equivalence of the labels/axis and index/columns API's
frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]
- ray_df = pd.DataFrame(
- frame_data, index=['a', 'b', 'c'], columns=['d', 'e', 'f'])
+ ray_df = pd.DataFrame(frame_data, index=["a", "b", "c"], columns=["d", "e", "f"])
- ray_df1 = ray_df.drop('a')
- ray_df2 = ray_df.drop(index='a')
+ ray_df1 = ray_df.drop("a")
+ ray_df2 = ray_df.drop(index="a")
assert ray_df_equals(ray_df1, ray_df2)
- ray_df1 = ray_df.drop('d', 1)
- ray_df2 = ray_df.drop(columns='d')
+ ray_df1 = ray_df.drop("d", 1)
+ ray_df2 = ray_df.drop(columns="d")
assert ray_df_equals(ray_df1, ray_df2)
- ray_df1 = ray_df.drop(labels='e', axis=1)
- ray_df2 = ray_df.drop(columns='e')
+ ray_df1 = ray_df.drop(labels="e", axis=1)
+ ray_df2 = ray_df.drop(columns="e")
assert ray_df_equals(ray_df1, ray_df2)
- ray_df1 = ray_df.drop(['a'], axis=0)
- ray_df2 = ray_df.drop(index=['a'])
+ ray_df1 = ray_df.drop(["a"], axis=0)
+ ray_df2 = ray_df.drop(index=["a"])
assert ray_df_equals(ray_df1, ray_df2)
- ray_df1 = ray_df.drop(['a'], axis=0).drop(['d'], axis=1)
- ray_df2 = ray_df.drop(index=['a'], columns=['d'])
+ ray_df1 = ray_df.drop(["a"], axis=0).drop(["d"], axis=1)
+ ray_df2 = ray_df.drop(index=["a"], columns=["d"])
assert ray_df_equals(ray_df1, ray_df2)
with pytest.raises(ValueError):
- ray_df.drop(labels='a', index='b')
+ ray_df.drop(labels="a", index="b")
with pytest.raises(ValueError):
- ray_df.drop(labels='a', columns='b')
+ ray_df.drop(labels="a", columns="b")
with pytest.raises(ValueError):
ray_df.drop(axis=1)
@@ -1410,16 +1403,18 @@ def test_drop_duplicates():
@pytest.fixture
def test_dropna(ray_df, pd_df):
assert ray_df_equals_pandas(
- ray_df.dropna(axis=1, how='all'), pd_df.dropna(axis=1, how='all'))
+ ray_df.dropna(axis=1, how="all"), pd_df.dropna(axis=1, how="all")
+ )
assert ray_df_equals_pandas(
- ray_df.dropna(axis=1, how='any'), pd_df.dropna(axis=1, how='any'))
+ ray_df.dropna(axis=1, how="any"), pd_df.dropna(axis=1, how="any")
+ )
assert ray_df_equals_pandas(
- ray_df.dropna(axis=0, how='all'), pd_df.dropna(axis=0, how='all'))
+ ray_df.dropna(axis=0, how="all"), pd_df.dropna(axis=0, how="all")
+ )
- assert ray_df_equals_pandas(
- ray_df.dropna(thresh=2), pd_df.dropna(thresh=2))
+ assert ray_df_equals_pandas(ray_df.dropna(thresh=2), pd_df.dropna(thresh=2))
@pytest.fixture
@@ -1432,8 +1427,8 @@ def test_dropna_inplace(ray_df, pd_df):
assert ray_df_equals_pandas(ray_df, pd_df)
- ray_df.dropna(axis=1, how='any', inplace=True)
- pd_df.dropna(axis=1, how='any', inplace=True)
+ ray_df.dropna(axis=1, how="any", inplace=True)
+ pd_df.dropna(axis=1, how="any", inplace=True)
assert ray_df_equals_pandas(ray_df, pd_df)
@@ -1441,11 +1436,11 @@ def test_dropna_inplace(ray_df, pd_df):
@pytest.fixture
def test_dropna_multiple_axes(ray_df, pd_df):
assert ray_df_equals_pandas(
- ray_df.dropna(how='all', axis=[0, 1]),
- pd_df.dropna(how='all', axis=[0, 1]))
+ ray_df.dropna(how="all", axis=[0, 1]), pd_df.dropna(how="all", axis=[0, 1])
+ )
assert ray_df_equals_pandas(
- ray_df.dropna(how='all', axis=(0, 1)),
- pd_df.dropna(how='all', axis=(0, 1)))
+ ray_df.dropna(how="all", axis=(0, 1)), pd_df.dropna(how="all", axis=(0, 1))
+ )
@pytest.fixture
@@ -1453,16 +1448,16 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df):
ray_df_copy = ray_df.copy()
pd_df_copy = pd_df.copy()
- ray_df_copy.dropna(how='all', axis=[0, 1], inplace=True)
- pd_df_copy.dropna(how='all', axis=[0, 1], inplace=True)
+ ray_df_copy.dropna(how="all", axis=[0, 1], inplace=True)
+ pd_df_copy.dropna(how="all", axis=[0, 1], inplace=True)
assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
ray_df_copy = ray_df.copy()
pd_df_copy = pd_df.copy()
- ray_df_copy.dropna(how='all', axis=(0, 1), inplace=True)
- pd_df_copy.dropna(how='all', axis=(0, 1), inplace=True)
+ ray_df_copy.dropna(how="all", axis=(0, 1), inplace=True)
+ pd_df_copy.dropna(how="all", axis=(0, 1), inplace=True)
assert ray_df_equals_pandas(ray_df_copy, pd_df_copy)
@@ -1471,27 +1466,31 @@ def test_dropna_multiple_axes_inplace(ray_df, pd_df):
def test_dropna_subset(ray_df, pd_df, column_subsets, row_subsets):
for subset in column_subsets:
assert ray_df_equals_pandas(
- ray_df.dropna(how='all', subset=subset),
- pd_df.dropna(how='all', subset=subset))
+ ray_df.dropna(how="all", subset=subset),
+ pd_df.dropna(how="all", subset=subset),
+ )
assert ray_df_equals_pandas(
- ray_df.dropna(how='any', subset=subset),
- pd_df.dropna(how='any', subset=subset))
+ ray_df.dropna(how="any", subset=subset),
+ pd_df.dropna(how="any", subset=subset),
+ )
for subset in row_subsets:
assert ray_df_equals_pandas(
- ray_df.dropna(how='all', axis=1, subset=subset),
- pd_df.dropna(how='all', axis=1, subset=subset))
+ ray_df.dropna(how="all", axis=1, subset=subset),
+ pd_df.dropna(how="all", axis=1, subset=subset),
+ )
assert ray_df_equals_pandas(
- ray_df.dropna(how='any', axis=1, subset=subset),
- pd_df.dropna(how='any', axis=1, subset=subset))
+ ray_df.dropna(how="any", axis=1, subset=subset),
+ pd_df.dropna(how="any", axis=1, subset=subset),
+ )
@pytest.fixture
def test_dropna_subset_error(ray_df):
with pytest.raises(KeyError):
- ray_df.dropna(subset=list('EF'))
+ ray_df.dropna(subset=list("EF"))
with pytest.raises(KeyError):
ray_df.dropna(axis=1, subset=[4, 5])
@@ -1509,13 +1508,13 @@ def test_eq():
def test_equals():
- frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 4, 1]}
+ frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]}
ray_df1 = pd.DataFrame(frame_data)
ray_df2 = pd.DataFrame(frame_data)
assert ray_df1.equals(ray_df2)
- frame_data = {'col1': [2.9, 3, 3, 3], 'col2': [2, 3, 5, 1]}
+ frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]}
ray_df3 = pd.DataFrame(frame_data)
assert not ray_df3.equals(ray_df1)
@@ -1523,49 +1522,37 @@ def test_equals():
def test_eval_df_use_case():
- frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
+ frame_data = {"a": np.random.randn(10), "b": np.random.randn(10)}
df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
# test eval for series results
- tmp_pandas = df.eval(
- "arctan2(sin(a), b)", engine='python', parser='pandas')
- tmp_ray = ray_df.eval(
- "arctan2(sin(a), b)", engine='python', parser='pandas')
+ tmp_pandas = df.eval("arctan2(sin(a), b)", engine="python", parser="pandas")
+ tmp_ray = ray_df.eval("arctan2(sin(a), b)", engine="python", parser="pandas")
assert isinstance(tmp_ray, pandas.Series)
assert ray_series_equals_pandas(tmp_ray, tmp_pandas)
# Test not inplace assignments
- tmp_pandas = df.eval(
- "e = arctan2(sin(a), b)", engine='python', parser='pandas')
- tmp_ray = ray_df.eval(
- "e = arctan2(sin(a), b)", engine='python', parser='pandas')
+ tmp_pandas = df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas")
+ tmp_ray = ray_df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas")
assert ray_df_equals_pandas(tmp_ray, tmp_pandas)
# Test inplace assignments
- df.eval(
- "e = arctan2(sin(a), b)",
- engine='python',
- parser='pandas',
- inplace=True)
+ df.eval("e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True)
ray_df.eval(
- "e = arctan2(sin(a), b)",
- engine='python',
- parser='pandas',
- inplace=True)
+ "e = arctan2(sin(a), b)", engine="python", parser="pandas", inplace=True
+ )
# TODO: Use a series equality validator.
assert ray_df_equals_pandas(ray_df, df)
def test_eval_df_arithmetic_subexpression():
- frame_data = {'a': np.random.randn(10), 'b': np.random.randn(10)}
+ frame_data = {"a": np.random.randn(10), "b": np.random.randn(10)}
df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
- df.eval(
- "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True)
- ray_df.eval(
- "not_e = sin(a + b)", engine='python', parser='pandas', inplace=True)
+ df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True)
+ ray_df.eval("not_e = sin(a + b)", engine="python", parser="pandas", inplace=True)
# TODO: Use a series equality validator.
assert ray_df_equals_pandas(ray_df, df)
@@ -1587,8 +1574,8 @@ def test_expanding():
@pytest.fixture
def test_ffill():
test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
+ test_data.tsframe["A"][:5] = np.nan
+ test_data.tsframe["A"][-5:] = np.nan
ray_df = pd.DataFrame(test_data.tsframe)
assert ray_df_equals_pandas(ray_df.ffill(), test_data.tsframe.ffill())
@@ -1621,44 +1608,44 @@ def test_fillna():
def test_fillna_sanity():
test_data = TestData()
tf = test_data.tsframe
- tf.loc[tf.index[:5], 'A'] = np.nan
- tf.loc[tf.index[-5:], 'A'] = np.nan
+ tf.loc[tf.index[:5], "A"] = np.nan
+ tf.loc[tf.index[-5:], "A"] = np.nan
zero_filled = test_data.tsframe.fillna(0)
ray_df = pd.DataFrame(test_data.tsframe).fillna(0)
assert ray_df_equals_pandas(ray_df, zero_filled)
- padded = test_data.tsframe.fillna(method='pad')
- ray_df = pd.DataFrame(test_data.tsframe).fillna(method='pad')
+ padded = test_data.tsframe.fillna(method="pad")
+ ray_df = pd.DataFrame(test_data.tsframe).fillna(method="pad")
assert ray_df_equals_pandas(ray_df, padded)
# mixed type
mf = test_data.mixed_frame
- mf.loc[mf.index[5:20], 'foo'] = np.nan
- mf.loc[mf.index[-10:], 'A'] = np.nan
+ mf.loc[mf.index[5:20], "foo"] = np.nan
+ mf.loc[mf.index[-10:], "A"] = np.nan
result = test_data.mixed_frame.fillna(value=0)
ray_df = pd.DataFrame(test_data.mixed_frame).fillna(value=0)
assert ray_df_equals_pandas(ray_df, result)
- result = test_data.mixed_frame.fillna(method='pad')
- ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method='pad')
+ result = test_data.mixed_frame.fillna(method="pad")
+ ray_df = pd.DataFrame(test_data.mixed_frame).fillna(method="pad")
assert ray_df_equals_pandas(ray_df, result)
pytest.raises(ValueError, test_data.tsframe.fillna)
pytest.raises(ValueError, pd.DataFrame(test_data.tsframe).fillna)
with pytest.raises(ValueError):
- pd.DataFrame(test_data.tsframe).fillna(5, method='ffill')
+ pd.DataFrame(test_data.tsframe).fillna(5, method="ffill")
# mixed numeric (but no float16)
- mf = test_data.mixed_float.reindex(columns=['A', 'B', 'D'])
- mf.loc[mf.index[-10:], 'A'] = np.nan
+ mf = test_data.mixed_float.reindex(columns=["A", "B", "D"])
+ mf.loc[mf.index[-10:], "A"] = np.nan
result = mf.fillna(value=0)
ray_df = pd.DataFrame(mf).fillna(value=0)
assert ray_df_equals_pandas(ray_df, result)
- result = mf.fillna(method='pad')
- ray_df = pd.DataFrame(mf).fillna(method='pad')
+ result = mf.fillna(method="pad")
+ ray_df = pd.DataFrame(mf).fillna(method="pad")
assert ray_df_equals_pandas(ray_df, result)
# TODO: Use this when Arrow issue resolves:
@@ -1670,27 +1657,30 @@ def test_fillna_sanity():
# df.x.fillna(method=m)
# with different dtype
- frame_data = [['a', 'a', np.nan, 'a'], ['b', 'b', np.nan, 'b'],
- ['c', 'c', np.nan, 'c']]
+ frame_data = [
+ ["a", "a", np.nan, "a"],
+ ["b", "b", np.nan, "b"],
+ ["c", "c", np.nan, "c"],
+ ]
df = pandas.DataFrame(frame_data)
- result = df.fillna({2: 'foo'})
- ray_df = pd.DataFrame(frame_data).fillna({2: 'foo'})
+ result = df.fillna({2: "foo"})
+ ray_df = pd.DataFrame(frame_data).fillna({2: "foo"})
assert ray_df_equals_pandas(ray_df, result)
ray_df = pd.DataFrame(df)
- df.fillna({2: 'foo'}, inplace=True)
- ray_df.fillna({2: 'foo'}, inplace=True)
+ df.fillna({2: "foo"}, inplace=True)
+ ray_df.fillna({2: "foo"}, inplace=True)
assert ray_df_equals_pandas(ray_df, result)
frame_data = {
- 'Date': [pandas.NaT, pandas.Timestamp("2014-1-1")],
- 'Date2': [pandas.Timestamp("2013-1-1"), pandas.NaT]
+ "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")],
+ "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT],
}
df = pandas.DataFrame(frame_data)
- result = df.fillna(value={'Date': df['Date2']})
- ray_df = pd.DataFrame(frame_data).fillna(value={'Date': df['Date2']})
+ result = df.fillna(value={"Date": df["Date2"]})
+ ray_df = pd.DataFrame(frame_data).fillna(value={"Date": df["Date2"]})
assert ray_df_equals_pandas(ray_df, result)
# TODO: Use this when Arrow issue resolves:
@@ -1715,39 +1705,39 @@ def test_fillna_sanity():
@pytest.fixture
def test_fillna_downcast():
# infer int64 from float64
- frame_data = {'a': [1., np.nan]}
+ frame_data = {"a": [1., np.nan]}
df = pandas.DataFrame(frame_data)
- result = df.fillna(0, downcast='infer')
- ray_df = pd.DataFrame(frame_data).fillna(0, downcast='infer')
+ result = df.fillna(0, downcast="infer")
+ ray_df = pd.DataFrame(frame_data).fillna(0, downcast="infer")
assert ray_df_equals_pandas(ray_df, result)
# infer int64 from float64 when fillna value is a dict
df = pandas.DataFrame(frame_data)
- result = df.fillna({'a': 0}, downcast='infer')
- ray_df = pd.DataFrame(frame_data).fillna({'a': 0}, downcast='infer')
+ result = df.fillna({"a": 0}, downcast="infer")
+ ray_df = pd.DataFrame(frame_data).fillna({"a": 0}, downcast="infer")
assert ray_df_equals_pandas(ray_df, result)
@pytest.fixture
def test_ffill2():
test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
+ test_data.tsframe["A"][:5] = np.nan
+ test_data.tsframe["A"][-5:] = np.nan
ray_df = pd.DataFrame(test_data.tsframe)
assert ray_df_equals_pandas(
- ray_df.fillna(method='ffill'),
- test_data.tsframe.fillna(method='ffill'))
+ ray_df.fillna(method="ffill"), test_data.tsframe.fillna(method="ffill")
+ )
@pytest.fixture
def test_bfill2():
test_data = TestData()
- test_data.tsframe['A'][:5] = np.nan
- test_data.tsframe['A'][-5:] = np.nan
+ test_data.tsframe["A"][:5] = np.nan
+ test_data.tsframe["A"][-5:] = np.nan
ray_df = pd.DataFrame(test_data.tsframe)
assert ray_df_equals_pandas(
- ray_df.fillna(method='bfill'),
- test_data.tsframe.fillna(method='bfill'))
+ ray_df.fillna(method="bfill"), test_data.tsframe.fillna(method="bfill")
+ )
@pytest.fixture
@@ -1770,11 +1760,11 @@ def test_fillna_inplace():
df[1][:4] = np.nan
df[3][-4:] = np.nan
ray_df = pd.DataFrame(df)
- df.fillna(method='ffill', inplace=True)
+ df.fillna(method="ffill", inplace=True)
assert not ray_df_equals_pandas(ray_df, df)
- ray_df.fillna(method='ffill', inplace=True)
+ ray_df.fillna(method="ffill", inplace=True)
assert ray_df_equals_pandas(ray_df, df)
@@ -1785,15 +1775,14 @@ def test_frame_fillna_limit():
df = pandas.DataFrame(frame_data, index=index)
expected = df[:2].reindex(index)
- expected = expected.fillna(method='pad', limit=5)
+ expected = expected.fillna(method="pad", limit=5)
- ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method='pad', limit=5)
+ ray_df = pd.DataFrame(df[:2].reindex(index)).fillna(method="pad", limit=5)
assert ray_df_equals_pandas(ray_df, expected)
expected = df[-2:].reindex(index)
- expected = expected.fillna(method='backfill', limit=5)
- ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(
- method='backfill', limit=5)
+ expected = expected.fillna(method="backfill", limit=5)
+ ray_df = pd.DataFrame(df[-2:].reindex(index)).fillna(method="backfill", limit=5)
assert ray_df_equals_pandas(ray_df, expected)
@@ -1806,27 +1795,28 @@ def test_frame_pad_backfill_limit():
result = df[:2].reindex(index)
ray_df = pd.DataFrame(result)
assert ray_df_equals_pandas(
- ray_df.fillna(method='pad', limit=5),
- result.fillna(method='pad', limit=5))
+ ray_df.fillna(method="pad", limit=5), result.fillna(method="pad", limit=5)
+ )
result = df[-2:].reindex(index)
ray_df = pd.DataFrame(result)
assert ray_df_equals_pandas(
- ray_df.fillna(method='backfill', limit=5),
- result.fillna(method='backfill', limit=5))
+ ray_df.fillna(method="backfill", limit=5),
+ result.fillna(method="backfill", limit=5),
+ )
@pytest.fixture
def test_fillna_dtype_conversion():
# make sure that fillna on an empty frame works
- df = pandas.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64')
- ray_df = pd.DataFrame(index=range(3), columns=['A', 'B'], dtype='float64')
- assert ray_df_equals_pandas(ray_df.fillna('nan'), df.fillna('nan'))
+ df = pandas.DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
+ ray_df = pd.DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
+ assert ray_df_equals_pandas(ray_df.fillna("nan"), df.fillna("nan"))
- frame_data = {'A': [1, np.nan], 'B': [1., 2.]}
+ frame_data = {"A": [1, np.nan], "B": [1., 2.]}
df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
- for v in ['', 1, np.nan, 1.0]:
+ for v in ["", 1, np.nan, 1.0]:
assert ray_df_equals_pandas(ray_df.fillna(v), df.fillna(v))
@@ -1844,32 +1834,20 @@ def test_fillna_skip_certain_blocks():
@pytest.fixture
def test_fillna_dict_series():
frame_data = {
- 'a': [np.nan, 1, 2, np.nan, np.nan],
- 'b': [1, 2, 3, np.nan, np.nan],
- 'c': [np.nan, 1, 2, 3, 4]
+ "a": [np.nan, 1, 2, np.nan, np.nan],
+ "b": [1, 2, 3, np.nan, np.nan],
+ "c": [np.nan, 1, 2, 3, 4],
}
df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
assert ray_df_equals_pandas(
- ray_df.fillna({
- 'a': 0,
- 'b': 5
- }), df.fillna({
- 'a': 0,
- 'b': 5
- }))
+ ray_df.fillna({"a": 0, "b": 5}), df.fillna({"a": 0, "b": 5})
+ )
assert ray_df_equals_pandas(
- ray_df.fillna({
- 'a': 0,
- 'b': 5,
- 'd': 7
- }), df.fillna({
- 'a': 0,
- 'b': 5,
- 'd': 7
- }))
+ ray_df.fillna({"a": 0, "b": 5, "d": 7}), df.fillna({"a": 0, "b": 5, "d": 7})
+ )
# Series treated same as dict
assert ray_df_equals_pandas(ray_df.fillna(df.max()), df.fillna(df.max()))
@@ -1878,20 +1856,18 @@ def test_fillna_dict_series():
@pytest.fixture
def test_fillna_dataframe():
frame_data = {
- 'a': [np.nan, 1, 2, np.nan, np.nan],
- 'b': [1, 2, 3, np.nan, np.nan],
- 'c': [np.nan, 1, 2, 3, 4]
+ "a": [np.nan, 1, 2, np.nan, np.nan],
+ "b": [1, 2, 3, np.nan, np.nan],
+ "c": [np.nan, 1, 2, 3, 4],
}
- df = pandas.DataFrame(frame_data, index=list('VWXYZ'))
- ray_df = pd.DataFrame(frame_data, index=list('VWXYZ'))
+ df = pandas.DataFrame(frame_data, index=list("VWXYZ"))
+ ray_df = pd.DataFrame(frame_data, index=list("VWXYZ"))
# df2 may have different index and columns
- df2 = pandas.DataFrame({
- 'a': [np.nan, 10, 20, 30, 40],
- 'b': [50, 60, 70, 80, 90],
- 'foo': ['bar'] * 5
- },
- index=list('VWXuZ'))
+ df2 = pandas.DataFrame(
+ {"a": [np.nan, 10, 20, 30, 40], "b": [50, 60, 70, 80, 90], "foo": ["bar"] * 5},
+ index=list("VWXuZ"),
+ )
# only those columns and indices which are shared get filled
assert ray_df_equals_pandas(ray_df.fillna(df2), df.fillna(df2))
@@ -1905,22 +1881,22 @@ def test_fillna_columns():
ray_df = pd.DataFrame(df)
assert ray_df_equals_pandas(
- ray_df.fillna(method='ffill', axis=1), df.fillna(
- method='ffill', axis=1))
+ ray_df.fillna(method="ffill", axis=1), df.fillna(method="ffill", axis=1)
+ )
- df.insert(6, 'foo', 5)
+ df.insert(6, "foo", 5)
ray_df = pd.DataFrame(df)
assert ray_df_equals_pandas(
- ray_df.fillna(method='ffill', axis=1), df.fillna(
- method='ffill', axis=1))
+ ray_df.fillna(method="ffill", axis=1), df.fillna(method="ffill", axis=1)
+ )
@pytest.fixture
def test_fillna_invalid_method():
test_data = TestData()
ray_df = pd.DataFrame(test_data.frame)
- with tm.assert_raises_regex(ValueError, 'ffil'):
- ray_df.fillna(method='ffil')
+ with tm.assert_raises_regex(ValueError, "ffil"):
+ ray_df.fillna(method="ffil")
@pytest.fixture
@@ -1942,7 +1918,8 @@ def test_fillna_col_reordering():
df = pandas.DataFrame(index=range(20), columns=cols, data=data)
ray_df = pd.DataFrame(index=range(20), columns=cols, data=data)
assert ray_df_equals_pandas(
- ray_df.fillna(method='ffill'), df.fillna(method='ffill'))
+ ray_df.fillna(method="ffill"), df.fillna(method="ffill")
+ )
"""
@@ -1972,13 +1949,16 @@ def test_fillna_datetime_columns():
@pytest.fixture
def test_filter(ray_df, pandas_df, by):
assert ray_df_equals_pandas(
- ray_df.filter(items=by['items']), pandas_df.filter(items=by['items']))
+ ray_df.filter(items=by["items"]), pandas_df.filter(items=by["items"])
+ )
assert ray_df_equals_pandas(
- ray_df.filter(regex=by['regex']), pandas_df.filter(regex=by['regex']))
+ ray_df.filter(regex=by["regex"]), pandas_df.filter(regex=by["regex"])
+ )
assert ray_df_equals_pandas(
- ray_df.filter(like=by['like']), pandas_df.filter(like=by['like']))
+ ray_df.filter(like=by["like"]), pandas_df.filter(like=by["like"])
+ )
def test_first():
@@ -2053,14 +2033,12 @@ def test_hist():
@pytest.fixture
def test_idxmax(ray_df, pandas_df):
- assert \
- ray_df.idxmax().equals(pandas_df.idxmax())
+ assert ray_df.idxmax().equals(pandas_df.idxmax())
@pytest.fixture
def test_idxmin(ray_df, pandas_df):
- assert \
- ray_df.idxmin().equals(pandas_df.idxmin())
+ assert ray_df.idxmin().equals(pandas_df.idxmin())
def test_infer_objects():
@@ -2071,24 +2049,26 @@ def test_infer_objects():
def test_info():
- ray_df = pd.DataFrame({
- 'col1': [1, 2, 3, np.nan],
- 'col2': [4, 5, np.nan, 7],
- 'col3': [8, np.nan, 10, 11],
- 'col4': [np.nan, 13, 14, 15]
- })
- ray_df.info(memory_usage='deep')
+ ray_df = pd.DataFrame(
+ {
+ "col1": [1, 2, 3, np.nan],
+ "col2": [4, 5, np.nan, 7],
+ "col3": [8, np.nan, 10, 11],
+ "col4": [np.nan, 13, 14, 15],
+ }
+ )
+ ray_df.info(memory_usage="deep")
with io.StringIO() as buf:
ray_df.info(buf=buf)
info_string = buf.getvalue()
- assert '\n' in info_string
- assert 'memory usage: ' in info_string
- assert 'Data columns (total 4 columns):' in info_string
+ assert "\n" in info_string
+ assert "memory usage: " in info_string
+ assert "Data columns (total 4 columns):" in info_string
with io.StringIO() as buf:
ray_df.info(buf=buf, verbose=False, memory_usage=False)
info_string = buf.getvalue()
- assert 'memory usage: ' not in info_string
- assert 'Columns: 4 entries, col1 to col4' in info_string
+ assert "memory usage: " not in info_string
+ assert "Columns: 4 entries, col1 to col4" in info_string
@pytest.fixture
@@ -2150,7 +2130,7 @@ def test_itertuples(ray_df, pandas_df):
# test all combinations of custom params
indices = [True, False]
- names = [None, 'NotPandas', 'Pandas']
+ names = [None, "NotPandas", "Pandas"]
for index in indices:
for name in names:
@@ -2165,7 +2145,7 @@ def test_join():
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
@@ -2255,8 +2235,8 @@ def test_max(ray_df, pandas_df):
# We pass in numeric_only because
# https://github.com/modin-project/modin/issues/83
assert ray_series_equals_pandas(
- ray_df.max(axis=1, numeric_only=True),
- pandas_df.max(axis=1, numeric_only=True))
+ ray_df.max(axis=1, numeric_only=True), pandas_df.max(axis=1, numeric_only=True)
+ )
@pytest.fixture
@@ -2279,9 +2259,8 @@ def test_melt():
def test_memory_usage():
ray_df = create_test_dataframe()
assert type(ray_df.memory_usage()) is pandas.core.series.Series
- assert ray_df.memory_usage(index=True).at['Index'] is not None
- assert ray_df.memory_usage(deep=True).sum() >= \
- ray_df.memory_usage(deep=False).sum()
+ assert ray_df.memory_usage(index=True).at["Index"] is not None
+ assert ray_df.memory_usage(deep=True).sum() >= ray_df.memory_usage(deep=False).sum()
def test_merge():
@@ -2289,7 +2268,7 @@ def test_merge():
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 0, 1],
- "col4": [2, 4, 5, 6]
+ "col4": [2, 4, 5, 6],
}
ray_df = pd.DataFrame(frame_data)
@@ -2309,37 +2288,41 @@ def test_merge():
# left_on and right_index
ray_result = ray_df.merge(
- ray_df2, how=how, left_on='col1', right_index=True)
+ ray_df2, how=how, left_on="col1", right_index=True
+ )
pandas_result = pandas_df.merge(
- pandas_df2, how=how, left_on='col1', right_index=True)
+ pandas_df2, how=how, left_on="col1", right_index=True
+ )
ray_df_equals_pandas(ray_result, pandas_result)
# left_index and right_on
ray_result = ray_df.merge(
- ray_df2, how=how, left_index=True, right_on='col1')
+ ray_df2, how=how, left_index=True, right_on="col1"
+ )
pandas_result = pandas_df.merge(
- pandas_df2, how=how, left_index=True, right_on='col1')
+ pandas_df2, how=how, left_index=True, right_on="col1"
+ )
ray_df_equals_pandas(ray_result, pandas_result)
# left_on and right_on col1
- ray_result = ray_df.merge(
- ray_df2, how=how, left_on='col1', right_on='col1')
+ ray_result = ray_df.merge(ray_df2, how=how, left_on="col1", right_on="col1")
pandas_result = pandas_df.merge(
- pandas_df2, how=how, left_on='col1', right_on='col1')
+ pandas_df2, how=how, left_on="col1", right_on="col1"
+ )
ray_df_equals_pandas(ray_result, pandas_result)
# left_on and right_on col2
- ray_result = ray_df.merge(
- ray_df2, how=how, left_on='col2', right_on='col2')
+ ray_result = ray_df.merge(ray_df2, how=how, left_on="col2", right_on="col2")
pandas_result = pandas_df.merge(
- pandas_df2, how=how, left_on='col2', right_on='col2')
+ pandas_df2, how=how, left_on="col2", right_on="col2"
+ )
ray_df_equals_pandas(ray_result, pandas_result)
# left_index and right_index
- ray_result = ray_df.merge(
- ray_df2, how=how, left_index=True, right_index=True)
+ ray_result = ray_df.merge(ray_df2, how=how, left_index=True, right_index=True)
pandas_result = pandas_df.merge(
- pandas_df2, how=how, left_index=True, right_index=True)
+ pandas_df2, how=how, left_index=True, right_index=True
+ )
ray_df_equals_pandas(ray_result, pandas_result)
@@ -2356,8 +2339,7 @@ def test_mod():
@pytest.fixture
def test_mode(ray_df, pandas_df):
assert ray_series_equals_pandas(ray_df.mode(), pandas_df.mode())
- assert ray_series_equals_pandas(
- ray_df.mode(axis=1), pandas_df.mode(axis=1))
+ assert ray_series_equals_pandas(ray_df.mode(axis=1), pandas_df.mode(axis=1))
def test_mul():
@@ -2399,8 +2381,7 @@ def test_nsmallest():
@pytest.fixture
def test_nunique(ray_df, pandas_df):
assert ray_df_equals_pandas(ray_df.nunique(), pandas_df.nunique())
- assert ray_df_equals_pandas(
- ray_df.nunique(axis=1), pandas_df.nunique(axis=1))
+ assert ray_df_equals_pandas(ray_df.nunique(axis=1), pandas_df.nunique(axis=1))
def test_pct_change():
@@ -2429,11 +2410,13 @@ def f(x, arg2=0, arg3=0):
assert ray_df_equals(
f(g(h(ray_df), arg1=a), arg2=b, arg3=c),
- (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)))
+ (ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)),
+ )
assert ray_df_equals_pandas(
(ray_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)),
- (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)))
+ (pandas_df.pipe(h).pipe(g, arg1=a).pipe(f, arg2=b, arg3=c)),
+ )
def test_pivot():
@@ -2461,8 +2444,8 @@ def test_plot():
def test_pop(ray_df, pandas_df):
temp_ray_df = ray_df.copy()
temp_pandas_df = pandas_df.copy()
- ray_popped = temp_ray_df.pop('col2')
- pandas_popped = temp_pandas_df.pop('col2')
+ ray_popped = temp_ray_df.pop("col2")
+ pandas_popped = temp_pandas_df.pop("col2")
assert ray_popped.equals(pandas_popped)
assert ray_df_equals_pandas(temp_ray_df, temp_pandas_df)
@@ -2509,34 +2492,35 @@ def test_rdiv():
def test_reindex():
frame_data = {
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
}
pandas_df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
assert ray_df_equals_pandas(
- ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1]))
+ ray_df.reindex([0, 3, 2, 1]), pandas_df.reindex([0, 3, 2, 1])
+ )
- assert ray_df_equals_pandas(
- ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2]))
+ assert ray_df_equals_pandas(ray_df.reindex([0, 6, 2]), pandas_df.reindex([0, 6, 2]))
assert ray_df_equals_pandas(
- ray_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1),
- pandas_df.reindex(['col1', 'col3', 'col4', 'col2'], axis=1))
+ ray_df.reindex(["col1", "col3", "col4", "col2"], axis=1),
+ pandas_df.reindex(["col1", "col3", "col4", "col2"], axis=1),
+ )
assert ray_df_equals_pandas(
- ray_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1),
- pandas_df.reindex(['col1', 'col7', 'col4', 'col8'], axis=1))
+ ray_df.reindex(["col1", "col7", "col4", "col8"], axis=1),
+ pandas_df.reindex(["col1", "col7", "col4", "col8"], axis=1),
+ )
assert ray_df_equals_pandas(
- ray_df.reindex(
- index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8']),
- pandas_df.reindex(
- index=[0, 1, 5], columns=['col1', 'col7', 'col4', 'col8']))
+ ray_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
+ pandas_df.reindex(index=[0, 1, 5], columns=["col1", "col7", "col4", "col8"]),
+ )
def test_reindex_axis():
@@ -2568,55 +2552,46 @@ def test_rename():
@pytest.fixture
def test_rename_sanity():
test_data = TestData()
- mapping = {'A': 'a', 'B': 'b', 'C': 'c', 'D': 'd'}
+ mapping = {"A": "a", "B": "b", "C": "c", "D": "d"}
ray_df = pd.DataFrame(test_data.frame)
assert ray_df_equals_pandas(
- ray_df.rename(columns=mapping),
- test_data.frame.rename(columns=mapping))
+ ray_df.rename(columns=mapping), test_data.frame.rename(columns=mapping)
+ )
renamed2 = test_data.frame.rename(columns=str.lower)
assert ray_df_equals_pandas(ray_df.rename(columns=str.lower), renamed2)
ray_df = pd.DataFrame(renamed2)
assert ray_df_equals_pandas(
- ray_df.rename(columns=str.upper), renamed2.rename(columns=str.upper))
+ ray_df.rename(columns=str.upper), renamed2.rename(columns=str.upper)
+ )
# index
- data = {'A': {'foo': 0, 'bar': 1}}
+ data = {"A": {"foo": 0, "bar": 1}}
# gets sorted alphabetical
df = pandas.DataFrame(data)
ray_df = pd.DataFrame(data)
tm.assert_index_equal(
- ray_df.rename(index={
- 'foo': 'bar',
- 'bar': 'foo'
- }).index,
- df.rename(index={
- 'foo': 'bar',
- 'bar': 'foo'
- }).index)
+ ray_df.rename(index={"foo": "bar", "bar": "foo"}).index,
+ df.rename(index={"foo": "bar", "bar": "foo"}).index,
+ )
tm.assert_index_equal(
- ray_df.rename(index=str.upper).index,
- df.rename(index=str.upper).index)
+ ray_df.rename(index=str.upper).index, df.rename(index=str.upper).index
+ )
# have to pass something
pytest.raises(TypeError, ray_df.rename)
# partial columns
- renamed = test_data.frame.rename(columns={'C': 'foo', 'D': 'bar'})
+ renamed = test_data.frame.rename(columns={"C": "foo", "D": "bar"})
ray_df = pd.DataFrame(test_data.frame)
tm.assert_index_equal(
- ray_df.rename(columns={
- 'C': 'foo',
- 'D': 'bar'
- }).index,
- test_data.frame.rename(columns={
- 'C': 'foo',
- 'D': 'bar'
- }).index)
+ ray_df.rename(columns={"C": "foo", "D": "bar"}).index,
+ test_data.frame.rename(columns={"C": "foo", "D": "bar"}).index,
+ )
# TODO: Uncomment when transpose works
# other axis
@@ -2626,12 +2601,12 @@ def test_rename_sanity():
# ray_df.T.rename(index={'C': 'foo', 'D': 'bar'}).index)
# index with name
- index = pandas.Index(['foo', 'bar'], name='name')
+ index = pandas.Index(["foo", "bar"], name="name")
renamer = pandas.DataFrame(data, index=index)
ray_df = pd.DataFrame(data, index=index)
- renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'})
- ray_renamed = ray_df.rename(index={'foo': 'bar', 'bar': 'foo'})
+ renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
+ ray_renamed = ray_df.rename(index={"foo": "bar", "bar": "foo"})
tm.assert_index_equal(renamed.index, ray_renamed.index)
assert renamed.index.name == ray_renamed.index.name
@@ -2639,11 +2614,10 @@ def test_rename_sanity():
@pytest.fixture
def test_rename_multiindex():
- tuples_index = [('foo1', 'bar1'), ('foo2', 'bar2')]
- tuples_columns = [('fizz1', 'buzz1'), ('fizz2', 'buzz2')]
- index = pandas.MultiIndex.from_tuples(tuples_index, names=['foo', 'bar'])
- columns = pandas.MultiIndex.from_tuples(
- tuples_columns, names=['fizz', 'buzz'])
+ tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
+ tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
+ index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
+ columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])
frame_data = [(0, 0), (1, 1)]
df = pandas.DataFrame(frame_data, index=index, columns=columns)
@@ -2652,34 +2626,19 @@ def test_rename_multiindex():
#
# without specifying level -> accross all levels
renamed = df.rename(
- index={
- 'foo1': 'foo3',
- 'bar2': 'bar3'
- },
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- })
+ index={"foo1": "foo3", "bar2": "bar3"},
+ columns={"fizz1": "fizz3", "buzz2": "buzz3"},
+ )
ray_renamed = ray_df.rename(
- index={
- 'foo1': 'foo3',
- 'bar2': 'bar3'
- },
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- })
+ index={"foo1": "foo3", "bar2": "bar3"},
+ columns={"fizz1": "fizz3", "buzz2": "buzz3"},
+ )
tm.assert_index_equal(renamed.index, ray_renamed.index)
renamed = df.rename(
- index={
- 'foo1': 'foo3',
- 'bar2': 'bar3'
- },
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- })
+ index={"foo1": "foo3", "bar2": "bar3"},
+ columns={"fizz1": "fizz3", "buzz2": "buzz3"},
+ )
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
assert renamed.index.names == ray_renamed.index.names
assert renamed.columns.names == ray_renamed.columns.names
@@ -2688,42 +2647,22 @@ def test_rename_multiindex():
# with specifying a level
# dict
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=0)
- ray_renamed = ray_df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level=0)
+ renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
+ ray_renamed = ray_df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level='fizz')
+ renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
ray_renamed = ray_df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level='fizz')
+ columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz"
+ )
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}, level=1)
- ray_renamed = ray_df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level=1)
+ renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
+ ray_renamed = ray_df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level='buzz')
+ renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
ray_renamed = ray_df.rename(
- columns={
- 'fizz1': 'fizz3',
- 'buzz2': 'buzz3'
- }, level='buzz')
+ columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz"
+ )
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
# function
@@ -2731,24 +2670,20 @@ def test_rename_multiindex():
renamed = df.rename(columns=func, level=0)
ray_renamed = ray_df.rename(columns=func, level=0)
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns=func, level='fizz')
- ray_renamed = ray_df.rename(columns=func, level='fizz')
+ renamed = df.rename(columns=func, level="fizz")
+ ray_renamed = ray_df.rename(columns=func, level="fizz")
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
renamed = df.rename(columns=func, level=1)
ray_renamed = ray_df.rename(columns=func, level=1)
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
- renamed = df.rename(columns=func, level='buzz')
- ray_renamed = ray_df.rename(columns=func, level='buzz')
+ renamed = df.rename(columns=func, level="buzz")
+ ray_renamed = ray_df.rename(columns=func, level="buzz")
tm.assert_index_equal(renamed.columns, ray_renamed.columns)
# index
- renamed = df.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, level=0)
- ray_renamed = ray_df.rename(
- index={
- 'foo1': 'foo3',
- 'bar2': 'bar3'
- }, level=0)
+ renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
+ ray_renamed = ray_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
tm.assert_index_equal(ray_renamed.index, renamed.index)
@@ -2756,9 +2691,9 @@ def test_rename_multiindex():
def test_rename_nocopy():
test_data = TestData().frame
ray_df = pd.DataFrame(test_data)
- ray_renamed = ray_df.rename(columns={'C': 'foo'}, copy=False)
- ray_renamed['foo'] = 1
- assert (ray_df['C'] == 1).all()
+ ray_renamed = ray_df.rename(columns={"C": "foo"}, copy=False)
+ ray_renamed["foo"] = 1
+ assert (ray_df["C"] == 1).all()
@pytest.fixture
@@ -2767,13 +2702,13 @@ def test_rename_inplace():
ray_df = pd.DataFrame(test_data)
assert ray_df_equals_pandas(
- ray_df.rename(columns={'C': 'foo'}),
- test_data.rename(columns={'C': 'foo'}))
+ ray_df.rename(columns={"C": "foo"}), test_data.rename(columns={"C": "foo"})
+ )
frame = test_data.copy()
ray_frame = ray_df.copy()
- frame.rename(columns={'C': 'foo'}, inplace=True)
- ray_frame.rename(columns={'C': 'foo'}, inplace=True)
+ frame.rename(columns={"C": "foo"}, inplace=True)
+ ray_frame.rename(columns={"C": "foo"}, inplace=True)
assert ray_df_equals_pandas(ray_frame, frame)
@@ -2781,17 +2716,17 @@ def test_rename_inplace():
@pytest.fixture
def test_rename_bug():
# rename set ref_locs, and set_index was not resetting
- frame_data = {0: ['foo', 'bar'], 1: ['bah', 'bas'], 2: [1, 2]}
+ frame_data = {0: ["foo", "bar"], 1: ["bah", "bas"], 2: [1, 2]}
df = pandas.DataFrame(frame_data)
ray_df = pd.DataFrame(frame_data)
- df = df.rename(columns={0: 'a'})
- df = df.rename(columns={1: 'b'})
+ df = df.rename(columns={0: "a"})
+ df = df.rename(columns={1: "b"})
# TODO: Uncomment when set_index is implemented
# df = df.set_index(['a', 'b'])
# df.columns = ['2001-01-01']
- ray_df = ray_df.rename(columns={0: 'a'})
- ray_df = ray_df.rename(columns={1: 'b'})
+ ray_df = ray_df.rename(columns={0: "a"})
+ ray_df = ray_df.rename(columns={1: "b"})
# TODO: Uncomment when set_index is implemented
# ray_df = ray_df.set_index(['a', 'b'])
# ray_df.columns = ['2001-01-01']
@@ -2805,16 +2740,16 @@ def test_rename_axis_inplace():
result = test_frame.copy()
ray_result = ray_df.copy()
- no_return = result.rename_axis('foo', inplace=True)
- ray_no_return = ray_result.rename_axis('foo', inplace=True)
+ no_return = result.rename_axis("foo", inplace=True)
+ ray_no_return = ray_result.rename_axis("foo", inplace=True)
assert no_return is ray_no_return
assert ray_df_equals_pandas(ray_result, result)
result = test_frame.copy()
ray_result = ray_df.copy()
- no_return = result.rename_axis('bar', axis=1, inplace=True)
- ray_no_return = ray_result.rename_axis('bar', axis=1, inplace=True)
+ no_return = result.rename_axis("bar", axis=1, inplace=True)
+ ray_no_return = ray_result.rename_axis("bar", axis=1, inplace=True)
assert no_return is ray_no_return
assert ray_df_equals_pandas(ray_result, result)
@@ -2845,7 +2780,8 @@ def test_resample():
def test_reset_index(ray_df, pandas_df, inplace=False):
if not inplace:
assert to_pandas(ray_df.reset_index(inplace=inplace)).equals(
- pandas_df.reset_index(inplace=inplace))
+ pandas_df.reset_index(inplace=inplace)
+ )
else:
ray_df_cp = ray_df.copy()
pd_df_cp = pandas_df.copy()
@@ -2854,14 +2790,16 @@ def test_reset_index(ray_df, pandas_df, inplace=False):
assert to_pandas(ray_df_cp).equals(pd_df_cp)
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
+@pytest.mark.skip(
+ reason="dtypes on different partitions may not match up, " "no fix for this yet"
+)
def test_rfloordiv():
test_inter_df_math_right_ops("rfloordiv")
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
+@pytest.mark.skip(
+ reason="dtypes on different partitions may not match up, " "no fix for this yet"
+)
def test_rmod():
test_inter_df_math_right_ops("rmod")
@@ -2891,8 +2829,9 @@ def test_rsub():
test_inter_df_math_right_ops("rsub")
-@pytest.mark.skip(reason="dtypes on different partitions may not match up, "
- "no fix for this yet")
+@pytest.mark.skip(
+ reason="dtypes on different partitions may not match up, " "no fix for this yet"
+)
def test_rtruediv():
test_inter_df_math_right_ops("rtruediv")
@@ -2904,10 +2843,11 @@ def test_sample(ray_df, pd_df):
assert ray_df_equals_pandas(
ray_df.sample(frac=0.5, random_state=42),
- pd_df.sample(frac=0.5, random_state=42))
+ pd_df.sample(frac=0.5, random_state=42),
+ )
assert ray_df_equals_pandas(
- ray_df.sample(n=2, random_state=42), pd_df.sample(
- n=2, random_state=42))
+ ray_df.sample(n=2, random_state=42), pd_df.sample(n=2, random_state=42)
+ )
def test_select():
@@ -2919,18 +2859,18 @@ def test_select():
def test_select_dtypes():
frame_data = {
- 'test1': list('abc'),
- 'test2': np.arange(3, 6).astype('u1'),
- 'test3': np.arange(8.0, 11.0, dtype='float64'),
- 'test4': [True, False, True],
- 'test5': pandas.date_range('now', periods=3).values,
- 'test6': list(range(5, 8))
+ "test1": list("abc"),
+ "test2": np.arange(3, 6).astype("u1"),
+ "test3": np.arange(8.0, 11.0, dtype="float64"),
+ "test4": [True, False, True],
+ "test5": pandas.date_range("now", periods=3).values,
+ "test6": list(range(5, 8)),
}
df = pandas.DataFrame(frame_data)
rd = pd.DataFrame(frame_data)
- include = np.float, 'integer'
- exclude = np.bool_,
+ include = np.float, "integer"
+ exclude = (np.bool_,)
r = rd.select_dtypes(include=include, exclude=exclude)
e = df[["test2", "test3", "test6"]]
@@ -2953,14 +2893,14 @@ def test_sem():
@pytest.fixture
def test_set_axis(ray_df, pandas_df, label, axis):
assert to_pandas(ray_df.set_axis(label, axis, inplace=False)).equals(
- pandas_df.set_axis(label, axis, inplace=False))
+ pandas_df.set_axis(label, axis, inplace=False)
+ )
@pytest.fixture
def test_set_index(ray_df, pandas_df, keys, inplace=False):
if not inplace:
- assert to_pandas(ray_df.set_index(keys)).equals(
- pandas_df.set_index(keys))
+ assert to_pandas(ray_df.set_index(keys)).equals(pandas_df.set_index(keys))
else:
ray_df_cp = ray_df.copy()
pd_df_cp = pandas_df.copy()
@@ -3137,9 +3077,9 @@ def test_to_xarray():
def test_transform(ray_df, pandas_df):
assert ray_df_equals_pandas(
ray_df.transform(lambda df: df.isna()),
- pandas_df.transform(lambda df: df.isna()))
- assert ray_df_equals_pandas(
- ray_df.transform('isna'), pandas_df.transform('isna'))
+ pandas_df.transform(lambda df: df.isna()),
+ )
+ assert ray_df_equals_pandas(ray_df.transform("isna"), pandas_df.transform("isna"))
def test_truediv():
@@ -3182,14 +3122,15 @@ def test_unstack():
def test_update():
- df = pd.DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3],
- [1.5, np.nan, 3]])
- other = pd.DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]],
- index=[1, 3])
+ df = pd.DataFrame(
+ [[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3], [1.5, np.nan, 3]]
+ )
+ other = pd.DataFrame([[3.6, 2., np.nan], [np.nan, np.nan, 7]], index=[1, 3])
df.update(other)
- expected = pd.DataFrame([[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3],
- [1.5, np.nan, 7.]])
+ expected = pd.DataFrame(
+ [[1.5, np.nan, 3], [3.6, 2, 3], [1.5, np.nan, 3], [1.5, np.nan, 7.]]
+ )
assert ray_df_equals(df, expected)
@@ -3197,14 +3138,15 @@ def test_update():
def test_var(ray_df, pandas_df):
# Because of some differences in floating point arithmetic, we need to check that
# they are almost equal if they are not identically equal.
- assert (ray_df.var() == pandas_df.var()).all() or \
- ((ray_df.var() - pandas_df.var()).abs() < 10**-10).all()
+ assert (ray_df.var() == pandas_df.var()).all() or (
+ (ray_df.var() - pandas_df.var()).abs() < 10 ** -10
+ ).all()
def test_where():
frame_data = np.random.randn(100, 10)
- pandas_df = pandas.DataFrame(frame_data, columns=list('abcdefghij'))
- ray_df = pd.DataFrame(frame_data, columns=list('abcdefghij'))
+ pandas_df = pandas.DataFrame(frame_data, columns=list("abcdefghij"))
+ ray_df = pd.DataFrame(frame_data, columns=list("abcdefghij"))
pandas_cond_df = pandas_df % 5 < 2
ray_cond_df = ray_df % 5 < 2
@@ -3217,7 +3159,7 @@ def test_where():
ray_result = ray_df.where(ray_cond_df, other, axis=1)
assert all((to_pandas(ray_result) == pandas_result).all())
- other = pandas_df['e']
+ other = pandas_df["e"]
pandas_result = pandas_df.where(pandas_cond_df, other, axis=0)
ray_result = ray_df.where(ray_cond_df, other, axis=0)
assert all((to_pandas(ray_result) == pandas_result).all())
@@ -3236,10 +3178,10 @@ def test_xs():
@pytest.fixture
def test___getitem__(ray_df, pd_df):
- ray_col = ray_df.__getitem__('col1')
+ ray_col = ray_df.__getitem__("col1")
assert isinstance(ray_col, pandas.Series)
- pd_col = pd_df['col1']
+ pd_col = pd_df["col1"]
assert pd_col.equals(ray_col)
@@ -3304,8 +3246,8 @@ def test___iter__(ray_df, pd_df):
ray_iterator = ray_df.__iter__()
# Check that ray_iterator implements the iterator interface
- assert hasattr(ray_iterator, '__iter__')
- assert hasattr(ray_iterator, 'next') or hasattr(ray_iterator, '__next__')
+ assert hasattr(ray_iterator, "__iter__")
+ assert hasattr(ray_iterator, "next") or hasattr(ray_iterator, "__next__")
pd_iterator = pd_df.__iter__()
assert list(ray_iterator) == list(pd_iterator)
@@ -3366,8 +3308,8 @@ def test___setstate__():
def test___delitem__(ray_df, pd_df):
ray_df = ray_df.copy()
pd_df = pd_df.copy()
- ray_df.__delitem__('col1')
- pd_df.__delitem__('col1')
+ ray_df.__delitem__("col1")
+ pd_df.__delitem__("col1")
assert ray_df_equals_pandas(ray_df, pd_df)
# Issue 2027
@@ -3473,12 +3415,12 @@ def test___repr__():
@pytest.fixture
def test_loc(ray_df, pd_df):
# Scaler
- assert ray_df.loc[0, 'col1'] == pd_df.loc[0, 'col1']
+ assert ray_df.loc[0, "col1"] == pd_df.loc[0, "col1"]
# Series
assert ray_df.loc[0].equals(pd_df.loc[0])
- assert ray_df.loc[1:, 'col1'].equals(pd_df.loc[1:, 'col1'])
- assert ray_df.loc[1:2, 'col1'].equals(pd_df.loc[1:2, 'col1'])
+ assert ray_df.loc[1:, "col1"].equals(pd_df.loc[1:, "col1"])
+ assert ray_df.loc[1:2, "col1"].equals(pd_df.loc[1:2, "col1"])
# DataFrame
assert ray_df_equals_pandas(ray_df.loc[[1, 2]], pd_df.loc[[1, 2]])
@@ -3486,8 +3428,9 @@ def test_loc(ray_df, pd_df):
# See issue #80
# assert ray_df_equals_pandas(ray_df.loc[[1, 2], ['col1']],
# pd_df.loc[[1, 2], ['col1']])
- assert ray_df_equals_pandas(ray_df.loc[1:2, 'col1':'col2'],
- pd_df.loc[1:2, 'col1':'col2'])
+ assert ray_df_equals_pandas(
+ ray_df.loc[1:2, "col1":"col2"], pd_df.loc[1:2, "col1":"col2"]
+ )
# Write Item
ray_df_copy = ray_df.copy()
@@ -3554,15 +3497,14 @@ def test__doc__():
assert pd.DataFrame.__doc__ != pandas.DataFrame.__doc__
assert pd.DataFrame.__init__ != pandas.DataFrame.__init__
for attr, obj in pd.DataFrame.__dict__.items():
- if (callable(obj) or isinstance(obj, property)) \
- and attr != "__init__":
+ if (callable(obj) or isinstance(obj, property)) and attr != "__init__":
pd_obj = getattr(pandas.DataFrame, attr, None)
if callable(pd_obj) or isinstance(pd_obj, property):
assert obj.__doc__ == pd_obj.__doc__
def test_to_datetime():
- frame_data = {'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}
+ frame_data = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}
ray_df = pd.DataFrame(frame_data)
pd_df = pandas.DataFrame(frame_data)
@@ -3570,20 +3512,17 @@ def test_to_datetime():
def test_get_dummies():
- frame_data = {'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], 'C': [1, 2, 3]}
+ frame_data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}
ray_df = pd.DataFrame(frame_data)
pd_df = pandas.DataFrame(frame_data)
- assert ray_df_equals_pandas(
- pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
+ assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
- frame_data = {'A': ['a'], 'B': ['b']}
+ frame_data = {"A": ["a"], "B": ["b"]}
ray_df = pd.DataFrame(frame_data)
pd_df = pandas.DataFrame(frame_data)
- assert ray_df_equals_pandas(
- pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
+ assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
- frame_data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [1, 2, 3]}
+ frame_data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [1, 2, 3]}
ray_df = pd.DataFrame(frame_data)
pd_df = pandas.DataFrame(frame_data)
- assert ray_df_equals_pandas(
- pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
+ assert ray_df_equals_pandas(pd.get_dummies(ray_df), pandas.get_dummies(pd_df))
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index 436ad434ba7..98210e59a18 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -17,8 +17,9 @@
@pytest.fixture
def ray_df_equals_pandas(ray_df, pandas_df):
assert isinstance(ray_df, pd.DataFrame)
- assert to_pandas(ray_df).equals(pandas_df) or (all(ray_df.isna().all()) and
- all(pandas_df.isna().all()))
+ assert to_pandas(ray_df).equals(pandas_df) or (
+ all(ray_df.isna().all()) and all(pandas_df.isna().all())
+ )
@pytest.fixture
@@ -26,8 +27,11 @@ def ray_df_almost_equals_pandas(ray_df, pandas_df):
assert isinstance(ray_df, pd.DataFrame)
difference = to_pandas(ray_df) - pandas_df
diff_max = difference.max().max()
- assert to_pandas(ray_df).equals(pandas_df) or diff_max < 0.0001 or (all(
- ray_df.isna().all()) and all(pandas_df.isna().all()))
+ assert (
+ to_pandas(ray_df).equals(pandas_df)
+ or diff_max < 0.0001
+ or (all(ray_df.isna().all()) and all(pandas_df.isna().all()))
+ )
@pytest.fixture
@@ -48,13 +52,15 @@ def ray_groupby_equals_pandas(ray_groupby, pandas_groupby):
def test_simple_row_groupby():
- pandas_df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [3, 8, 12, 10],
- 'col4': [17, 13, 16, 15],
- 'col5': [-4, -5, -6, -7]
- })
+ pandas_df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [3, 8, 12, 10],
+ "col4": [17, 13, 16, 15],
+ "col5": [-4, -5, -6, -7],
+ }
+ )
ray_df = from_pandas(pandas_df)
@@ -91,7 +97,7 @@ def test_simple_row_groupby():
test_prod(ray_groupby, pandas_groupby)
test_std(ray_groupby, pandas_groupby)
- agg_functions = ['min', 'max']
+ agg_functions = ["min", "max"]
for func in agg_functions:
test_agg(ray_groupby, pandas_groupby, func)
test_aggregate(ray_groupby, pandas_groupby, func)
@@ -127,13 +133,15 @@ def test_simple_row_groupby():
def test_single_group_row_groupby():
- pandas_df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 36, 7],
- 'col3': [3, 8, 12, 10],
- 'col4': [17, 3, 16, 15],
- 'col5': [-4, 5, -6, -7]
- })
+ pandas_df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 36, 7],
+ "col3": [3, 8, 12, 10],
+ "col4": [17, 3, 16, 15],
+ "col5": [-4, 5, -6, -7],
+ }
+ )
ray_df = from_pandas(pandas_df)
@@ -170,7 +178,7 @@ def test_single_group_row_groupby():
test_prod(ray_groupby, pandas_groupby)
test_std(ray_groupby, pandas_groupby)
- agg_functions = ['min', 'max']
+ agg_functions = ["min", "max"]
for func in agg_functions:
test_agg(ray_groupby, pandas_groupby, func)
test_aggregate(ray_groupby, pandas_groupby, func)
@@ -208,11 +216,12 @@ def test_single_group_row_groupby():
@pytest.mark.skip(reason="See Modin issue #21.")
def test_large_row_groupby():
pandas_df = pandas.DataFrame(
- np.random.randint(0, 8, size=(100, 4)), columns=list('ABCD'))
+ np.random.randint(0, 8, size=(100, 4)), columns=list("ABCD")
+ )
ray_df = from_pandas(pandas_df)
- by = [str(i) for i in pandas_df['A'].tolist()]
+ by = [str(i) for i in pandas_df["A"].tolist()]
n = 4
ray_groupby = ray_df.groupby(by=by)
@@ -245,7 +254,7 @@ def test_large_row_groupby():
# test_prod(ray_groupby, pandas_groupby) causes overflows
test_std(ray_groupby, pandas_groupby)
- agg_functions = ['min', 'max']
+ agg_functions = ["min", "max"]
for func in agg_functions:
test_agg(ray_groupby, pandas_groupby, func)
test_aggregate(ray_groupby, pandas_groupby, func)
@@ -281,13 +290,15 @@ def test_large_row_groupby():
def test_simple_col_groupby():
- pandas_df = pandas.DataFrame({
- 'col1': [0, 3, 2, 3],
- 'col2': [4, 1, 6, 7],
- 'col3': [3, 8, 2, 10],
- 'col4': [1, 13, 6, 15],
- 'col5': [-4, 5, 6, -7]
- })
+ pandas_df = pandas.DataFrame(
+ {
+ "col1": [0, 3, 2, 3],
+ "col2": [4, 1, 6, 7],
+ "col3": [3, 8, 2, 10],
+ "col4": [1, 13, 6, 15],
+ "col5": [-4, 5, 6, -7],
+ }
+ )
ray_df = from_pandas(pandas_df)
@@ -407,7 +418,8 @@ def test_ndim(ray_groupby, pandas_groupby):
@pytest.fixture
def test_cumsum(ray_groupby, pandas_groupby, axis=0):
ray_df_equals_pandas(
- ray_groupby.cumsum(axis=axis), pandas_groupby.cumsum(axis=axis))
+ ray_groupby.cumsum(axis=axis), pandas_groupby.cumsum(axis=axis)
+ )
@pytest.fixture
@@ -419,7 +431,8 @@ def test_pct_change(ray_groupby, pandas_groupby):
@pytest.fixture
def test_cummax(ray_groupby, pandas_groupby, axis=0):
ray_df_equals_pandas(
- ray_groupby.cummax(axis=axis), pandas_groupby.cummax(axis=axis))
+ ray_groupby.cummax(axis=axis), pandas_groupby.cummax(axis=axis)
+ )
@pytest.fixture
@@ -447,7 +460,8 @@ def test_backfill(ray_groupby, pandas_groupby):
@pytest.fixture
def test_cummin(ray_groupby, pandas_groupby, axis=0):
ray_df_equals_pandas(
- ray_groupby.cummin(axis=axis), pandas_groupby.cummin(axis=axis))
+ ray_groupby.cummin(axis=axis), pandas_groupby.cummin(axis=axis)
+ )
@pytest.fixture
@@ -474,8 +488,7 @@ def test_std(ray_groupby, pandas_groupby):
@pytest.fixture
def test_aggregate(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(
- ray_groupby.aggregate(func), pandas_groupby.aggregate(func))
+ ray_df_equals_pandas(ray_groupby.aggregate(func), pandas_groupby.aggregate(func))
@pytest.fixture
@@ -545,7 +558,8 @@ def test_head(ray_groupby, pandas_groupby, n):
def test_cumprod(ray_groupby, pandas_groupby, axis=0):
ray_df_equals_pandas(ray_groupby.cumprod(), pandas_groupby.cumprod())
ray_df_equals_pandas(
- ray_groupby.cumprod(axis=axis), pandas_groupby.cumprod(axis=axis))
+ ray_groupby.cumprod(axis=axis), pandas_groupby.cumprod(axis=axis)
+ )
@pytest.fixture
@@ -556,8 +570,7 @@ def test_cov(ray_groupby, pandas_groupby):
@pytest.fixture
def test_transform(ray_groupby, pandas_groupby, func):
- ray_df_equals_pandas(
- ray_groupby.transform(func), pandas_groupby.transform(func))
+ ray_df_equals_pandas(ray_groupby.transform(func), pandas_groupby.transform(func))
@pytest.fixture
@@ -569,8 +582,8 @@ def test_corr(ray_groupby, pandas_groupby):
@pytest.fixture
def test_fillna(ray_groupby, pandas_groupby):
ray_df_equals_pandas(
- ray_groupby.fillna(method="ffill"),
- pandas_groupby.fillna(method="ffill"))
+ ray_groupby.fillna(method="ffill"), pandas_groupby.fillna(method="ffill")
+ )
@pytest.fixture
@@ -591,8 +604,7 @@ def test_tail(ray_groupby, pandas_groupby, n):
@pytest.fixture
def test_quantile(ray_groupby, pandas_groupby):
- ray_df_equals_pandas(
- ray_groupby.quantile(q=0.4), pandas_groupby.quantile(q=0.4))
+ ray_df_equals_pandas(ray_groupby.quantile(q=0.4), pandas_groupby.quantile(q=0.4))
@pytest.fixture
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 0def3731a3a..58eda9bbb8f 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -10,18 +10,18 @@
import os
import sqlite3
-TEST_PARQUET_FILENAME = 'test.parquet'
-TEST_CSV_FILENAME = 'test.csv'
-TEST_JSON_FILENAME = 'test.json'
-TEST_HTML_FILENAME = 'test.html'
-TEST_EXCEL_FILENAME = 'test.xlsx'
-TEST_FEATHER_FILENAME = 'test.feather'
-TEST_HDF_FILENAME = 'test.hdf'
-TEST_MSGPACK_FILENAME = 'test.msg'
-TEST_STATA_FILENAME = 'test.dta'
-TEST_PICKLE_FILENAME = 'test.pkl'
-TEST_SAS_FILENAME = os.getcwd() + '/data/test1.sas7bdat'
-TEST_SQL_FILENAME = 'test.db'
+TEST_PARQUET_FILENAME = "test.parquet"
+TEST_CSV_FILENAME = "test.csv"
+TEST_JSON_FILENAME = "test.json"
+TEST_HTML_FILENAME = "test.html"
+TEST_EXCEL_FILENAME = "test.xlsx"
+TEST_FEATHER_FILENAME = "test.feather"
+TEST_HDF_FILENAME = "test.hdf"
+TEST_MSGPACK_FILENAME = "test.msg"
+TEST_STATA_FILENAME = "test.dta"
+TEST_PICKLE_FILENAME = "test.pkl"
+TEST_SAS_FILENAME = os.getcwd() + "/data/test1.sas7bdat"
+TEST_SQL_FILENAME = "test.db"
SMALL_ROW_SIZE = 2000
@@ -35,42 +35,45 @@ def setup_parquet_file(row_size, force=False):
if os.path.exists(TEST_PARQUET_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_parquet(TEST_PARQUET_FILENAME)
@pytest.fixture
def create_test_ray_dataframe():
- df = pd.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
- })
+ df = pd.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+ }
+ )
return df
@pytest.fixture
def create_test_pandas_dataframe():
- df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
- })
+ df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+ }
+ )
return df
@pytest.fixture
def test_files_eq(path1, path2):
- with open(path1, 'rb') as file1, open(path2, 'rb') as file2:
+ with open(path1, "rb") as file1, open(path2, "rb") as file2:
file1_content = file1.read()
file2_content = file2.read()
@@ -93,14 +96,13 @@ def teardown_parquet_file():
@pytest.fixture
-def setup_csv_file(row_size, force=False, delimiter=','):
+def setup_csv_file(row_size, force=False, delimiter=","):
if os.path.exists(TEST_CSV_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_csv(TEST_CSV_FILENAME, sep=delimiter)
@@ -115,10 +117,9 @@ def setup_json_file(row_size, force=False):
if os.path.exists(TEST_JSON_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_json(TEST_JSON_FILENAME)
@@ -133,10 +134,9 @@ def setup_html_file(row_size, force=False):
if os.path.exists(TEST_HTML_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_html(TEST_HTML_FILENAME)
@@ -148,10 +148,7 @@ def teardown_html_file():
@pytest.fixture
def setup_clipboard(row_size, force=False):
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame({"col1": np.arange(row_size), "col2": np.arange(row_size)})
df.to_clipboard()
@@ -160,10 +157,9 @@ def setup_excel_file(row_size, force=False):
if os.path.exists(TEST_EXCEL_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_excel(TEST_EXCEL_FILENAME)
@@ -178,10 +174,9 @@ def setup_feather_file(row_size, force=False):
if os.path.exists(TEST_FEATHER_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_feather(TEST_FEATHER_FILENAME)
@@ -196,11 +191,10 @@ def setup_hdf_file(row_size, force=False):
if os.path.exists(TEST_HDF_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
- df.to_hdf(TEST_HDF_FILENAME, 'test')
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
+ df.to_hdf(TEST_HDF_FILENAME, "test")
@pytest.fixture
@@ -214,10 +208,9 @@ def setup_msgpack_file(row_size, force=False):
if os.path.exists(TEST_MSGPACK_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_msgpack(TEST_MSGPACK_FILENAME)
@@ -232,10 +225,9 @@ def setup_stata_file(row_size, force=False):
if os.path.exists(TEST_STATA_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_stata(TEST_STATA_FILENAME)
@@ -250,10 +242,9 @@ def setup_pickle_file(row_size, force=False):
if os.path.exists(TEST_PICKLE_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': np.arange(row_size),
- 'col2': np.arange(row_size)
- })
+ df = pandas.DataFrame(
+ {"col1": np.arange(row_size), "col2": np.arange(row_size)}
+ )
df.to_pickle(TEST_PICKLE_FILENAME)
@@ -268,13 +259,15 @@ def setup_sql_file(conn, force=False):
if os.path.exists(TEST_SQL_FILENAME) and not force:
pass
else:
- df = pandas.DataFrame({
- 'col1': [0, 1, 2, 3],
- 'col2': [4, 5, 6, 7],
- 'col3': [8, 9, 10, 11],
- 'col4': [12, 13, 14, 15],
- 'col5': [0, 0, 0, 0]
- })
+ df = pandas.DataFrame(
+ {
+ "col1": [0, 1, 2, 3],
+ "col2": [4, 5, 6, 7],
+ "col3": [8, 9, 10, 11],
+ "col4": [12, 13, 14, 15],
+ "col5": [0, 0, 0, 0],
+ }
+ )
df.to_sql(TEST_SQL_FILENAME.split(".")[0], conn)
@@ -297,8 +290,8 @@ def test_from_parquet():
def test_from_parquet_with_columns():
setup_parquet_file(SMALL_ROW_SIZE)
- pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=['col1'])
- ray_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=['col1'])
+ pandas_df = pandas.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
+ ray_df = pd.read_parquet(TEST_PARQUET_FILENAME, columns=["col1"])
assert ray_df_equals_pandas(ray_df, pandas_df)
teardown_parquet_file()
@@ -403,8 +396,8 @@ def test_from_feather():
def test_from_hdf():
setup_hdf_file(SMALL_ROW_SIZE)
- pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key='test')
- ray_df = pd.read_hdf(TEST_HDF_FILENAME, key='test')
+ pandas_df = pandas.read_hdf(TEST_HDF_FILENAME, key="test")
+ ray_df = pd.read_hdf(TEST_HDF_FILENAME, key="test")
assert ray_df_equals_pandas(ray_df, pandas_df)
@@ -465,7 +458,7 @@ def test_from_sas():
def test_from_csv_delimiter():
- setup_csv_file(SMALL_ROW_SIZE, delimiter='|')
+ setup_csv_file(SMALL_ROW_SIZE, delimiter="|")
pandas_df = pandas.read_csv(TEST_CSV_FILENAME)
ray_df = pd.read_csv(TEST_CSV_FILENAME)
@@ -486,7 +479,7 @@ def test_to_clipboard():
pandas_df.to_clipboard()
pandas_as_clip = pandas.read_clipboard()
- assert (ray_as_clip.equals(pandas_as_clip))
+ assert ray_as_clip.equals(pandas_as_clip)
def test_to_csv():
@@ -499,7 +492,7 @@ def test_to_csv():
ray_df.to_csv(TEST_CSV_DF_FILENAME)
pandas_df.to_csv(TEST_CSV_pandas_FILENAME)
- assert (test_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME))
+ assert test_files_eq(TEST_CSV_DF_FILENAME, TEST_CSV_pandas_FILENAME)
teardown_test_file(TEST_CSV_pandas_FILENAME)
teardown_test_file(TEST_CSV_DF_FILENAME)
@@ -535,7 +528,7 @@ def test_to_excel():
ray_writer.save()
pandas_writer.save()
- assert (test_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME))
+ assert test_files_eq(TEST_EXCEL_DF_FILENAME, TEST_EXCEL_pandas_FILENAME)
teardown_test_file(TEST_EXCEL_DF_FILENAME)
teardown_test_file(TEST_EXCEL_pandas_FILENAME)
@@ -551,8 +544,7 @@ def test_to_feather():
ray_df.to_feather(TEST_FEATHER_DF_FILENAME)
pandas_df.to_feather(TEST_FEATHER_pandas_FILENAME)
- assert (test_files_eq(TEST_FEATHER_DF_FILENAME,
- TEST_FEATHER_pandas_FILENAME))
+ assert test_files_eq(TEST_FEATHER_DF_FILENAME, TEST_FEATHER_pandas_FILENAME)
teardown_test_file(TEST_FEATHER_pandas_FILENAME)
teardown_test_file(TEST_FEATHER_DF_FILENAME)
@@ -576,7 +568,7 @@ def test_to_html():
ray_df.to_html(TEST_HTML_DF_FILENAME)
pandas_df.to_html(TEST_HTML_pandas_FILENAME)
- assert (test_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME))
+ assert test_files_eq(TEST_HTML_DF_FILENAME, TEST_HTML_pandas_FILENAME)
teardown_test_file(TEST_HTML_pandas_FILENAME)
teardown_test_file(TEST_HTML_DF_FILENAME)
@@ -592,7 +584,7 @@ def test_to_json():
ray_df.to_json(TEST_JSON_DF_FILENAME)
pandas_df.to_json(TEST_JSON_pandas_FILENAME)
- assert (test_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME))
+ assert test_files_eq(TEST_JSON_DF_FILENAME, TEST_JSON_pandas_FILENAME)
teardown_test_file(TEST_JSON_pandas_FILENAME)
teardown_test_file(TEST_JSON_DF_FILENAME)
@@ -615,8 +607,7 @@ def test_to_msgpack():
ray_df.to_msgpack(TEST_MSGPACK_DF_FILENAME)
pandas_df.to_msgpack(TEST_MSGPACK_pandas_FILENAME)
- assert (test_files_eq(TEST_MSGPACK_DF_FILENAME,
- TEST_MSGPACK_pandas_FILENAME))
+ assert test_files_eq(TEST_MSGPACK_DF_FILENAME, TEST_MSGPACK_pandas_FILENAME)
teardown_test_file(TEST_MSGPACK_pandas_FILENAME)
teardown_test_file(TEST_MSGPACK_DF_FILENAME)
@@ -639,8 +630,7 @@ def test_to_parquet():
ray_df.to_parquet(TEST_PARQUET_DF_FILENAME)
pandas_df.to_parquet(TEST_PARQUET_pandas_FILENAME)
- assert (test_files_eq(TEST_PARQUET_DF_FILENAME,
- TEST_PARQUET_pandas_FILENAME))
+ assert test_files_eq(TEST_PARQUET_DF_FILENAME, TEST_PARQUET_pandas_FILENAME)
teardown_test_file(TEST_PARQUET_pandas_FILENAME)
teardown_test_file(TEST_PARQUET_DF_FILENAME)
@@ -663,8 +653,7 @@ def test_to_pickle():
ray_df.to_pickle(TEST_PICKLE_DF_FILENAME)
pandas_df.to_pickle(TEST_PICKLE_pandas_FILENAME)
- assert (test_files_eq(TEST_PICKLE_DF_FILENAME,
- TEST_PICKLE_pandas_FILENAME))
+ assert test_files_eq(TEST_PICKLE_DF_FILENAME, TEST_PICKLE_pandas_FILENAME)
teardown_test_file(TEST_PICKLE_pandas_FILENAME)
teardown_test_file(TEST_PICKLE_DF_FILENAME)
@@ -680,7 +669,7 @@ def test_to_sql():
ray_df.to_pickle(TEST_SQL_DF_FILENAME)
pandas_df.to_pickle(TEST_SQL_pandas_FILENAME)
- assert (test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME))
+ assert test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME)
teardown_test_file(TEST_SQL_DF_FILENAME)
teardown_test_file(TEST_SQL_pandas_FILENAME)
@@ -696,7 +685,7 @@ def test_to_stata():
ray_df.to_stata(TEST_STATA_DF_FILENAME)
pandas_df.to_stata(TEST_STATA_pandas_FILENAME)
- assert (test_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME))
+ assert test_files_eq(TEST_STATA_DF_FILENAME, TEST_STATA_pandas_FILENAME)
teardown_test_file(TEST_STATA_pandas_FILENAME)
teardown_test_file(TEST_STATA_DF_FILENAME)
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 1178056f9c3..3be3a303e41 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -1544,9 +1544,31 @@ def test_plot():
ray_series = create_test_series()
with pytest.raises(NotImplementedError):
- ray_series.plot(None, None, None, None, None, None, None, None, None,
- None, None, None, None, None, None, None, None, None,
- None, None, None, None, None)
+ ray_series.plot(
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ )
@pytest.mark.skip(reason="Using pandas Series.")
@@ -1714,8 +1736,9 @@ def test_resample():
ray_series = create_test_series()
with pytest.raises(NotImplementedError):
- ray_series.resample(None, None, None, None, None, None, None, None,
- None, None, None, None)
+ ray_series.resample(
+ None, None, None, None, None, None, None, None, None, None, None, None
+ )
@pytest.mark.skip(reason="Using pandas Series.")
@@ -2003,8 +2026,7 @@ def test_to_csv():
ray_series = create_test_series()
with pytest.raises(NotImplementedError):
- ray_series.to_csv(None, None, None, None, None, None, None, None, None,
- None)
+ ray_series.to_csv(None, None, None, None, None, None, None, None, None, None)
@pytest.mark.skip(reason="Using pandas Series.")
@@ -2028,8 +2050,22 @@ def test_to_excel():
ray_series = create_test_series()
with pytest.raises(NotImplementedError):
- ray_series.to_excel(None, None, None, None, None, None, None, None,
- None, None, None, None, None, None)
+ ray_series.to_excel(
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ )
@pytest.mark.skip(reason="Using pandas Series.")
@@ -2061,9 +2097,26 @@ def test_to_latex():
ray_series = create_test_series()
with pytest.raises(NotImplementedError):
- ray_series.to_latex(None, None, None, None, None, None, None, None,
- None, None, None, None, None, None, None, None,
- None, None)
+ ray_series.to_latex(
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ None,
+ )
@pytest.mark.skip(reason="Using pandas Series.")
diff --git a/modin/pandas/utils.py b/modin/pandas/utils.py
index 45769fc9275..7ac6848bc1b 100644
--- a/modin/pandas/utils.py
+++ b/modin/pandas/utils.py
@@ -49,9 +49,9 @@ def decorator(cls):
cls.__doc__ = parent.__doc__
for attr, obj in cls.__dict__.items():
parent_obj = getattr(parent, attr, None)
- if parent_obj in excluded or \
- (not callable(parent_obj) and
- not isinstance(parent_obj, property)):
+ if parent_obj in excluded or (
+ not callable(parent_obj) and not isinstance(parent_obj, property)
+ ):
continue
if callable(obj):
obj.__doc__ = parent_obj.__doc__
diff --git a/modin/sql/connection.py b/modin/sql/connection.py
index 23c985f3658..7989683f535 100644
--- a/modin/sql/connection.py
+++ b/modin/sql/connection.py
@@ -33,22 +33,22 @@ def execute(self, query):
elif " ".join(split_query[:2]) == "INSERT INTO":
self._insert_into(split_query)
else:
- raise NotImplementedError("This API is for demonstration purposes "
- "only. Coming Soon!")
+ raise NotImplementedError(
+ "This API is for demonstration purposes " "only. Coming Soon!"
+ )
def _create_table(self, split_query):
- column_names = " ".join(split_query[3:]) \
- .replace("(", "").replace(")", "").split(", ")
+ column_names = (
+ " ".join(split_query[3:]).replace("(", "").replace(")", "").split(", ")
+ )
columns = Series(column_names)
self._tables[split_query[2]] = DataFrame(columns=columns)
def _insert_into(self, split_query):
table = self._tables[split_query[2]]
- values = " ".join(split_query[4:]) \
- .replace("(", "").replace(")", "").split(", ")
+ values = " ".join(split_query[4:]).replace("(", "").replace(")", "").split(", ")
to_append = Series([eval(i) for i in values], index=table.columns)
- self._tables[split_query[2]] = \
- table.append(to_append, ignore_index=True)
+ self._tables[split_query[2]] = table.append(to_append, ignore_index=True)
print(self._tables[split_query[2]])