From d18b9b110d8eda8f937333acb145f4e81ba5648c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Jan 2024 02:32:22 +0100 Subject: [PATCH 01/12] Add function to compare Column objects with iterable references Signed-off-by: Anatoly Myachev --- tests/column/and_or_test.py | 24 +++++++++--------------- tests/utils.py | 9 +++++++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py index 48b73380..b620341e 100644 --- a/tests/column/and_or_test.py +++ b/tests/column/and_or_test.py @@ -1,9 +1,7 @@ from __future__ import annotations -import pandas as pd - from tests.utils import bool_dataframe_1 -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference def test_column_and(library: str) -> None: @@ -11,9 +9,8 @@ def test_column_and(library: str) -> None: ser = df.col("a") other = df.col("b") result = df.assign((ser & other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([True, True, False], name="result") - pd.testing.assert_series_equal(result_pd, expected) + expected = [True, True, False] + compare_column_with_reference(result.col("result"), expected) def test_column_or(library: str) -> None: @@ -21,9 +18,8 @@ def test_column_or(library: str) -> None: ser = df.col("a") other = df.col("b") result = df.assign((ser | other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([True, True, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + expected = [True, True, True] + compare_column_with_reference(result.col("result"), expected) def test_column_and_with_scalar(library: str) -> None: @@ -31,9 +27,8 @@ def test_column_and_with_scalar(library: str) -> None: ser = df.col("a") other = True result = df.assign((other & ser).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([True, True, False], name="result") - pd.testing.assert_series_equal(result_pd, expected) + expected = [True, True, False] + compare_column_with_reference(result.col("result"), expected) def test_column_or_with_scalar(library: str) -> None: @@ -41,6 +36,5 @@ def test_column_or_with_scalar(library: str) -> None: ser = df.col("a") other = True result = df.assign((other | ser).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([True, True, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + expected = [True, True, True] + compare_column_with_reference(result.col("result"), expected) diff --git a/tests/utils.py b/tests/utils.py index 8cc049c4..942430aa 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ DType = TypeVar("DType") if TYPE_CHECKING: + from dataframe_api import Column from dataframe_api import DataFrame POLARS_VERSION = parse(pl.__version__) @@ -485,6 +486,14 @@ def interchange_to_pandas(result: Any) -> pd.DataFrame: return cast(pd.DataFrame, df) +def compare_column_with_reference(column: Column, reference: list[Any]) -> None: + column = column.persist() + col_len = column.len().scalar + assert col_len == len(reference) + for idx in range(col_len): + assert reference[idx] == column.get_value(idx).scalar + + def mixed_dataframe_1(library: str) -> DataFrame: df: Any data = { From 7708bfb73c2ddcc4c3c9e28e059d6027a7ae2dd9 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Jan 2024 15:10:00 +0100 Subject: [PATCH 02/12] check Column dtype Signed-off-by: Anatoly Myachev --- dataframe_api_compat/pandas_standard/__init__.py | 7 ++++--- pyproject.toml | 3 +++ tests/column/and_or_test.py | 8 ++++---- tests/utils.py | 8 +++++++- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/dataframe_api_compat/pandas_standard/__init__.py b/dataframe_api_compat/pandas_standard/__init__.py index de999bcd..422e9f96 100644 --- a/dataframe_api_compat/pandas_standard/__init__.py +++ b/dataframe_api_compat/pandas_standard/__init__.py @@ -104,18 +104,19 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType: return Namespace.Float32() if dtype == "Float32": return Namespace.Float32() - if dtype == "bool": + if dtype in ("bool", "boolean"): + # Also for `pandas.core.arrays.boolean.BooleanDtype` return Namespace.Bool() if dtype == "object": return Namespace.String() if dtype == "string": return Namespace.String() - if dtype.startswith("datetime64["): + if hasattr(dtype, "startswith") and dtype.startswith("datetime64["): match = re.search(r"datetime64\[(\w{1,2})", dtype) assert match is not None time_unit = cast(Literal["ms", "us"], match.group(1)) return Namespace.Datetime(time_unit) - if dtype.startswith("timedelta64["): + if hasattr(dtype, "startswith") and dtype.startswith("timedelta64["): match = re.search(r"timedelta64\[(\w{1,2})", dtype) assert match is not None time_unit = cast(Literal["ms", "us"], match.group(1)) diff --git a/pyproject.toml b/pyproject.toml index 490f735e..6cf0717f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,9 @@ ignore = [ [tool.ruff.isort] force-single-line = true +[tool.black] +line-length = 90 + [tool.pytest.ini_options] filterwarnings = [ "error", diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py index b620341e..4d1b87d7 100644 --- a/tests/column/and_or_test.py +++ b/tests/column/and_or_test.py @@ -10,7 +10,7 @@ def test_column_and(library: str) -> None: other = df.col("b") result = df.assign((ser & other).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected) + compare_column_with_reference(result.col("result"), expected, dtype="bool") def test_column_or(library: str) -> None: @@ -19,7 +19,7 @@ def test_column_or(library: str) -> None: other = df.col("b") result = df.assign((ser | other).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected) + compare_column_with_reference(result.col("result"), expected, dtype="bool") def test_column_and_with_scalar(library: str) -> None: @@ -28,7 +28,7 @@ def test_column_and_with_scalar(library: str) -> None: other = True result = df.assign((other & ser).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected) + compare_column_with_reference(result.col("result"), expected, dtype="bool") def test_column_or_with_scalar(library: str) -> None: @@ -37,4 +37,4 @@ def test_column_or_with_scalar(library: str) -> None: other = True result = df.assign((other | ser).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected) + compare_column_with_reference(result.col("result"), expected, dtype="bool") diff --git a/tests/utils.py b/tests/utils.py index 942430aa..86cd134a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -486,10 +486,16 @@ def interchange_to_pandas(result: Any) -> pd.DataFrame: return cast(pd.DataFrame, df) -def compare_column_with_reference(column: Column, reference: list[Any]) -> None: +def compare_column_with_reference( + column: Column, + reference: list[Any], + dtype: str | None, +) -> None: column = column.persist() col_len = column.len().scalar assert col_len == len(reference) + if dtype is not None: + assert column.__column_namespace__().is_dtype(column.dtype, dtype) for idx in range(col_len): assert reference[idx] == column.get_value(idx).scalar From 05171fcc8d8498cf955b4da0faf4bfda33c34732 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Jan 2024 15:48:17 +0100 Subject: [PATCH 03/12] another way to check Column dtype Signed-off-by: Anatoly Myachev --- tests/column/and_or_test.py | 12 ++++++++---- tests/column/cast_test.py | 11 +++-------- tests/utils.py | 9 +++------ 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py index 4d1b87d7..2919ed2b 100644 --- a/tests/column/and_or_test.py +++ b/tests/column/and_or_test.py @@ -6,35 +6,39 @@ def test_column_and(library: str) -> None: df = bool_dataframe_1(library, api_version="2023.09-beta") + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign((ser & other).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected, dtype="bool") + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) def test_column_or(library: str) -> None: df = bool_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign((ser | other).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected, dtype="bool") + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) def test_column_and_with_scalar(library: str) -> None: df = bool_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = True result = df.assign((other & ser).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected, dtype="bool") + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) def test_column_or_with_scalar(library: str) -> None: df = bool_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = True result = df.assign((other | ser).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected, dtype="bool") + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) diff --git a/tests/column/cast_test.py b/tests/column/cast_test.py index f8fc16a3..45582cd4 100644 --- a/tests/column/cast_test.py +++ b/tests/column/cast_test.py @@ -1,15 +1,10 @@ -import pandas as pd - +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_cast_integers(library: str) -> None: df = integer_dataframe_1(library) pdx = df.__dataframe_namespace__() result = df.assign(df.col("a").cast(pdx.Int32())) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).astype( - {"a": "int32", "b": "int64"}, - ) - result_pd = interchange_to_pandas(result) - pd.testing.assert_frame_equal(result_pd, expected) + compare_column_with_reference(result.col("a"), [1, 2, 3], dtype=pdx.Int32) + compare_column_with_reference(result.col("b"), [4, 5, 6], dtype=pdx.Int64) diff --git a/tests/utils.py b/tests/utils.py index 86cd134a..005e306f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,6 @@ from datetime import timedelta from typing import TYPE_CHECKING from typing import Any -from typing import TypeVar from typing import cast import pandas as pd @@ -14,11 +13,10 @@ import dataframe_api_compat.pandas_standard import dataframe_api_compat.polars_standard -DType = TypeVar("DType") - if TYPE_CHECKING: from dataframe_api import Column from dataframe_api import DataFrame + from dataframe_api.typing import DType POLARS_VERSION = parse(pl.__version__) PANDAS_VERSION = parse(pd.__version__) @@ -489,13 +487,12 @@ def interchange_to_pandas(result: Any) -> pd.DataFrame: def compare_column_with_reference( column: Column, reference: list[Any], - dtype: str | None, + dtype: DType, ) -> None: column = column.persist() col_len = column.len().scalar assert col_len == len(reference) - if dtype is not None: - assert column.__column_namespace__().is_dtype(column.dtype, dtype) + assert isinstance(column.dtype, dtype) for idx in range(col_len): assert reference[idx] == column.get_value(idx).scalar From f0005d8a264b18bbd3bb254d89fe970bcf6701e2 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Jan 2024 16:25:30 +0100 Subject: [PATCH 04/12] temp fix for mypy Signed-off-by: Anatoly Myachev --- tests/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index 005e306f..5d7a613c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -16,7 +16,6 @@ if TYPE_CHECKING: from dataframe_api import Column from dataframe_api import DataFrame - from dataframe_api.typing import DType POLARS_VERSION = parse(pl.__version__) PANDAS_VERSION = parse(pd.__version__) @@ -487,7 +486,7 @@ def interchange_to_pandas(result: Any) -> pd.DataFrame: def compare_column_with_reference( column: Column, reference: list[Any], - dtype: DType, + dtype: Any, ) -> None: column = column.persist() col_len = column.len().scalar From 65c1f661091f46ffb392cfc2a1f4b5ee07d94c0c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 5 Jan 2024 21:08:08 +0100 Subject: [PATCH 05/12] add 'compare_dataframe_with_reference' func Signed-off-by: Anatoly Myachev --- .../pandas_standard/__init__.py | 6 +- tests/column/cast_test.py | 9 ++- tests/column/col_sorted_indices_test.py | 58 +++++++---------- tests/column/temporal/components_test.py | 65 +++++++++++-------- tests/column/temporal/filter_test.py | 12 ++-- tests/column/temporal/floor_test.py | 10 ++- tests/utils.py | 21 +++++- 7 files changed, 99 insertions(+), 82 deletions(-) diff --git a/dataframe_api_compat/pandas_standard/__init__.py b/dataframe_api_compat/pandas_standard/__init__.py index 422e9f96..8da7312e 100644 --- a/dataframe_api_compat/pandas_standard/__init__.py +++ b/dataframe_api_compat/pandas_standard/__init__.py @@ -111,12 +111,14 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType: return Namespace.String() if dtype == "string": return Namespace.String() - if hasattr(dtype, "startswith") and dtype.startswith("datetime64["): + if not hasattr(dtype, "startswith"): + dtype = str(dtype) + if dtype.startswith("datetime64["): match = re.search(r"datetime64\[(\w{1,2})", dtype) assert match is not None time_unit = cast(Literal["ms", "us"], match.group(1)) return Namespace.Datetime(time_unit) - if hasattr(dtype, "startswith") and dtype.startswith("timedelta64["): + if dtype.startswith("timedelta64["): match = re.search(r"timedelta64\[(\w{1,2})", dtype) assert match is not None time_unit = cast(Literal["ms", "us"], match.group(1)) diff --git a/tests/column/cast_test.py b/tests/column/cast_test.py index 45582cd4..3d942a4e 100644 --- a/tests/column/cast_test.py +++ b/tests/column/cast_test.py @@ -1,4 +1,4 @@ -from tests.utils import compare_column_with_reference +from tests.utils import compare_dataframe_with_reference from tests.utils import integer_dataframe_1 @@ -6,5 +6,8 @@ def test_cast_integers(library: str) -> None: df = integer_dataframe_1(library) pdx = df.__dataframe_namespace__() result = df.assign(df.col("a").cast(pdx.Int32())) - compare_column_with_reference(result.col("a"), [1, 2, 3], dtype=pdx.Int32) - compare_column_with_reference(result.col("b"), [4, 5, 6], dtype=pdx.Int64) + compare_dataframe_with_reference( + result, + {"a": [1, 2, 3], "b": [4, 5, 6]}, + dtype={"a": pdx.Int32, "b": pdx.Int64}, + ) diff --git a/tests/column/col_sorted_indices_test.py b/tests/column/col_sorted_indices_test.py index 2d63618c..c371ca1a 100644 --- a/tests/column/col_sorted_indices_test.py +++ b/tests/column/col_sorted_indices_test.py @@ -1,66 +1,54 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_dataframe_with_reference from tests.utils import integer_dataframe_6 -from tests.utils import interchange_to_pandas def test_expression_sorted_indices_ascending(library: str) -> None: df = integer_dataframe_6(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() col = df.col sorted_indices = col("b").sorted_indices() result = df.take(sorted_indices) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [2, 2, 1, 1, 1], - "b": [1, 2, 3, 4, 4], - }, + compare_dataframe_with_reference( + result, + {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, + dtype=pdx.Int64, ) - pd.testing.assert_frame_equal(result_pd, expected) def test_expression_sorted_indices_descending(library: str) -> None: df = integer_dataframe_6(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() col = df.col sorted_indices = col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 2, 1], - }, + compare_dataframe_with_reference( + result, + {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, + dtype=pdx.Int64, ) - pd.testing.assert_frame_equal(result_pd, expected) def test_column_sorted_indices_ascending(library: str) -> None: - df = integer_dataframe_6(library).persist() + df = integer_dataframe_6(library) + pdx = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices() result = df.take(sorted_indices) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [2, 2, 1, 1, 1], - "b": [1, 2, 3, 4, 4], - }, + compare_dataframe_with_reference( + result, + {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, + dtype=pdx.Int64, ) - pd.testing.assert_frame_equal(result_pd, expected) def test_column_sorted_indices_descending(library: str) -> None: - df = integer_dataframe_6(library).persist() + df = integer_dataframe_6(library) + pdx = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 2, 1], - }, + compare_dataframe_with_reference( + result, + {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, + dtype=pdx.Int64, ) - pd.testing.assert_frame_equal(result_pd, expected) diff --git a/tests/column/temporal/components_test.py b/tests/column/temporal/components_test.py index 640a819f..6259265b 100644 --- a/tests/column/temporal/components_test.py +++ b/tests/column/temporal/components_test.py @@ -2,10 +2,9 @@ from typing import Literal -import pandas as pd import pytest -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference from tests.utils import temporal_dataframe_1 @@ -23,14 +22,17 @@ ], ) def test_col_components(library: str, attr: str, expected: list[int]) -> None: - df = temporal_dataframe_1(library).persist() + df = temporal_dataframe_1(library) + pdx = df.__dataframe_namespace__() for col_name in ("a", "c", "e"): - result = df.assign(getattr(df.col(col_name), attr)().rename("result")).select( - "result", + result = ( + df.assign(getattr(df.col(col_name), attr)().rename("result")) + .select( + "result", + ) + .cast({"result": pdx.Int64()}) ) - result = interchange_to_pandas(result)["result"].astype("int64") - expected = pd.Series(expected, name="result") - pd.testing.assert_series_equal(result, expected) + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) @pytest.mark.parametrize( @@ -42,13 +44,16 @@ def test_col_components(library: str, attr: str, expected: list[int]) -> None: ], ) def test_col_microsecond(library: str, col_name: str, expected: list[int]) -> None: - df = temporal_dataframe_1(library).persist() - result = df.assign(df.col(col_name).microsecond().rename("result")).select( - "result", + df = temporal_dataframe_1(library) + pdx = df.__dataframe_namespace__() + result = ( + df.assign(df.col(col_name).microsecond().rename("result")) + .select( + "result", + ) + .cast({"result": pdx.Int64()}) ) - result = interchange_to_pandas(result)["result"].astype("int64") - expected = pd.Series(expected, name="result") - pd.testing.assert_series_equal(result, expected) + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) @pytest.mark.parametrize( @@ -60,13 +65,16 @@ def test_col_microsecond(library: str, col_name: str, expected: list[int]) -> No ], ) def test_col_nanosecond(library: str, col_name: str, expected: list[int]) -> None: - df = temporal_dataframe_1(library).persist() - result = df.assign(df.col(col_name).nanosecond().rename("result")).select( # type: ignore[attr-defined] - "result", + df = temporal_dataframe_1(library) + pdx = df.__dataframe_namespace__() + result = ( + df.assign(df.col(col_name).nanosecond().rename("result")) # type: ignore[attr-defined] + .select( + "result", + ) + .cast({"result": pdx.Int64()}) ) - result = interchange_to_pandas(result)["result"].astype("int64") - expected = pd.Series(expected, name="result") - pd.testing.assert_series_equal(result, expected) + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) @pytest.mark.parametrize( @@ -84,11 +92,14 @@ def test_col_unix_timestamp_time_units( expected: list[int], ) -> None: df = temporal_dataframe_1(library) - result = df.assign( - df.col("e").unix_timestamp(time_unit=time_unit).rename("result"), - ).select( - "result", + pdx = df.__dataframe_namespace__() + result = ( + df.assign( + df.col("e").unix_timestamp(time_unit=time_unit).rename("result"), + ) + .select( + "result", + ) + .cast({"result": pdx.Int64()}) ) - result = interchange_to_pandas(result)["result"].astype("int64") - expected = pd.Series(expected, name="result") - pd.testing.assert_series_equal(result, expected, check_exact=True) + compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) diff --git a/tests/column/temporal/filter_test.py b/tests/column/temporal/filter_test.py index 7da55036..5e4e4f31 100644 --- a/tests/column/temporal/filter_test.py +++ b/tests/column/temporal/filter_test.py @@ -1,13 +1,9 @@ -import pandas as pd - -from tests.utils import interchange_to_pandas +from tests.utils import compare_dataframe_with_reference from tests.utils import temporal_dataframe_1 def test_filter_w_date(library: str) -> None: df = temporal_dataframe_1(library).select("a", "index") - namespace = df.__dataframe_namespace__() - result = df.filter(df.col("a") > namespace.date(2020, 1, 2)).select("index") - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame({"index": [1, 2]}) - pd.testing.assert_frame_equal(result_pd, expected) + pdx = df.__dataframe_namespace__() + result = df.filter(df.col("a") > pdx.date(2020, 1, 2)).select("index") + compare_dataframe_with_reference(result, {"index": [1, 2]}, pdx.Int64) diff --git a/tests/column/temporal/floor_test.py b/tests/column/temporal/floor_test.py index de9e4519..d8773e07 100644 --- a/tests/column/temporal/floor_test.py +++ b/tests/column/temporal/floor_test.py @@ -2,10 +2,9 @@ from datetime import datetime -import pandas as pd import pytest -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference from tests.utils import temporal_dataframe_1 @@ -17,9 +16,8 @@ ) def test_floor(library: str, freq: str, expected: list[datetime]) -> None: df = temporal_dataframe_1(library) + pdx = df.__dataframe_namespace__() col = df.col - result = df.assign(col("a").floor(freq).rename("result")).select("result").persist() # type: ignore[attr-defined] + result = df.assign(col("a").floor(freq).rename("result")).select("result") # type: ignore[attr-defined] # TODO check the resolution - result = interchange_to_pandas(result)["result"].astype("datetime64[ns]") - expected = pd.Series(expected, name="result") - pd.testing.assert_series_equal(result, expected) + compare_column_with_reference(result.col("result"), expected, pdx.Datetime) diff --git a/tests/utils.py b/tests/utils.py index 5d7a613c..ba271c91 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ from datetime import timedelta from typing import TYPE_CHECKING from typing import Any +from typing import Mapping from typing import cast import pandas as pd @@ -491,11 +492,29 @@ def compare_column_with_reference( column = column.persist() col_len = column.len().scalar assert col_len == len(reference) - assert isinstance(column.dtype, dtype) + assert isinstance( + column.dtype, + dtype, + ), f"{column.dtype=} isn't a instance of {dtype=}" for idx in range(col_len): assert reference[idx] == column.get_value(idx).scalar +def compare_dataframe_with_reference( + dataframe: DataFrame, + reference: Mapping[str, list[Any]], + dtype: Any | Mapping[str, Any], +) -> None: + assert dataframe.column_names == list(reference.keys()) + for col_name in dataframe.column_names: + col_dtype = dtype[col_name] if isinstance(dtype, dict) else dtype + compare_column_with_reference( + dataframe.col(col_name), + reference[col_name], + dtype=col_dtype, + ) + + def mixed_dataframe_1(library: str) -> DataFrame: df: Any data = { From efdf0976f5017aa7a0a34ca1f2be962956f5ceb8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 6 Jan 2024 01:00:38 +0100 Subject: [PATCH 06/12] use new functions for more files [part1] Signed-off-by: Anatoly Myachev --- tests/column/comparisons_test.py | 92 ++++++++++++++++---------------- tests/column/cumulative_test.py | 10 ++-- 2 files changed, 52 insertions(+), 50 deletions(-) diff --git a/tests/column/comparisons_test.py b/tests/column/comparisons_test.py index 3513b9c7..89fdae83 100644 --- a/tests/column/comparisons_test.py +++ b/tests/column/comparisons_test.py @@ -2,83 +2,86 @@ from typing import Any -import pandas as pd import pytest +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 from tests.utils import integer_dataframe_7 -from tests.utils import interchange_to_pandas @pytest.mark.parametrize( - ("comparison", "expected_data"), + ("comparison", "expected_data", "expected_dtype"), [ - ("__eq__", [True, True, False]), - ("__ne__", [False, False, True]), - ("__ge__", [True, True, False]), - ("__gt__", [False, False, False]), - ("__le__", [True, True, True]), - ("__lt__", [False, False, True]), - ("__add__", [2, 4, 7]), - ("__sub__", [0, 0, -1]), - ("__mul__", [1, 4, 12]), - ("__truediv__", [1, 1, 0.75]), - ("__floordiv__", [1, 1, 0]), - ("__pow__", [1, 4, 81]), - ("__mod__", [0, 0, 3]), + ("__eq__", [True, True, False], "Bool"), + ("__ne__", [False, False, True], "Bool"), + ("__ge__", [True, True, False], "Bool"), + ("__gt__", [False, False, False], "Bool"), + ("__le__", [True, True, True], "Bool"), + ("__lt__", [False, False, True], "Bool"), + ("__add__", [2, 4, 7], "Int64"), + ("__sub__", [0, 0, -1], "Int64"), + ("__mul__", [1, 4, 12], "Int64"), + ("__truediv__", [1, 1, 0.75], "Float64"), + ("__floordiv__", [1, 1, 0], "Int64"), + ("__pow__", [1, 4, 81], "Int64"), + ("__mod__", [0, 0, 3], "Int64"), ], ) def test_column_comparisons( library: str, comparison: str, expected_data: list[object], + expected_dtype: str, ) -> None: ser: Any - df = integer_dataframe_7(library).persist() + df = integer_dataframe_7(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign(getattr(ser, comparison)(other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series(expected_data, name="result") - if library in ("polars", "polars-lazy") and comparison == "__pow__": + expected_pdx_dtype = getattr(pdx, expected_dtype) + if comparison == "__pow__" and library in ("polars", "polars-lazy"): # TODO - result_pd = result_pd.astype("int64") - pd.testing.assert_series_equal(result_pd, expected) + result = result.cast({"result": pdx.Int64()}) + expected_pdx_dtype = pdx.Int64 + compare_column_with_reference(result.col("result"), expected_data, expected_pdx_dtype) @pytest.mark.parametrize( - ("comparison", "expected_data"), + ("comparison", "expected_data", "expected_dtype"), [ - ("__eq__", [False, False, True]), - ("__ne__", [True, True, False]), - ("__ge__", [False, False, True]), - ("__gt__", [False, False, False]), - ("__le__", [True, True, True]), - ("__lt__", [True, True, False]), - ("__add__", [4, 5, 6]), - ("__sub__", [-2, -1, 0]), - ("__mul__", [3, 6, 9]), - ("__truediv__", [1 / 3, 2 / 3, 1]), - ("__floordiv__", [0, 0, 1]), - ("__pow__", [1, 8, 27]), - ("__mod__", [1, 2, 0]), + ("__eq__", [False, False, True], "Bool"), + ("__ne__", [True, True, False], "Bool"), + ("__ge__", [False, False, True], "Bool"), + ("__gt__", [False, False, False], "Bool"), + ("__le__", [True, True, True], "Bool"), + ("__lt__", [True, True, False], "Bool"), + ("__add__", [4, 5, 6], "Int64"), + ("__sub__", [-2, -1, 0], "Int64"), + ("__mul__", [3, 6, 9], "Int64"), + ("__truediv__", [1 / 3, 2 / 3, 1], "Float64"), + ("__floordiv__", [0, 0, 1], "Int64"), + ("__pow__", [1, 8, 27], "Int64"), + ("__mod__", [1, 2, 0], "Int64"), ], ) def test_column_comparisons_scalar( library: str, comparison: str, expected_data: list[object], + expected_dtype: str, ) -> None: ser: Any - df = integer_dataframe_1(library).persist() + df = integer_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = 3 result = df.assign(getattr(ser, comparison)(other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series(expected_data, name="result") + expected_pdx_dtype = getattr(pdx, expected_dtype) if comparison == "__pow__" and library in ("polars", "polars-lazy"): - result_pd = result_pd.astype("int64") - pd.testing.assert_series_equal(result_pd, expected) + result = result.cast({"result": pdx.Int64()}) + expected_pdx_dtype = pdx.Int64 + compare_column_with_reference(result.col("result"), expected_data, expected_pdx_dtype) @pytest.mark.parametrize( @@ -96,10 +99,9 @@ def test_right_column_comparisons( ) -> None: # 1,2,3 ser: Any - df = integer_dataframe_7(library).persist() + df = integer_dataframe_7(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = 2 result = df.assign(getattr(ser, comparison)(other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series(expected_data, name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), expected_data, pdx.Int64) diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py index dd6082c2..d2532ff4 100644 --- a/tests/column/cumulative_test.py +++ b/tests/column/cumulative_test.py @@ -3,8 +3,8 @@ import pandas as pd import pytest +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas @pytest.mark.parametrize( @@ -21,17 +21,17 @@ def test_cumulative_functions_column( func: str, expected_data: list[float], ) -> None: - df = integer_dataframe_1(library).persist() + df = integer_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") expected = pd.Series(expected_data, name="result") result = df.assign(getattr(ser, func)().rename("result")) - result_pd = interchange_to_pandas(result)["result"] if ( tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0) and library == "pandas-nullable" ): # pragma: no cover # Upstream bug - result_pd = result_pd.astype("int64") + result = result.cast({"result": pdx.Int64()}) - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), expected, pdx.Int64) From 1ed8fbf0fef3c8c783ed79c778cf2f1ac4cdb908 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 6 Jan 2024 01:43:15 +0100 Subject: [PATCH 07/12] use new functions for more files [part2] Signed-off-by: Anatoly Myachev --- .../pandas_standard/column_object.py | 1 + tests/column/divmod_test.py | 24 +++------- tests/column/fill_nan_test.py | 21 ++++----- tests/column/get_rows_by_mask_test.py | 8 ++-- tests/column/get_rows_test.py | 9 ++-- tests/column/invert_test.py | 17 +++---- tests/column/is_in_test.py | 15 +++--- tests/column/is_nan_test.py | 11 ++--- tests/column/is_null_test.py | 21 ++++----- tests/column/pow_test.py | 46 ++++++++++--------- tests/column/reductions_test.py | 31 ++++++------- 11 files changed, 87 insertions(+), 117 deletions(-) diff --git a/dataframe_api_compat/pandas_standard/column_object.py b/dataframe_api_compat/pandas_standard/column_object.py index aff0c393..1bbe408e 100644 --- a/dataframe_api_compat/pandas_standard/column_object.py +++ b/dataframe_api_compat/pandas_standard/column_object.py @@ -35,6 +35,7 @@ "UInt16": "uint16", "UInt8": "uint8", "boolean": "bool", + "Float64": "float64", } diff --git a/tests/column/divmod_test.py b/tests/column/divmod_test.py index 5f0a7653..810d5fb6 100644 --- a/tests/column/divmod_test.py +++ b/tests/column/divmod_test.py @@ -1,41 +1,31 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_expression_divmod(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result_quotient, result_remainder = ser.__divmod__(other) # quotient result = df.assign(result_quotient.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected_quotient = pd.Series([0, 0, 0], name="result") - pd.testing.assert_series_equal(result_pd, expected_quotient) + compare_column_with_reference(result.col("result"), [0, 0, 0], pdx.Int64) # remainder result = df.assign(result_remainder.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected_remainder = pd.Series([1, 2, 3], name="result") - pd.testing.assert_series_equal(result_pd, expected_remainder) + compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) def test_expression_divmod_with_scalar(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") result_quotient, result_remainder = ser.__divmod__(2) # quotient result = df.assign(result_quotient.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected_quotient = pd.Series([0, 1, 1], name="result") - pd.testing.assert_series_equal(result_pd, expected_quotient) + compare_column_with_reference(result.col("result"), [0, 1, 1], pdx.Int64) # remainder result = df.assign(result_remainder.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected_remainder = pd.Series([1, 0, 1], name="result") - pd.testing.assert_series_equal(result_pd, expected_remainder) + compare_column_with_reference(result.col("result"), [1, 0, 1], pdx.Int64) diff --git a/tests/column/fill_nan_test.py b/tests/column/fill_nan_test.py index de923dea..d7f75a77 100644 --- a/tests/column/fill_nan_test.py +++ b/tests/column/fill_nan_test.py @@ -1,27 +1,22 @@ from __future__ import annotations -import pandas as pd - -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference from tests.utils import nan_dataframe_1 def test_column_fill_nan(library: str) -> None: # TODO: test with nullable pandas, check null isn't filled - df = nan_dataframe_1(library).persist() + df = nan_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.fill_nan(-1.0).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([1.0, 2.0, -1.0], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [1.0, 2.0, -1.0], pdx.Float64) def test_column_fill_nan_with_null(library: str) -> None: # TODO: test with nullable pandas, check null isn't filled - df = nan_dataframe_1(library).persist() - ns = df.__dataframe_namespace__() + df = nan_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") - result = df.assign(ser.fill_nan(ns.null).is_null().rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([False, False, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + result = df.assign(ser.fill_nan(pdx.null).is_null().rename("result")) + compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) diff --git a/tests/column/get_rows_by_mask_test.py b/tests/column/get_rows_by_mask_test.py index e22608f0..a69da19e 100644 --- a/tests/column/get_rows_by_mask_test.py +++ b/tests/column/get_rows_by_mask_test.py @@ -2,8 +2,8 @@ import pandas as pd +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_column_filter(library: str) -> None: @@ -18,11 +18,9 @@ def test_column_filter(library: str) -> None: def test_column_take_by_mask_noop(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") mask = ser > 0 ser = ser.filter(mask) result = df.assign(ser.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([1, 2, 3], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) diff --git a/tests/column/get_rows_test.py b/tests/column/get_rows_test.py index 10d7a357..b2d0f763 100644 --- a/tests/column/get_rows_test.py +++ b/tests/column/get_rows_test.py @@ -1,16 +1,13 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_expression_take(library: str) -> None: df = integer_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") indices = df.col("a") - 1 result = df.assign(ser.take(indices).rename("result")).select("result") - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([1, 2, 3], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) diff --git a/tests/column/invert_test.py b/tests/column/invert_test.py index 3a28c4c3..617a8099 100644 --- a/tests/column/invert_test.py +++ b/tests/column/invert_test.py @@ -1,25 +1,20 @@ from __future__ import annotations -import pandas as pd - from tests.utils import bool_dataframe_1 -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference def test_expression_invert(library: str) -> None: df = bool_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([False, False, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) def test_column_invert(library: str) -> None: - df = bool_dataframe_1(library).persist() + df = bool_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([False, False, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) diff --git a/tests/column/is_in_test.py b/tests/column/is_in_test.py index b129c673..0d08d37d 100644 --- a/tests/column/is_in_test.py +++ b/tests/column/is_in_test.py @@ -3,13 +3,12 @@ from typing import TYPE_CHECKING from typing import Any -import pandas as pd import pytest +from tests.utils import compare_column_with_reference from tests.utils import float_dataframe_1 from tests.utils import float_dataframe_2 from tests.utils import float_dataframe_3 -from tests.utils import interchange_to_pandas if TYPE_CHECKING: from collections.abc import Callable @@ -29,13 +28,12 @@ def test_is_in( df_factory: Callable[[str], Any], expected_values: list[bool], ) -> None: - df = df_factory(library).persist() + df = df_factory(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series(expected_values, name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), expected_values, pdx.Bool) @pytest.mark.parametrize( @@ -53,10 +51,9 @@ def test_expr_is_in( expected_values: list[bool], ) -> None: df = df_factory(library) + pdx = df.__dataframe_namespace__() col = df.col ser = col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series(expected_values, name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), expected_values, pdx.Bool) diff --git a/tests/column/is_nan_test.py b/tests/column/is_nan_test.py index 7c33e5cb..735ea2e3 100644 --- a/tests/column/is_nan_test.py +++ b/tests/column/is_nan_test.py @@ -1,15 +1,12 @@ from __future__ import annotations -import pandas as pd - -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference from tests.utils import nan_dataframe_1 def test_column_is_nan(library: str) -> None: - df = nan_dataframe_1(library).persist() + df = nan_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_nan().rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([False, False, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) diff --git a/tests/column/is_null_test.py b/tests/column/is_null_test.py index 416350a9..ee3afe52 100644 --- a/tests/column/is_null_test.py +++ b/tests/column/is_null_test.py @@ -1,28 +1,25 @@ from __future__ import annotations -import pandas as pd - -from tests.utils import interchange_to_pandas +from tests.utils import compare_column_with_reference from tests.utils import nan_dataframe_1 from tests.utils import null_dataframe_1 def test_column_is_null_1(library: str) -> None: - df = nan_dataframe_1(library).persist() + df = nan_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_null().rename("result")) - result_pd = interchange_to_pandas(result)["result"] if library == "pandas-numpy": - expected = pd.Series([False, False, True], name="result") + expected = [False, False, True] else: - expected = pd.Series([False, False, False], name="result") - pd.testing.assert_series_equal(result_pd, expected) + expected = [False, False, False] + compare_column_with_reference(result.col("result"), expected, pdx.Bool) def test_column_is_null_2(library: str) -> None: - df = null_dataframe_1(library).persist() + df = null_dataframe_1(library) + pdx = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_null().rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([False, False, True], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) diff --git a/tests/column/pow_test.py b/tests/column/pow_test.py index ab12dad5..d15261e8 100644 --- a/tests/column/pow_test.py +++ b/tests/column/pow_test.py @@ -1,56 +1,60 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_dataframe_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_float_powers_column(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") * 1.0 result = df.assign(ser.__pow__(other).rename("result")) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( + compare_dataframe_with_reference( + result, {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]}, + {"a": pdx.Int64, "b": pdx.Int64, "result": pdx.Float64}, ) - pd.testing.assert_frame_equal(result_pd, expected) def test_float_powers_scalar_column(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") other = 1.0 result = df.assign(ser.__pow__(other).rename("result")) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]}) - pd.testing.assert_frame_equal(result_pd, expected) + compare_dataframe_with_reference( + result, + {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]}, + {"a": pdx.Int64, "b": pdx.Int64, "result": pdx.Float64}, + ) def test_int_powers_column(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") * 1 result = df.assign(ser.__pow__(other).rename("result")) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]}) if library in ("polars", "polars-lazy"): - result_pd = result_pd.astype("int64") - pd.testing.assert_frame_equal(result_pd, expected) + result = result.cast({name: pdx.Int64() for name in ("a", "b", "result")}) + compare_dataframe_with_reference( + result, + {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]}, + {name: pdx.Int64 for name in ("a", "b", "result")}, + ) def test_int_powers_scalar_column(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") other = 1 result = df.assign(ser.__pow__(other).rename("result")) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]}) if library in ("polars", "polars-lazy"): - result_pd = result_pd.astype("int64") - pd.testing.assert_frame_equal(result_pd, expected) + result = result.cast({name: pdx.Int64() for name in ("a", "b", "result")}) + compare_dataframe_with_reference( + result, + {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]}, + {name: pdx.Int64 for name in ("a", "b", "result")}, + ) diff --git a/tests/column/reductions_test.py b/tests/column/reductions_test.py index d9ba4d4a..3ed235cf 100644 --- a/tests/column/reductions_test.py +++ b/tests/column/reductions_test.py @@ -1,36 +1,35 @@ from __future__ import annotations -import pandas as pd import pytest +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas @pytest.mark.parametrize( - ("reduction", "expected"), + ("reduction", "expected", "expected_dtype"), [ - ("min", 1), - ("max", 3), - ("sum", 6), - ("prod", 6), - ("median", 2.0), - ("mean", 2.0), - ("std", 1.0), - ("var", 1.0), + ("min", 1, "Int64"), + ("max", 3, "Int64"), + ("sum", 6, "Int64"), + ("prod", 6, "Int64"), + ("median", 2.0, "Float64"), + ("mean", 2.0, "Float64"), + ("std", 1.0, "Float64"), + ("var", 1.0, "Float64"), ], ) def test_expression_reductions( library: str, reduction: str, expected: float, + expected_dtype: str, ) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + pdx = df.__dataframe_namespace__() ser = df.col("a") ser = ser - getattr(ser, reduction)() result = df.assign(ser.rename("result")) - result_pd = interchange_to_pandas(result)["result"] - ser_pd = interchange_to_pandas(df)["a"].rename("result") - expected_pd = ser_pd - expected - pd.testing.assert_series_equal(result_pd, expected_pd) + reference = list((df.col("a") - expected).persist().to_array()) + expected_pdx_dtype = getattr(pdx, expected_dtype) + compare_column_with_reference(result.col("result"), reference, expected_pdx_dtype) From 1658fa886f76a616f46fcc5e1294f187c139fb7d Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 7 Jan 2024 00:53:55 +0100 Subject: [PATCH 08/12] use new functions for more files [final part for column tests] Signed-off-by: Anatoly Myachev --- tests/column/and_or_test.py | 16 ++--- tests/column/cast_test.py | 6 +- tests/column/col_sorted_indices_test.py | 16 ++--- tests/column/comparisons_test.py | 24 ++++---- tests/column/cumulative_test.py | 6 +- tests/column/divmod_test.py | 12 ++-- tests/column/fill_nan_test.py | 10 ++-- tests/column/get_rows_by_mask_test.py | 4 +- tests/column/get_rows_test.py | 4 +- tests/column/invert_test.py | 8 +-- tests/column/is_in_test.py | 8 +-- tests/column/is_nan_test.py | 4 +- tests/column/is_null_test.py | 8 +-- tests/column/pow_test.py | 20 +++---- tests/column/reductions_test.py | 6 +- tests/column/shift_test.py | 34 +++++------ tests/column/sort_test.py | 74 ++++++++++-------------- tests/column/statistics_test.py | 10 +--- tests/column/temporal/components_test.py | 24 ++++---- tests/column/temporal/filter_test.py | 6 +- tests/column/temporal/floor_test.py | 4 +- tests/utils.py | 12 ++-- 22 files changed, 150 insertions(+), 166 deletions(-) diff --git a/tests/column/and_or_test.py b/tests/column/and_or_test.py index 2919ed2b..2fe5846c 100644 --- a/tests/column/and_or_test.py +++ b/tests/column/and_or_test.py @@ -6,39 +6,39 @@ def test_column_and(library: str) -> None: df = bool_dataframe_1(library, api_version="2023.09-beta") - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign((ser & other).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) def test_column_or(library: str) -> None: df = bool_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign((ser | other).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) def test_column_and_with_scalar(library: str) -> None: df = bool_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = True result = df.assign((other & ser).rename("result")) expected = [True, True, False] - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) def test_column_or_with_scalar(library: str) -> None: df = bool_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = True result = df.assign((other | ser).rename("result")) expected = [True, True, True] - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Bool) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) diff --git a/tests/column/cast_test.py b/tests/column/cast_test.py index 3d942a4e..72b32b5f 100644 --- a/tests/column/cast_test.py +++ b/tests/column/cast_test.py @@ -4,10 +4,10 @@ def test_cast_integers(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() - result = df.assign(df.col("a").cast(pdx.Int32())) + ns = df.__dataframe_namespace__() + result = df.assign(df.col("a").cast(ns.Int32())) compare_dataframe_with_reference( result, {"a": [1, 2, 3], "b": [4, 5, 6]}, - dtype={"a": pdx.Int32, "b": pdx.Int64}, + dtype={"a": ns.Int32, "b": ns.Int64}, ) diff --git a/tests/column/col_sorted_indices_test.py b/tests/column/col_sorted_indices_test.py index c371ca1a..fb8c1e42 100644 --- a/tests/column/col_sorted_indices_test.py +++ b/tests/column/col_sorted_indices_test.py @@ -6,49 +6,49 @@ def test_expression_sorted_indices_ascending(library: str) -> None: df = integer_dataframe_6(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() col = df.col sorted_indices = col("b").sorted_indices() result = df.take(sorted_indices) compare_dataframe_with_reference( result, {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, - dtype=pdx.Int64, + dtype=ns.Int64, ) def test_expression_sorted_indices_descending(library: str) -> None: df = integer_dataframe_6(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() col = df.col sorted_indices = col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) compare_dataframe_with_reference( result, {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, - dtype=pdx.Int64, + dtype=ns.Int64, ) def test_column_sorted_indices_ascending(library: str) -> None: df = integer_dataframe_6(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices() result = df.take(sorted_indices) compare_dataframe_with_reference( result, {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, - dtype=pdx.Int64, + dtype=ns.Int64, ) def test_column_sorted_indices_descending(library: str) -> None: df = integer_dataframe_6(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) compare_dataframe_with_reference( result, {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, - dtype=pdx.Int64, + dtype=ns.Int64, ) diff --git a/tests/column/comparisons_test.py b/tests/column/comparisons_test.py index 89fdae83..266b9839 100644 --- a/tests/column/comparisons_test.py +++ b/tests/column/comparisons_test.py @@ -35,16 +35,16 @@ def test_column_comparisons( ) -> None: ser: Any df = integer_dataframe_7(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result = df.assign(getattr(ser, comparison)(other).rename("result")) - expected_pdx_dtype = getattr(pdx, expected_dtype) + expected_ns_dtype = getattr(ns, expected_dtype) if comparison == "__pow__" and library in ("polars", "polars-lazy"): # TODO - result = result.cast({"result": pdx.Int64()}) - expected_pdx_dtype = pdx.Int64 - compare_column_with_reference(result.col("result"), expected_data, expected_pdx_dtype) + result = result.cast({"result": ns.Int64()}) + expected_ns_dtype = ns.Int64 + compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype) @pytest.mark.parametrize( @@ -73,15 +73,15 @@ def test_column_comparisons_scalar( ) -> None: ser: Any df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = 3 result = df.assign(getattr(ser, comparison)(other).rename("result")) - expected_pdx_dtype = getattr(pdx, expected_dtype) + expected_ns_dtype = getattr(ns, expected_dtype) if comparison == "__pow__" and library in ("polars", "polars-lazy"): - result = result.cast({"result": pdx.Int64()}) - expected_pdx_dtype = pdx.Int64 - compare_column_with_reference(result.col("result"), expected_data, expected_pdx_dtype) + result = result.cast({"result": ns.Int64()}) + expected_ns_dtype = ns.Int64 + compare_column_with_reference(result.col("result"), expected_data, expected_ns_dtype) @pytest.mark.parametrize( @@ -100,8 +100,8 @@ def test_right_column_comparisons( # 1,2,3 ser: Any df = integer_dataframe_7(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = 2 result = df.assign(getattr(ser, comparison)(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_data, pdx.Int64) + compare_column_with_reference(result.col("result"), expected_data, ns.Int64) diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py index d2532ff4..9393dffc 100644 --- a/tests/column/cumulative_test.py +++ b/tests/column/cumulative_test.py @@ -22,7 +22,7 @@ def test_cumulative_functions_column( expected_data: list[float], ) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") expected = pd.Series(expected_data, name="result") result = df.assign(getattr(ser, func)().rename("result")) @@ -32,6 +32,6 @@ def test_cumulative_functions_column( and library == "pandas-nullable" ): # pragma: no cover # Upstream bug - result = result.cast({"result": pdx.Int64()}) + result = result.cast({"result": ns.Int64()}) - compare_column_with_reference(result.col("result"), expected, pdx.Int64) + compare_column_with_reference(result.col("result"), expected, ns.Int64) diff --git a/tests/column/divmod_test.py b/tests/column/divmod_test.py index 810d5fb6..26b2a222 100644 --- a/tests/column/divmod_test.py +++ b/tests/column/divmod_test.py @@ -6,26 +6,26 @@ def test_expression_divmod(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") result_quotient, result_remainder = ser.__divmod__(other) # quotient result = df.assign(result_quotient.rename("result")) - compare_column_with_reference(result.col("result"), [0, 0, 0], pdx.Int64) + compare_column_with_reference(result.col("result"), [0, 0, 0], ns.Int64) # remainder result = df.assign(result_remainder.rename("result")) - compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) def test_expression_divmod_with_scalar(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result_quotient, result_remainder = ser.__divmod__(2) # quotient result = df.assign(result_quotient.rename("result")) - compare_column_with_reference(result.col("result"), [0, 1, 1], pdx.Int64) + compare_column_with_reference(result.col("result"), [0, 1, 1], ns.Int64) # remainder result = df.assign(result_remainder.rename("result")) - compare_column_with_reference(result.col("result"), [1, 0, 1], pdx.Int64) + compare_column_with_reference(result.col("result"), [1, 0, 1], ns.Int64) diff --git a/tests/column/fill_nan_test.py b/tests/column/fill_nan_test.py index d7f75a77..26901301 100644 --- a/tests/column/fill_nan_test.py +++ b/tests/column/fill_nan_test.py @@ -7,16 +7,16 @@ def test_column_fill_nan(library: str) -> None: # TODO: test with nullable pandas, check null isn't filled df = nan_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.fill_nan(-1.0).rename("result")) - compare_column_with_reference(result.col("result"), [1.0, 2.0, -1.0], pdx.Float64) + compare_column_with_reference(result.col("result"), [1.0, 2.0, -1.0], ns.Float64) def test_column_fill_nan_with_null(library: str) -> None: # TODO: test with nullable pandas, check null isn't filled df = nan_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") - result = df.assign(ser.fill_nan(pdx.null).is_null().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) + result = df.assign(ser.fill_nan(ns.null).is_null().rename("result")) + compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) diff --git a/tests/column/get_rows_by_mask_test.py b/tests/column/get_rows_by_mask_test.py index a69da19e..95617e4d 100644 --- a/tests/column/get_rows_by_mask_test.py +++ b/tests/column/get_rows_by_mask_test.py @@ -18,9 +18,9 @@ def test_column_filter(library: str) -> None: def test_column_take_by_mask_noop(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") mask = ser > 0 ser = ser.filter(mask) result = df.assign(ser.rename("result")) - compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) diff --git a/tests/column/get_rows_test.py b/tests/column/get_rows_test.py index b2d0f763..390ea33c 100644 --- a/tests/column/get_rows_test.py +++ b/tests/column/get_rows_test.py @@ -6,8 +6,8 @@ def test_expression_take(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") indices = df.col("a") - 1 result = df.assign(ser.take(indices).rename("result")).select("result") - compare_column_with_reference(result.col("result"), [1, 2, 3], pdx.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) diff --git a/tests/column/invert_test.py b/tests/column/invert_test.py index 617a8099..a4e6d4db 100644 --- a/tests/column/invert_test.py +++ b/tests/column/invert_test.py @@ -6,15 +6,15 @@ def test_expression_invert(library: str) -> None: df = bool_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) + compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) def test_column_invert(library: str) -> None: df = bool_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) + compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) diff --git a/tests/column/is_in_test.py b/tests/column/is_in_test.py index 0d08d37d..d921873e 100644 --- a/tests/column/is_in_test.py +++ b/tests/column/is_in_test.py @@ -29,11 +29,11 @@ def test_is_in( expected_values: list[bool], ) -> None: df = df_factory(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_values, pdx.Bool) + compare_column_with_reference(result.col("result"), expected_values, ns.Bool) @pytest.mark.parametrize( @@ -51,9 +51,9 @@ def test_expr_is_in( expected_values: list[bool], ) -> None: df = df_factory(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() col = df.col ser = col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_values, pdx.Bool) + compare_column_with_reference(result.col("result"), expected_values, ns.Bool) diff --git a/tests/column/is_nan_test.py b/tests/column/is_nan_test.py index 735ea2e3..d1a7b531 100644 --- a/tests/column/is_nan_test.py +++ b/tests/column/is_nan_test.py @@ -6,7 +6,7 @@ def test_column_is_nan(library: str) -> None: df = nan_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_nan().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) + compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) diff --git a/tests/column/is_null_test.py b/tests/column/is_null_test.py index ee3afe52..00f65db3 100644 --- a/tests/column/is_null_test.py +++ b/tests/column/is_null_test.py @@ -7,19 +7,19 @@ def test_column_is_null_1(library: str) -> None: df = nan_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_null().rename("result")) if library == "pandas-numpy": expected = [False, False, True] else: expected = [False, False, False] - compare_column_with_reference(result.col("result"), expected, pdx.Bool) + compare_column_with_reference(result.col("result"), expected, ns.Bool) def test_column_is_null_2(library: str) -> None: df = null_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_null().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], pdx.Bool) + compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) diff --git a/tests/column/pow_test.py b/tests/column/pow_test.py index d15261e8..dec87874 100644 --- a/tests/column/pow_test.py +++ b/tests/column/pow_test.py @@ -6,55 +6,55 @@ def test_float_powers_column(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") * 1.0 result = df.assign(ser.__pow__(other).rename("result")) compare_dataframe_with_reference( result, {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]}, - {"a": pdx.Int64, "b": pdx.Int64, "result": pdx.Float64}, + {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64}, ) def test_float_powers_scalar_column(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = 1.0 result = df.assign(ser.__pow__(other).rename("result")) compare_dataframe_with_reference( result, {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]}, - {"a": pdx.Int64, "b": pdx.Int64, "result": pdx.Float64}, + {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64}, ) def test_int_powers_column(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = df.col("b") * 1 result = df.assign(ser.__pow__(other).rename("result")) if library in ("polars", "polars-lazy"): - result = result.cast({name: pdx.Int64() for name in ("a", "b", "result")}) + result = result.cast({name: ns.Int64() for name in ("a", "b", "result")}) compare_dataframe_with_reference( result, {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]}, - {name: pdx.Int64 for name in ("a", "b", "result")}, + {name: ns.Int64 for name in ("a", "b", "result")}, ) def test_int_powers_scalar_column(library: str) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") other = 1 result = df.assign(ser.__pow__(other).rename("result")) if library in ("polars", "polars-lazy"): - result = result.cast({name: pdx.Int64() for name in ("a", "b", "result")}) + result = result.cast({name: ns.Int64() for name in ("a", "b", "result")}) compare_dataframe_with_reference( result, {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]}, - {name: pdx.Int64 for name in ("a", "b", "result")}, + {name: ns.Int64 for name in ("a", "b", "result")}, ) diff --git a/tests/column/reductions_test.py b/tests/column/reductions_test.py index 3ed235cf..25d85d8b 100644 --- a/tests/column/reductions_test.py +++ b/tests/column/reductions_test.py @@ -26,10 +26,10 @@ def test_expression_reductions( expected_dtype: str, ) -> None: df = integer_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() ser = df.col("a") ser = ser - getattr(ser, reduction)() result = df.assign(ser.rename("result")) reference = list((df.col("a") - expected).persist().to_array()) - expected_pdx_dtype = getattr(pdx, expected_dtype) - compare_column_with_reference(result.col("result"), reference, expected_pdx_dtype) + expected_ns_dtype = getattr(ns, expected_dtype) + compare_column_with_reference(result.col("result"), reference, expected_ns_dtype) diff --git a/tests/column/shift_test.py b/tests/column/shift_test.py index 1f09b8b6..2f1f004c 100644 --- a/tests/column/shift_test.py +++ b/tests/column/shift_test.py @@ -2,24 +2,22 @@ import polars as pl from polars.testing import assert_frame_equal +from tests.utils import compare_dataframe_with_reference from tests.utils import float_dataframe_1 from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_shift_with_fill_value(library: str) -> None: df = integer_dataframe_1(library) + ns = df.__dataframe_namespace__() result = df.assign(df.col("a").shift(1).fill_null(999)) - expected = pd.DataFrame( - { - "a": [999, 1, 2], - "b": [4, 5, 6], - }, - ) - result_pd = interchange_to_pandas(result) if library == "pandas-numpy": - result_pd = result_pd.astype("int64") - pd.testing.assert_frame_equal(result_pd, expected) + result = result.cast({name: ns.Int64() for name in ("a", "b")}) + compare_dataframe_with_reference( + result, + {"a": [999, 1, 2], "b": [4, 5, 6]}, + ns.Int64, + ) def test_shift_without_fill_value(library: str) -> None: @@ -41,14 +39,12 @@ def test_shift_without_fill_value(library: str) -> None: def test_shift_with_fill_value_complicated(library: str) -> None: df = integer_dataframe_1(library) + ns = df.__dataframe_namespace__() result = df.assign(df.col("a").shift(1).fill_null(df.col("a").mean())) - expected = pd.DataFrame( - { - "a": [2.0, 1, 2], - "b": [4, 5, 6], - }, - ) - result_pd = interchange_to_pandas(result) if library == "pandas-nullable": - result_pd = result_pd.astype({"a": "float64"}) - pd.testing.assert_frame_equal(result_pd, expected) + result = result.cast({"a": ns.Float64()}) + compare_dataframe_with_reference( + result, + {"a": [2.0, 1, 2], "b": [4, 5, 6]}, + {"a": ns.Float64, "b": ns.Int64}, + ) diff --git a/tests/column/sort_test.py b/tests/column/sort_test.py index 74c40b92..b7524c42 100644 --- a/tests/column/sort_test.py +++ b/tests/column/sort_test.py @@ -1,68 +1,56 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_dataframe_with_reference from tests.utils import integer_dataframe_6 -from tests.utils import interchange_to_pandas def test_expression_sort_ascending(library: str) -> None: df = integer_dataframe_6(library, api_version="2023.09-beta") - df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() s_sorted = df.col("b").sort().rename("c") result = df.assign(s_sorted) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 1, 2], - "c": [1, 2, 3, 4, 4], - }, - ) - pd.testing.assert_frame_equal(result_pd, expected) + expected = { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [1, 2, 3, 4, 4], + } + compare_dataframe_with_reference(result, expected, ns.Int64) def test_expression_sort_descending(library: str) -> None: df = integer_dataframe_6(library, api_version="2023.09-beta") - df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() s_sorted = df.col("b").sort(ascending=False).rename("c") result = df.assign(s_sorted) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 1, 2], - "c": [4, 4, 3, 2, 1], - }, - ) - pd.testing.assert_frame_equal(result_pd, expected) + expected = { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [4, 4, 3, 2, 1], + } + compare_dataframe_with_reference(result, expected, ns.Int64) def test_column_sort_ascending(library: str) -> None: - df = integer_dataframe_6(library, api_version="2023.09-beta").persist() + df = integer_dataframe_6(library, api_version="2023.09-beta") + ns = df.__dataframe_namespace__() s_sorted = df.col("b").sort().rename("c") result = df.assign(s_sorted) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 1, 2], - "c": [1, 2, 3, 4, 4], - }, - ) - pd.testing.assert_frame_equal(result_pd, expected) + expected = { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [1, 2, 3, 4, 4], + } + compare_dataframe_with_reference(result, expected, ns.Int64) def test_column_sort_descending(library: str) -> None: - df = integer_dataframe_6(library, api_version="2023.09-beta").persist() + df = integer_dataframe_6(library, api_version="2023.09-beta") + ns = df.__dataframe_namespace__() s_sorted = df.col("b").sort(ascending=False).rename("c") result = df.assign(s_sorted) - result_pd = interchange_to_pandas(result) - expected = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [4, 4, 3, 1, 2], - "c": [4, 4, 3, 2, 1], - }, - ) - pd.testing.assert_frame_equal(result_pd, expected) + expected = { + "a": [1, 1, 1, 2, 2], + "b": [4, 4, 3, 1, 2], + "c": [4, 4, 3, 2, 1], + } + compare_dataframe_with_reference(result, expected, ns.Int64) diff --git a/tests/column/statistics_test.py b/tests/column/statistics_test.py index cdaac140..b444e604 100644 --- a/tests/column/statistics_test.py +++ b/tests/column/statistics_test.py @@ -1,15 +1,11 @@ from __future__ import annotations -import pandas as pd - +from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 -from tests.utils import interchange_to_pandas def test_mean(library: str) -> None: df = integer_dataframe_1(library) - df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() result = df.assign((df.col("a") - df.col("a").mean()).rename("result")) - result_pd = interchange_to_pandas(result)["result"] - expected = pd.Series([-1, 0, 1.0], name="result") - pd.testing.assert_series_equal(result_pd, expected) + compare_column_with_reference(result.col("result"), [-1, 0, 1.0], ns.Float64) diff --git a/tests/column/temporal/components_test.py b/tests/column/temporal/components_test.py index 6259265b..f0fb3bd6 100644 --- a/tests/column/temporal/components_test.py +++ b/tests/column/temporal/components_test.py @@ -23,16 +23,16 @@ ) def test_col_components(library: str, attr: str, expected: list[int]) -> None: df = temporal_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() for col_name in ("a", "c", "e"): result = ( df.assign(getattr(df.col(col_name), attr)().rename("result")) .select( "result", ) - .cast({"result": pdx.Int64()}) + .cast({"result": ns.Int64()}) ) - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64) @pytest.mark.parametrize( @@ -45,15 +45,15 @@ def test_col_components(library: str, attr: str, expected: list[int]) -> None: ) def test_col_microsecond(library: str, col_name: str, expected: list[int]) -> None: df = temporal_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() result = ( df.assign(df.col(col_name).microsecond().rename("result")) .select( "result", ) - .cast({"result": pdx.Int64()}) + .cast({"result": ns.Int64()}) ) - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64) @pytest.mark.parametrize( @@ -66,15 +66,15 @@ def test_col_microsecond(library: str, col_name: str, expected: list[int]) -> No ) def test_col_nanosecond(library: str, col_name: str, expected: list[int]) -> None: df = temporal_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() result = ( df.assign(df.col(col_name).nanosecond().rename("result")) # type: ignore[attr-defined] .select( "result", ) - .cast({"result": pdx.Int64()}) + .cast({"result": ns.Int64()}) ) - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64) @pytest.mark.parametrize( @@ -92,7 +92,7 @@ def test_col_unix_timestamp_time_units( expected: list[int], ) -> None: df = temporal_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() result = ( df.assign( df.col("e").unix_timestamp(time_unit=time_unit).rename("result"), @@ -100,6 +100,6 @@ def test_col_unix_timestamp_time_units( .select( "result", ) - .cast({"result": pdx.Int64()}) + .cast({"result": ns.Int64()}) ) - compare_column_with_reference(result.col("result"), expected, dtype=pdx.Int64) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64) diff --git a/tests/column/temporal/filter_test.py b/tests/column/temporal/filter_test.py index 5e4e4f31..ea872bfe 100644 --- a/tests/column/temporal/filter_test.py +++ b/tests/column/temporal/filter_test.py @@ -4,6 +4,6 @@ def test_filter_w_date(library: str) -> None: df = temporal_dataframe_1(library).select("a", "index") - pdx = df.__dataframe_namespace__() - result = df.filter(df.col("a") > pdx.date(2020, 1, 2)).select("index") - compare_dataframe_with_reference(result, {"index": [1, 2]}, pdx.Int64) + ns = df.__dataframe_namespace__() + result = df.filter(df.col("a") > ns.date(2020, 1, 2)).select("index") + compare_dataframe_with_reference(result, {"index": [1, 2]}, ns.Int64) diff --git a/tests/column/temporal/floor_test.py b/tests/column/temporal/floor_test.py index d8773e07..57287033 100644 --- a/tests/column/temporal/floor_test.py +++ b/tests/column/temporal/floor_test.py @@ -16,8 +16,8 @@ ) def test_floor(library: str, freq: str, expected: list[datetime]) -> None: df = temporal_dataframe_1(library) - pdx = df.__dataframe_namespace__() + ns = df.__dataframe_namespace__() col = df.col result = df.assign(col("a").floor(freq).rename("result")).select("result") # type: ignore[attr-defined] # TODO check the resolution - compare_column_with_reference(result.col("result"), expected, pdx.Datetime) + compare_column_with_reference(result.col("result"), expected, ns.Datetime) diff --git a/tests/utils.py b/tests/utils.py index ba271c91..712d67bf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -491,13 +491,15 @@ def compare_column_with_reference( ) -> None: column = column.persist() col_len = column.len().scalar - assert col_len == len(reference) + assert col_len == len(reference), f"column length: {col_len} != {len(reference)}" assert isinstance( column.dtype, dtype, - ), f"{column.dtype=} isn't a instance of {dtype=}" + ), f"column dtype: {column.dtype} isn't a instance of {dtype}" for idx in range(col_len): - assert reference[idx] == column.get_value(idx).scalar + assert ( + reference[idx] == column.get_value(idx).scalar + ), f"{reference[idx]} != {column.get_value(idx).scalar}" def compare_dataframe_with_reference( @@ -505,7 +507,9 @@ def compare_dataframe_with_reference( reference: Mapping[str, list[Any]], dtype: Any | Mapping[str, Any], ) -> None: - assert dataframe.column_names == list(reference.keys()) + assert dataframe.column_names == list( + reference.keys(), + ), f"dataframe column names: '{dataframe.column_names}' != '{list(reference.keys())}'" for col_name in dataframe.column_names: col_dtype = dtype[col_name] if isinstance(dtype, dict) else dtype compare_column_with_reference( From f64ee52b71811a1f216426eb7a434cc33b575886 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 7 Jan 2024 01:11:49 +0100 Subject: [PATCH 09/12] refactor Signed-off-by: Anatoly Myachev --- tests/column/cast_test.py | 8 +++---- tests/column/col_sorted_indices_test.py | 28 +++++++--------------- tests/column/comparisons_test.py | 2 +- tests/column/cumulative_test.py | 2 +- tests/column/divmod_test.py | 8 +++---- tests/column/fill_nan_test.py | 6 +++-- tests/column/get_rows_by_mask_test.py | 2 +- tests/column/get_rows_test.py | 2 +- tests/column/invert_test.py | 6 +++-- tests/column/is_in_test.py | 4 ++-- tests/column/is_nan_test.py | 3 ++- tests/column/is_null_test.py | 5 ++-- tests/column/pow_test.py | 32 ++++++++++--------------- tests/column/shift_test.py | 15 ++++-------- tests/column/sort_test.py | 8 +++---- tests/column/statistics_test.py | 2 +- tests/column/temporal/filter_test.py | 2 +- tests/column/temporal/floor_test.py | 2 +- 18 files changed, 58 insertions(+), 79 deletions(-) diff --git a/tests/column/cast_test.py b/tests/column/cast_test.py index 72b32b5f..111a4300 100644 --- a/tests/column/cast_test.py +++ b/tests/column/cast_test.py @@ -6,8 +6,6 @@ def test_cast_integers(library: str) -> None: df = integer_dataframe_1(library) ns = df.__dataframe_namespace__() result = df.assign(df.col("a").cast(ns.Int32())) - compare_dataframe_with_reference( - result, - {"a": [1, 2, 3], "b": [4, 5, 6]}, - dtype={"a": ns.Int32, "b": ns.Int64}, - ) + expected = {"a": [1, 2, 3], "b": [4, 5, 6]} + expected_dtype = {"a": ns.Int32, "b": ns.Int64} + compare_dataframe_with_reference(result, expected, expected_dtype) diff --git a/tests/column/col_sorted_indices_test.py b/tests/column/col_sorted_indices_test.py index fb8c1e42..bd46afe5 100644 --- a/tests/column/col_sorted_indices_test.py +++ b/tests/column/col_sorted_indices_test.py @@ -10,11 +10,8 @@ def test_expression_sorted_indices_ascending(library: str) -> None: col = df.col sorted_indices = col("b").sorted_indices() result = df.take(sorted_indices) - compare_dataframe_with_reference( - result, - {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, - dtype=ns.Int64, - ) + expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]} + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_expression_sorted_indices_descending(library: str) -> None: @@ -23,11 +20,8 @@ def test_expression_sorted_indices_descending(library: str) -> None: col = df.col sorted_indices = col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) - compare_dataframe_with_reference( - result, - {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, - dtype=ns.Int64, - ) + expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]} + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_column_sorted_indices_ascending(library: str) -> None: @@ -35,11 +29,8 @@ def test_column_sorted_indices_ascending(library: str) -> None: ns = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices() result = df.take(sorted_indices) - compare_dataframe_with_reference( - result, - {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]}, - dtype=ns.Int64, - ) + expected = {"a": [2, 2, 1, 1, 1], "b": [1, 2, 3, 4, 4]} + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_column_sorted_indices_descending(library: str) -> None: @@ -47,8 +38,5 @@ def test_column_sorted_indices_descending(library: str) -> None: ns = df.__dataframe_namespace__() sorted_indices = df.col("b").sorted_indices(ascending=False) result = df.take(sorted_indices) - compare_dataframe_with_reference( - result, - {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]}, - dtype=ns.Int64, - ) + expected = {"a": [1, 1, 1, 2, 2], "b": [4, 4, 3, 2, 1]} + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) diff --git a/tests/column/comparisons_test.py b/tests/column/comparisons_test.py index 266b9839..9ba58710 100644 --- a/tests/column/comparisons_test.py +++ b/tests/column/comparisons_test.py @@ -104,4 +104,4 @@ def test_right_column_comparisons( ser = df.col("a") other = 2 result = df.assign(getattr(ser, comparison)(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_data, ns.Int64) + compare_column_with_reference(result.col("result"), expected_data, dtype=ns.Int64) diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py index 9393dffc..331106e8 100644 --- a/tests/column/cumulative_test.py +++ b/tests/column/cumulative_test.py @@ -34,4 +34,4 @@ def test_cumulative_functions_column( # Upstream bug result = result.cast({"result": ns.Int64()}) - compare_column_with_reference(result.col("result"), expected, ns.Int64) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Int64) diff --git a/tests/column/divmod_test.py b/tests/column/divmod_test.py index 26b2a222..dd16fec6 100644 --- a/tests/column/divmod_test.py +++ b/tests/column/divmod_test.py @@ -12,10 +12,10 @@ def test_expression_divmod(library: str) -> None: result_quotient, result_remainder = ser.__divmod__(other) # quotient result = df.assign(result_quotient.rename("result")) - compare_column_with_reference(result.col("result"), [0, 0, 0], ns.Int64) + compare_column_with_reference(result.col("result"), [0, 0, 0], dtype=ns.Int64) # remainder result = df.assign(result_remainder.rename("result")) - compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], dtype=ns.Int64) def test_expression_divmod_with_scalar(library: str) -> None: @@ -25,7 +25,7 @@ def test_expression_divmod_with_scalar(library: str) -> None: result_quotient, result_remainder = ser.__divmod__(2) # quotient result = df.assign(result_quotient.rename("result")) - compare_column_with_reference(result.col("result"), [0, 1, 1], ns.Int64) + compare_column_with_reference(result.col("result"), [0, 1, 1], dtype=ns.Int64) # remainder result = df.assign(result_remainder.rename("result")) - compare_column_with_reference(result.col("result"), [1, 0, 1], ns.Int64) + compare_column_with_reference(result.col("result"), [1, 0, 1], dtype=ns.Int64) diff --git a/tests/column/fill_nan_test.py b/tests/column/fill_nan_test.py index 26901301..137dd4e0 100644 --- a/tests/column/fill_nan_test.py +++ b/tests/column/fill_nan_test.py @@ -10,7 +10,8 @@ def test_column_fill_nan(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.fill_nan(-1.0).rename("result")) - compare_column_with_reference(result.col("result"), [1.0, 2.0, -1.0], ns.Float64) + expected = [1.0, 2.0, -1.0] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Float64) def test_column_fill_nan_with_null(library: str) -> None: @@ -19,4 +20,5 @@ def test_column_fill_nan_with_null(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.fill_nan(ns.null).is_null().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) + expected = [False, False, True] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) diff --git a/tests/column/get_rows_by_mask_test.py b/tests/column/get_rows_by_mask_test.py index 95617e4d..2a170a0e 100644 --- a/tests/column/get_rows_by_mask_test.py +++ b/tests/column/get_rows_by_mask_test.py @@ -23,4 +23,4 @@ def test_column_take_by_mask_noop(library: str) -> None: mask = ser > 0 ser = ser.filter(mask) result = df.assign(ser.rename("result")) - compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], dtype=ns.Int64) diff --git a/tests/column/get_rows_test.py b/tests/column/get_rows_test.py index 390ea33c..9254ef45 100644 --- a/tests/column/get_rows_test.py +++ b/tests/column/get_rows_test.py @@ -10,4 +10,4 @@ def test_expression_take(library: str) -> None: ser = df.col("a") indices = df.col("a") - 1 result = df.assign(ser.take(indices).rename("result")).select("result") - compare_column_with_reference(result.col("result"), [1, 2, 3], ns.Int64) + compare_column_with_reference(result.col("result"), [1, 2, 3], dtype=ns.Int64) diff --git a/tests/column/invert_test.py b/tests/column/invert_test.py index a4e6d4db..b6003eea 100644 --- a/tests/column/invert_test.py +++ b/tests/column/invert_test.py @@ -9,7 +9,8 @@ def test_expression_invert(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) + expected = [False, False, True] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) def test_column_invert(library: str) -> None: @@ -17,4 +18,5 @@ def test_column_invert(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign((~ser).rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) + expected = [False, False, True] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) diff --git a/tests/column/is_in_test.py b/tests/column/is_in_test.py index d921873e..f840706b 100644 --- a/tests/column/is_in_test.py +++ b/tests/column/is_in_test.py @@ -33,7 +33,7 @@ def test_is_in( ser = df.col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_values, ns.Bool) + compare_column_with_reference(result.col("result"), expected_values, dtype=ns.Bool) @pytest.mark.parametrize( @@ -56,4 +56,4 @@ def test_expr_is_in( ser = col("a") other = ser + 1 result = df.assign(ser.is_in(other).rename("result")) - compare_column_with_reference(result.col("result"), expected_values, ns.Bool) + compare_column_with_reference(result.col("result"), expected_values, dtype=ns.Bool) diff --git a/tests/column/is_nan_test.py b/tests/column/is_nan_test.py index d1a7b531..b0d04025 100644 --- a/tests/column/is_nan_test.py +++ b/tests/column/is_nan_test.py @@ -9,4 +9,5 @@ def test_column_is_nan(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_nan().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) + expected = [False, False, True] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) diff --git a/tests/column/is_null_test.py b/tests/column/is_null_test.py index 00f65db3..fdc8e34b 100644 --- a/tests/column/is_null_test.py +++ b/tests/column/is_null_test.py @@ -14,7 +14,7 @@ def test_column_is_null_1(library: str) -> None: expected = [False, False, True] else: expected = [False, False, False] - compare_column_with_reference(result.col("result"), expected, ns.Bool) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) def test_column_is_null_2(library: str) -> None: @@ -22,4 +22,5 @@ def test_column_is_null_2(library: str) -> None: ns = df.__dataframe_namespace__() ser = df.col("a") result = df.assign(ser.is_null().rename("result")) - compare_column_with_reference(result.col("result"), [False, False, True], ns.Bool) + expected = [False, False, True] + compare_column_with_reference(result.col("result"), expected, dtype=ns.Bool) diff --git a/tests/column/pow_test.py b/tests/column/pow_test.py index dec87874..f08c0b22 100644 --- a/tests/column/pow_test.py +++ b/tests/column/pow_test.py @@ -10,11 +10,9 @@ def test_float_powers_column(library: str) -> None: ser = df.col("a") other = df.col("b") * 1.0 result = df.assign(ser.__pow__(other).rename("result")) - compare_dataframe_with_reference( - result, - {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]}, - {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64}, - ) + expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]} + expected_dtype = {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64} + compare_dataframe_with_reference(result, expected, expected_dtype) def test_float_powers_scalar_column(library: str) -> None: @@ -23,11 +21,9 @@ def test_float_powers_scalar_column(library: str) -> None: ser = df.col("a") other = 1.0 result = df.assign(ser.__pow__(other).rename("result")) - compare_dataframe_with_reference( - result, - {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]}, - {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64}, - ) + expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]} + expected_dtype = {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64} + compare_dataframe_with_reference(result, expected, expected_dtype) def test_int_powers_column(library: str) -> None: @@ -38,11 +34,9 @@ def test_int_powers_column(library: str) -> None: result = df.assign(ser.__pow__(other).rename("result")) if library in ("polars", "polars-lazy"): result = result.cast({name: ns.Int64() for name in ("a", "b", "result")}) - compare_dataframe_with_reference( - result, - {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]}, - {name: ns.Int64 for name in ("a", "b", "result")}, - ) + expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 32, 729]} + expected_dtype = {name: ns.Int64 for name in ("a", "b", "result")} + compare_dataframe_with_reference(result, expected, expected_dtype) def test_int_powers_scalar_column(library: str) -> None: @@ -53,8 +47,6 @@ def test_int_powers_scalar_column(library: str) -> None: result = df.assign(ser.__pow__(other).rename("result")) if library in ("polars", "polars-lazy"): result = result.cast({name: ns.Int64() for name in ("a", "b", "result")}) - compare_dataframe_with_reference( - result, - {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]}, - {name: ns.Int64 for name in ("a", "b", "result")}, - ) + expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1, 2, 3]} + expected_dtype = {name: ns.Int64 for name in ("a", "b", "result")} + compare_dataframe_with_reference(result, expected, expected_dtype) diff --git a/tests/column/shift_test.py b/tests/column/shift_test.py index 2f1f004c..0c2e6cfb 100644 --- a/tests/column/shift_test.py +++ b/tests/column/shift_test.py @@ -13,11 +13,8 @@ def test_shift_with_fill_value(library: str) -> None: result = df.assign(df.col("a").shift(1).fill_null(999)) if library == "pandas-numpy": result = result.cast({name: ns.Int64() for name in ("a", "b")}) - compare_dataframe_with_reference( - result, - {"a": [999, 1, 2], "b": [4, 5, 6]}, - ns.Int64, - ) + expected = {"a": [999, 1, 2], "b": [4, 5, 6]} + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_shift_without_fill_value(library: str) -> None: @@ -43,8 +40,6 @@ def test_shift_with_fill_value_complicated(library: str) -> None: result = df.assign(df.col("a").shift(1).fill_null(df.col("a").mean())) if library == "pandas-nullable": result = result.cast({"a": ns.Float64()}) - compare_dataframe_with_reference( - result, - {"a": [2.0, 1, 2], "b": [4, 5, 6]}, - {"a": ns.Float64, "b": ns.Int64}, - ) + expected = {"a": [2.0, 1, 2], "b": [4, 5, 6]} + expected_dtype = {"a": ns.Float64, "b": ns.Int64} + compare_dataframe_with_reference(result, expected, expected_dtype) diff --git a/tests/column/sort_test.py b/tests/column/sort_test.py index b7524c42..7cafc5e3 100644 --- a/tests/column/sort_test.py +++ b/tests/column/sort_test.py @@ -14,7 +14,7 @@ def test_expression_sort_ascending(library: str) -> None: "b": [4, 4, 3, 1, 2], "c": [1, 2, 3, 4, 4], } - compare_dataframe_with_reference(result, expected, ns.Int64) + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_expression_sort_descending(library: str) -> None: @@ -27,7 +27,7 @@ def test_expression_sort_descending(library: str) -> None: "b": [4, 4, 3, 1, 2], "c": [4, 4, 3, 2, 1], } - compare_dataframe_with_reference(result, expected, ns.Int64) + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_column_sort_ascending(library: str) -> None: @@ -40,7 +40,7 @@ def test_column_sort_ascending(library: str) -> None: "b": [4, 4, 3, 1, 2], "c": [1, 2, 3, 4, 4], } - compare_dataframe_with_reference(result, expected, ns.Int64) + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) def test_column_sort_descending(library: str) -> None: @@ -53,4 +53,4 @@ def test_column_sort_descending(library: str) -> None: "b": [4, 4, 3, 1, 2], "c": [4, 4, 3, 2, 1], } - compare_dataframe_with_reference(result, expected, ns.Int64) + compare_dataframe_with_reference(result, expected, dtype=ns.Int64) diff --git a/tests/column/statistics_test.py b/tests/column/statistics_test.py index b444e604..b7e84868 100644 --- a/tests/column/statistics_test.py +++ b/tests/column/statistics_test.py @@ -8,4 +8,4 @@ def test_mean(library: str) -> None: df = integer_dataframe_1(library) ns = df.__dataframe_namespace__() result = df.assign((df.col("a") - df.col("a").mean()).rename("result")) - compare_column_with_reference(result.col("result"), [-1, 0, 1.0], ns.Float64) + compare_column_with_reference(result.col("result"), [-1, 0, 1.0], dtype=ns.Float64) diff --git a/tests/column/temporal/filter_test.py b/tests/column/temporal/filter_test.py index ea872bfe..27c2d901 100644 --- a/tests/column/temporal/filter_test.py +++ b/tests/column/temporal/filter_test.py @@ -6,4 +6,4 @@ def test_filter_w_date(library: str) -> None: df = temporal_dataframe_1(library).select("a", "index") ns = df.__dataframe_namespace__() result = df.filter(df.col("a") > ns.date(2020, 1, 2)).select("index") - compare_dataframe_with_reference(result, {"index": [1, 2]}, ns.Int64) + compare_dataframe_with_reference(result, {"index": [1, 2]}, dtype=ns.Int64) diff --git a/tests/column/temporal/floor_test.py b/tests/column/temporal/floor_test.py index 57287033..b9bf5d85 100644 --- a/tests/column/temporal/floor_test.py +++ b/tests/column/temporal/floor_test.py @@ -20,4 +20,4 @@ def test_floor(library: str, freq: str, expected: list[datetime]) -> None: col = df.col result = df.assign(col("a").floor(freq).rename("result")).select("result") # type: ignore[attr-defined] # TODO check the resolution - compare_column_with_reference(result.col("result"), expected, ns.Datetime) + compare_column_with_reference(result.col("result"), expected, dtype=ns.Datetime) From 90467c307827d65970c7d82cff2f4c3464180c69 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 7 Jan 2024 01:39:57 +0100 Subject: [PATCH 10/12] workarounds for mypy errors Signed-off-by: Anatoly Myachev --- tests/column/pow_test.py | 4 ++-- tests/column/shift_test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/column/pow_test.py b/tests/column/pow_test.py index f08c0b22..253a7218 100644 --- a/tests/column/pow_test.py +++ b/tests/column/pow_test.py @@ -12,7 +12,7 @@ def test_float_powers_column(library: str) -> None: result = df.assign(ser.__pow__(other).rename("result")) expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 32.0, 729.0]} expected_dtype = {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64} - compare_dataframe_with_reference(result, expected, expected_dtype) + compare_dataframe_with_reference(result, expected, expected_dtype) # type: ignore[arg-type] def test_float_powers_scalar_column(library: str) -> None: @@ -23,7 +23,7 @@ def test_float_powers_scalar_column(library: str) -> None: result = df.assign(ser.__pow__(other).rename("result")) expected = {"a": [1, 2, 3], "b": [4, 5, 6], "result": [1.0, 2.0, 3.0]} expected_dtype = {"a": ns.Int64, "b": ns.Int64, "result": ns.Float64} - compare_dataframe_with_reference(result, expected, expected_dtype) + compare_dataframe_with_reference(result, expected, expected_dtype) # type: ignore[arg-type] def test_int_powers_column(library: str) -> None: diff --git a/tests/column/shift_test.py b/tests/column/shift_test.py index 0c2e6cfb..86c084ce 100644 --- a/tests/column/shift_test.py +++ b/tests/column/shift_test.py @@ -42,4 +42,4 @@ def test_shift_with_fill_value_complicated(library: str) -> None: result = result.cast({"a": ns.Float64()}) expected = {"a": [2.0, 1, 2], "b": [4, 5, 6]} expected_dtype = {"a": ns.Float64, "b": ns.Int64} - compare_dataframe_with_reference(result, expected, expected_dtype) + compare_dataframe_with_reference(result, expected, expected_dtype) # type: ignore[arg-type] From 01fa14d115d233a825713323a433ecbc3cb3d3fd Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Jan 2024 14:48:48 +0100 Subject: [PATCH 11/12] address review comments Signed-off-by: Anatoly Myachev --- dataframe_api_compat/pandas_standard/__init__.py | 5 +++-- dataframe_api_compat/pandas_standard/column_object.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dataframe_api_compat/pandas_standard/__init__.py b/dataframe_api_compat/pandas_standard/__init__.py index 8da7312e..533eb81f 100644 --- a/dataframe_api_compat/pandas_standard/__init__.py +++ b/dataframe_api_compat/pandas_standard/__init__.py @@ -111,8 +111,9 @@ def map_pandas_dtype_to_standard_dtype(dtype: Any) -> DType: return Namespace.String() if dtype == "string": return Namespace.String() - if not hasattr(dtype, "startswith"): - dtype = str(dtype) + if hasattr(dtype, "name"): + # For types like `numpy.dtypes.DateTime64DType` + dtype = dtype.name if dtype.startswith("datetime64["): match = re.search(r"datetime64\[(\w{1,2})", dtype) assert match is not None diff --git a/dataframe_api_compat/pandas_standard/column_object.py b/dataframe_api_compat/pandas_standard/column_object.py index 1bbe408e..1df02a25 100644 --- a/dataframe_api_compat/pandas_standard/column_object.py +++ b/dataframe_api_compat/pandas_standard/column_object.py @@ -36,6 +36,7 @@ "UInt8": "uint8", "boolean": "bool", "Float64": "float64", + "Float32": "float32", } From a4c4aee13dc3e2937a20d38b889209960742e36f Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 23 Jan 2024 12:56:20 +0100 Subject: [PATCH 12/12] use 'parse' and 'Version' to compare package versions Signed-off-by: Anatoly Myachev --- tests/column/cumulative_test.py | 5 +++-- tests/column/name_test.py | 4 +++- tests/dataframe/schema_test.py | 12 ++++-------- tests/groupby/groupby_any_all_test.py | 10 ++++------ tests/groupby/numeric_test.py | 10 ++++------ tests/integration/scale_column_test.py | 8 +++++--- tests/integration/upstream_test.py | 20 +++++--------------- 7 files changed, 28 insertions(+), 41 deletions(-) diff --git a/tests/column/cumulative_test.py b/tests/column/cumulative_test.py index 331106e8..2c229b82 100644 --- a/tests/column/cumulative_test.py +++ b/tests/column/cumulative_test.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from packaging.version import Version +from packaging.version import parse from tests.utils import compare_column_with_reference from tests.utils import integer_dataframe_1 @@ -28,8 +30,7 @@ def test_cumulative_functions_column( result = df.assign(getattr(ser, func)().rename("result")) if ( - tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0) - and library == "pandas-nullable" + parse(pd.__version__) < Version("2.0.0") and library == "pandas-nullable" ): # pragma: no cover # Upstream bug result = result.cast({"result": ns.Int64()}) diff --git a/tests/column/name_test.py b/tests/column/name_test.py index ad43a601..efd5934d 100644 --- a/tests/column/name_test.py +++ b/tests/column/name_test.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from packaging.version import Version +from packaging.version import parse from tests.utils import convert_to_standard_compliant_dataframe from tests.utils import integer_dataframe_1 @@ -20,7 +22,7 @@ def test_pandas_name_if_0_named_column() -> None: @pytest.mark.skipif( - tuple(int(v) for v in pd.__version__.split(".")) < (2, 1, 0), + parse(pd.__version__) < Version("2.1.0"), reason="before consoritum standard", ) def test_invalid_name_pandas() -> None: diff --git a/tests/dataframe/schema_test.py b/tests/dataframe/schema_test.py index 86673377..c8d0538e 100644 --- a/tests/dataframe/schema_test.py +++ b/tests/dataframe/schema_test.py @@ -3,6 +3,7 @@ import pandas as pd import pytest from packaging.version import Version +from packaging.version import parse from tests.utils import PANDAS_VERSION from tests.utils import mixed_dataframe_1 @@ -50,8 +51,7 @@ def test_schema(library: str) -> None: assert isinstance(result["m"], namespace.Datetime) assert isinstance(result["n"], namespace.Datetime) if not ( - library.startswith("pandas") - and tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0) + library.startswith("pandas") and parse(pd.__version__) < Version("2.0.0") ): # pragma: no cover (coverage bug?) # pandas non-nanosecond support only came in 2.0 assert result["n"].time_unit == "ms" @@ -60,18 +60,14 @@ def test_schema(library: str) -> None: assert result["n"].time_zone is None assert isinstance(result["o"], namespace.Datetime) if not ( - library.startswith("pandas") - and tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0) + library.startswith("pandas") and parse(pd.__version__) < Version("2.0.0") ): # pragma: no cover (coverage bug?) # pandas non-nanosecond support only came in 2.0 assert result["o"].time_unit == "us" else: # pragma: no cover pass assert result["o"].time_zone is None - if not ( - library.startswith("pandas") - and tuple(int(v) for v in pd.__version__.split(".")) < (2, 0, 0) - ): + if not (library.startswith("pandas") and parse(pd.__version__) < Version("2.0.0")): # pandas non-nanosecond support only came in 2.0 - before that, these would be 'float' assert isinstance(result["p"], namespace.Duration) assert result["p"].time_unit == "ms" diff --git a/tests/groupby/groupby_any_all_test.py b/tests/groupby/groupby_any_all_test.py index 7cc427c5..4c739d56 100644 --- a/tests/groupby/groupby_any_all_test.py +++ b/tests/groupby/groupby_any_all_test.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from packaging.version import Version +from packaging.version import parse from polars.exceptions import SchemaError from tests.utils import bool_dataframe_2 @@ -28,12 +30,8 @@ def test_groupby_boolean( # need to sort result = result.sort("key") result_pd = interchange_to_pandas(result) - if library == "pandas-nullable" and tuple( - int(v) for v in pd.__version__.split(".") - ) < ( - 2, - 0, - 0, + if library == "pandas-nullable" and parse(pd.__version__) < Version( + "2.0.0", ): # pragma: no cover # upstream bug result_pd = result_pd.astype({"key": "int64"}) diff --git a/tests/groupby/numeric_test.py b/tests/groupby/numeric_test.py index ee33a4fe..02c63f0e 100644 --- a/tests/groupby/numeric_test.py +++ b/tests/groupby/numeric_test.py @@ -2,6 +2,8 @@ import pandas as pd import pytest +from packaging.version import Version +from packaging.version import parse from tests.utils import integer_dataframe_4 from tests.utils import interchange_to_pandas @@ -35,12 +37,8 @@ def test_group_by_numeric( result = result.sort("key") result_pd = interchange_to_pandas(result) expected = pd.DataFrame({"key": [1, 2], "b": expected_b, "c": expected_c}) - if library == "pandas-nullable" and tuple( - int(v) for v in pd.__version__.split(".") - ) < ( - 2, - 0, - 0, + if library == "pandas-nullable" and parse(pd.__version__) < Version( + "2.0.0", ): # pragma: no cover # upstream bug result_pd = result_pd.astype({"key": "int64"}) diff --git a/tests/integration/scale_column_test.py b/tests/integration/scale_column_test.py index 325b97f5..4f07c8fd 100644 --- a/tests/integration/scale_column_test.py +++ b/tests/integration/scale_column_test.py @@ -3,11 +3,13 @@ import pandas as pd import polars as pl import pytest +from packaging.version import Version +from packaging.version import parse from polars.testing import assert_series_equal @pytest.mark.skipif( - tuple(int(v) for v in pd.__version__.split(".")) < (2, 1, 0), + parse(pd.__version__) < Version("2.1.0"), reason="pandas doesn't support 3.8", ) def test_scale_column_pandas() -> None: @@ -19,7 +21,7 @@ def test_scale_column_pandas() -> None: @pytest.mark.skipif( - tuple(int(v) for v in pl.__version__.split(".")) < (0, 19, 0), + parse(pl.__version__) < Version("0.19.0"), reason="before consortium standard in polars", ) def test_scale_column_polars() -> None: @@ -31,7 +33,7 @@ def test_scale_column_polars() -> None: @pytest.mark.skipif( - tuple(int(v) for v in pl.__version__.split(".")) < (0, 19, 0), + parse(pl.__version__) < Version("0.19.0"), reason="before consortium standard in polars", ) def test_scale_column_polars_from_persisted_df() -> None: diff --git a/tests/integration/upstream_test.py b/tests/integration/upstream_test.py index d244e7dd..abbfc982 100644 --- a/tests/integration/upstream_test.py +++ b/tests/integration/upstream_test.py @@ -1,17 +1,15 @@ import sys import pytest +from packaging.version import Version +from packaging.version import parse class TestPolars: def test_dataframe(self) -> None: import polars as pl - if tuple(int(v) for v in pl.__version__.split(".")) < ( - 0, - 19, - 0, - ): # pragma: no cover + if parse(pl.__version__) < Version("0.19.0"): # pragma: no cover # before consortium standard in polars return @@ -24,11 +22,7 @@ def test_dataframe(self) -> None: def test_lazyframe(self) -> None: import polars as pl - if tuple(int(v) for v in pl.__version__.split(".")) < ( - 0, - 19, - 0, - ): # pragma: no cover + if parse(pl.__version__) < Version("0.19.0"): # pragma: no cover # before consortium standard in polars return @@ -57,11 +51,7 @@ def test_pandas(self) -> None: """ import pandas as pd - if tuple(int(v) for v in pd.__version__.split(".")) < ( - 2, - 1, - 0, - ): # pragma: no cover + if parse(pd.__version__) < Version("2.1.0"): # pragma: no cover # before consortium standard in pandas return