From 155fd490b498bd3c9d61e290e629036e1f6dd098 Mon Sep 17 00:00:00 2001 From: Daniel Elsner Date: Wed, 25 Jun 2025 18:49:43 +0200 Subject: [PATCH 1/5] fix: Add drop_non_unique parameter to filter_relationship_one_to_one --- dataframely/functional.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/dataframely/functional.py b/dataframely/functional.py index 475bb67..588196b 100644 --- a/dataframely/functional.py +++ b/dataframely/functional.py @@ -23,22 +23,43 @@ # --------------------------------- RELATIONSHIP 1:1 --------------------------------- # +LEN_COLUMN = "__len__" + def filter_relationship_one_to_one( lhs: LazyFrame[S] | pl.LazyFrame, rhs: LazyFrame[T] | pl.LazyFrame, /, on: str | list[str], + *, + drop_non_unique: bool = False, ) -> pl.LazyFrame: """Express a 1:1 mapping between data frames for a collection filter. Args: lhs: The first data frame in the 1:1 mapping. rhs: The second data frame in the 1:1 mapping. - on: The columns to join the data frames on. If not provided, the join columns - are inferred from the mutual primary keys of the provided data frames. + on: The columns to join the data frames on. + drop_non_unique: If `True`, drop non-unique rows from both data frames. + This is useful when the join columns do not already uniquely identify rows. """ - return lhs.join(rhs, on=on) + if drop_non_unique: + return ( + lhs.group_by(on) + .len(LEN_COLUMN) + .filter(pl.col(LEN_COLUMN) == 1) + .drop(LEN_COLUMN) + .join( + rhs.group_by(on) + .len(LEN_COLUMN) + .filter(pl.col(LEN_COLUMN) == 1) + .drop(LEN_COLUMN), + on=on, + how="inner", + ) + ) + else: + return lhs.join(rhs, on=on) # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- # @@ -55,8 +76,7 @@ def filter_relationship_one_to_at_least_one( Args: lhs: The data frame with exactly one occurrence for a set of key columns. rhs: The data frame with at least one occurrence for a set of key columns. - on: The columns to join the data frames on. If not provided, the join columns - are inferred from the joint primary keys of the provided data frames. + on: The columns to join the data frames on. """ return lhs.join(rhs.unique(on), on=on) From aeeb9c5cf461346eebae5612e55fba6f015a8bb9 Mon Sep 17 00:00:00 2001 From: Daniel Elsner Date: Wed, 25 Jun 2025 18:58:49 +0200 Subject: [PATCH 2/5] Adjust test --- tests/functional/test_relationships.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py index b650e11..e2f3869 100644 --- a/tests/functional/test_relationships.py +++ b/tests/functional/test_relationships.py @@ -29,7 +29,7 @@ class EmployeeSchema(dy.Schema): @pytest.fixture() def departments() -> dy.LazyFrame[DepartmentSchema]: - return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2]})) + return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2, 3]})) @pytest.fixture() @@ -44,9 +44,9 @@ def employees() -> dy.LazyFrame[EmployeeSchema]: return EmployeeSchema.cast( pl.LazyFrame( { - "department_id": [2, 2, 2], - "employee_number": [101, 102, 103], - "name": ["Huey", "Dewey", "Louie"], + "department_id": [2, 2, 2, 3], + "employee_number": [101, 102, 103, 104], + "name": ["Huey", "Dewey", "Louie", "Daisy"], } ) ) @@ -67,6 +67,19 @@ def test_one_to_one( assert actual.select("department_id").collect().to_series().to_list() == [1] +def test_one_to_one_drop_non_unique( + departments: dy.LazyFrame[DepartmentSchema], + employees: dy.LazyFrame[EmployeeSchema], +) -> None: + actual = dy.filter_relationship_one_to_one( + departments, + employees, + on="department_id", + drop_non_unique=True, + ) + assert actual.select("department_id").collect().to_series().to_list() == [3] + + def test_one_to_at_least_one( departments: dy.LazyFrame[DepartmentSchema], employees: dy.LazyFrame[EmployeeSchema], @@ -74,4 +87,4 @@ def test_one_to_at_least_one( actual = dy.filter_relationship_one_to_at_least_one( departments, employees, on="department_id" ) - assert actual.select("department_id").collect().to_series().to_list() == [2] + assert actual.select("department_id").collect().to_series().to_list() == [2, 3] From 5c6b97bde5e04d5c64392996a16d787995d29543 Mon Sep 17 00:00:00 2001 From: Daniel Elsner Date: Wed, 2 Jul 2025 08:42:01 +0200 Subject: [PATCH 3/5] Rename arg to keep_only_unique --- dataframely/functional.py | 6 +++--- tests/functional/test_relationships.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dataframely/functional.py b/dataframely/functional.py index 588196b..89931ee 100644 --- a/dataframely/functional.py +++ b/dataframely/functional.py @@ -32,7 +32,7 @@ def filter_relationship_one_to_one( /, on: str | list[str], *, - drop_non_unique: bool = False, + keep_only_unique: bool = False, ) -> pl.LazyFrame: """Express a 1:1 mapping between data frames for a collection filter. @@ -40,10 +40,10 @@ def filter_relationship_one_to_one( lhs: The first data frame in the 1:1 mapping. rhs: The second data frame in the 1:1 mapping. on: The columns to join the data frames on. - drop_non_unique: If `True`, drop non-unique rows from both data frames. + keep_only_unique: If `True`, drop non-unique rows from both data frames. This is useful when the join columns do not already uniquely identify rows. """ - if drop_non_unique: + if keep_only_unique: return ( lhs.group_by(on) .len(LEN_COLUMN) diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py index e2f3869..b72e250 100644 --- a/tests/functional/test_relationships.py +++ b/tests/functional/test_relationships.py @@ -67,7 +67,7 @@ def test_one_to_one( assert actual.select("department_id").collect().to_series().to_list() == [1] -def test_one_to_one_drop_non_unique( +def test_one_to_one_keep_only_unique( departments: dy.LazyFrame[DepartmentSchema], employees: dy.LazyFrame[EmployeeSchema], ) -> None: @@ -75,7 +75,7 @@ def test_one_to_one_drop_non_unique( departments, employees, on="department_id", - drop_non_unique=True, + keep_only_unique=True, ) assert actual.select("department_id").collect().to_series().to_list() == [3] From 5b62379fb1464bf28c1bb58d363788d283235cf4 Mon Sep 17 00:00:00 2001 From: Daniel Elsner Date: Fri, 17 Oct 2025 11:38:24 +0200 Subject: [PATCH 4/5] Refactor to require_* and consistently use filter_unique --- dataframely/__init__.py | 8 ++-- dataframely/functional.py | 61 ++++++++++++++++++-------- tests/benches/test_collection.py | 8 ++-- tests/collection/test_matches.py | 6 +-- tests/functional/test_relationships.py | 19 +++++--- 5 files changed, 65 insertions(+), 37 deletions(-) diff --git a/dataframely/__init__.py b/dataframely/__init__.py index 7bbd759..53c7dbf 100644 --- a/dataframely/__init__.py +++ b/dataframely/__init__.py @@ -54,8 +54,8 @@ from .failure import FailureInfo from .functional import ( concat_collection_members, - filter_relationship_one_to_at_least_one, - filter_relationship_one_to_one, + require_relationship_one_to_at_least_one, + require_relationship_one_to_one, ) from .schema import Schema, deserialize_schema, read_parquet_metadata_schema @@ -71,8 +71,8 @@ "Config", "FailureInfo", "concat_collection_members", - "filter_relationship_one_to_at_least_one", - "filter_relationship_one_to_one", + "require_relationship_one_to_at_least_one", + "require_relationship_one_to_one", "Schema", "deserialize_schema", "read_parquet_metadata_schema", diff --git a/dataframely/functional.py b/dataframely/functional.py index 89931ee..f55fde6 100644 --- a/dataframely/functional.py +++ b/dataframely/functional.py @@ -26,13 +26,13 @@ LEN_COLUMN = "__len__" -def filter_relationship_one_to_one( +def require_relationship_one_to_one( lhs: LazyFrame[S] | pl.LazyFrame, rhs: LazyFrame[T] | pl.LazyFrame, /, on: str | list[str], *, - keep_only_unique: bool = False, + filter_unique: bool = True, ) -> pl.LazyFrame: """Express a 1:1 mapping between data frames for a collection filter. @@ -40,44 +40,67 @@ def filter_relationship_one_to_one( lhs: The first data frame in the 1:1 mapping. rhs: The second data frame in the 1:1 mapping. on: The columns to join the data frames on. - keep_only_unique: If `True`, drop non-unique rows from both data frames. - This is useful when the join columns do not already uniquely identify rows. + filter_unique: If set to `True`, drops rows that are not uniquely identified by the + join columns specified with `on`. If set to `False`, skips uniqueness checks + and avoids performance penalties. Use with caution, as this may lead to unexpected + results if rows in one or both of the data frames are not unique in the join columns. + + Returns: + A data frame representing the inner join of the two inputs on the specified + columns, filtered to ensure a 1:1 relationship. """ - if keep_only_unique: - return ( + if filter_unique: + lhs = ( lhs.group_by(on) .len(LEN_COLUMN) .filter(pl.col(LEN_COLUMN) == 1) .drop(LEN_COLUMN) - .join( - rhs.group_by(on) - .len(LEN_COLUMN) - .filter(pl.col(LEN_COLUMN) == 1) - .drop(LEN_COLUMN), - on=on, - how="inner", - ) ) - else: - return lhs.join(rhs, on=on) + rhs = ( + rhs.group_by(on) + .len(LEN_COLUMN) + .filter(pl.col(LEN_COLUMN) == 1) + .drop(LEN_COLUMN) + ) + + return lhs.join(rhs, on=on) # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- # -def filter_relationship_one_to_at_least_one( +def require_relationship_one_to_at_least_one( lhs: LazyFrame[S] | pl.LazyFrame, rhs: LazyFrame[T] | pl.LazyFrame, /, on: str | list[str], + *, + filter_unique: bool = True, ) -> pl.LazyFrame: """Express a 1:{1,N} mapping between data frames for a collection filter. Args: - lhs: The data frame with exactly one occurrence for a set of key columns. - rhs: The data frame with at least one occurrence for a set of key columns. + lhs: The data frame with exactly one occurrence for the set of join columns. + rhs: The data frame with at least one occurrence for the set of join columns. on: The columns to join the data frames on. + filter_unique: If set to `True`, drops rows in `lhs` that are not uniquely + identified by the join columns specified with `on`. If set to `False`, + skips uniqueness checks and avoids performance penalties. Use with + caution, as this may lead to unexpected results if rows in `lhs` are + not unique in the join columns. + + Returns: + A data frame representing the inner join of the two inputs on the specified + columns, filtered to ensure a 1:{1,N} relationship. """ + if filter_unique: + lhs = ( + lhs.group_by(on) + .len(LEN_COLUMN) + .filter(pl.col(LEN_COLUMN) == 1) + .drop(LEN_COLUMN) + ) + return lhs.join(rhs.unique(on), on=on) diff --git a/tests/benches/test_collection.py b/tests/benches/test_collection.py index ba0fbfa..f2b115a 100644 --- a/tests/benches/test_collection.py +++ b/tests/benches/test_collection.py @@ -49,7 +49,7 @@ class SingleFilterCollection(dy.Collection): @dy.filter() def one_to_one(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_one(self.first, self.second, on="idx") + return dy.require_relationship_one_to_one(self.first, self.second, on="idx") @pytest.mark.benchmark(group="collection-filter-single") @@ -79,17 +79,17 @@ class MultiFilterCollection(dy.Collection): @dy.filter() def one_to_one(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_one(self.first, self.second, on="idx") + return dy.require_relationship_one_to_one(self.first, self.second, on="idx") @dy.filter() def one_to_at_least_one(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_at_least_one( + return dy.require_relationship_one_to_at_least_one( self.first, self.second, on="idx" ) @dy.filter() def one_to_at_least_one_reverse(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_at_least_one( + return dy.require_relationship_one_to_at_least_one( self.second, self.first, on="idx" ) diff --git a/tests/collection/test_matches.py b/tests/collection/test_matches.py index 67a6dbe..f36fb45 100644 --- a/tests/collection/test_matches.py +++ b/tests/collection/test_matches.py @@ -74,7 +74,7 @@ class MyCollection1(dy.Collection): class MyCollection2(MyCollection1): @dy.filter() def test_filter(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"]) + return dy.require_relationship_one_to_one(self.x, self.x, ["foo"]) # Should not match assert not MyCollection1.matches(MyCollection2) @@ -93,12 +93,12 @@ class BaseCollection(dy.Collection): class MyCollection1(BaseCollection): @dy.filter() def test_filter(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"]) + return dy.require_relationship_one_to_one(self.x, self.x, ["foo"]) class MyCollection2(BaseCollection): @dy.filter() def test_filter(self) -> pl.LazyFrame: - return dy.filter_relationship_one_to_at_least_one(self.x, self.x, ["foo"]) + return dy.require_relationship_one_to_at_least_one(self.x, self.x, ["foo"]) assert not MyCollection1.matches(MyCollection2) diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py index b72e250..f83dbe7 100644 --- a/tests/functional/test_relationships.py +++ b/tests/functional/test_relationships.py @@ -57,25 +57,30 @@ def employees() -> dy.LazyFrame[EmployeeSchema]: # ------------------------------------------------------------------------------------ # +@pytest.mark.parametrize("filter_unique", [True, False]) def test_one_to_one( departments: dy.LazyFrame[DepartmentSchema], managers: dy.LazyFrame[ManagerSchema], + filter_unique: bool, ) -> None: - actual = dy.filter_relationship_one_to_one( - departments, managers, on="department_id" + actual = dy.require_relationship_one_to_one( + departments, + managers, + on="department_id", + filter_unique=filter_unique, ) assert actual.select("department_id").collect().to_series().to_list() == [1] -def test_one_to_one_keep_only_unique( +def test_one_to_one_filter_unique( departments: dy.LazyFrame[DepartmentSchema], employees: dy.LazyFrame[EmployeeSchema], ) -> None: - actual = dy.filter_relationship_one_to_one( + actual = dy.require_relationship_one_to_one( departments, employees, on="department_id", - keep_only_unique=True, + filter_unique=True, ) assert actual.select("department_id").collect().to_series().to_list() == [3] @@ -84,7 +89,7 @@ def test_one_to_at_least_one( departments: dy.LazyFrame[DepartmentSchema], employees: dy.LazyFrame[EmployeeSchema], ) -> None: - actual = dy.filter_relationship_one_to_at_least_one( - departments, employees, on="department_id" + actual = dy.require_relationship_one_to_at_least_one( + departments, employees, on="department_id", filter_unique=False ) assert actual.select("department_id").collect().to_series().to_list() == [2, 3] From b064387f83ab2c9b6f1e3fa612eb097795b5a023 Mon Sep 17 00:00:00 2001 From: Daniel Elsner Date: Fri, 17 Oct 2025 11:48:54 +0200 Subject: [PATCH 5/5] Fix --- dataframely/functional.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/dataframely/functional.py b/dataframely/functional.py index f55fde6..da7991a 100644 --- a/dataframely/functional.py +++ b/dataframely/functional.py @@ -50,17 +50,18 @@ def require_relationship_one_to_one( columns, filtered to ensure a 1:1 relationship. """ if filter_unique: - lhs = ( + return ( lhs.group_by(on) .len(LEN_COLUMN) .filter(pl.col(LEN_COLUMN) == 1) .drop(LEN_COLUMN) - ) - rhs = ( - rhs.group_by(on) - .len(LEN_COLUMN) - .filter(pl.col(LEN_COLUMN) == 1) - .drop(LEN_COLUMN) + .join( + rhs.group_by(on) + .len(LEN_COLUMN) + .filter(pl.col(LEN_COLUMN) == 1) + .drop(LEN_COLUMN), + on=on, + ) ) return lhs.join(rhs, on=on) @@ -94,11 +95,12 @@ def require_relationship_one_to_at_least_one( columns, filtered to ensure a 1:{1,N} relationship. """ if filter_unique: - lhs = ( + return ( lhs.group_by(on) .len(LEN_COLUMN) .filter(pl.col(LEN_COLUMN) == 1) .drop(LEN_COLUMN) + .join(rhs.unique(on), on=on) ) return lhs.join(rhs.unique(on), on=on)