From 155fd490b498bd3c9d61e290e629036e1f6dd098 Mon Sep 17 00:00:00 2001
From: Daniel Elsner <daniel.elsner@quantco.com>
Date: Wed, 25 Jun 2025 18:49:43 +0200
Subject: [PATCH 1/5] fix: Add drop_non_unique parameter to
 filter_relationship_one_to_one

---
 dataframely/functional.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/dataframely/functional.py b/dataframely/functional.py
index 475bb67..588196b 100644
--- a/dataframely/functional.py
+++ b/dataframely/functional.py
@@ -23,22 +23,43 @@
 
 # --------------------------------- RELATIONSHIP 1:1 --------------------------------- #
 
+LEN_COLUMN = "__len__"
+
 
 def filter_relationship_one_to_one(
     lhs: LazyFrame[S] | pl.LazyFrame,
     rhs: LazyFrame[T] | pl.LazyFrame,
     /,
     on: str | list[str],
+    *,
+    drop_non_unique: bool = False,
 ) -> pl.LazyFrame:
     """Express a 1:1 mapping between data frames for a collection filter.
 
     Args:
         lhs: The first data frame in the 1:1 mapping.
         rhs: The second data frame in the 1:1 mapping.
-        on: The columns to join the data frames on. If not provided, the join columns
-            are inferred from the mutual primary keys of the provided data frames.
+        on: The columns to join the data frames on.
+        drop_non_unique: If `True`, drop non-unique rows from both data frames.
+            This is useful when the join columns do not already uniquely identify rows.
     """
-    return lhs.join(rhs, on=on)
+    if drop_non_unique:
+        return (
+            lhs.group_by(on)
+            .len(LEN_COLUMN)
+            .filter(pl.col(LEN_COLUMN) == 1)
+            .drop(LEN_COLUMN)
+            .join(
+                rhs.group_by(on)
+                .len(LEN_COLUMN)
+                .filter(pl.col(LEN_COLUMN) == 1)
+                .drop(LEN_COLUMN),
+                on=on,
+                how="inner",
+            )
+        )
+    else:
+        return lhs.join(rhs, on=on)
 
 
 # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- #
@@ -55,8 +76,7 @@ def filter_relationship_one_to_at_least_one(
     Args:
         lhs: The data frame with exactly one occurrence for a set of key columns.
         rhs: The data frame with at least one occurrence for a set of key columns.
-        on: The columns to join the data frames on. If not provided, the join columns
-            are inferred from the joint primary keys of the provided data frames.
+        on: The columns to join the data frames on.
     """
     return lhs.join(rhs.unique(on), on=on)
 

From aeeb9c5cf461346eebae5612e55fba6f015a8bb9 Mon Sep 17 00:00:00 2001
From: Daniel Elsner <daniel.elsner@quantco.com>
Date: Wed, 25 Jun 2025 18:58:49 +0200
Subject: [PATCH 2/5] Extend fixtures and add test for drop_non_unique

---
 tests/functional/test_relationships.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py
index b650e11..e2f3869 100644
--- a/tests/functional/test_relationships.py
+++ b/tests/functional/test_relationships.py
@@ -29,7 +29,7 @@ class EmployeeSchema(dy.Schema):
 
 @pytest.fixture()
 def departments() -> dy.LazyFrame[DepartmentSchema]:
-    return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2]}))
+    return DepartmentSchema.cast(pl.LazyFrame({"department_id": [1, 2, 3]}))
 
 
 @pytest.fixture()
@@ -44,9 +44,9 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
     return EmployeeSchema.cast(
         pl.LazyFrame(
             {
-                "department_id": [2, 2, 2],
-                "employee_number": [101, 102, 103],
-                "name": ["Huey", "Dewey", "Louie"],
+                "department_id": [2, 2, 2, 3],
+                "employee_number": [101, 102, 103, 104],
+                "name": ["Huey", "Dewey", "Louie", "Daisy"],
             }
         )
     )
@@ -67,6 +67,19 @@ def test_one_to_one(
     assert actual.select("department_id").collect().to_series().to_list() == [1]
 
 
+def test_one_to_one_drop_non_unique(
+    departments: dy.LazyFrame[DepartmentSchema],
+    employees: dy.LazyFrame[EmployeeSchema],
+) -> None:
+    actual = dy.filter_relationship_one_to_one(
+        departments,
+        employees,
+        on="department_id",
+        drop_non_unique=True,
+    )
+    assert actual.select("department_id").collect().to_series().to_list() == [3]
+
+
 def test_one_to_at_least_one(
     departments: dy.LazyFrame[DepartmentSchema],
     employees: dy.LazyFrame[EmployeeSchema],
@@ -74,4 +87,4 @@ def test_one_to_at_least_one(
     actual = dy.filter_relationship_one_to_at_least_one(
         departments, employees, on="department_id"
     )
-    assert actual.select("department_id").collect().to_series().to_list() == [2]
+    assert actual.select("department_id").collect().to_series().to_list() == [2, 3]

From 5c6b97bde5e04d5c64392996a16d787995d29543 Mon Sep 17 00:00:00 2001
From: Daniel Elsner <daniel.elsner@quantco.com>
Date: Wed, 2 Jul 2025 08:42:01 +0200
Subject: [PATCH 3/5] Rename arg to keep_only_unique

---
 dataframely/functional.py              | 6 +++---
 tests/functional/test_relationships.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/dataframely/functional.py b/dataframely/functional.py
index 588196b..89931ee 100644
--- a/dataframely/functional.py
+++ b/dataframely/functional.py
@@ -32,7 +32,7 @@ def filter_relationship_one_to_one(
     /,
     on: str | list[str],
     *,
-    drop_non_unique: bool = False,
+    keep_only_unique: bool = False,
 ) -> pl.LazyFrame:
     """Express a 1:1 mapping between data frames for a collection filter.
 
@@ -40,10 +40,10 @@ def filter_relationship_one_to_one(
         lhs: The first data frame in the 1:1 mapping.
         rhs: The second data frame in the 1:1 mapping.
         on: The columns to join the data frames on.
-        drop_non_unique: If `True`, drop non-unique rows from both data frames.
+        keep_only_unique: If `True`, drop non-unique rows from both data frames.
             This is useful when the join columns do not already uniquely identify rows.
     """
-    if drop_non_unique:
+    if keep_only_unique:
         return (
             lhs.group_by(on)
             .len(LEN_COLUMN)
diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py
index e2f3869..b72e250 100644
--- a/tests/functional/test_relationships.py
+++ b/tests/functional/test_relationships.py
@@ -67,7 +67,7 @@ def test_one_to_one(
     assert actual.select("department_id").collect().to_series().to_list() == [1]
 
 
-def test_one_to_one_drop_non_unique(
+def test_one_to_one_keep_only_unique(
     departments: dy.LazyFrame[DepartmentSchema],
     employees: dy.LazyFrame[EmployeeSchema],
 ) -> None:
@@ -75,7 +75,7 @@ def test_one_to_one_drop_non_unique(
         departments,
         employees,
         on="department_id",
-        drop_non_unique=True,
+        keep_only_unique=True,
     )
     assert actual.select("department_id").collect().to_series().to_list() == [3]
 

From 5b62379fb1464bf28c1bb58d363788d283235cf4 Mon Sep 17 00:00:00 2001
From: Daniel Elsner <daniel.elsner@quantco.com>
Date: Fri, 17 Oct 2025 11:38:24 +0200
Subject: [PATCH 4/5] Refactor to require_* and consistently use filter_unique

---
 dataframely/__init__.py                |  8 ++--
 dataframely/functional.py              | 61 ++++++++++++++++++--------
 tests/benches/test_collection.py       |  8 ++--
 tests/collection/test_matches.py       |  6 +--
 tests/functional/test_relationships.py | 19 +++++---
 5 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/dataframely/__init__.py b/dataframely/__init__.py
index 7bbd759..53c7dbf 100644
--- a/dataframely/__init__.py
+++ b/dataframely/__init__.py
@@ -54,8 +54,8 @@
 from .failure import FailureInfo
 from .functional import (
     concat_collection_members,
-    filter_relationship_one_to_at_least_one,
-    filter_relationship_one_to_one,
+    require_relationship_one_to_at_least_one,
+    require_relationship_one_to_one,
 )
 from .schema import Schema, deserialize_schema, read_parquet_metadata_schema
 
@@ -71,8 +71,8 @@
     "Config",
     "FailureInfo",
     "concat_collection_members",
-    "filter_relationship_one_to_at_least_one",
-    "filter_relationship_one_to_one",
+    "require_relationship_one_to_at_least_one",
+    "require_relationship_one_to_one",
     "Schema",
     "deserialize_schema",
     "read_parquet_metadata_schema",
diff --git a/dataframely/functional.py b/dataframely/functional.py
index 89931ee..f55fde6 100644
--- a/dataframely/functional.py
+++ b/dataframely/functional.py
@@ -26,13 +26,13 @@
 LEN_COLUMN = "__len__"
 
 
-def filter_relationship_one_to_one(
+def require_relationship_one_to_one(
     lhs: LazyFrame[S] | pl.LazyFrame,
     rhs: LazyFrame[T] | pl.LazyFrame,
     /,
     on: str | list[str],
     *,
-    keep_only_unique: bool = False,
+    filter_unique: bool = True,
 ) -> pl.LazyFrame:
     """Express a 1:1 mapping between data frames for a collection filter.
 
@@ -40,44 +40,67 @@ def filter_relationship_one_to_one(
         lhs: The first data frame in the 1:1 mapping.
         rhs: The second data frame in the 1:1 mapping.
         on: The columns to join the data frames on.
-        keep_only_unique: If `True`, drop non-unique rows from both data frames.
-            This is useful when the join columns do not already uniquely identify rows.
+        filter_unique: If set to `True`, drops rows that are not uniquely identified by the
+            join columns specified with `on`. If set to `False`, skips uniqueness checks
+            and avoids performance penalties. Use with caution, as this may lead to unexpected
+            results if rows in one or both of the data frames are not unique in the join columns.
+
+    Returns:
+        A data frame representing the inner join of the two inputs on the specified
+        columns, filtered to ensure a 1:1 relationship.
     """
-    if keep_only_unique:
-        return (
+    if filter_unique:
+        lhs = (
             lhs.group_by(on)
             .len(LEN_COLUMN)
             .filter(pl.col(LEN_COLUMN) == 1)
             .drop(LEN_COLUMN)
-            .join(
-                rhs.group_by(on)
-                .len(LEN_COLUMN)
-                .filter(pl.col(LEN_COLUMN) == 1)
-                .drop(LEN_COLUMN),
-                on=on,
-                how="inner",
-            )
         )
-    else:
-        return lhs.join(rhs, on=on)
+        rhs = (
+            rhs.group_by(on)
+            .len(LEN_COLUMN)
+            .filter(pl.col(LEN_COLUMN) == 1)
+            .drop(LEN_COLUMN)
+        )
+
+    return lhs.join(rhs, on=on)
 
 
 # ------------------------------- RELATIONSHIP 1:{1,N} ------------------------------- #
 
 
-def filter_relationship_one_to_at_least_one(
+def require_relationship_one_to_at_least_one(
     lhs: LazyFrame[S] | pl.LazyFrame,
     rhs: LazyFrame[T] | pl.LazyFrame,
     /,
     on: str | list[str],
+    *,
+    filter_unique: bool = True,
 ) -> pl.LazyFrame:
     """Express a 1:{1,N} mapping between data frames for a collection filter.
 
     Args:
-        lhs: The data frame with exactly one occurrence for a set of key columns.
-        rhs: The data frame with at least one occurrence for a set of key columns.
+        lhs: The data frame with exactly one occurrence for the set of join columns.
+        rhs: The data frame with at least one occurrence for the set of join columns.
         on: The columns to join the data frames on.
+        filter_unique: If set to `True`, drops rows in `lhs` that are not uniquely
+            identified by the join columns specified with `on`. If set to `False`,
+            skips uniqueness checks and avoids performance penalties. Use with
+            caution, as this may lead to unexpected results if rows in `lhs` are
+            not unique in the join columns.
+
+    Returns:
+        A data frame representing the inner join of the two inputs on the specified
+        columns, filtered to ensure a 1:{1,N} relationship.
     """
+    if filter_unique:
+        lhs = (
+            lhs.group_by(on)
+            .len(LEN_COLUMN)
+            .filter(pl.col(LEN_COLUMN) == 1)
+            .drop(LEN_COLUMN)
+        )
+
     return lhs.join(rhs.unique(on), on=on)
 
 
diff --git a/tests/benches/test_collection.py b/tests/benches/test_collection.py
index ba0fbfa..f2b115a 100644
--- a/tests/benches/test_collection.py
+++ b/tests/benches/test_collection.py
@@ -49,7 +49,7 @@ class SingleFilterCollection(dy.Collection):
 
     @dy.filter()
     def one_to_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_one(self.first, self.second, on="idx")
+        return dy.require_relationship_one_to_one(self.first, self.second, on="idx")
 
 
 @pytest.mark.benchmark(group="collection-filter-single")
@@ -79,17 +79,17 @@ class MultiFilterCollection(dy.Collection):
 
     @dy.filter()
     def one_to_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_one(self.first, self.second, on="idx")
+        return dy.require_relationship_one_to_one(self.first, self.second, on="idx")
 
     @dy.filter()
     def one_to_at_least_one(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_at_least_one(
+        return dy.require_relationship_one_to_at_least_one(
             self.first, self.second, on="idx"
         )
 
     @dy.filter()
     def one_to_at_least_one_reverse(self) -> pl.LazyFrame:
-        return dy.filter_relationship_one_to_at_least_one(
+        return dy.require_relationship_one_to_at_least_one(
             self.second, self.first, on="idx"
         )
 
diff --git a/tests/collection/test_matches.py b/tests/collection/test_matches.py
index 67a6dbe..f36fb45 100644
--- a/tests/collection/test_matches.py
+++ b/tests/collection/test_matches.py
@@ -74,7 +74,7 @@ class MyCollection1(dy.Collection):
     class MyCollection2(MyCollection1):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])
 
     # Should not match
     assert not MyCollection1.matches(MyCollection2)
@@ -93,12 +93,12 @@ class BaseCollection(dy.Collection):
     class MyCollection1(BaseCollection):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_one(self.x, self.x, ["foo"])
 
     class MyCollection2(BaseCollection):
         @dy.filter()
         def test_filter(self) -> pl.LazyFrame:
-            return dy.filter_relationship_one_to_at_least_one(self.x, self.x, ["foo"])
+            return dy.require_relationship_one_to_at_least_one(self.x, self.x, ["foo"])
 
     assert not MyCollection1.matches(MyCollection2)
 
diff --git a/tests/functional/test_relationships.py b/tests/functional/test_relationships.py
index b72e250..f83dbe7 100644
--- a/tests/functional/test_relationships.py
+++ b/tests/functional/test_relationships.py
@@ -57,25 +57,30 @@ def employees() -> dy.LazyFrame[EmployeeSchema]:
 # ------------------------------------------------------------------------------------ #
 
 
+@pytest.mark.parametrize("filter_unique", [True, False])
 def test_one_to_one(
     departments: dy.LazyFrame[DepartmentSchema],
     managers: dy.LazyFrame[ManagerSchema],
+    filter_unique: bool,
 ) -> None:
-    actual = dy.filter_relationship_one_to_one(
-        departments, managers, on="department_id"
+    actual = dy.require_relationship_one_to_one(
+        departments,
+        managers,
+        on="department_id",
+        filter_unique=filter_unique,
     )
     assert actual.select("department_id").collect().to_series().to_list() == [1]
 
 
-def test_one_to_one_keep_only_unique(
+def test_one_to_one_filter_unique(
     departments: dy.LazyFrame[DepartmentSchema],
     employees: dy.LazyFrame[EmployeeSchema],
 ) -> None:
-    actual = dy.filter_relationship_one_to_one(
+    actual = dy.require_relationship_one_to_one(
         departments,
         employees,
         on="department_id",
-        keep_only_unique=True,
+        filter_unique=True,
     )
     assert actual.select("department_id").collect().to_series().to_list() == [3]
 
@@ -84,7 +89,7 @@ def test_one_to_at_least_one(
     departments: dy.LazyFrame[DepartmentSchema],
     employees: dy.LazyFrame[EmployeeSchema],
 ) -> None:
-    actual = dy.filter_relationship_one_to_at_least_one(
-        departments, employees, on="department_id"
+    actual = dy.require_relationship_one_to_at_least_one(
+        departments, employees, on="department_id", filter_unique=False
     )
     assert actual.select("department_id").collect().to_series().to_list() == [2, 3]

From b064387f83ab2c9b6f1e3fa612eb097795b5a023 Mon Sep 17 00:00:00 2001
From: Daniel Elsner <daniel.elsner@quantco.com>
Date: Fri, 17 Oct 2025 11:48:54 +0200
Subject: [PATCH 5/5] Return filtered join directly instead of reassigning lhs/rhs

---
 dataframely/functional.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/dataframely/functional.py b/dataframely/functional.py
index f55fde6..da7991a 100644
--- a/dataframely/functional.py
+++ b/dataframely/functional.py
@@ -50,17 +50,18 @@ def require_relationship_one_to_one(
         columns, filtered to ensure a 1:1 relationship.
     """
     if filter_unique:
-        lhs = (
+        return (
             lhs.group_by(on)
             .len(LEN_COLUMN)
             .filter(pl.col(LEN_COLUMN) == 1)
             .drop(LEN_COLUMN)
-        )
-        rhs = (
-            rhs.group_by(on)
-            .len(LEN_COLUMN)
-            .filter(pl.col(LEN_COLUMN) == 1)
-            .drop(LEN_COLUMN)
+            .join(
+                rhs.group_by(on)
+                .len(LEN_COLUMN)
+                .filter(pl.col(LEN_COLUMN) == 1)
+                .drop(LEN_COLUMN),
+                on=on,
+            )
         )
 
     return lhs.join(rhs, on=on)
@@ -94,11 +95,12 @@ def require_relationship_one_to_at_least_one(
         columns, filtered to ensure a 1:{1,N} relationship.
     """
     if filter_unique:
-        lhs = (
+        return (
             lhs.group_by(on)
             .len(LEN_COLUMN)
             .filter(pl.col(LEN_COLUMN) == 1)
             .drop(LEN_COLUMN)
+            .join(rhs.unique(on), on=on)
         )
 
     return lhs.join(rhs.unique(on), on=on)