Commit 52665fa

feat: Allow drop_duplicates over unordered dataframe (#2303)
1 parent: 41630b5

6 files changed: +49 −33 lines

bigframes/core/block_transforms.py

Lines changed: 22 additions & 23 deletions

```diff
@@ -67,40 +67,39 @@ def indicate_duplicates(
     if keep not in ["first", "last", False]:
         raise ValueError("keep must be one of 'first', 'last', or False'")
 
+    rownums = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.RowNumberOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+    count = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.SizeOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+
     if keep == "first":
         # Count how many copies occur up to current copy of value
         # Discard this value if there are copies BEFORE
-        window_spec = windows.cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        predicate = ops.gt_op.as_expr(rownums, ex.const(0))
     elif keep == "last":
         # Count how many copies occur up to current copy of values
         # Discard this value if there are copies AFTER
-        window_spec = windows.inverse_cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        predicate = ops.lt_op.as_expr(rownums, ops.sub_op.as_expr(count, ex.const(1)))
     else:  # keep == False
         # Count how many copies of the value occur in entire series.
         # Discard this value if there are copies ANYWHERE
-        window_spec = windows.unbound(grouping_keys=tuple(columns))
-    block, dummy = block.create_constant(1)
-    # use row number as will work even with partial ordering
-    block, val_count_col_id = block.apply_window_op(
-        dummy,
-        agg_ops.sum_op,
-        window_spec=window_spec,
-    )
-    block, duplicate_indicator = block.project_expr(
-        ops.gt_op.as_expr(val_count_col_id, ex.const(1))
+        predicate = ops.gt_op.as_expr(count, ex.const(1))
+
+    block = block.project_block_exprs(
+        [predicate],
+        labels=[None],
     )
     return (
-        block.drop_columns(
-            (
-                dummy,
-                val_count_col_id,
-            )
-        ),
-        duplicate_indicator,
+        block,
+        block.value_columns[-1],
     )
```
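The rewrite computes two analytic values over a single window partitioned by the key columns — a per-group row number (`RowNumberOp`) and a per-group size (`SizeOp`) — and reduces each `keep` mode to one boolean predicate, projected with `project_block_exprs`. A rough pandas analogue of the three predicates (a sketch for illustration only, not the BigQuery DataFrames API):

```python
import pandas as pd

df = pd.DataFrame({"bool_col": [True, True, False, None, True]})
grouped = df.groupby("bool_col", dropna=False)

# Mirrors RowNumberOp / SizeOp over a window partitioned by the key columns.
rownums = grouped.cumcount()
count = grouped["bool_col"].transform("size")

dup_first = rownums > 0          # keep="first": a copy exists BEFORE this row
dup_last = rownums < count - 1   # keep="last":  a copy exists AFTER this row
dup_none = count > 1             # keep=False:   a copy exists ANYWHERE
```

A row number needs no total ordering — any consistent numbering within each group suffices — which is what lets the `enforce_ordered` guards in the files below be removed.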

bigframes/core/compile/polars/compiler.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -547,6 +547,9 @@ def compile_agg_op(
         return pl.col(*inputs).first()
     if isinstance(op, agg_ops.LastOp):
         return pl.col(*inputs).last()
+    if isinstance(op, agg_ops.RowNumberOp):
+        # pl.row_index is not yet stable enough to use here, and only supports polars>=1.32
+        return pl.int_range(pl.len(), dtype=pl.Int64)
     if isinstance(op, agg_ops.ShiftOp):
         return pl.col(*inputs).shift(op.periods)
     if isinstance(op, agg_ops.DiffOp):
```
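The new `RowNumberOp` branch uses `pl.int_range(pl.len())` rather than the newer `pl.row_index`, as the comment notes, to stay compatible with older polars releases. A minimal standalone sketch of the same idiom (example data made up):

```python
import polars as pl

df = pl.DataFrame({"key": ["a", "a", "b", "a"], "val": [1, 2, 3, 4]})

# int_range over the window length yields a 0-based per-group row number.
out = df.with_columns(
    pl.int_range(pl.len(), dtype=pl.Int64).over("key").alias("rownum")
)
print(out)  # rownum: 0, 1, 0, 2
```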

bigframes/core/indexes/base.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -624,8 +624,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
         return Index(result)
 
     def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates")
         block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
         return Index(block)
```
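With the `enforce_ordered` guard gone, `Index.drop_duplicates` accepts every `keep` mode on unordered indexes too. A hypothetical usage sketch (assumes a configured BigQuery DataFrames session):

```python
import bigframes.pandas as bpd

idx = bpd.Index([10, 10, 20, 30, 30])
idx.drop_duplicates(keep="last")  # previously required a totally ordered session
idx.drop_duplicates(keep=False)   # was already allowed: keeps only 20
```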

bigframes/dataframe.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -5054,8 +5054,6 @@ def drop_duplicates(
         *,
         keep: str = "first",
     ) -> DataFrame:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
         if subset is None:
             column_ids = self._block.value_columns
         elif utils.is_list_like(subset):
@@ -5069,8 +5067,6 @@ def drop_duplicates(
         return DataFrame(block)
 
     def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
         if subset is None:
             column_ids = self._block.value_columns
         else:
```
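The same relaxation at the DataFrame level: `drop_duplicates` and `duplicated` no longer reject `keep="first"`/`"last"` on unordered frames. A sketch of what now works; the `ordering_mode = "partial"` option is the opt-in for unordered frames, while the table name is just an example:

```python
import bigframes.pandas as bpd

bpd.options.bigquery.ordering_mode = "partial"  # opt in to unordered frames

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
deduped = df.drop_duplicates(subset="species", keep="first")  # now allowed
mask = df.duplicated(subset="species", keep="last")           # now allowed
```

Note that under partial ordering, the row kept for `keep="first"`/`"last"` is an arbitrary representative rather than a positional first or last.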

bigframes/series.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -2227,8 +2227,6 @@ def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None)
         return self.reindex(other.index, validate=validate)
 
     def drop_duplicates(self, *, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
         block = block_ops.drop_duplicates(self._block, (self._value_column,), keep)
         return Series(block)
 
@@ -2249,8 +2247,6 @@ def unique(self, keep_order=True) -> Series:
         return Series(block.select_columns(result).reset_index())
 
     def duplicated(self, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
         block, indicator = block_ops.indicate_duplicates(
             self._block, (self._value_column,), keep
         )
```
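And the Series methods mirror it. A small sketch (assumes a configured session):

```python
import bigframes.pandas as bpd

s = bpd.Series([1, 1, 2, 3, 3, 3])
s.duplicated(keep="first")     # exactly one False per distinct value
s.drop_duplicates(keep=False)  # only the unique value 2 survives
```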

tests/system/large/test_dataframe.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -40,3 +40,27 @@ def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
         check_index_type=False,
         check_column_type=False,
     )
+
+
+@pytest.mark.parametrize(
+    ("keep",),
+    [
+        ("first",),
+        ("last",),
+        (False,),
+    ],
+)
+def test_drop_duplicates_unordered(
+    scalars_df_unordered, scalars_pandas_df_default_index, keep
+):
+    uniq_scalar_rows = scalars_df_unordered.drop_duplicates(
+        subset="bool_col", keep=keep
+    )
+    uniq_pd_rows = scalars_pandas_df_default_index.drop_duplicates(
+        subset="bool_col", keep=keep
+    )
+
+    assert len(uniq_scalar_rows) == len(uniq_pd_rows)
+    assert len(uniq_scalar_rows.groupby("bool_col")) == len(
+        uniq_pd_rows.groupby("bool_col")
+    )
```
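The assertions compare only row counts and group counts because, on an unordered frame, which duplicate row survives is nondeterministic, so exact frame equality against pandas cannot be expected. A hypothetical stricter check could still compare the surviving key values themselves:

```python
import pandas as pd

# Hypothetical: the surviving bool_col values must match even though
# the surviving rows may differ.
pd.testing.assert_series_equal(
    uniq_scalar_rows["bool_col"].to_pandas().sort_values(ignore_index=True),
    uniq_pd_rows["bool_col"].sort_values(ignore_index=True),
    check_dtype=False,
    check_names=False,
)
```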
