@@ -67,40 +67,39 @@ def indicate_duplicates(
6767 if keep not in ["first" , "last" , False ]:
6868 raise ValueError ("keep must be one of 'first', 'last', or False'" )
6969
70+ rownums = agg_expressions .WindowExpression (
71+ agg_expressions .NullaryAggregation (
72+ agg_ops .RowNumberOp (),
73+ ),
74+ window = windows .unbound (grouping_keys = tuple (columns )),
75+ )
76+ count = agg_expressions .WindowExpression (
77+ agg_expressions .NullaryAggregation (
78+ agg_ops .SizeOp (),
79+ ),
80+ window = windows .unbound (grouping_keys = tuple (columns )),
81+ )
82+
7083 if keep == "first" :
7184 # Count how many copies occur up to current copy of value
7285 # Discard this value if there are copies BEFORE
73- window_spec = windows .cumulative_rows (
74- grouping_keys = tuple (columns ),
75- )
86+ predicate = ops .gt_op .as_expr (rownums , ex .const (0 ))
7687 elif keep == "last" :
7788 # Count how many copies occur up to current copy of values
7889 # Discard this value if there are copies AFTER
79- window_spec = windows .inverse_cumulative_rows (
80- grouping_keys = tuple (columns ),
81- )
90+ predicate = ops .lt_op .as_expr (rownums , ops .sub_op .as_expr (count , ex .const (1 )))
8291 else : # keep == False
8392 # Count how many copies of the value occur in entire series.
8493 # Discard this value if there are copies ANYWHERE
85- window_spec = windows .unbound (grouping_keys = tuple (columns ))
86- block , dummy = block .create_constant (1 )
87- # use row number as will work even with partial ordering
88- block , val_count_col_id = block .apply_window_op (
89- dummy ,
90- agg_ops .sum_op ,
91- window_spec = window_spec ,
92- )
93- block , duplicate_indicator = block .project_expr (
94- ops .gt_op .as_expr (val_count_col_id , ex .const (1 ))
94+ predicate = ops .gt_op .as_expr (count , ex .const (1 ))
95+
96+ block = block .project_block_exprs (
97+ [predicate ],
98+ labels = [None ],
9599 )
96100 return (
97- block .drop_columns (
98- (
99- dummy ,
100- val_count_col_id ,
101- )
102- ),
103- duplicate_indicator ,
101+ block ,
102+ block .value_columns [- 1 ],
104103 )
105104
106105
0 commit comments