Commit b8277d5
[Data] - Make Projection pushdown tests non-flaky + predicates through projects (#58688)
## Description

1. Use the `rows_same` util for the tests in `test_projection_fusion`.
2. Properly handle pushing predicates past projections: block pushdown when the predicate references columns removed by a select, computed columns, or old column names after a rename.

Signed-off-by: Goutam <[email protected]>
1 parent 004e6dc commit b8277d5
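For a concrete picture of change 2, here is a minimal sketch of the rename-chain case the new check handles, adapted from the tests in this commit (the data is illustrative):

```python
import ray
from ray.data.expressions import col

ds = ray.data.from_items([{"a": 1, "b": 2}, {"a": 2, "b": 5}, {"a": 3, "b": 6}])

# rename(a->b, b->c) reuses the name 'b' for what used to be 'a'. Filtering
# on 'b' is valid, and the optimizer can now rebind the predicate to 'a' and
# push it below the projection instead of incorrectly blocking pushdown.
renamed = ds.rename_columns({"a": "b", "b": "c"})
print(renamed.filter(expr=col("b") > 1).take_all())
# [{'b': 2, 'c': 5}, {'b': 3, 'c': 6}]
```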

3 files changed: +316 -87 lines changed

python/ray/data/_internal/logical/rules/predicate_pushdown.py

Lines changed: 89 additions & 1 deletion
@@ -9,7 +9,7 @@
     PredicatePassThroughBehavior,
     Rule,
 )
-from ray.data._internal.logical.operators.map_operator import Filter
+from ray.data._internal.logical.operators.map_operator import Filter, Project
 from ray.data._internal.planner.plan_expression.expression_visitors import (
     _ColumnSubstitutionVisitor,
 )
@@ -63,6 +63,86 @@ def _try_fuse_filters(cls, op: LogicalOperator) -> LogicalOperator:
             predicate_expr=combined_predicate,
         )
 
+    @classmethod
+    def _can_push_filter_through_projection(
+        cls, filter_op: "Filter", projection_op: Project
+    ) -> bool:
+        """Check if a filter can be pushed through a projection operator.
+
+        Returns False (blocks pushdown) if filter references:
+        - Columns removed by select: select(['a']).filter(col('b'))
+        - Computed columns: with_column('d', 4).filter(col('d'))
+        - Old column names after rename: rename({'b': 'B'}).filter(col('b'))
+
+        Returns True (allows pushdown) for:
+        - Columns present in output: select(['a', 'b']).filter(col('a'))
+        - New column names after rename: rename({'b': 'B'}).filter(col('B'))
+        - Rename chains with name reuse: rename({'a': 'b', 'b': 'c'}).filter(col('b'))
+          (where 'b' is valid output created by a->b)
+        """
+        from ray.data._internal.logical.rules.projection_pushdown import (
+            _is_renaming_expr,
+        )
+        from ray.data._internal.planner.plan_expression.expression_visitors import (
+            _ColumnReferenceCollector,
+        )
+        from ray.data.expressions import AliasExpr
+
+        collector = _ColumnReferenceCollector()
+        collector.visit(filter_op._predicate_expr)
+        predicate_columns = set(collector.get_column_refs() or [])
+
+        output_columns = set()
+        new_names = set()
+        original_columns_being_renamed = set()
+
+        for expr in projection_op.exprs:
+            if expr.name is not None:
+                # Collect output column names
+                output_columns.add(expr.name)
+
+                # Process AliasExpr (computed columns or renames)
+                if isinstance(expr, AliasExpr):
+                    new_names.add(expr.name)
+
+                    # Check computed column: with_column('d', 4) creates AliasExpr(lit(4), 'd')
+                    if expr.name in predicate_columns and not _is_renaming_expr(expr):
+                        return False  # Computed column
+
+                    # Track old names being renamed for later check
+                    if _is_renaming_expr(expr):
+                        original_columns_being_renamed.add(expr.expr.name)
+
+        # Check if filter references columns removed by explicit select.
+        # Valid if: projection includes all columns (star) OR predicate columns exist in output
+        has_required_columns = (
+            projection_op.has_star_expr() or predicate_columns.issubset(output_columns)
+        )
+        if not has_required_columns:
+            return False
+
+        # Find old names that are:
+        # 1. Being renamed away (in original_columns_being_renamed), AND
+        # 2. Referenced in predicate (in predicate_columns), AND
+        # 3. NOT recreated as new names (not in new_names)
+        #
+        # Examples:
+        #   rename({'b': 'B'}).filter(col('b'))
+        #     → {'b'} & {'b'} - {'B'} = {'b'} → BLOCKS (old name 'b' no longer exists)
+        #   rename({'a': 'b', 'b': 'c'}).filter(col('b'))
+        #     → {'a','b'} & {'b'} - {'b','c'} = {} → ALLOWS (new 'b' created by a->b)
+        #   rename({'b': 'B'}).filter(col('B'))
+        #     → {'b'} & {'B'} - {'B'} = {} → ALLOWS (using new name 'B')
+        invalid_old_names = (
+            original_columns_being_renamed & predicate_columns
+        ) - new_names
+        if invalid_old_names:
+            return False  # Old name after rename
+
+        return True
+
     @classmethod
     def _substitute_predicate_columns(
         cls, predicate_expr: Expr, column_rename_map: dict[str, str]
@@ -135,6 +215,14 @@ def _try_push_down_predicate(cls, op: LogicalOperator) -> LogicalOperator:
                     behavior
                     == PredicatePassThroughBehavior.PASSTHROUGH_WITH_SUBSTITUTION
                 ):
+                    # Check if we can safely push the filter through this projection
+                    if isinstance(
+                        input_op, Project
+                    ) and not cls._can_push_filter_through_projection(
+                        filter_op, input_op
+                    ):
+                        return filter_op
+
                     rename_map = input_op.get_column_substitutions()
                     if rename_map:
                         predicate_expr = cls._substitute_predicate_columns(
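For intuition, here is a standalone sketch of the set arithmetic `_can_push_filter_through_projection` performs. The projection model (a plain dict plus a `has_star` flag) and the function name are illustrative simplifications, not Ray's internals:

```python
# Projection modeled as {output_name: source_name}; a None source marks a
# computed column; has_star means untouched input columns pass through
# (as with rename_columns). Illustrative only.
def can_push_through(predicate_cols: set, projection: dict, has_star: bool = False) -> bool:
    output_columns = set(projection)
    computed = {out for out, src in projection.items() if src is None}
    renames = {out: src for out, src in projection.items() if src is not None}

    # Block: the predicate touches a computed column, e.g. with_column('d', lit(4)).
    if predicate_cols & computed:
        return False
    # Block: the predicate references a column dropped by an explicit select.
    if not has_star and not predicate_cols <= output_columns:
        return False
    # Block: the predicate uses an old name that was renamed away and not recreated.
    invalid_old_names = (set(renames.values()) & predicate_cols) - set(renames)
    return not invalid_old_names

# rename({'a': 'b', 'b': 'c'}): 'b' is a valid output (created from 'a') -> allowed.
assert can_push_through({"b"}, {"b": "a", "c": "b"}, has_star=True)
# rename({'b': 'B'}): old name 'b' no longer exists -> blocked.
assert not can_push_through({"b"}, {"B": "b"}, has_star=True)
# select(['a']): filtering on the dropped column 'b' -> blocked.
assert not can_push_through({"b"}, {"a": "a"})
```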

python/ray/data/tests/test_predicate_pushdown.py

Lines changed: 171 additions & 1 deletion
@@ -11,7 +11,7 @@
     Repartition,
     Sort,
 )
-from ray.data._internal.logical.operators.map_operator import Filter
+from ray.data._internal.logical.operators.map_operator import Filter, Project
 from ray.data._internal.logical.operators.one_to_one_operator import Limit
 from ray.data._internal.logical.optimizers import LogicalOptimizer
 from ray.data._internal.util import rows_same
@@ -543,6 +543,176 @@ def test_multiple_filters_with_renames(self, parquet_ds):
         ), "All filters should be fused, rebound, and pushed into Read"
 
 
+class TestProjectionWithFilterEdgeCases:
+    """Tests for edge cases with select_columns and with_column followed by filters.
+
+    These tests verify that filters correctly handle:
+    - Columns that are kept by select (should push through)
+    - Columns that are removed by select (should NOT push through)
+    - Computed columns from with_column (should NOT push through)
+    """
+
+    @pytest.fixture
+    def base_ds(self, ray_start_regular_shared):
+        return ray.data.from_items(
+            [
+                {"a": 1, "b": 2, "c": 3},
+                {"a": 2, "b": 5, "c": 8},
+                {"a": 3, "b": 6, "c": 9},
+            ]
+        )
+
+    def test_select_then_filter_on_selected_column(self, base_ds):
+        """Filter on a selected column should push through the select."""
+        ds = base_ds.select_columns(["a", "b"]).filter(expr=col("a") > 1)
+
+        # Verify correctness
+        result_df = ds.to_pandas()
+        expected_df = pd.DataFrame(
+            [
+                {"a": 2, "b": 5},
+                {"a": 3, "b": 6},
+            ]
+        )
+        # Sort columns before comparison
+        result_df = result_df[sorted(result_df.columns)]
+        expected_df = expected_df[sorted(expected_df.columns)]
+        assert rows_same(result_df, expected_df)
+
+        # Verify plan: filter pushed through select
+        optimized_plan = LogicalOptimizer().optimize(ds._plan._logical_plan)
+        assert plan_operator_comes_before(
+            optimized_plan, Filter, Project
+        ), "Filter should be pushed before Project"
+
+    def test_select_then_filter_on_removed_column(self, base_ds):
+        """Filter on a removed column should fail, not push through."""
+        ds = base_ds.select_columns(["a"])
+
+        with pytest.raises((KeyError, ray.exceptions.RayTaskError)):
+            ds.filter(expr=col("b") == 2).take_all()
+
+    def test_with_column_then_filter_on_computed_column(self, base_ds):
+        """Filter on a computed column should not push through."""
+
+        from ray.data.expressions import lit
+
+        ds = base_ds.with_column("d", lit(4)).filter(expr=col("d") == 4)
+
+        # Verify correctness - all rows should pass (d is always 4)
+        result_df = ds.to_pandas()
+        expected_df = pd.DataFrame(
+            [
+                {"a": 1, "b": 2, "c": 3, "d": 4},
+                {"a": 2, "b": 5, "c": 8, "d": 4},
+                {"a": 3, "b": 6, "c": 9, "d": 4},
+            ]
+        )
+        # Sort columns before comparison
+        result_df = result_df[sorted(result_df.columns)]
+        expected_df = expected_df[sorted(expected_df.columns)]
+        assert rows_same(result_df, expected_df)
+
+        # Verify plan: filter should NOT push through (stays after with_column)
+        optimized_plan = LogicalOptimizer().optimize(ds._plan._logical_plan)
+        assert plan_has_operator(
+            optimized_plan, Filter
+        ), "Filter should remain (not pushed through)"
+
+    def test_rename_then_filter_on_old_column_name(self, base_ds):
+        """Filter using the old column name after a rename should fail."""
+        ds = base_ds.rename_columns({"b": "B"})
+
+        with pytest.raises((KeyError, ray.exceptions.RayTaskError)):
+            ds.filter(expr=col("b") == 2).take_all()
+
+    @pytest.mark.parametrize(
+        "ds_factory,rename_map,filter_col,filter_value,expected_rows",
+        [
+            # In-memory dataset: rename a->b, b->b_old
+            (
+                lambda: ray.data.from_items(
+                    [
+                        {"a": 1, "b": 2, "c": 3},
+                        {"a": 2, "b": 5, "c": 8},
+                        {"a": 3, "b": 6, "c": 9},
+                    ]
+                ),
+                {"a": "b", "b": "b_old"},
+                "b",
+                1,
+                [{"b": 2, "b_old": 5, "c": 8}, {"b": 3, "b_old": 6, "c": 9}],
+            ),
+            # Parquet dataset: rename sepal.length->sepal.width, sepal.width->old_width
+            (
+                lambda: ray.data.read_parquet("example://iris.parquet"),
+                {"sepal.length": "sepal.width", "sepal.width": "old_width"},
+                "sepal.width",
+                5.0,
+                None,  # Will verify via alternative computation
+            ),
+        ],
+        ids=["in_memory", "parquet"],
+    )
+    def test_rename_chain_with_name_reuse(
+        self,
+        ray_start_regular_shared,
+        ds_factory,
+        rename_map,
+        filter_col,
+        filter_value,
+        expected_rows,
+    ):
+        """Test rename chains where an output name matches another rename's input name.
+
+        This tests the fix for a bug where rename(a->b, b->c) followed by filter(b>5)
+        would incorrectly block pushdown, even though 'b' is a valid output column
+        (created by a->b).
+
+        Example: rename({'a': 'b', 'b': 'temp'}) creates 'b' from 'a' and 'temp'
+        from 'b'. A filter on 'b' should be able to push through.
+        """
+        ds = ds_factory()
+
+        # Apply rename and filter
+        ds_renamed_filtered = ds.rename_columns(rename_map).filter(
+            expr=col(filter_col) > filter_value
+        )
+
+        # Verify correctness
+        if expected_rows is not None:
+            # For in-memory, compare against expected rows
+            result_df = ds_renamed_filtered.to_pandas()
+            expected_df = pd.DataFrame(expected_rows)
+            result_df = result_df[sorted(result_df.columns)]
+            expected_df = expected_df[sorted(expected_df.columns)]
+            assert rows_same(result_df, expected_df)
+        else:
+            # For parquet, compare against an alternative computation:
+            # filter on the original column, then rename
+            original_col = next(k for k, v in rename_map.items() if v == filter_col)
+            expected = ds.filter(expr=col(original_col) > filter_value).rename_columns(
+                rename_map
+            )
+            assert rows_same(ds_renamed_filtered.to_pandas(), expected.to_pandas())
+
+        # Verify plan optimization
+        optimized_plan = LogicalOptimizer().optimize(
+            ds_renamed_filtered._plan._logical_plan
+        )
+
+        # For parquet (supports predicate pushdown), filter should push into Read
+        if "parquet" in str(ds._plan._logical_plan.dag).lower():
+            assert not plan_has_operator(
+                optimized_plan, Filter
+            ), "Filter should be pushed into Read after rebinding through rename chain"
+        else:
+            # For in-memory, filter should at least push through the projection
+            assert plan_operator_comes_before(
+                optimized_plan, Filter, Project
+            ), "Filter should be pushed before Project after rebinding through rename chain"
+
+
 class TestPushIntoBranchesBehavior:
     """Tests for PUSH_INTO_BRANCHES behavior operators.
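On change 1: the tests above compare results with `rows_same` rather than asserting on row order, which avoids flakes from nondeterministic output ordering. A rough approximation of what such an order-insensitive comparison does (my own sketch, not Ray's actual `rows_same` implementation):

```python
import pandas as pd

# Sort both frames by all columns so nondeterministic row order can't flip
# the assertion, then compare. Sketch only; the real utility lives in
# ray.data._internal.util.rows_same.
def rows_same_sketch(actual: pd.DataFrame, expected: pd.DataFrame) -> bool:
    cols = sorted(actual.columns)
    if cols != sorted(expected.columns):
        return False
    a = actual[cols].sort_values(by=cols).reset_index(drop=True)
    b = expected[cols].sort_values(by=cols).reset_index(drop=True)
    return a.equals(b)
```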
