[Data] - Add Predicate Pushdown Rule #58150
First changed file (the Parquet datasource):

@@ -56,6 +56,7 @@
     _has_file_extension,
     _resolve_paths_and_filesystem,
 )
+from ray.data.expressions import Expr
 from ray.util.debug import log_once

 if TYPE_CHECKING:

@@ -286,7 +287,7 @@ def __init__(
         self._file_metadata_shuffler = None
         self._include_paths = include_paths
         self._partitioning = partitioning
+        self._predicate_expr: Optional[Expr] = None
         if shuffle == "files":
             self._file_metadata_shuffler = np.random.default_rng()
         elif isinstance(shuffle, FileShuffleConfig):

@@ -362,6 +363,12 @@ def get_read_tasks(
         )

         read_tasks = []
+        filter_expr = (
+            self._predicate_expr.to_pyarrow()
+            if self._predicate_expr is not None
+            else None
+        )
+
         for fragments, paths in zip(
             np.array_split(pq_fragments, parallelism),
             np.array_split(pq_paths, parallelism),

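For context on what this conversion produces: `to_pyarrow()` turns the Ray Data expression held by the datasource into a `pyarrow.dataset.Expression` that Arrow can evaluate inside the scan. A minimal sketch of the correspondence — the column names and literal values are illustrative, and the exact output shape of `to_pyarrow()` is an assumption rather than something shown in this diff:

```python
import pyarrow.dataset as ds

from ray.data.expressions import col

# Hypothetical Ray Data predicate built with the expression API.
ray_predicate = (col("year") >= 2020) & (col("country") == "US")

# Roughly the equivalent pyarrow.dataset.Expression that to_pyarrow() would
# need to produce so Arrow can apply the predicate during the Parquet scan.
arrow_predicate = (ds.field("year") >= 2020) & (ds.field("country") == "US")
```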
@@ -411,6 +418,7 @@ def get_read_tasks(
                     f,
                     include_paths,
                     partitioning,
+                    filter_expr,
                 ),
                 meta,
                 schema=target_schema,

@@ -434,6 +442,9 @@ def supports_distributed_reads(self) -> bool:
     def supports_projection_pushdown(self) -> bool:
         return True

+    def supports_predicate_pushdown(self) -> bool:
+        return True
+
     def get_current_projection(self) -> Optional[List[str]]:
         # NOTE: In case there's no projection both file and partition columns
         # will be none

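These two lines mirror the existing projection-pushdown hooks (`supports_projection_pushdown` / `apply_projection`). Judging from how the new rule uses them, the interface the datasource is opting into looks roughly like the sketch below; the real `LogicalOperatorSupportsPredicatePushdown` lives in `ray.data._internal.logical.interfaces` and may differ in detail:

```python
from ray.data.expressions import Expr


class SupportsPredicatePushdownSketch:
    """Sketch of the capability the PredicatePushdown rule probes for."""

    def supports_predicate_pushdown(self) -> bool:
        # Opt-in flag; ParquetDatasource returns True in this PR.
        return False

    def apply_predicate(self, predicate_expr: Expr) -> "SupportsPredicatePushdownSketch":
        # Return a copy of the operator/datasource that carries the predicate.
        raise NotImplementedError
```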
@@ -456,6 +467,35 @@ def apply_projection(
         return clone

+    # TODO: This should be moved to the Datasource class
+    def apply_predicate(
+        self,
+        predicate_expr: Expr,
+    ) -> "ParquetDatasource":
+        from ray.data._internal.planner.plan_expression.expression_visitors import (
+            _ColumnRefRebindingVisitor,
+        )
+        from ray.data.expressions import col
+
+        clone = copy.copy(self)
+        # Handle column renaming for Ray Data expressions
+        if self._data_columns_rename_map:
+            # Create mapping from new column names to old column names
+            column_mapping = {
+                new_col: col(old_col)
+                for old_col, new_col in self._data_columns_rename_map.items()
+            }
+            visitor = _ColumnRefRebindingVisitor(column_mapping)
+            predicate_expr = visitor.visit(predicate_expr)
+
+        # Combine with existing predicate using AND
+        if clone._predicate_expr is not None:
+            clone._predicate_expr = clone._predicate_expr & predicate_expr
+        else:
+            clone._predicate_expr = predicate_expr
+
+        return clone
+
     def _estimate_in_mem_size(self, fragments: List[_ParquetFragment]) -> int:
         in_mem_size = sum([f.file_size for f in fragments]) * self._encoding_ratio

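To make the rename handling above concrete (the column names below are made up): if the datasource exposes the on-disk column `price` under the name `cost`, a user predicate written against `cost` has to be rebound to `price` before it can be pushed into the Parquet scan.

```python
from ray.data.expressions import col

# Hypothetical rename map on the datasource: on-disk "price" is exposed as "cost".
data_columns_rename_map = {"price": "cost"}

# User-facing predicate, written against the renamed column.
user_predicate = col("cost") > 100

# Build the new-name -> old-column mapping exactly as in apply_predicate;
# _ColumnRefRebindingVisitor.visit() then rewrites the predicate so that it
# reads col("price") > 100 and matches the physical Parquet schema.
column_mapping = {new: col(old) for old, new in data_columns_rename_map.items()}
```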
@@ -473,6 +513,7 @@ def read_fragments(
     fragments: List[_ParquetFragment],
     include_paths: bool,
     partitioning: Partitioning,
+    filter_expr: Optional["pyarrow.dataset.Expression"] = None,
 ) -> Iterator["pyarrow.Table"]:
     # This import is necessary to load the tensor extension type.
     from ray.data.extensions.tensor_extension import ArrowTensorType  # noqa

@@ -494,6 +535,7 @@ def read_fragments(
             partition_columns=partition_columns,
             partitioning=partitioning,
             include_path=include_paths,
+            filter_expr=filter_expr,
             batch_size=default_read_batch_size_rows,
             to_batches_kwargs=to_batches_kwargs,
         ),

@@ -532,7 +574,14 @@ def _read_batches_from(
     # NOTE: Passed in kwargs overrides always take precedence
     # TODO deprecate to_batches_kwargs
     use_threads = to_batches_kwargs.pop("use_threads", use_threads)
-    filter_expr = to_batches_kwargs.pop("filter", filter_expr)
+    # TODO: We should deprecate filter through the read_parquet API and only allow through dataset.filter()
+    if to_batches_kwargs.get("filter") is not None:
+        filter_from_kwargs = to_batches_kwargs.get("filter")
+        if filter_expr is not None:
+            filter_expr = filter_expr & filter_from_kwargs
+        else:
+            filter_expr = filter_from_kwargs
+        to_batches_kwargs.pop("filter")
     # NOTE: Arrow's ``to_batches`` expects ``batch_size`` as an int
     if batch_size is not None:
         to_batches_kwargs.setdefault("batch_size", batch_size)

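Downstream of this merge, the combined expression is what gets handed to Arrow for each fragment. A rough sketch of the end state — the dataset path, field names, and batch size are illustrative, not taken from this diff:

```python
import pyarrow.dataset as ds

# Predicate pushed down from the logical plan (already converted via to_pyarrow()).
pushed_down = ds.field("year") >= 2020
# Legacy filter passed through the read_parquet(...) kwargs, if any.
legacy_kwarg = ds.field("country") == "US"

# Mirror the branch above: AND the two together, then let Arrow apply it.
combined = pushed_down & legacy_kwarg

dataset = ds.dataset("example.parquet", format="parquet")  # illustrative path
for fragment in dataset.get_fragments():
    for batch in fragment.to_batches(filter=combined, batch_size=1024):
        ...  # only rows satisfying the combined predicate are materialized
```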
Second changed file (the new predicate-pushdown rule; entirely new):

@@ -0,0 +1,68 @@
from ray.data._internal.logical.interfaces import (
    LogicalOperator,
    LogicalOperatorSupportsPredicatePushdown,
    LogicalPlan,
    Rule,
)
from ray.data._internal.logical.operators.map_operator import Filter


class PredicatePushdown(Rule):
    """Pushes down predicates across the graph.

    This rule performs the following optimizations:
    1. Combines chained Filter operators with compatible expressions
    2. Pushes filter expressions down to operators that support predicate pushdown
    """

    def apply(self, plan: LogicalPlan) -> LogicalPlan:
        """Apply predicate pushdown optimization to the logical plan."""
        dag = plan.dag
        while True:
            new_dag = dag._apply_transform(self._try_fuse_filters)
            new_dag = new_dag._apply_transform(self._try_push_down_predicate)
            if new_dag is dag:
                break
            dag = new_dag
        return LogicalPlan(dag, plan.context)

    @classmethod
    def _try_fuse_filters(cls, op: LogicalOperator) -> LogicalOperator:
        """Fuse consecutive Filter operators with compatible expressions."""
        if not isinstance(op, Filter) or not op.is_expression_based():
            return op

        input_op = op.input_dependencies[0]
        if not isinstance(input_op, Filter) or not input_op.is_expression_based():
            return op

        # Check if predicates are of the same type
        if type(op._predicate_expr) is not type(input_op._predicate_expr):
            return op

        # Combine predicates
        combined_predicate = op._predicate_expr & input_op._predicate_expr

        # Create new filter on the input of the lower filter
        return Filter(
            input_op.input_dependencies[0],
            predicate_expr=combined_predicate,
        )

    @classmethod
    def _try_push_down_predicate(cls, op: LogicalOperator) -> LogicalOperator:
        """Push Filter down to any operator that supports predicate pushdown."""
        if not isinstance(op, Filter) or not op.is_expression_based():
            return op

        input_op = op.input_dependencies[0]

        # Check if the input operator supports predicate pushdown
        if (
            isinstance(input_op, LogicalOperatorSupportsPredicatePushdown)
            and input_op.supports_predicate_pushdown()
        ):
            # Push the predicate down and return the result without the filter
            return input_op.apply_predicate(op._predicate_expr)

        return op
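To see the intended effect of the rule end to end, here is a sketch of the rewrite on a simple pipeline. Whether `Dataset.filter` accepts `Expr` objects directly, and the exact read path, are assumptions for illustration rather than something this diff shows:

```python
import ray
from ray.data.expressions import col

# Hypothetical pipeline with two chained expression-based filters.
ds = (
    ray.data.read_parquet("s3://bucket/table/")  # illustrative path
    .filter(expr=(col("year") >= 2020))
    .filter(expr=(col("country") == "US"))
)

# Logical plan before the rule:
#   Read(Parquet) -> Filter(year >= 2020) -> Filter(country == "US")
#
# After _try_fuse_filters (downstream predicate AND-ed with upstream one):
#   Read(Parquet) -> Filter((country == "US") & (year >= 2020))
#
# After _try_push_down_predicate:
#   Read(Parquet) with _predicate_expr = (country == "US") & (year >= 2020)
# The Filter disappears from the plan and the predicate is evaluated by Arrow
# inside the Parquet scan via to_pyarrow().
```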
How much work would it actually be to push our expr all the way into the reader itself?
If it's not a lot, let's do the right thing right away (otherwise do it in a stacked PR).
I'm already pushing this into the reader.
`apply_predicate` should change the `_predicate_expr`, which then calls `to_pyarrow()` to convert it to a `pyarrow.dataset.Expression`, which then gets sent to `fragment.to_batches()`.
I meant threading our expressions through, instead of PA ones.
I'm not sure I follow. How will pyarrow accept Ray Data's Expressions? At some point we have to convert before calling `to_batches()`, right?
Discussed offline. As part of the next PR, I'll refactor the remaining two functions that are not managed by PyArrow so that they only take Ray Data's `Expr`.