
Commit d4e4336

cem-anyscale and gemini-code-assist[bot] authored and committed
[Data] Callback-based stat computation for preprocessors and ValueCounter (#56848)
* Updated preprocessors to use a callback-based approach for stat computation. This improves code organization and reduces duplication.
* Added ValueCounter aggregator and value_counts method to BlockColumnAccessor. Includes implementations for both Arrow and Pandas backends.

Signed-off-by: cem <[email protected]>
Signed-off-by: cem-anyscale <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: elliot-barn <[email protected]>
1 parent 678c503 commit d4e4336
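The summary above leaves the callback-based flow implicit. Below is a minimal sketch of the pattern under stated assumptions: only reset() and compute() are visible in the preprocessor.py diff further down, so register() and the callback shape here are illustrative stand-ins, not the real StatComputationPlan API from ray.data.preprocessors.utils.

from typing import Any, Callable, Dict


class SketchStatComputationPlan:
    """Illustrative stand-in for ray.data.preprocessors.utils.StatComputationPlan."""

    def __init__(self) -> None:
        self._callbacks: Dict[str, Callable[[Any], Any]] = {}

    def register(self, name: str, callback: Callable[[Any], Any]) -> None:
        # Each preprocessor's _fit() registers the stats it needs...
        self._callbacks[name] = callback

    def reset(self) -> None:
        self._callbacks.clear()

    def compute(self, dataset: Any) -> Dict[str, Any]:
        # ...and fit() later computes every registered stat in one place,
        # instead of each preprocessor duplicating its own aggregation code.
        return {name: cb(dataset) for name, cb in self._callbacks.items()}


plan = SketchStatComputationPlan()
plan.register("max(x)", lambda ds: max(ds))  # toy "dataset": a plain list
plan.register("min(x)", lambda ds: min(ds))
print(plan.compute([3, 1, 4]))  # {'max(x)': 4, 'min(x)': 1}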

File tree

17 files changed: +768 −284 lines


doc/source/data/api/aggregate.rst

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ compute aggregations.
    AbsMax
    Quantile
    Unique
+   ValueCounter
    MissingValuePercentage
    ZeroPercentage
    ApproximateQuantile

python/ray/data/_internal/arrow_block.py

Lines changed: 28 additions & 0 deletions

@@ -530,11 +530,39 @@ def unique(self) -> BlockColumn:
 
         return pac.unique(self._column)
 
+    def value_counts(self) -> Optional[Dict[str, List]]:
+        import pyarrow.compute as pac
+
+        value_counts: pyarrow.StructArray = pac.value_counts(self._column)
+        if len(value_counts) == 0:
+            return None
+        return {
+            "values": value_counts.field("values").to_pylist(),
+            "counts": value_counts.field("counts").to_pylist(),
+        }
+
+    def hash(self) -> BlockColumn:
+        import polars as pl
+
+        df = pl.DataFrame({"col": self._column})
+        hashes = df.hash_rows().cast(pl.Int64, wrap_numerical=True)
+        return hashes.to_arrow()
+
     def flatten(self) -> BlockColumn:
         import pyarrow.compute as pac
 
         return pac.list_flatten(self._column)
 
+    def dropna(self) -> BlockColumn:
+        import pyarrow.compute as pac
+
+        return pac.drop_null(self._column)
+
+    def is_composed_of_lists(self, types: Optional[Tuple] = None) -> bool:
+        if not types:
+            types = (pyarrow.lib.ListType, pyarrow.lib.LargeListType)
+        return isinstance(self._column.type, types)
+
     def to_pylist(self) -> List[Any]:
         return self._column.to_pylist()
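For reference, a minimal standalone sketch of what the Arrow-backed value_counts() above returns (the column contents are made up):

import pyarrow as pa
import pyarrow.compute as pac

# pac.value_counts returns a StructArray with "values" and "counts" fields,
# which the accessor above converts to plain Python lists.
col = pa.chunked_array([["a", "b", "a"], ["c", "a"]])
vc = pac.value_counts(col)
print({
    "values": vc.field("values").to_pylist(),
    "counts": vc.field("counts").to_pylist(),
})  # {'values': ['a', 'b', 'c'], 'counts': [3, 1, 1]}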

python/ray/data/_internal/pandas_block.py

Lines changed: 45 additions & 1 deletion

@@ -174,8 +174,33 @@ def quantile(
     ) -> Optional[U]:
         return self._column.quantile(q=q)
 
+    def value_counts(self) -> Optional[Dict[str, List]]:
+        value_counts = self._column.value_counts()
+        if len(value_counts) == 0:
+            return None
+        return {
+            "values": value_counts.index.tolist(),
+            "counts": value_counts.values.tolist(),
+        }
+
+    def hash(self) -> BlockColumn:
+        from ray.air.util.tensor_extensions.pandas import TensorArrayElement
+
+        first_non_null = next((x for x in self._column if x is not None), None)
+        if isinstance(first_non_null, TensorArrayElement):
+            self._column = self._column.apply(lambda x: x.to_numpy())
+
+        import polars as pl
+
+        df = pl.from_pandas(self._column.to_frame())
+        hashes = df.hash_rows().cast(pl.Int64, wrap_numerical=True)
+        return hashes.to_pandas()
+
     def unique(self) -> BlockColumn:
         pd = lazy_import_pandas()
         try:
             return pd.Series(self._column.unique())
         except ValueError as e:
@@ -187,7 +212,18 @@ def unique(self) -> BlockColumn:
             raise
 
     def flatten(self) -> BlockColumn:
-        return self._column.list.flatten()
+        from ray.air.util.tensor_extensions.pandas import TensorArrayElement
+
+        first_non_null = next((x for x in self._column if x is not None), None)
+        if isinstance(first_non_null, TensorArrayElement):
+            self._column = self._column.apply(
+                lambda x: x.to_numpy() if isinstance(x, TensorArrayElement) else x
+            )
+
+        return self._column.explode(ignore_index=True)
+
+    def dropna(self) -> BlockColumn:
+        return self._column.dropna()
 
     def sum_of_squared_diffs_from_mean(
         self,
@@ -219,6 +255,14 @@ def _as_arrow_compatible(self) -> Union[List[Any], "pyarrow.Array"]:
     def _is_all_null(self):
         return not self._column.notna().any()
 
+    def is_composed_of_lists(self, types: Optional[Tuple] = None) -> bool:
+        from ray.air.util.tensor_extensions.pandas import TensorArrayElement
+
+        if not types:
+            types = (list, np.ndarray, TensorArrayElement)
+        first_non_null = next((x for x in self._column if x is not None), None)
+        return isinstance(first_non_null, types)
+
 
 class PandasBlockBuilder(TableBlockBuilder):
     def __init__(self):
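A self-contained sketch of the Pandas-side behavior shown above, assuming plain pandas objects (no Ray tensor-extension elements involved):

import pandas as pd

col = pd.Series(["a", "b", "a", None, "a"])

# Series.value_counts drops nulls by default, matching ignore_nulls semantics.
vc = col.value_counts()
print({"values": vc.index.tolist(), "counts": vc.values.tolist()})
# {'values': ['a', 'b'], 'counts': [3, 1]}

# flatten() now relies on Series.explode, which unnests list-like elements.
nested = pd.Series([[1, 2], [3], None])
print(nested.explode(ignore_index=True).tolist())  # [1, 2, 3, nan]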

python/ray/data/aggregate.py

Lines changed: 83 additions & 1 deletion

@@ -1,6 +1,6 @@
 import abc
 import math
-from typing import TYPE_CHECKING, Any, Callable, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
 
 import numpy as np
 import pyarrow.compute as pc
@@ -889,6 +889,88 @@ def _to_set(x):
         return {x}
 
 
+@PublicAPI
+class ValueCounter(AggregateFnV2):
+    """Counts the number of times each value appears in a column.
+
+    This aggregation computes value counts for a specified column, similar to pandas'
+    `value_counts()` method. It returns a dictionary with two lists: "values" containing
+    the unique values found in the column, and "counts" containing the corresponding
+    count for each value.
+
+    Example:
+
+        .. testcode::
+
+            import ray
+            from ray.data.aggregate import ValueCounter
+
+            # Create a dataset with repeated values
+            ds = ray.data.from_items([
+                {"category": "A"}, {"category": "B"}, {"category": "A"},
+                {"category": "C"}, {"category": "A"}, {"category": "B"}
+            ])
+
+            # Count occurrences of each category
+            result = ds.aggregate(ValueCounter(on="category"))
+            # result: {'value_counter(category)': {'values': ['A', 'B', 'C'], 'counts': [3, 2, 1]}}
+
+            # Using with groupby
+            ds = ray.data.from_items([
+                {"group": "X", "category": "A"}, {"group": "X", "category": "B"},
+                {"group": "Y", "category": "A"}, {"group": "Y", "category": "A"}
+            ])
+            result = ds.groupby("group").aggregate(ValueCounter(on="category")).take_all()
+            # result: [{'group': 'X', 'value_counter(category)': {'values': ['A', 'B'], 'counts': [1, 1]}},
+            #          {'group': 'Y', 'value_counter(category)': {'values': ['A'], 'counts': [2]}}]
+
+    Args:
+        on: The name of the column to count values in. Must be provided.
+        alias_name: Optional name for the resulting column. If not provided,
+            defaults to "value_counter({column_name})".
+    """
+
+    def __init__(
+        self,
+        on: str,
+        alias_name: Optional[str] = None,
+    ):
+        super().__init__(
+            alias_name if alias_name else f"value_counter({str(on)})",
+            on=on,
+            ignore_nulls=True,
+            zero_factory=lambda: {"values": [], "counts": []},
+        )
+
+    def aggregate_block(self, block: Block) -> Dict[str, List]:
+        col_accessor = BlockColumnAccessor.for_column(block[self._target_col_name])
+        return col_accessor.value_counts()
+
+    def combine(
+        self,
+        current_accumulator: Dict[str, List],
+        new_accumulator: Dict[str, List],
+    ) -> Dict[str, List]:
+        values = current_accumulator["values"]
+        counts = current_accumulator["counts"]
+
+        # Build a value → index map once (avoid repeated lookups)
+        value_to_index = {v: i for i, v in enumerate(values)}
+
+        for v_new, c_new in zip(new_accumulator["values"], new_accumulator["counts"]):
+            if v_new in value_to_index:
+                idx = value_to_index[v_new]
+                counts[idx] += c_new
+            else:
+                value_to_index[v_new] = len(values)
+                values.append(v_new)
+                counts.append(c_new)
+
+        return current_accumulator
+
+
 def _null_safe_zero_factory(zero_factory, ignore_nulls: bool):
     """NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
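The combine() step above is plain Python and easy to verify in isolation; here is the same merge logic extracted into a standalone function (merge_value_counts is a hypothetical name, not part of the Ray API):

def merge_value_counts(current, new):
    """Merge two {"values": [...], "counts": [...]} accumulators in place."""
    values, counts = current["values"], current["counts"]
    value_to_index = {v: i for i, v in enumerate(values)}
    for v_new, c_new in zip(new["values"], new["counts"]):
        if v_new in value_to_index:
            counts[value_to_index[v_new]] += c_new
        else:
            value_to_index[v_new] = len(values)
            values.append(v_new)
            counts.append(c_new)
    return current


a = {"values": ["A", "B"], "counts": [3, 2]}
b = {"values": ["B", "C"], "counts": [1, 4]}
assert merge_value_counts(a, b) == {"values": ["A", "B", "C"], "counts": [3, 3, 4]}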

python/ray/data/block.py

Lines changed: 33 additions & 0 deletions

@@ -685,11 +685,44 @@ def unique(self) -> BlockColumn:
         """Returns new column holding only distinct values of the current one"""
         raise NotImplementedError()
 
+    def value_counts(self) -> Dict[str, List]:
+        raise NotImplementedError()
+
+    def hash(self) -> BlockColumn:
+        """
+        Computes a 64-bit hash value for each row in the column.
+
+        Provides a unified hashing method across supported backends.
+        Handles complex types like lists or nested structures by producing a single hash per row.
+        These hashes are useful for downstream operations such as deduplication, grouping, or partitioning.
+
+        Internally, Polars is used to compute row-level hashes even when the original column
+        is backed by Pandas or PyArrow.
+
+        :return: A column of 64-bit integer hashes, returned in the same format as the
+            underlying backend (e.g., Pandas Series or PyArrow Array).
+        """
+        raise NotImplementedError()
+
     def flatten(self) -> BlockColumn:
         """Flattens nested lists merging them into top-level container"""
 
         raise NotImplementedError()
 
+    def dropna(self) -> BlockColumn:
+        raise NotImplementedError()
+
+    def is_composed_of_lists(self, types: Optional[Tuple] = None) -> bool:
+        """
+        Checks whether the column is composed of list-like elements.
+
+        :param types: Optional tuple of backend-specific types to check against.
+            If not provided, defaults to list-like types appropriate
+            for the underlying backend (e.g., PyArrow list types).
+        :return: True if the column is made up of list-like values; False otherwise.
+        """
+        raise NotImplementedError()
+
     def sum_of_squared_diffs_from_mean(
         self,
         *,
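Both backend implementations route hash() through the same Polars calls; a minimal sketch of that shared path, using a toy string column:

import polars as pl

# hash_rows() yields one UInt64 hash per row; casting to Int64 with
# wrap_numerical=True wraps out-of-range values instead of raising,
# matching the cast used by both accessors above.
df = pl.DataFrame({"col": ["a", "b", "a"]})
hashes = df.hash_rows().cast(pl.Int64, wrap_numerical=True)
print(hashes.to_list())  # rows 0 and 2 are equal, so their hashes match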

python/ray/data/preprocessor.py

Lines changed: 25 additions & 2 deletions

@@ -47,6 +47,12 @@ class Preprocessor(abc.ABC):
     implemented method.
     """
 
+    def __init__(self):
+        from ray.data.preprocessors.utils import StatComputationPlan
+
+        self.stat_computation_plan = StatComputationPlan()
+        self.stats_ = {}
+
     class FitStatus(str, Enum):
         """The fit status of preprocessor."""
 
@@ -72,7 +78,7 @@ def _check_has_fitted_state(self):
         used to transform data in newer versions.
         """
 
-        fitted_vars = [v for v in vars(self) if v.endswith("_")]
+        fitted_vars = [v for v in vars(self) if v.endswith("_") and getattr(self, v)]
         return bool(fitted_vars)
 
     def fit_status(self) -> "Preprocessor.FitStatus":
@@ -114,10 +120,15 @@ def fit(self, ds: "Dataset") -> "Preprocessor":
             "All previously fitted state will be overwritten!"
         )
 
-        fitted_ds = self._fit(ds)
+        self.stat_computation_plan.reset()
+        fitted_ds = self._fit(ds)._fit_execute(ds)
         self._fitted = True
         return fitted_ds
 
+    def _fit_execute(self, dataset: "Dataset"):
+        self.stats_ |= self.stat_computation_plan.compute(dataset)
+        return self
+
     def fit_transform(
         self,
         ds: "Dataset",
@@ -373,6 +384,18 @@ def preferred_batch_format(cls) -> BatchFormat:
         """
         return BatchFormat.PANDAS
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # Exclude unpicklable attributes
+        state.pop("stat_computation_plan", None)
+        return state
+
+    def __setstate__(self, state):
+        from ray.data.preprocessors.utils import StatComputationPlan
+
+        self.__dict__.update(state)
+        self.stat_computation_plan = StatComputationPlan()
+
     @DeveloperAPI
     def serialize(self) -> str:
         """Return this preprocessor serialized as a string.

python/ray/data/preprocessors/chain.py

Lines changed: 1 addition & 0 deletions

@@ -66,6 +66,7 @@ def fit_status(self):
         return Preprocessor.FitStatus.NOT_FITTABLE
 
     def __init__(self, *preprocessors: Preprocessor):
+        super().__init__()
         self.preprocessors = preprocessors
 
     def _fit(self, ds: "Dataset") -> Preprocessor:
