rapidsai · rapids-bot · Oct 16, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
     rev: 'v1.13.0'
     hooks:
       - id: mypy
-        additional_dependencies: [types-cachetools, pyarrow-stubs]
+        additional_dependencies: [types-cachetools, pyarrow-stubs, numpy]
         args: ["--config-file=pyproject.toml",
                "python/cudf/cudf",
                "python/pylibcudf/pylibcudf",

diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
@@ -121,7 +121,7 @@ def _read_tzfile_as_columns(
         from cudf.core.column.column import as_column
 
         # this happens for UTC-like zones
-        min_date = np.int64(np.iinfo("int64").min + 1).astype(
+        min_date: np.datetime64 = np.int64(np.iinfo("int64").min + 1).astype(
             np.dtype("M8[s]")
         )
         return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))  # type: ignore[return-value]

diff --git a/python/cudf/cudf/core/accessors/string.py b/python/cudf/cudf/core/accessors/string.py
@@ -4735,7 +4735,7 @@ def character_ngrams(
         return result
 
     def hash_character_ngrams(
-        self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0
+        self, n: int = 5, as_list: bool = False, seed: int | np.uint32 = 0
     ) -> Series | Index:
         """
         Generate hashes of n-grams from characters in a column of strings.
@@ -5326,7 +5326,7 @@ def minhash(
                 return self.minhash64(seed, a_column, b_column, width)
 
     def minhash64(
-        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
+        self, seed: int | np.uint64, a: ColumnLike, b: ColumnLike, width: int
     ) -> Series | Index:
         """
         Compute the minhash of a strings column.
@@ -5377,7 +5377,7 @@ def minhash64(
         )
 
     def minhash_ngrams(
-        self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike
+        self, ngrams: int, seed: int | np.uint32, a: ColumnLike, b: ColumnLike
     ) -> Series | Index:
         """
         Compute the minhash of a list column of strings.
@@ -5428,7 +5428,7 @@ def minhash_ngrams(
         )
 
     def minhash64_ngrams(
-        self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike
+        self, ngrams: int, seed: int | np.uint64, a: ColumnLike, b: ColumnLike
     ) -> Series | Index:
         """
         Compute the minhash of a list column of strings.

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1731,15 +1731,15 @@ def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase:
             if isinstance(dtype, CategoricalDtype):
                 result = self.as_categorical_column(dtype)
             elif is_dtype_obj_interval(dtype):
-                result = self.as_interval_column(dtype)
+                result = self.as_interval_column(dtype)  # type: ignore[arg-type]
             elif is_dtype_obj_list(dtype) or is_dtype_obj_struct(dtype):
                 if self.dtype != dtype:
                     raise NotImplementedError(
                         f"Casting {self.dtype} columns not currently supported"
                     )
                 result = self
             elif is_dtype_obj_decimal(dtype):
-                result = self.as_decimal_column(dtype)
+                result = self.as_decimal_column(dtype)  # type: ignore[arg-type]
             elif dtype.kind == "M":
                 result = self.as_datetime_column(dtype)
             elif dtype.kind == "m":
@@ -2227,8 +2227,10 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
                 plc.TypeId.DECIMAL32,
             }:
                 scale = -plc_scalar.type().scale()
+                # Narrow type for mypy - we know col_dtype is a decimal type from the check above
+                assert isinstance(col_dtype, DecimalDtype)
+                p = col_dtype.precision
                 # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
-                p = col_dtype.precision  # type: ignore[union-attr]
                 nrows = len(self)
                 if reduction_op in {"min", "max"}:
                     new_p = p
@@ -2242,7 +2244,7 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
                     raise NotImplementedError(
                         f"{reduction_op} not implemented for decimal types."
                     )
-                precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)  # type: ignore[union-attr]
+                precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)
                 new_dtype = type(col_dtype)(precision, scale)
                 result_col = result_col.astype(new_dtype)
             elif isinstance(col_dtype, IntervalDtype):

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
@@ -118,19 +118,31 @@ def _from_32_64_arrow(
         if isinstance(data, pa.ChunkedArray):
             data = data.combine_chunks()
         mask_buf, data_buf = data.buffers()
-        rmm_data_buffer = rmm.DeviceBuffer.to_device(
-            np.frombuffer(data_buf)
-            .view(view_type)[::step]
-            .copy()
-            .view("uint8")
-        )
-        plc_column = plc.Column.from_rmm_buffer(
-            rmm_data_buffer,
-            plc.DataType(plc_type, -data.type.scale),
-            len(data),
-            [],
-        )
-        if mask_buf is not None:
+        if data_buf is None:
+            # No data buffer: assume len(data) == 0 and build an empty column
+            plc_column = plc.Column(
+                data_type=plc.DataType(plc_type, -data.type.scale),
+                size=0,
+                data=None,
+                mask=None,
+                null_count=0,
+                offset=0,
+                children=[],
+            )
+        else:
+            rmm_data_buffer = rmm.DeviceBuffer.to_device(
+                np.frombuffer(data_buf)
+                .view(view_type)[::step]
+                .copy()
+                .view("uint8")
+            )
+            plc_column = plc.Column.from_rmm_buffer(
+                rmm_data_buffer,
+                plc.DataType(plc_type, -data.type.scale),
+                len(data),
+                [],
+            )
+        if mask_buf is not None and data_buf is not None:
             mask_size = plc.null_mask.bitmask_allocation_size_bytes(len(data))
             if mask_buf.size < mask_size:
                 rmm_mask_buffer = rmm.DeviceBuffer(size=mask_size)
@@ -391,7 +403,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
 
     def to_arrow(self) -> pa.Array:
         data_buf_32 = np.array(self.base_data.memoryview()).view("int32")  # type: ignore[union-attr]
-        data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
+        data_buf_128: np.ndarray = np.empty(
+            len(data_buf_32) * 4, dtype="int32"
+        )
 
         # use striding to set the first 32 bits of each 128-bit chunk:
         data_buf_128[::4] = data_buf_32
@@ -464,8 +478,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
         return result
 
     def to_arrow(self) -> pa.Array:
+        dtype: Decimal128Dtype
         if isinstance(self.dtype, pd.ArrowDtype):
-            dtype = pyarrow_dtype_to_cudf_dtype(self.dtype)
+            dtype = pyarrow_dtype_to_cudf_dtype(self.dtype)  # type: ignore[assignment]
         else:
             dtype = self.dtype
 
@@ -510,7 +525,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
 
     def to_arrow(self) -> pa.Array:
         data_buf_64 = np.array(self.base_data.memoryview()).view("int64")  # type: ignore[union-attr]
-        data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
+        data_buf_128: np.ndarray = np.empty(
+            len(data_buf_64) * 2, dtype="int64"
+        )
 
         # use striding to set the first 64 bits of each 128-bit chunk:
         data_buf_128[::2] = data_buf_64

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
@@ -526,10 +526,17 @@ def join_list_elements(
     def minhash_ngrams(
         self,
         width: int,
-        seed: np.uint32,
+        seed: int | np.uint32,
         a: NumericalColumn,
         b: NumericalColumn,
     ) -> Self:
+        # Convert int to np.uint32 with validation
+        if isinstance(seed, int):
+            if seed < 0 or seed > np.iinfo(np.uint32).max:
+                raise ValueError(
+                    f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
+                )
+            seed = np.uint32(seed)
         return type(self).from_pylibcudf(  # type: ignore[return-value]
             plc.nvtext.minhash.minhash_ngrams(
                 self.to_pylibcudf(mode="read"),
@@ -544,10 +551,17 @@ def minhash_ngrams(
     def minhash64_ngrams(
         self,
         width: int,
-        seed: np.uint64,
+        seed: int | np.uint64,
         a: NumericalColumn,
         b: NumericalColumn,
     ) -> Self:
+        # Convert int to np.uint64 with validation
+        if isinstance(seed, int):
+            if seed < 0 or seed > np.iinfo(np.uint64).max:
+                raise ValueError(
+                    f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
+                )
+            seed = np.uint64(seed)
         return type(self).from_pylibcudf(  # type: ignore[return-value]
             plc.nvtext.minhash.minhash64_ngrams(
                 self.to_pylibcudf(mode="read"),

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -821,7 +821,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
                 # Kinds are the same but to_dtype is smaller
                 if "float" in to_dtype_numpy.name:
                     finfo = np.finfo(to_dtype_numpy)
-                    lower_, upper_ = finfo.min, finfo.max
+                    lower_: int | float
+                    upper_: int | float
+                    lower_, upper_ = finfo.min, finfo.max  # type: ignore[assignment]
 
                     # Check specifically for np.pi values when casting to lower precision
                     if self_dtype_numpy.itemsize > to_dtype_numpy.itemsize:

diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
@@ -86,12 +86,12 @@ def kurtosis(self, skipna: bool | None = None) -> float:
         skipna = True if skipna is None else skipna
 
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
 
         self = self.nans_to_nulls().dropna()
 
         if len(self) < 4:
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
 
         n = len(self)
         miu = self.mean()
@@ -178,7 +178,7 @@ def quantile(
                 except (TypeError, ValueError):
                     pass
             return (
-                _get_nan_for_dtype(self.dtype)
+                _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
                 if scalar_result is NA
                 else scalar_result
             )
@@ -221,7 +221,7 @@ def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
         skipna = True if skipna is None else skipna
 
         if self._can_return_nan(skipna=skipna):
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
 
         # enforce linear in case the default ever changes
         result = self.quantile(
@@ -240,21 +240,21 @@ def cov(self, other: NumericalBaseColumn) -> float:
             or len(other) == 0
             or (len(self) == 1 and len(other) == 1)
         ):
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
 
         result = (self - self.mean()) * (other - other.mean())
         cov_sample = result.sum() / (len(self) - 1)
         return cov_sample
 
     def corr(self, other: NumericalBaseColumn) -> float:
         if len(self) == 0 or len(other) == 0:
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
 
         cov = self.cov(other)
         lhs_std, rhs_std = self.std(), other.std()
 
         if not cov or lhs_std == 0 or rhs_std == 0:
-            return _get_nan_for_dtype(self.dtype)
+            return _get_nan_for_dtype(self.dtype)  # type: ignore[return-value]
         return cov / lhs_std / rhs_std
 
     def round(

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -629,11 +629,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
     @acquire_spill_lock()
     def minhash(
         self,
-        seed: np.uint32,
+        seed: int | np.uint32,
         a: NumericalColumn,
         b: NumericalColumn,
         width: int,
     ) -> ListColumn:
+        # Convert int to np.uint32 with validation
+        if isinstance(seed, int):
+            if seed < 0 or seed > np.iinfo(np.uint32).max:
+                raise ValueError(
+                    f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
+                )
+            seed = np.uint32(seed)
         return type(self).from_pylibcudf(  # type: ignore[return-value]
             plc.nvtext.minhash.minhash(
                 self.to_pylibcudf(mode="read"),
@@ -647,11 +654,18 @@ def minhash(
     @acquire_spill_lock()
     def minhash64(
         self,
-        seed: np.uint64,
+        seed: int | np.uint64,
         a: NumericalColumn,
         b: NumericalColumn,
         width: int,
     ) -> ListColumn:
+        # Convert int to np.uint64 with validation
+        if isinstance(seed, int):
+            if seed < 0 or seed > np.iinfo(np.uint64).max:
+                raise ValueError(
+                    f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
+                )
+            seed = np.uint64(seed)
         return type(self).from_pylibcudf(  # type: ignore[return-value]
             plc.nvtext.minhash.minhash64(
                 self.to_pylibcudf(mode="read"),
@@ -689,8 +703,15 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn:
 
     @acquire_spill_lock()
     def hash_character_ngrams(
-        self, ngrams: int, seed: np.uint32
+        self, ngrams: int, seed: int | np.uint32
     ) -> ListColumn:
+        # Convert int to np.uint32 with validation
+        if isinstance(seed, int):
+            if seed < 0 or seed > np.iinfo(np.uint32).max:
+                raise ValueError(
+                    f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
+                )
+            seed = np.uint32(seed)
         result = plc.nvtext.generate_ngrams.hash_character_ngrams(
             self.to_pylibcudf(mode="read"), ngrams, seed
         )

diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
@@ -107,8 +107,8 @@ def base_size(self) -> int:
 
     def to_arrow(self) -> pa.Array:
         children = [child.to_arrow() for child in self.children]
-        dtype = (
-            pyarrow_dtype_to_cudf_dtype(self.dtype)
+        dtype: StructDtype = (
+            pyarrow_dtype_to_cudf_dtype(self.dtype)  # type: ignore[assignment]
             if isinstance(self.dtype, pd.ArrowDtype)
             else self.dtype
         )