diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 75c678df3ba..dde4f686887 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: rev: 'v1.13.0' hooks: - id: mypy - additional_dependencies: [types-cachetools, pyarrow-stubs] + additional_dependencies: [types-cachetools, pyarrow-stubs, numpy] args: ["--config-file=pyproject.toml", "python/cudf/cudf", "python/pylibcudf/pylibcudf", diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index ebe0cab0999..b078c3763af 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -121,7 +121,7 @@ def _read_tzfile_as_columns( from cudf.core.column.column import as_column # this happens for UTC-like zones - min_date = np.int64(np.iinfo("int64").min + 1).astype( + min_date: np.datetime64 = np.int64(np.iinfo("int64").min + 1).astype( np.dtype("M8[s]") ) return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/accessors/string.py b/python/cudf/cudf/core/accessors/string.py index ab12dfbf2cc..8ed11249b04 100644 --- a/python/cudf/cudf/core/accessors/string.py +++ b/python/cudf/cudf/core/accessors/string.py @@ -4735,7 +4735,7 @@ def character_ngrams( return result def hash_character_ngrams( - self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0 + self, n: int = 5, as_list: bool = False, seed: int | np.uint32 = 0 ) -> Series | Index: """ Generate hashes of n-grams from characters in a column of strings. @@ -5326,7 +5326,7 @@ def minhash( return self.minhash64(seed, a_column, b_column, width) def minhash64( - self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + self, seed: int | np.uint64, a: ColumnLike, b: ColumnLike, width: int ) -> Series | Index: """ Compute the minhash of a strings column. @@ -5377,7 +5377,7 @@ def minhash64( ) def minhash_ngrams( - self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike + self, ngrams: int, seed: int | np.uint32, a: ColumnLike, b: ColumnLike ) -> Series | Index: """ Compute the minhash of a list column of strings. @@ -5428,7 +5428,7 @@ def minhash_ngrams( ) def minhash64_ngrams( - self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike + self, ngrams: int, seed: int | np.uint64, a: ColumnLike, b: ColumnLike ) -> Series | Index: """ Compute the minhash of a list column of strings. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8f785ef617e..a7be09b023a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1731,7 +1731,7 @@ def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase: if isinstance(dtype, CategoricalDtype): result = self.as_categorical_column(dtype) elif is_dtype_obj_interval(dtype): - result = self.as_interval_column(dtype) + result = self.as_interval_column(dtype) # type: ignore[arg-type] elif is_dtype_obj_list(dtype) or is_dtype_obj_struct(dtype): if self.dtype != dtype: raise NotImplementedError( @@ -1739,7 +1739,7 @@ def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase: ) result = self elif is_dtype_obj_decimal(dtype): - result = self.as_decimal_column(dtype) + result = self.as_decimal_column(dtype) # type: ignore[arg-type] elif dtype.kind == "M": result = self.as_datetime_column(dtype) elif dtype.kind == "m": @@ -2227,8 +2227,10 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: plc.TypeId.DECIMAL32, }: scale = -plc_scalar.type().scale() + # Narrow type for mypy - we know col_dtype is a decimal type from the check above + assert isinstance(col_dtype, DecimalDtype) + p = col_dtype.precision # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - p = col_dtype.precision # type: ignore[union-attr] nrows = len(self) if reduction_op in {"min", "max"}: new_p = p @@ -2242,7 +2244,7 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: raise NotImplementedError( f"{reduction_op} not implemented for decimal types." ) - precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr] + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 861f1c9c1a2..bda29a54dd7 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -118,19 +118,31 @@ def _from_32_64_arrow( if isinstance(data, pa.ChunkedArray): data = data.combine_chunks() mask_buf, data_buf = data.buffers() - rmm_data_buffer = rmm.DeviceBuffer.to_device( - np.frombuffer(data_buf) - .view(view_type)[::step] - .copy() - .view("uint8") - ) - plc_column = plc.Column.from_rmm_buffer( - rmm_data_buffer, - plc.DataType(plc_type, -data.type.scale), - len(data), - [], - ) - if mask_buf is not None: + if data_buf is None: + # If data_buf is None, create an empty column + plc_column = plc.Column( + data_type=plc.DataType(plc_type, -data.type.scale), + size=0, + data=None, + mask=None, + null_count=0, + offset=0, + children=[], + ) + else: + rmm_data_buffer = rmm.DeviceBuffer.to_device( + np.frombuffer(data_buf) + .view(view_type)[::step] + .copy() + .view("uint8") + ) + plc_column = plc.Column.from_rmm_buffer( + rmm_data_buffer, + plc.DataType(plc_type, -data.type.scale), + len(data), + [], + ) + if mask_buf is not None and data_buf is not None: mask_size = plc.null_mask.bitmask_allocation_size_bytes(len(data)) if mask_buf.size < mask_size: rmm_mask_buffer = rmm.DeviceBuffer(size=mask_size) @@ -391,7 +403,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: def to_arrow(self) -> pa.Array: data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr] - data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") + data_buf_128: np.ndarray = np.empty( + len(data_buf_32) * 4, dtype="int32" + ) # use striding to set the first 32 bits of each 128-bit chunk: data_buf_128[::4] = data_buf_32 @@ -464,8 +478,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: return result def to_arrow(self) -> pa.Array: + dtype: Decimal128Dtype if isinstance(self.dtype, pd.ArrowDtype): - dtype = pyarrow_dtype_to_cudf_dtype(self.dtype) + dtype = pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment] else: dtype = self.dtype @@ -510,7 +525,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self: def to_arrow(self) -> pa.Array: data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr] - data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") + data_buf_128: np.ndarray = np.empty( + len(data_buf_64) * 2, dtype="int64" + ) # use striding to set the first 64 bits of each 128-bit chunk: data_buf_128[::2] = data_buf_64 diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 46b8a897471..5ed181612dd 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -526,10 +526,17 @@ def join_list_elements( def minhash_ngrams( self, width: int, - seed: np.uint32, + seed: int | np.uint32, a: NumericalColumn, b: NumericalColumn, ) -> Self: + # Convert int to np.uint32 with validation + if isinstance(seed, int): + if seed < 0 or seed > np.iinfo(np.uint32).max: + raise ValueError( + f"seed must be in range [0, {np.iinfo(np.uint32).max}]" + ) + seed = np.uint32(seed) return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.minhash.minhash_ngrams( self.to_pylibcudf(mode="read"), @@ -544,10 +551,17 @@ def minhash_ngrams( def minhash64_ngrams( self, width: int, - seed: np.uint64, + seed: int | np.uint64, a: NumericalColumn, b: NumericalColumn, ) -> Self: + # Convert int to np.uint64 with validation + if isinstance(seed, int): + if seed < 0 or seed > np.iinfo(np.uint64).max: + raise ValueError( + f"seed must be in range [0, {np.iinfo(np.uint64).max}]" + ) + seed = np.uint64(seed) return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.minhash.minhash64_ngrams( self.to_pylibcudf(mode="read"), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 97c1faa8070..fa38daf2171 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -821,7 +821,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # Kinds are the same but to_dtype is smaller if "float" in to_dtype_numpy.name: finfo = np.finfo(to_dtype_numpy) - lower_, upper_ = finfo.min, finfo.max + lower_: int | float + upper_: int | float + lower_, upper_ = finfo.min, finfo.max # type: ignore[assignment] # Check specifically for np.pi values when casting to lower precision if self_dtype_numpy.itemsize > to_dtype_numpy.itemsize: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 4e0a2975be7..cb2aae317f9 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -86,12 +86,12 @@ def kurtosis(self, skipna: bool | None = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] self = self.nans_to_nulls().dropna() if len(self) < 4: - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] n = len(self) miu = self.mean() @@ -178,7 +178,7 @@ def quantile( except (TypeError, ValueError): pass return ( - _get_nan_for_dtype(self.dtype) + _get_nan_for_dtype(self.dtype) # type: ignore[return-value] if scalar_result is NA else scalar_result ) @@ -221,7 +221,7 @@ def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] # enforce linear in case the default ever changes result = self.quantile( @@ -240,7 +240,7 @@ def cov(self, other: NumericalBaseColumn) -> float: or len(other) == 0 or (len(self) == 1 and len(other) == 1) ): - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] result = (self - self.mean()) * (other - other.mean()) cov_sample = result.sum() / (len(self) - 1) @@ -248,13 +248,13 @@ def cov(self, other: NumericalBaseColumn) -> float: def corr(self, other: NumericalBaseColumn) -> float: if len(self) == 0 or len(other) == 0: - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] cov = self.cov(other) lhs_std, rhs_std = self.std(), other.std() if not cov or lhs_std == 0 or rhs_std == 0: - return _get_nan_for_dtype(self.dtype) + return _get_nan_for_dtype(self.dtype) # type: ignore[return-value] return cov / lhs_std / rhs_std def round( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7c4b972086a..6d868872956 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -629,11 +629,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: @acquire_spill_lock() def minhash( self, - seed: np.uint32, + seed: int | np.uint32, a: NumericalColumn, b: NumericalColumn, width: int, ) -> ListColumn: + # Convert int to np.uint32 with validation + if isinstance(seed, int): + if seed < 0 or seed > np.iinfo(np.uint32).max: + raise ValueError( + f"seed must be in range [0, {np.iinfo(np.uint32).max}]" + ) + seed = np.uint32(seed) return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.minhash.minhash( self.to_pylibcudf(mode="read"), @@ -647,11 +654,18 @@ def minhash( @acquire_spill_lock() def minhash64( self, - seed: np.uint64, + seed: int | np.uint64, a: NumericalColumn, b: NumericalColumn, width: int, ) -> ListColumn: + # Convert int to np.uint64 with validation + if isinstance(seed, int): + if seed < 0 or seed > np.iinfo(np.uint64).max: + raise ValueError( + f"seed must be in range [0, {np.iinfo(np.uint64).max}]" + ) + seed = np.uint64(seed) return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.minhash.minhash64( self.to_pylibcudf(mode="read"), @@ -689,8 +703,15 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn: @acquire_spill_lock() def hash_character_ngrams( - self, ngrams: int, seed: np.uint32 + self, ngrams: int, seed: int | np.uint32 ) -> ListColumn: + # Convert int to np.uint32 with validation + if isinstance(seed, int): + if seed < 0 or seed > np.iinfo(np.uint32).max: + raise ValueError( + f"seed must be in range [0, {np.iinfo(np.uint32).max}]" + ) + seed = np.uint32(seed) result = plc.nvtext.generate_ngrams.hash_character_ngrams( self.to_pylibcudf(mode="read"), ngrams, seed ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 3c912ae4575..5f3d5923fb4 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -107,8 +107,8 @@ def base_size(self) -> int: def to_arrow(self) -> pa.Array: children = [child.to_arrow() for child in self.children] - dtype = ( - pyarrow_dtype_to_cudf_dtype(self.dtype) + dtype: StructDtype = ( + pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment] if isinstance(self.dtype, pd.ArrowDtype) else self.dtype ) diff --git a/python/cudf/cudf/core/column/temporal_base.py b/python/cudf/cudf/core/column/temporal_base.py index 3cd9e67a8af..4dfc7e9ff8d 100644 --- a/python/cudf/cudf/core/column/temporal_base.py +++ b/python/cudf/cudf/core/column/temporal_base.py @@ -5,7 +5,7 @@ import datetime import functools import warnings -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, ClassVar import cupy as cp import numpy as np @@ -44,8 +44,8 @@ class TemporalBaseColumn(ColumnBase): """ _PANDAS_NA_VALUE = pd.NaT - _UNDERLYING_DTYPE = np.dtype(np.int64) - _NP_SCALAR: np.datetime64 | np.timedelta64 + _UNDERLYING_DTYPE: np.dtype[np.int64] = np.dtype(np.int64) + _NP_SCALAR: ClassVar[type[np.datetime64] | type[np.timedelta64]] _PD_SCALAR: pd.Timestamp | pd.Timedelta def __init__( @@ -95,7 +95,9 @@ def _validate_fillna_value( ): fill_value = fill_value.astype(self.dtype) elif isinstance(fill_value, str) and fill_value.lower() == "nat": - fill_value = self._NP_SCALAR(fill_value, self.time_unit) + # call-overload must be ignored because numpy stubs only accept literal + # time unit strings, but we're passing self.time_unit which is valid at runtime + fill_value = self._NP_SCALAR(fill_value, self.time_unit) # type: ignore[call-overload] return super()._validate_fillna_value(fill_value) def _cast_setitem_value(self, value: Any) -> plc.Scalar | ColumnBase: @@ -162,7 +164,11 @@ def _normalize_binop_operand(self, other: Any) -> pa.Scalar | ColumnBase: if np.isnat(other): # Workaround for https://github.com/numpy/numpy/issues/28496 # Once fixed, can always use the astype below - other = type(other)("NaT", to_unit) + # call-overload must be ignored because numpy stubs only accept literal strings + # for time units (e.g., "ns", "us") to allow compile-time validation, + # but we're passing a variable string (to_unit) with a time unit that + # we know is valid at runtime + other = type(other)("NaT", to_unit) # type: ignore[call-overload] else: other = other.astype( np.dtype(f"{other.dtype.kind}8[{to_unit}]") @@ -310,16 +316,21 @@ def find_and_replace( def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if to_dtype.kind == self.dtype.kind: # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) + # call-overload must be ignored because numpy stubs only accept literal strings + # for time units (e.g., "ns", "us") to allow compile-time validation, + # but we're passing variables (self.time_unit) with time units that + # we know are valid at runtime max_dist = np.timedelta64( self.max().astype(self._UNDERLYING_DTYPE, copy=False), - self.time_unit, + self.time_unit, # type: ignore[call-overload] ) min_dist = np.timedelta64( self.min().astype(self._UNDERLYING_DTYPE, copy=False), - self.time_unit, + self.time_unit, # type: ignore[call-overload] ) max_to_res = np.timedelta64( - np.iinfo(self._UNDERLYING_DTYPE).max, to_res + np.iinfo(self._UNDERLYING_DTYPE).max, + to_res, # type: ignore[call-overload] ).astype(f"m8[{self.time_unit}]", copy=False) return bool(max_dist <= max_to_res and min_dist <= max_to_res) elif ( diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 5cf846351de..6371e72c672 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -129,7 +129,9 @@ def _clear_cache(self) -> None: def __contains__(self, item: DatetimeLikeScalar) -> bool: try: - item = self._NP_SCALAR(item, self.time_unit) + # call-overload must be ignored because numpy stubs only accept literal + # time unit strings, but we're passing self.time_unit which is valid at runtime + item = self._NP_SCALAR(item, self.time_unit) # type: ignore[call-overload] except ValueError: # If item cannot be converted to duration type # np.timedelta64 raises ValueError, hence `item` diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 102e92c2a49..3215d95f427 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -691,7 +691,7 @@ def serialize(self) -> tuple[dict, list]: frames: list[Buffer] = [] - fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {} + fields: dict[str, str | tuple[Any, tuple[int, int]]] = {} for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f1fb883b5ae..779f4109dba 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -556,7 +556,7 @@ def _to_array( # identical except for the attribute they access to generate values. def to_array( - col: ColumnBase, to_dtype: np.dtype + col: ColumnBase, to_dtype: Dtype | None ) -> cupy.ndarray | np.ndarray: if ( col.has_nulls() diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 52966af07dc..ce878857b35 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1226,12 +1226,14 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) + to_take_indices = np.arange( + size_per_group.sum(), dtype=SIZE_TYPE_DTYPE + ) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) - to_take += np.repeat(group_offsets - fixup, size_per_group) - to_take = as_column(to_take) + to_take_indices += np.repeat(group_offsets - fixup, size_per_group) + to_take = as_column(to_take_indices) result = group_values.iloc[to_take] if preserve_order: # Can't use _mimic_pandas_order because we need to @@ -1555,7 +1557,7 @@ def sample( # interface doesn't take array-based low and high # arguments. low = 0 - high = np.repeat(size_per_group, samples_per_group) + high: np.ndarray = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 893595e122f..bd8a7b69050 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2879,7 +2879,7 @@ def dtype(self) -> np.dtype: By default the dtype is 64 bit signed integer. This is configurable via `default_integer_bitwidth` as 32 bit in `cudf.options` """ - dtype = np.dtype(np.int64) + dtype: np.dtype = np.dtype(np.int64) return _maybe_convert_to_default_type(dtype) @property @@ -3365,7 +3365,7 @@ def _indices_of(self, value) -> NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=SIZE_TYPE_DTYPE) + return as_column(i, dtype=SIZE_TYPE_DTYPE) # type: ignore[return-value] def isin(self, values, level=None) -> cupy.ndarray: if level is not None and level > 0: @@ -5172,7 +5172,7 @@ def __init__( if len(data) == 0: if not hasattr(data, "dtype"): - child_type = np.dtype(np.int64) + child_type: Dtype = np.dtype(np.int64) elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)): child_type = data.dtype.subtype else: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2616d5cd7a6..d0a27cc99d1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2016,7 +2016,7 @@ def astype( ) raise TypeError(msg) if is_dict_like(dtype): - if len(dtype) > 1 or self.name not in dtype: + if len(dtype) > 1 or self.name not in dtype: # type: ignore[arg-type,operator] raise KeyError( "Only the Series name can be used for the key in Series " "dtype mappings." diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index cce3d5f71b6..21002e02ec7 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -317,7 +317,7 @@ def to_datetime( elif errors == "ignore": pass elif errors == "coerce": - return np.datetime64("nat", "ns" if unit is None else unit) + return np.datetime64("nat", "ns" if unit is None else unit) # type: ignore[call-overload] return arg @@ -849,7 +849,7 @@ def date_range( FutureWarning, ) - dtype = np.dtype("datetime64[ns]") + dtype: np.dtype = np.dtype("datetime64[ns]") unit, _ = np.datetime_data(dtype) if freq is None: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 795228e3311..83e59a185f6 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -172,12 +172,12 @@ def to_numeric( np.dtype(np.float64).char, ] elif downcast in ("integer", "signed"): - type_set = list(np.typecodes["Integer"]) + type_set = list(np.typecodes["Integer"]) # type: ignore[arg-type] elif downcast == "unsigned": - type_set = list(np.typecodes["UnsignedInteger"]) + type_set = list(np.typecodes["UnsignedInteger"]) # type: ignore[arg-type] for t in type_set: - downcast_dtype = np.dtype(t) + downcast_dtype: np.dtype = np.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): col = col.cast(downcast_dtype) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index e065c0ad75d..6817412ee6e 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -19,7 +19,7 @@ # String object definitions class UDFString(types.Type): - np_dtype = np.dtype("object") + np_dtype: np.dtype[np.object_] = np.dtype("object") def __init__(self): super().__init__(name="udf_string") @@ -30,7 +30,7 @@ def return_as(self): class ManagedUDFString(types.Type): - np_dtype = np.dtype("object") + np_dtype: np.dtype[np.object_] = np.dtype("object") def __init__(self): super().__init__(name="managed_udf_string") @@ -41,7 +41,7 @@ def return_as(self): class StringView(types.Type): - np_dtype = np.dtype("object") + np_dtype: np.dtype[np.object_] = np.dtype("object") def __init__(self): super().__init__(name="string_view") diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a3dffa488e8..ce68c5770a1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -35,7 +35,7 @@ from cudf._typing import DtypeObj -_CSV_HEX_TYPE_MAP = { +_CSV_HEX_TYPE_MAP: dict[str, np.dtype] = { "hex": np.dtype("int64"), "hex64": np.dtype("int64"), "hex32": np.dtype("int32"), diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index 4e8391083a2..7fc6231f794 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -191,7 +191,9 @@ def ndarray__reduce__(self): # NumPy 2 introduced `_core` and gives warnings for access to `core`. from numpy._core.multiarray import flagsobj as _numpy_flagsobj else: - from numpy.core.multiarray import flagsobj as _numpy_flagsobj + from numpy.core.multiarray import ( # type: ignore[no-redef] + flagsobj as _numpy_flagsobj, + ) # Mapping flags between slow and fast types _ndarray_flags = make_intermediate_proxy_type( diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index dc9b7e5f49e..62fafeca243 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1163,7 +1163,7 @@ def _transform_arg( for k, a in arg.items() } elif isinstance(arg, np.ndarray) and arg.dtype == "O": - transformed = [ + transformed: list[Any] = [ # type: ignore[var-annotated] _transform_arg(a, attribute_name, seen) for a in arg.flat ] # Keep the same memory layout as arg (the default is C_CONTIGUOUS) @@ -1171,7 +1171,9 @@ def _transform_arg( order = "F" else: order = "C" - result = np.empty(int(np.prod(arg.shape)), dtype=object, order=order) + result = np.empty( # type: ignore[call-overload] + int(np.prod(arg.shape)), dtype=np.object_, order=order + ) result[...] = transformed return result.reshape(arg.shape) elif isinstance(arg, Iterator) and attribute_name == "_fsproxy_fast": @@ -1386,7 +1388,7 @@ def is_proxy_instance(obj, type): } -NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) +NUMPY_TYPES: set[type[np.generic]] = set(np.sctypeDict.values()) # type: ignore[arg-type] _SPECIAL_METHODS: set[str] = { diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e957da30185..d2e7f4870f1 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -2,7 +2,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, TypeGuard import numpy as np import pandas as pd @@ -20,7 +20,9 @@ from cudf._typing import DtypeObj from cudf.core.dtypes import DecimalDtype -np_dtypes_to_pandas_dtypes = { +np_dtypes_to_pandas_dtypes: dict[ + np.dtype[Any], pd.core.dtypes.base.ExtensionDtype +] = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), @@ -202,7 +204,7 @@ def min_signed_type(x: int, min_size: int = 8) -> np.dtype: that can represent the integer ``x`` """ for int_dtype in (np.int8, np.int16, np.int32, np.int64): - dtype = np.dtype(int_dtype) + dtype: np.dtype[Any] = np.dtype(int_dtype) if (dtype.itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return dtype @@ -216,7 +218,7 @@ def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: that can represent the integer ``x`` """ for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): - dtype = np.dtype(int_dtype) + dtype: np.dtype[Any] = np.dtype(int_dtype) if (dtype.itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return dtype @@ -244,7 +246,12 @@ def is_mixed_with_object_dtype(lhs, rhs): ) -def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_nan_for_dtype(dtype: DtypeObj) -> np.generic: + """Return the appropriate NaN/NaT value for the given dtype. + + Returns a numpy scalar (np.generic subclass) representing the + null value for the dtype (e.g., np.float64('nan'), np.datetime64('NaT')). + """ if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) @@ -289,10 +296,10 @@ def find_common_type(dtypes: Iterable[DtypeObj]) -> DtypeObj | None: ) for dtype in dtypes ): - if len({dtype._categories.dtype for dtype in dtypes}) == 1: + if len({dtype._categories.dtype for dtype in dtypes}) == 1: # type: ignore[union-attr] return cudf.CategoricalDtype( cudf.core.column.concat_columns( - [dtype._categories for dtype in dtypes] + [dtype._categories for dtype in dtypes] # type: ignore[union-attr] ).unique() ) else: @@ -449,7 +456,9 @@ def is_pandas_nullable_numpy_dtype(dtype_to_check) -> bool: ) -def is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: +def is_pandas_nullable_extension_dtype( + dtype_to_check: Any, +) -> TypeGuard[pd.core.dtypes.base.ExtensionDtype]: if is_pandas_nullable_numpy_dtype(dtype_to_check) or isinstance( dtype_to_check, pd.ArrowDtype ): @@ -760,7 +769,7 @@ def is_dtype_obj_decimal128(obj): ) -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = { np.dtype("int8"): plc.types.TypeId.INT8, np.dtype("int16"): plc.types.TypeId.INT16, np.dtype("int32"): plc.types.TypeId.INT32, diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 89e72e07fde..fdaaac78e95 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -23,7 +23,7 @@ ENVREF_PREFIX = "__CUDF_ENVREF__" -SUPPORTED_QUERY_TYPES = { +SUPPORTED_QUERY_TYPES: set[np.dtype] = { np.dtype(dt) for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES } diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi index 1ad210cb74a..2086173a159 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -1,5 +1,9 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. +from typing import Any + +import numpy as np + from rmm.pylibrmm.memory_resource import DeviceMemoryResource from rmm.pylibrmm.stream import Stream @@ -22,7 +26,7 @@ def generate_character_ngrams( def hash_character_ngrams( input: Column, ngrams: int, - seed: int, + seed: int | np.unsignedinteger[Any], stream: Stream | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index 9f5dd83d865..9e51d6ec3e7 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,5 +1,9 @@ # Copyright (c) 2024-2025, NVIDIA CORPORATION. +from typing import Any + +import numpy as np + from rmm.pylibrmm.memory_resource import DeviceMemoryResource from rmm.pylibrmm.stream import Stream @@ -7,7 +11,7 @@ from pylibcudf.column import Column def minhash( input: Column, - seed: int, + seed: int | np.unsignedinteger[Any], a: Column, b: Column, width: int, @@ -16,7 +20,7 @@ def minhash( ) -> Column: ... def minhash64( input: Column, - seed: int, + seed: int | np.unsignedinteger[Any], a: Column, b: Column, width: int, @@ -26,7 +30,7 @@ def minhash64( def minhash_ngrams( input: Column, ngrams: int, - seed: int, + seed: int | np.unsignedinteger[Any], a: Column, b: Column, stream: Stream | None = None, @@ -35,7 +39,7 @@ def minhash_ngrams( def minhash64_ngrams( input: Column, ngrams: int, - seed: int, + seed: int | np.unsignedinteger[Any], a: Column, b: Column, stream: Stream | None = None, diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi index baea4da09d5..538dffa5024 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyi +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from collections.abc import Sequence +from collections.abc import Iterable from rmm.pylibrmm.memory_resource import DeviceMemoryResource from rmm.pylibrmm.stream import Stream @@ -11,7 +11,7 @@ from pylibcudf.types import Interpolation, NullOrder, Order, Sorted def quantile( input: Column, - q: Sequence[float], + q: Iterable[float], interp: Interpolation = Interpolation.LINEAR, ordered_indices: Column | None = None, exact: bool = True, @@ -20,7 +20,7 @@ def quantile( ) -> Column: ... def quantiles( input: Table, - q: Sequence[float], + q: Iterable[float], interp: Interpolation = Interpolation.NEAREST, is_input_sorted: Sorted = Sorted.NO, column_order: list[Order] | None = None,