Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e2a5bc9
Add numpy to mypy additional_dependencies in pre-commit config
vyasr Oct 13, 2025
833a866
Add type annotation for UDFString.np_dtype
vyasr Oct 13, 2025
c0ed796
Add type annotation for ManagedUDFString.np_dtype
vyasr Oct 13, 2025
f78b346
Add type annotation for StringView.np_dtype
vyasr Oct 13, 2025
0588aa2
Add type annotation for min_date in timezones.py
vyasr Oct 13, 2025
8c17dd0
Add type annotation for np_dtypes_to_pandas_dtypes
vyasr Oct 13, 2025
84bf101
Add type annotation for dtype in min_signed_type
vyasr Oct 13, 2025
51fcfb4
Add type annotation for dtype in min_unsigned_type
vyasr Oct 13, 2025
fd10a97
Add type annotation for SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
vyasr Oct 13, 2025
d23e8a1
Add type annotation for _UNDERLYING_DTYPE
vyasr Oct 13, 2025
648b42d
Fix Category 3 mypy errors: union type attribute access
vyasr Oct 13, 2025
9dd023e
Fix mypy error in temporal_base.py:169 - type(other)("NaT", to_unit)
vyasr Oct 13, 2025
781db10
Fix mypy error in temporal_base.py:323 - np.timedelta64 max_dist
vyasr Oct 13, 2025
beaa48d
Fix mypy error in temporal_base.py:327 - np.timedelta64 min_dist
vyasr Oct 13, 2025
a9f1337
Fix mypy error in temporal_base.py:330 - np.timedelta64 max_to_res
vyasr Oct 13, 2025
db7707f
Fix mypy error in column.py:2251 - type(col_dtype)(precision, scale)
vyasr Oct 13, 2025
5922b43
Fix Group 1: Invalid type: ignore syntax in temporal_base.py
vyasr Oct 15, 2025
c57c9bf
Fix Groups 2 & 3: _NP_SCALAR ClassVar and call-overload issues
vyasr Oct 15, 2025
98a8345
Fix Group 4: Column dtype narrowing with type: ignore
vyasr Oct 15, 2025
832033a
Fix Group 6: _get_nan_for_dtype return type and add type ignores
vyasr Oct 15, 2025
a31950e
Fix Group 7: Add missing type annotations (12 errors)
vyasr Oct 15, 2025
e84fbf1
Fix additional type errors in fast_slow_proxy.py
vyasr Oct 15, 2025
3ef77c5
Fix minhash seed parameter type mismatches (9 errors)
vyasr Oct 15, 2025
f91a9b1
Fix Group 8 mypy errors (19 errors across 12 files)
vyasr Oct 15, 2025
972081d
Fix bug
vyasr Oct 16, 2025
c3bbbe9
PR review
vyasr Oct 16, 2025
568bf8f
Merge remote-tracking branch 'upstream/branch-25.12' into fix/typing_…
vyasr Oct 16, 2025
437da97
Remove one more unused ignore
vyasr Oct 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ repos:
rev: 'v1.13.0'
hooks:
- id: mypy
additional_dependencies: [types-cachetools, pyarrow-stubs]
additional_dependencies: [types-cachetools, pyarrow-stubs, numpy]
args: ["--config-file=pyproject.toml",
"python/cudf/cudf",
"python/pylibcudf/pylibcudf",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_internals/timezones.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def _read_tzfile_as_columns(
from cudf.core.column.column import as_column

# this happens for UTC-like zones
min_date = np.int64(np.iinfo("int64").min + 1).astype(
min_date: np.datetime64 = np.int64(np.iinfo("int64").min + 1).astype(
np.dtype("M8[s]")
)
return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value]
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/accessors/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4735,7 +4735,7 @@ def character_ngrams(
return result

def hash_character_ngrams(
self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0
self, n: int = 5, as_list: bool = False, seed: int | np.uint32 = 0
) -> Series | Index:
"""
Generate hashes of n-grams from characters in a column of strings.
Expand Down Expand Up @@ -5326,7 +5326,7 @@ def minhash(
return self.minhash64(seed, a_column, b_column, width)

def minhash64(
self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
self, seed: int | np.uint64, a: ColumnLike, b: ColumnLike, width: int
) -> Series | Index:
"""
Compute the minhash of a strings column.
Expand Down Expand Up @@ -5377,7 +5377,7 @@ def minhash64(
)

def minhash_ngrams(
self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike
self, ngrams: int, seed: int | np.uint32, a: ColumnLike, b: ColumnLike
) -> Series | Index:
"""
Compute the minhash of a list column of strings.
Expand Down Expand Up @@ -5428,7 +5428,7 @@ def minhash_ngrams(
)

def minhash64_ngrams(
self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike
self, ngrams: int, seed: int | np.uint64, a: ColumnLike, b: ColumnLike
) -> Series | Index:
"""
Compute the minhash of a list column of strings.
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1731,15 +1731,15 @@ def astype(self, dtype: DtypeObj, copy: bool | None = False) -> ColumnBase:
if isinstance(dtype, CategoricalDtype):
result = self.as_categorical_column(dtype)
elif is_dtype_obj_interval(dtype):
result = self.as_interval_column(dtype)
result = self.as_interval_column(dtype) # type: ignore[arg-type]
elif is_dtype_obj_list(dtype) or is_dtype_obj_struct(dtype):
if self.dtype != dtype:
raise NotImplementedError(
f"Casting {self.dtype} columns not currently supported"
)
result = self
elif is_dtype_obj_decimal(dtype):
result = self.as_decimal_column(dtype)
result = self.as_decimal_column(dtype) # type: ignore[arg-type]
elif dtype.kind == "M":
result = self.as_datetime_column(dtype)
elif dtype.kind == "m":
Expand Down Expand Up @@ -2227,8 +2227,10 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
plc.TypeId.DECIMAL32,
}:
scale = -plc_scalar.type().scale()
# Narrow type for mypy - we know col_dtype is a decimal type from the check above
assert isinstance(col_dtype, DecimalDtype)
Comment thread
vyasr marked this conversation as resolved.
p = col_dtype.precision
# https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
p = col_dtype.precision # type: ignore[union-attr]
nrows = len(self)
if reduction_op in {"min", "max"}:
new_p = p
Expand All @@ -2242,7 +2244,7 @@ def reduce(self, reduction_op: str, **kwargs) -> ScalarLike:
raise NotImplementedError(
f"{reduction_op} not implemented for decimal types."
)
precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr]
precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)
new_dtype = type(col_dtype)(precision, scale)
result_col = result_col.astype(new_dtype)
elif isinstance(col_dtype, IntervalDtype):
Expand Down
49 changes: 33 additions & 16 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,19 +118,31 @@ def _from_32_64_arrow(
if isinstance(data, pa.ChunkedArray):
data = data.combine_chunks()
mask_buf, data_buf = data.buffers()
rmm_data_buffer = rmm.DeviceBuffer.to_device(
np.frombuffer(data_buf)
.view(view_type)[::step]
.copy()
.view("uint8")
)
plc_column = plc.Column.from_rmm_buffer(
rmm_data_buffer,
plc.DataType(plc_type, -data.type.scale),
len(data),
[],
)
if mask_buf is not None:
if data_buf is None:
# If data_buf is None, create an empty column
plc_column = plc.Column(
data_type=plc.DataType(plc_type, -data.type.scale),
size=0,
data=None,
mask=None,
null_count=0,
offset=0,
children=[],
)
else:
rmm_data_buffer = rmm.DeviceBuffer.to_device(
np.frombuffer(data_buf)
.view(view_type)[::step]
.copy()
.view("uint8")
)
plc_column = plc.Column.from_rmm_buffer(
rmm_data_buffer,
plc.DataType(plc_type, -data.type.scale),
len(data),
[],
)
if mask_buf is not None and data_buf is not None:
mask_size = plc.null_mask.bitmask_allocation_size_bytes(len(data))
if mask_buf.size < mask_size:
rmm_mask_buffer = rmm.DeviceBuffer(size=mask_size)
Expand Down Expand Up @@ -391,7 +403,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:

def to_arrow(self) -> pa.Array:
data_buf_32 = np.array(self.base_data.memoryview()).view("int32") # type: ignore[union-attr]
data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
data_buf_128: np.ndarray = np.empty(
len(data_buf_32) * 4, dtype="int32"
)

# use striding to set the first 32 bits of each 128-bit chunk:
data_buf_128[::4] = data_buf_32
Expand Down Expand Up @@ -464,8 +478,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:
return result

def to_arrow(self) -> pa.Array:
dtype: Decimal128Dtype
if isinstance(self.dtype, pd.ArrowDtype):
dtype = pyarrow_dtype_to_cudf_dtype(self.dtype)
dtype = pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment]
else:
dtype = self.dtype

Expand Down Expand Up @@ -510,7 +525,9 @@ def from_arrow(cls, data: pa.Array | pa.ChunkedArray) -> Self:

def to_arrow(self) -> pa.Array:
data_buf_64 = np.array(self.base_data.memoryview()).view("int64") # type: ignore[union-attr]
data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
data_buf_128: np.ndarray = np.empty(
len(data_buf_64) * 2, dtype="int64"
)

# use striding to set the first 64 bits of each 128-bit chunk:
data_buf_128[::2] = data_buf_64
Expand Down
18 changes: 16 additions & 2 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,10 +526,17 @@ def join_list_elements(
def minhash_ngrams(
self,
width: int,
seed: np.uint32,
seed: int | np.uint32,
a: NumericalColumn,
b: NumericalColumn,
) -> Self:
# Convert int to np.uint32 with validation
if isinstance(seed, int):
if seed < 0 or seed > np.iinfo(np.uint32).max:
raise ValueError(
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
)
seed = np.uint32(seed)
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.nvtext.minhash.minhash_ngrams(
self.to_pylibcudf(mode="read"),
Expand All @@ -544,10 +551,17 @@ def minhash_ngrams(
def minhash64_ngrams(
self,
width: int,
seed: np.uint64,
seed: int | np.uint64,
a: NumericalColumn,
b: NumericalColumn,
) -> Self:
# Convert int to np.uint64 with validation
if isinstance(seed, int):
if seed < 0 or seed > np.iinfo(np.uint64).max:
raise ValueError(
f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
)
seed = np.uint64(seed)
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.nvtext.minhash.minhash64_ngrams(
self.to_pylibcudf(mode="read"),
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
# Kinds are the same but to_dtype is smaller
if "float" in to_dtype_numpy.name:
finfo = np.finfo(to_dtype_numpy)
lower_, upper_ = finfo.min, finfo.max
lower_: int | float
upper_: int | float
lower_, upper_ = finfo.min, finfo.max # type: ignore[assignment]

# Check specifically for np.pi values when casting to lower precision
if self_dtype_numpy.itemsize > to_dtype_numpy.itemsize:
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ def kurtosis(self, skipna: bool | None = None) -> float:
skipna = True if skipna is None else skipna

if len(self) == 0 or self._can_return_nan(skipna=skipna):
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]

self = self.nans_to_nulls().dropna()

if len(self) < 4:
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]

n = len(self)
miu = self.mean()
Expand Down Expand Up @@ -178,7 +178,7 @@ def quantile(
except (TypeError, ValueError):
pass
return (
_get_nan_for_dtype(self.dtype)
_get_nan_for_dtype(self.dtype) # type: ignore[return-value]
if scalar_result is NA
else scalar_result
)
Expand Down Expand Up @@ -221,7 +221,7 @@ def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
skipna = True if skipna is None else skipna

if self._can_return_nan(skipna=skipna):
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]

# enforce linear in case the default ever changes
result = self.quantile(
Expand All @@ -240,21 +240,21 @@ def cov(self, other: NumericalBaseColumn) -> float:
or len(other) == 0
or (len(self) == 1 and len(other) == 1)
):
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]

result = (self - self.mean()) * (other - other.mean())
cov_sample = result.sum() / (len(self) - 1)
return cov_sample

def corr(self, other: NumericalBaseColumn) -> float:
if len(self) == 0 or len(other) == 0:
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]

cov = self.cov(other)
lhs_std, rhs_std = self.std(), other.std()

if not cov or lhs_std == 0 or rhs_std == 0:
return _get_nan_for_dtype(self.dtype)
return _get_nan_for_dtype(self.dtype) # type: ignore[return-value]
return cov / lhs_std / rhs_std

def round(
Expand Down
27 changes: 24 additions & 3 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,11 +629,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
@acquire_spill_lock()
def minhash(
self,
seed: np.uint32,
seed: int | np.uint32,
a: NumericalColumn,
b: NumericalColumn,
width: int,
) -> ListColumn:
# Convert int to np.uint32 with validation
if isinstance(seed, int):
if seed < 0 or seed > np.iinfo(np.uint32).max:
raise ValueError(
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
)
seed = np.uint32(seed)
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.nvtext.minhash.minhash(
self.to_pylibcudf(mode="read"),
Expand All @@ -647,11 +654,18 @@ def minhash(
@acquire_spill_lock()
def minhash64(
self,
seed: np.uint64,
seed: int | np.uint64,
a: NumericalColumn,
b: NumericalColumn,
width: int,
) -> ListColumn:
# Convert int to np.uint64 with validation
if isinstance(seed, int):
if seed < 0 or seed > np.iinfo(np.uint64).max:
raise ValueError(
f"seed must be in range [0, {np.iinfo(np.uint64).max}]"
)
seed = np.uint64(seed)
return type(self).from_pylibcudf( # type: ignore[return-value]
plc.nvtext.minhash.minhash64(
self.to_pylibcudf(mode="read"),
Expand Down Expand Up @@ -689,8 +703,15 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn:

@acquire_spill_lock()
def hash_character_ngrams(
self, ngrams: int, seed: np.uint32
self, ngrams: int, seed: int | np.uint32
) -> ListColumn:
# Convert int to np.uint32 with validation
if isinstance(seed, int):
if seed < 0 or seed > np.iinfo(np.uint32).max:
raise ValueError(
f"seed must be in range [0, {np.iinfo(np.uint32).max}]"
)
seed = np.uint32(seed)
result = plc.nvtext.generate_ngrams.hash_character_ngrams(
self.to_pylibcudf(mode="read"), ngrams, seed
)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ def base_size(self) -> int:

def to_arrow(self) -> pa.Array:
children = [child.to_arrow() for child in self.children]
dtype = (
pyarrow_dtype_to_cudf_dtype(self.dtype)
dtype: StructDtype = (
pyarrow_dtype_to_cudf_dtype(self.dtype) # type: ignore[assignment]
if isinstance(self.dtype, pd.ArrowDtype)
else self.dtype
)
Expand Down
Loading