Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine multiple str.minhash() APIs into one call #18168

Draft
wants to merge 3 commits into
base: branch-25.04
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 69 additions & 15 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5431,29 +5431,34 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
return self._return_or_inplace(self._column.edit_distance_matrix())

def minhash(
self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
self, seed, a: ColumnLike, b: ColumnLike, width: int
) -> SeriesOrIndex:
"""
Compute the minhash of a strings column.
Compute the minhash of a strings column or of a list column
of strings (terms).

This uses the MurmurHash3_x86_32 algorithm for the hash function.
This uses the MurmurHash3_x86_32 algorithm for the hash function
if seed is of type np.uint32 or MurmurHash3_x86_128 if seed is
of type np.uint64.

Calculation uses the formula (hv * a + b) % mersenne_prime
where hv is the hash of a substring of width characters,
where hv is the hash of a substring of width characters
(or of an ngram of width strings for a list column),
a and b are provided values and mersenne_prime is 2^61-1.

Parameters
----------
seed : uint32
seed : uint32 or uint64
The seed used for the hash algorithm.
a : ColumnLike
Values for minhash calculation.
Must be of type uint32.
Must be of type uint32 or uint64.
b : ColumnLike
Values for minhash calculation.
Must be of type uint32.
Must be of type uint32 or uint64.
width : int
The width of the substring to hash.
For a list column, the number of strings in each ngram to hash.

Examples
--------
Expand All @@ -5466,20 +5471,69 @@ def minhash(
0 [1305480171, 462824409, 74608232]
1 [32665388, 65330773, 97996158]
dtype: list
>>> sl = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']])
>>> sl.str.minhash(width=2, seed=0, a=a, b=b)
0 [416367551, 832735099, 1249102647]
1 [1906668704, 3813337405, 1425038810]
dtype: list
"""
a_column = column.as_column(a)
if a_column.dtype != np.uint32:
b_column = column.as_column(b)
if not hasattr(seed, "dtype"):
if a_column.dtype == np.uint32:
seed = np.uint32(seed)
else:
seed = np.uint64(seed)
if a_column.dtype != seed.dtype:
raise ValueError(
f"Expecting a Series with dtype uint32, got {type(a)}"
f"Expecting a Series dtype to match seed type, got {type(a)}"
)
b_column = column.as_column(b)
if b_column.dtype != np.uint32:
if b_column.dtype != seed.dtype:
raise ValueError(
f"Expecting a Series with dtype uint32, got {type(b)}"
f"Expecting b Series dtype to match seed type, got {type(b)}"
)
return self._return_or_inplace(
self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type]
)
if seed.dtype == np.uint32:
if isinstance(self._parent.dtype, cudf.ListDtype):
plc_column = plc.nvtext.minhash.minhash_ngrams(
self._column.to_pylibcudf(mode="read"),
width,
seed,
a._column.to_pylibcudf(mode="read"),
b._column.to_pylibcudf(mode="read"),
)
result = ColumnBase.from_pylibcudf(plc_column)
return self._return_or_inplace(result)
else:
plc_column = plc.nvtext.minhash.minhash(
self._column.to_pylibcudf(mode="read"),
seed,
a._column.to_pylibcudf(mode="read"),
b._column.to_pylibcudf(mode="read"),
width,
)
result = ColumnBase.from_pylibcudf(plc_column)
return self._return_or_inplace(result)
else:
if isinstance(self._parent.dtype, cudf.ListDtype):
plc_column = plc.nvtext.minhash.minhash64_ngrams(
self._column.to_pylibcudf(mode="read"),
width,
seed,
a._column.to_pylibcudf(mode="read"),
b._column.to_pylibcudf(mode="read"),
)
result = ColumnBase.from_pylibcudf(plc_column)
return self._return_or_inplace(result)
else:
plc_column = plc.nvtext.minhash.minhash64(
self._column.to_pylibcudf(mode="read"),
seed,
a._column.to_pylibcudf(mode="read"),
b._column.to_pylibcudf(mode="read"),
width,
)
result = ColumnBase.from_pylibcudf(plc_column)
return self._return_or_inplace(result)

def minhash64(
self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ def test_minhash():
cudf.Series([0, 0, 0], dtype=np.uint64),
]
)
actual = strings.str.minhash64(0, a=params, b=params, width=5)
actual = strings.str.minhash(0, a=params, b=params, width=5)
assert_eq(expected, actual)

# test wrong seed types
Expand All @@ -927,7 +927,7 @@ def test_minhash():
strings.str.minhash(1, a=params, b=params, width=6)
with pytest.raises(ValueError):
params = cudf.Series([0, 1, 2], dtype=np.uint32)
strings.str.minhash64(1, a=params, b=params, width=8)
strings.str.minhash(np.uint64(1), a=params, b=params, width=8)


def test_minhash_ngrams():
Expand All @@ -942,7 +942,7 @@ def test_minhash_ngrams():
cudf.Series([1408797893, 2817595786, 4226393679], dtype=np.uint32),
]
)
actual = strings.str.minhash_ngrams(ngrams=2, seed=0, a=params, b=params)
actual = strings.str.minhash(width=2, seed=0, a=params, b=params)
assert_eq(expected, actual)

params = cudf.Series([1, 2, 3], dtype=np.uint64)
Expand All @@ -958,18 +958,18 @@ def test_minhash_ngrams():
),
]
)
actual = strings.str.minhash64_ngrams(ngrams=2, seed=0, a=params, b=params)
actual = strings.str.minhash(width=2, seed=0, a=params, b=params)
assert_eq(expected, actual)

# test wrong input types
with pytest.raises(ValueError):
strings.str.minhash_ngrams(ngrams=7, seed=1, a="a", b="b")
strings.str.minhash(width=7, seed=1, a="a", b="b")
with pytest.raises(ValueError):
params = cudf.Series([0, 1, 2], dtype=np.int32)
strings.str.minhash_ngrams(ngrams=6, seed=1, a=params, b=params)
strings.str.minhash(width=6, seed=1, a=params, b=params)
with pytest.raises(ValueError):
params = cudf.Series([0, 1, 2], dtype=np.uint32)
strings.str.minhash64_ngrams(ngrams=8, seed=1, a=params, b=params)
strings.str.minhash(width=8, seed=np.uint64(1), a=params, b=params)


def test_jaccard_index():
Expand Down
Loading