rapidsai · davidwendt · Mar 5, 2025 · Mar 5, 2025 · Mar 6, 2025
@@ -5431,29 +5431,34 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
         return self._return_or_inplace(self._column.edit_distance_matrix())
 
     def minhash(
-        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
+        self, seed, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
         """
-        Compute the minhash of a strings column.
+        Compute the minhash of a strings column, or of a list column whose
+        rows are lists of strings (terms).
 
-        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+        This uses the MurmurHash3_x86_32 algorithm for the hash function
+        if seed is of type np.uint32 or MurmurHash3_x86_128 if seed is
+        of type np.uint64.
 
         Calculation uses the formula (hv * a + b) % mersenne_prime
-        where hv is the hash of a substring of width characters,
+        where hv is the hash of a substring of width characters (or of
+        an ngram of width strings when the column is a list column),
         a and b are provided values and mersenne_prime is 2^61-1.
 
         Parameters
         ----------
-        seed : uint32
+        seed : uint32 or uint64
             The seed used for the hash algorithm.
         a : ColumnLike
             Values for minhash calculation.
-            Must be of type uint32.
+            Must be of type uint32 or uint64.
         b : ColumnLike
             Values for minhash calculation.
-            Must be of type uint32.
+            Must be of type uint32 or uint64.
         width : int
             The width of the substring to hash.
+            For a list column, the number of strings per ngram to hash.
 
         Examples
         --------
@@ -5466,20 +5471,71 @@ def minhash(
         0    [1305480171, 462824409, 74608232]
         1       [32665388, 65330773, 97996158]
         dtype: list
+        >>> sl = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']])
+        >>> sl.str.minhash(width=2, seed=0, a=a, b=b)
+        0      [416367551, 832735099, 1249102647]
+        1    [1906668704, 3813337405, 1425038810]
+        dtype: list
         """
         a_column = column.as_column(a)
-        if a_column.dtype != np.uint32:
+        b_column = column.as_column(b)
+        if not hasattr(seed, "dtype"):
+            if a_column.dtype == np.uint32:
+                seed = np.uint32(seed)
+            else:
+                seed = np.uint64(seed)
+        if a_column.dtype != seed.dtype:
             raise ValueError(
-                f"Expecting a Series with dtype uint32, got {type(a)}"
+                f"Expecting a Series dtype to match seed type, got {type(a)}"
             )
-        b_column = column.as_column(b)
-        if b_column.dtype != np.uint32:
+        if b_column.dtype != seed.dtype:
             raise ValueError(
-                f"Expecting a Series with dtype uint32, got {type(b)}"
+                f"Expecting b Series dtype to match seed type, got {type(b)}"
             )
-        return self._return_or_inplace(
-            self._column.minhash(seed, a_column, b_column, width)  # type: ignore[arg-type]
-        )
+        # Pass the validated columns (not the raw ``a``/``b`` arguments) so
+        # any ColumnLike input works, not only objects exposing ``._column``.
+        if seed.dtype == np.uint32:
+            if isinstance(self._parent.dtype, cudf.ListDtype):
+                plc_column = plc.nvtext.minhash.minhash_ngrams(
+                    self._column.to_pylibcudf(mode="read"),
+                    width,
+                    seed,
+                    a_column.to_pylibcudf(mode="read"),
+                    b_column.to_pylibcudf(mode="read"),
+                )
+                result = ColumnBase.from_pylibcudf(plc_column)
+                return self._return_or_inplace(result)
+            else:
+                plc_column = plc.nvtext.minhash.minhash(
+                    self._column.to_pylibcudf(mode="read"),
+                    seed,
+                    a_column.to_pylibcudf(mode="read"),
+                    b_column.to_pylibcudf(mode="read"),
+                    width,
+                )
+                result = ColumnBase.from_pylibcudf(plc_column)
+                return self._return_or_inplace(result)
+        else:
+            if isinstance(self._parent.dtype, cudf.ListDtype):
+                plc_column = plc.nvtext.minhash.minhash64_ngrams(
+                    self._column.to_pylibcudf(mode="read"),
+                    width,
+                    seed,
+                    a_column.to_pylibcudf(mode="read"),
+                    b_column.to_pylibcudf(mode="read"),
+                )
+                result = ColumnBase.from_pylibcudf(plc_column)
+                return self._return_or_inplace(result)
+            else:
+                plc_column = plc.nvtext.minhash.minhash64(
+                    self._column.to_pylibcudf(mode="read"),
+                    seed,
+                    a_column.to_pylibcudf(mode="read"),
+                    b_column.to_pylibcudf(mode="read"),
+                    width,
+                )
+                result = ColumnBase.from_pylibcudf(plc_column)
+                return self._return_or_inplace(result)
 
     def minhash64(
         self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int

@@ -916,7 +916,7 @@ def test_minhash():
             cudf.Series([0, 0, 0], dtype=np.uint64),
         ]
     )
-    actual = strings.str.minhash64(0, a=params, b=params, width=5)
+    actual = strings.str.minhash(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     # test wrong seed types
@@ -927,7 +927,7 @@ def test_minhash():
         strings.str.minhash(1, a=params, b=params, width=6)
     with pytest.raises(ValueError):
         params = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64(1, a=params, b=params, width=8)
+        strings.str.minhash(np.uint64(1), a=params, b=params, width=8)
 
 
 def test_minhash_ngrams():
@@ -942,7 +942,7 @@ def test_minhash_ngrams():
             cudf.Series([1408797893, 2817595786, 4226393679], dtype=np.uint32),
         ]
     )
-    actual = strings.str.minhash_ngrams(ngrams=2, seed=0, a=params, b=params)
+    actual = strings.str.minhash(width=2, seed=0, a=params, b=params)
     assert_eq(expected, actual)
 
     params = cudf.Series([1, 2, 3], dtype=np.uint64)
@@ -958,18 +958,18 @@ def test_minhash_ngrams():
             ),
         ]
     )
-    actual = strings.str.minhash64_ngrams(ngrams=2, seed=0, a=params, b=params)
+    actual = strings.str.minhash(width=2, seed=0, a=params, b=params)
     assert_eq(expected, actual)
 
     # test wrong input types
     with pytest.raises(ValueError):
-        strings.str.minhash_ngrams(ngrams=7, seed=1, a="a", b="b")
+        strings.str.minhash(width=7, seed=1, a="a", b="b")
     with pytest.raises(ValueError):
         params = cudf.Series([0, 1, 2], dtype=np.int32)
-        strings.str.minhash_ngrams(ngrams=6, seed=1, a=params, b=params)
+        strings.str.minhash(width=6, seed=1, a=params, b=params)
     with pytest.raises(ValueError):
         params = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64_ngrams(ngrams=8, seed=1, a=params, b=params)
+        strings.str.minhash(width=8, seed=np.uint64(1), a=params, b=params)
 
 
 def test_jaccard_index():