@@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
365365    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
366366                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
367367                object na_value=None, bint ignore_na=False,
368-                 bint return_inverse=False):
368+                 object mask=None,  bint return_inverse=False):
369369        """
370370        Calculate unique values and labels (no sorting!)
371371
@@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable):
388388            Whether NA-values should be ignored for calculating the uniques. If
389389            True, the labels corresponding to missing values will be set to
390390            na_sentinel.
391+         mask : ndarray[bool], optional
392+             If not None, the mask is used as an indicator for missing values
393+             (True = missing, False = valid) instead of `na_value` or
394+             condition "val != val".
391395        return_inverse : boolean, default False
392396            Whether the mapping of the original array values to their location
393397            in the vector of uniques should be returned.
@@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable):
406410            {{dtype}}_t val, na_value2
407411            khiter_t k
408412            {{name}}VectorData *ud
409-             bint use_na_value
413+             bint use_na_value, use_mask
414+             uint8_t[:] mask_values
410415
411416        if return_inverse:
412417            labels = np.empty(n, dtype=np.int64)
413418        ud = uniques.data
414419        use_na_value = na_value is not None
420+         use_mask = mask is not None
421+ 
422+         if use_mask:
423+             mask_values = mask.view("uint8")
415424
416425        if use_na_value:
417426            # We need this na_value2 because we want to allow users
@@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable):
427436            for i in range(n):
428437                val = values[i]
429438
430-                 if ignore_na and (
439+                 if ignore_na and use_mask:
440+                     if mask_values[i]:
441+                         labels[i] = na_sentinel
442+                         continue
443+                 elif ignore_na and (
431444                {{if not name.lower().startswith(("uint", "int"))}}
432445                val != val or
433446                {{endif}}
@@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable):
491504                            return_inverse=return_inverse)
492505
493506    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
494-                   object na_value=None):
507+                   object na_value=None, object mask=None ):
495508        """
496509        Calculate unique values and labels (no sorting!)
497510
@@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable):
509522            any value "val" satisfying val != val is considered missing.
510523            If na_value is not None, then _additionally_, any value "val"
511524            satisfying val == na_value is considered missing.
525+         mask : ndarray[bool], optional
526+             If not None, the mask is used as an indicator for missing values
527+             (True = missing, False = valid) instead of `na_value` or
528+             condition "val != val".
512529
513530        Returns
514531        -------
@@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable):
519536        """
520537        uniques_vector = {{name}}Vector()
521538        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
522-                             na_value=na_value, ignore_na=True,
539+                             na_value=na_value, ignore_na=True, mask=mask, 
523540                            return_inverse=True)
524541
525542    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable):
852869                            return_inverse=return_inverse)
853870
854871    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
855-                   object na_value=None):
872+                   object na_value=None, object mask=None ):
856873        """
857874        Calculate unique values and labels (no sorting!)
858875
@@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable):
870887            that is not a string is considered missing. If na_value is
871888            not None, then _additionally_ any value "val" satisfying
872889            val == na_value is considered missing.
890+         mask : ndarray[bool], optional
891+             Not yet implemented for StringHashTable.
873892
874893        Returns
875894        -------
@@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable):
10911110                            return_inverse=return_inverse)
10921111
10931112    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1094-                   object na_value=None):
1113+                   object na_value=None, object mask=None ):
10951114        """
10961115        Calculate unique values and labels (no sorting!)
10971116
@@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable):
11091128            any value "val" satisfying val != val is considered missing.
11101129            If na_value is not None, then _additionally_, any value "val"
11111130            satisfying val == na_value is considered missing.
1131+         mask : ndarray[bool], optional
1132+             Not yet implemented for PyObjectHashTable.
11121133
11131134        Returns
11141135        -------
0 commit comments