Enable fuzzy text matching in Matcher (#11359)
* enable fuzzy matching

* add fuzzy param to EntityMatcher

* include rapidfuzz_capi

not yet used

* fix type

* add FUZZY predicate

* add fuzzy attribute list

* fix type properly

* tidying

* remove unnecessary dependency

* handle fuzzy sets

* simplify fuzzy sets

* case fix

* switch to FUZZYn predicates

use Levenshtein distance.
remove fuzzy param.
remove rapidfuzz_capi.

* revert changes added for fuzzy param

* switch to polyleven

(Python package)

* fuzzy match only on oov tokens

* remove polyleven

* exclude whitespace tokens

* don't allow more edits than characters

* fix min distance

* reinstate FUZZY operator

with a length-based distance function

* handle sets inside regex operator

* remove is_oov check

* attempt build fix

no mypy failure locally

* re-attempt build fix

* don't overwrite fuzzy param value

* move fuzzy_match

to its own Python module to allow patching

* move fuzzy_match back inside Matcher

simplify logic and add tests

* Format tests

* Parametrize fuzzyn tests

* Parametrize and merge fuzzy+set tests

* Format

* Move fuzzy_match to a standalone method

* Change regex kwarg type to bool

* Add types for fuzzy_match

- Refactor variable names
- Add test for symmetrical behavior

* Parametrize fuzzyn+set tests

* Minor refactoring for fuzz/fuzzy

* Make fuzzy_match a Matcher kwarg

* Update type for _default_fuzzy_match

* don't overwrite function param

* Rename to fuzzy_compare

* Update fuzzy_compare default argument declarations

* allow fuzzy_compare override from EntityRuler

* define new Matcher keyword arg

* fix type definition

* Implement fuzzy_compare config option for EntityRuler and SpanRuler

* Rename _default_fuzzy_compare to fuzzy_compare, remove from reexported objects

* Use simpler fuzzy_compare algorithm

* Update types

* Increase minimum to 2 in fuzzy_compare to allow one transposition

* Fix predicate keys and matching for SetPredicate with FUZZY and REGEX

* Add FUZZY6..9

* Add initial docs

* Increase default fuzzy to rounded 30% of pattern length

* Update docs for fuzzy_compare in components

* Update EntityRuler and SpanRuler API docs

* Rename EntityRuler and SpanRuler setting to matcher_fuzzy_compare

To keep the naming similar to `phrase_matcher_attr`, rename the
`fuzzy_compare` setting for `EntityRuler` and `SpanRuler` to
`matcher_fuzzy_compare` and organize it next to `phrase_matcher_attr` in the docs
(see the config sketch after this list).

* Fix schema aliases

Co-authored-by: Sofie Van Landeghem <[email protected]>

* Fix typo

Co-authored-by: Sofie Van Landeghem <[email protected]>

* Add FUZZY6-9 operators and update tests

* Parameterize test over greedy

Co-authored-by: Sofie Van Landeghem <[email protected]>

* Fix type for fuzzy_compare to remove Optional

* Rename to spacy.levenshtein_compare.v1, move to spacy.matcher.levenshtein

* Update docs following levenshtein_compare renaming

Co-authored-by: Adriane Boyd <[email protected]>
Co-authored-by: Sofie Van Landeghem <[email protected]>
3 people authored Jan 10, 2023
1 parent eb8bb35 commit 19650eb
Showing 14 changed files with 554 additions and 102 deletions.
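Taken together, the changes below add `FUZZY` and `FUZZY1`..`FUZZY9` operators to `Matcher` token patterns. A minimal usage sketch, assuming the default `spacy.levenshtein_compare.v1` comparison:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# FUZZY uses the default edit budget; FUZZY1..FUZZY9 cap the number of edits explicitly.
matcher.add("GREETING", [[{"LOWER": {"FUZZY1": "hello"}}]])
doc = nlp("helo there")
matches = matcher(doc)  # one match: "helo" is within one edit of "hello"
```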
17 changes: 17 additions & 0 deletions spacy/matcher/levenshtein.pyx
@@ -4,6 +4,8 @@ from libc.stdint cimport int64_t

from typing import Optional

from ..util import registry


cdef extern from "polyleven.c":
    int64_t polyleven(PyObject *o1, PyObject *o2, int64_t k)
@@ -13,3 +15,18 @@ cpdef int64_t levenshtein(a: str, b: str, k: Optional[int] = None):
    if k is None:
        k = -1
    return polyleven(<PyObject*>a, <PyObject*>b, k)


cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = -1):
    if fuzzy >= 0:
        max_edits = fuzzy
    else:
        # allow at least two edits (to allow at least one transposition) and up
        # to 30% of the pattern string length
        max_edits = max(2, round(0.3 * len(pattern_text)))
    return levenshtein(input_text, pattern_text, max_edits) <= max_edits


@registry.misc("spacy.levenshtein_compare.v1")
def make_levenshtein_compare():
    return levenshtein_compare
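A short sketch of how the new helper behaves, following the code above: an explicit `fuzzy` value caps the allowed edits, while `-1` falls back to `max(2, round(0.3 * len(pattern_text)))`.

```python
from spacy.matcher.levenshtein import levenshtein_compare

levenshtein_compare("patern", "pattern", 1)          # True: one insertion fits a budget of 1
levenshtein_compare("patern", "pattern", 0)          # False: no edits allowed
levenshtein_compare("definately", "definitely", -1)  # True: 1 edit <= max(2, round(0.3 * 10)) = 3
```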
1 change: 1 addition & 0 deletions spacy/matcher/matcher.pxd
@@ -77,3 +77,4 @@ cdef class Matcher:
    cdef public object _extensions
    cdef public object _extra_predicates
    cdef public object _seen_attrs
    cdef public object _fuzzy_compare
3 changes: 2 additions & 1 deletion spacy/matcher/matcher.pyi
@@ -5,7 +5,8 @@ from ..vocab import Vocab
from ..tokens import Doc, Span

class Matcher:
def __init__(self, vocab: Vocab, validate: bool = ...) -> None: ...
def __init__(self, vocab: Vocab, validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...
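Per the updated signature, a custom comparison callable can be passed directly to the `Matcher`. A sketch with a hypothetical replacement function (not part of this commit):

```python
from spacy.matcher import Matcher
from spacy.vocab import Vocab

def prefix_compare(input_text: str, pattern_text: str, fuzzy: int = -1) -> bool:
    # Hypothetical override: treat tokens as matching if they share the pattern's first two characters.
    return input_text[:2] == pattern_text[:2]

matcher = Matcher(Vocab(), fuzzy_compare=prefix_compare)
```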
170 changes: 135 additions & 35 deletions spacy/matcher/matcher.pyx
@@ -1,4 +1,4 @@
# cython: infer_types=True, profile=True
# cython: binding=True, infer_types=True, profile=True
from typing import List, Iterable

from libcpp.vector cimport vector
@@ -20,10 +20,12 @@ from ..tokens.token cimport Token
from ..tokens.morphanalysis cimport MorphAnalysis
from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH, ENT_IOB

from .levenshtein import levenshtein_compare
from ..schemas import validate_token_pattern
from ..errors import Errors, MatchPatternError, Warnings
from ..strings import get_string_id
from ..attrs import IDS
from ..util import registry


DEF PADDING = 5
@@ -36,11 +38,13 @@ cdef class Matcher:
USAGE: https://spacy.io/usage/rule-based-matching
"""

def __init__(self, vocab, validate=True):
def __init__(self, vocab, validate=True, *, fuzzy_compare=levenshtein_compare):
"""Create the Matcher.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
validate (bool): Validate all patterns added to this matcher.
fuzzy_compare (Callable[[str, str, int], bool]): The comparison method
for the FUZZY operators.
"""
self._extra_predicates = []
self._patterns = {}
@@ -51,9 +55,10 @@
self.vocab = vocab
self.mem = Pool()
self.validate = validate
self._fuzzy_compare = fuzzy_compare

def __reduce__(self):
data = (self.vocab, self._patterns, self._callbacks)
data = (self.vocab, self._patterns, self._callbacks, self.validate, self._fuzzy_compare)
return (unpickle_matcher, data, None, None)

def __len__(self):
@@ -128,7 +133,7 @@ cdef class Matcher:
for pattern in patterns:
try:
specs = _preprocess_pattern(pattern, self.vocab,
self._extensions, self._extra_predicates)
self._extensions, self._extra_predicates, self._fuzzy_compare)
self.patterns.push_back(init_pattern(self.mem, key, specs))
for spec in specs:
for attr, _ in spec[1]:
@@ -326,8 +331,8 @@ cdef class Matcher:
return key


def unpickle_matcher(vocab, patterns, callbacks):
matcher = Matcher(vocab)
def unpickle_matcher(vocab, patterns, callbacks, validate, fuzzy_compare):
matcher = Matcher(vocab, validate=validate, fuzzy_compare=fuzzy_compare)
for key, pattern in patterns.items():
callback = callbacks.get(key, None)
matcher.add(key, pattern, on_match=callback)
@@ -754,7 +759,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
return id_attr.value


def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates, fuzzy_compare):
"""This function interprets the pattern, converting the various bits of
syntactic sugar before we compile it into a struct with init_pattern.
@@ -781,7 +786,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
ops = _get_operators(spec)
attr_values = _get_attr_values(spec, string_store)
extensions = _get_extensions(spec, string_store, extensions_table)
predicates = _get_extra_predicates(spec, extra_predicates, vocab)
predicates = _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare)
for op in ops:
tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
return tokens
@@ -826,16 +831,45 @@ def _get_attr_values(spec, string_store):
# These predicate helper classes are used to match the REGEX, IN, >= etc
# extensions to the matcher introduced in #3173.

class _FuzzyPredicate:
operators = ("FUZZY", "FUZZY1", "FUZZY2", "FUZZY3", "FUZZY4", "FUZZY5",
"FUZZY6", "FUZZY7", "FUZZY8", "FUZZY9")

def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = value
self.predicate = predicate
self.is_extension = is_extension
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
fuzz = self.predicate[len("FUZZY"):] # number after prefix
self.fuzzy = int(fuzz) if fuzz else -1
self.fuzzy_compare = fuzzy_compare
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))

def __call__(self, Token token):
if self.is_extension:
value = token._.get(self.attr)
else:
value = token.vocab.strings[get_token_attr_for_matcher(token.c, self.attr)]
if self.value == value:
return True
return self.fuzzy_compare(value, self.value, self.fuzzy)
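The `_FuzzyPredicate` above derives the edit budget from the numeric suffix of the operator name; roughly, under the default comparison:

```python
# FUZZY  -> fuzzy = -1: budget = max(2, round(0.3 * len(pattern_text)))
# FUZZY3 -> fuzzy = 3:  at most three edits
pattern = [{"ORTH": {"FUZZY3": "definitely"}}]  # would match "definitly" (1 edit) or "difinately" (2 edits)
```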


class _RegexPredicate:
operators = ("REGEX",)

def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = re.compile(value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -850,18 +884,28 @@ class _RegexPredicate:
class _SetPredicate:
operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET", "INTERSECTS")

def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.vocab = vocab
self.regex = regex
self.fuzzy = fuzzy
self.fuzzy_compare = fuzzy_compare
if self.attr == MORPH:
# normalize morph strings
self.value = set(self.vocab.morphology.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
if self.regex:
self.value = set(re.compile(v) for v in value)
elif self.fuzzy is not None:
# add to string store
self.value = set(self.vocab.strings.add(v) for v in value)
else:
self.value = set(get_string_id(v) for v in value)
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -889,9 +933,29 @@ class _SetPredicate:
return False

if self.predicate == "IN":
return value in self.value
if self.regex:
value = self.vocab.strings[value]
return any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return True
else:
return False
elif self.predicate == "NOT_IN":
return value not in self.value
if self.regex:
value = self.vocab.strings[value]
return not any(bool(v.search(value)) for v in self.value)
elif self.fuzzy is not None:
value = self.vocab.strings[value]
return not any(self.fuzzy_compare(value, self.vocab.strings[v], self.fuzzy)
for v in self.value)
elif value in self.value:
return False
else:
return True
elif self.predicate == "IS_SUBSET":
return value <= self.value
elif self.predicate == "IS_SUPERSET":
@@ -906,13 +970,14 @@ class _SetPredicate:
class _ComparisonPredicate:
operators = ("==", "!=", ">=", "<=", ">", "<")

def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None):
def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None,
regex=False, fuzzy=None, fuzzy_compare=None):
self.i = i
self.attr = attr
self.value = value
self.predicate = predicate
self.is_extension = is_extension
self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
if self.predicate not in self.operators:
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))

@@ -935,7 +1000,7 @@ class _ComparisonPredicate:
return value < self.value


def _get_extra_predicates(spec, extra_predicates, vocab):
def _get_extra_predicates(spec, extra_predicates, vocab, fuzzy_compare):
predicate_types = {
"REGEX": _RegexPredicate,
"IN": _SetPredicate,
@@ -949,6 +1014,16 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
"<=": _ComparisonPredicate,
">": _ComparisonPredicate,
"<": _ComparisonPredicate,
"FUZZY": _FuzzyPredicate,
"FUZZY1": _FuzzyPredicate,
"FUZZY2": _FuzzyPredicate,
"FUZZY3": _FuzzyPredicate,
"FUZZY4": _FuzzyPredicate,
"FUZZY5": _FuzzyPredicate,
"FUZZY6": _FuzzyPredicate,
"FUZZY7": _FuzzyPredicate,
"FUZZY8": _FuzzyPredicate,
"FUZZY9": _FuzzyPredicate,
}
seen_predicates = {pred.key: pred.i for pred in extra_predicates}
output = []
@@ -966,22 +1041,47 @@ def _get_extra_predicates(spec, extra_predicates, vocab):
attr = "ORTH"
attr = IDS.get(attr.upper())
if isinstance(value, dict):
processed = False
value_with_upper_keys = {k.upper(): v for k, v in value.items()}
for type_, cls in predicate_types.items():
if type_ in value_with_upper_keys:
predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab)
# Don't create a redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key])
else:
extra_predicates.append(predicate)
output.append(predicate.i)
seen_predicates[predicate.key] = predicate.i
processed = True
if not processed:
warnings.warn(Warnings.W035.format(pattern=value))
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates, fuzzy_compare=fuzzy_compare))
return output


def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
extra_predicates, seen_predicates, regex=False, fuzzy=None, fuzzy_compare=None):
output = []
for type_, value in value_dict.items():
type_ = type_.upper()
cls = predicate_types.get(type_)
if cls is None:
warnings.warn(Warnings.W035.format(pattern=value_dict))
# ignore unrecognized predicate type
continue
elif cls == _RegexPredicate:
if isinstance(value, dict):
# add predicates inside regex operator
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
regex=True))
continue
elif cls == _FuzzyPredicate:
if isinstance(value, dict):
# add predicates inside fuzzy operator
fuzz = type_[len("FUZZY"):] # number after prefix
fuzzy_val = int(fuzz) if fuzz else -1
output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
extra_predicates, seen_predicates,
fuzzy=fuzzy_val, fuzzy_compare=fuzzy_compare))
continue
predicate = cls(len(extra_predicates), attr, value, type_, vocab=vocab,
regex=regex, fuzzy=fuzzy, fuzzy_compare=fuzzy_compare)
# Don't create redundant predicates.
# This helps with efficiency, as we're caching the results.
if predicate.key in seen_predicates:
output.append(seen_predicates[predicate.key])
else:
extra_predicates.append(predicate)
output.append(predicate.i)
seen_predicates[predicate.key] = predicate.i
return output


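Following the `_SetPredicate` changes above, `FUZZY` and `REGEX` can also wrap the set operators. A pattern-syntax sketch, assuming the default comparison:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# The token matches if it is within the default edit budget of any listed word.
matcher.add("PRAISE", [[{"LOWER": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]])
# The token matches only if none of the listed regexes match its lowercase form.
matcher.add("NOT_AB", [[{"LOWER": {"REGEX": {"NOT_IN": ["^a", "^b"]}}}]])
```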
