Single KNN Field Optimisation (#530)

* using a single kNN opensearch field * made end-to-end case work * added debug statements for OpenSearch requests * Added brackets around combined opensearch filter * fixed bug with no searchable attributes * changed marqo version * single knn-field: test_search.py passes (#526) * made test_search pass * Updated CUDA version to match mainline * removed debug message in OS * fixed add docs, backend, create index unit tests * pagination error handling * removed debug message * fixed bulk search and search tests * fixed broken tests * fixed index meta cache tests, add knn field validation * empty results for empty searchable attr lexical * finished pagination tests * refactored filtering, get_model_properties, index creation tests * fixed generic model error tests * added knn field tests * updated filtering tests * added changes to contextualise_user_filter * filtering unit tests pass, double backslash escape fixed * added backslash escaping to Lucene sanitise function fixed tests * fixed contextualise edge cases with field at start of filter * fixed contextualise edge cases with field at start of filter * more edge cases for filtering * edge case fixed in contextualise, also added draft regex solution * fixed wrong string length skip * removed unused code, updated name of contextualise * fixed bug when content has field name by adding colon req * gitignore vscode directory * removed vscode settings json * improved function docstrings --------- Co-authored-by: Joshua Kim <[email protected]>
marqo-ai · Jul 16, 2023 · aca7cd6 · aca7cd6
1 parent e8d078c
commit aca7cd6
Show file tree

Hide file tree

Showing 26 changed files with 1,345 additions and 463 deletions.
diff --git a/.gitignore b/.gitignore
@@ -143,3 +143,5 @@ src/marqo/tensor_search/test_throttle_timing.txt
 src/marqo/tensor_search/test_throttle_timing.csv
 dump.rdb
 
+# VSCode
+.vscode/
diff --git a/src/marqo/tensor_search/backend.py b/src/marqo/tensor_search/backend.py
@@ -62,7 +62,7 @@ def get_index_info(config: Config, index_name: str) -> IndexInfo:
 
 def add_customer_field_properties(config: Config, index_name: str,
                                   customer_field_names: Iterable[Tuple[str, enums.OpenSearchDataType]],
-                                  model_properties: dict, multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
+                                  multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
     """Adds new customer fields to index mapping.
 
     Pushes the updated mapping to OpenSearch, and updates the local cache.
@@ -80,25 +80,11 @@ def add_customer_field_properties(config: Config, index_name: str,
     """
     existing_info = get_cached_index_info(config=config, index_name=index_name)
 
-    # check if there is multimodal fie;ds and convert the fields name to a list with the same
-    # format of customer_field_names
-    knn_field_names = copy.deepcopy(customer_field_names)
-    if len(multimodal_combination_fields) > 0:
-        multimodal_customer_field_names = set([(field_name, "_") for field_name in list(multimodal_combination_fields)])
-        knn_field_names = knn_field_names.union(multimodal_customer_field_names)
-
     body = {
         "properties": {
             enums.TensorField.chunks: {
                 "type": "nested",
-                "properties": {
-                    validation.validate_vector_name(
-                        utils.generate_vector_name(field_name[0])): {
-                        "type": "knn_vector",
-                        "dimension": model_properties["dimensions"],
-                        "method": existing_info.get_ann_parameters()
-                    } for field_name in knn_field_names
-                }
+                "properties": {}
             }
         }
     }
@@ -146,7 +132,6 @@ def add_customer_field_properties(config: Config, index_name: str,
             "type": type_to_set
         }
 
-
     for multimodal_field, child_fields in multimodal_combination_fields.items():
         # update the new multimodal_field if it's not in it
         if multimodal_field not in new_index_properties:

diff --git a/src/marqo/tensor_search/configs.py b/src/marqo/tensor_search/configs.py
@@ -22,7 +22,7 @@ def get_default_index_settings():
             NsFields.ann_parameters: get_default_ann_parameters()
         },
         NsFields.number_of_shards: 5,
-        NsFields.number_of_replicas : 1,
+        NsFields.number_of_replicas: 1,
     }
 
 def get_default_ann_parameters():

diff --git a/src/marqo/tensor_search/constants.py b/src/marqo/tensor_search/constants.py
@@ -16,3 +16,14 @@
 NON_TENSORISABLE_FIELD_TYPES = [int, float, bool, list]
 
 ALLOWED_MULTIMODAL_FIELD_TYPES = [str]
+
+LUCENE_SPECIAL_CHARS = {
+    '/', '*', '^', '\\', '!', '[', '||', '?',
+    '&&', '"', ']', '-', '{', '~', '+', '}', ':', ')', '('
+}
+
+# these are chars that are not officially listed as Lucene special chars, but
+# aren't treated as normal chars either
+NON_OFFICIAL_LUCENE_SPECIAL_CHARS = {
+    ' '
+}
diff --git a/src/marqo/tensor_search/enums.py b/src/marqo/tensor_search/enums.py
@@ -29,6 +29,7 @@ class TensorField:
     chunk_ids = "__chunk_ids"
     # the prefix will have the customer's field name appended to the end of it
     vector_prefix = "__vector_"
+    marqo_knn_field = "__vector_marqo_knn_field"
     chunks = "__chunks"
     output_highlights = "_highlights"
     output_score = "_score"

diff --git a/src/marqo/tensor_search/filtering.py b/src/marqo/tensor_search/filtering.py
@@ -0,0 +1,146 @@
+import os
+import typing
+from timeit import default_timer as timer
+from marqo import errors
+from marqo.tensor_search import enums, configs, constants
+from typing import (
+    List, Optional, Union, Callable, Iterable, Sequence, Dict, Tuple
+)
+from marqo.marqo_logging import logger
+import copy
+from marqo.tensor_search.enums import EnvVars
+import re
+
+def build_tensor_search_filter(
+        filter_string: str, simple_properties: dict,
+        searchable_attribs: Sequence):
+    """Builds a Lucene-DSL filter string for OpenSearch, that combines the user's filter string
+    with searchable_attributes
+
+    args:
+        filter_string: user input string to filter results on. special chars must be escaped.
+        simple_properties: dict containing the index's fields as keys. will be used to add chunks prefix to fields in filter string.
+        searchable_attribs: user input list of attributes to search on. will be turned into a filter string.
+    """
+    if searchable_attribs is not None:
+        copied_searchable_attribs = copy.deepcopy(searchable_attribs)
+        searchable_attribs_filter = build_searchable_attributes_filter(
+            searchable_attribs=copied_searchable_attribs)
+    else:
+        searchable_attribs_filter = ""
+
+    filter_string_with_chunks_prefixes = add_chunks_prefix_to_filter_string_fields(
+        filter_string=filter_string, simple_properties=simple_properties)
+
+    if filter_string_with_chunks_prefixes and searchable_attribs_filter:
+        return f"({searchable_attribs_filter}) AND ({filter_string_with_chunks_prefixes})"
+    else:
+        return f"{searchable_attribs_filter}{filter_string_with_chunks_prefixes}"
+
+
+def build_searchable_attributes_filter(searchable_attribs: Sequence) -> str:
+    """Recursively constructs the filter used to narrow the search down to specific searchable attributes"""
+    if searchable_attribs is None or len(searchable_attribs) == 0:
+        return ""
+
+    vector_prop_count = len(searchable_attribs)
+
+    # brackets surround field name, in case it contains a space:
+    sanitised_attr_name = f"({sanitise_lucene_special_chars(searchable_attribs.pop())})"
+
+    # base case
+    if vector_prop_count == 1:
+        return f"{enums.TensorField.chunks}.{enums.TensorField.field_name}:{sanitised_attr_name}"
+    else:
+        return (
+            f"{enums.TensorField.chunks}.{enums.TensorField.field_name}:{sanitised_attr_name}"
+            f" OR {build_searchable_attributes_filter(searchable_attribs=searchable_attribs)}")
+
+
+def sanitise_lucene_special_chars(to_be_sanitised: str) -> str:
+    """Santitises Lucene's special chars in a string.
+
+    We shouldn't apply this to the user's filter string, as they can choose to escape
+    Lucene's special chars themselves.
+
+    This should be used to sanitise a filter string constructed for users behind the
+    scenes (such as for searchable attributes).
+
+    See here for more info:
+    https://lucene.apache.org/core/6_0_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters
+
+    """
+
+    # always escape backslashes before the other special chars
+    to_be_sanitised = to_be_sanitised.replace("\\", "\\\\")
+
+    # this prevents us from double-escaping backslashes.
+    non_backslash_chars = constants.LUCENE_SPECIAL_CHARS.union(constants.NON_OFFICIAL_LUCENE_SPECIAL_CHARS) - {'\\'}
+
+    for char in non_backslash_chars:
+        to_be_sanitised = to_be_sanitised.replace(char, f'\\{char}')
+    return to_be_sanitised
+
+
+def add_chunks_prefix_to_filter_string_fields(filter_string: Optional[str], simple_properties: typing.Iterable) -> str:
+    """adds the chunk prefix to the start of properties found in simple string (filter_string)
+    This allows for filtering within chunks.
+
+    Because this is a user-defined filter, if they want to filter on field names that contain
+    special characters, we expect them to escape the special characters themselves.
+
+    In order to search chunks we need to append the chunk prefix to the start of the field name.
+    This will only work if they escape the special characters in the field names themselves in
+    the exact same way that we do.
+
+    Args:
+        filter_string: the user defined filter string
+        simple_properties: simple properties of an index (such as text or floats
+            and bools)
+
+    Returns:
+        a string where the properties are referenced as children of a chunk.
+    """
+    if simple_properties is None:
+        # If an index has no simple properties, simple_properties should be {}, but never None
+        raise errors.InternalError("simple properties of an index can never be None!")
+
+    if filter_string is None:
+        return ''
+
+    prefixed_filter = filter_string
+
+    for field in simple_properties:
+        escaped_field_name = sanitise_lucene_special_chars(field)
+        if escaped_field_name in filter_string:
+            # The field name MUST be followed by a colon.
+            escaped_field_name_with_colon = f'{escaped_field_name}:'
+            # we want to replace the field name that directly corresponds to the simple property,
+            # not any other field names that contain the simple property as a substring.
+
+            # case 0: field name is at the start of the filter string
+            # it must be followed by a colon, otherwise it is a substring of another field name
+            # edge case example: "field_a_excess_chars:a, escaped_field_name=field_a"
+            if filter_string.startswith(escaped_field_name_with_colon):
+                # add the chunk prefix ONCE to the start of the field name
+                prefixed_filter = f'{enums.TensorField.chunks}.{prefixed_filter}'
+
+            # next we check every occurence of field name NOT at the start of the filter string
+            # note: we do this even if it was also at the start
+            possible_chars_before_field_name = {" ", "("}
+            i = 0
+            while i < len(prefixed_filter):
+                # find every occurence of the field name in the filter string
+                if prefixed_filter[i:i+len(escaped_field_name_with_colon)] == escaped_field_name_with_colon:
+                    # check if it is preceded by a space or an opening parenthesis
+                    # also check that the preceding char is NOT escaped
+                    if \
+                    (i > 0 and prefixed_filter[i-1] in possible_chars_before_field_name) and \
+                    (i == 1 or prefixed_filter[i-2] != "\\"): 
+                        # if so, add the chunk prefix the start of the field name
+                        prefixed_filter = prefixed_filter[:i] + f"{enums.TensorField.chunks}." + prefixed_filter[i:]
+                        # skip checking the newly inserted part
+                        i += len(f"{enums.TensorField.chunks}.")  
+                i += 1
+
+    return prefixed_filter
diff --git a/src/marqo/tensor_search/formatting.py b/src/marqo/tensor_search/formatting.py
@@ -23,7 +23,7 @@ def _clean_doc(doc: dict, doc_id=None, include_vectors: bool = False) -> dict:
     if include_vectors:
         copied[TensorField.tensor_facets] = [
             {ch[TensorField.field_name]: ch[TensorField.field_content],
-             TensorField.embedding: ch[utils.generate_vector_name(ch[TensorField.field_name])]
+             TensorField.embedding: ch[TensorField.marqo_knn_field]
             } for ch in copied[TensorField.chunks]
         ]
     if TensorField.chunks in copied:

diff --git a/src/marqo/tensor_search/models/index_info.py b/src/marqo/tensor_search/models/index_info.py
@@ -1,14 +1,36 @@
 import pprint
 from typing import NamedTuple, Any, Dict
 from marqo.tensor_search import enums
-from marqo.tensor_search.enums import IndexSettingsField as NsFields
+from marqo.tensor_search.enums import IndexSettingsField as NsField
 from marqo.tensor_search import configs
+from marqo.s2_inference import s2_inference
+from marqo import errors
+from marqo.s2_inference import errors as s2_inference_errors
+
+
+# For use outside of this module
+def get_model_properties_from_index_defaults(index_defaults: Dict, model_name: str):
+    """ Gets model_properties from index defaults if available. Otherwise, it attempts to get it from the model registry.
+    """
+    try:
+        model_properties = index_defaults[NsField.model_properties]
+    except KeyError:
+        try:
+            model_properties = s2_inference.get_model_properties_from_registry(model_name)
+        except s2_inference_errors.UnknownModelError:
+            raise errors.InvalidArgError(
+                f"Could not find model properties for model={model_name}. "
+                f"Please check that the model name is correct. "
+                f"Please provide model_properties if the model is a custom model and is not supported by default")
+    return model_properties
+
 
 class IndexInfo(NamedTuple):
     """
     model_name: name of the ML model used to encode the data
     properties: keys are different index field names, values
         provide info about the properties
+    index_settings: settings for the index
     """
     model_name: str
     properties: dict
@@ -35,7 +57,17 @@ def get_text_properties(self) -> dict:
         This returns more than just pure text fields. For example: ints
         bool fields.
 
+        
+        NOTE: get_text_properties will flatten the object properties
+        Example: left-text_properties   right-true_text_properties
+        {'Description': {'type': 'text'},                                                         {'Description': {'type': 'text'},
+         'Genre': {'type': 'text'},                                                                'Genre': {'type': 'text'},
+         'Title': {'type': 'text'},                                                                'Title': {'type': 'text'},
+         'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, ----->         'my_combination_field.lexical_field': {'type': 'text'},
+                                                 'my_image': {'type': 'text'},                     'my_combination_field.my_image': {'type': 'text'},
+                                                 'some_text': {'type': 'text'}}}}                   'my_combination_field.some_text': {'type': 'text'}}
         """
+
         text_props_dict = {}
         for text_field, text_props in self.properties.items():
             if not text_field.startswith(enums.TensorField.vector_prefix) and not text_field in enums.TensorField.__dict__.values():
@@ -46,14 +78,12 @@ def get_text_properties(self) -> dict:
                             text_props_dict[f"{text_field}.{sub_field}"] = sub_field_props
         return text_props_dict
 
-        # get_text_properties will flatten the object properties
-        # Example: left-text_properties   right-true_text_properties
-        # {'Description': {'type': 'text'},                                                         {'Description': {'type': 'text'},
-        #  'Genre': {'type': 'text'},                                                                'Genre': {'type': 'text'},
-        #  'Title': {'type': 'text'},                                                                'Title': {'type': 'text'},
-        #  'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, ----->         'my_combination_field.lexical_field': {'type': 'text'},
-        #                                          'my_image': {'type': 'text'},                     'my_combination_field.my_image': {'type': 'text'},
-        #                                          'some_text': {'type': 'text'}}}}                   'my_combination_field.some_text': {'type': 'text'}}
+    def get_model_properties(self) -> dict:
+        index_defaults = self.index_settings["index_defaults"]
+        return get_model_properties_from_index_defaults(
+            index_defaults=index_defaults, model_name=self.model_name
+        )
+
 
     def get_true_text_properties(self) -> dict:
         """returns a dict containing only names and properties of fields that
@@ -79,16 +109,16 @@ def get_ann_parameters(self) -> Dict[str, Any]:
         
         """
         ann_default = configs.get_default_ann_parameters()
-        index_ann_defaults = self.index_settings[NsFields.index_defaults].get(NsFields.ann_parameters, {})
+        index_ann_defaults = self.index_settings[NsField.index_defaults].get(NsField.ann_parameters, {})
 
         # index defaults override generic defaults
         ann_params = {
             **ann_default,
             **index_ann_defaults
         }
-        ann_params[NsFields.ann_method_parameters] = {
-            **ann_default[NsFields.ann_method_parameters],
-            **index_ann_defaults.get(NsFields.ann_method_parameters, {})
+        ann_params[NsField.ann_method_parameters] = {
+            **ann_default[NsField.ann_method_parameters],
+            **index_ann_defaults.get(NsField.ann_method_parameters, {})
         }
 
         return ann_params