Skip to content

Commit

Permalink
Single KNN Field Optimisation (#530)
Browse files Browse the repository at this point in the history
* using a single kNN opensearch field

* made end-to-end case work

* added debug statements for OpenSearch requests

* Added brackets around combined opensearch filter

* fixed bug with no searchable attributes

* changed marqo version

* single knn-field: test_search.py passes (#526)

* made test_search pass

* Updated CUDA version to match mainline

* removed debug message in OS

* fixed add docs, backend, create index unit tests

* pagination error handling

* removed debug message

* fixed bulk search and search tests

* fixed broken tests

* fixed index meta cache tests, add knn field validation

* empty results for empty searchable attr lexical

* finished pagination tests

* refactored filtering, get_model_properties, index creation tests

* fixed generic model error tests

* added knn field tests

* updated filtering tests

* added changes to contextualise_user_filter

* filtering unit tests pass, double backslash escape fixed

* added backslash escaping to Lucene sanitise function

fixed tests

* fixed contextualise edge cases with field at start of filter

* fixed contextualise edge cases with field at start of filter

* more edge cases for filtering

* edge case fixed in contextualise, also added draft regex solution

* fixed wrong string length skip

* removed unused code, updated name of contextualise

* fixed bug when content has field name by adding colon req

* gitignore vscode directory

* removed vscode settings json

* improved function docstrings

---------

Co-authored-by: Joshua Kim <[email protected]>
  • Loading branch information
pandu-k and vicilliar authored Jul 16, 2023
1 parent e8d078c commit aca7cd6
Show file tree
Hide file tree
Showing 26 changed files with 1,345 additions and 463 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,5 @@ src/marqo/tensor_search/test_throttle_timing.txt
src/marqo/tensor_search/test_throttle_timing.csv
dump.rdb

# VSCode
.vscode/
19 changes: 2 additions & 17 deletions src/marqo/tensor_search/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_index_info(config: Config, index_name: str) -> IndexInfo:

def add_customer_field_properties(config: Config, index_name: str,
customer_field_names: Iterable[Tuple[str, enums.OpenSearchDataType]],
model_properties: dict, multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
multimodal_combination_fields: Dict[str, Iterable[Tuple[str, enums.OpenSearchDataType]]]):
"""Adds new customer fields to index mapping.
Pushes the updated mapping to OpenSearch, and updates the local cache.
Expand All @@ -80,25 +80,11 @@ def add_customer_field_properties(config: Config, index_name: str,
"""
existing_info = get_cached_index_info(config=config, index_name=index_name)

# check if there is multimodal fie;ds and convert the fields name to a list with the same
# format of customer_field_names
knn_field_names = copy.deepcopy(customer_field_names)
if len(multimodal_combination_fields) > 0:
multimodal_customer_field_names = set([(field_name, "_") for field_name in list(multimodal_combination_fields)])
knn_field_names = knn_field_names.union(multimodal_customer_field_names)

body = {
"properties": {
enums.TensorField.chunks: {
"type": "nested",
"properties": {
validation.validate_vector_name(
utils.generate_vector_name(field_name[0])): {
"type": "knn_vector",
"dimension": model_properties["dimensions"],
"method": existing_info.get_ann_parameters()
} for field_name in knn_field_names
}
"properties": {}
}
}
}
Expand Down Expand Up @@ -146,7 +132,6 @@ def add_customer_field_properties(config: Config, index_name: str,
"type": type_to_set
}


for multimodal_field, child_fields in multimodal_combination_fields.items():
# update the new multimodal_field if it's not in it
if multimodal_field not in new_index_properties:
Expand Down
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def get_default_index_settings():
NsFields.ann_parameters: get_default_ann_parameters()
},
NsFields.number_of_shards: 5,
NsFields.number_of_replicas : 1,
NsFields.number_of_replicas: 1,
}

def get_default_ann_parameters():
Expand Down
11 changes: 11 additions & 0 deletions src/marqo/tensor_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,14 @@
NON_TENSORISABLE_FIELD_TYPES = [int, float, bool, list]

ALLOWED_MULTIMODAL_FIELD_TYPES = [str]

# Lucene's officially listed special characters and operators. Note that
# '&&' and '||' are two-character operators, not single characters.
LUCENE_SPECIAL_CHARS = {
    '+', '-', '&&', '||', '!', '(', ')', '{', '}', '[', ']',
    '^', '"', '~', '*', '?', ':', '\\', '/',
}

# These chars are not officially listed as Lucene special chars, but
# aren't treated as normal chars either.
NON_OFFICIAL_LUCENE_SPECIAL_CHARS = {' '}
1 change: 1 addition & 0 deletions src/marqo/tensor_search/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class TensorField:
chunk_ids = "__chunk_ids"
# the prefix will have the customer's field name appended to the end of it
vector_prefix = "__vector_"
marqo_knn_field = "__vector_marqo_knn_field"
chunks = "__chunks"
output_highlights = "_highlights"
output_score = "_score"
Expand Down
146 changes: 146 additions & 0 deletions src/marqo/tensor_search/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import os
import typing
from timeit import default_timer as timer
from marqo import errors
from marqo.tensor_search import enums, configs, constants
from typing import (
List, Optional, Union, Callable, Iterable, Sequence, Dict, Tuple
)
from marqo.marqo_logging import logger
import copy
from marqo.tensor_search.enums import EnvVars
import re

def build_tensor_search_filter(
        filter_string: str, simple_properties: dict,
        searchable_attribs: Sequence):
    """Combine the user's filter with a searchable-attributes filter.

    Produces a single Lucene-DSL filter string for OpenSearch.

    Args:
        filter_string: user input string to filter results on. Special
            chars must already be escaped by the user.
        simple_properties: dict whose keys are the index's fields. Used to
            add the chunks prefix to fields referenced in filter_string.
        searchable_attribs: user input list of attributes to search on, or
            None. It is turned into a filter clause of its own.

    Returns:
        "(<attributes clause>) AND (<user clause>)" when both clauses are
        non-empty; otherwise whichever clause is non-empty, or "" when
        neither is.
    """
    attribs_clause = ""
    if searchable_attribs is not None:
        # Work on a deep copy so the caller's sequence is never touched by
        # the clause builder.
        attribs_clause = build_searchable_attributes_filter(
            searchable_attribs=copy.deepcopy(searchable_attribs))

    user_clause = add_chunks_prefix_to_filter_string_fields(
        filter_string=filter_string, simple_properties=simple_properties)

    if not (user_clause and attribs_clause):
        # At most one clause is non-empty, so plain concatenation yields it.
        return f"{attribs_clause}{user_clause}"

    # Brackets keep each clause's own operators from mixing with the AND.
    return f"({attribs_clause}) AND ({user_clause})"


def build_searchable_attributes_filter(searchable_attribs: Sequence) -> str:
    """Build the filter that narrows a search to specific searchable attributes.

    Each attribute becomes one "<chunks>.<field_name>:(<escaped attribute>)"
    clause, and the clauses are joined with " OR ". Clauses are emitted from
    the last attribute to the first.

    The input is not modified, so callers do not need to copy it first, and
    read-only sequences such as tuples are accepted.

    Args:
        searchable_attribs: attribute names to restrict the search to.
            May be None or empty.

    Returns:
        The combined filter string, or "" when there are no attributes.
    """
    if searchable_attribs is None or len(searchable_attribs) == 0:
        return ""

    field_key = f"{enums.TensorField.chunks}.{enums.TensorField.field_name}"

    # Brackets surround each field name, in case it contains a space.
    # Snapshot into a list and walk it backwards: this preserves the
    # last-to-first clause order without popping from the caller's object.
    clauses = [
        f"{field_key}:({sanitise_lucene_special_chars(attrib)})"
        for attrib in list(searchable_attribs)[::-1]
    ]
    return " OR ".join(clauses)


def sanitise_lucene_special_chars(to_be_sanitised: str) -> str:
    """Backslash-escape Lucene's special chars in a string.

    This is meant for filter strings that are constructed for users behind
    the scenes (such as for searchable attributes). It should not be applied
    to the user's own filter string, because users may choose to escape
    Lucene's special chars themselves.

    See here for more info:
    https://lucene.apache.org/core/6_0_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#Escaping_Special_Characters

    Args:
        to_be_sanitised: the raw string to escape.

    Returns:
        The string with every special char preceded by a backslash.
    """
    # Backslashes must be doubled before anything else is escaped.
    sanitised = to_be_sanitised.replace("\\", "\\\\")

    # Leave the backslash out of the remaining pass; otherwise the escapes
    # inserted below (and the ones doubled above) would be escaped again.
    all_special = constants.LUCENE_SPECIAL_CHARS.union(constants.NON_OFFICIAL_LUCENE_SPECIAL_CHARS)
    still_to_escape = all_special - {'\\'}

    for special in still_to_escape:
        sanitised = sanitised.replace(special, '\\' + special)

    return sanitised


def add_chunks_prefix_to_filter_string_fields(filter_string: Optional[str], simple_properties: typing.Iterable) -> str:
    """Prefix known field names in a user filter string with the chunks path.

    This allows for filtering within chunks. Because the filter is
    user-defined, users who filter on field names containing special
    characters are expected to escape those characters themselves, in exactly
    the same way that sanitise_lucene_special_chars does; otherwise the field
    name is not recognised here and is left unprefixed.

    A field name is only prefixed when it is immediately followed by a colon
    and is either at the very start of the filter string or directly preceded
    by a space or an opening parenthesis that is not itself preceded by a
    backslash.

    Args:
        filter_string: the user defined filter string, or None.
        simple_properties: simple properties of an index (such as text or
            floats and bools). Iterating it must yield field names.

    Returns:
        A string where the properties are referenced as children of a chunk.
        '' when filter_string is None.

    Raises:
        errors.InternalError: if simple_properties is None.
    """
    if simple_properties is None:
        # If an index has no simple properties, simple_properties should be {}, but never None
        raise errors.InternalError("simple properties of an index can never be None!")

    if filter_string is None:
        return ''

    result = filter_string

    for field in simple_properties:
        escaped_name = sanitise_lucene_special_chars(field)
        if escaped_name not in filter_string:
            # Cheap pre-check: this field cannot occur anywhere in the filter.
            continue

        # The field name MUST be followed by a colon. Without it the match
        # could be field content, or the leading part of a longer field name
        # (e.g. "field_a" inside "field_a_excess_chars:a").
        needle = f'{escaped_name}:'
        chunks_prefix = f"{enums.TensorField.chunks}."

        # Case 0: the field opens the (original) filter string. Prefix ONCE.
        if filter_string.startswith(needle):
            result = f'{chunks_prefix}{result}'

        # Then handle every occurrence that is NOT at the start of the
        # string. This scan runs even if the field was also at the start.
        boundary_chars = {" ", "("}
        cursor = 0
        while cursor < len(result):
            # Short-circuit order matters: the look-behind indexing is only
            # reached once cursor > 0, and the "two chars back" escape check
            # is skipped when there is only one char before the match.
            if (
                result.startswith(needle, cursor)
                and cursor > 0
                and result[cursor - 1] in boundary_chars
                and (cursor == 1 or result[cursor - 2] != "\\")
            ):
                result = result[:cursor] + chunks_prefix + result[cursor:]
                # Skip checking the newly inserted part.
                cursor += len(chunks_prefix)
            cursor += 1

    return result
2 changes: 1 addition & 1 deletion src/marqo/tensor_search/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _clean_doc(doc: dict, doc_id=None, include_vectors: bool = False) -> dict:
if include_vectors:
copied[TensorField.tensor_facets] = [
{ch[TensorField.field_name]: ch[TensorField.field_content],
TensorField.embedding: ch[utils.generate_vector_name(ch[TensorField.field_name])]
TensorField.embedding: ch[TensorField.marqo_knn_field]
} for ch in copied[TensorField.chunks]
]
if TensorField.chunks in copied:
Expand Down
56 changes: 43 additions & 13 deletions src/marqo/tensor_search/models/index_info.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
import pprint
from typing import NamedTuple, Any, Dict
from marqo.tensor_search import enums
from marqo.tensor_search.enums import IndexSettingsField as NsFields
from marqo.tensor_search.enums import IndexSettingsField as NsField
from marqo.tensor_search import configs
from marqo.s2_inference import s2_inference
from marqo import errors
from marqo.s2_inference import errors as s2_inference_errors


# For use outside of this module
def get_model_properties_from_index_defaults(index_defaults: Dict, model_name: str):
    """Get model_properties from index defaults, falling back to the model registry.

    Args:
        index_defaults: the index's defaults. If it contains the
            model_properties key, that value is returned as-is.
        model_name: name of the model to look up in the registry when
            index_defaults holds no model properties.

    Returns:
        The model properties found in index_defaults or in the registry.

    Raises:
        errors.InvalidArgError: if index_defaults has no model properties
            and the registry does not know model_name. The registry's
            UnknownModelError is attached as the cause.
    """
    try:
        return index_defaults[NsField.model_properties]
    except KeyError:
        # No explicit properties on the index. Leave the handler before
        # consulting the registry so the KeyError does not end up as the
        # context of any error raised below.
        pass

    try:
        return s2_inference.get_model_properties_from_registry(model_name)
    except s2_inference_errors.UnknownModelError as err:
        raise errors.InvalidArgError(
            f"Could not find model properties for model={model_name}. "
            f"Please check that the model name is correct. "
            f"Please provide model_properties if the model is a custom model and is not supported by default"
        ) from err


class IndexInfo(NamedTuple):
"""
model_name: name of the ML model used to encode the data
properties: keys are different index field names, values
provide info about the properties
index_settings: settings for the index
"""
model_name: str
properties: dict
Expand All @@ -35,7 +57,17 @@ def get_text_properties(self) -> dict:
This returns more than just pure text fields. For example: ints
bool fields.
NOTE: get_text_properties will flatten the object properties
Example: left-text_properties right-true_text_properties
{'Description': {'type': 'text'}, {'Description': {'type': 'text'},
'Genre': {'type': 'text'}, 'Genre': {'type': 'text'},
'Title': {'type': 'text'}, 'Title': {'type': 'text'},
'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, -----> 'my_combination_field.lexical_field': {'type': 'text'},
'my_image': {'type': 'text'}, 'my_combination_field.my_image': {'type': 'text'},
'some_text': {'type': 'text'}}}} 'my_combination_field.some_text': {'type': 'text'}}
"""

text_props_dict = {}
for text_field, text_props in self.properties.items():
if not text_field.startswith(enums.TensorField.vector_prefix) and not text_field in enums.TensorField.__dict__.values():
Expand All @@ -46,14 +78,12 @@ def get_text_properties(self) -> dict:
text_props_dict[f"{text_field}.{sub_field}"] = sub_field_props
return text_props_dict

# get_text_properties will flatten the object properties
# Example: left-text_properties right-true_text_properties
# {'Description': {'type': 'text'}, {'Description': {'type': 'text'},
# 'Genre': {'type': 'text'}, 'Genre': {'type': 'text'},
# 'Title': {'type': 'text'}, 'Title': {'type': 'text'},
# 'my_combination_field': {'properties': {'lexical_field': {'type': 'text'}, -----> 'my_combination_field.lexical_field': {'type': 'text'},
# 'my_image': {'type': 'text'}, 'my_combination_field.my_image': {'type': 'text'},
# 'some_text': {'type': 'text'}}}} 'my_combination_field.some_text': {'type': 'text'}}
def get_model_properties(self) -> dict:
index_defaults = self.index_settings["index_defaults"]
return get_model_properties_from_index_defaults(
index_defaults=index_defaults, model_name=self.model_name
)


def get_true_text_properties(self) -> dict:
"""returns a dict containing only names and properties of fields that
Expand All @@ -79,16 +109,16 @@ def get_ann_parameters(self) -> Dict[str, Any]:
"""
ann_default = configs.get_default_ann_parameters()
index_ann_defaults = self.index_settings[NsFields.index_defaults].get(NsFields.ann_parameters, {})
index_ann_defaults = self.index_settings[NsField.index_defaults].get(NsField.ann_parameters, {})

# index defaults override generic defaults
ann_params = {
**ann_default,
**index_ann_defaults
}
ann_params[NsFields.ann_method_parameters] = {
**ann_default[NsFields.ann_method_parameters],
**index_ann_defaults.get(NsFields.ann_method_parameters, {})
ann_params[NsField.ann_method_parameters] = {
**ann_default[NsField.ann_method_parameters],
**index_ann_defaults.get(NsField.ann_method_parameters, {})
}

return ann_params
Loading

0 comments on commit aca7cd6

Please sign in to comment.