This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Unified search query syntax using the full-text search capabilities of the underlying DB #11635

Merged
59 commits, merged Oct 25, 2022
Commits
d49af47
use websearch_to_tsquery
novocaine Dec 22, 2021
fea2848
Support fallback using plainto_tsquery
novocaine Dec 22, 2021
ccb0d6c
cleanup
novocaine Dec 23, 2021
409afd6
add tests for search_rooms
novocaine Dec 23, 2021
ac002cb
deflake
novocaine Dec 23, 2021
bbbb651
improve docstring
novocaine Dec 23, 2021
02defb2
pass tsquery_func to _find_highlights_in_postgres
novocaine Dec 23, 2021
2eef5ed
Add tests for sqlite
novocaine Dec 23, 2021
28dc642
Don't preprocess sqllite queries
novocaine Dec 23, 2021
8ac3ea1
Also test the size of "results", as its generated by a different quer…
novocaine Dec 23, 2021
5198cb2
fix comment
novocaine Dec 23, 2021
b3c7e0d
Use plainto_tsquery instead of crafting the query ourselves
novocaine Dec 23, 2021
10f181a
black
novocaine Dec 23, 2021
905f572
Merge branch 'develop' of github.com:matrix-org/synapse into use-webs…
novocaine Dec 23, 2021
05a0cdd
Add feature file
novocaine Dec 23, 2021
b0417e9
Use common cases for all tests
novocaine Dec 23, 2021
04c394b
Merge branch 'develop' of github.com:novocaine/synapse into use-webse…
novocaine Jan 10, 2022
4cfb506
Merge branch 'develop' into use-websearch_to_tsquery-for-fts
novocaine May 26, 2022
d30a211
fix for removal of get_datastore()
novocaine May 26, 2022
68cae26
isort
novocaine May 26, 2022
5dbb2c0
add migration
novocaine May 26, 2022
64dd357
comment
novocaine May 26, 2022
20ab98e
black
novocaine May 26, 2022
e5aa916
isort
novocaine May 26, 2022
7941d8a
flake8
novocaine May 26, 2022
f7362f1
import List for python 3.7
novocaine May 26, 2022
f1769f9
create a background job, and don't do anything if the tokenizer is al…
novocaine May 26, 2022
de07c83
When creating a new db, create it with tokenize=porter in the first p…
novocaine May 26, 2022
1170e07
black
novocaine May 27, 2022
b987203
Revert change to 25/fts.py
novocaine May 27, 2022
6ef1ef8
fix json import
novocaine May 27, 2022
63c4270
slightly neater quote formatting
novocaine May 27, 2022
0ddf6b1
fix missing space
novocaine May 30, 2022
9d77bc4
Add a parser to produce uniform results on all DBs
novocaine May 30, 2022
b8b2e28
address flake8
novocaine May 30, 2022
8506b21
give mypy a hint
novocaine May 30, 2022
e279be5
Fix phrase handling of "word"
novocaine May 30, 2022
6a7cf49
Add comment
novocaine May 30, 2022
92ad70a
Merge branch 'develop' into use-websearch_to_tsquery-for-fts
novocaine May 31, 2022
a5f298b
document negation via -
novocaine May 31, 2022
d6ed19e
Merge remote-tracking branch 'origin/develop' into use-websearch_to_t…
clokep Oct 17, 2022
c544409
Move the database schema to the updated directory.
clokep Oct 17, 2022
be742f1
Use a deque.
clokep Oct 17, 2022
52700b0
Add basic tests for _tokenize_query.
clokep Oct 17, 2022
5d9d183
Handle edge-cases.
clokep Oct 17, 2022
219321b
temp
clokep Oct 18, 2022
9166035
Fix phrase handling.
clokep Oct 18, 2022
a7fd4f6
Handle not with a space after.
clokep Oct 18, 2022
6751f5f
Use the int version number to check if the feature is supported.
clokep Oct 18, 2022
6e6ebd9
Lint
clokep Oct 18, 2022
3272819
Simplify schema delta.
clokep Oct 18, 2022
4993e1c
Add docstring.
clokep Oct 18, 2022
9b7bb08
Remove support for parens.
clokep Oct 24, 2022
13f5306
Fix edge cases with double quotes.
clokep Oct 24, 2022
62f6f18
Lint.
clokep Oct 24, 2022
abc56ad
Merge remote-tracking branch 'origin/develop' into use-websearch_to_t…
clokep Oct 24, 2022
b3755ae
Remove backwards compat code for Postgres < 11 since (almost) EOL.
clokep Oct 25, 2022
5e5fc8d
Simplify phrase handling.
clokep Oct 25, 2022
223580a
Fix tests on postgres 10.
clokep Oct 25, 2022
1 change: 1 addition & 0 deletions changelog.d/11635.feature
@@ -0,0 +1 @@
Allow use of Postgres and SQLite full-text search operators in search queries.
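
An aside, not part of the changelog entry: a few example queries the unified syntax accepts, based on the `_tokenize_query` docstring further down this page. The sample terms are made up.

# Illustrative examples only; the exact semantics are defined by
# _tokenize_query in synapse/storage/databases/main/search.py below.
EXAMPLE_QUERIES = {
    "cat dog": "implicit AND: both words must appear",
    "cat or dog": "case-insensitive OR",
    '"brown fox"': "phrase query: the words must appear together",
    "fox -jumps": "a unary hyphen negates the following word",
}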
197 changes: 162 additions & 35 deletions synapse/storage/databases/main/search.py
@@ -11,10 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import enum
import logging
import re
from typing import TYPE_CHECKING, Any, Collection, Iterable, List, Optional, Set, Tuple
from collections import deque
from dataclasses import dataclass
from typing import (
TYPE_CHECKING,
Any,
Collection,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)

import attr

@@ -27,7 +39,7 @@
LoggingTransaction,
)
from synapse.storage.databases.main.events_worker import EventRedactBehaviour
from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
from synapse.storage.engines import PostgresEngine, Sqlite3Engine
from synapse.types import JsonDict

if TYPE_CHECKING:
@@ -421,8 +433,6 @@ async def search_msgs(
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args: List[Any] = []

# Make sure we don't explode because the person is in too many rooms.
@@ -444,20 +454,24 @@
count_clauses = clauses

if isinstance(self.database_engine, PostgresEngine):
search_query = search_term
tsquery_func = self.database_engine.tsquery_func
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) AS rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) AS rank,"
" room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
Review comment (Member): Can we move these to multiline strings while we're here 😇

Reply (Member): I was planning to do a follow-up PR to update the entire module to multi-line strings. Would that be acceptable?

Reply (Member): Sure!
)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?)"
f" WHERE vector @@ {tsquery_func}('english', ?)"
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):
search_query = _parse_query_for_sqlite(search_term)

sql = (
"SELECT rank(matchinfo(event_search)) as rank, room_id, event_id"
" FROM event_search"
@@ -469,7 +483,7 @@ async def search_msgs(
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ?"
)
count_args = [search_term] + count_args
Review comment (novocaine, Contributor Author, Dec 23, 2021): I think it was a bug to pass search_term here rather than search_query..?
count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
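
An illustrative sketch, not part of the diff: the ranking SQL this branch builds once `tsquery_func` has resolved to `websearch_to_tsquery` (Postgres 11+), with the user's raw term bound to both placeholders. The sample term is made up, and the room-filter clauses appended in the real code are omitted.

# Sketch assuming Postgres 11+ (tsquery_func == "websearch_to_tsquery").
search_term = 'quick "brown fox" -jumps'
sql = (
    "SELECT ts_rank_cd(vector, websearch_to_tsquery('english', ?)) AS rank,"
    " room_id, event_id"
    " FROM event_search"
    " WHERE vector @@ websearch_to_tsquery('english', ?)"
)
args = [search_term, search_term]  # the raw term is passed straight to Postgres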
@@ -501,7 +515,9 @@ async def search_msgs(

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

@@ -510,7 +526,6 @@ async def search_msgs(
)

count = sum(row["count"] for row in count_results if row["room_id"] in room_ids)

return {
"results": [
{"event": event_map[r["event_id"]], "rank": r["rank"]}
@@ -542,9 +557,6 @@ async def search_rooms(
Each match as a dictionary.
"""
clauses = []

search_query = _parse_query(self.database_engine, search_term)

args: List[Any] = []

# Make sure we don't explode because the person is in too many rooms.
@@ -582,20 +594,23 @@ async def search_rooms(
args.extend([origin_server_ts, origin_server_ts, stream])

if isinstance(self.database_engine, PostgresEngine):
search_query = search_term
tsquery_func = self.database_engine.tsquery_func
sql = (
"SELECT ts_rank_cd(vector, to_tsquery('english', ?)) as rank,"
f"SELECT ts_rank_cd(vector, {tsquery_func}('english', ?)) as rank,"
" origin_server_ts, stream_ordering, room_id, event_id"
" FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
args = [search_query, search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE vector @@ to_tsquery('english', ?) AND "
f" WHERE vector @@ {tsquery_func}('english', ?) AND "
)
count_args = [search_query] + count_args
elif isinstance(self.database_engine, Sqlite3Engine):

# We use CROSS JOIN here to ensure we use the right indexes.
# https://sqlite.org/optoverview.html#crossjoin
#
@@ -614,13 +629,14 @@ async def search_rooms(
" CROSS JOIN events USING (event_id)"
" WHERE "
)
search_query = _parse_query_for_sqlite(search_term)
args = [search_query] + args

count_sql = (
"SELECT room_id, count(*) as count FROM event_search"
" WHERE value MATCH ? AND "
)
count_args = [search_term] + count_args
count_args = [search_query] + count_args
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
@@ -660,7 +676,9 @@ async def search_rooms(

highlights = None
if isinstance(self.database_engine, PostgresEngine):
highlights = await self._find_highlights_in_postgres(search_query, events)
highlights = await self._find_highlights_in_postgres(
search_query, events, tsquery_func
)

count_sql += " GROUP BY room_id"

@@ -686,7 +704,7 @@
}

async def _find_highlights_in_postgres(
self, search_query: str, events: List[EventBase]
self, search_query: str, events: List[EventBase], tsquery_func: str
) -> Set[str]:
"""Given a list of events and a search term, return a list of words
that match from the content of the event.
@@ -697,6 +715,7 @@ async def _find_highlights_in_postgres(
Args:
search_query
events: A list of events
tsquery_func: The tsquery_* function to use when making queries

Returns:
A set of strings.
@@ -729,7 +748,7 @@ def f(txn: LoggingTransaction) -> Set[str]:
while stop_sel in value:
stop_sel += ">"

query = "SELECT ts_headline(?, to_tsquery('english', ?), %s)" % (
query = f"SELECT ts_headline(?, {tsquery_func}('english', ?), %s)" % (
_to_postgres_options(
{
"StartSel": start_sel,
@@ -760,20 +779,128 @@ def _to_postgres_options(options_dict: JsonDict) -> str:
return "'%s'" % (",".join("%s=%s" % (k, v) for k, v in options_dict.items()),)


def _parse_query(database_engine: BaseDatabaseEngine, search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to database.
We use this so that we can add prefix matching, which isn't something
that is supported by default.
@dataclass
class Phrase:
phrase: List[str]


class SearchToken(enum.Enum):
Not = enum.auto()
Or = enum.auto()
And = enum.auto()


Token = Union[str, Phrase, SearchToken]
TokenList = List[Token]


def _is_stop_word(word: str) -> bool:
# TODO Pull these out of the dictionary:
# https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop
return word in {"the", "a", "you", "me", "and", "but"}


def _tokenize_query(query: str) -> TokenList:
"""
Convert the user-supplied `query` into a TokenList, which can be translated into
some DB-specific syntax.

The following constructs are supported:

- phrase queries using "double quotes"
- case-insensitive `or` and `and` operators
- negation of a keyword via a unary hyphen, e.g. 'include -exclude'

The following differs from websearch_to_tsquery:

- Stop words are not removed.
- Unclosed phrases are treated differently.

"""
tokens: TokenList = []

# Find phrases.
in_phrase = False
parts = deque(query.split('"'))
for i, part in enumerate(parts):
# The contents inside double quotes are treated as a phrase; a trailing
# double quote is not implied.
in_phrase = bool(i % 2) and i != (len(parts) - 1)

# Pull out the individual words, discarding any non-word characters.
words = deque(re.findall(r"([\w\-]+)", part, re.UNICODE))

# Phrases have simplified handling of words.
if in_phrase:
# Skip stop words.
phrase = [word for word in words if not _is_stop_word(word)]

# Consecutive words are implicitly ANDed together.
if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
tokens.append(SearchToken.And)

# Add the phrase.
tokens.append(Phrase(phrase))
continue

# Otherwise, not in a phrase.
while words:
word = words.popleft()

if word.startswith("-"):
tokens.append(SearchToken.Not)

# If there's more of the word left, put it back to be processed again.
word = word[1:]
if word:
words.appendleft(word)
elif word.lower() == "or":
tokens.append(SearchToken.Or)
else:
# Skip stop words.
if _is_stop_word(word):
continue

# Consecutive words are implicitly ANDed together.
if tokens and tokens[-1] not in (SearchToken.Not, SearchToken.Or):
tokens.append(SearchToken.And)

# Add the search term.
tokens.append(word)

return tokens
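
A worked example, not part of the diff, of what `_tokenize_query` returns for a mixed query. The import path is assumed (these are private helpers in this module) and the sample query is made up.

from synapse.storage.databases.main.search import (  # assumed import path
    Phrase,
    SearchToken,
    _tokenize_query,
)

# Bare words are implicitly ANDed together, a leading '-' becomes NOT, and a
# bare 'or' becomes OR; the quoted words form a Phrase.
assert _tokenize_query('quick "brown fox" -jumps or lazy') == [
    "quick",
    SearchToken.And,
    Phrase(["brown", "fox"]),
    SearchToken.Not,
    "jumps",
    SearchToken.Or,
    "lazy",
]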


def _tokens_to_sqlite_match_query(tokens: TokenList) -> str:
"""
Convert the list of tokens to a string suitable for passing to sqlite's MATCH.
Assume sqlite was compiled with enhanced query syntax.

Ref: https://www.sqlite.org/fts3.html#full_text_index_queries
"""
match_query = []
for token in tokens:
if isinstance(token, str):
match_query.append(token)
elif isinstance(token, Phrase):
match_query.append('"' + " ".join(token.phrase) + '"')
elif token == SearchToken.Not:
# TODO: SQLite treats NOT as a *binary* operator. Hopefully a search
# term has already been added before this.
match_query.append(" NOT ")
elif token == SearchToken.Or:
match_query.append(" OR ")
elif token == SearchToken.And:
match_query.append(" AND ")
else:
raise ValueError(f"unknown token {token}")

return "".join(match_query)

# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)

if isinstance(database_engine, PostgresEngine):
return " & ".join(result + ":*" for result in results)
elif isinstance(database_engine, Sqlite3Engine):
return " & ".join(result + "*" for result in results)
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
def _parse_query_for_sqlite(search_term: str) -> str:
"""Takes a plain unicode string from the user and converts it into a form
that can be passed to SQLite's matchinfo().
"""
return _tokens_to_sqlite_match_query(_tokenize_query(search_term))
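
Continuing the same made-up query, an aside (not from the diff) showing the SQLite rendering produced by `_parse_query_for_sqlite`; the import path is again assumed.

from synapse.storage.databases.main.search import _parse_query_for_sqlite

# The tokens above serialise to SQLite's enhanced FTS query syntax.
assert (
    _parse_query_for_sqlite('quick "brown fox" -jumps or lazy')
    == 'quick AND "brown fox" NOT jumps OR lazy'
)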
16 changes: 16 additions & 0 deletions synapse/storage/engines/postgres.py
@@ -170,6 +170,22 @@ def supports_returning(self) -> bool:
"""Do we support the `RETURNING` clause in insert/update/delete?"""
return True

@property
def tsquery_func(self) -> str:
"""
Selects a tsquery_* func to use.

Ref: https://www.postgresql.org/docs/current/textsearch-controls.html

Returns:
The function name.
"""
# Postgres 11 added support for websearch_to_tsquery.
assert self._version is not None
if self._version >= 110000:
return "websearch_to_tsquery"
return "plainto_tsquery"

def is_deadlock(self, error: Exception) -> bool:
if isinstance(error, psycopg2.DatabaseError):
# https://www.postgresql.org/docs/current/static/errcodes-appendix.html