Skip to content

Commit 175b9ae

Browse files
authored
Merge pull request #1062 from julep-ai/x/hybrid-search
fix(memory-store): add a migration to fix duplication issue in hybrid…
2 parents 58e006f + 8860918 commit 175b9ae

File tree

2 files changed

+118
-0
lines changed

2 files changed

+118
-0
lines changed
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
-- Down migration: drops the search_hybrid function inside a single transaction.
--
-- NOTE(review): this only removes the function; it does not re-create any
-- earlier definition, so after a rollback search_hybrid no longer exists.
-- NOTE(review): no argument list is given, so PostgreSQL requires the name
-- search_hybrid to be unique in the schema -- confirm there are no overloads.
BEGIN;

DROP FUNCTION IF EXISTS search_hybrid;

COMMIT;
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,113 @@
1+
BEGIN;

-- Hybrid search function combining text and vector search.
--
-- Runs search_by_text and search_by_vector with an enlarged intermediate
-- limit (k * 4), merges both candidate sets into one row per distinct result,
-- normalizes the two score lists with dbsf_normalize and returns the k rows
-- with the smallest fused distance.
--
-- Parameters:
--   developer_id     forwarded to both searches and echoed in every result row
--   query_text       forwarded to search_by_text
--   query_embedding  forwarded to search_by_vector
--   owner_types      forwarded to both searches
--   owner_ids        forwarded to both searches
--   k                maximum number of rows returned; must be greater than 0
--   alpha            weight of the normalized embedding score; the text score
--                    is weighted with 1.0 - alpha
--   confidence       forwarded to search_by_vector
--   metadata_filter  forwarded to both searches
--   search_language  forwarded to search_by_text
--
-- Returns: SETOF doc_search_result ordered by distance ascending, where
--   distance = 1.0 - (text_weight * norm_text + embedding_weight * norm_embedding).
-- Raises: an exception when k <= 0.
--
-- NOTE(review): the fusion treats the `distance` column of both sub-searches
-- as a score where larger means a better match (the fused value is subtracted
-- from 1.0) -- confirm against search_by_text / search_by_vector.
-- NOTE(review): alpha is not range-checked; values outside [0, 1] produce a
-- negative weight for one of the two scores.
CREATE OR REPLACE FUNCTION search_hybrid (
    developer_id UUID,
    query_text text,
    query_embedding vector (1024),
    owner_types TEXT[],
    owner_ids UUID [],
    k integer DEFAULT 3,
    alpha float DEFAULT 0.7, -- Weight for embedding results
    confidence float DEFAULT 0.5,
    metadata_filter jsonb DEFAULT NULL,
    search_language text DEFAULT 'english'
) RETURNS SETOF doc_search_result AS $$
DECLARE
    text_weight float;
    embedding_weight float;
    intermediate_limit integer;
BEGIN
    -- Input validation
    IF k <= 0 THEN
        RAISE EXCEPTION 'k must be greater than 0';
    END IF;

    text_weight := 1.0 - alpha;
    embedding_weight := alpha;
    -- Get more intermediate results than final to allow for better fusion
    intermediate_limit := k * 4;

    RETURN QUERY
    WITH text_results AS (
        SELECT * FROM search_by_text(
            developer_id,
            query_text,
            owner_types,
            owner_ids,
            search_language,
            intermediate_limit, -- Use larger intermediate limit
            metadata_filter
        )
    ),
    embedding_results AS (
        SELECT * FROM search_by_vector(
            developer_id,
            query_embedding,
            owner_types,
            owner_ids,
            intermediate_limit, -- Use larger intermediate limit
            confidence,
            metadata_filter
        )
    ),
    -- Stack both candidate sets, keeping each source's score in its own
    -- column. UNION ALL is enough here: deduplication happens in `scores`.
    candidates AS (
        SELECT
            t.doc_id,
            t.index,
            t.title,
            t.content,
            t.metadata,
            t.embedding,
            t.owner_type,
            t.owner_id,
            t.distance AS text_score,
            NULL::double precision AS embedding_score
        FROM text_results AS t
        UNION ALL
        SELECT
            e.doc_id,
            e.index,
            e.title,
            e.content,
            e.metadata,
            e.embedding,
            e.owner_type,
            e.owner_id,
            NULL::double precision AS text_score,
            e.distance AS embedding_score
        FROM embedding_results AS e
    ),
    -- Exactly one row per distinct result. Grouping (instead of joining the
    -- candidate sets back on doc_id alone) prevents the row fan-out that
    -- occurred when one document contributed several chunks or owners.
    -- A result missing from one source gets a score of 0.0 for that source.
    scores AS (
        SELECT
            c.doc_id,
            c.index,
            c.title,
            c.content,
            c.metadata,
            c.embedding,
            c.owner_type,
            c.owner_id,
            COALESCE(MAX(c.text_score), 0.0) AS text_score,
            COALESCE(MAX(c.embedding_score), 0.0) AS embedding_score
        FROM candidates AS c
        GROUP BY
            c.doc_id,
            c.index,
            c.title,
            c.content,
            c.metadata,
            c.embedding,
            c.owner_type,
            c.owner_id
    ),
    -- Give every row a single position that is used both to build the score
    -- arrays and to read the normalized values back. This CTE is referenced
    -- twice, so PostgreSQL evaluates it once and both uses see the same
    -- numbering even if the ORDER BY keys tie.
    numbered AS (
        SELECT
            s.*,
            ROW_NUMBER() OVER (
                ORDER BY s.doc_id, s.index, s.owner_type, s.owner_id
            ) AS row_pos
        FROM scores AS s
    ),
    -- One row holding both normalized score arrays, ordered by row_pos.
    normalized AS (
        SELECT
            dbsf_normalize(array_agg(n.text_score ORDER BY n.row_pos)) AS text_scores,
            dbsf_normalize(array_agg(n.embedding_score ORDER BY n.row_pos)) AS embedding_scores
        FROM numbered AS n
    )
    SELECT
        -- Function parameter, qualified so it can never be mistaken for a column
        search_hybrid.developer_id,
        n.doc_id,
        n.index,
        n.title,
        n.content,
        1.0 - (
            text_weight * f.text_scores[n.row_pos]
            + embedding_weight * f.embedding_scores[n.row_pos]
        ) AS distance,
        n.embedding,
        n.metadata,
        n.owner_type,
        n.owner_id
    FROM numbered AS n
    CROSS JOIN normalized AS f
    -- doc_id / index break ties so that LIMIT k is deterministic
    ORDER BY distance ASC, n.doc_id, n.index
    LIMIT k;
END;
$$ LANGUAGE plpgsql;

COMMENT ON FUNCTION search_hybrid IS 'Hybrid search combining text and vector search using Distribution-Based Score Fusion (DBSF)';

COMMIT;

0 commit comments

Comments
 (0)