Skip to content

Commit f7cfaae

Browse files
authored
feat(ai-help): index text-embedding-3-model embeddings (#10818)
* feat(ai-help): index text-embedding-3-model embeddings
* fix(ai-help): extract constants with correct v3 model name
* feat(ai-help): add embedding backfill mechanism
* refactor(ai-help): avoid SQL generation
* chore(ai-help): set embedding_next = null unless EMBEDDING_MODEL_NEXT
* fix(ai-help): add embeddings also if formatting changed
* fixup! fix(ai-help): extract constants with correct v3 model name
* perf(ai-help): generate both embeddings in parallel
1 parent 1ffb026 commit f7cfaae

File tree

2 files changed

+140
-21
lines changed

2 files changed

+140
-21
lines changed

Diff for: scripts/ai-help-macros.ts

+138-20
Original file line number | Diff line number | Diff line change
@@ -22,13 +22,18 @@ import {
2222
} from "@mdn/browser-compat-data/types";
2323
import { h2mSync } from "../markdown/index.js";
2424

25+
const EMBEDDING_MODEL = "text-embedding-ada-002";
26+
const EMBEDDING_MODEL_NEXT = "text-embedding-3-small";
27+
2528
const { program } = caporal;
2629

2730
interface IndexedDoc {
2831
id: number;
2932
mdn_url: string;
3033
title: string;
3134
token_count: number | null;
35+
has_embedding: boolean;
36+
has_embedding_next: boolean;
3237
markdown_hash: string;
3338
text_hash: string;
3439
}
@@ -43,6 +48,16 @@ interface Doc {
4348
text_hash?: string;
4449
}
4550

51+
type FormattingUpdate = Pick<
52+
Doc,
53+
"mdn_url" | "title" | "title_short" | "markdown" | "markdown_hash"
54+
>;
55+
56+
type EmbeddingUpdate = Pick<Doc, "mdn_url" | "text"> & {
57+
has_embedding: boolean;
58+
has_embedding_next: boolean;
59+
};
60+
4661
export async function updateEmbeddings(
4762
directory: string,
4863
updateFormatting: boolean
@@ -65,11 +80,11 @@ export async function updateEmbeddings(
6580
apiKey: OPENAI_KEY,
6681
});
6782

68-
const createEmbedding = async (input: string) => {
83+
const createEmbedding = async (input: string, model: string) => {
6984
let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse;
7085
try {
7186
embeddingResponse = await openai.embeddings.create({
72-
model: "text-embedding-ada-002",
87+
model,
7388
input,
7489
});
7590
} catch ({ error: { message, type }, status }: any) {
@@ -78,7 +93,7 @@ export async function updateEmbeddings(
7893
);
7994
// Try again with trimmed content.
8095
embeddingResponse = await openai.embeddings.create({
81-
model: "text-embedding-ada-002",
96+
model,
8297
input: input.substring(0, 15000),
8398
});
8499
}
@@ -106,7 +121,8 @@ export async function updateEmbeddings(
106121

107122
const seenUrls = new Set<string>();
108123
const updates: Doc[] = [];
109-
const formattingUpdates: Doc[] = [];
124+
const formattingUpdates: FormattingUpdate[] = [];
125+
const embeddingUpdates: EmbeddingUpdate[] = [];
110126

111127
for await (const { mdn_url, title, title_short, markdown, text } of builtDocs(
112128
directory
@@ -122,6 +138,7 @@ export async function updateEmbeddings(
122138
.digest("base64");
123139

124140
if (existingDoc?.text_hash !== text_hash) {
141+
// Document added or content changed => (re)generate embeddings.
125142
updates.push({
126143
mdn_url,
127144
title,
@@ -131,31 +148,55 @@ export async function updateEmbeddings(
131148
text,
132149
text_hash,
133150
});
134-
} else if (
135-
updateFormatting ||
136-
existingDoc?.markdown_hash !== markdown_hash
137-
) {
138-
formattingUpdates.push({
139-
mdn_url,
140-
title,
141-
title_short,
142-
markdown,
143-
markdown_hash,
144-
});
151+
} else {
152+
if (updateFormatting || existingDoc?.markdown_hash !== markdown_hash) {
153+
// Document formatting changed => update markdown.
154+
formattingUpdates.push({
155+
mdn_url,
156+
title,
157+
title_short,
158+
markdown,
159+
markdown_hash,
160+
});
161+
}
162+
163+
if (
164+
!existingDoc.has_embedding ||
165+
!existingDoc.has_embedding_next !== !EMBEDDING_MODEL_NEXT
166+
) {
167+
// Embedding missing => add embeddings.
168+
const { has_embedding, has_embedding_next } = existingDoc;
169+
embeddingUpdates.push({
170+
mdn_url,
171+
text,
172+
has_embedding,
173+
has_embedding_next,
174+
});
175+
}
145176
}
146177
}
147178

148179
console.log(
149180
`-> ${updates.length} (${formattingUpdates.length}) of ${seenUrls.size} documents were changed or added (or formatted).`
150181
);
182+
if (embeddingUpdates.length > 0) {
183+
console.log(
184+
`-> ${embeddingUpdates.length} documents have outdated embeddings.`
185+
);
186+
}
187+
151188
const deletions: IndexedDoc[] = [...existingDocByUrl.entries()]
152189
.filter(([key]) => !seenUrls.has(key))
153190
.map(([, value]) => value);
154191
console.log(
155192
`-> ${deletions.length} of ${existingDocs.length} indexed documents were deleted (or moved).`
156193
);
157194

158-
if (updates.length > 0 || formattingUpdates.length > 0) {
195+
if (
196+
updates.length > 0 ||
197+
formattingUpdates.length > 0 ||
198+
embeddingUpdates.length > 0
199+
) {
159200
console.log(`Applying updates...`);
160201
for (const {
161202
mdn_url,
@@ -170,7 +211,16 @@ export async function updateEmbeddings(
170211
console.log(`-> [${mdn_url}] Updating document...`);
171212

172213
// Embedding for full document.
173-
const { total_tokens, embedding } = await createEmbedding(text);
214+
const [{ total_tokens, embedding }, embedding_next] = await Promise.all(
215+
[
216+
createEmbedding(text, EMBEDDING_MODEL),
217+
EMBEDDING_MODEL_NEXT
218+
? createEmbedding(text, EMBEDDING_MODEL_NEXT).then(
219+
({ embedding }) => embedding
220+
)
221+
: null,
222+
]
223+
);
174224

175225
// Create/update document record.
176226
const query = {
@@ -184,9 +234,10 @@ export async function updateEmbeddings(
184234
markdown_hash,
185235
token_count,
186236
embedding,
237+
embedding_next,
187238
text_hash
188239
)
189-
VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
240+
VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (mdn_url) DO
190241
UPDATE
191242
SET mdn_url = $1,
192243
title = $2,
@@ -195,7 +246,8 @@ export async function updateEmbeddings(
195246
markdown_hash = $5,
196247
token_count = $6,
197248
embedding = $7,
198-
text_hash = $8
249+
embedding_next = $8,
250+
text_hash = $9
199251
`,
200252
values: [
201253
mdn_url,
@@ -205,6 +257,7 @@ export async function updateEmbeddings(
205257
markdown_hash,
206258
total_tokens,
207259
pgvector.toSql(embedding),
260+
embedding_next ? pgvector.toSql(embedding_next) : null,
208261
text_hash,
209262
],
210263
rowMode: "array",
@@ -217,6 +270,7 @@ export async function updateEmbeddings(
217270
console.error(context);
218271
}
219272
}
273+
220274
for (const {
221275
mdn_url,
222276
title,
@@ -253,6 +307,57 @@ export async function updateEmbeddings(
253307
console.error(context);
254308
}
255309
}
310+
311+
for (const {
312+
mdn_url,
313+
text,
314+
has_embedding,
315+
has_embedding_next,
316+
} of embeddingUpdates) {
317+
try {
318+
console.log(`-> [${mdn_url}] Updating embeddings...`);
319+
320+
if (!has_embedding) {
321+
const { total_tokens, embedding } = await createEmbedding(
322+
text,
323+
EMBEDDING_MODEL
324+
);
325+
326+
const query = {
327+
name: "upsert-doc-embedding",
328+
text: "UPDATE mdn_doc_macro SET total_tokens = $2, embedding = $3 WHERE mdn_url = $1",
329+
values: [
330+
mdn_url,
331+
total_tokens,
332+
embedding ? pgvector.toSql(embedding) : null,
333+
],
334+
rowMode: "array",
335+
};
336+
337+
await pgClient.query(query);
338+
}
339+
340+
if (!has_embedding_next) {
341+
const embedding = EMBEDDING_MODEL_NEXT
342+
? (await createEmbedding(text, EMBEDDING_MODEL_NEXT)).embedding
343+
: null;
344+
345+
const query = {
346+
name: "upsert-doc-embedding-next",
347+
text: "UPDATE mdn_doc_macro SET embedding_next = $2 WHERE mdn_url = $1",
348+
values: [mdn_url, embedding ? pgvector.toSql(embedding) : null],
349+
rowMode: "array",
350+
};
351+
352+
await pgClient.query(query);
353+
}
354+
} catch (err: any) {
355+
console.error(`!> [${mdn_url}] Failed to add embeddings.`);
356+
const context = err?.response?.data ?? err?.response ?? err;
357+
console.error(context);
358+
}
359+
}
360+
256361
console.log(`-> Done.`);
257362
}
258363

@@ -508,6 +613,8 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
508613
mdn_url,
509614
title,
510615
token_count,
616+
embedding IS NOT NULL as has_embedding,
617+
embedding_next IS NOT NULL as has_embedding_next,
511618
markdown_hash,
512619
text_hash
513620
from mdn_doc_macro
@@ -520,12 +627,23 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
520627
};
521628
const result = await pgClient.query(query);
522629
return result.rows.map(
523-
([id, mdn_url, title, token_count, markdown_hash, text_hash]) => {
630+
([
631+
id,
632+
mdn_url,
633+
title,
634+
token_count,
635+
has_embedding,
636+
has_embedding_next,
637+
markdown_hash,
638+
text_hash,
639+
]) => {
524640
return {
525641
id,
526642
mdn_url,
527643
title,
528644
token_count,
645+
has_embedding,
646+
has_embedding_next,
529647
markdown_hash,
530648
text_hash,
531649
};

Diff for: scripts/ai-help.sql

+2-1
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,8 @@ create table
3434
html text null,
3535
markdown text null,
3636
token_count integer null,
37-
embedding extensions.vector null,
37+
embedding extensions.vector(1536) null,
38+
embedding_next extensions.vector(1536) null,
3839
text_hash text null,
3940
constraint mdn_doc_macro_pkey primary key (id),
4041
constraint mdn_doc_macro_url_key unique (mdn_url)

0 commit comments

Comments
 (0)