@@ -22,13 +22,18 @@ import {
22
22
} from "@mdn/browser-compat-data/types" ;
23
23
import { h2mSync } from "../markdown/index.js" ;
24
24
25
// Model name passed to createEmbedding() for the vector stored in the
// `embedding` column.
+ const EMBEDDING_MODEL = "text-embedding-ada-002" ;
26
// Model name passed to createEmbedding() for the vector stored in the
// `embedding_next` column. Call sites check this value for truthiness and
// store NULL when it is falsy.
+ const EMBEDDING_MODEL_NEXT = "text-embedding-3-small" ;
27
+
25
28
const { program } = caporal ;
26
29
27
30
// Shape of one row read from `mdn_doc_macro` by fetchAllExistingDocs().
// It carries hashes and presence flags only; the embedding vectors
// themselves are not loaded.
interface IndexedDoc {
28
31
id : number ;
29
32
mdn_url : string ;
30
33
title : string ;
31
34
token_count : number | null ;
35
// True when the row's `embedding` column IS NOT NULL.
+ has_embedding : boolean ;
36
// True when the row's `embedding_next` column IS NOT NULL.
+ has_embedding_next : boolean ;
32
37
// Compared against the freshly built document to detect formatting changes.
markdown_hash : string ;
33
38
// Compared against the freshly built document to detect content changes.
text_hash : string ;
34
39
}
@@ -43,6 +48,16 @@ interface Doc {
43
48
text_hash ?: string ;
44
49
}
45
50
51
// Fields of a Doc queued in `formattingUpdates`: pushed when the text hash
// is unchanged but the markdown hash differs or `updateFormatting` is set.
+ type FormattingUpdate = Pick <
52
+ Doc ,
53
+ "mdn_url" | "title" | "title_short" | "markdown" | "markdown_hash"
54
+ > ;
55
+
56
// Entry queued in `embeddingUpdates`: the document's URL and text plus the
// presence flags copied from the existing row, so the apply step can decide
// which embedding column(s) to write.
+ type EmbeddingUpdate = Pick < Doc , "mdn_url" | "text" > & {
57
+ has_embedding : boolean ;
58
+ has_embedding_next : boolean ;
59
+ } ;
60
+
46
61
export async function updateEmbeddings (
47
62
directory : string ,
48
63
updateFormatting : boolean
@@ -65,11 +80,11 @@ export async function updateEmbeddings(
65
80
apiKey : OPENAI_KEY ,
66
81
} ) ;
67
82
68
- const createEmbedding = async ( input : string ) => {
83
+ const createEmbedding = async ( input : string , model : string ) => {
69
84
let embeddingResponse : OpenAI . Embeddings . CreateEmbeddingResponse ;
70
85
try {
71
86
embeddingResponse = await openai . embeddings . create ( {
72
- model : "text-embedding-ada-002" ,
87
+ model,
73
88
input,
74
89
} ) ;
75
90
} catch ( { error : { message, type } , status } : any ) {
@@ -78,7 +93,7 @@ export async function updateEmbeddings(
78
93
) ;
79
94
// Try again with trimmed content.
80
95
embeddingResponse = await openai . embeddings . create ( {
81
- model : "text-embedding-ada-002" ,
96
+ model,
82
97
input : input . substring ( 0 , 15000 ) ,
83
98
} ) ;
84
99
}
@@ -106,7 +121,8 @@ export async function updateEmbeddings(
106
121
107
122
const seenUrls = new Set < string > ( ) ;
108
123
const updates : Doc [ ] = [ ] ;
109
- const formattingUpdates : Doc [ ] = [ ] ;
124
+ const formattingUpdates : FormattingUpdate [ ] = [ ] ;
125
+ const embeddingUpdates : EmbeddingUpdate [ ] = [ ] ;
110
126
111
127
for await ( const { mdn_url, title, title_short, markdown, text } of builtDocs (
112
128
directory
@@ -122,6 +138,7 @@ export async function updateEmbeddings(
122
138
. digest ( "base64" ) ;
123
139
124
140
if ( existingDoc ?. text_hash !== text_hash ) {
141
+ // Document added or content changed => (re)generate embeddings.
125
142
updates . push ( {
126
143
mdn_url,
127
144
title,
@@ -131,31 +148,55 @@ export async function updateEmbeddings(
131
148
text,
132
149
text_hash,
133
150
} ) ;
134
- } else if (
135
- updateFormatting ||
136
- existingDoc ?. markdown_hash !== markdown_hash
137
- ) {
138
- formattingUpdates . push ( {
139
- mdn_url,
140
- title,
141
- title_short,
142
- markdown,
143
- markdown_hash,
144
- } ) ;
151
+ } else {
152
+ if ( updateFormatting || existingDoc ?. markdown_hash !== markdown_hash ) {
153
+ // Document formatting changed => update markdown.
154
+ formattingUpdates . push ( {
155
+ mdn_url,
156
+ title,
157
+ title_short,
158
+ markdown,
159
+ markdown_hash,
160
+ } ) ;
161
+ }
162
+
163
+ if (
164
+ ! existingDoc . has_embedding ||
165
+ ! existingDoc . has_embedding_next !== ! EMBEDDING_MODEL_NEXT
166
+ ) {
167
+ // Embedding missing => add embeddings.
168
+ const { has_embedding, has_embedding_next } = existingDoc ;
169
+ embeddingUpdates . push ( {
170
+ mdn_url,
171
+ text,
172
+ has_embedding,
173
+ has_embedding_next,
174
+ } ) ;
175
+ }
145
176
}
146
177
}
147
178
148
179
console . log (
149
180
`-> ${ updates . length } (${ formattingUpdates . length } ) of ${ seenUrls . size } documents were changed or added (or formatted).`
150
181
) ;
182
+ if ( embeddingUpdates . length > 0 ) {
183
+ console . log (
184
+ `-> ${ embeddingUpdates . length } documents have outdated embeddings.`
185
+ ) ;
186
+ }
187
+
151
188
const deletions : IndexedDoc [ ] = [ ...existingDocByUrl . entries ( ) ]
152
189
. filter ( ( [ key ] ) => ! seenUrls . has ( key ) )
153
190
. map ( ( [ , value ] ) => value ) ;
154
191
console . log (
155
192
`-> ${ deletions . length } of ${ existingDocs . length } indexed documents were deleted (or moved).`
156
193
) ;
157
194
158
- if ( updates . length > 0 || formattingUpdates . length > 0 ) {
195
+ if (
196
+ updates . length > 0 ||
197
+ formattingUpdates . length > 0 ||
198
+ embeddingUpdates . length > 0
199
+ ) {
159
200
console . log ( `Applying updates...` ) ;
160
201
for ( const {
161
202
mdn_url,
@@ -170,7 +211,16 @@ export async function updateEmbeddings(
170
211
console . log ( `-> [${ mdn_url } ] Updating document...` ) ;
171
212
172
213
// Embedding for full document.
173
- const { total_tokens, embedding } = await createEmbedding ( text ) ;
214
+ const [ { total_tokens, embedding } , embedding_next ] = await Promise . all (
215
+ [
216
+ createEmbedding ( text , EMBEDDING_MODEL ) ,
217
+ EMBEDDING_MODEL_NEXT
218
+ ? createEmbedding ( text , EMBEDDING_MODEL_NEXT ) . then (
219
+ ( { embedding } ) => embedding
220
+ )
221
+ : null ,
222
+ ]
223
+ ) ;
174
224
175
225
// Create/update document record.
176
226
const query = {
@@ -184,9 +234,10 @@ export async function updateEmbeddings(
184
234
markdown_hash,
185
235
token_count,
186
236
embedding,
237
+ embedding_next,
187
238
text_hash
188
239
)
189
- VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
240
+ VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9 ) ON CONFLICT (mdn_url) DO
190
241
UPDATE
191
242
SET mdn_url = $1,
192
243
title = $2,
@@ -195,7 +246,8 @@ export async function updateEmbeddings(
195
246
markdown_hash = $5,
196
247
token_count = $6,
197
248
embedding = $7,
198
- text_hash = $8
249
+ embedding_next = $8,
250
+ text_hash = $9
199
251
` ,
200
252
values : [
201
253
mdn_url ,
@@ -205,6 +257,7 @@ export async function updateEmbeddings(
205
257
markdown_hash ,
206
258
total_tokens ,
207
259
pgvector . toSql ( embedding ) ,
260
+ embedding_next ? pgvector . toSql ( embedding_next ) : null ,
208
261
text_hash ,
209
262
] ,
210
263
rowMode : "array" ,
@@ -217,6 +270,7 @@ export async function updateEmbeddings(
217
270
console . error ( context ) ;
218
271
}
219
272
}
273
+
220
274
for ( const {
221
275
mdn_url,
222
276
title,
@@ -253,6 +307,57 @@ export async function updateEmbeddings(
253
307
console . error ( context ) ;
254
308
}
255
309
}
310
+
311
// Bring embedding columns back in sync for documents whose text did not
// change. Each document is handled independently: a failure is logged and
// the loop continues with the next one.
for (const {
  mdn_url,
  text,
  has_embedding,
  has_embedding_next,
} of embeddingUpdates) {
  try {
    console.log(`-> [${mdn_url}] Updating embeddings...`);

    if (!has_embedding) {
      const { total_tokens, embedding } = await createEmbedding(
        text,
        EMBEDDING_MODEL
      );

      const query = {
        name: "upsert-doc-embedding",
        // The table column is `token_count` (as in the document upsert and
        // the existing-docs select); `total_tokens` is only the name of the
        // value returned by createEmbedding().
        text: "UPDATE mdn_doc_macro SET token_count = $2, embedding = $3 WHERE mdn_url = $1",
        values: [
          mdn_url,
          total_tokens,
          embedding ? pgvector.toSql(embedding) : null,
        ],
        rowMode: "array",
      };

      await pgClient.query(query);
    }

    // Same condition that queued the document: generate the next embedding
    // when it is missing and a next model is configured, or clear a stale
    // one when no next model is configured. Skip when already in sync.
    if (has_embedding_next !== Boolean(EMBEDDING_MODEL_NEXT)) {
      const embedding = EMBEDDING_MODEL_NEXT
        ? (await createEmbedding(text, EMBEDDING_MODEL_NEXT)).embedding
        : null;

      const query = {
        name: "upsert-doc-embedding-next",
        text: "UPDATE mdn_doc_macro SET embedding_next = $2 WHERE mdn_url = $1",
        values: [mdn_url, embedding ? pgvector.toSql(embedding) : null],
        rowMode: "array",
      };

      await pgClient.query(query);
    }
  } catch (err: any) {
    console.error(`!> [${mdn_url}] Failed to add embeddings.`);
    // Prefer the API response payload when present; fall back to the error.
    const context = err?.response?.data ?? err?.response ?? err;
    console.error(context);
  }
}
360
+
256
361
console . log ( `-> Done.` ) ;
257
362
}
258
363
@@ -508,6 +613,8 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
508
613
mdn_url,
509
614
title,
510
615
token_count,
616
+ embedding IS NOT NULL as has_embedding,
617
+ embedding_next IS NOT NULL as has_embedding_next,
511
618
markdown_hash,
512
619
text_hash
513
620
from mdn_doc_macro
@@ -520,12 +627,23 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
520
627
} ;
521
628
const result = await pgClient . query ( query ) ;
522
629
return result . rows . map (
523
- ( [ id , mdn_url , title , token_count , markdown_hash , text_hash ] ) => {
630
+ ( [
631
+ id ,
632
+ mdn_url ,
633
+ title ,
634
+ token_count ,
635
+ has_embedding ,
636
+ has_embedding_next ,
637
+ markdown_hash ,
638
+ text_hash ,
639
+ ] ) => {
524
640
return {
525
641
id,
526
642
mdn_url,
527
643
title,
528
644
token_count,
645
+ has_embedding,
646
+ has_embedding_next,
529
647
markdown_hash,
530
648
text_hash,
531
649
} ;
0 commit comments