@@ -28,19 +28,17 @@ interface IndexedDoc {
28
28
id : number ;
29
29
mdn_url : string ;
30
30
title : string ;
31
- title_short : string ;
32
31
token_count : number | null ;
33
- hash : string ;
32
+ markdown_hash : string ;
34
33
text_hash : string ;
35
34
}
36
35
37
36
interface Doc {
38
37
mdn_url : string ;
39
38
title : string ;
40
39
title_short : string ;
41
- hash : string ;
42
- html : string ;
43
40
markdown : string ;
41
+ markdown_hash : string ;
44
42
text ?: string ;
45
43
text_hash ?: string ;
46
44
}
@@ -110,41 +108,39 @@ export async function updateEmbeddings(
110
108
const updates : Doc [ ] = [ ] ;
111
109
const formattingUpdates : Doc [ ] = [ ] ;
112
110
113
- for await ( const {
114
- mdn_url,
115
- title,
116
- title_short,
117
- hash,
118
- html,
119
- markdown,
120
- text,
121
- } of builtDocs ( directory ) ) {
111
+ for await ( const { mdn_url, title, title_short, markdown, text } of builtDocs (
112
+ directory
113
+ ) ) {
122
114
seenUrls . add ( mdn_url ) ;
123
115
124
116
// Check for existing document in DB and compare checksums.
125
117
const existingDoc = existingDocByUrl . get ( mdn_url ) ;
126
118
127
119
const text_hash = createHash ( "sha256" ) . update ( text ) . digest ( "base64" ) ;
120
+ const markdown_hash = createHash ( "sha256" )
121
+ . update ( markdown )
122
+ . digest ( "base64" ) ;
128
123
129
124
if ( existingDoc ?. text_hash !== text_hash ) {
130
125
updates . push ( {
131
126
mdn_url,
132
127
title,
133
128
title_short,
134
- hash,
135
- html,
136
129
markdown,
130
+ markdown_hash,
137
131
text,
138
132
text_hash,
139
133
} ) ;
140
- } else if ( updateFormatting || existingDoc ?. hash !== hash ) {
134
+ } else if (
135
+ updateFormatting ||
136
+ existingDoc ?. markdown_hash !== markdown_hash
137
+ ) {
141
138
formattingUpdates . push ( {
142
139
mdn_url,
143
140
title,
144
141
title_short,
145
- hash,
146
- html,
147
142
markdown,
143
+ markdown_hash,
148
144
} ) ;
149
145
}
150
146
}
@@ -165,9 +161,8 @@ export async function updateEmbeddings(
165
161
mdn_url,
166
162
title,
167
163
title_short,
168
- hash,
169
- html,
170
164
markdown,
165
+ markdown_hash,
171
166
text,
172
167
text_hash,
173
168
} of updates ) {
@@ -185,32 +180,29 @@ export async function updateEmbeddings(
185
180
mdn_url,
186
181
title,
187
182
title_short,
188
- hash,
189
- html,
190
183
markdown,
184
+ markdown_hash,
191
185
token_count,
192
186
embedding,
193
187
text_hash
194
188
)
195
- VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9 ) ON CONFLICT (mdn_url) DO
189
+ VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
196
190
UPDATE
197
191
SET mdn_url = $1,
198
192
title = $2,
199
193
title_short = $3,
200
- hash = $4,
201
- html = $5,
202
- markdown = $6,
203
- token_count = $7,
204
- embedding = $8,
205
- text_hash = $9
194
+ markdown = $4,
195
+ markdown_hash = $5,
196
+ token_count = $6,
197
+ embedding = $7,
198
+ text_hash = $8
206
199
` ,
207
200
values : [
208
201
mdn_url ,
209
202
title ,
210
203
title_short ,
211
- hash ,
212
- html ,
213
204
markdown ,
205
+ markdown_hash ,
214
206
total_tokens ,
215
207
pgvector . toSql ( embedding ) ,
216
208
text_hash ,
@@ -229,9 +221,8 @@ export async function updateEmbeddings(
229
221
mdn_url,
230
222
title,
231
223
title_short,
232
- hash,
233
- html,
234
224
markdown,
225
+ markdown_hash,
235
226
} of formattingUpdates ) {
236
227
try {
237
228
console . log (
@@ -242,17 +233,16 @@ export async function updateEmbeddings(
242
233
const query = {
243
234
name : "upsert-doc" ,
244
235
text : `
245
- INSERT INTO mdn_doc_macro(mdn_url, title, title_short, hash, html, markdown )
246
- VALUES($1, $2, $3, $4, $5, $6 ) ON CONFLICT (mdn_url) DO
236
+ INSERT INTO mdn_doc_macro(mdn_url, title, title_short, markdown, markdown_hash )
237
+ VALUES($1, $2, $3, $4, $5) ON CONFLICT (mdn_url) DO
247
238
UPDATE
248
239
SET mdn_url = $1,
249
240
title = $2,
250
241
title_short = $3,
251
- hash = $4,
252
- html = $5,
253
- markdown = $6
242
+ markdown = $4,
243
+ markdown_hash = $5
254
244
` ,
255
- values : [ mdn_url , title , title_short , hash , html , markdown ] ,
245
+ values : [ mdn_url , title , title_short , markdown , markdown_hash ] ,
256
246
rowMode : "array" ,
257
247
} ;
258
248
@@ -285,8 +275,8 @@ export async function updateEmbeddings(
285
275
}
286
276
287
277
async function formatDocs ( directory : string ) {
288
- for await ( const { html , markdown, text } of builtDocs ( directory ) ) {
289
- console . log ( html , markdown , text ) ;
278
+ for await ( const { markdown, text } of builtDocs ( directory ) ) {
279
+ console . log ( markdown , text ) ;
290
280
}
291
281
}
292
282
@@ -340,7 +330,6 @@ async function* builtDocs(directory: string) {
340
330
title,
341
331
title_short : short_title || title ,
342
332
hash,
343
- html,
344
333
markdown,
345
334
text,
346
335
} ;
@@ -509,7 +498,7 @@ export function isNotSupportedAtAll(support: SimpleSupportStatement) {
509
498
return ! support . version_added && ! hasLimitation ( support ) ;
510
499
}
511
500
512
- async function fetchAllExistingDocs ( pgClient ) {
501
+ async function fetchAllExistingDocs ( pgClient ) : Promise < IndexedDoc [ ] > {
513
502
const PAGE_SIZE = 1000 ;
514
503
const selectDocs = async ( lastId ) => {
515
504
const query = {
@@ -518,8 +507,8 @@ async function fetchAllExistingDocs(pgClient) {
518
507
SELECT id,
519
508
mdn_url,
520
509
title,
521
- hash,
522
510
token_count,
511
+ markdown_hash,
523
512
text_hash
524
513
from mdn_doc_macro
525
514
WHERE id > $1
@@ -531,8 +520,15 @@ async function fetchAllExistingDocs(pgClient) {
531
520
} ;
532
521
const result = await pgClient . query ( query ) ;
533
522
return result . rows . map (
534
- ( [ id , mdn_url , title , hash , token_count , text_hash ] ) => {
535
- return { id, mdn_url, title, hash, token_count, text_hash } ;
523
+ ( [ id , mdn_url , title , token_count , markdown_hash , text_hash ] ) => {
524
+ return {
525
+ id,
526
+ mdn_url,
527
+ title,
528
+ token_count,
529
+ markdown_hash,
530
+ text_hash,
531
+ } ;
536
532
}
537
533
) ;
538
534
} ;
0 commit comments