Skip to content

Commit de1aae9

Browse files
authored
enhance(ai-help): hash markdown to identify formatting updates (#10643)
* chore(ai-help): stop indexing HTML version
* enhance(ai-help): use markdown hash to determine formatting updates
* chore(ai-help): add function return type
* chore(ai-help): stop indexing Doc hash
1 parent 685cde7 commit de1aae9

File tree

1 file changed

+42
-46
lines changed

1 file changed

+42
-46
lines changed

Diff for: scripts/ai-help-macros.ts

+42 -46
Original file line number | Diff line number | Diff line change
@@ -28,19 +28,17 @@ interface IndexedDoc {
2828
id: number;
2929
mdn_url: string;
3030
title: string;
31-
title_short: string;
3231
token_count: number | null;
33-
hash: string;
32+
markdown_hash: string;
3433
text_hash: string;
3534
}
3635

3736
interface Doc {
3837
mdn_url: string;
3938
title: string;
4039
title_short: string;
41-
hash: string;
42-
html: string;
4340
markdown: string;
41+
markdown_hash: string;
4442
text?: string;
4543
text_hash?: string;
4644
}
@@ -110,41 +108,39 @@ export async function updateEmbeddings(
110108
const updates: Doc[] = [];
111109
const formattingUpdates: Doc[] = [];
112110

113-
for await (const {
114-
mdn_url,
115-
title,
116-
title_short,
117-
hash,
118-
html,
119-
markdown,
120-
text,
121-
} of builtDocs(directory)) {
111+
for await (const { mdn_url, title, title_short, markdown, text } of builtDocs(
112+
directory
113+
)) {
122114
seenUrls.add(mdn_url);
123115

124116
// Check for existing document in DB and compare checksums.
125117
const existingDoc = existingDocByUrl.get(mdn_url);
126118

127119
const text_hash = createHash("sha256").update(text).digest("base64");
120+
const markdown_hash = createHash("sha256")
121+
.update(markdown)
122+
.digest("base64");
128123

129124
if (existingDoc?.text_hash !== text_hash) {
130125
updates.push({
131126
mdn_url,
132127
title,
133128
title_short,
134-
hash,
135-
html,
136129
markdown,
130+
markdown_hash,
137131
text,
138132
text_hash,
139133
});
140-
} else if (updateFormatting || existingDoc?.hash !== hash) {
134+
} else if (
135+
updateFormatting ||
136+
existingDoc?.markdown_hash !== markdown_hash
137+
) {
141138
formattingUpdates.push({
142139
mdn_url,
143140
title,
144141
title_short,
145-
hash,
146-
html,
147142
markdown,
143+
markdown_hash,
148144
});
149145
}
150146
}
@@ -165,9 +161,8 @@ export async function updateEmbeddings(
165161
mdn_url,
166162
title,
167163
title_short,
168-
hash,
169-
html,
170164
markdown,
165+
markdown_hash,
171166
text,
172167
text_hash,
173168
} of updates) {
@@ -185,32 +180,29 @@ export async function updateEmbeddings(
185180
mdn_url,
186181
title,
187182
title_short,
188-
hash,
189-
html,
190183
markdown,
184+
markdown_hash,
191185
token_count,
192186
embedding,
193187
text_hash
194188
)
195-
VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (mdn_url) DO
189+
VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
196190
UPDATE
197191
SET mdn_url = $1,
198192
title = $2,
199193
title_short = $3,
200-
hash = $4,
201-
html = $5,
202-
markdown = $6,
203-
token_count = $7,
204-
embedding = $8,
205-
text_hash = $9
194+
markdown = $4,
195+
markdown_hash = $5,
196+
token_count = $6,
197+
embedding = $7,
198+
text_hash = $8
206199
`,
207200
values: [
208201
mdn_url,
209202
title,
210203
title_short,
211-
hash,
212-
html,
213204
markdown,
205+
markdown_hash,
214206
total_tokens,
215207
pgvector.toSql(embedding),
216208
text_hash,
@@ -229,9 +221,8 @@ export async function updateEmbeddings(
229221
mdn_url,
230222
title,
231223
title_short,
232-
hash,
233-
html,
234224
markdown,
225+
markdown_hash,
235226
} of formattingUpdates) {
236227
try {
237228
console.log(
@@ -242,17 +233,16 @@ export async function updateEmbeddings(
242233
const query = {
243234
name: "upsert-doc",
244235
text: `
245-
INSERT INTO mdn_doc_macro(mdn_url, title, title_short, hash, html, markdown)
246-
VALUES($1, $2, $3, $4, $5, $6) ON CONFLICT (mdn_url) DO
236+
INSERT INTO mdn_doc_macro(mdn_url, title, title_short, markdown, markdown_hash)
237+
VALUES($1, $2, $3, $4, $5) ON CONFLICT (mdn_url) DO
247238
UPDATE
248239
SET mdn_url = $1,
249240
title = $2,
250241
title_short = $3,
251-
hash = $4,
252-
html = $5,
253-
markdown = $6
242+
markdown = $4,
243+
markdown_hash = $5
254244
`,
255-
values: [mdn_url, title, title_short, hash, html, markdown],
245+
values: [mdn_url, title, title_short, markdown, markdown_hash],
256246
rowMode: "array",
257247
};
258248

@@ -285,8 +275,8 @@ export async function updateEmbeddings(
285275
}
286276

287277
async function formatDocs(directory: string) {
288-
for await (const { html, markdown, text } of builtDocs(directory)) {
289-
console.log(html, markdown, text);
278+
for await (const { markdown, text } of builtDocs(directory)) {
279+
console.log(markdown, text);
290280
}
291281
}
292282

@@ -340,7 +330,6 @@ async function* builtDocs(directory: string) {
340330
title,
341331
title_short: short_title || title,
342332
hash,
343-
html,
344333
markdown,
345334
text,
346335
};
@@ -509,7 +498,7 @@ export function isNotSupportedAtAll(support: SimpleSupportStatement) {
509498
return !support.version_added && !hasLimitation(support);
510499
}
511500

512-
async function fetchAllExistingDocs(pgClient) {
501+
async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
513502
const PAGE_SIZE = 1000;
514503
const selectDocs = async (lastId) => {
515504
const query = {
@@ -518,8 +507,8 @@ async function fetchAllExistingDocs(pgClient) {
518507
SELECT id,
519508
mdn_url,
520509
title,
521-
hash,
522510
token_count,
511+
markdown_hash,
523512
text_hash
524513
from mdn_doc_macro
525514
WHERE id > $1
@@ -531,8 +520,15 @@ async function fetchAllExistingDocs(pgClient) {
531520
};
532521
const result = await pgClient.query(query);
533522
return result.rows.map(
534-
([id, mdn_url, title, hash, token_count, text_hash]) => {
535-
return { id, mdn_url, title, hash, token_count, text_hash };
523+
([id, mdn_url, title, token_count, markdown_hash, text_hash]) => {
524+
return {
525+
id,
526+
mdn_url,
527+
title,
528+
token_count,
529+
markdown_hash,
530+
text_hash,
531+
};
536532
}
537533
);
538534
};

0 commit comments

Comments
 (0)