Skip to content

Commit ab93f16

Browse files
authored
chore(ai-help): use index.json instead of plain.html for embedding sources (#12182)
Update ai-help-macros to create embedding source texts from index.json instead of plain.html.
1 parent f800c24 commit ab93f16

File tree

1 file changed

+69
-16
lines changed

1 file changed

+69
-16
lines changed

Diff for: scripts/ai-help-macros.ts

+69-16
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import pg from "pg";
66
import pgvector from "pgvector/pg";
77
import { fdir } from "fdir";
88
import OpenAI from "openai";
9-
import { load as cheerio } from "cheerio";
9+
import { load as cheerio, CheerioAPI } from "cheerio";
1010

1111
import { DocMetadata } from "../libs/types/document.js";
1212
import { BUILD_OUT_ROOT, OPENAI_KEY, PG_URI } from "../libs/env/index.js";
@@ -21,6 +21,7 @@ import {
2121
VersionValue,
2222
} from "@mdn/browser-compat-data/types";
2323
import { h2mSync } from "../markdown/index.js";
24+
import { Doc as JSONDoc } from "../libs/types/document.js";
2425

2526
const EMBEDDING_MODEL = "text-embedding-3-small";
2627
const EMBEDDING_MODEL_NEXT = "text-embedding-3-small";
@@ -60,7 +61,8 @@ type EmbeddingUpdate = Pick<Doc, "mdn_url" | "text"> & {
6061

6162
export async function updateEmbeddings(
6263
directory: string,
63-
updateFormatting: boolean
64+
updateFormatting: boolean,
65+
usePlainHtml: boolean
6466
) {
6567
if (!OPENAI_KEY || !PG_URI) {
6668
throw Error("Please set these environment variables: OPENAI_KEY, PG_URI");
@@ -125,7 +127,8 @@ export async function updateEmbeddings(
125127
const embeddingUpdates: EmbeddingUpdate[] = [];
126128

127129
for await (const { mdn_url, title, title_short, markdown, text } of builtDocs(
128-
directory
130+
directory,
131+
usePlainHtml
129132
)) {
130133
seenUrls.add(mdn_url);
131134

@@ -379,8 +382,8 @@ export async function updateEmbeddings(
379382
pgClient.end();
380383
}
381384

382-
async function formatDocs(directory: string) {
383-
for await (const { markdown, text } of builtDocs(directory)) {
385+
async function formatDocs(directory: string, usePlainHtml: boolean) {
386+
for await (const { markdown, text } of builtDocs(directory, usePlainHtml)) {
384387
console.log(markdown, text);
385388
}
386389
}
@@ -399,19 +402,55 @@ async function* builtPaths(directory: string) {
399402
}
400403
}
401404

402-
async function* builtDocs(directory: string) {
405+
async function* builtDocs(directory: string, usePlainHtml: boolean) {
403406
for await (const metadataPath of builtPaths(directory)) {
404407
try {
405408
const raw = await readFile(metadataPath, "utf-8");
406409
const { title, short_title, mdn_url, hash } = JSON.parse(
407410
raw
408411
) as DocMetadata;
409-
410-
const plainPath = path.join(path.dirname(metadataPath), "plain.html");
411-
const plainHTML = await readFile(plainPath, "utf-8");
412-
413-
// reformat HTML version, used as context
414-
const $ = cheerio(plainHTML);
412+
let $: CheerioAPI;
413+
414+
if (usePlainHtml) {
415+
const plainPath = path.join(path.dirname(metadataPath), "plain.html");
416+
const plainHTML = await readFile(plainPath, "utf-8");
417+
418+
// reformat HTML version, used as context
419+
$ = cheerio(plainHTML);
420+
} else {
421+
const jsonPath = path.join(path.dirname(metadataPath), "index.json");
422+
const json = JSON.parse(await readFile(jsonPath, "utf-8"));
423+
const doc = json.doc as JSONDoc;
424+
425+
// Assemble the interim HTML from the json data
426+
$ = cheerio("<html><head></head><body></body></html>");
427+
for (const section of doc.body) {
428+
const tag = section.value.isH3 ? "h3" : "h2";
429+
if (section.value.title) {
430+
$("body").append("\n");
431+
$("body").append(
432+
`<${tag} id="${section.value.id ?? ""}">${section.value.title}</${tag}>`
433+
);
434+
}
435+
switch (section.type) {
436+
case "prose": {
437+
$("body").append("\n");
438+
$("body").append(section.value.content);
439+
break;
440+
}
441+
case "specifications":
442+
break;
443+
case "browser_compatibility": {
444+
$("body").append("\n");
445+
$("body").append(
446+
` <div>${buildBCDTable(section.value.query)}</div> `
447+
);
448+
break;
449+
}
450+
}
451+
}
452+
$("span.language-name").remove();
453+
}
415454
$("#specifications, .bc-specs").remove();
416455
$("body").prepend(`<h1>${title}</h1>`);
417456
$("head").prepend(`<title>${title}</title>`);
@@ -422,6 +461,7 @@ async function* builtDocs(directory: string) {
422461
$(".bc-data[data-query]").each((_, el) => {
423462
$(el).replaceWith(buildBCDTable($(el).data("query") as string));
424463
});
464+
425465
const html = $.html();
426466
const markdown = h2mSync(html);
427467

@@ -672,24 +712,37 @@ program
672712
.argument("<directory>", "Path in which to execute it", {
673713
default: path.join(BUILD_OUT_ROOT, "en-us", "docs"),
674714
})
715+
.option(
716+
"--use-plain-html",
717+
"Use `plain.html` files instead of `index.json` files."
718+
)
675719
.option(
676720
"--update-formatting",
677721
"Even if hashes match, update without generating a new embedding."
678722
)
679723
.action(function (params) {
680724
const { directory } = params.args as { directory: string };
681-
const { updateFormatting } = params.options as {
725+
const { updateFormatting, usePlainHtml } = params.options as {
682726
updateFormatting: boolean;
727+
usePlainHtml: boolean;
683728
};
684-
return updateEmbeddings(directory, updateFormatting);
729+
return updateEmbeddings(directory, updateFormatting, usePlainHtml);
685730
})
731+
686732
.command("format-docs", "Generates formatted docs for local debugging")
687733
.argument("<directory>", "Path in which to execute it", {
688734
default: path.join(BUILD_OUT_ROOT, "en-us", "docs"),
689735
})
736+
.option(
737+
"--use-plain-html",
738+
"Use `plain.html` files instead of `index.json` files."
739+
)
690740
.action(function (params) {
691-
const { directory } = params.args as { directory: string };
692-
return formatDocs(directory);
741+
const { directory, usePlainHtml } = params.args as {
742+
directory: string;
743+
usePlainHtml: boolean;
744+
};
745+
return formatDocs(directory, usePlainHtml);
693746
});
694747

695748
program.run();

0 commit comments

Comments
 (0)