@@ -6,7 +6,7 @@ import pg from "pg";
6
6
import pgvector from "pgvector/pg" ;
7
7
import { fdir } from "fdir" ;
8
8
import OpenAI from "openai" ;
9
- import { load as cheerio } from "cheerio" ;
9
+ import { load as cheerio , CheerioAPI } from "cheerio" ;
10
10
11
11
import { DocMetadata } from "../libs/types/document.js" ;
12
12
import { BUILD_OUT_ROOT , OPENAI_KEY , PG_URI } from "../libs/env/index.js" ;
@@ -21,6 +21,7 @@ import {
21
21
VersionValue ,
22
22
} from "@mdn/browser-compat-data/types" ;
23
23
import { h2mSync } from "../markdown/index.js" ;
24
+ import { Doc as JSONDoc } from "../libs/types/document.js" ;
24
25
25
26
const EMBEDDING_MODEL = "text-embedding-3-small" ;
26
27
const EMBEDDING_MODEL_NEXT = "text-embedding-3-small" ;
@@ -60,7 +61,8 @@ type EmbeddingUpdate = Pick<Doc, "mdn_url" | "text"> & {
60
61
61
62
export async function updateEmbeddings (
62
63
directory : string ,
63
- updateFormatting : boolean
64
+ updateFormatting : boolean ,
65
+ usePlainHtml : boolean
64
66
) {
65
67
if ( ! OPENAI_KEY || ! PG_URI ) {
66
68
throw Error ( "Please set these environment variables: OPENAI_KEY, PG_URI" ) ;
@@ -125,7 +127,8 @@ export async function updateEmbeddings(
125
127
const embeddingUpdates : EmbeddingUpdate [ ] = [ ] ;
126
128
127
129
for await ( const { mdn_url, title, title_short, markdown, text } of builtDocs (
128
- directory
130
+ directory ,
131
+ usePlainHtml
129
132
) ) {
130
133
seenUrls . add ( mdn_url ) ;
131
134
@@ -379,8 +382,8 @@ export async function updateEmbeddings(
379
382
pgClient . end ( ) ;
380
383
}
381
384
382
/**
 * Local debugging helper: walks every built doc under `directory` and logs
 * its generated markdown and text to the console.
 *
 * @param directory - Root directory passed through to `builtDocs`.
 * @param usePlainHtml - Passed through to `builtDocs`; when true the docs are
 *   read from `plain.html` files, otherwise they are assembled from
 *   `index.json` files.
 */
async function formatDocs(directory: string, usePlainHtml: boolean) {
  const docs = builtDocs(directory, usePlainHtml);
  for await (const doc of docs) {
    console.log(doc.markdown, doc.text);
  }
}
@@ -399,19 +402,55 @@ async function* builtPaths(directory: string) {
399
402
}
400
403
}
401
404
402
- async function * builtDocs ( directory : string ) {
405
+ async function * builtDocs ( directory : string , usePlainHtml : boolean ) {
403
406
for await ( const metadataPath of builtPaths ( directory ) ) {
404
407
try {
405
408
const raw = await readFile ( metadataPath , "utf-8" ) ;
406
409
const { title, short_title, mdn_url, hash } = JSON . parse (
407
410
raw
408
411
) as DocMetadata ;
409
-
410
- const plainPath = path . join ( path . dirname ( metadataPath ) , "plain.html" ) ;
411
- const plainHTML = await readFile ( plainPath , "utf-8" ) ;
412
-
413
- // reformat HTML version, used as context
414
- const $ = cheerio ( plainHTML ) ;
412
+ let $ : CheerioAPI ;
413
+
414
+ if ( usePlainHtml ) {
415
+ const plainPath = path . join ( path . dirname ( metadataPath ) , "plain.html" ) ;
416
+ const plainHTML = await readFile ( plainPath , "utf-8" ) ;
417
+
418
+ // reformat HTML version, used as context
419
+ $ = cheerio ( plainHTML ) ;
420
+ } else {
421
+ const jsonPath = path . join ( path . dirname ( metadataPath ) , "index.json" ) ;
422
+ const json = JSON . parse ( await readFile ( jsonPath , "utf-8" ) ) ;
423
+ const doc = json . doc as JSONDoc ;
424
+
425
+ // Assemble the interim HTML from the json data
426
+ $ = cheerio ( "<html><head></head><body></body></html>" ) ;
427
+ for ( const section of doc . body ) {
428
+ const tag = section . value . isH3 ? "h3" : "h2" ;
429
+ if ( section . value . title ) {
430
+ $ ( "body" ) . append ( "\n" ) ;
431
+ $ ( "body" ) . append (
432
+ `<${ tag } id="${ section . value . id ?? "" } ">${ section . value . title } </${ tag } >`
433
+ ) ;
434
+ }
435
+ switch ( section . type ) {
436
+ case "prose" : {
437
+ $ ( "body" ) . append ( "\n" ) ;
438
+ $ ( "body" ) . append ( section . value . content ) ;
439
+ break ;
440
+ }
441
+ case "specifications" :
442
+ break ;
443
+ case "browser_compatibility" : {
444
+ $ ( "body" ) . append ( "\n" ) ;
445
+ $ ( "body" ) . append (
446
+ ` <div>${ buildBCDTable ( section . value . query ) } </div> `
447
+ ) ;
448
+ break ;
449
+ }
450
+ }
451
+ }
452
+ $ ( "span.language-name" ) . remove ( ) ;
453
+ }
415
454
$ ( "#specifications, .bc-specs" ) . remove ( ) ;
416
455
$ ( "body" ) . prepend ( `<h1>${ title } </h1>` ) ;
417
456
$ ( "head" ) . prepend ( `<title>${ title } </title>` ) ;
@@ -422,6 +461,7 @@ async function* builtDocs(directory: string) {
422
461
$ ( ".bc-data[data-query]" ) . each ( ( _ , el ) => {
423
462
$ ( el ) . replaceWith ( buildBCDTable ( $ ( el ) . data ( "query" ) as string ) ) ;
424
463
} ) ;
464
+
425
465
const html = $ . html ( ) ;
426
466
const markdown = h2mSync ( html ) ;
427
467
@@ -672,24 +712,37 @@ program
672
712
. argument ( "<directory>" , "Path in which to execute it" , {
673
713
default : path . join ( BUILD_OUT_ROOT , "en-us" , "docs" ) ,
674
714
} )
715
+ . option (
716
+ "--use-plain-html" ,
717
+ "Use `plain.html` files instead of `index.json` files."
718
+ )
675
719
. option (
676
720
"--update-formatting" ,
677
721
"Even if hashes match, update without generating a new embedding."
678
722
)
679
723
. action ( function ( params ) {
680
724
const { directory } = params . args as { directory : string } ;
681
- const { updateFormatting } = params . options as {
725
+ const { updateFormatting, usePlainHtml } = params . options as {
682
726
updateFormatting : boolean ;
727
+ usePlainHtml : boolean ;
683
728
} ;
684
- return updateEmbeddings ( directory , updateFormatting ) ;
729
+ return updateEmbeddings ( directory , updateFormatting , usePlainHtml ) ;
685
730
} )
731
+
686
732
. command ( "format-docs" , "Generates formatted docs for local debugging" )
687
733
. argument ( "<directory>" , "Path in which to execute it" , {
688
734
default : path . join ( BUILD_OUT_ROOT , "en-us" , "docs" ) ,
689
735
} )
736
+ . option (
737
+ "--use-plain-html" ,
738
+ "Use `plain.html` files instead of `index.json` files."
739
+ )
690
740
.action(function (params) {
  // Positional arguments arrive on `params.args`, while flags declared with
  // `.option(...)` arrive on `params.options`. `--use-plain-html` is a flag,
  // so it must be read from `params.options`; reading it from `params.args`
  // always yields `undefined` and the flag is silently ignored.
  const { directory } = params.args as { directory: string };
  // Default to `false` so an omitted flag is passed on as a real boolean.
  const { usePlainHtml = false } = params.options as {
    usePlainHtml?: boolean;
  };
  return formatDocs(directory, usePlainHtml);
});
694
747
695
748
program . run ( ) ;
0 commit comments