feat(ai-help): index text-embedding-3-model embeddings (#10818)

caugner · web-flow · commit f7cfaae77f9e · 2024-04-03T00:04:19.000+02:00
* feat(ai-help): index text-embedding-3-model embeddings

* fix(ai-help): extract constants with correct v3 model name

* feat(ai-help): add embedding backfill mechanism

* refactor(ai-help): avoid SQL generation

* chore(ai-help): set embedding_next = null unless EMBEDDING_MODEL_NEXT

* fix(ai-help): add embeddings also if formatting changed

* fixup! fix(ai-help): extract constants with correct v3 model name

* perf(ai-help): generate both embeddings in parallel
diff --git a/scripts/ai-help-macros.ts b/scripts/ai-help-macros.ts
@@ -22,13 +22,18 @@ import {
 } from "@mdn/browser-compat-data/types";
 import { h2mSync } from "../markdown/index.js";
 
+const EMBEDDING_MODEL = "text-embedding-ada-002";
+const EMBEDDING_MODEL_NEXT = "text-embedding-3-small";
+
 const { program } = caporal;
 
 interface IndexedDoc {
   id: number;
   mdn_url: string;
   title: string;
   token_count: number | null;
+  has_embedding: boolean;
+  has_embedding_next: boolean;
   markdown_hash: string;
   text_hash: string;
 }
@@ -43,6 +48,16 @@ interface Doc {
   text_hash?: string;
 }
 
+type FormattingUpdate = Pick<
+  Doc,
+  "mdn_url" | "title" | "title_short" | "markdown" | "markdown_hash"
+>;
+
+type EmbeddingUpdate = Pick<Doc, "mdn_url" | "text"> & {
+  has_embedding: boolean;
+  has_embedding_next: boolean;
+};
+
 export async function updateEmbeddings(
   directory: string,
   updateFormatting: boolean
@@ -65,11 +80,11 @@ export async function updateEmbeddings(
     apiKey: OPENAI_KEY,
   });
 
-  const createEmbedding = async (input: string) => {
+  const createEmbedding = async (input: string, model: string) => {
     let embeddingResponse: OpenAI.Embeddings.CreateEmbeddingResponse;
     try {
       embeddingResponse = await openai.embeddings.create({
-        model: "text-embedding-ada-002",
+        model,
         input,
       });
     } catch ({ error: { message, type }, status }: any) {
@@ -78,7 +93,7 @@ export async function updateEmbeddings(
       );
       // Try again with trimmed content.
       embeddingResponse = await openai.embeddings.create({
-        model: "text-embedding-ada-002",
+        model,
         input: input.substring(0, 15000),
       });
     }
@@ -106,7 +121,8 @@ export async function updateEmbeddings(
 
   const seenUrls = new Set<string>();
   const updates: Doc[] = [];
-  const formattingUpdates: Doc[] = [];
+  const formattingUpdates: FormattingUpdate[] = [];
+  const embeddingUpdates: EmbeddingUpdate[] = [];
 
   for await (const { mdn_url, title, title_short, markdown, text } of builtDocs(
     directory
@@ -122,6 +138,7 @@ export async function updateEmbeddings(
       .digest("base64");
 
     if (existingDoc?.text_hash !== text_hash) {
+      // Document added or content changed => (re)generate embeddings.
       updates.push({
         mdn_url,
         title,
@@ -131,31 +148,55 @@ export async function updateEmbeddings(
         text,
         text_hash,
       });
-    } else if (
-      updateFormatting ||
-      existingDoc?.markdown_hash !== markdown_hash
-    ) {
-      formattingUpdates.push({
-        mdn_url,
-        title,
-        title_short,
-        markdown,
-        markdown_hash,
-      });
+    } else {
+      if (updateFormatting || existingDoc?.markdown_hash !== markdown_hash) {
+        // Document formatting changed => update markdown.
+        formattingUpdates.push({
+          mdn_url,
+          title,
+          title_short,
+          markdown,
+          markdown_hash,
+        });
+      }
+
+      if (
+        !existingDoc.has_embedding ||
+        !existingDoc.has_embedding_next !== !EMBEDDING_MODEL_NEXT
+      ) {
+        // Embedding missing => add embeddings.
+        const { has_embedding, has_embedding_next } = existingDoc;
+        embeddingUpdates.push({
+          mdn_url,
+          text,
+          has_embedding,
+          has_embedding_next,
+        });
+      }
     }
   }
 
   console.log(
     `-> ${updates.length} (${formattingUpdates.length}) of ${seenUrls.size} documents were changed or added (or formatted).`
   );
+  if (embeddingUpdates.length > 0) {
+    console.log(
+      `-> ${embeddingUpdates.length} documents have outdated embeddings.`
+    );
+  }
+
   const deletions: IndexedDoc[] = [...existingDocByUrl.entries()]
     .filter(([key]) => !seenUrls.has(key))
     .map(([, value]) => value);
   console.log(
     `-> ${deletions.length} of ${existingDocs.length} indexed documents were deleted (or moved).`
   );
 
-  if (updates.length > 0 || formattingUpdates.length > 0) {
+  if (
+    updates.length > 0 ||
+    formattingUpdates.length > 0 ||
+    embeddingUpdates.length > 0
+  ) {
     console.log(`Applying updates...`);
     for (const {
       mdn_url,
@@ -170,7 +211,16 @@ export async function updateEmbeddings(
         console.log(`-> [${mdn_url}] Updating document...`);
 
         // Embedding for full document.
-        const { total_tokens, embedding } = await createEmbedding(text);
+        const [{ total_tokens, embedding }, embedding_next] = await Promise.all(
+          [
+            createEmbedding(text, EMBEDDING_MODEL),
+            EMBEDDING_MODEL_NEXT
+              ? createEmbedding(text, EMBEDDING_MODEL_NEXT).then(
+                  ({ embedding }) => embedding
+                )
+              : null,
+          ]
+        );
 
         // Create/update document record.
         const query = {
@@ -184,9 +234,10 @@ export async function updateEmbeddings(
                     markdown_hash,
                     token_count,
                     embedding,
+                    embedding_next,
                     text_hash
                 )
-            VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
+            VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) ON CONFLICT (mdn_url) DO
             UPDATE
             SET mdn_url = $1,
                 title = $2,
@@ -195,7 +246,8 @@ export async function updateEmbeddings(
                 markdown_hash = $5,
                 token_count = $6,
                 embedding = $7,
-                text_hash = $8
+                embedding_next = $8,
+                text_hash = $9
           `,
           values: [
             mdn_url,
@@ -205,6 +257,7 @@ export async function updateEmbeddings(
             markdown_hash,
             total_tokens,
             pgvector.toSql(embedding),
+            embedding_next ? pgvector.toSql(embedding_next) : null,
             text_hash,
           ],
           rowMode: "array",
@@ -217,6 +270,7 @@ export async function updateEmbeddings(
         console.error(context);
       }
     }
+
     for (const {
       mdn_url,
       title,
@@ -253,6 +307,57 @@ export async function updateEmbeddings(
         console.error(context);
       }
     }
+
+    for (const {
+      mdn_url,
+      text,
+      has_embedding,
+      has_embedding_next,
+    } of embeddingUpdates) {
+      try {
+        console.log(`-> [${mdn_url}] Updating embeddings...`);
+
+        if (!has_embedding) {
+          const { total_tokens, embedding } = await createEmbedding(
+            text,
+            EMBEDDING_MODEL
+          );
+
+          const query = {
+            name: "upsert-doc-embedding",
+            text: "UPDATE mdn_doc_macro SET total_tokens = $2, embedding = $3 WHERE mdn_url = $1",
+            values: [
+              mdn_url,
+              total_tokens,
+              embedding ? pgvector.toSql(embedding) : null,
+            ],
+            rowMode: "array",
+          };
+
+          await pgClient.query(query);
+        }
+
+        if (!has_embedding_next) {
+          const embedding = EMBEDDING_MODEL_NEXT
+            ? (await createEmbedding(text, EMBEDDING_MODEL_NEXT)).embedding
+            : null;
+
+          const query = {
+            name: "upsert-doc-embedding-next",
+            text: "UPDATE mdn_doc_macro SET embedding_next = $2 WHERE mdn_url = $1",
+            values: [mdn_url, embedding ? pgvector.toSql(embedding) : null],
+            rowMode: "array",
+          };
+
+          await pgClient.query(query);
+        }
+      } catch (err: any) {
+        console.error(`!> [${mdn_url}] Failed to add embeddings.`);
+        const context = err?.response?.data ?? err?.response ?? err;
+        console.error(context);
+      }
+    }
+
     console.log(`-> Done.`);
   }
 
@@ -508,6 +613,8 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
             mdn_url,
             title,
             token_count,
+            embedding IS NOT NULL as has_embedding,
+            embedding_next IS NOT NULL as has_embedding_next,
             markdown_hash,
             text_hash
         from mdn_doc_macro
@@ -520,12 +627,23 @@ async function fetchAllExistingDocs(pgClient): Promise<IndexedDoc[]> {
     };
     const result = await pgClient.query(query);
     return result.rows.map(
-      ([id, mdn_url, title, token_count, markdown_hash, text_hash]) => {
+      ([
+        id,
+        mdn_url,
+        title,
+        token_count,
+        has_embedding,
+        has_embedding_next,
+        markdown_hash,
+        text_hash,
+      ]) => {
         return {
           id,
           mdn_url,
           title,
           token_count,
+          has_embedding,
+          has_embedding_next,
           markdown_hash,
           text_hash,
         };
diff --git a/scripts/ai-help.sql b/scripts/ai-help.sql
@@ -34,7 +34,8 @@ create table
     html text null,
     markdown text null,
     token_count integer null,
-    embedding extensions.vector null,
+    embedding extensions.vector(1536) null,
+    embedding_next extensions.vector(1536) null,
     text_hash text null,
     constraint mdn_doc_macro_pkey primary key (id),
     constraint mdn_doc_macro_url_key unique (mdn_url)