Skip to content

feat(ai-help): index full docs as well #9608

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scripts/ai-help.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ create table
url text not null,
slug text not null,
title text not null,
content text null,
token_count integer null,
embedding extensions.vector null,
checksum text null,
constraint mdn_doc_pkey primary key (id),
constraint mdn_doc_url_key unique (url),
Expand Down
102 changes: 73 additions & 29 deletions scripts/ai-help.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ interface IndexedDoc {
url: string;
slug: string;
title: string;
token_count: number | null;
checksum: string;
}

Expand Down Expand Up @@ -52,6 +53,45 @@ export async function updateEmbeddings(directory: string) {
});
const openai = new OpenAIApi(configuration);

/**
 * Requests an embedding for `content` from OpenAI's `text-embedding-ada-002`
 * model.
 *
 * If the request fails with an HTTP error response, the failure is logged and
 * the request is retried once with the input cut to its first 15000
 * characters.
 *
 * @param content - Text to embed. Newlines are replaced with spaces before
 *   the text is sent.
 * @returns The embedding of the first result together with the
 *   `total_tokens` usage reported in the response.
 * @throws The original error if the failure carries no HTTP response (for
 *   example a network failure), or whatever the single retry throws.
 */
const createEmbedding = async (content: string) => {
  const EMBEDDING_MODEL = "text-embedding-ada-002";
  const MAX_RETRY_INPUT_LENGTH = 15000;

  // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
  const input = content.replace(/\n/g, " ");

  let embeddingResponse;
  try {
    embeddingResponse = await openai.createEmbedding({
      model: EMBEDDING_MODEL,
      input,
    });
  } catch (e: any) {
    const response = e && e.response;
    if (!response) {
      // Without an HTTP response there is nothing to report and no reason to
      // assume that shorter input would help, so surface the original error
      // instead of failing while destructuring `undefined`.
      throw e;
    }

    const { data, status, statusText } = response;
    // The body may lack an `error` object (e.g. a non-JSON error page).
    const { message, type } = (data && data.error) || {};
    console.error(
      `[!] Failed to create embedding (${status} ${statusText}): ${type} - ${message}`
    );
    // Try again with trimmed content.
    embeddingResponse = await openai.createEmbedding({
      model: EMBEDDING_MODEL,
      input: input.substring(0, MAX_RETRY_INPUT_LENGTH),
    });
  }

  const {
    data: [{ embedding }],
    usage: { total_tokens },
  } = embeddingResponse.data;

  return {
    total_tokens,
    embedding,
  };
};

console.log(`Retrieving all indexed documents...`);
const existingDocs = await fetchAllExistingDocs(supabaseClient);
console.log(`-> Done.`);
Expand Down Expand Up @@ -81,6 +121,20 @@ export async function updateEmbeddings(directory: string) {
checksum,
});
continue;
} else if (existingDoc && existingDoc.token_count === null) {
// (Legacy migration:) Add content, token_count, embedding where missing.
console.log(`-> [${url}] Adding content/token_count/embedding...`);
const { total_tokens, embedding } = await createEmbedding(content);

await supabaseClient
.from("mdn_doc")
.update({
content,
token_count: total_tokens,
embedding,
})
.filter("id", "eq", existingDoc.id)
.throwOnError();
}
}
console.log(
Expand Down Expand Up @@ -108,6 +162,9 @@ export async function updateEmbeddings(directory: string) {
.throwOnError();
}

// Embedding for full document.
const { total_tokens, embedding } = await createEmbedding(content);

// Create/update document record. Intentionally clear checksum until we
// have successfully generated all document sections.
const { data: doc } = await supabaseClient
Expand All @@ -118,6 +175,9 @@ export async function updateEmbeddings(directory: string) {
url,
slug,
title,
content,
token_count: total_tokens,
embedding,
},
{ onConflict: "url" }
)
Expand All @@ -133,29 +193,16 @@ export async function updateEmbeddings(directory: string) {

await Promise.all(
sections.map(async ({ heading, content }) => {
// OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
const input = content.replace(/\n/g, " ");

const embeddingResponse = await openai.createEmbedding({
model: "text-embedding-ada-002",
input,
});

if (embeddingResponse.status !== 200) {
console.error("Embedding request failed", embeddingResponse.data);
throw new Error("Embedding request failed");
}

const [responseData] = embeddingResponse.data.data;
const { total_tokens, embedding } = await createEmbedding(content);

await supabaseClient
.from("mdn_doc_section")
.insert({
doc_id: doc.id,
heading,
content,
token_count: embeddingResponse.data.usage.total_tokens,
embedding: responseData.embedding,
token_count: total_tokens,
embedding: embedding,
})
.select()
.single()
Expand Down Expand Up @@ -260,24 +307,21 @@ function splitAndFilterSections(
}
/**
 * Loads every row of the `mdn_doc` table, one page of up to 1000 rows at a
 * time.
 *
 * Rows are ordered by `id`; each follow-up page asks for rows whose `id` is
 * greater than the last `id` of the previous page, and paging stops at the
 * first page that is not full.
 *
 * @param supabase - Client used to run the queries.
 * @returns All fetched rows with the columns `id`, `url`, `slug`, `title`,
 *   `checksum` and `token_count`; an empty array if the first page has no data.
 */
async function fetchAllExistingDocs(supabase: SupabaseClient) {
  const PAGE_SIZE = 1000;
  // Single definition of the base query so the first and the follow-up pages
  // always select the same columns.
  const selectDocs = () =>
    supabase
      .from("mdn_doc")
      .select("id, url, slug, title, checksum, token_count")
      .order("id")
      .limit(PAGE_SIZE);

  let { data } = await selectDocs().throwOnError();
  const allData = data ? [...data] : [];

  // A full page means there may be more rows after it.
  while (data && data.length === PAGE_SIZE) {
    const lastItem = data[data.length - 1];
    ({ data } = await selectDocs().gt("id", lastItem.id).throwOnError());
    if (data) {
      // Append in place rather than rebuilding the whole array per page.
      allData.push(...data);
    }
  }

  return allData;
}

Expand Down