Skip to content

Commit

Permalink
feature: completed fern adapter loading in all mdx files + openapi spec
Browse files Browse the repository at this point in the history
  • Loading branch information
skeptrunedev committed Dec 10, 2024
1 parent 23b277c commit 918c7f9
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 63 deletions.
4 changes: 4 additions & 0 deletions clients/trieve-fern-adapter/.env.dist
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
TRIEVE_API_HOST=https://api.trieve.ai
TRIEVE_API_KEY=
TRIEVE_ORGANIZATION_ID=
TRIEVE_DATASET_TRACKING_ID=
216 changes: 171 additions & 45 deletions clients/trieve-fern-adapter/index.ts
Original file line number Diff line number Diff line change
@@ -1,34 +1,22 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { program } from 'commander';
import { Command } from 'commander';
import { parse } from 'yaml';
import { Window } from 'happy-dom';
import fs from 'node:fs';
import { TrieveSDK, ChunkReqPayload } from 'trieve-ts-sdk';
import { marked } from 'marked';

/**
 * Collects every descendant element of `html` into a flat list.
 *
 * Elements are emitted in document (pre-order) sequence: each child is
 * followed immediately by its own descendants before the next sibling.
 * The root `html` itself is not part of the result.
 *
 * @param html - Element whose subtree should be flattened.
 * @returns All descendant elements in depth-first, pre-order sequence.
 */
const flattenHtmlIntoElements = (html: Element): Element[] => {
  const collected: Element[] = [];

  // Depth-first walk that appends to a single shared accumulator.
  const visit = (parent: Element): void => {
    for (const node of Array.from(parent.children)) {
      collected.push(node);
      visit(node);
    }
  };

  visit(html);
  return collected;
};
import { dereferenceSync } from '@trojs/openapi-dereference';

const splitHtmlIntoHeadAndBodies = (html: Element): [string, string][] => {
const headingRegex = /h\d/gi;
const flattenedElements = flattenHtmlIntoElements(html);
const tuples: [string, string][] = [];
let head = '';
let body = '';
for (const element of flattenedElements) {
for (const element of html.children) {
const isHeading =
element.tagName == 'title' || headingRegex.test(element.tagName);
if (!isHeading) {
body += element.textContent;
body += `${body ? '\n' : ''}` + element.textContent;
continue;
}

Expand Down Expand Up @@ -103,26 +91,28 @@ const extractChunksFromPath = async (

const html = await marked(content);
document.body.innerHTML = html;
if (title) {
const titleEl = document.createElement('h1');
titleEl.textContent = title;
document.body.insertBefore(titleEl, document.body.firstChild);
}
if (subtitle) {
const subtitleEl = document.createElement('h2');
subtitleEl.textContent = subtitle;
document.body.insertBefore(subtitleEl, document.body.firstChild);
}
if (title) {
const titleEl = document.createElement('h1');
titleEl.textContent = title;
document.body.insertBefore(titleEl, document.body.firstChild);
}

tuples = splitHtmlIntoHeadAndBodies(document.body as unknown as Element);
} catch (err) {
console.error(`Error processing path: ${path}`, err);
}

for (const [heading, chunk_html] of tuples) {
for (const [heading, body] of tuples) {
if (!heading) {
continue;
}
let chunk_html = `<h3>${heading}</h3>`;
chunk_html += `<p>${body}</p>`;

const link = `${rootUrl}/${slug ?? path.replace('.mdx', '')}`;
const tag_set = (slug ?? path.replace('.mdx', ''))
Expand All @@ -134,13 +124,17 @@ const extractChunksFromPath = async (
heading: heading,
};

const semantic_boost_phrase = heading;
const fulltext_boost_phrase = heading;
let semantic_boost_phrase = heading;
let fulltext_boost_phrase = heading;

if (title) {
semantic_boost_phrase = `${title} ${semantic_boost_phrase}`;
fulltext_boost_phrase = `${title} ${fulltext_boost_phrase}`;
metadata['title'] = title;
}
if (subtitle) {
semantic_boost_phrase = `${subtitle} ${semantic_boost_phrase}`;
fulltext_boost_phrase = `${subtitle} ${fulltext_boost_phrase}`;
metadata['description'] = subtitle;
}

Expand All @@ -163,7 +157,7 @@ const extractChunksFromPath = async (
if (fulltext_boost_phrase) {
chunk.fulltext_boost = {
phrase: fulltext_boost_phrase,
boost_factor: 0.3,
boost_factor: 1.3,
};
}

Expand All @@ -173,10 +167,83 @@ const extractChunksFromPath = async (
return chunks;
};

/**
 * Keys of an OpenAPI Path Item Object that denote operations. Every other
 * key (`parameters`, `summary`, `description`, `servers`, `x-*`, ...) is
 * path-level data and must not be turned into a chunk.
 */
const OPENAPI_HTTP_METHODS: ReadonlySet<string> = new Set([
  'get',
  'put',
  'post',
  'delete',
  'options',
  'head',
  'patch',
  'trace',
]);

/**
 * Downloads an OpenAPI spec and builds one chunk per operation.
 *
 * The spec is parsed as JSON when the URL path ends in `.json` (query string
 * and fragment are ignored) and as YAML otherwise, then dereferenced so that
 * no `$ref` pointers remain before `paths` is walked.
 *
 * Failures (network error, non-2xx response, unparsable document) are logged
 * and swallowed: the function never rejects, it returns the chunks collected
 * so far (usually an empty array).
 *
 * @param openapiSpecUrl - URL the OpenAPI document is fetched from.
 * @param siteUrl - Root URL of the docs site; omitted from links when unset.
 * @param apiRefParent - Path segment under which the API reference pages
 *   live; omitted from links and the hierarchy when unset.
 * @returns One chunk per HTTP operation found under `paths`.
 */
const extractChunksFromOpenapiSpec = async (
  openapiSpecUrl: string,
  siteUrl: string | undefined = undefined,
  apiRefParent: string | undefined = undefined,
): Promise<ChunkReqPayload[]> => {
  const chunks: ChunkReqPayload[] = [];
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > 0;

  try {
    const openapiSpecResp = await fetch(openapiSpecUrl);
    if (!openapiSpecResp.ok) {
      // Without this check an HTML error page would be fed to the parser.
      throw new Error(
        `Failed to fetch OpenAPI spec: ${openapiSpecResp.status} ${openapiSpecResp.statusText}`,
      );
    }
    const openapiSpec = await openapiSpecResp.text();

    // If the URL path ends in .json we assume JSON, otherwise YAML. The query
    // string / fragment is stripped first so `spec.json?v=2` is still JSON.
    const urlWithoutQuery = openapiSpecUrl.split(/[?#]/)[0] ?? openapiSpecUrl;
    const isJson = urlWithoutQuery.endsWith('.json');
    const openapiSpecObj = isJson
      ? JSON.parse(openapiSpec)
      : parse(openapiSpec);
    const schemaWithNoRefs: any = dereferenceSync(openapiSpecObj);

    const pathObj = schemaWithNoRefs?.paths;
    if (!pathObj || typeof pathObj !== 'object') {
      console.error('No paths found in OpenAPI spec');
      // Nothing to iterate; bail out instead of crashing on Object.keys().
      return chunks;
    }

    for (const [path, pathData] of Object.entries<any>(pathObj)) {
      if (!pathData || typeof pathData !== 'object') {
        continue;
      }

      for (const [method, operation] of Object.entries<any>(pathData)) {
        // Skip path-level keys such as `parameters` or `summary`.
        if (
          !OPENAPI_HTTP_METHODS.has(method.toLowerCase()) ||
          !operation ||
          typeof operation !== 'object'
        ) {
          continue;
        }

        const operationId = operation.operationId;
        const summary = operation.summary;
        const description = operation.description;

        // Page slug is the kebab-cased summary, falling back to the raw path.
        const slug: string = isNonEmptyString(summary)
          ? summary.split(' ').join('-').toLowerCase()
          : path;
        // Only join the parts that were actually provided so the link never
        // contains a literal "undefined" segment or a doubled slash.
        const pageLink = [siteUrl, apiRefParent, slug.replace(/^\/+/, '')]
          .filter(isNonEmptyString)
          .join('/');
        // Human-readable title for the heading; never the text "undefined".
        const title: string = isNonEmptyString(summary)
          ? summary
          : isNonEmptyString(operationId)
            ? operationId
            : path;

        const metadata = {
          operation_id: operationId,
          url: pageLink,
          // NOTE: the key spelling "heirarchy" is kept as-is because it is
          // part of the stored metadata and consumers may depend on it.
          heirarchy: [apiRefParent, slug].filter(isNonEmptyString),
          summary,
          description,
        };
        const heading = `<h2><span class="openapi-method">${method.toUpperCase()}</span> ${title}</h2>`;
        let chunk_html = heading;
        if (description) {
          chunk_html += `\n\n<p>${description}</p>`;
        }

        const chunk: ChunkReqPayload = {
          chunk_html,
          link: pageLink,
          tag_set: ['openapi-route', operationId, method].filter(
            isNonEmptyString,
          ),
          metadata,
          group_tracking_ids: [path],
          fulltext_boost: {
            phrase: heading,
            boost_factor: 1.3,
          },
          semantic_boost: {
            phrase: heading,
            distance_factor: 0.3,
          },
          convert_html_to_text: true,
        };

        chunks.push(chunk);
      }
    }
  } catch (err) {
    console.error(`Error processing OpenAPI spec: ${openapiSpecUrl}`, err);
  }

  return chunks;
};

const trieveApiHost = process.env.TRIEVE_API_HOST;
const trieveApiKey = process.env.TRIEVE_API_KEY;
const trieveDatasetTrackingId = process.env.TRIEVE_DATASET_ID;
const trieveOrganizationId = process.env.TRIEVE_ORGANIZATION_ID;
const trieveDatasetTrackingId = process.env.TRIEVE_DATASET_TRACKING_ID;
if (
!trieveApiHost ||
!trieveApiKey ||
Expand All @@ -187,57 +254,116 @@ if (
process.exit(1);
}

program.option('-f, --file <file>', 'docs.yml file to process');
program.option(
'-r, --root-url <rootUrl>',
'Root URL to use for relative paths',
);
const program = new Command();
program
.option('-f, --file <file>', 'docs.yml file to process')
.option('-r, --root-url <rootUrl>', 'Root URL to use for relative paths')
.option('-s, --openapi-spec <openapiSpec>', 'URL of OpenAPI spec file')
.option('-a, --api-ref-path <apiRefPath>', 'Path to API reference pages');

program.parse();
program.parse(process.argv);

const options = program.opts();
if (!options.file) {
const apiRefPath = options.apiRefPath;
const filePath = options.file;
const rootUrl = options.rootUrl;
const openapiSpec = options.openapiSpec;

if (!filePath) {
console.error('Missing required --file option', options);
program.help();
}

const pathParts = options.file.split('/');
const pathWithoutFileName = pathParts.slice(0, pathParts.length - 1).join('/');

let chunkReqPayloads: ChunkReqPayload[] = [];

if (openapiSpec) {
console.log('Processing OpenAPI spec...', openapiSpec);
await extractChunksFromOpenapiSpec(openapiSpec, rootUrl, apiRefPath).then(
(res) => {
chunkReqPayloads = chunkReqPayloads.concat(res);
},
);
} else {
console.warn('No OpenAPI spec provided, skipping...');
}

try {
const rootUrl = options.rootUrl;
const file = fs.readFileSync(options.file, 'utf8');
const file = fs.readFileSync(filePath, 'utf8');
const data = parse(file);
const paths = extractPathsFromAny(data);

for (const path of paths) {
void extractChunksFromPath(path, rootUrl).then((res) => {
await extractChunksFromPath(path, rootUrl).then((res) => {
chunkReqPayloads = chunkReqPayloads.concat(res);
});
}
} catch (err) {
console.error(`Error reading file: ${options.file}`);
console.error(err);
process.exit(1);
console.error(`Error reading file: ${filePath}`, err);
}

export const trieve = new TrieveSDK({
baseUrl: trieveApiHost,
apiKey: trieveApiKey,
datasetId: trieveDatasetTrackingId,
organizationId: trieveOrganizationId,
});

try {
await trieve.getDatasetByTrackingId(trieveDatasetTrackingId);
console.info('Checking for existing dataset...');
const dataset = await trieve.getDatasetByTrackingId(trieveDatasetTrackingId);
trieve.datasetId = dataset.id;
console.info('Dataset found, clearing...');
try {
await trieve.clearDataset(dataset.id);
} catch (err) {
console.error('Error clearing dataset', err);
}
while (true) {
try {
console.info('Checking for groups...');
const groups = await trieve.getGroupsForDataset({
page: 1,
});

if (groups.groups.length === 0) {
console.info('Dataset cleared');
break;
}
} catch (err) {
console.error('Error getting groups', err);
}
console.info('Waiting on delete...');
}
} catch (err) {
console.info('Dataset not found, creating...', err);
await trieve.createDataset({
tracking_id: trieveDatasetTrackingId,
dataset_name: trieveDatasetTrackingId,
});
try {
const createdDataset = await trieve.createDataset({
tracking_id: trieveDatasetTrackingId,
dataset_name: trieveDatasetTrackingId,
});
console.info('Dataset created');
trieve.datasetId = createdDataset.id;
} catch (err) {
console.error('Error creating dataset', err);
process.exit(1);
}
}

for (let i = 0; i < chunkReqPayloads.length; i += 120) {
const chunkBatch = chunkReqPayloads.slice(i, i + 120);
await trieve.createChunk(chunkBatch);
console.log(`Creating chunk batch ${i + 1} - ${i + 120}`);
while (true) {
try {
await trieve.createChunk(chunkBatch);
break;
} catch (err) {
console.error('Error creating chunk batch, retrying...', err);
}
}
}

console.log('Done!');
process.exit(0);
3 changes: 2 additions & 1 deletion clients/trieve-fern-adapter/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@
"typescript-eslint": "^8.16.0"
},
"dependencies": {
"@trojs/openapi-dereference": "^1.0.0",
"commander": "^12.1.0",
"happy-dom": "^15.11.6",
"marked": "^15.0.2",
"trieve-ts-sdk": "^0.0.35",
"yaml": "^2.6.1"
}
}
}
Loading

0 comments on commit 918c7f9

Please sign in to comment.