Skip to content

Commit

Permalink
feat: langchain turbopuffer
Browse files Browse the repository at this point in the history
  • Loading branch information
mattzcarey committed Dec 23, 2023
1 parent 7088a85 commit edaf2d4
Show file tree
Hide file tree
Showing 10 changed files with 1,289 additions and 39 deletions.
13 changes: 9 additions & 4 deletions packages/crgpt-loader/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,30 @@
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"test": "ts-node test.ts"
"test": "ts-node test.ts",
"build": "tsc",
"publish-package": "pnpm build && npm publish --access public"
},
"keywords": [
"github",
"AI",
"loader",
"vector store",
"crgpt"
"crgpt",
"turbopuffer"
],
"author": "Matt Carey",
"license": "MIT",
"dependencies": {
"dotenv": "^16.3.1",
"ignore": "^5.3.0",
"langchain": "^0.0.204"
},
"devDependencies": {
"@types/node": "^20.10.5",
"ts-node": "^10.9.2",
"typescript": "^5.3.3"
}
},
"files": [
"dist/*"
]
}
13 changes: 2 additions & 11 deletions packages/crgpt-loader/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 41 additions & 24 deletions packages/crgpt-loader/src/crgpt-loader.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
import axios, { AxiosResponse } from "axios";
import dotenv from "dotenv";
import { promises as fsPromises } from "fs";
import { Document } from "langchain/document";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import os from "os";
import path from "path";
import { tmpdir } from "os";
import { join } from "path";
import { removeFilesCommand, removeFoldersCommand } from "./constants";
import { executeCommand, openFile, savePage } from "./utils";

Expand Down Expand Up @@ -67,15 +66,15 @@ export class CRGPTLoader {
"Content-Type": "application/json",
};

await axios.post(
apiEndpoint,
{
await fetch(apiEndpoint, {
method: "POST",
headers,
body: JSON.stringify({
ids,
vectors: embeddings,
attributes,
},
{ headers }
);
}),
});
} catch (error) {
console.error("Error storing documents:", error);
throw error;
Expand All @@ -102,9 +101,7 @@ export class CRGPTLoader {
}

private async cloneRepository(): Promise<string> {
const tempDir = await fsPromises.mkdtemp(
path.join(os.tmpdir(), "CRGPTLoader-")
);
const tempDir = await fsPromises.mkdtemp(join(tmpdir(), "CRGPTLoader-"));
const cloneCommand = `git clone --depth 1 ${this.link} ${tempDir}`;
await executeCommand(cloneCommand);
return tempDir;
Expand Down Expand Up @@ -132,7 +129,7 @@ export class CRGPTLoader {
const documents: Document<{ source: string }>[] = [];

for (const entry of entries) {
const fullPath = path.join(directory, entry.name);
const fullPath = join(directory, entry.name);
if (entry.isDirectory()) {
documents.push(...(await this.createDocuments(fullPath)));
} else if (entry.isFile()) {
Expand Down Expand Up @@ -164,7 +161,10 @@ export class CRGPTLoader {
continue;
}

const { ids, vectors, attributes, next_cursor } = response.data;
// Parse the response body as JSON
const data = await response.json();
const { ids, vectors, attributes, next_cursor } = data;

savePage(dataDir, pageIndex, ids, vectors, attributes);

nextCursor = next_cursor;
Expand All @@ -179,16 +179,27 @@ export class CRGPTLoader {
private async fetchPage(
namespace: string,
cursor: string | null
): Promise<AxiosResponse> {
const apiEndpoint = `https://api.turbopuffer.com/v1/vectors/${namespace}`;
const params = cursor ? { cursor } : {};

return axios.get(apiEndpoint, {
headers: { Authorization: `Bearer ${process.env.TURBOPUFFER_API_KEY}` },
params,
maxContentLength: Infinity,
maxBodyLength: Infinity,
): Promise<Response> {
const apiEndpoint = new URL(
`https://api.turbopuffer.com/v1/vectors/${namespace}`
);

if (cursor) {
apiEndpoint.searchParams.append("cursor", cursor);
}

const response = await fetch(apiEndpoint.toString(), {
method: "GET",
headers: {
Authorization: `Bearer ${process.env.TURBOPUFFER_API_KEY}`,
},
});

if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}

return response;
}

public async delete(indexName = this.extractRepoName()): Promise<void> {
Expand All @@ -200,7 +211,13 @@ export class CRGPTLoader {
};

// Make the DELETE request
const response = await axios.delete(apiEndpoint, { headers });
const res = await fetch(apiEndpoint, {
method: "DELETE",
headers,
});

// Parse the response
const response = await res.json();

// Log the response status
console.log("Delete response:", response.data);
Expand Down
15 changes: 15 additions & 0 deletions packages/crgpt-loader/src/lc_wip/turbopufferVectorStore.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ import { Document } from "langchain/document";
import { Embeddings } from "langchain/embeddings/base";
import { VectorStore } from "langchain/vectorstores/base";

/**
 * Configuration object accepted by `TurboPuffer.fromDocuments`, which passes
 * it unchanged to the store's constructor. Both fields are optional.
 */
interface TurboPufferIntegrationParams {
// NOTE(review): presumably the Turbopuffer API key used to authorize requests — confirm against the constructor.
apiKey?: string;
// NOTE(review): presumably the Turbopuffer namespace the store reads from and writes to — confirm against the constructor.
namespace?: string;
}

interface TurboPufferHeaders {
headers: {
Authorization: string;
Expand Down Expand Up @@ -185,4 +190,14 @@ export class TurboPuffer extends VectorStore {

return result;
}

/**
 * Creates a store and adds the given documents to it in one call.
 *
 * @param docs - Documents handed to `addDocuments` on the new store.
 * @param embeddings - Embeddings instance passed as the first constructor argument.
 * @param dbConfig - Store configuration passed as the second constructor argument.
 * @returns The newly constructed store, once `addDocuments` has resolved.
 */
static async fromDocuments(
  docs: Document[],
  embeddings: Embeddings,
  dbConfig: TurboPufferIntegrationParams
): Promise<TurboPuffer> {
  // `new this(...)` instantiates whichever class the static method was called on.
  const store = new this(embeddings, dbConfig);
  await store.addDocuments(docs);
  return store;
}
}
42 changes: 42 additions & 0 deletions packages/lc-turbopuffer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# LangChainJS x TurboPuffer

This is a third-party integration of [TurboPuffer](https://turbopuffer.com/) as a Vector Store class into the [LangChain](https://langchain.org/) ecosystem.

Turbopuffer is a vector store built on top of object storage, so it is remarkably cheap and scalable.

Python is always going to come first as an official version so I thought I'd make a JS version for the community.

## Installation

```bash
npm install langchain-turbopuffer
```

or if you use pnpm

```bash
pnpm install langchain-turbopuffer
```

## Usage

```javascript
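import { Document } from "langchain/document";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { TurboPufferVectorStore } from "langchain-turbopuffer";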

const embeddings = new OpenAIEmbeddings();

const vectorStore = new TurboPufferVectorStore(embeddings);

const doc = new Document({
pageContent: "This is a test",
metadata: {
source: "https://example.com",
},
});

await vectorStore.addDocuments([doc]);
```

## Contribute to the project

This is a community project, so feel free to contribute to it and bring up any issues. If you have any questions, please contact me on the Turbopuffer Slack.
31 changes: 31 additions & 0 deletions packages/lc-turbopuffer/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"name": "langchain-turbopuffer",
"version": "0.0.2",
"description": "An open source and LangChain compatible Vector Store class for Turbopuffer. Store vectors in object storage for cheap.",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"scripts": {
"build": "tsc",
"publish-package": "pnpm build && npm publish --access public"
},
"keywords": [
"AI",
"retriever",
"vector store",
"object store",
"turbopuffer",
"langchain"
],
"author": "Matt Carey",
"license": "MIT",
"dependencies": {
"langchain": "^0.0.204"
},
"devDependencies": {
"@types/node": "^20.10.5",
"typescript": "^5.3.3"
},
"files": [
"dist/*"
]
}
Loading

0 comments on commit edaf2d4

Please sign in to comment.