Commit c979f0c

skeptrunedev authored and cdxker committed
feature: upload CSV processing done up to the worker stage
1 parent 7980eb2 commit c979f0c

13 files changed: +639 -103 lines changed

clients/ts-sdk/openapi.json (+209 -9)

@@ -4269,7 +4269,7 @@
         "File"
       ],
       "summary": "Upload File",
-      "description": "Upload a file to S3 attached to the server. The file will be converted to HTML with tika and chunked algorithmically, images will be OCR'ed with tesseract. The resulting chunks will be indexed and searchable. Optionally, you can only upload the file and manually create chunks associated to the file after. See docs.trieve.ai and/or contact us for more details and tips. Auth'ed user must be an admin or owner of the dataset's organization to upload a file.",
+      "description": "Upload a file to S3 bucket attached to your dataset. You can select between a naive chunking strategy where the text is extracted with Apache Tika and split into segments with a target number of segments per chunk OR you can use a vision LLM to convert the file to markdown and create chunks per page. Auth'ed user must be an admin or owner of the dataset's organization to upload a file.",
       "operationId": "upload_file_handler",
       "parameters": [
         {
@@ -4300,7 +4300,69 @@
           "content": {
             "application/json": {
               "schema": {
-                "$ref": "#/components/schemas/UploadFileResult"
+                "$ref": "#/components/schemas/UploadFileResponseBody"
+              }
+            }
+          },
+          "400": {
+            "description": "Service error relating to uploading the file",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponseBody"
+                }
+              }
+            }
+          }
+        },
+        "security": [
+          {
+            "ApiKey": [
+              "admin"
+            ]
+          }
+        ]
+      }
+    },
+    "/api/file/csv_or_jsonl": {
+      "post": {
+        "tags": [
+          "File"
+        ],
+        "summary": "Create Presigned CSV/JSONL S3 PUT URL",
+        "description": "This route is useful for uploading very large CSV or JSONL files. Once you have completed the upload, chunks will be automatically created from the file for each line in the CSV or JSONL file. The chunks will be indexed and searchable. Auth'ed user must be an admin or owner of the dataset's organization to upload a file.",
+        "operationId": "create_presigned_url_for_csv_jsonl",
+        "parameters": [
+          {
+            "name": "TR-Dataset",
+            "in": "header",
+            "description": "The dataset id or tracking_id to use for the request. We assume you intend to use an id if the value is a valid uuid.",
+            "required": true,
+            "schema": {
+              "type": "string",
+              "format": "uuid"
+            }
+          }
+        ],
+        "requestBody": {
+          "description": "JSON request payload to upload a CSV or JSONL file",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/CreatePresignedUrlForCsvJsonlReqPayload"
+              }
+            }
+          },
+          "required": true
+        },
+        "responses": {
+          "200": {
+            "description": "File object information and signed put URL",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/CreatePresignedUrlForCsvJsonResponseBody"
               }
             }
           }
@@ -4330,8 +4392,8 @@
       "tags": [
         "File"
       ],
-      "summary": "Get File",
-      "description": "Download a file based on its id.",
+      "summary": "Get File Signed URL",
+      "description": "Get a signed s3 url corresponding to the file_id requested such that you can download the file.",
       "operationId": "get_file_handler",
       "parameters": [
         {
@@ -4357,7 +4419,7 @@
       ],
       "responses": {
         "200": {
-          "description": "The signed s3 url corresponding to the file_id requested",
+          "description": "The file's information and s3_url where the original file can be downloaded",
           "content": {
             "application/json": {
               "schema": {
@@ -7911,6 +7973,46 @@
        "tracking_id": "tracking_id"
      }
    },
+    "ChunkReqPayloadKeys": {
+      "type": "string",
+      "description": "The key in the ChunkReqPayload which you can map a column or field from the CSV or JSONL file to.",
+      "enum": [
+        "link",
+        "tag_set",
+        "num_value",
+        "tracking_id",
+        "time_stamp",
+        "lat",
+        "lon",
+        "image_urls",
+        "weight",
+        "boost_phrase"
+      ]
+    },
+    "ChunkReqPayloadMapping": {
+      "type": "object",
+      "description": "Express a mapping between a column or field in a CSV or JSONL field and a key in the ChunkReqPayload created for each row or object.",
+      "required": [
+        "csv_jsonl_field",
+        "chunk_req_payload_key"
+      ],
+      "properties": {
+        "chunk_req_payload_key": {
+          "$ref": "#/components/schemas/ChunkReqPayloadKeys"
+        },
+        "csv_jsonl_field": {
+          "type": "string",
+          "description": "The column or field in the CSV or JSONL file that you want to map to a key in the ChunkReqPayload"
+        }
+      }
+    },
+    "ChunkReqPayloadMappings": {
+      "type": "array",
+      "items": {
+        "$ref": "#/components/schemas/ChunkReqPayloadMapping"
+      },
+      "description": "Specify all of the mappings between columns or fields in a CSV or JSONL file and keys in the ChunkReqPayload. Array fields like tag_set and image_urls can have multiple mappings. Boost phrase can also have multiple mappings which get concatenated. Other fields can only have one mapping and only the last mapping will be used."
+    },
    "ChunkReturnTypes": {
      "oneOf": [
        {
@@ -8829,6 +8931,105 @@
        }
      }
    },
+    "CreatePresignedUrlForCsvJsonResponseBody": {
+      "type": "object",
+      "required": [
+        "file_metadata",
+        "presigned_put_url"
+      ],
+      "properties": {
+        "file_metadata": {
+          "$ref": "#/components/schemas/File"
+        },
+        "presigned_put_url": {
+          "type": "string",
+          "description": "Signed URL to upload the file to."
+        }
+      }
+    },
+    "CreatePresignedUrlForCsvJsonlReqPayload": {
+      "type": "object",
+      "required": [
+        "file_name"
+      ],
+      "properties": {
+        "description": {
+          "type": "string",
+          "description": "Description is an optional convience field so you do not have to remember what the file contains or is about. It will be included on the group resulting from the file which will hold its chunk.",
+          "nullable": true
+        },
+        "file_name": {
+          "type": "string",
+          "description": "Name of the file being uploaded, including the extension. Will be used to determine CSV or JSONL for processing."
+        },
+        "fulltext_boost_factor": {
+          "type": "number",
+          "format": "double",
+          "description": "Amount to multiplicatevly increase the frequency of the tokens in the boost phrase for each row's chunk by. Applies to fulltext (SPLADE) and keyword (BM25) search.",
+          "nullable": true
+        },
+        "group_tracking_id": {
+          "type": "string",
+          "description": "Group tracking id is an optional field which allows you to specify the tracking id of the group that is created from the file. Chunks created will be created with the tracking id of `group_tracking_id|<index of chunk>`",
+          "nullable": true
+        },
+        "link": {
+          "type": "string",
+          "description": "Link to the file. This can also be any string. This can be used to filter when searching for the file's resulting chunks. The link value will not affect embedding creation.",
+          "nullable": true
+        },
+        "mappings": {
+          "allOf": [
+            {
+              "$ref": "#/components/schemas/ChunkReqPayloadMappings"
+            }
+          ],
+          "nullable": true
+        },
+        "metadata": {
+          "description": "Metadata is a JSON object which can be used to filter chunks. This is useful for when you want to filter chunks by arbitrary metadata. Unlike with tag filtering, there is a performance hit for filtering on metadata. Will be passed down to the file's chunks.",
+          "nullable": true
+        },
+        "semantic_boost_factor": {
+          "type": "number",
+          "format": "double",
+          "description": "Arbitrary float (positive or negative) specifying the multiplicate factor to apply before summing the phrase vector with the chunk_html embedding vector. Applies to semantic (embedding model) search.",
+          "nullable": true
+        },
+        "tag_set": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "Tag set is a comma separated list of tags which will be passed down to the chunks made from the file. Each tag will be joined with what's creatd per row of the CSV or JSONL file.",
+          "nullable": true
+        },
+        "time_stamp": {
+          "type": "string",
+          "description": "Time stamp should be an ISO 8601 combined date and time without timezone. Time_stamp is used for time window filtering and recency-biasing search results. Will be passed down to the file's chunks.",
+          "nullable": true
+        },
+        "upsert_by_tracking_id": {
+          "type": "boolean",
+          "description": "Upsert by tracking_id. If true, chunks will be upserted by tracking_id. If false, chunks with the same tracking_id as another already existing chunk will be ignored. Defaults to true.",
+          "nullable": true
+        }
+      },
+      "example": {
+        "description": "This is an example file",
+        "file_name": "example.pdf",
+        "link": "https://example.com",
+        "metadata": {
+          "key1": "value1",
+          "key2": "value2"
+        },
+        "tag_set": [
+          "tag1",
+          "tag2"
+        ],
+        "time_stamp": "2021-01-01 00:00:00.000Z"
+      }
+    },
    "CreateSetupCheckoutSessionResPayload": {
      "type": "object",
      "required": [
@@ -16081,15 +16282,14 @@
      },
      "use_pdf2md_ocr": {
        "type": "boolean",
-        "description": "Parameter to use pdf2md_ocr. If true, the file will be converted to markdown using gpt-4o.\nDefault is false.",
+        "description": "Parameter to use pdf2md_ocr. If true, the file will be converted to markdown using gpt-4o. Default is false.",
        "nullable": true
      }
    },
    "example": {
-      "base64_file": "base64_encoded_file",
+      "base64_file": "<base64_encoded_file>",
      "create_chunks": true,
      "description": "This is an example file",
-      "file_mime_type": "application/pdf",
      "file_name": "example.pdf",
      "link": "https://example.com",
      "metadata": {
@@ -16110,7 +16310,7 @@
      "use_pdf2md_ocr": false
    }
  },
-  "UploadFileResult": {
+  "UploadFileResponseBody": {
    "type": "object",
    "required": [
      "file_metadata"

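Taken together, the new route and schemas describe a two-step flow: POST `/api/file/csv_or_jsonl` to create a `File` record and receive a `presigned_put_url`, then PUT the raw bytes to that URL so the worker can create one chunk per row. Below is a minimal sketch of that flow. The path, `TR-Dataset` header, and payload fields come from the spec above; the base URL, ids, API key, and the use of the `Authorization` header for the `ApiKey` scheme are placeholder assumptions.

```ts
// Sketch (not from the commit): upload a CSV via the new presigned-URL route,
// mapping CSV columns onto ChunkReqPayload keys.
async function uploadCsvViaPresignedUrl(csv: string): Promise<void> {
  const baseUrl = "http://localhost:8090"; // hypothetical deployment

  // Step 1: request a presigned S3 PUT URL for the file.
  const presignRes = await fetch(`${baseUrl}/api/file/csv_or_jsonl`, {
    method: "POST",
    headers: {
      "TR-Dataset": "<dataset-id>", // placeholder
      Authorization: "<api-key>", // assumed header for the ApiKey scheme
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      file_name: "products.csv", // extension selects CSV vs JSONL processing
      group_tracking_id: "products-file",
      // ChunkReqPayloadMappings: per the schema, array keys like tag_set may
      // appear in multiple mappings; scalar keys keep only the last mapping.
      mappings: [
        { csv_jsonl_field: "sku", chunk_req_payload_key: "tracking_id" },
        { csv_jsonl_field: "category", chunk_req_payload_key: "tag_set" },
      ],
    }),
  });
  const { presigned_put_url } = await presignRes.json();

  // Step 2: PUT the raw file body to S3; the worker then creates and indexes
  // one chunk per row asynchronously.
  await fetch(presigned_put_url, {
    method: "PUT",
    body: csv,
    headers: { "Content-Type": "text/csv" },
  });
}
```
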
clients/ts-sdk/src/__tests__/constants.ts (+10 -10)

@@ -2,19 +2,19 @@ import { TrieveSDK } from "../sdk";
 
 export const GROUP_EXAMPLE_ID = "460e5ee8-98bc-4fed-b4ec-68f4d6453e5f";
 export const GROUP_EXAMPLE_TRACKING_ID = "1234";
-export const TRIEVE = new TrieveSDK({
-  apiKey: "tr-mKHF9sstPHQHcCbh6Qk6Uw54hx7uwDGU",
-  datasetId: "6cba9148-9cbb-417a-a955-93ea749ef27c",
-  organizationId: "de73679c-707f-4fc2-853e-994c910d944c",
-});
-
 // export const TRIEVE = new TrieveSDK({
-//   baseUrl: "http://localhost:8090",
-//   organizationId: "967d4740-d8f0-4f3a-8a62-3c1297e5f6c4",
-//   datasetId: "88fb2a53-17bd-4311-9763-051dc5c9c476",
-//   apiKey: "tr-5OiU6tPsjgcMz0AeujPbKlBJFqeXVJ9G",
+//   apiKey: "tr-mKHF9sstPHQHcCbh6Qk6Uw54hx7uwDGU",
+//   datasetId: "6cba9148-9cbb-417a-a955-93ea749ef27c",
+//   organizationId: "de73679c-707f-4fc2-853e-994c910d944c",
 // });
 
+export const TRIEVE = new TrieveSDK({
+  baseUrl: "http://localhost:8090",
+  organizationId: "61593129-7394-49ec-a3c7-c59faa7a7c5d",
+  datasetId: "11e55b82-1c20-43a8-90af-576a2697c282",
+  apiKey: "tr-hTtHCq4gOfwMGzluCSxZBtfS4ktmmqiq",
+});
+
 export const EXAMPLE_TOPIC_ID = "f85984e1-7818-4971-b300-2f462fe1a5a2";
 export const EXAMPLE_MESSAGE_ID = "48d0d2ef-3bfa-4124-8625-3c625ffa45a6";

clients/ts-sdk/src/functions/file/file.test.ts (+40 -9)

@@ -1,6 +1,11 @@
-import { beforeAll, describe, expectTypeOf } from "vitest";
+import { beforeAll, describe, expect, expectTypeOf } from "vitest";
 import { TrieveSDK } from "../../sdk";
-import { File, FileDTO, UploadFileResult } from "../../types.gen";
+import {
+  CreatePresignedUrlForCsvJsonResponseBody,
+  File,
+  FileDTO,
+  UploadFileResponseBody,
+} from "../../types.gen";
 import { EXAMPLE_FILE_ID, TRIEVE } from "../../__tests__/constants";
 import fs from "fs";
 import { test } from "../../__tests__/utils";
@@ -24,7 +29,33 @@ describe("File Tests", async () => {
       file_name: "uploadme.pdf",
       group_tracking_id: "file-upload-group",
     });
-    expectTypeOf(data).toEqualTypeOf<UploadFileResult>();
+    expectTypeOf(data).toEqualTypeOf<UploadFileResponseBody>();
+  });
+
+  test("createPresignedUrlForCsvJsonl", async () => {
+    const data = await trieve.createPresignedUrlForCsvJsonl({
+      file_name: "uploadme.csv",
+      group_tracking_id: "file-upload-group",
+    });
+    expectTypeOf(
+      data
+    ).toEqualTypeOf<CreatePresignedUrlForCsvJsonResponseBody>();
+
+    const presignedPutUrl = data.presigned_put_url;
+    const fileResponse = await fetch(
+      "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
+    );
+    const blob = await fileResponse.blob();
+
+    const uploadResponse = await fetch(presignedPutUrl, {
+      method: "PUT",
+      body: blob,
+      headers: {
+        "Content-Type": "text/csv",
+      },
+    });
+
+    expect(uploadResponse.ok).toBeTruthy();
   });
 
   test("getFilesForDataset", async () => {
@@ -34,10 +65,10 @@ describe("File Tests", async () => {
     expectTypeOf(data).toEqualTypeOf<File[]>();
   });
 
-  test("getFile", async () => {
-    const data = await trieve.getFile({
-      fileId: EXAMPLE_FILE_ID,
-    });
-    expectTypeOf(data).toEqualTypeOf<FileDTO>();
-  });
+  // test("getFile", async () => {
+  //   const data = await trieve.getFile({
+  //     fileId: EXAMPLE_FILE_ID,
+  //   });
+  //   expectTypeOf(data).toEqualTypeOf<FileDTO>();
+  // });
 });
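For contrast with the presigned-URL flow the new test exercises, here is a sketch of the classic upload route whose description this commit rewrites, choosing between its two parsing strategies. The `base64_file` and `use_pdf2md_ocr` fields come from the spec above; the `uploadFile` method name, the package name, and the client config are assumptions not visible in this excerpt.

```ts
// Sketch (not from the commit): the classic upload route's two strategies.
import fs from "fs";
import { TrieveSDK } from "trieve-ts-sdk"; // assumed package name

const trieve = new TrieveSDK({
  apiKey: "<api-key>", // placeholder credentials
  datasetId: "<dataset-id>",
});

// Read the file and base64-encode it, as the spec's example payload shows.
const base64File = fs.readFileSync("uploadme.pdf").toString("base64");

await trieve.uploadFile({
  file_name: "uploadme.pdf",
  base64_file: base64File,
  // false/omitted => Apache Tika text extraction with segment-based chunking;
  // true => vision-LLM (pdf2md) markdown conversion, one chunk per page.
  use_pdf2md_ocr: true,
});
```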
