Skip to content

Commit

Permalink
feature: completed testing of csv and jsonl ingests with the new worker
Browse files Browse the repository at this point in the history
  • Loading branch information
skeptrunedev authored and cdxker committed Dec 7, 2024
1 parent c979f0c commit ce05422
Show file tree
Hide file tree
Showing 13 changed files with 1,046 additions and 33 deletions.
52 changes: 49 additions & 3 deletions .github/workflows/push-server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}


file_worker:
csv_jsonl_worker:
name: Push CSV JSONL Worker image
runs-on: ${{ matrix.runner }}
strategy:
Expand Down Expand Up @@ -187,7 +187,7 @@ jobs:
with:
# list of Docker images to use as base name for tags
images: |
trieve/file_worker
trieve/csv_jsonl_worker
tags: |
type=raw,latest
type=sha
Expand All @@ -197,11 +197,57 @@ jobs:
with:
platforms: ${{ matrix.platform }}
context: server/
file: ./server/Dockerfile.file-worker
file: ./server/Dockerfile.csv-jsonl-worker
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

file_worker:
name: Push File Worker image
runs-on: ${{ matrix.runner }}
strategy:
matrix:
runner: [blacksmith-8vcpu-ubuntu-2204]
platform: [linux/amd64]
exclude:
- runner: blacksmith-8vcpu-ubuntu-2204
platform: linux/arm64
- runner: blacksmith-8vcpu-ubuntu-2204-arm
platform: linux/amd64
steps:
- name: Checkout the repo
uses: actions/checkout@v4

- name: Setup buildx
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
trieve/file_worker
tags: |
type=raw,latest
type=sha
- name: Build and push Docker image
uses: useblacksmith/[email protected]
with:
platforms: ${{ matrix.platform }}
context: server/
file: ./server/Dockerfile.file-worker
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}

group_worker:
name: Push Group worker
runs-on: ${{ matrix.runner }}
Expand Down
13 changes: 8 additions & 5 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -7973,7 +7973,7 @@
"tracking_id": "tracking_id"
}
},
"ChunkReqPayloadKeys": {
"ChunkReqPayloadFields": {
"type": "string",
"description": "The field in the ChunkReqPayload which you can map a column or field from the CSV or JSONL file to.",
"enum": [
Expand All @@ -7994,11 +7994,11 @@
"description": "Express a mapping between a column or field in a CSV or JSONL file and a field in the ChunkReqPayload created for each row or object.",
"required": [
"csv_jsonl_field",
"chunk_req_payload_key"
"chunk_req_payload_field"
],
"properties": {
"chunk_req_payload_key": {
"$ref": "#/components/schemas/ChunkReqPayloadKeys"
"chunk_req_payload_field": {
"$ref": "#/components/schemas/ChunkReqPayloadFields"
},
"csv_jsonl_field": {
"type": "string",
Expand Down Expand Up @@ -9938,7 +9938,10 @@
"group_chunks_action_failed",
"crawl_completed",
"crawl_failed",
"crawl_started"
"crawl_started",
"csv_jsonl_processing_failed",
"csv_jsonl_processing_checkpoint",
"csv_jsonl_processing_completed"
]
},
"EventTypes": {
Expand Down
2 changes: 1 addition & 1 deletion clients/ts-sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"files": [
"dist"
],
"version": "0.0.38",
"version": "0.0.39",
"license": "MIT",
"scripts": {
"lint": "eslint 'src/**/*.ts'",
Expand Down
20 changes: 10 additions & 10 deletions clients/ts-sdk/src/__tests__/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@ import { TrieveSDK } from "../sdk";

export const GROUP_EXAMPLE_ID = "460e5ee8-98bc-4fed-b4ec-68f4d6453e5f";
export const GROUP_EXAMPLE_TRACKING_ID = "1234";
// export const TRIEVE = new TrieveSDK({
// apiKey: "tr-mKHF9sstPHQHcCbh6Qk6Uw54hx7uwDGU",
// datasetId: "6cba9148-9cbb-417a-a955-93ea749ef27c",
// organizationId: "de73679c-707f-4fc2-853e-994c910d944c",
// });

export const TRIEVE = new TrieveSDK({
baseUrl: "http://localhost:8090",
organizationId: "61593129-7394-49ec-a3c7-c59faa7a7c5d",
datasetId: "11e55b82-1c20-43a8-90af-576a2697c282",
apiKey: "tr-hTtHCq4gOfwMGzluCSxZBtfS4ktmmqiq",
apiKey: "tr-mKHF9sstPHQHcCbh6Qk6Uw54hx7uwDGU",
datasetId: "6cba9148-9cbb-417a-a955-93ea749ef27c",
organizationId: "de73679c-707f-4fc2-853e-994c910d944c",
});

// export const TRIEVE = new TrieveSDK({
// baseUrl: "http://localhost:8090",
// organizationId: "61593129-7394-49ec-a3c7-c59faa7a7c5d",
// datasetId: "f83f08ef-c05d-421c-baf1-4f1509ea069b",
// apiKey: "tr-hTtHCq4gOfwMGzluCSxZBtfS4ktmmqiq",
// });

export const EXAMPLE_TOPIC_ID = "f85984e1-7818-4971-b300-2f462fe1a5a2";
export const EXAMPLE_MESSAGE_ID = "48d0d2ef-3bfa-4124-8625-3c625ffa45a6";

Expand Down
68 changes: 61 additions & 7 deletions clients/ts-sdk/src/functions/file/file.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,64 @@ describe("File Tests", async () => {
expectTypeOf(data).toEqualTypeOf<UploadFileResponseBody>();
});

test("createPresignedUrlForCsvJsonl", async () => {
test("createPresignedUrlForJsonl", async () => {
const data = await trieve.createPresignedUrlForCsvJsonl({
file_name: "flipkart.jsonl",
group_tracking_id: "flipkart-file-upload-group",
mappings: [
{
csv_jsonl_field: "product_url",
chunk_req_payload_field: "link",
},
{
csv_jsonl_field: "retail_price",
chunk_req_payload_field: "num_value",
},
{
csv_jsonl_field: "image",
chunk_req_payload_field: "image_urls",
},
{
csv_jsonl_field: "uniq_id",
chunk_req_payload_field: "tracking_id",
},
],
});
expectTypeOf(
data
).toEqualTypeOf<CreatePresignedUrlForCsvJsonResponseBody>();

const presignedPutUrl = data.presigned_put_url;
const fileResponse = await fetch(
"https://trieve.b-cdn.net/csvjsonltesting/flipkart_com-ecommerce_sample.jsonl"
);
const blob = await fileResponse.blob();

const uploadResponse = await fetch(presignedPutUrl, {
method: "PUT",
body: blob,
headers: {
"Content-Type": "text/jsonl",
},
});

expect(uploadResponse.ok).toBeTruthy();
});

test("createPresignedUrlForCsv", async () => {
const data = await trieve.createPresignedUrlForCsvJsonl({
file_name: "uploadme.csv",
group_tracking_id: "file-upload-group",
mappings: [
{
csv_jsonl_field: "PassengerId",
chunk_req_payload_field: "tracking_id",
},
{
csv_jsonl_field: "Name",
chunk_req_payload_field: "tag_set",
},
],
});
expectTypeOf(
data
Expand Down Expand Up @@ -65,10 +119,10 @@ describe("File Tests", async () => {
expectTypeOf(data).toEqualTypeOf<File[]>();
});

// test("getFile", async () => {
// const data = await trieve.getFile({
// fileId: EXAMPLE_FILE_ID,
// });
// expectTypeOf(data).toEqualTypeOf<FileDTO>();
// });
test("getFile", async () => {
const data = await trieve.getFile({
fileId: EXAMPLE_FILE_ID,
});
expectTypeOf(data).toEqualTypeOf<FileDTO>();
});
});
6 changes: 3 additions & 3 deletions clients/ts-sdk/src/types.gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -432,13 +432,13 @@ export type ChunkReqPayload = {
/**
 * The field in the ChunkReqPayload which you can map a column or field from the CSV or JSONL file to.
*/
export type ChunkReqPayloadKeys = 'link' | 'tag_set' | 'num_value' | 'tracking_id' | 'time_stamp' | 'lat' | 'lon' | 'image_urls' | 'weight' | 'boost_phrase';
export type ChunkReqPayloadFields = 'link' | 'tag_set' | 'num_value' | 'tracking_id' | 'time_stamp' | 'lat' | 'lon' | 'image_urls' | 'weight' | 'boost_phrase';

/**
 * Express a mapping between a column or field in a CSV or JSONL file and a field in the ChunkReqPayload created for each row or object.
*/
export type ChunkReqPayloadMapping = {
chunk_req_payload_key: ChunkReqPayloadKeys;
chunk_req_payload_field: ChunkReqPayloadFields;
/**
* The column or field in the CSV or JSONL file that you want to map to a key in the ChunkReqPayload
*/
Expand Down Expand Up @@ -1210,7 +1210,7 @@ export type EventReturn = {
page_count: number;
};

export type EventTypeRequest = 'file_uploaded' | 'file_upload_failed' | 'chunks_uploaded' | 'chunk_action_failed' | 'chunk_updated' | 'bulk_chunks_deleted' | 'dataset_delete_failed' | 'qdrant_upload_failed' | 'bulk_chunk_upload_failed' | 'group_chunks_updated' | 'group_chunks_action_failed' | 'crawl_completed' | 'crawl_failed' | 'crawl_started';
export type EventTypeRequest = 'file_uploaded' | 'file_upload_failed' | 'chunks_uploaded' | 'chunk_action_failed' | 'chunk_updated' | 'bulk_chunks_deleted' | 'dataset_delete_failed' | 'qdrant_upload_failed' | 'bulk_chunk_upload_failed' | 'group_chunks_updated' | 'group_chunks_action_failed' | 'crawl_completed' | 'crawl_failed' | 'crawl_started' | 'csv_jsonl_processing_failed' | 'csv_jsonl_processing_checkpoint' | 'csv_jsonl_processing_completed';

export type EventTypes = {
/**
Expand Down
18 changes: 18 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,24 @@ services:
condition: service_healthy
env_file: .env

csv-jsonl-worker:
image: trieve/csv_jsonl_worker
build:
context: ./server/
dockerfile: Dockerfile.csv-jsonl-worker
restart: always
network_mode: "host"
depends_on:
tika:
condition: service_healthy
db:
condition: service_healthy
qdrant-database:
condition: service_started
redis:
condition: service_healthy
env_file: .env

delete-worker:
image: trieve/delete_worker
build:
Expand Down
4 changes: 4 additions & 0 deletions server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ path = "src/bin/word-worker.rs"
name = "file-worker"
path = "src/bin/file-worker.rs"

[[bin]]
name = "csv-jsonl-worker"
path = "src/bin/csv-jsonl-worker.rs"

[[bin]]
name = "grupdate-worker"
path = "src/bin/grupdate-worker.rs"
Expand Down
28 changes: 28 additions & 0 deletions server/Dockerfile.csv-jsonl-worker
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Multi-stage build for the csv-jsonl-worker binary, using cargo-chef so the
# dependency-compilation layer is cached across rebuilds.
FROM rust:1.81-slim-bookworm AS chef
# We only pay the installation cost once,
# it will be cached from the second build onwards
RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev g++ curl
RUN cargo install cargo-chef
# Absolute path so the /app references in later stages are unambiguous.
WORKDIR /app

FROM chef AS planner
COPY . .
# Produce the dependency recipe used by the builder stage's cook step.
RUN cargo chef prepare --recipe-path recipe.json

FROM chef AS builder
COPY --from=planner /app/recipe.json recipe.json
# Build dependencies - this is the caching Docker layer!
RUN cargo chef cook --release --recipe-path recipe.json --bin "csv-jsonl-worker"
# Build application
COPY . .
RUN cargo build --release --features "runtime-env" --bin "csv-jsonl-worker"

# Minimal runtime image: only runtime libs, CA certs, and the migrations the
# worker applies on startup.
FROM debian:bookworm-slim AS runtime
RUN apt-get update -y && apt-get -y install pkg-config libssl-dev libpq-dev ca-certificates
WORKDIR /app
COPY ./migrations/ /app/migrations
# Fixed: the original line had stray double quotes in both paths
# (…/csv-jsonl-worker" /app/csv-jsonl-worker"), which broke the COPY.
COPY --from=builder /app/target/release/csv-jsonl-worker /app/csv-jsonl-worker

EXPOSE 8090
ENTRYPOINT ["/app/csv-jsonl-worker"]
Loading

0 comments on commit ce05422

Please sign in to comment.