Skip to content

Commit

Permalink
feat: working pdf highlighting
Browse files Browse the repository at this point in the history
  • Loading branch information
drew-harris committed Dec 21, 2024
1 parent 7f47e8e commit a9ba38d
Show file tree
Hide file tree
Showing 14 changed files with 327 additions and 62 deletions.
2 changes: 1 addition & 1 deletion clients/search-component/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"@r2wc/react-to-web-component": "^2.0.3",
"@thumbmarkjs/thumbmarkjs": "^0.14.8",
"react-markdown": "^9.0.1",
"react-pdf-spotlight": "^0.0.6",
"react-pdf-spotlight": "^0.0.7",
"react-snap-carousel": "^0.5.0",
"trieve-ts-sdk": "*"
},
Expand Down
66 changes: 64 additions & 2 deletions clients/search-component/src/TrieveModal/Search/PdfItem.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import React from "react";
import React, { useEffect, useState } from "react";
import { PdfChunk } from "../../utils/types";
import { useFileContext } from "../../utils/hooks/file-context";
import { FileDTO } from "trieve-ts-sdk";
import { useModalState } from "../../utils/hooks/modal-context";
import { cached } from "../../utils/cache";
import { PdfSpotlight } from "react-pdf-spotlight";
import { file } from "bun";

type Props = {
item: PdfChunk;
Expand All @@ -14,12 +20,68 @@ function extractMarkedContent(text: string): string {
return match ? match[1] : "";
}

/**
 * Requests a presigned download URL for a file from the
 * `GET {baseUrl}/api/file/{fileId}` endpoint.
 *
 * The request always passes `content_type=application/pdf` as a query
 * parameter and authenticates with the `TR-Dataset` and `Authorization`
 * headers.
 *
 * @param baseUrl - Origin of the API, without a trailing slash.
 * @param datasetId - Value sent in the `TR-Dataset` header.
 * @param fileId - Id of the file; it is URL-encoded before being placed in the path.
 * @param apiKey - Key sent as a bearer token.
 * @returns The `s3_url` field of the JSON response.
 * @throws Error when `fileId` is empty, when the response status is not OK,
 *   or when the response body carries no string `s3_url`.
 */
const getPresignedUrl = async (
  baseUrl: string,
  datasetId: string,
  fileId: string,
  apiKey: string,
): Promise<string> => {
  // Callers look the id up in a map that may not contain it yet; fail loudly
  // instead of requesting `/api/file/undefined`.
  if (!fileId) {
    throw new Error("Error fetching presigned url: missing file id");
  }

  const queryParams = new URLSearchParams({
    content_type: "application/pdf",
  }).toString();

  const result = await fetch(
    `${baseUrl}/api/file/${encodeURIComponent(fileId)}?${queryParams}`,
    {
      headers: {
        "TR-Dataset": datasetId,
        Authorization: `Bearer ${apiKey}`,
      },
    },
  );

  if (!result.ok) {
    throw new Error(
      `Error fetching presigned url: ${result.status} ${result.statusText}`,
    );
  }

  // The body is untrusted JSON, so check the one field we rely on before
  // handing it back typed as a string.
  const body = (await result.json()) as Partial<FileDTO> | null;
  const s3Url = body?.s3_url;
  if (typeof s3Url !== "string") {
    throw new Error("Error fetching presigned url: response has no s3_url");
  }

  return s3Url;
};

/**
 * Renders one PDF search result: looks up the file id for the chunk's
 * `file_name` in the file context, fetches a presigned URL for it, and shows
 * the chunk's page in a `PdfSpotlight` with the marked text highlighted.
 *
 * Nothing but the placeholder heading is rendered until a presigned URL is
 * available.
 */
export const PdfItem = (props: Props) => {
  const [presigned, setPresigned] = useState<string | null>(null);
  const toHighlight = extractMarkedContent(props.item.chunk.highlight || "");
  const fileCtx = useFileContext();
  const state = useModalState();

  const { baseUrl, datasetId, apiKey } = state.props;
  const fileName = props.item.chunk.metadata.file_name;
  // The file-name -> id map appears to be filled in asynchronously by the
  // provider, so the id may still be missing on the first render.
  const fileId: string | undefined = fileCtx.files[fileName];

  useEffect(() => {
    // Wait for the file map to contain this file; the effect re-runs when
    // `fileId` becomes available.
    if (!fileId) {
      return;
    }

    // Prevents a state update after unmount or after the inputs changed.
    let cancelled = false;

    const getPresigned = async () => {
      try {
        const presignedUrlResult = await cached(() => {
          return getPresignedUrl(
            baseUrl || "http://localhost:8090",
            datasetId,
            fileId,
            apiKey,
          );
        }, `file-presigned:${fileName}`);
        if (!cancelled) {
          setPresigned(presignedUrlResult);
        }
      } catch (e: unknown) {
        // Surface the failure instead of leaving an unhandled rejection.
        if (!cancelled) {
          console.error(`Failed to load presigned url for ${fileName}`, e);
        }
      }
    };

    void getPresigned();

    return () => {
      cancelled = true;
    };
  }, [fileId, fileName, baseUrl, datasetId, apiKey]);

  return (
    <div>
      <div>Pdf item</div>
      {presigned && (
        <div className="max-w-[400px]">
          <PdfSpotlight
            padding={{
              horizontal: 100,
            }}
            page={props.item.chunk.metadata.page}
            searchFor={toHighlight}
            url={presigned}
          ></PdfSpotlight>
        </div>
      )}
    </div>
  );
};
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export const SearchMode = () => {
index: number,
) => {
const isChunk = isChunkWithHighlights(result);
console.log(result);

// Target non group pdf search
if (isChunk && props.type === "pdf") {
Expand Down
19 changes: 14 additions & 5 deletions clients/search-component/src/utils/hooks/file-context.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ export const FileContextProvider = (props: { children: ReactNode }) => {

useEffect(() => {
const getFiles = async () => {
const page = 1;
let page = 1;
let done = false;
const fileMapResult: Record<string, string> = {};
while (!done) {
let totalPages = Number.MAX_SAFE_INTEGER;
while (!done && page <= totalPages) {
const files = await state.trieveSDK.trieve.fetch(
"/api/dataset/files/{dataset_id}/{page}",
"get",
Expand All @@ -25,14 +26,18 @@ export const FileContextProvider = (props: { children: ReactNode }) => {
},
);

if (files.length) {
files.reduce((acc, file) => {
acc[file.file_name] = file.id;
totalPages = files.total_pages;

if (files.file_and_group_ids.length) {
files.file_and_group_ids.reduce((acc, file) => {
acc[file.file.file_name] = file.file.id;
return acc;
}, fileMapResult);
} else {
done = true;
}

page += 1;
}

setFiles(fileMapResult);
Expand All @@ -46,3 +51,7 @@ export const FileContextProvider = (props: { children: ReactNode }) => {
</FileContext.Provider>
);
};

/**
 * Hook that reads the current value of `FileContext`.
 *
 * @returns The value supplied by the nearest `FileContext.Provider`.
 */
export function useFileContext() {
  const fileContext = React.useContext(FileContext);
  return fileContext;
}
1 change: 0 additions & 1 deletion clients/search-component/src/utils/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ export type PdfChunk = {
};

/**
 * Type guard that reports whether a search result is a PDF chunk, i.e. its
 * `chunk.metadata` carries a string `file_name`.
 *
 * Results whose `chunk` or `metadata` is missing or null are reported as
 * non-PDF instead of throwing.
 *
 * @param result - The search result to test.
 * @returns `true` when `result.chunk.metadata.file_name` is a string.
 */
export function isPdfChunk(result: ChunkWithHighlights): result is PdfChunk {
  // Optional chaining: metadata is not guaranteed to be present on every chunk.
  const fileName: unknown = (result as PdfChunk).chunk?.metadata?.file_name;
  return typeof fileName === "string";
}

Expand Down
50 changes: 46 additions & 4 deletions clients/ts-sdk/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -3671,10 +3671,7 @@
"content": {
"application/json": {
"schema": {
"type": "array",
"items": {
"$ref": "#/components/schemas/File"
}
"$ref": "#/components/schemas/FileData"
}
}
}
Expand Down Expand Up @@ -4544,6 +4541,16 @@
"type": "string",
"format": "uuid"
}
},
{
"name": "content_type",
"in": "query",
"description": "Optional field to override the presigned url's Content-Type header",
"required": false,
"schema": {
"type": "string",
"nullable": true
}
}
],
"responses": {
Expand Down Expand Up @@ -10783,6 +10790,22 @@
"updated_at": "2021-01-01 00:00:00.000"
}
},
"FileAndGroupId": {
"type": "object",
"required": [
"file"
],
"properties": {
"file": {
"$ref": "#/components/schemas/File"
},
"group_id": {
"type": "string",
"format": "uuid",
"nullable": true
}
}
},
"FileDTO": {
"type": "object",
"required": [
Expand Down Expand Up @@ -10837,6 +10860,25 @@
"updated_at": "2021-01-01 00:00:00.000"
}
},
"FileData": {
"type": "object",
"required": [
"file_and_group_ids",
"total_pages"
],
"properties": {
"file_and_group_ids": {
"type": "array",
"items": {
"$ref": "#/components/schemas/FileAndGroupId"
}
},
"total_pages": {
"type": "integer",
"format": "int64"
}
}
},
"FullTextBoost": {
"type": "object",
"description": "Boost the presence of certain tokens for fulltext (SPLADE) and keyword (BM25) search. I.e. boosting title phrases to prioritize title matches or making sure that the listing for AirBNB itself ranks higher than companies who make software for AirBNB hosts by boosting the in-document-frequency of the AirBNB token (AKA word) for its official listing. Conceptually it multiplies the in-document-importance second value in the tuples of the SPLADE or BM25 sparse vector of the chunk_html innerText for all tokens present in the boost phrase by the boost factor like so: (token, in-document-importance) -> (token, in-document-importance*boost_factor).",
Expand Down
Loading

0 comments on commit a9ba38d

Please sign in to comment.