Skip to content

Commit 322ec12

Browse files
committed
feature: add flag to enable pdf2md
1 parent 5276259 commit 322ec12

File tree

3 files changed

+35
-20
lines changed

3 files changed

+35
-20
lines changed

frontends/search/src/components/UploadFile.tsx

+33 -19
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ interface RequestBody {
1717
group_tracking_id?: string;
1818
metadata: any;
1919
time_stamp?: string;
20+
use_pdf2md_ocr?: boolean;
2021
}
2122

2223
export const UploadFile = () => {
@@ -38,6 +39,7 @@ export const UploadFile = () => {
3839
const [splitDelimiters, setSplitDelimiters] = createSignal([".", "?", "\\n"]);
3940
const [targetSplitsPerChunk, setTargetSplitsPerChunk] = createSignal(20);
4041
const [rebalanceChunks, setRebalanceChunks] = createSignal(false);
42+
const [useGptChunking, setUseGptChunking] = createSignal(false);
4143
const [groupTrackingId, setGroupTrackingId] = createSignal("");
4244

4345
const [showFileInput, setShowFileInput] = createSignal(true);
@@ -136,19 +138,20 @@ export const UploadFile = () => {
136138
});
137139

138140
const requestBodyTemplate: Omit<RequestBody, "base64_file" | "file_name"> =
139-
{
140-
link: link() === "" ? undefined : link(),
141-
tag_set:
142-
tagSet().split(",").length > 0 ? undefined : tagSet().split(","),
143-
split_delimiters: splitDelimiters(),
144-
target_splits_per_chunk: targetSplitsPerChunk(),
145-
rebalance_chunks: rebalanceChunks(),
146-
group_tracking_id:
147-
groupTrackingId() === "" ? undefined : groupTrackingId(),
148-
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
149-
metadata: metadata(),
150-
time_stamp: timestamp() ? timestamp() + " 00:00:00" : undefined,
151-
};
141+
{
142+
link: link() === "" ? undefined : link(),
143+
tag_set:
144+
tagSet().split(",").length > 0 ? undefined : tagSet().split(","),
145+
split_delimiters: splitDelimiters(),
146+
target_splits_per_chunk: targetSplitsPerChunk(),
147+
rebalance_chunks: rebalanceChunks(),
148+
use_pdf2md_ocr: useGptChunking(),
149+
group_tracking_id:
150+
groupTrackingId() === "" ? undefined : groupTrackingId(),
151+
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
152+
metadata: metadata(),
153+
time_stamp: timestamp() ? timestamp() + " 00:00:00" : undefined,
154+
};
152155

153156
const uploadFilePromises = files().map(async (file) => {
154157
let base64File = await toBase64(file);
@@ -323,15 +326,27 @@ export const UploadFile = () => {
323326
onInput={(e) => setRebalanceChunks(e.currentTarget.checked)}
324327
class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
325328
/>
329+
<div class="flex flex-row items-center space-x-2">
330+
<div>Use gpt4o chunking</div>
331+
<Tooltip
332+
body={<BsInfoCircle />}
333+
tooltipText="Use gpt4o chunking. If set to true, Trieve will use the gpt4o model to chunk the document if it is a pdf file. This is an experimental feature and may not work as expected."
334+
/>
335+
</div>
336+
<input
337+
type="checkbox"
338+
checked={useGptChunking()}
339+
onInput={(e) => setUseGptChunking(e.currentTarget.checked)}
340+
class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
341+
/>
326342
</div>
327343
</Show>
328344
<div class="m-1 mb-1 flex flex-row gap-2">
329345
<button
330-
class={`rounded border-2 border-magenta p-2 px-4 font-semibold ${
331-
showFileInput()
346+
class={`rounded border-2 border-magenta p-2 px-4 font-semibold ${showFileInput()
332347
? "bg-magenta-600 text-white"
333348
: "text-magenta hover:bg-magenta-500 hover:text-white"
334-
}`}
349+
}`}
335350
onClick={() => {
336351
setFiles([]);
337352
setShowFileInput(true);
@@ -341,11 +356,10 @@ export const UploadFile = () => {
341356
Select Files
342357
</button>
343358
<button
344-
class={`rounded border-2 border-magenta p-2 px-4 font-semibold ${
345-
showFolderInput()
359+
class={`rounded border-2 border-magenta p-2 px-4 font-semibold ${showFolderInput()
346360
? "bg-magenta-600 text-white"
347361
: "text-magenta hover:bg-magenta-500 hover:text-white"
348-
}`}
362+
}`}
349363
onClick={() => {
350364
setFiles([]);
351365
setShowFolderInput(true);

server/src/bin/file-worker.rs

+1 -1
Original file line number | Diff line number | Diff line change
@@ -353,7 +353,7 @@ async fn upload_file(
353353

354354
if file_name.ends_with(".pdf") {
355355
if let Some(true) = file_worker_message.upload_file_data.use_pdf2md_ocr {
356-
// Send file to router PDF2MD
356+
log::info!("Using pdf2md for OCR for file");
357357
let pdf2md_url = std::env::var("PDF2MD_URL")
358358
.expect("PDF2MD_URL must be set")
359359
.to_string();

server/src/handlers/file_handler.rs

+1
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,7 @@ pub fn validate_file_name(s: String) -> Result<String, actix_web::Error> {
5656
"create_chunks": true,
5757
"split_delimiters": [",",".","\n"],
5858
"target_splits_per_chunk": 20,
59+
"use_pdf2md_ocr": false
5960
}))]
6061
pub struct UploadFileReqPayload {
6162
/// Base64 encoded file. This is the standard base64url encoding.

0 commit comments

Comments
 (0)