Skip to content

Commit a1f0d24

Browse files
committed
feature: incrementally add pages
1 parent c0b3c3b commit a1f0d24

File tree

8 files changed

+264
-181
lines changed

8 files changed

+264
-181
lines changed

frontends/search/src/components/UploadFile.tsx

+6-2
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,11 @@ interface RequestBody {
1717
group_tracking_id?: string;
1818
metadata: any;
1919
time_stamp?: string;
20-
use_pdf2md_ocr?: boolean;
20+
pdf2md_options?: {
21+
use_pdf2md_ocr: boolean;
22+
system_prompt?: string;
23+
split_headings?: boolean;
24+
};
2125
}
2226

2327
export const UploadFile = () => {
@@ -145,7 +149,7 @@ export const UploadFile = () => {
145149
split_delimiters: splitDelimiters(),
146150
target_splits_per_chunk: targetSplitsPerChunk(),
147151
rebalance_chunks: rebalanceChunks(),
148-
use_pdf2md_ocr: useGptChunking(),
152+
pdf2md_options: { use_pdf2md_ocr: useGptChunking() },
149153
group_tracking_id:
150154
groupTrackingId() === "" ? undefined : groupTrackingId(),
151155
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment

pdf2md/server/src/models.rs

+3-3
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,7 @@ impl From<TaskResponse> for Vec<Chunk> {
251251

252252
#[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
253253
pub struct GetTaskRequest {
254-
pub pagination_token: Option<uuid::Uuid>,
254+
pub pagination_token: Option<u32>,
255255
pub limit: Option<u32>,
256256
}
257257

@@ -265,7 +265,7 @@ pub struct GetTaskResponse {
265265
pub status: String,
266266
pub created_at: String,
267267
pub pages: Option<Vec<Chunk>>,
268-
pub pagination_token: Option<String>,
268+
pub pagination_token: Option<u32>,
269269
}
270270

271271
impl GetTaskResponse {
@@ -296,7 +296,7 @@ impl GetTaskResponse {
296296
pages_processed: task.pages_processed,
297297
status: task.status,
298298
created_at: task.created_at.to_string(),
299-
pagination_token: pages.last().map(|c| c.id.clone()),
299+
pagination_token: pages.last().map(|c| c.page),
300300
pages: Some(pages.into_iter().map(Chunk::from).collect()),
301301
}
302302
}

pdf2md/server/src/operators/clickhouse.rs

+3-3
Original file line number | Diff line number | Diff line change
@@ -170,18 +170,18 @@ pub async fn get_task(
170170
pub async fn get_task_pages(
171171
task: FileTaskClickhouse,
172172
limit: Option<u32>,
173-
offset_id: Option<uuid::Uuid>,
173+
offset_id: Option<u32>,
174174
clickhouse_client: &clickhouse::Client,
175175
) -> Result<Vec<ChunkClickhouse>, ServiceError> {
176176
if FileTaskStatus::from(task.status.clone()) == FileTaskStatus::Completed || task.pages > 0 {
177177
let limit = limit.unwrap_or(20);
178178

179179
let pages: Vec<ChunkClickhouse> = clickhouse_client
180180
.query(
181-
"SELECT ?fields FROM file_chunks WHERE task_id = ? AND id > ? ORDER BY page LIMIT ?",
181+
"SELECT ?fields FROM file_chunks WHERE task_id = ? AND page > ? ORDER BY page LIMIT ?",
182182
)
183183
.bind(task.id.clone())
184-
.bind(offset_id.unwrap_or(uuid::Uuid::nil()))
184+
.bind(offset_id.unwrap_or(0))
185185
.bind(limit)
186186
.fetch_all()
187187
.await

pdf2md/server/src/routes/jinja_templates.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ use crate::{
33
get_env, Templates,
44
};
55
use actix_web::{get, HttpResponse};
6-
use minijinja::{context, path_loader, Environment};
6+
use minijinja::context;
77

88
#[utoipa::path(
99
get,

server/src/bin/csv-jsonl-worker.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -505,7 +505,7 @@ async fn process_csv_jsonl_file(
505505
rebalance_chunks: Some(false),
506506
split_delimiters: None,
507507
target_splits_per_chunk: None,
508-
use_pdf2md_ocr: None,
508+
pdf2md_options: None,
509509
base64_file: "".to_string(),
510510
},
511511
csv_jsonl_worker_message.dataset_id,

0 commit comments

Comments
 (0)