Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: incrementally add pages #2932

Merged
merged 2 commits into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions frontends/search/src/components/UploadFile.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ interface RequestBody {
group_tracking_id?: string;
metadata: any;
time_stamp?: string;
use_pdf2md_ocr?: boolean;
pdf2md_options?: {
use_pdf2md_ocr: boolean;
system_prompt?: string;
split_headings?: boolean;
};
}

export const UploadFile = () => {
Expand All @@ -40,7 +44,10 @@ export const UploadFile = () => {
const [targetSplitsPerChunk, setTargetSplitsPerChunk] = createSignal(20);
const [rebalanceChunks, setRebalanceChunks] = createSignal(false);
const [useGptChunking, setUseGptChunking] = createSignal(false);
const [useHeadingBasedChunking, setUseHeadingBasedChunking] =
createSignal(false);
const [groupTrackingId, setGroupTrackingId] = createSignal("");
const [systemPrompt, setSystemPrompt] = createSignal("");

const [showFileInput, setShowFileInput] = createSignal(true);
const [showFolderInput, setShowFolderInput] = createSignal(false);
Expand Down Expand Up @@ -145,7 +152,11 @@ export const UploadFile = () => {
split_delimiters: splitDelimiters(),
target_splits_per_chunk: targetSplitsPerChunk(),
rebalance_chunks: rebalanceChunks(),
use_pdf2md_ocr: useGptChunking(),
pdf2md_options: {
use_pdf2md_ocr: useGptChunking(),
split_headings: useHeadingBasedChunking(),
system_prompt: systemPrompt(),
},
group_tracking_id:
groupTrackingId() === "" ? undefined : groupTrackingId(),
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
Expand Down Expand Up @@ -339,6 +350,36 @@ export const UploadFile = () => {
onInput={(e) => setUseGptChunking(e.currentTarget.checked)}
class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
<div class="flex flex-row items-center space-x-2">
<div>Heading Based Chunking</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="If set to true, Trieve will use the headings in the document to chunk the text."
/>
</div>
<input
type="checkbox"
checked={useHeadingBasedChunking()}
onInput={(e) =>
setUseHeadingBasedChunking(e.currentTarget.checked)
}
class="h-4 w-4 rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
<div class="flex flex-col space-y-2">
<div class="flex flex-row items-center space-x-2">
<div>System Prompt</div>
<Tooltip
body={<BsInfoCircle />}
tooltipText="System prompt to use when chunking. This is an optional field which allows you to specify the system prompt to use when chunking the text. If not specified, the default system prompt is used. However, you may want to use a different system prompt."
/>
</div>
<textarea
placeholder="optional system prompt to use when chunking"
value={systemPrompt()}
onInput={(e) => setSystemPrompt(e.target.value)}
class="w-full rounded-md border border-gray-300 bg-neutral-100 px-4 py-1 dark:bg-neutral-700"
/>
</div>
</div>
</Show>
<div class="m-1 mb-1 flex flex-row gap-2">
Expand Down
6 changes: 3 additions & 3 deletions pdf2md/server/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ impl From<TaskResponse> for Vec<Chunk> {

/// Paging parameters that accompany a get-task request; every field is optional.
///
/// NOTE(review): `pagination_token` appears twice below because this text is a rendered diff
/// whose removed and added lines are interleaved without `+`/`-` markers. Presumably the
/// `Option<uuid::Uuid>` line is the removed declaration and `Option<u32>` is its replacement
/// (the surrounding hunks switch the page query from `id > ?` to `page > ?`) — confirm
/// against the repository before relying on either form.
#[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
pub struct GetTaskRequest {
// Presumably the pre-change declaration: token typed as a UUID.
pub pagination_token: Option<uuid::Uuid>,
// Presumably the post-change declaration: token typed as an unsigned integer
// (looks like a page number, matching `pages.last().map(|c| c.page)` — TODO confirm).
pub pagination_token: Option<u32>,
// Optional maximum number of pages to return; presumably forwarded to `get_task_pages`,
// which falls back to 20 when its `limit` is `None` — verify against the caller.
pub limit: Option<u32>,
}

Expand All @@ -265,7 +265,7 @@ pub struct GetTaskResponse {
pub status: String,
pub created_at: String,
pub pages: Option<Vec<Chunk>>,
pub pagination_token: Option<String>,
pub pagination_token: Option<u32>,
}

impl GetTaskResponse {
Expand Down Expand Up @@ -296,7 +296,7 @@ impl GetTaskResponse {
pages_processed: task.pages_processed,
status: task.status,
created_at: task.created_at.to_string(),
pagination_token: pages.last().map(|c| c.id.clone()),
pagination_token: pages.last().map(|c| c.page),
pages: Some(pages.into_iter().map(Chunk::from).collect()),
}
}
Expand Down
6 changes: 3 additions & 3 deletions pdf2md/server/src/operators/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,18 +170,18 @@ pub async fn get_task(
pub async fn get_task_pages(
task: FileTaskClickhouse,
limit: Option<u32>,
offset_id: Option<uuid::Uuid>,
offset_id: Option<u32>,
clickhouse_client: &clickhouse::Client,
) -> Result<Vec<ChunkClickhouse>, ServiceError> {
if FileTaskStatus::from(task.status.clone()) == FileTaskStatus::Completed || task.pages > 0 {
let limit = limit.unwrap_or(20);

let pages: Vec<ChunkClickhouse> = clickhouse_client
.query(
"SELECT ?fields FROM file_chunks WHERE task_id = ? AND id > ? ORDER BY page LIMIT ?",
"SELECT ?fields FROM file_chunks WHERE task_id = ? AND page > ? ORDER BY page LIMIT ?",
)
.bind(task.id.clone())
.bind(offset_id.unwrap_or(uuid::Uuid::nil()))
.bind(offset_id.unwrap_or(0))
.bind(limit)
.fetch_all()
.await
Expand Down
16 changes: 12 additions & 4 deletions pdf2md/server/src/operators/pdf_chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,17 @@ use regex::Regex;
use s3::creds::time::OffsetDateTime;

// Prompt text instructing the model to convert a PDF page to markdown.
//
// NOTE(review): two versions of the prompt appear back to back because this text is a rendered
// diff with removed and added lines interleaved. Presumably the short three-line prompt (ending
// at the first `";`) is the removed text and the numbered-requirements prompt is its
// replacement — confirm against the repository.
// Presumably this is the default used when no custom system prompt is supplied — TODO confirm
// at the call site, which is not visible here.
const CHUNK_SYSTEM_PROMPT: &str = "
Convert the following PDF page to markdown.
Return only the markdown with no explanation text.
Do not exclude any content from the page.";
Convert this PDF page to markdown formatting, following these requirements:

1. Break the content into logical sections with clear markdown headings (# for main sections, ## for subsections, etc.)
2. Create section headers that accurately reflect the content and hierarchy of each part
3. Include all body content from the page
4. Exclude any PDF headers and footers
5. Return only the formatted markdown without any explanatory text
6. Match the original document's content organization but with explicit markdown structure

Please provide the markdown version using this structured approach.
";

fn get_data_url_from_image(img: DynamicImage) -> Result<String, ServiceError> {
let mut encoded = Vec::new();
Expand Down Expand Up @@ -108,7 +116,7 @@ async fn get_markdown_from_image(
if let Some(prev_md_doc) = prev_md_doc {
let prev_md_doc_message = ChatMessage::System {
content: ChatMessageContent::Text(format!(
"Markdown must maintain consistent formatting with the following page: \n\n {}",
"Markdown must maintain consistent formatting with the following page, DO NOT INCLUDE CONTENT FROM THIS PAGE IN YOUR RESPONSE: \n\n {}",
prev_md_doc
)),
name: None,
Expand Down
2 changes: 1 addition & 1 deletion pdf2md/server/src/routes/jinja_templates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::{
get_env, Templates,
};
use actix_web::{get, HttpResponse};
use minijinja::{context, path_loader, Environment};
use minijinja::context;

#[utoipa::path(
get,
Expand Down
2 changes: 1 addition & 1 deletion server/src/bin/csv-jsonl-worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ async fn process_csv_jsonl_file(
rebalance_chunks: Some(false),
split_delimiters: None,
target_splits_per_chunk: None,
use_pdf2md_ocr: None,
pdf2md_options: None,
base64_file: "".to_string(),
},
csv_jsonl_worker_message.dataset_id,
Expand Down
Loading