From 499119a61e050d08790fa44d20bd797508e7f82d Mon Sep 17 00:00:00 2001 From: Jacopo Chevallard Date: Tue, 8 Oct 2024 17:27:51 +0200 Subject: [PATCH 1/3] fix: fixing pdf parsing, by configuring the default pdf parser (Unstructured) to use the 'auto' strategy instead of the 'fast' one --- backend/core/MegaParse/megaparse/config.py | 2 +- .../processor/implementations/megaparse_processor.py | 1 - backend/worker/quivr_worker/celery_monitor.py | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/core/MegaParse/megaparse/config.py b/backend/core/MegaParse/megaparse/config.py index 2f001c443df5..dff35161b3dc 100644 --- a/backend/core/MegaParse/megaparse/config.py +++ b/backend/core/MegaParse/megaparse/config.py @@ -22,6 +22,6 @@ def from_yaml(cls, file_path: str): class MegaparseConfig(MegaparseBaseConfig): - strategy: str = "fast" + strategy: str = "auto" llama_parse_api_key: str | None = None pdf_parser: PdfParser = PdfParser.UNSTRUCTURED diff --git a/backend/core/quivr_core/processor/implementations/megaparse_processor.py b/backend/core/quivr_core/processor/implementations/megaparse_processor.py index 9a7c63f1d6c7..6b5fcc182e07 100644 --- a/backend/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/backend/core/quivr_core/processor/implementations/megaparse_processor.py @@ -59,7 +59,6 @@ def processor_metadata(self): async def process_file_inner(self, file: QuivrFile) -> list[Document]: mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore document: Document = await mega_parse.aload() - print("\n\n document: ", document.page_content) if len(document.page_content) > self.splitter_config.chunk_size: docs = self.text_splitter.split_documents([document]) for doc in docs: diff --git a/backend/worker/quivr_worker/celery_monitor.py b/backend/worker/quivr_worker/celery_monitor.py index 546e4b9d8206..f191c17fde79 100644 --- a/backend/worker/quivr_worker/celery_monitor.py +++ b/backend/worker/quivr_worker/celery_monitor.py @@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool: running currently. """ active_tasks = celery.control.inspect().active() + if not active_tasks: + return False + for worker, running_tasks in active_tasks.items(): for task in running_tasks: if task["name"] == task_name: # type: ignore From 4da6f5000461bc7178a55da9ebd4fd3d5ca77ec8 Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 14 Oct 2024 11:49:13 +0200 Subject: [PATCH 2/3] fix: parsing fallback --- .../quivr_api/modules/sync/utils/syncutils.py | 2 ++ .../upload/controller/upload_routes.py | 12 +++++----- backend/core/MegaParse/megaparse/Converter.py | 22 ++++++++++--------- backend/core/MegaParse/megaparse/config.py | 2 +- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/backend/api/quivr_api/modules/sync/utils/syncutils.py b/backend/api/quivr_api/modules/sync/utils/syncutils.py index 5fe9f53105b0..e3f01e2b9c33 100644 --- a/backend/api/quivr_api/modules/sync/utils/syncutils.py +++ b/backend/api/quivr_api/modules/sync/utils/syncutils.py @@ -168,6 +168,8 @@ async def process_sync_file( ]: raise ValueError(f"Incompatible file extension for {downloaded_file}") + storage_path = f"{storage_path.split('.')[0]}.{storage_path.split('.')[-1]}" + response = await upload_file_storage( downloaded_file.file_data, storage_path, diff --git a/backend/api/quivr_api/modules/upload/controller/upload_routes.py b/backend/api/quivr_api/modules/upload/controller/upload_routes.py index 0bf6e952dc5e..687eecb4a84c 100644 --- a/backend/api/quivr_api/modules/upload/controller/upload_routes.py +++ b/backend/api/quivr_api/modules/upload/controller/upload_routes.py @@ -85,12 +85,14 @@ async def upload_file( brain_id=str(brain_id), ) ) + file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}" background_tasks.add_task( - maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename} + maybe_send_telemetry, "upload_file", {"file_name": file_name} ) - filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename) + filename_with_brain_id = str(brain_id) + "/" + file_name + filename_with_brain_id = f"{filename_with_brain_id.split('.')[0]}.{filename_with_brain_id.split('.')[-1]}" buff_reader = io.BufferedReader(uploadFile.file) # type: ignore try: @@ -110,9 +112,9 @@ async def upload_file( knowledge_to_add = CreateKnowledgeProperties( brain_id=brain_id, - file_name=uploadFile.filename, + file_name=file_name, extension=os.path.splitext( - uploadFile.filename # pyright: ignore reportPrivateUsage=none + file_name # pyright: ignore reportPrivateUsage=none )[-1].lower(), source=integration if integration else "local", source_link=integration_link, # FIXME: Should return the s3 link @chloedia @@ -127,7 +129,7 @@ async def upload_file( "process_file_task", kwargs={ "file_name": filename_with_brain_id, - "file_original_name": uploadFile.filename, + "file_original_name": file_name, "brain_id": brain_id, "notification_id": upload_notification.id, "knowledge_id": knowledge.id, diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/MegaParse/megaparse/Converter.py index 1dc1dedc4b57..01cbfe327c7a 100644 --- a/backend/core/MegaParse/megaparse/Converter.py +++ b/backend/core/MegaParse/megaparse/Converter.py @@ -320,22 +320,24 @@ async def convert( else: raise ValueError(f"Method {self.method} not supported") - if not gpt4o_cleaner: - return LangChainDocument( - page_content=parsed_md, - metadata={"filename": file_path.name, "type": "pdf"}, - ) - else: + if gpt4o_cleaner: md_processor = MarkdownProcessor( parsed_md, strict=True, remove_pagination=True, ) md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner) - return LangChainDocument( - page_content=md_cleaned, - metadata={"filename": file_path.name, "type": "pdf"}, - ) + parsed_md = md_cleaned + + if len(parsed_md) < 5 and self.strategy == "fast": + logger.debug(f"Switching to auto strategy for {file_path.name}") + self.strategy = "auto" + return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner) + + return LangChainDocument( + page_content=parsed_md, + metadata={"filename": file_path.name, "type": "pdf"}, + ) def save_md(self, md_content: str, file_path: Path | str) -> None: with open(file_path, "w") as f: diff --git a/backend/core/MegaParse/megaparse/config.py b/backend/core/MegaParse/megaparse/config.py index dff35161b3dc..2f001c443df5 100644 --- a/backend/core/MegaParse/megaparse/config.py +++ b/backend/core/MegaParse/megaparse/config.py @@ -22,6 +22,6 @@ def from_yaml(cls, file_path: str): class MegaparseConfig(MegaparseBaseConfig): - strategy: str = "auto" + strategy: str = "fast" llama_parse_api_key: str | None = None pdf_parser: PdfParser = PdfParser.UNSTRUCTURED From a6875d69876c362affb47bbd1d89258fe37a73c9 Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 14 Oct 2024 14:47:06 +0200 Subject: [PATCH 3/3] comments fix --- .../quivr_api/modules/sync/utils/normalize.py | 33 +++++++++++++++++++ .../quivr_api/modules/sync/utils/syncutils.py | 3 +- .../upload/controller/upload_routes.py | 3 +- backend/core/MegaParse/megaparse/Converter.py | 8 +++-- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/backend/api/quivr_api/modules/sync/utils/normalize.py b/backend/api/quivr_api/modules/sync/utils/normalize.py index e7518a085d05..3f8042c9c5a2 100644 --- a/backend/api/quivr_api/modules/sync/utils/normalize.py +++ b/backend/api/quivr_api/modules/sync/utils/normalize.py @@ -1,3 +1,4 @@ +import os import re import unicodedata @@ -15,3 +16,35 @@ def remove_special_characters(input): except Exception as e: logger.error(f"Error removing special characters: {e}") return input + + +def sanitize_filename(filename: str) -> str: + """ + Sanitize the filename to make it usable. + + Args: + filename (str): The original filename. + + Returns: + str: The sanitized filename. + + This function: + 1. Removes or replaces invalid characters + 2. Handles double extensions + 3. Ensures the filename is not empty + 4. Truncates long filenames + """ + valid_chars = re.sub(r"[^\w\-_\. ]", "", filename) + + name, ext = os.path.splitext(valid_chars) + + name = name.replace(".", "_") + + if not name: + name = "unnamed" + max_length = 255 - len(ext) + if len(name) > max_length: + name = name[:max_length] + sanitized_filename = f"{name}{ext}" + + return sanitized_filename diff --git a/backend/api/quivr_api/modules/sync/utils/syncutils.py b/backend/api/quivr_api/modules/sync/utils/syncutils.py index e3f01e2b9c33..9a0735495535 100644 --- a/backend/api/quivr_api/modules/sync/utils/syncutils.py +++ b/backend/api/quivr_api/modules/sync/utils/syncutils.py @@ -29,6 +29,7 @@ ISyncService, ISyncUserService, ) +from quivr_api.modules.sync.utils.normalize import sanitize_filename from quivr_api.modules.sync.utils.sync import BaseSync from quivr_api.modules.upload.service.upload_file import ( check_file_exists, @@ -168,7 +169,7 @@ async def process_sync_file( ]: raise ValueError(f"Incompatible file extension for {downloaded_file}") - storage_path = f"{storage_path.split('.')[0]}.{storage_path.split('.')[-1]}" + storage_path = sanitize_filename(storage_path) response = await upload_file_storage( downloaded_file.file_data, diff --git a/backend/api/quivr_api/modules/upload/controller/upload_routes.py b/backend/api/quivr_api/modules/upload/controller/upload_routes.py index 687eecb4a84c..93b33eafa3ca 100644 --- a/backend/api/quivr_api/modules/upload/controller/upload_routes.py +++ b/backend/api/quivr_api/modules/upload/controller/upload_routes.py @@ -30,6 +30,7 @@ from quivr_api.modules.notification.service.notification_service import ( NotificationService, ) +from quivr_api.modules.sync.utils.normalize import sanitize_filename from quivr_api.modules.upload.service.upload_file import ( upload_file_storage, ) @@ -92,7 +93,7 @@ async def upload_file( ) filename_with_brain_id = str(brain_id) + "/" + file_name - filename_with_brain_id = f"{filename_with_brain_id.split('.')[0]}.{filename_with_brain_id.split('.')[-1]}" + filename_with_brain_id = sanitize_filename(filename_with_brain_id) buff_reader = io.BufferedReader(uploadFile.file) # type: ignore try: diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/MegaParse/megaparse/Converter.py index 01cbfe327c7a..de9d0e6dd458 100644 --- a/backend/core/MegaParse/megaparse/Converter.py +++ b/backend/core/MegaParse/megaparse/Converter.py @@ -329,8 +329,12 @@ async def convert( md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner) parsed_md = md_cleaned - if len(parsed_md) < 5 and self.strategy == "fast": - logger.debug(f"Switching to auto strategy for {file_path.name}") + if ( + len(parsed_md) < 5 + and file_path.stat().st_size > 100 + and self.strategy == "fast" + ): + logger.info(f"Switching to auto strategy for {file_path.name}") self.strategy = "auto" return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)