QuivrHQ · StanGirard · Oct 14, 2024 · Oct 8, 2024 · Oct 14, 2024 · Oct 14, 2024
diff --git a/backend/api/quivr_api/modules/sync/utils/syncutils.py b/backend/api/quivr_api/modules/sync/utils/syncutils.py
@@ -168,6 +168,8 @@ async def process_sync_file(
         ]:
             raise ValueError(f"Incompatible file extension for {downloaded_file}")
 
+        storage_path = f"{storage_path.split('.')[0]}.{storage_path.split('.')[-1]}"
+
         response = await upload_file_storage(
             downloaded_file.file_data,
             storage_path,

diff --git a/backend/api/quivr_api/modules/upload/controller/upload_routes.py b/backend/api/quivr_api/modules/upload/controller/upload_routes.py
@@ -85,12 +85,14 @@ async def upload_file(
             brain_id=str(brain_id),
         )
     )
+    file_name = f"{str(uploadFile.filename).split('.')[0]}.{str(uploadFile.filename).split('.')[-1]}"
 
     background_tasks.add_task(
-        maybe_send_telemetry, "upload_file", {"file_name": uploadFile.filename}
+        maybe_send_telemetry, "upload_file", {"file_name": file_name}
     )
 
-    filename_with_brain_id = str(brain_id) + "/" + str(uploadFile.filename)
+    filename_with_brain_id = str(brain_id) + "/" + file_name
+    filename_with_brain_id = f"{filename_with_brain_id.split('.')[0]}.{filename_with_brain_id.split('.')[-1]}"
 
     buff_reader = io.BufferedReader(uploadFile.file)  # type: ignore
     try:
@@ -110,9 +112,9 @@ async def upload_file(
 
     knowledge_to_add = CreateKnowledgeProperties(
         brain_id=brain_id,
-        file_name=uploadFile.filename,
+        file_name=file_name,
         extension=os.path.splitext(
-            uploadFile.filename  # pyright: ignore reportPrivateUsage=none
+            file_name  # pyright: ignore reportPrivateUsage=none
         )[-1].lower(),
         source=integration if integration else "local",
         source_link=integration_link,  # FIXME: Should return the s3 link @chloedia
@@ -127,7 +129,7 @@ async def upload_file(
         "process_file_task",
         kwargs={
             "file_name": filename_with_brain_id,
-            "file_original_name": uploadFile.filename,
+            "file_original_name": file_name,
             "brain_id": brain_id,
             "notification_id": upload_notification.id,
             "knowledge_id": knowledge.id,

diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/MegaParse/megaparse/Converter.py
@@ -320,22 +320,24 @@ async def convert(
         else:
             raise ValueError(f"Method {self.method} not supported")
 
-        if not gpt4o_cleaner:
-            return LangChainDocument(
-                page_content=parsed_md,
-                metadata={"filename": file_path.name, "type": "pdf"},
-            )
-        else:
+        if gpt4o_cleaner:
             md_processor = MarkdownProcessor(
                 parsed_md,
                 strict=True,
                 remove_pagination=True,
             )
             md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner)
-            return LangChainDocument(
-                page_content=md_cleaned,
-                metadata={"filename": file_path.name, "type": "pdf"},
-            )
+            parsed_md = md_cleaned
+
+        if len(parsed_md) < 5 and self.strategy == "fast":
+            logger.debug(f"Switching to auto strategy for {file_path.name}")
+            self.strategy = "auto"
+            return await self.convert(file_path, model, gpt4o_cleaner=gpt4o_cleaner)
+
+        return LangChainDocument(
+            page_content=parsed_md,
+            metadata={"filename": file_path.name, "type": "pdf"},
+        )
 
     def save_md(self, md_content: str, file_path: Path | str) -> None:
         with open(file_path, "w") as f:

diff --git a/backend/core/quivr_core/processor/implementations/megaparse_processor.py b/backend/core/quivr_core/processor/implementations/megaparse_processor.py
@@ -59,7 +59,6 @@ def processor_metadata(self):
     async def process_file_inner(self, file: QuivrFile) -> list[Document]:
         mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config)  # type: ignore
         document: Document = await mega_parse.aload()
-        print("\n\n document: ", document.page_content)
         if len(document.page_content) > self.splitter_config.chunk_size:
             docs = self.text_splitter.split_documents([document])
             for doc in docs:

diff --git a/backend/worker/quivr_worker/celery_monitor.py b/backend/worker/quivr_worker/celery_monitor.py
@@ -178,6 +178,9 @@ def is_being_executed(task_name: str) -> bool:
         running currently.
     """
     active_tasks = celery.control.inspect().active()
+    if not active_tasks:
+        return False
+
     for worker, running_tasks in active_tasks.items():
         for task in running_tasks:
             if task["name"] == task_name:  # type: ignore