diff --git a/.env.example b/.env.example index da38f069983c..d69382ebe103 100644 --- a/.env.example +++ b/.env.example @@ -1,8 +1,30 @@ #### QUIVR Configuration # This file is used to configure the Quivr stack. It is used by the `docker-compose.yml` file to configure the stack. +# API KEYS # OPENAI. Update this to use your API key. To skip OpenAI integration use a fake key, for example: tk-aabbccddAABBCCDDEeFfGgHhIiJKLmnopjklMNOPqQqQqQqQ -OPENAI_API_KEY=CHANGE_ME +OPENAI_API_KEY=your-openai-api-key +# ANTHROPIC_API_KEY=your-anthropic-api-key +# MISTRAL_API_KEY=your-mistral-api-key +# GROQ_API_KEY=your-groq-api-key + +COHERE_API_KEY=your-cohere-api-key +# JINA_API_KEY=your-jina-api-key + +# UNSTRUCTURED_API_KEY=your-unstructured-api-key +# UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general + +# LLAMA_PARSE_API_KEY=your-llamaparse-api-key + +# Configuration files path +BRAIN_CONFIG_PATH=config/retrieval_config_workflow.yaml +CHAT_LLM_CONFIG_PATH=config/chat_llm_config.yaml + +# LangSmith +# LANGCHAIN_TRACING_V2=true +# LANGCHAIN_ENDPOINT="https://api.smith.langchain.com" +# LANGCHAIN_API_KEY=your-langchain-api-key +# LANGCHAIN_PROJECT=your-langchain-project-name # LOCAL # OLLAMA_API_BASE_URL=http://host.docker.internal:11434 # Uncomment to activate ollama. This is the local url for the ollama api @@ -32,7 +54,6 @@ EXTERNAL_SUPABASE_URL=http://localhost:54321 SUPABASE_SERVICE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU PG_DATABASE_URL=postgresql://postgres:postgres@host.docker.internal:54322/postgres PG_DATABASE_ASYNC_URL=postgresql+asyncpg://postgres:postgres@host.docker.internal:54322/postgres -ANTHROPIC_API_KEY=null JWT_SECRET_KEY=super-secret-jwt-token-with-at-least-32-characters-long AUTHENTICATE=true TELEMETRY_ENABLED=true @@ -41,7 +62,6 @@ CELEBRY_BROKER_QUEUE_NAME=quivr-preview.fifo QUIVR_DOMAIN=http://localhost:3000/ BACKEND_URL=http://localhost:5050 EMBEDDING_DIM=1536 -#COHERE_API_KEY=CHANGE_ME DEACTIVATE_STRIPE=true #RESEND diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9496988b0b75..c433407b70c9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,17 +21,19 @@ repos: hooks: # Run the linter. - id: ruff - args: [--fix] + args: [--fix, --isolated] additional_dependencies: [] # Run the formatter. 
- id: ruff-format + args: [--isolated] additional_dependencies: [] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.10.1 hooks: - id: mypy name: mypy - additional_dependencies: ["types-aiofiles"] + args: ["--ignore-missing-imports", "--no-incremental", "--follow-imports=skip"] + additional_dependencies: ["types-aiofiles", "types-pyyaml", "pydantic", "sqlmodel"] ci: autofix_commit_msg: | [pre-commit.ci] auto fixes from pre-commit.com hooks diff --git a/.vscode/settings.json b/.vscode/settings.json index 700a8799b95a..86370d352832 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,7 +16,6 @@ "**/.docusaurus/": true, "**/node_modules/": true }, - "json.sortOnSave.enable": true, "[python]": { "editor.defaultFormatter": "charliermarsh.ruff", "editor.formatOnSave": true, @@ -25,19 +24,10 @@ "source.fixAll": "explicit" } }, - "python.formatting.provider": "black", "python.analysis.extraPaths": [ "./backend" ], - "python.sortImports.path": "isort", - "python.linting.mypyEnabled": true, "python.defaultInterpreterPath": "python3", - "python.linting.enabled": true, - "python.linting.flake8Enabled": true, - "python.linting.pycodestyleEnabled": true, - "python.linting.pylintEnabled": true, - "python.linting.pycodestyleCategorySeverity.W": "Error", - "python.linting.flake8CategorySeverity.W": "Error", "python.testing.pytestArgs": [ "-v", "--color=yes", @@ -53,5 +43,6 @@ "reportMissingImports": "error", "reportUnusedImport": "warning", "reportGeneralTypeIssues": "warning" - } -} \ No newline at end of file + }, + "makefile.configureOnOpen": false +} diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev index 60e076295f7b..8efe6696c813 100644 --- a/backend/Dockerfile.dev +++ b/backend/Dockerfile.dev @@ -33,6 +33,8 @@ COPY core/pyproject.toml core/README.md ./core/ COPY core/quivr_core/__init__.py ./core/quivr_core/__init__.py COPY worker/pyproject.toml worker/README.md ./worker/ COPY worker/quivr_worker/__init__.py ./worker/quivr_worker/__init__.py +COPY core/MegaParse/pyproject.toml core/MegaParse/README.md ./core/MegaParse/ +COPY core/MegaParse/megaparse/__init__.py ./core/MegaParse/megaparse/__init__.py RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock diff --git a/backend/api/quivr_api/modules/brain/entity/brain_entity.py b/backend/api/quivr_api/modules/brain/entity/brain_entity.py index 0b8e3460c396..708b8d48220c 100644 --- a/backend/api/quivr_api/modules/brain/entity/brain_entity.py +++ b/backend/api/quivr_api/modules/brain/entity/brain_entity.py @@ -4,6 +4,7 @@ from uuid import UUID from pydantic import BaseModel +from quivr_core.config import BrainConfig from sqlalchemy.dialects.postgresql import ENUM as PGEnum from sqlalchemy.ext.asyncio import AsyncAttrs from sqlmodel import TIMESTAMP, Column, Field, Relationship, SQLModel, text @@ -58,43 +59,39 @@ class Brain(AsyncAttrs, SQLModel, table=True): default=BrainType.integration, ), ) - brain_chat_history: List["ChatHistory"] = Relationship( # noqa: F821 + brain_chat_history: List["ChatHistory"] = Relationship( # type: ignore # noqa: F821 back_populates="brain", sa_relationship_kwargs={"lazy": "select"} ) prompt_id: UUID | None = Field(default=None, foreign_key="prompts.id") - prompt: Prompt | None = Relationship( # noqa: f821 + prompt: Prompt | None = Relationship( # noqa: F821 back_populates="brain", sa_relationship_kwargs={"lazy": "joined"} ) knowledges: List[KnowledgeDB] = Relationship( back_populates="brains", link_model=KnowledgeBrain ) - # TODO : add # "meaning" "public"."vector", 
# "tags" "public"."tags"[] -class BrainEntity(BaseModel): - brain_id: UUID - name: str +class BrainEntity(BrainConfig): + last_update: datetime | None = None + brain_type: BrainType | None = None description: Optional[str] = None temperature: Optional[float] = None + meaning: Optional[str] = None + openai_api_key: Optional[str] = None + tags: Optional[List[str]] = None model: Optional[str] = None max_tokens: Optional[int] = None status: Optional[str] = None prompt_id: Optional[UUID] = None - last_update: datetime - brain_type: BrainType integration: Optional[IntegrationEntity] = None integration_description: Optional[IntegrationDescriptionEntity] = None snippet_emoji: Optional[str] = None snippet_color: Optional[str] = None - @property - def id(self) -> UUID: - return self.brain_id - def dict(self, **kwargs): data = super().dict( **kwargs, diff --git a/backend/api/quivr_api/modules/brain/repository/integration_brains.py b/backend/api/quivr_api/modules/brain/repository/integration_brains.py index 51491ad33de6..df1f0c475dae 100644 --- a/backend/api/quivr_api/modules/brain/repository/integration_brains.py +++ b/backend/api/quivr_api/modules/brain/repository/integration_brains.py @@ -100,7 +100,7 @@ def delete_integration_brain(self, brain_id, user_id): def get_integration_brain_by_type_integration( self, integration_name - ) -> List[IntegrationEntity]: + ) -> List[IntegrationEntity] | None: response = ( self.db.table("integrations_user") .select("*, integrations ()") diff --git a/backend/api/quivr_api/modules/brain/service/brain_service.py b/backend/api/quivr_api/modules/brain/service/brain_service.py index 891dc8ea4119..e5b403d8f03e 100644 --- a/backend/api/quivr_api/modules/brain/service/brain_service.py +++ b/backend/api/quivr_api/modules/brain/service/brain_service.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple, Dict from uuid import UUID from fastapi import HTTPException @@ -54,7 +54,7 @@ def find_brain_from_question( chat_id: UUID, history, vector_store: CustomSupabaseVectorStore, - ) -> (Optional[BrainEntity], dict[str, str]): + ) -> Tuple[Optional[BrainEntity], Dict[str, str]]: """Find the brain to use for a question. 
Args: @@ -106,12 +106,12 @@ def find_brain_from_question( brain_id_to_use = list_brains[0]["id"] brain_to_use = self.get_brain_by_id(brain_id_to_use) - return brain_to_use, metadata + return brain_to_use, metadata # type: ignore def create_brain( self, user_id: UUID, - brain: Optional[CreateBrainProperties], + brain: CreateBrainProperties | None = None, ) -> BrainEntity: if brain is None: brain = CreateBrainProperties() @@ -226,28 +226,3 @@ def get_brain_details( ) return brain - - def get_connected_brains(self, brain_id: UUID) -> list[BrainEntity]: - return self.composite_brains_connections_repository.get_connected_brains( - brain_id - ) - - def update_secret_value( - self, - user_id: UUID, - brain_id: UUID, - secret_name: str, - secret_value: str, - ) -> None: - """Update an existing secret.""" - self.external_api_secrets_repository.delete_secret( - user_id=user_id, - brain_id=brain_id, - secret_name=secret_name, - ) - self.external_api_secrets_repository.create_secret( - user_id=user_id, - brain_id=brain_id, - secret_name=secret_name, - secret_value=secret_value, - ) diff --git a/backend/api/quivr_api/modules/chat/controller/chat/utils.py b/backend/api/quivr_api/modules/chat/controller/chat/utils.py index 55b098eb0835..d6cb61d6692c 100644 --- a/backend/api/quivr_api/modules/chat/controller/chat/utils.py +++ b/backend/api/quivr_api/modules/chat/controller/chat/utils.py @@ -1,4 +1,6 @@ import time +import os +from enum import Enum from fastapi import HTTPException from quivr_api.logger import get_logger @@ -6,10 +8,54 @@ from quivr_api.modules.models.service.model_service import ModelService from quivr_api.modules.user.entity.user_identity import UserIdentity from quivr_api.modules.user.service.user_usage import UserUsage +from quivr_core.config import RetrievalConfig logger = get_logger(__name__) +class RetrievalConfigPathEnv(Enum): + CHAT_WITH_LLM = ("CHAT_LLM_CONFIG_PATH", "chat_llm_config.yaml") + RAG = ("BRAIN_CONFIG_PATH", "config/retrieval_config_workflow.yaml") + + @property + def env_var(self) -> str: + return self.value[0] + + @property + def default_path(self) -> str: + return self.value[1] + + +def get_config_file_path( + config_path_env: RetrievalConfigPathEnv, current_path: str | None = None +) -> str: + # Get the environment variable or fallback to the default path + _path = os.getenv(config_path_env.env_var, config_path_env.default_path) + + if not current_path: + return _path + + return os.path.join(current_path, _path) + + +def load_and_merge_retrieval_configuration( + config_file_path: str, sqlmodel: Model +) -> RetrievalConfig: + retrieval_config = RetrievalConfig.from_yaml(config_file_path) + field_mapping = { + "env_variable_name": "env_variable_name", + "endpoint_url": "llm_base_url", + } + + retrieval_config.llm_config.set_from_sqlmodel( + sqlmodel=sqlmodel, mapping=field_mapping + ) + + retrieval_config.llm_config.set_llm_model(sqlmodel.name) + + return retrieval_config + + # TODO: rewrite async def find_model_and_generate_metadata( brain_model: str | None, diff --git a/backend/api/quivr_api/modules/chat/controller/chat_routes.py b/backend/api/quivr_api/modules/chat/controller/chat_routes.py index e4154147c9f6..a42d7fe7fb7a 100644 --- a/backend/api/quivr_api/modules/chat/controller/chat_routes.py +++ b/backend/api/quivr_api/modules/chat/controller/chat_routes.py @@ -1,17 +1,23 @@ from typing import Annotated, List, Optional from uuid import UUID +import os from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request from fastapi.responses 
import StreamingResponse from quivr_api.logger import get_logger from quivr_api.middlewares.auth import AuthBearer, get_current_user -from quivr_api.modules.brain.entity.brain_entity import RoleEnum +from quivr_api.modules.brain.entity.brain_entity import BrainEntity, RoleEnum from quivr_api.modules.brain.service.brain_authorization_service import ( validate_brain_authorization, ) from quivr_api.modules.brain.service.brain_service import BrainService -from quivr_api.modules.chat.controller.chat.utils import check_and_update_user_usage +from quivr_api.modules.chat.controller.chat.utils import ( + RetrievalConfigPathEnv, + check_and_update_user_usage, + get_config_file_path, + load_and_merge_retrieval_configuration, +) from quivr_api.modules.chat.dto.chats import ChatItem, ChatQuestion from quivr_api.modules.chat.dto.inputs import ( ChatMessageProperties, @@ -21,7 +27,6 @@ ) from quivr_api.modules.chat.entity.chat import Chat from quivr_api.modules.chat.service.chat_service import ChatService -from quivr_api.modules.chat_llm_service.chat_llm_service import ChatLLMService from quivr_api.modules.dependencies import get_service from quivr_api.modules.knowledge.service.knowledge_service import KnowledgeService from quivr_api.modules.models.service.model_service import ModelService @@ -31,6 +36,7 @@ from quivr_api.modules.vector.service.vector_service import VectorService from quivr_api.utils.telemetry import maybe_send_telemetry from quivr_api.utils.uuid_generator import generate_uuid_from_string +from quivr_core.config import RetrievalConfig logger = get_logger(__name__) @@ -185,10 +191,11 @@ async def create_question_handler( for model in models: if brain_id == generate_uuid_from_string(model.name): model_to_use = model + _brain = {"brain_id": brain_id, "name": model.name} + brain = BrainEntity(**_brain) break try: - service = None | RAGService | ChatLLMService if not model_to_use: brain = brain_service.get_brain_details(brain_id, current_user.id) # type: ignore assert brain @@ -201,26 +208,32 @@ async def create_question_handler( brain.model = model.name validate_authorization(user_id=current_user.id, brain_id=brain_id) service = RAGService( - current_user, - brain, - chat_id, - brain_service, - prompt_service, - chat_service, - knowledge_service, - vector_service, - model_service, + current_user=current_user, + chat_id=chat_id, + brain=brain, + model_service=model_service, + brain_service=brain_service, + prompt_service=prompt_service, + chat_service=chat_service, + knowledge_service=knowledge_service, + vector_service=vector_service, ) else: await check_and_update_user_usage( current_user, model_to_use.name, model_service ) # type: ignore - service = ChatLLMService( - current_user, - model_to_use.name, - chat_id, - chat_service, - model_service, + if not os.getenv("CHAT_LLM_CONFIG_PATH"): + raise ValueError("CHAT_LLM_CONFIG_PATH not set") + current_path = os.path.dirname(os.path.abspath(__file__)) + file_path = os.path.join(current_path, os.getenv("CHAT_LLM_CONFIG_PATH")) # type: ignore + retrieval_config = RetrievalConfig.from_yaml(file_path) + service = RAGService( + current_user=current_user, + chat_id=chat_id, + brain=brain, + retrieval_config=retrieval_config, + model_service=model_service, + chat_service=chat_service, ) # type: ignore assert service is not None # type: ignore maybe_send_telemetry("question_asked", {"streaming": True}, request) @@ -271,6 +284,8 @@ async def create_stream_question_handler( for model in models: if brain_id == generate_uuid_from_string(model.name): 
model_to_use = model + _brain = {"name": model.name} + brain = BrainEntity(**_brain) break try: if model_to_use is None: @@ -283,27 +298,43 @@ async def create_stream_question_handler( assert model is not None brain.model = model.name validate_authorization(user_id=current_user.id, brain_id=brain_id) + current_path = os.path.dirname(os.path.abspath(__file__)) + file_path = get_config_file_path( + RetrievalConfigPathEnv.RAG, current_path=current_path + ) + retrieval_config = load_and_merge_retrieval_configuration( + config_file_path=file_path, sqlmodel=model + ) service = RAGService( - current_user, - brain, - chat_id, - brain_service, - prompt_service, - chat_service, - knowledge_service, - vector_service, - model_service, + current_user=current_user, + chat_id=chat_id, + brain=brain, + retrieval_config=retrieval_config, + model_service=model_service, + brain_service=brain_service, + prompt_service=prompt_service, + chat_service=chat_service, + knowledge_service=knowledge_service, + vector_service=vector_service, ) else: await check_and_update_user_usage( current_user, model_to_use.name, model_service ) # type: ignore - service = ChatLLMService( - current_user, - model_to_use.name, - chat_id, - chat_service, - model_service, + current_path = os.path.dirname(os.path.abspath(__file__)) + file_path = get_config_file_path( + RetrievalConfigPathEnv.CHAT_WITH_LLM, current_path=current_path + ) + retrieval_config = load_and_merge_retrieval_configuration( + config_file_path=file_path, sqlmodel=model_to_use + ) + service = RAGService( + current_user=current_user, + chat_id=chat_id, + brain=brain, + retrieval_config=retrieval_config, + model_service=model_service, + chat_service=chat_service, ) # type: ignore background_tasks.add_task( diff --git a/backend/api/quivr_api/modules/chat/controller/config/chat_llm_config.yaml b/backend/api/quivr_api/modules/chat/controller/config/chat_llm_config.yaml new file mode 100644 index 000000000000..bad270885c5c --- /dev/null +++ b/backend/api/quivr_api/modules/chat/controller/config/chat_llm_config.yaml @@ -0,0 +1,26 @@ +workflow_config: + name: "Chat LLM" + nodes: + - name: "START" + edges: ["filter_history"] + + - name: "filter_history" + edges: ["generate_chat_llm"] + + - name: "generate_chat_llm" # the name of the last node, from which we want to stream the answer to the user, should always start with "generate" + edges: ["END"] +# Maximum number of previous conversation iterations +# to include in the context of the answer +max_history: 10 + +#prompt: "my prompt" + +llm_config: + max_input_tokens: 2000 + + # Maximum number of tokens to pass to the LLM + # as a context to generate the answer + max_output_tokens: 2000 + + temperature: 0.7 + streaming: true diff --git a/backend/api/quivr_api/modules/chat/controller/config/retrieval_config_workflow.yaml b/backend/api/quivr_api/modules/chat/controller/config/retrieval_config_workflow.yaml new file mode 100644 index 000000000000..b444f64d2415 --- /dev/null +++ b/backend/api/quivr_api/modules/chat/controller/config/retrieval_config_workflow.yaml @@ -0,0 +1,43 @@ +workflow_config: + name: "standard RAG" + nodes: + - name: "START" + edges: ["filter_history"] + + - name: "filter_history" + edges: ["rewrite"] + + - name: "rewrite" + edges: ["retrieve"] + + - name: "retrieve" + edges: ["generate_rag"] + + - name: "generate_rag" # the name of the last node, from which we want to stream the answer to the user, should always start with "generate" + edges: ["END"] +# Maximum number of previous conversation iterations +# to 
include in the context of the answer +max_history: 10 + +prompt: "my prompt" + +max_files: 20 +reranker_config: + # The reranker supplier to use + supplier: "cohere" + + # The model to use for the reranker for the given supplier + model: "rerank-multilingual-v3.0" + + # Number of chunks returned by the reranker + top_n: 5 +llm_config: + + max_input_tokens: 2000 + + # Maximum number of tokens to pass to the LLM + # as a context to generate the answer + max_output_tokens: 2000 + + temperature: 0.7 + streaming: true diff --git a/backend/api/quivr_api/modules/chat_llm_service/__init__.py b/backend/api/quivr_api/modules/chat_llm_service/__init__.py deleted file mode 100644 index d3f79a025f18..000000000000 --- a/backend/api/quivr_api/modules/chat_llm_service/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .chat_llm_service import ChatLLMService - -__all__ = ["ChatLLMService"] diff --git a/backend/api/quivr_api/modules/chat_llm_service/chat_llm_service.py b/backend/api/quivr_api/modules/chat_llm_service/chat_llm_service.py deleted file mode 100644 index f6125de373ba..000000000000 --- a/backend/api/quivr_api/modules/chat_llm_service/chat_llm_service.py +++ /dev/null @@ -1,215 +0,0 @@ -import datetime -import os -from uuid import UUID, uuid4 - -from quivr_core.chat import ChatHistory as ChatHistoryCore -from quivr_core.chat_llm import ChatLLM -from quivr_core.config import LLMEndpointConfig -from quivr_core.llm.llm_endpoint import LLMEndpoint -from quivr_core.models import ChatLLMMetadata, ParsedRAGResponse, RAGResponseMetadata - -from quivr_api.logger import get_logger -from quivr_api.modules.brain.service.utils.format_chat_history import ( - format_chat_history, -) -from quivr_api.modules.chat.dto.inputs import CreateChatHistory -from quivr_api.modules.chat.dto.outputs import GetChatHistoryOutput -from quivr_api.modules.chat.service.chat_service import ChatService -from quivr_api.modules.models.service.model_service import ModelService -from quivr_api.modules.user.entity.user_identity import UserIdentity -from quivr_api.utils.uuid_generator import generate_uuid_from_string - -logger = get_logger(__name__) - - -class ChatLLMService: - def __init__( - self, - current_user: UserIdentity, - model_name: str, - chat_id: UUID, - chat_service: ChatService, - model_service: ModelService, - ): - # Services - self.chat_service = chat_service - self.model_service = model_service - - # Base models - self.current_user = current_user - self.chat_id = chat_id - - # check at init time - self.model_to_use = model_name - - def _build_chat_history( - self, - history: list[GetChatHistoryOutput], - ) -> ChatHistoryCore: - transformed_history = format_chat_history(history) - chat_history = ChatHistoryCore(brain_id=None, chat_id=self.chat_id) - - [chat_history.append(m) for m in transformed_history] - return chat_history - - async def build_llm(self) -> ChatLLM: - model = await self.model_service.get_model(self.model_to_use) - api_key = os.getenv(model.env_variable_name, "not-defined") - chat_llm = ChatLLM( - llm=LLMEndpoint.from_config( - LLMEndpointConfig( - model=self.model_to_use, - llm_base_url=model.endpoint_url, - llm_api_key=api_key, - temperature=(LLMEndpointConfig.model_fields["temperature"].default), - max_input=model.max_input, - max_tokens=model.max_output, - ), - ) - ) - return chat_llm - - def save_answer(self, question: str, answer: ParsedRAGResponse): - logger.info( - f"Saving answer for chat {self.chat_id} with model {self.model_to_use}" - ) - logger.info(answer) - return 
self.chat_service.update_chat_history( - CreateChatHistory( - **{ - "chat_id": self.chat_id, - "user_message": question, - "assistant": answer.answer, - "brain_id": None, - "prompt_id": None, - "metadata": answer.metadata.model_dump() if answer.metadata else {}, - } - ) - ) - - async def generate_answer( - self, - question: str, - ): - logger.info( - f"Creating question for chat {self.chat_id} with model {self.model_to_use} " - ) - chat_llm = await self.build_llm() - history = await self.chat_service.get_chat_history(self.chat_id) - model_metadata = await self.model_service.get_model(self.model_to_use) - # Format the history, sanitize the input - chat_history = self._build_chat_history(history) - - parsed_response = chat_llm.answer(question, chat_history) - - if parsed_response.metadata: - # TODO: check if this is the right way to do it - parsed_response.metadata.metadata_model = ChatLLMMetadata( - name=self.model_to_use, - description=model_metadata.description, - image_url=model_metadata.image_url, - display_name=model_metadata.display_name, - brain_id=str(generate_uuid_from_string(self.model_to_use)), - brain_name=self.model_to_use, - ) - - # Save the answer to db - new_chat_entry = self.save_answer(question, parsed_response) - - # Format output to be correct - return GetChatHistoryOutput( - **{ - "chat_id": self.chat_id, - "user_message": question, - "assistant": parsed_response.answer, - "message_time": new_chat_entry.message_time, - "prompt_title": None, - "brain_name": None, - "message_id": new_chat_entry.message_id, - "brain_id": None, - "metadata": ( - parsed_response.metadata.model_dump() - if parsed_response.metadata - else {} - ), - } - ) - - async def generate_answer_stream( - self, - question: str, - ): - logger.info( - f"Creating question for chat {self.chat_id} with model {self.model_to_use} " - ) - # Build the rag config - chat_llm = await self.build_llm() - - # Get model metadata - model_metadata = await self.model_service.get_model(self.model_to_use) - # Get chat history - history = await self.chat_service.get_chat_history(self.chat_id) - # Format the history, sanitize the input - chat_history = self._build_chat_history(history) - - full_answer = "" - - message_metadata = { - "chat_id": self.chat_id, - "message_id": uuid4(), # do we need it ?, - "user_message": question, # TODO: define result - "message_time": datetime.datetime.now(), # TODO: define result - "prompt_title": None, - "brain_name": None, - "brain_id": None, - } - metadata_model = ChatLLMMetadata( - name=self.model_to_use, - description=model_metadata.description, - image_url=model_metadata.image_url, - display_name=model_metadata.display_name, - brain_id=str(generate_uuid_from_string(self.model_to_use)), - brain_name=self.model_to_use, - ) - - async for response in chat_llm.answer_astream(question, chat_history): - # Format output to be correct servicedf;j - if not response.last_chunk: - streamed_chat_history = GetChatHistoryOutput( - assistant=response.answer, - metadata=response.metadata.model_dump(), - **message_metadata, - ) - streamed_chat_history.metadata["metadata_model"] = metadata_model # type: ignore - full_answer += response.answer - yield f"data: {streamed_chat_history.model_dump_json()}" - if response.last_chunk and full_answer == "": - full_answer += response.answer - - # For last chunk parse the sources, and the full answer - streamed_chat_history = GetChatHistoryOutput( - assistant="", - metadata=response.metadata.model_dump(), - **message_metadata, - ) - - metadata = 
RAGResponseMetadata(**streamed_chat_history.metadata) # type: ignore - metadata.metadata_model = ChatLLMMetadata( - name=self.model_to_use, - description=model_metadata.description, - image_url=model_metadata.image_url, - display_name=model_metadata.display_name, - brain_id=str(generate_uuid_from_string(self.model_to_use)), - brain_name=self.model_to_use, - ) - streamed_chat_history.metadata = metadata.model_dump() - - logger.info("Last chunk before saving") - self.save_answer( - question, - ParsedRAGResponse( - answer=full_answer, - metadata=metadata, - ), - ) - yield f"data: {streamed_chat_history.model_dump_json()}" diff --git a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py index 7f9d10d95742..9b8ecaa8ab0b 100644 --- a/backend/api/quivr_api/modules/knowledge/entity/knowledge.py +++ b/backend/api/quivr_api/modules/knowledge/entity/knowledge.py @@ -2,8 +2,8 @@ from enum import Enum from typing import Any, Dict, List, Optional from uuid import UUID - from pydantic import BaseModel + from quivr_core.models import KnowledgeStatus from sqlalchemy import JSON, TIMESTAMP, Column, text from sqlalchemy.ext.asyncio import AsyncAttrs @@ -64,7 +64,7 @@ class KnowledgeDB(AsyncAttrs, SQLModel, table=True): primary_key=True, ), ) - file_name: Optional[str] = Field(default=None, max_length=255) + file_name: str = Field(default="", max_length=255) url: Optional[str] = Field(default=None, max_length=2048) extension: str = Field(default=".txt", max_length=100) status: str = Field(max_length=50) @@ -94,7 +94,7 @@ class KnowledgeDB(AsyncAttrs, SQLModel, table=True): ) is_folder: bool = Field(default=False) user_id: UUID = Field(foreign_key="users.id", nullable=False) - brains: List["Brain"] = Relationship( + brains: List["Brain"] = Relationship( # type: ignore # noqa: F821 back_populates="knowledges", link_model=KnowledgeBrain, sa_relationship_kwargs={"lazy": "select"}, diff --git a/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py b/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py index cfc88884b98f..8777a42c7709 100644 --- a/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py +++ b/backend/api/quivr_api/modules/knowledge/service/knowledge_service.py @@ -50,9 +50,9 @@ def __init__( self.storage = storage async def get_knowledge_sync(self, sync_id: int) -> Knowledge: - km = await self.repository.get_knowledge_by_sync_id(sync_id) - assert km.id, "Knowledge ID not generated" - km = await km.to_dto() + km_db = await self.repository.get_knowledge_by_sync_id(sync_id) + assert km_db.id, "Knowledge ID not generated" + km = await km_db.to_dto() return km # TODO: this is temporary fix for getting knowledge path. 
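The route handlers in `chat_routes.py` resolve their YAML files through the two helpers added to `chat/controller/chat/utils.py` above. A minimal sketch of that lookup, assuming the layout shipped in this diff (`config/retrieval_config_workflow.yaml` and `chat_llm_config.yaml` resolved relative to the route module, overridable via `BRAIN_CONFIG_PATH` / `CHAT_LLM_CONFIG_PATH` from `.env.example`):

```python
# Sketch only: mirrors the config-path lookup done in the chat route handlers.
import os

from quivr_api.modules.chat.controller.chat.utils import (
    RetrievalConfigPathEnv,
    get_config_file_path,
)
from quivr_core.config import RetrievalConfig

# The env var named by the enum (BRAIN_CONFIG_PATH / CHAT_LLM_CONFIG_PATH) wins;
# otherwise the enum's default path is used, joined onto current_path if given.
current_path = os.path.dirname(os.path.abspath(__file__))

rag_yaml = get_config_file_path(RetrievalConfigPathEnv.RAG, current_path=current_path)
chat_llm_yaml = get_config_file_path(
    RetrievalConfigPathEnv.CHAT_WITH_LLM, current_path=current_path
)

# The chat-with-llm path can load the YAML directly into a RetrievalConfig:
retrieval_config = RetrievalConfig.from_yaml(chat_llm_yaml)
```

For the RAG path, the loaded config is then overlaid with per-model settings via `load_and_merge_retrieval_configuration(config_file_path=..., sqlmodel=model)`, which maps the model row's `env_variable_name` and `endpoint_url` onto the LLM config before `RAGService` is constructed.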
diff --git a/backend/api/quivr_api/modules/models/repository/model.py b/backend/api/quivr_api/modules/models/repository/model.py index d865fd28f489..005c00a22ce1 100644 --- a/backend/api/quivr_api/modules/models/repository/model.py +++ b/backend/api/quivr_api/modules/models/repository/model.py @@ -22,7 +22,7 @@ async def get_model(self, model_name: str) -> Model | None: response = await self.session.exec(query) return response.first() - async def get_default_model(self) -> Model: + async def get_default_model(self) -> Model | None: query = select(Model).where(Model.default == True) # noqa: E712 response = await self.session.exec(query) return response.first() diff --git a/backend/api/quivr_api/modules/models/service/model_service.py b/backend/api/quivr_api/modules/models/service/model_service.py index 697064038493..3c20c28dc510 100644 --- a/backend/api/quivr_api/modules/models/service/model_service.py +++ b/backend/api/quivr_api/modules/models/service/model_service.py @@ -19,7 +19,7 @@ async def get_models(self) -> list[Model]: return models # type: ignore - async def get_model(self, model_name: str) -> Model: + async def get_model(self, model_name: str) -> Model | None: logger.info(f"Getting model {model_name}") model = await self.repository.get_model(model_name) diff --git a/backend/api/quivr_api/modules/prompt/entity/prompt.py b/backend/api/quivr_api/modules/prompt/entity/prompt.py index 2e91ee7bd4ce..2a7f90ce5d11 100644 --- a/backend/api/quivr_api/modules/prompt/entity/prompt.py +++ b/backend/api/quivr_api/modules/prompt/entity/prompt.py @@ -25,7 +25,7 @@ class Prompt(SQLModel, table=True): content: str | None = None title: str | None = Field(default=None, max_length=255) status: str = Field(default="private", max_length=255) - brain: List["Brain"] = Relationship( # noqa: F821 + brain: List["Brain"] = Relationship( # type: ignore # noqa: F821 back_populates="prompt", sa_relationship_kwargs={"lazy": "joined"} ) diff --git a/backend/api/quivr_api/modules/rag_service/rag_service.py b/backend/api/quivr_api/modules/rag_service/rag_service.py index c7aef4aaf4af..c1f3ee7da6a3 100644 --- a/backend/api/quivr_api/modules/rag_service/rag_service.py +++ b/backend/api/quivr_api/modules/rag_service/rag_service.py @@ -2,10 +2,12 @@ import os from uuid import UUID, uuid4 +from quivr_api.utils.uuid_generator import generate_uuid_from_string +from quivr_core.brain import Brain as BrainCore from quivr_core.chat import ChatHistory as ChatHistoryCore -from quivr_core.config import LLMEndpointConfig, RAGConfig +from quivr_core.config import LLMEndpointConfig, RetrievalConfig from quivr_core.llm.llm_endpoint import LLMEndpoint -from quivr_core.models import ParsedRAGResponse, RAGResponseMetadata +from quivr_core.models import ChatLLMMetadata, ParsedRAGResponse, RAGResponseMetadata from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph from quivr_api.logger import get_logger @@ -38,14 +40,15 @@ class RAGService: def __init__( self, current_user: UserIdentity, - brain: BrainEntity, chat_id: UUID, - brain_service: BrainService, - prompt_service: PromptService, - chat_service: ChatService, - knowledge_service: KnowledgeService, - vector_service: VectorService, model_service: ModelService, + chat_service: ChatService, + brain: BrainEntity, + retrieval_config: RetrievalConfig | None = None, + brain_service: BrainService | None = None, + prompt_service: PromptService | None = None, + knowledge_service: KnowledgeService | None = None, + vector_service: VectorService | None = None, ): # Services 
self.brain_service = brain_service @@ -59,13 +62,21 @@ def __init__( self.current_user = current_user self.chat_id = chat_id self.brain = brain - self.prompt = self.get_brain_prompt(self.brain) + self.prompt = ( + self.get_brain_prompt(self.brain) + if self.brain and self.brain_service + else None + ) + + self.retrieval_config = retrieval_config # check at init time - self.model_to_use = brain.model - assert self.model_to_use is not None + self.model_to_use = brain.model if brain else None def get_brain_prompt(self, brain: BrainEntity) -> Prompt | None: + if not self.prompt_service: + raise ValueError("PromptService not provided") + return ( self.prompt_service.get_prompt_by_id(brain.prompt_id) if brain.prompt_id @@ -84,29 +95,42 @@ def _build_chat_history( [chat_history.append(m) for m in transformed_history] return chat_history - async def _build_rag_config(self) -> RAGConfig: + async def _get_retrieval_config(self) -> RetrievalConfig: + if self.retrieval_config: + retrieval_config = self.retrieval_config + else: + retrieval_config = await self._build_retrieval_config() + + return retrieval_config + + async def _build_retrieval_config(self) -> RetrievalConfig: model = await self.model_service.get_model(self.model_to_use) # type: ignore + if model is None: + raise ValueError(f"Cannot get model {self.model_to_use}") api_key = os.getenv(model.env_variable_name, "not-defined") - rag_config = RAGConfig( + retrieval_config = RetrievalConfig( llm_config=LLMEndpointConfig( model=self.model_to_use, # type: ignore llm_base_url=model.endpoint_url, llm_api_key=api_key, temperature=(LLMEndpointConfig.model_fields["temperature"].default), - max_input=model.max_input, - max_tokens=model.max_output, + max_input_tokens=model.max_input, + max_output_tokens=model.max_output, ), prompt=self.prompt.content if self.prompt else None, ) - return rag_config + return retrieval_config - def get_llm(self, rag_config: RAGConfig): - return LLMEndpoint.from_config(rag_config.llm_config) + def get_llm(self, retrieval_config: RetrievalConfig): + return LLMEndpoint.from_config(retrieval_config.llm_config) def create_vector_store( self, brain_id: UUID, max_input: int ) -> CustomSupabaseVectorStore: + if not self.vector_service: + raise ValueError("VectorService not provided") + supabase_client = get_supabase_client() embeddings = get_embedding_client() return CustomSupabaseVectorStore( @@ -144,29 +168,49 @@ async def generate_answer( logger.info( f"Creating question for chat {self.chat_id} with brain {self.brain.brain_id} " ) - rag_config = await self._build_rag_config() - logger.debug(f"generate_answer with config : {rag_config.model_dump()}") + retrieval_config = await self._get_retrieval_config() + logger.debug(f"generate_answer with config : {retrieval_config.model_dump()}") history = await self.chat_service.get_chat_history(self.chat_id) + # Format the history, sanitize the input + chat_history = self._build_chat_history(history) + # Get list of files - list_files = await self.knowledge_service.get_all_knowledge_in_brain( - self.brain.brain_id + list_files = ( + await self.knowledge_service.get_all_knowledge_in_brain(self.brain.brain_id) + if self.knowledge_service + else [] ) + # Build RAG dependencies to inject - vector_store = self.create_vector_store( - self.brain.brain_id, rag_config.llm_config.max_input + vector_store = ( + self.create_vector_store( + self.brain.brain_id, retrieval_config.llm_config.max_input_tokens + ) + if self.vector_service + else None ) - llm = self.get_llm(rag_config) - # Initialize the 
RAG pipline - rag_pipeline = QuivrQARAGLangGraph( - rag_config=rag_config, llm=llm, vector_store=vector_store + + llm = self.get_llm(retrieval_config) + + brain_core = BrainCore( + name=self.brain.name, + id=self.brain.id, + llm=llm, + vector_db=vector_store, + embedder=vector_store.embeddings if vector_store else None, ) - # Format the history, sanitize the input - chat_history = self._build_chat_history(history) - parsed_response = rag_pipeline.answer(question, chat_history, list_files) + parsed_response = brain_core.ask( + question=question, + retrieval_config=retrieval_config, + rag_pipeline=QuivrQARAGLangGraph, + list_files=list_files, + chat_history=chat_history, + ) # Save the answer to db - new_chat_entry = self.save_answer(question, parsed_response) + if self.brain_service: + new_chat_entry = self.save_answer(question, parsed_response) # Format output to be correct metadata = ( @@ -179,10 +223,10 @@ async def generate_answer( "chat_id": self.chat_id, "user_message": question, "assistant": parsed_response.answer, - "message_time": new_chat_entry.message_time, + "message_time": new_chat_entry.message_time if new_chat_entry else None, "prompt_title": (self.prompt.title if self.prompt else None), "brain_name": self.brain.name if self.brain else None, - "message_id": new_chat_entry.message_id, + "message_id": new_chat_entry.message_id if new_chat_entry else None, "brain_id": str(self.brain.brain_id) if self.brain else None, "metadata": metadata, } @@ -196,23 +240,38 @@ async def generate_answer_stream( f"Creating question for chat {self.chat_id} with brain {self.brain.brain_id} " ) # Build the rag config - rag_config = await self._build_rag_config() + retrieval_config = await self._get_retrieval_config() # Get chat history history = await self.chat_service.get_chat_history(self.chat_id) # Format the history, sanitize the input chat_history = self._build_chat_history(history) # Get list of files urls - list_files = await self.knowledge_service.get_all_knowledge_in_brain( - self.brain.brain_id + list_files = ( + await self.knowledge_service.get_all_knowledge_in_brain(self.brain.brain_id) + if self.knowledge_service + else [] ) - llm = self.get_llm(rag_config) - vector_store = self.create_vector_store( - self.brain.brain_id, rag_config.llm_config.max_input + + vector_store = ( + self.create_vector_store( + self.brain.brain_id, retrieval_config.llm_config.max_input_tokens + ) + if self.vector_service + else None ) - # Initialize the rag pipline - rag_pipeline = QuivrQARAGLangGraph( - rag_config=rag_config, llm=llm, vector_store=vector_store + + llm = self.get_llm(retrieval_config) + + # Get model metadata + model_metadata = await self.model_service.get_model(self.brain.name) + + brain_core = BrainCore( + name=self.brain.name, + id=self.brain.id, + llm=llm, + vector_db=vector_store, + embedder=vector_store.embeddings if vector_store else None, ) full_answer = "" @@ -226,12 +285,28 @@ async def generate_answer_stream( "user_message": question, # TODO: define result "message_time": datetime.datetime.now(), # TODO: define result "prompt_title": (self.prompt.title if self.prompt else ""), - "brain_name": self.brain.name if self.brain else None, - "brain_id": self.brain.brain_id if self.brain else None, + # brain_name and brain_id must be None in the chat-with-llm case, as this will force the front to look for the model_metadata + "brain_name": self.brain.name if self.brain_service else None, + "brain_id": self.brain.brain_id if self.brain_service else None, } - async for response in 
rag_pipeline.answer_astream( - question, chat_history, list_files + metadata_model = {} + if model_metadata: + metadata_model = ChatLLMMetadata( + name=self.brain.name, + description=model_metadata.description, + image_url=model_metadata.image_url, + display_name=model_metadata.display_name, + brain_id=str(generate_uuid_from_string(self.brain.name)), + brain_name=self.model_to_use, + ) + + async for response in brain_core.ask_streaming( + question=question, + retrieval_config=retrieval_config, + rag_pipeline=QuivrQARAGLangGraph, + chat_history=chat_history, + list_files=list_files, ): # Format output to be correct servicedf;j if not response.last_chunk: @@ -247,6 +322,10 @@ async def generate_answer_stream( streamed_chat_history.metadata["snippet_emoji"] = ( self.brain.snippet_emoji if self.brain else None ) + if metadata_model: + streamed_chat_history.metadata["metadata_model"] = ( + metadata_model + ) full_answer += response.answer yield f"data: {streamed_chat_history.model_dump_json()}" @@ -256,6 +335,7 @@ async def generate_answer_stream( metadata=response.metadata.model_dump(), **message_metadata, ) + if streamed_chat_history.metadata: streamed_chat_history.metadata["snippet_color"] = ( self.brain.snippet_color if self.brain else None @@ -263,16 +343,24 @@ async def generate_answer_stream( streamed_chat_history.metadata["snippet_emoji"] = ( self.brain.snippet_emoji if self.brain else None ) - sources_urls = await generate_source( - knowledge_service=self.knowledge_service, - brain_id=self.brain.brain_id, - source_documents=response.metadata.sources, - citations=( - streamed_chat_history.metadata["citations"] - if streamed_chat_history.metadata - else None - ), + if metadata_model: + streamed_chat_history.metadata["metadata_model"] = metadata_model + + sources_urls = ( + await generate_source( + knowledge_service=self.knowledge_service, + brain_id=self.brain.brain_id, + source_documents=response.metadata.sources, + citations=( + streamed_chat_history.metadata["citations"] + if streamed_chat_history.metadata + else None + ), + ) + if self.knowledge_service + else [] ) + if streamed_chat_history.metadata: streamed_chat_history.metadata["sources"] = sources_urls diff --git a/backend/core/MegaParse/.env.example b/backend/core/MegaParse/.env.example new file mode 100644 index 000000000000..b4776ec5bcc9 --- /dev/null +++ b/backend/core/MegaParse/.env.example @@ -0,0 +1 @@ +OPENAI_API_KEY=CHANGE_ME \ No newline at end of file diff --git a/backend/core/MegaParse/.gitattributes b/backend/core/MegaParse/.gitattributes new file mode 100644 index 000000000000..9030923a7819 --- /dev/null +++ b/backend/core/MegaParse/.gitattributes @@ -0,0 +1 @@ +*.ipynb linguist-vendored \ No newline at end of file diff --git a/backend/core/MegaParse/.github/workflows/release-please.yml b/backend/core/MegaParse/.github/workflows/release-please.yml new file mode 100644 index 000000000000..01ac897e2aef --- /dev/null +++ b/backend/core/MegaParse/.github/workflows/release-please.yml @@ -0,0 +1,50 @@ +on: + push: + branches: + - main + +permissions: + contents: write + pull-requests: write + +name: release-please + +jobs: + release-please: + runs-on: ubuntu-latest + outputs: + release_created: ${{ steps.release.outputs.release_created }} + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 # Fetch all history for tags and releases + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Run release-please + id: release + uses: 
google-github-actions/release-please-action@v4 + with: + token: ${{ secrets.RELEASE_PLEASE_TOKEN }} + + + deploy: + if: needs.release-please.outputs.release_created == 'true' + needs: release-please + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install Rye + uses: eifinger/setup-rye@v2 + with: + enable-cache: true + - name: Rye Sync + run: rye sync --no-lock + - name: Rye Build + run: rye build + - name: Rye Publish + run: rye publish --token ${{ secrets.PYPI_API_TOKEN }} --yes diff --git a/backend/core/MegaParse/.gitignore b/backend/core/MegaParse/.gitignore new file mode 100644 index 000000000000..ae93dd793415 --- /dev/null +++ b/backend/core/MegaParse/.gitignore @@ -0,0 +1,18 @@ +CHANGE*.md +/output +/input +.env +__pycache__/ +dist/** +megaparse.egg-info/ +*.pyc +build/* +ENV +venv +*/evaluations/* +*/cdp/* +*.pkl + +!megaparse/tests/output_tests/MegaFake_report.md +*.DS_Store +.tool-versions diff --git a/backend/core/MegaParse/.pre-commit-config.yaml b/backend/core/MegaParse/.pre-commit-config.yaml new file mode 100644 index 000000000000..afbea82654b0 --- /dev/null +++ b/backend/core/MegaParse/.pre-commit-config.yaml @@ -0,0 +1,41 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-added-large-files + args: ["--maxkb=5000"] + - id: check-toml + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: check-merge-conflict + - id: detect-private-key + - id: check-case-conflict + - repo: https://github.com/pre-commit/pre-commit + rev: v3.6.2 + hooks: + - id: validate_manifest + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.5.1 + hooks: + # Run the linter. + - id: ruff + args: [--fix] + additional_dependencies: [] + # Run the formatter. + - id: ruff-format + additional_dependencies: [] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.10.1 + hooks: + - id: mypy + name: mypy + additional_dependencies: ["types-aiofiles"] + - repo: https://github.com/python-poetry/poetry + rev: "1.8.0" + hooks: + - id: poetry-check + args: ["-C", "./backend/core"] + - id: poetry-lock + args: ["-C", "./backend/core"] diff --git a/backend/core/MegaParse/.release-please-manifest.json b/backend/core/MegaParse/.release-please-manifest.json new file mode 100644 index 000000000000..a065a580af25 --- /dev/null +++ b/backend/core/MegaParse/.release-please-manifest.json @@ -0,0 +1,3 @@ +{ + ".": "0.0.31" +} diff --git a/backend/core/MegaParse/Dockerfile b/backend/core/MegaParse/Dockerfile new file mode 100644 index 000000000000..77a5c0668ebd --- /dev/null +++ b/backend/core/MegaParse/Dockerfile @@ -0,0 +1,16 @@ +# Using a slim version for a smaller base image +FROM python:3.11.6-slim-bullseye + +# Install GEOS library, Rust, and other dependencies, then clean up +RUN apt-get clean && apt-get update && apt-get install -y \ + poppler-utils \ + tesseract-ocr + +WORKDIR /code + +# Upgrade pip and install dependencies +RUN pip install megaparse + +# You can run the application with the following command: +# docker run -it megaparse_image python your_script.py + diff --git a/backend/core/MegaParse/LICENSE b/backend/core/MegaParse/LICENSE new file mode 100644 index 000000000000..261eeb9e9f8b --- /dev/null +++ b/backend/core/MegaParse/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/backend/core/MegaParse/Makefile b/backend/core/MegaParse/Makefile new file mode 100644 index 000000000000..e0987605bebb --- /dev/null +++ b/backend/core/MegaParse/Makefile @@ -0,0 +1,13 @@ +# Makefile + +# Image name +IMAGE_NAME = megaparse_image + +# Dockerfile location +DOCKERFILE = Dockerfile + +# Build Docker image +build: + docker build -t $(IMAGE_NAME) -f $(DOCKERFILE) . + +.PHONY: build \ No newline at end of file diff --git a/backend/core/MegaParse/README.md b/backend/core/MegaParse/README.md new file mode 100644 index 000000000000..420f9a56bd63 --- /dev/null +++ b/backend/core/MegaParse/README.md @@ -0,0 +1,93 @@ +# MegaParse - Your Mega Parser for every type of documents + +
+  <img src="./logo.png" alt="Quivr-logo">
+ +MegaParse is a powerful and versatile parser that can handle various types of documents with ease. Whether you're dealing with text, PDFs, PowerPoint presentations, or Word documents, MegaParse has you covered, with a focus on losing no information during parsing. + +## Key Features 🎯 + +- **Versatile Parser**: MegaParse is a powerful and versatile parser that can handle various types of documents with ease. +- **No Information Loss**: Designed so that no information is lost during parsing. +- **Fast and Efficient**: Designed with speed and efficiency at its core. +- **Wide File Compatibility**: Supports text, PDF, PowerPoint, Excel, CSV, and Word documents. +- **Open Source**: Freedom is beautiful, and so is MegaParse. Open source and free to use. + +## Support + +- Files: ✅ PDF ✅ PowerPoint ✅ Word +- Content: ✅ Tables ✅ TOC ✅ Headers ✅ Footers ✅ Images + +### Example + +https://github.com/QuivrHQ/MegaParse/assets/19614572/1b4cdb73-8dc2-44ef-b8b4-a7509bc8d4f3 + +## Installation + +```bash +pip install megaparse +``` + +## Usage + +1. Add your OpenAI API key to the `.env` file. + +2. Install poppler on your computer (needed for images and PDFs). + +3. Install tesseract on your computer (needed for images and PDFs). + +```python +from megaparse import MegaParse + +megaparse = MegaParse(file_path="./test.pdf") +document = megaparse.load() +print(document.page_content) +megaparse.save_md(document.page_content, "./test.md") +``` + +### (Optional) Use LlamaParse for Improved Results + +1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key. + +2. Call MegaParse with the `llama_parse_api_key` parameter. + +```python +from megaparse import MegaParse + +megaparse = MegaParse(file_path="./test.pdf", llama_parse_api_key="llx-your_api_key") +document = megaparse.load() +print(document.page_content) +``` + +## Benchmark + + + +| Parser | Diff | +| ---------------------------------------- | ---- | +| LMM megaparse | 36 | +| Megaparse with LLamaParse and GPTCleaner | 74 | +| Megaparse with LLamaParse | 97 | +| Unstructured Augmented Parse | 99 | +| LLama Parse | 102 | +| **Megaparse** | 105 | + + + +_Lower is better_ + +## Next Steps + +- [ ] Improve Table Parsing +- [ ] Improve Image Parsing and description +- [ ] Add TOC for Docx +- [ ] Add Hyperlinks for Docx +- [ ] Order Headers for Docx to Markdown +- [X] Add Rye package manager + + + +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=QuivrHQ/MegaParse&type=Date)](https://star-history.com/#QuivrHQ/MegaParse&Date) diff --git a/backend/core/MegaParse/images/tables.png b/backend/core/MegaParse/images/tables.png new file mode 100644 index 000000000000..d4537b2b1e64 Binary files /dev/null and b/backend/core/MegaParse/images/tables.png differ diff --git a/backend/core/MegaParse/logo.png b/backend/core/MegaParse/logo.png new file mode 100644 index 000000000000..55d67a36140e Binary files /dev/null and b/backend/core/MegaParse/logo.png differ diff --git a/backend/core/MegaParse/megaparse/Converter.py b/backend/core/MegaParse/megaparse/Converter.py new file mode 100644 index 000000000000..1dc1dedc4b57 --- /dev/null +++ b/backend/core/MegaParse/megaparse/Converter.py @@ -0,0 +1,412 @@ +import asyncio +import logging +import os +from collections import Counter +from pathlib import Path +from typing import List, Set + +import pandas as pd +from docx import Document +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from docx.table import Table +from docx.text.paragraph import Paragraph
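+# --- Usage sketch (illustrative only, not part of the module's API surface; file paths are placeholders) ---
+# The MegaParse loader defined at the bottom of this module picks a converter based on the
+# file extension and can be tuned through MegaparseConfig (see megaparse/config.py):
+#
+#     from megaparse import MegaParse
+#     from megaparse.config import MegaparseConfig, PdfParser
+#
+#     config = MegaparseConfig(strategy="fast", pdf_parser=PdfParser.UNSTRUCTURED)
+#     megaparse = MegaParse(file_path="./report.pdf", config=config)
+#     document = megaparse.load()  # returns a LangChain Document with Markdown page_content
+#     megaparse.save_md(document.page_content, "./output/report.md")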
+from docx.text.run import Run +from langchain_community.document_loaders.base import BaseLoader +from langchain_core.documents import Document as LangChainDocument +from llama_index.core.schema import Document as LlamaDocument +from llama_parse import LlamaParse +from llama_parse.utils import Language, ResultType +from pptx import Presentation +from pptx.enum.shapes import MSO_SHAPE_TYPE + +from megaparse.config import MegaparseConfig, PdfParser +from megaparse.markdown_processor import MarkdownProcessor +from megaparse.multimodal_convertor.megaparse_vision import MegaParseVision +from megaparse.unstructured_convertor import ModelEnum, UnstructuredParser + +logger = logging.getLogger("megaparse") + + +class Converter: + def __init__(self) -> None: + pass + + async def convert(self, file_path: str | Path) -> LangChainDocument: + raise NotImplementedError("Subclasses should implement this method") + + def save_md(self, md_content: str, file_path: Path | str) -> None: + with open(file_path, "w") as f: + f.write(md_content) + + +class XLSXConverter(Converter): + def __init__(self) -> None: + pass + + async def convert(self, file_path: str | Path) -> LangChainDocument: + if isinstance(file_path, str): + file_path = Path(file_path) + xls = pd.ExcelFile(file_path) # type: ignore + sheets = pd.read_excel(xls) + + target_text = self.table_to_text(sheets) + + return LangChainDocument( + page_content=target_text, + metadata={"filename": file_path.name, "type": "xlsx"}, + ) + + def convert_tab(self, file_path: str | Path, tab_name: str) -> str: + if isinstance(file_path, str): + file_path = Path(file_path) + xls = pd.ExcelFile(str(file_path)) + sheets = pd.read_excel(xls, tab_name) + target_text = self.table_to_text(sheets) + return target_text + + def table_to_text(self, df): + text_rows = [] + for _, row in df.iterrows(): + row_text = " | ".join(str(value) for value in row.values if pd.notna(value)) + if row_text: + text_rows.append("|" + row_text + "|") + return "\n".join(text_rows) + + +class DOCXConverter(Converter): + def __init__(self) -> None: + self.header_handled = False + + async def convert(self, file_path: str | Path) -> LangChainDocument: + if isinstance(file_path, str): + file_path = Path(file_path) + doc = Document(str(file_path)) + md_content = [] + # Handle header + if doc.sections and doc.sections[0].header: + header_content = self._handle_header(doc.sections[0].header) + if header_content: + md_content.append(header_content) + + for element in doc.element.body: + if isinstance(element, CT_P): + md_content.append(self._handle_paragraph(Paragraph(element, doc))) + elif isinstance(element, CT_Tbl): + md_content += self._handle_table(Table(element, doc)) + # Add more handlers here (image, header, footer, etc) + + return LangChainDocument( + page_content="\n".join(md_content), + metadata={"filename": file_path.name, "type": "docx"}, + ) + + def _handle_header(self, header) -> str: + if not self.header_handled: + parts = [] + for paragraph in header.paragraphs: + parts.append(f"# {paragraph.text}") + for table in header.tables: + parts += self._handle_header_table(table) + self.header_handled = True + return "\n".join(parts) + return "" + + def _handle_header_table(self, table: Table) -> List[str]: + cell_texts = [cell.text for row in table.rows for cell in row.cells] + cell_texts.remove("") + # Find the most repeated cell text + text_counts = Counter(cell_texts) + title = text_counts.most_common(1)[0][0] if cell_texts else "" + other_texts = [text for text in cell_texts if text != 
title and text != ""] + + md_table_content = [] + if title: + md_table_content.append(f"# {title}") + for text in other_texts: + md_table_content.append(f"*{text}*;") + return md_table_content + + def _handle_paragraph(self, paragraph: Paragraph) -> str: + if paragraph.style.name.startswith("Heading"): # type: ignore + level = int(paragraph.style.name.split()[-1]) # type: ignore + return f"{'#' * level} {paragraph.text}" + else: + parts = [] + for run in paragraph.runs: + if run.text != "": + parts.append(self._handle_run(run)) + return "".join(parts) + + def _handle_run(self, run: Run) -> str: + text: str = run.text + if run.bold: + if len(text) < 200: + # FIXME : handle table needs to be improved -> have the paragraph they are in + text = f"## {text}" + else: + text = f"**{text}**" + if run.italic: + text = f"*{text}*" + return text + + def _handle_table(self, table: Table) -> List[str]: + row_content = [] + for i, row in enumerate(table.rows): + row_content.append( + "| " + " | ".join(cell.text.strip() for cell in row.cells) + " |" + ) + if i == 0: + row_content.append("|" + "---|" * len(row.cells)) + + return row_content + + def save_md(self, md_content: str, file_path: Path | str) -> None: + with open(file_path, "w") as f: + f.write(md_content) + + +class PPTXConverter: + def __init__(self, add_images=False) -> None: + self.header_handled = False + self.add_images = add_images + + async def convert(self, file_path: str | Path) -> LangChainDocument: + if isinstance(file_path, str): + file_path = Path(file_path) + prs = Presentation(str(file_path)) + md_content = [] + unique_slides: Set[str] = set() + + # Handle header + if prs.slides and prs.slides[0].placeholders: + header_content = self._handle_header(prs.slides[0].placeholders) + if header_content: + md_content.append(header_content) + + for i, slide in enumerate(prs.slides): + slide_md_content: List[str] = [] + for shape in slide.shapes: + if shape.shape_type == MSO_SHAPE_TYPE.TABLE: # type: ignore + slide_md_content += self._handle_table(shape.table) + elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE and self.add_images: # type: ignore + slide_md_content.append(self._handle_image(shape)) + elif hasattr(shape, "text"): + slide_md_content.append(self._handle_paragraph(shape.text)) + + slide_md_str = "\n".join(slide_md_content) + if slide_md_str not in unique_slides: + unique_slides.add(slide_md_str) + slide_md_str = f"## Slide {i+1}\n{slide_md_str}" + md_content.append(slide_md_str) + + return LangChainDocument( + page_content="\n".join(md_content), + metadata={"filename": file_path.name, "type": "pptx"}, + ) + + def _handle_header(self, placeholders) -> str: + if not self.header_handled: + parts = [] + for placeholder in placeholders: + if placeholder.placeholder_format.idx == 0: # Title placeholder + parts.append(f"# {placeholder.text}") + elif placeholder.placeholder_format.idx == 1: # Subtitle placeholder + parts.append(f"## {placeholder.text}") + self.header_handled = True + return "\n".join(parts) + return "" + + def _handle_paragraph(self, text: str) -> str: + # Assuming text is a simple paragraph without complex formatting + # if text contains letters return text + if any(c.isalpha() for c in text): + return text + "\n" + return "" + + def _handle_image(self, shape) -> str: + image = shape.image + image_bytes = image.blob + image_format = image.ext + image_filename = f"images/image_{shape.shape_id}.{image_format}" + with open(image_filename, "wb") as f: + f.write(image_bytes) + return f"![Image 
{shape.shape_id}](../{image_filename})" + + def _handle_table(self, table) -> List[str]: + row_content = [] + for i, row in enumerate(table.rows): + row_content.append( + "| " + " | ".join(cell.text.strip() for cell in row.cells) + " |" + ) + if i == 0: + row_content.append("|" + "---|" * len(row.cells)) + return row_content + + def save_md(self, md_content: str, file_path: Path | str) -> None: + with open(file_path, "w") as f: + f.write(md_content) + + +class PDFConverter: + def __init__( + self, + llama_parse_api_key: str, + method: PdfParser | str = PdfParser.UNSTRUCTURED, + model=ModelEnum.NONE, + strategy="fast", + ) -> None: + self.strategy = strategy + self.llama_parse_api_key = llama_parse_api_key + if isinstance(method, str): + try: + method = PdfParser(method) + except ValueError: + raise ValueError(f"Method {method} not supported") + self.method = method + + async def _llama_parse(self, api_key: str, file_path: str | Path): + logger.debug(f"Parsing {file_path.name} using llama_parse") + parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables." + self.parser = LlamaParse( + api_key=str(api_key), + result_type=ResultType.MD, + gpt4o_mode=True, + verbose=True, + language=Language.FRENCH, + parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction + ) + documents: List[LlamaDocument] = await self.parser.aload_data(str(file_path)) + parsed_md = "" + for document in documents: + text_content = document.text + parsed_md = parsed_md + text_content + return parsed_md + + def _unstructured_parse( + self, file_path: str | Path, model: ModelEnum = ModelEnum.NONE + ): + logger.debug( + f"Parsing {file_path.name} using unstructured with strategy {self.strategy}" + ) + unstructured_parser = UnstructuredParser() + return unstructured_parser.convert( + file_path, model=model, strategy=self.strategy + ) + + async def _lmm_parse(self, file_path: str | Path): + lmm_parser = MegaParseVision() + return await lmm_parser.parse(file_path) + + async def convert( + self, + file_path: str | Path, + model: ModelEnum = ModelEnum.NONE, + gpt4o_cleaner=False, + ) -> LangChainDocument: + if isinstance(file_path, str): + file_path = Path(file_path) + + parsed_md = "" + if self.method == PdfParser.LLAMA_PARSE: + assert ( + self.llama_parse_api_key is not None + ), "LLama Parse API key is required for this method" + parsed_md = await self._llama_parse(self.llama_parse_api_key, file_path) + elif self.method == PdfParser.MEGAPARSE_VISION: + parsed_md = await self._lmm_parse(file_path) + elif self.method == PdfParser.UNSTRUCTURED: + parsed_md = self._unstructured_parse(file_path, model) + else: + raise ValueError(f"Method {self.method} not supported") + + if not gpt4o_cleaner: + return LangChainDocument( + page_content=parsed_md, + metadata={"filename": file_path.name, "type": "pdf"}, + ) + else: + md_processor = MarkdownProcessor( + parsed_md, + strict=True, + remove_pagination=True, + ) + md_cleaned = md_processor.process(gpt4o_cleaner=gpt4o_cleaner) + return LangChainDocument( + page_content=md_cleaned, + metadata={"filename": file_path.name, "type": "pdf"}, + ) + + def save_md(self, md_content: str, file_path: Path | str) -> None: + with open(file_path, "w") as f: + f.write(md_content) + + +class MegaParse(BaseLoader): + def __init__( + self, + file_path: str | Path, + config: MegaparseConfig = MegaparseConfig(), + ) -> None: + if 
isinstance(file_path, str): + file_path = Path(file_path) + self.file_path = file_path + self.config = config + + async def aload(self, **convert_kwargs) -> LangChainDocument: + file_extension: str = os.path.splitext(self.file_path)[1] + if file_extension == ".docx": + converter = DOCXConverter() + elif file_extension == ".pptx": + converter = PPTXConverter() + elif file_extension == ".pdf": + converter = PDFConverter( + llama_parse_api_key=str(self.config.llama_parse_api_key), + strategy=self.config.strategy, + method=self.config.pdf_parser, + ) + elif file_extension == ".xlsx": + converter = XLSXConverter() + else: + raise ValueError(f"Unsupported file extension: {file_extension}") + + return await converter.convert(self.file_path, **convert_kwargs) + + def load(self, **kwargs) -> LangChainDocument: + file_extension: str = os.path.splitext(self.file_path)[1] + if file_extension == ".docx": + converter = DOCXConverter() + elif file_extension == ".pptx": + converter = PPTXConverter() + elif file_extension == ".pdf": + converter = PDFConverter( + llama_parse_api_key=str(self.config.llama_parse_api_key), + strategy=self.config.strategy, + ) + elif file_extension == ".xlsx": + converter = XLSXConverter() + else: + print(self.file_path, file_extension) + raise ValueError(f"Unsupported file extension: {file_extension}") + + loop = asyncio.get_event_loop() + return loop.run_until_complete(converter.convert(self.file_path, **kwargs)) + + def load_tab(self, tab_name: str, **kwargs) -> LangChainDocument: + file_extension: str = os.path.splitext(self.file_path)[1] + if file_extension == ".xlsx": + converter = XLSXConverter() + else: + print(self.file_path, file_extension) + raise ValueError(f"Unsupported file extension for tabs: {file_extension}") + + result = converter.convert_tab(self.file_path, tab_name=tab_name) + return LangChainDocument( + page_content=result, + metadata={"filename": self.file_path.name, "type": "xlsx"}, + ) + + def save_md(self, md_content: str, file_path: Path | str) -> None: + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w+") as f: + f.write(md_content) diff --git a/backend/core/MegaParse/megaparse/__init__.py b/backend/core/MegaParse/megaparse/__init__.py new file mode 100644 index 000000000000..126dc34e278e --- /dev/null +++ b/backend/core/MegaParse/megaparse/__init__.py @@ -0,0 +1,3 @@ +from .Converter import MegaParse + +__all__ = ["MegaParse"] diff --git a/backend/core/MegaParse/megaparse/config.py b/backend/core/MegaParse/megaparse/config.py new file mode 100644 index 000000000000..2f001c443df5 --- /dev/null +++ b/backend/core/MegaParse/megaparse/config.py @@ -0,0 +1,27 @@ +from enum import Enum + +import yaml +from pydantic import BaseModel + + +class PdfParser(str, Enum): + LLAMA_PARSE = "llama_parse" + UNSTRUCTURED = "unstructured" + MEGAPARSE_VISION = "megaparse_vision" + + +class MegaparseBaseConfig(BaseModel): + @classmethod + def from_yaml(cls, file_path: str): + # Load the YAML file + with open(file_path, "r") as stream: + config_data = yaml.safe_load(stream) + + # Instantiate the class using the YAML data + return cls(**config_data) + + +class MegaparseConfig(MegaparseBaseConfig): + strategy: str = "fast" + llama_parse_api_key: str | None = None + pdf_parser: PdfParser = PdfParser.UNSTRUCTURED diff --git a/backend/core/MegaParse/megaparse/markdown_processor.py b/backend/core/MegaParse/megaparse/markdown_processor.py new file mode 100644 index 000000000000..bc89e550298d --- /dev/null +++ 
b/backend/core/MegaParse/megaparse/markdown_processor.py @@ -0,0 +1,213 @@ +import os +from collections import Counter +from typing import Dict, List, Tuple + +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI + + +class MarkdownProcessor: + """ + Class for MarkdownProcessor. + """ + + load_dotenv() + + def __init__(self, md_result: str, strict: bool, remove_pagination: bool): + self.md_result = md_result + self.strict = strict + self.remove_pagination = remove_pagination + + @staticmethod + def clean(text: str) -> str: + """ + Clean the input text by removing newlines, double asterisks, and trimming whitespace. + + Args: + text (str): Input text + + Returns: + str: Cleaned text + """ + text = text.replace("\n", "") + text = text.replace("**", "") + text = text.strip() + return text + + def split_into_pages(self) -> List[str]: + """ + Split the markdown result into pages using triple newlines as the delimiter. + + Returns: + List[str]: Splitted markdown + """ + return self.md_result.split("\n\n\n") + + @staticmethod + def split_into_paragraphs(pages: list) -> List[str]: + """ + Split pages into paragraphs using double newlines as the delimiter. + + Args: + pages (list): Pages + + Returns: + List[str]: Splitted pages + """ + return "\n\n".join(pages).split("\n\n") + + def remove_duplicates(self, paragraphs: list) -> Tuple[List[str], List[str]]: + """ + Remove duplicate paragraphs and identify unique and duplicate paragraphs. + + Args: + paragraphs (list): Paragraphs + + Returns: + Tuple[str, List[str]]: Cleaned paragraphs and duplicate paragraphs + """ + unique_paragraphs = list( + set([self.clean(paragraph) for paragraph in paragraphs]) + ) + duplicate_paragraphs: List[str] = [] + cleaned_paragraphs: List[str] = [] + + for paragraph in paragraphs: + cleaned_paragraph = self.clean(paragraph) + if cleaned_paragraph in unique_paragraphs: + cleaned_paragraphs.append(paragraph) + unique_paragraphs.remove(cleaned_paragraph) + else: + duplicate_paragraphs.append(paragraph) + return cleaned_paragraphs, duplicate_paragraphs + + def identify_header_components(self, duplicate_paragraphs: list) -> Counter: + """ + Identify words in duplicate paragraphs that are likely header components. + + Args: + duplicate_paragraphs (list): Duplicate paragraphs + + Returns: + Dict: Header components + """ + header_components = list( + set([self.clean(paragraph) for paragraph in duplicate_paragraphs]) + ) + header_components = " ".join(header_components).strip().split(" ") + header_components_count = Counter(header_components) + header_components_count = Counter( + { + k.replace(":", ""): v + for k, v in header_components_count.items() + if v > 1 and len(k) > 3 + } + ) + return header_components_count + + def remove_header_lines( + self, paragraphs: List[str], header_components_count: Dict + ) -> List[str]: + """ + Remove paragraphs that contain any of the header words or the word 'Page' if remove_pagination is true. + + Args: + paragraphs (List[str]): Paragraphs + header_components_count (Dict): Header components + + Returns: + List[str]: New paragraphs + """ + + def should_remove(paragraph): + if self.remove_pagination and "Page" in paragraph: + return True + return any(word in paragraph for word in header_components_count.keys()) + + return [paragraph for paragraph in paragraphs if not should_remove(paragraph)] + + def merge_tables(self, md_content: str) -> str: + """ + Merge tables inside Markdown content. 
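+ For example, two table fragments separated by a blank line ("|\n\n|") are collapsed to "|\n|", so a table that was split across pages becomes a single Markdown table.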
+ + Args: + md_content (str): Markdown content + + Returns: + str: Merged tables + """ + md_content = md_content.replace("|\n\n|", "|\n|") + return md_content + + def save_cleaned_result(self, cleaned_result: str, output_path: str) -> None: + """ + Save the cleaned paragraphs to a markdown file. + + Args: + cleaned_result (str): Cleaned result + output_path (str): Output path + """ + with open(output_path, "w") as f: + f.write(cleaned_result) + + def remove_header_llm(self): + llm = ChatOpenAI(model="gpt-4o", api_key=os.getenv("OPENAI_API_KEY")) + # Define the prompt + messages = [ + ( + "system", + "You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown.", + ), + ] + + prompt = f"""You are a document cleaner and you are used to remove repetitive headers / footer from parsed files in markdown. + Here is a md file : "{self.md_result}" + I want you to identify repetitive texts that could be associate to a document header and footer. Please identify the headers, the footer and remove them from the document. + Answer with only the cleaned document in markdown format. + Result : """ + + messages.append(("human", self.md_result)) # type: ignore + + result = llm.invoke(messages) + + return result.content + + def process(self, gpt4o_cleaner=False) -> str: + """ + Process the markdown result by removing duplicate paragraphs and headers. + + Args: + gpt4o_cleaner (bool, optional): GPT-4o cleaner. Defaults to False. + + Returns: + str: Cleaned result + """ + if gpt4o_cleaner: + cleaned_result = self.remove_header_llm() + + else: + pages = self.split_into_pages() + paragraphs = self.split_into_paragraphs(pages) + # other_pages_paragraphs = self.split_into_paragraphs(pages[1:]) + + cleaned_paragraphs, duplicate_paragraphs = self.remove_duplicates( + paragraphs + ) + header_components_count = self.identify_header_components( + duplicate_paragraphs + ) + + if self.strict: + final_paragraphs = self.remove_header_lines( + cleaned_paragraphs[5:], header_components_count + ) + final_paragraphs = cleaned_paragraphs[:5] + final_paragraphs + else: + final_paragraphs = cleaned_paragraphs + + # Combine first page paragraphs with cleaned paragraphs from other pages + all_paragraphs = final_paragraphs + cleaned_result = "\n\n".join(all_paragraphs) + + cleaned_result = self.merge_tables(str(cleaned_result)) + return cleaned_result diff --git a/backend/core/MegaParse/megaparse/multimodal_convertor/__init__.py b/backend/core/MegaParse/megaparse/multimodal_convertor/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py b/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py new file mode 100644 index 000000000000..0395a16ff922 --- /dev/null +++ b/backend/core/MegaParse/megaparse/multimodal_convertor/megaparse_vision.py @@ -0,0 +1,194 @@ +from enum import Enum +from io import BytesIO +from pathlib import Path +from typing import List +from langchain_core.messages import HumanMessage +from langchain_openai import ChatOpenAI +import base64 +from pdf2image import convert_from_path +import asyncio +import re + +# BASE_OCR_PROMPT = """ +# Transcribe the content of this file into markdown. Be mindful of the formatting. +# Add formatting if you think it is not clear. +# Do not include page breaks and merge content of tables if it is continued in the next page. 
+# Add tags around what you identify as a table [TABLE], header - complete chain of characters that are repeated at each start of pages - [HEADER], table of content [TOC] in the format '[tag] ... [/tag]' +# Return only the parsed content. +# """ + +BASE_OCR_PROMPT = """ +You are tasked with transcribing and formatting the content of a file into markdown. Your goal is to create a well-structured, readable markdown document that accurately represents the original content while adding appropriate formatting and tags. + + +Follow these instructions to complete the task: + +1. Carefully read through the entire file content. + +2. Transcribe the content into markdown format, paying close attention to the existing formatting and structure. + +3. If you encounter any unclear formatting in the original content, use your judgment to add appropriate markdown formatting to improve readability and structure. + +4. For tables, headers, and table of contents, add the following tags: + - Tables: Enclose the entire table in [TABLE] and [/TABLE] tags. Merge content of tables if it is continued in the next page. + - Headers (complete chain of characters repeated at the start of each page): Enclose in [HEADER] and [/HEADER] tags inside the markdown file. + - Table of contents: Enclose in [TOC] and [/TOC] tags + +5. When transcribing tables: + - If a table continues across multiple pages, merge the content into a single, cohesive table. + - Use proper markdown table formatting with pipes (|) and hyphens (-) for table structure. + +6. Do not include page breaks in your transcription. + +7. Maintain the logical flow and structure of the document, ensuring that sections and subsections are properly formatted using markdown headers (# for main headers, ## for subheaders, etc.). + +8. Use appropriate markdown syntax for other formatting elements such as bold, italic, lists, and code blocks as needed. + +10. Return only the parsed content in markdown format, including the specified tags for tables, headers, and table of contents. +""" + + +class ModelEnum(str, Enum): + """Model to use for the conversion""" + + CLAUDE = "claude-3.5" + GPT4O = "gpt-4o" + + +class TagEnum(str, Enum): + """Possible tags for the elements in the file""" + + TABLE = "TABLE" + TOC = "TOC" + HEADER = "HEADER" + IMAGE = "IMAGE" + + +class MegaParseVision: + def __init__(self, model: ModelEnum = ModelEnum.GPT4O): + if model == ModelEnum.GPT4O: + self.model = ChatOpenAI(model="gpt-4o") + elif model == ModelEnum.CLAUDE: + raise NotImplementedError("Claude support not yet implemented") + else: + raise ValueError(f"Model {model} not supported") + + self.parsed_chunks: list[str] | None = None + + def process_file(self, file_path: str, image_format: str = "PNG") -> List[str]: + """ + Process a PDF file and convert its pages to base64 encoded images. 
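+ Each page is rendered to an image with pdf2image.convert_from_path and then base64-encoded so it can be embedded as an image payload for the vision model.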
+ + :param file_path: Path to the PDF file + :param image_format: Format to save the images (default: PNG) + :return: List of base64 encoded images + """ + try: + images = convert_from_path(file_path) + images_base64 = [] + for image in images: + buffered = BytesIO() + image.save(buffered, format=image_format) + image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + images_base64.append(image_base64) + return images_base64 + except Exception as e: + raise ValueError(f"Error processing PDF file: {str(e)}") + + def get_element(self, tag: TagEnum, chunk: str): + pattern = rf"\[{tag.value}\]([\s\S]*?)\[/{tag.value}\]" + all_elmts = re.findall(pattern, chunk) + if not all_elmts: + print(f"No {tag.value} found in the chunk") + return [] + return [elmt.strip() for elmt in all_elmts] + + async def send_to_mlm(self, images_data: List[str]) -> str: + """ + Send images to the language model for processing. + + :param images_data: List of base64 encoded images + :return: Processed content as a string + """ + images_prompt = [ + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, + } + for image_data in images_data + ] + message = HumanMessage( + content=[ + {"type": "text", "text": BASE_OCR_PROMPT}, + *images_prompt, + ], + ) + response = await self.model.ainvoke([message]) + return str(response.content) + + async def parse(self, file_path: str | Path, batch_size: int = 3) -> str: + """ + Parse a PDF file and process its content using the language model. + + :param file_path: Path to the PDF file + :param batch_size: Number of pages to process concurrently + :return: List of processed content strings + """ + if isinstance(file_path, Path): + file_path = str(file_path) + pdf_base64 = self.process_file(file_path) + tasks = [ + self.send_to_mlm(pdf_base64[i : i + batch_size]) + for i in range(0, len(pdf_base64), batch_size) + ] + self.parsed_chunks = await asyncio.gather(*tasks) + responses = self.get_cleaned_content("\n".join(self.parsed_chunks)) + return responses + + def get_cleaned_content(self, parsed_file: str) -> str: + """ + Get cleaned parsed file without any tags defined in TagEnum. + + This method removes all tags from TagEnum from the parsed file, formats the content, + and handles the HEADER tag specially by keeping only the first occurrence. + + Args: + parsed_file (str): The parsed file content with tags. + + Returns: + str: The cleaned content without TagEnum tags. 
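+ For example, every [HEADER]...[/HEADER] block is removed and only the first header's text is kept once at the top, while other tags such as [TABLE] and [TOC] are stripped but their inner content is preserved.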
+ + """ + tag_pattern = "|".join(map(re.escape, TagEnum.__members__.values())) + tag_regex = rf"\[({tag_pattern})\](.*?)\[/\1\]" + # handle the HEADER tag specially + header_pattern = rf"\[{TagEnum.HEADER.value}\](.*?)\[/{TagEnum.HEADER.value}\]" + headers = re.findall(header_pattern, parsed_file, re.DOTALL) + if headers: + first_header = headers[0].strip() + # Remove all HEADER tags and their content + parsed_file = re.sub(header_pattern, "", parsed_file, flags=re.DOTALL) + # Add the first header back at the beginning + parsed_file = f"{first_header}\n{parsed_file}" + + # Remove all other tags + def remove_tag(match): + return match.group(2) + + cleaned_content = re.sub(tag_regex, remove_tag, parsed_file, flags=re.DOTALL) + + cleaned_content = re.sub(r"^```.*$\n?", "", cleaned_content, flags=re.MULTILINE) + cleaned_content = re.sub(r"\n\s*\n", "\n\n", cleaned_content) + cleaned_content = cleaned_content.replace("|\n\n|", "|\n|") + cleaned_content = cleaned_content.strip() + + return cleaned_content + + +if __name__ == "__main__": + parser = MegaParseVision() + responses = asyncio.run( + parser.parse("megaparse/tests/input_tests/MegaFake_report.pdf") + ) + print(responses) + print("Done!") diff --git a/backend/core/MegaParse/megaparse/unstructured_convertor.py b/backend/core/MegaParse/megaparse/unstructured_convertor.py new file mode 100644 index 000000000000..bbe8c32a1c9a --- /dev/null +++ b/backend/core/MegaParse/megaparse/unstructured_convertor.py @@ -0,0 +1,182 @@ +import re +from enum import Enum + +from dotenv import load_dotenv +from langchain_community.chat_models import ChatOllama +from langchain_core.prompts import ChatPromptTemplate +from langchain_openai import ChatOpenAI +from unstructured.partition.pdf import partition_pdf + + +class ModelEnum(str, Enum): + """Model to use for the conversion""" + + LOCAL = "llama3" + GPT4O = "gpt-4o" + NONE = None + + +class UnstructuredParser: + load_dotenv() + + # Function to convert element category to markdown format + def convert_to_markdown(self, elements): + markdown_content = "" + element_hierarchy = {} + + for el in elements: + markdown_content += self.get_markdown_line(el) + + return markdown_content + + def get_markdown_line(self, el): + element_type = el["type"] + text = el["text"] + metadata = el["metadata"] + parent_id = metadata.get("parent_id", None) + category_depth = metadata.get("category_depth", 0) + if "emphasized_text_contents" in metadata: + print(metadata["emphasized_text_contents"]) + + markdown_line = "" + + if element_type == "Title": + if parent_id: + markdown_line = ( + f"## {text}\n\n" # Adjusted to add sub headers if parent_id exists + ) + else: + markdown_line = f"# {text}\n\n" + elif element_type == "Subtitle": + markdown_line = f"## {text}\n\n" + elif element_type == "Header": + markdown_line = f"{'#' * (category_depth + 1)} {text}\n\n" + elif element_type == "Footer": + markdown_line = f"#### {text}\n\n" + elif element_type == "NarrativeText": + markdown_line = f"{text}\n\n" + elif element_type == "ListItem": + markdown_line = f"- {text}\n" + elif element_type == "Table": + markdown_line = el["metadata"]["text_as_html"] + elif element_type == "PageBreak": + markdown_line = "---\n\n" + elif element_type == "Image": + markdown_line = f"![Image]({el['metadata'].get('image_path', '')})\n\n" + elif element_type == "Formula": + markdown_line = f"$$ {text} $$\n\n" + elif element_type == "FigureCaption": + markdown_line = f"**Figure:** {text}\n\n" + elif element_type == "Address": + markdown_line = f"**Address:** 
{text}\n\n" + elif element_type == "EmailAddress": + markdown_line = f"**Email:** {text}\n\n" + elif element_type == "CodeSnippet": + markdown_line = f"```{el['metadata'].get('language', '')}\n{text}\n```\n\n" + elif element_type == "PageNumber": + markdown_line = f"**Page {text}**\n\n" + else: + markdown_line = f"{text}\n\n" + + return markdown_line + + def partition_pdf_file(self, path, strategy="fast"): + return partition_pdf( + filename=path, infer_table_structure=True, strategy=strategy + ) + + def improve_layout( + self, elements, remove_repeated_headers=True, model: ModelEnum = ModelEnum.GPT4O + ): + llm = None + chain = None + if model != ModelEnum.NONE: + llm = ( + ChatOpenAI(model="gpt-4o", temperature=0.1) + if model == ModelEnum.GPT4O + else ChatOllama(model=model.value, temperature=0.1) + ) + + # Define the prompt + prompt = ChatPromptTemplate.from_messages( + [ + ( + "human", + """You are an expert in markdown tables, match this text and this html table to fill a md table. You answer with just the table in pure markdown, nothing else. + + {text} + + + {html} + + + Note, the previous table (that might be related since appearing just before): + + {previous_table} + """, + ), + ] + ) + chain = prompt | llm + + table_stack: list[str] = [] + + improved_elements = [] + for el in elements: + if el.category == "Table": + if el.text not in set(table_stack): + if chain: + result = chain.invoke( + { + "text": el.text, + "html": el.metadata.text_as_html, + "previous_table": table_stack[-1] + if table_stack + else "", + } + ) + cleaned_result = result.content + cleaned_content = re.sub( + r"^```.*$\n?", "", str(cleaned_result), flags=re.MULTILINE + ) + else: + cleaned_content = el.text + + el.metadata.text_as_html = f"[TABLE]\n{cleaned_content}\n[/TABLE]" + # add line break to separate tables + el.metadata.text_as_html = el.metadata.text_as_html + "\n\n" # type: ignore + table_stack.append(el.text) + improved_elements.append(el) + + elif el.category not in ["Header", "Footer"]: + if "page" not in el.text.lower(): + if ( + el.text not in set(table_stack) + and "page" not in el.text.lower() + ) or remove_repeated_headers == False: + improved_elements.append(el) + + table_stack.append(el.text.strip()) + table_stack.append("") + + return improved_elements + + def convert(self, path, model: ModelEnum = ModelEnum.GPT4O, strategy="fast"): + # Partition the PDF + elements = self.partition_pdf_file(path, strategy=strategy) + + # Improve table elements + improved_elements = self.improve_layout(elements, model=model) + + elements_dict = [el.to_dict() for el in improved_elements] + markdown_content = self.convert_to_markdown(elements_dict) + return markdown_content + + +# if __name__ == "__main__": +# parser = UnstructuredParser() +# response = parser.convert("megaparse/tests/input_tests/MegaFake_report.pdf", model=ModelEnum.NONE) +# print(response) +# with open("megaparse/tests/output_tests/cdp.md", "w") as f: +# f.write(response) +# print("ok") diff --git a/backend/core/MegaParse/megaparse/utils.py b/backend/core/MegaParse/megaparse/utils.py new file mode 100644 index 000000000000..7dea8352481d --- /dev/null +++ b/backend/core/MegaParse/megaparse/utils.py @@ -0,0 +1,45 @@ +from docx.document import Document as DocumentObject +from docx.table import Table +from docx.text.paragraph import Paragraph +from docx.section import Section, _Header as Header, _Footer as Footer +from docx.oxml.text.paragraph import CT_P +from docx.oxml.table import CT_Tbl + + +def print_element(element): + if 
isinstance(element, Paragraph): + # Print the paragraph text + print(f"Paragraph: {element.text}") + elif isinstance(element, Table): + # Print the table content + print("Table:") + for row in element.rows: + for cell in row.cells: + print(cell.text, end="\t") + print() + elif isinstance(element, Section): + # Print section properties + print("Section:") + print(f" Start type: {element.start_type}") + print(f" Page height: {element.page_height}") + print(f" Page width: {element.page_width}") + elif isinstance(element, Header): + # Print header content + print("Header:") + for paragraph in element.paragraphs: + print(f" {paragraph.text}") + elif isinstance(element, Footer): + # Print footer content + print("Footer:") + for paragraph in element.paragraphs: + print(f" {paragraph.text}") + else: + print(f"Unknown element: {type(element)}") + + +def print_docx(doc: DocumentObject) -> None: + for element in doc.element.body: + if isinstance(element, CT_P): # Paragraph + print_element(Paragraph(element, doc)) + elif isinstance(element, CT_Tbl): # Table + print_element(Table(element, doc)) diff --git a/backend/core/MegaParse/notebooks/docx2md.ipynb b/backend/core/MegaParse/notebooks/docx2md.ipynb new file mode 100644 index 000000000000..cd6010f4ba80 --- /dev/null +++ b/backend/core/MegaParse/notebooks/docx2md.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# -*- coding: utf-8 -*-\n", + "from pathlib import Path\n", + "from src.Converter import DOCXConverter" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "converter = DOCXConverter()\n", + "md_content = converter.convert('./input/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.docx')\n", + "converter.save_md(md_content, Path('./output/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.md'))" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "# import mammoth to compare results\n", + "# md = mammoth.convert_to_markdown('./input/CDP_QUAL_CHART_01_CHARTE PRODUITS_2023.12.13.docx')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "QuivrParse-DS8JDGq8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/notebooks/evaluate.ipynb b/backend/core/MegaParse/notebooks/evaluate.ipynb new file mode 100644 index 000000000000..537360f3f1e1 --- /dev/null +++ b/backend/core/MegaParse/notebooks/evaluate.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mega Parse" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id e5e0367d-2f83-4e4d-84e5-4d5df7119516\n", + "Started parsing the file under job_id 0b5d66aa-bbab-454b-b256-82495d20f91f\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: 
['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n", + "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "sys.path.append('..')\n", + "from megaparse.Converter import MegaParse\n", + "import os \n", + "\n", + "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "\n", + "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n", + "md_content = converter.convert()\n", + "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\"))\n", + "\n", + "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\", llama_parse_api_key=api_key)\n", + "md_content = converter.convert(gpt4o_cleaner = True)\n", + "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\"))\n", + "\n", + "\n", + "converter = MegaParse(file_path=\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n", + "md_content = converter.convert()\n", + "converter.save_md(md_content, Path(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LLama Parse" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id f78ee794-ffde-4e0a-938d-987f1b22cfcb\n" + ] + } + ], + "source": [ + "from typing import List\n", + "from llama_index.core.schema import Document\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "#GET LLAMA_CLOUD_API_KEY\n", + "import os\n", + "from llama_parse import LlamaParse\n", + "from llama_parse.utils import ResultType, Language\n", + "\n", + "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "\n", + "parsing_instructions = \"Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. 
Keep the same format for similar tables.\"\n", + "\n", + "parser = LlamaParse(\n", + " api_key=str(api_key), \n", + " result_type=ResultType.MD,\n", + " gpt4o_mode=True,\n", + " verbose=True,\n", + " language=Language.FRENCH,\n", + " parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction\n", + ")\n", + "# sync\n", + "documents: List[Document] = parser.load_data(\"../megaparse/tests/input_tests/MegaFake_report.pdf\")\n", + "\n", + "with open(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"w\") as f:\n", + " f.write(documents[0].get_content())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unstructured" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import UnstructuredPDFLoader\n", + "loader = UnstructuredPDFLoader(\"../megaparse/tests/input_tests/MegaFake_report.pdf\", strategy=\"hi_res\", infer_table_structure=True,\n", + ")\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"w\") as f:\n", + " f.write(data[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluation with Diff Lib" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import difflib\n", + "def read_file(file_path):\n", + " with open(file_path, 'r', encoding='utf-8') as file:\n", + " return file.readlines()\n", + "\n", + "def compare_files(source_path, target_path, with_formatting=False):\n", + " source_lines = read_file(source_path)\n", + " target_lines = read_file(target_path)\n", + " if not with_formatting:\n", + " source_lines = [line.replace(\"*\",\"\") for line in source_lines]\n", + " target_lines = [line.replace(\"*\",\"\") for line in target_lines]\n", + "\n", + " diff = difflib.unified_diff(\n", + " source_lines,\n", + " target_lines,\n", + " fromfile='target.md',\n", + " tofile='generated.md',\n", + " lineterm=''\n", + " )\n", + "\n", + " modifications = 0\n", + " for line in diff:\n", + " #print(line)\n", + " if line.startswith('+') and not line.startswith('+++'):\n", + " modifications += 1\n", + " elif line.startswith('-') and not line.startswith('---'):\n", + " modifications += 1\n", + "\n", + " return modifications\n", + " \n", + "diff_megaparse_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_megaparse_llama_gptcleaner = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse_gptcleaner.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_megaparse_llama = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama_parse_megaparse.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_llamaparse = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llama.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_unstructured = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_megaparse_llm = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_llm_megaparse.md\", 
\"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "diff_megaparse_unstructured_augmented = compare_files(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_augmented.md\", \"../megaparse/tests/output_tests/MegaFake_report.md\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "diff_results = {\n", + " \"**Megaparse**\": diff_megaparse_unstructured,\n", + " \"Megaparse with LLamaParse\": diff_megaparse_llama,\n", + " \"Megaparse with LLamaParse and GPTCleaner\": diff_megaparse_llama_gptcleaner,\n", + " \"LMM megaparse\": diff_megaparse_llm,\n", + " \"LLama Parse\": diff_llamaparse,\n", + " \"Unstructured Augmented Parse\": diff_megaparse_unstructured_augmented,\n", + "}\n", + "\n", + "# Sort the results\n", + "sorted_diff_results = sorted(diff_results.items(), key=lambda x: x[1])\n", + "\n", + "# Generate a table with the results\n", + "benchmark_results = \"| Parser | Diff |\\n|---|---|\\n\"\n", + "for parser, diff in sorted_diff_results:\n", + " benchmark_results += f\"| {parser} | {diff} |\\n\"\n", + "\n", + "# Update README.md file\n", + "with open(\"../README.md\", \"r\") as readme_file:\n", + " readme_content = readme_file.read()\n", + "\n", + "start_marker = \"\"\n", + "end_marker = \"\"\n", + "start_index = readme_content.find(start_marker) + len(start_marker)\n", + "end_index = readme_content.find(end_marker)\n", + "\n", + "updated_readme_content = readme_content[:start_index] + \"\\n\" + benchmark_results + readme_content[end_index:]\n", + "\n", + "with open(\"../README.md\", \"w\") as readme_file:\n", + " readme_file.write(updated_readme_content)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- target.md\n", + "+++ generated.md\n", + "@@ -1,18 +1,19 @@\n", + "-| My Mega fake | report | #1756394 31/05/2024 |\n", + "\n", + "-|--------------|--------|---------------------|\n", + "\n", + "-| | | |\n", + "\n", + "+| My Mega fake report | #1756394 | 31/05/2024 |\n", + "\n", + "+|---------------------|----------|------------|\n", + "\n", + " \n", + "\n", + " # Why Mega Parse might be the best ?\n", + "\n", + " \n", + "\n", + "-# Introduction\n", + "\n", + "+## Introduction\n", + "\n", + " \n", + "\n", + " Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.\n", + "\n", + " \n", + "\n", + "-# Features of Mega Parse\n", + "\n", + "+## Features of Mega Parse\n", + "\n", + " \n", + "\n", + " Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.\n", + "\n", + " \n", + "\n", + " Multiple Format Support: Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. 
Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.\n", + "\n", + "+\n", + "\n", + "+High-Speed Processing: One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.\n", + "\n", + " \n", + "\n", + " Markdown Output: Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.\n", + "\n", + " \n", + "\n", + "@@ -24,7 +25,7 @@\n", + " \n", + "\n", + " Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.\n", + "\n", + " \n", + "\n", + "-# Benefits of Mega Parse\n", + "\n", + "+## Benefits of Mega Parse\n", + "\n", + " \n", + "\n", + " The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.\n", + "\n", + " \n", + "\n", + "@@ -32,9 +33,7 @@\n", + " \n", + "\n", + " Versatility: Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.\n", + "\n", + " \n", + "\n", + "-Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but\n", + "\n", + "-\n", + "\n", + "-also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.\n", + "\n", + "+Enhanced Knowledge Management: Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.\n", + "\n", + " \n", + "\n", + " Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.\n", + "\n", + " \n", + "\n", + "@@ -42,57 +41,45 @@\n", + " \n", + "\n", + " Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.\n", + "\n", + " \n", + "\n", + "-# Comparative Performance\n", + "\n", + "+## Comparative Performance\n", + "\n", + " \n", + "\n", + " The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. 
This comparison highlights the strengths of Mega Parse in various key areas.\n", + "\n", + " \n", + "\n", + "-| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |\n", + "\n", + "-|-------------------------------|----------------------|------------|------------|------------|-------------------|\n", + "\n", + "-| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX |\n", + "\n", + "-| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |\n", + "\n", + "-\n", + "\n", + "-| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D | Plain Text |\n", + "\n", + "-|--------------------------------------|------------|----------|----------|----------|------------|-------------|\n", + "\n", + "-| Accuracy Rate (%) | 98 | 95 | 93 | 90 | 92 | 90 |\n", + "\n", + "-| Output Format | Markdown | HTML | Markdown | HTML | Plain Text | Plain Text |\n", + "\n", + "-| Error Rate (%) | 1 | 3 | 4 | 5 | 3 | 5 |\n", + "\n", + "-| Ease of Use | High | Medium | High | Medium | Medium | Medium |\n", + "\n", + "-| Integration Capability | Excellent | Good | Good | Fair | Good | Good |\n", + "\n", + "-| Batch Processing | Yes | No | Yes | No | Yes | No |\n", + "\n", + "-| Custom Parsing Rules | Yes | Limited | Yes | No | Yes | No |\n", + "\n", + "-| Multilingual Support | Yes | Yes | Yes | Yes | Yes | Yes |\n", + "\n", + "-| OCR (Optical Character Recognition) | Yes | Yes | Yes | Yes | Yes | No |\n", + "\n", + "-| Price (per user/month) | $30 | $25 | $20 | $15 | $18 | $15 |\n", + "\n", + "-| Customer Support Rating (out of 5) | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 | 3.9 |\n", + "\n", + "-| Free Trial Available | Yes | Yes | No | Yes | No | Yes |\n", + "\n", + "-| Cloud Integration | Yes | No | Yes | No | No | Yes |\n", + "\n", + "-| Security Features | Advanced | Basic | Advanced | Basic | Intermediate| Basic |\n", + "\n", + "-\n", + "\n", + "-\n", + "\n", + "-| Feature | Tool 1 | Tool 2 | Tool 3 | Tool 4 | Tool 5 |\n", + "\n", + "-|--------------------------------|---------------------|------------------|----------------|---------------|------------------|\n", + "\n", + "-| User Community Size | Large | Medium | Medium | Small | Medium |\n", + "\n", + "-| Monthly Updates | Yes | Yes | No | No | No |\n", + "\n", + "-| Mobile App Availability | Yes | No | Yes | No | No |\n", + "\n", + "-| Platform Compatibility | Windows, Mac, Linux | Windows, Linux | Windows | Mac, Linux | Windows, Linux |\n", + "\n", + "-| Data Privacy Compliance | High | Medium | High | Low | Medium |\n", + "\n", + "-| AI-Driven Enhancements | Yes | No | Yes | No | Yes |\n", + "\n", + "-| File Size Limit (per document) | 1GB | 500MB | 750MB | 200MB | 500MB |\n", + "\n", + "-| User Training Resources | Extensive | Moderate | Extensive | Limited | Moderate |\n", + "\n", + "-| API Access | Yes | No | Yes | No | Yes |\n", + "\n", + "-| Customizable Output Templates | Yes | Limited | Yes | No | Limited |\n", + "\n", + "-| Collaboration Features | Yes | No | Yes | No | Limited |\n", + "\n", + "-| Document Version Control | Yes | No | Yes | No | Yes |\n", + "\n", + "-| Import/Export Options | Extensive | Moderate | Extensive | Limited | Moderate |\n", + "\n", + "-\n", + "\n", + "-\n", + "\n", + "-| Feedback Mechanism | Yes | No | Yes | No | Yes |\n", + "\n", + "-|--------------------|-----|----|-----|----|-----|\n", + "\n", + "-\n", + "\n", + "+| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |\n", + "\n", + 
"+|---------------------|-------------|----------------|--------------|--------------|----------------|\n", + "\n", + "+| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX|\n", + "\n", + "+| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |\n", + "\n", + "+| Accuracy Rate (%) | 98 | 95 | 93 | 90 | 92 |\n", + "\n", + "+| Output Format | Markdown | HTML | Markdown | Plain Text | HTML |\n", + "\n", + "+| Error Rate (%) | 1 | 3 | 4 | 5 | 3 |\n", + "\n", + "+| Ease of Use | High | Medium | High | Medium | Medium |\n", + "\n", + "+| Integration Capability| Excellent| Good | Good | Fair | Good |\n", + "\n", + "+| Batch Processing | Yes | No | Yes | No | Yes |\n", + "\n", + "+| Custom Parsing Rules | Yes | Limited | Yes | No | Limited |\n", + "\n", + "+| Multilingual Support | Yes | Yes | No | Yes | Yes |\n", + "\n", + "+| OCR (Optical Character Recognition) | Yes | No | Yes | No | Yes |\n", + "\n", + "+| Price (per user/month)| $30 | $25 | $20 | $15 | $18 |\n", + "\n", + "+| Customer Support Rating (out of 5) | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |\n", + "\n", + "+| Free Trial Available | Yes | Yes | No | Yes | No |\n", + "\n", + "+| Cloud Integration | Yes | No | Yes | Yes | No |\n", + "\n", + "+| Security Features | Advanced | Basic | Advanced | Basic | Intermediate |\n", + "\n", + "+| User Community Size | Large | Medium | Medium | Small | Medium |\n", + "\n", + "+| Monthly Updates | Yes | Yes | No | Yes | No |\n", + "\n", + "+| Mobile App Availability| Yes | No | Yes | No | Yes |\n", + "\n", + "+| Platform Compatibility| Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux |\n", + "\n", + "+| Data Privacy Compliance| High | Medium | High | Low | Medium |\n", + "\n", + "+| AI-Driven Enhancements| Yes | No | Yes | No | Yes |\n", + "\n", + "+| File Size Limit (per document) | 1GB | 500MB | 750MB | 200MB | 500MB |\n", + "\n", + "+| User Training Resources| Extensive | Moderate | Extensive | Limited | Moderate |\n", + "\n", + "+| API Access | Yes | No | Yes | No | Yes |\n", + "\n", + "+| Customizable Output Templates | Yes | Limited | Yes | No | Yes |\n", + "\n", + "+| Collaboration Features| Yes | No | Yes | No | Limited |\n", + "\n", + "+| Document Version Control| Yes | No | Yes | No | Yes |\n", + "\n", + "+| Import/Export Options | Extensive | Moderate | Extensive | Limited | Moderate |\n", + "\n", + "+| Feedback Mechanism | Yes | No | Yes | No | Yes |\n", + "\n", + " \n", + "\n", + " Note: All data presented in this table is fictional and for illustrative purposes only.\n", + "\n", + " \n", + "\n", + "-# Conclusion\n", + "\n", + "+## Conclusion\n", + "\n", + " \n", + "\n", + "-Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. 
By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.\n", + "\n", + "-\n", + "\n", + "+Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.\n" + ] + } + ], + "source": [ + "source_lines = read_file(\"../megaparse/tests/output_tests/MegaFake_report_unstructured_augmented.md\")\n", + "target_lines = read_file(\"../megaparse/tests/output_tests/MegaFake_report.md\")\n", + "\n", + "source_lines = [line.replace(\"*\",\"\") for line in source_lines]\n", + "target_lines = [line.replace(\"*\",\"\") for line in target_lines]\n", + "\n", + "diff = difflib.unified_diff(\n", + "source_lines,\n", + "target_lines,\n", + "fromfile='target.md',\n", + "tofile='generated.md',\n", + "lineterm=''\n", + ")\n", + "modifications = 0\n", + "for line in diff:\n", + " print(line)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "QuivrParse-DS8JDGq8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb b/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb new file mode 100644 index 000000000000..e51378c8d22d --- /dev/null +++ b/backend/core/MegaParse/notebooks/pdf2md_llamaParse.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "llx-2n4Awnlb1jwmF0Nn5iHtXNIntWYJFKIOP2rUJpJYjfi4ZECV\n", + "Started parsing the file under job_id 4fd224a0-f850-4ffb-8f4f-46831510ec1a\n", + "[Document(id_='86203ce1-cb60-4435-a909-6f8999d347ed', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n| Date | Mises à jour |\\n|------------|------------------------------------------------------------------------------|\\n| 19/12/2014 | Création |\\n| 12/12/2019 | Insertion des additifs interdits et à éviter |\\n| 13/05/2022 | Revue des exigences recettes et annexes |\\n| 30/03/2023 | Revue des annexes I et II. Fréquence de mise à jour CDC |\\n| 13/12/2023 | Ajout d'une exigence de certification sur le cacao |\\n| | Revue des exigences de certification de l’huile de palme |\\n\\n## Table des matières\\n\\n1. 
[Exigence recette](#exigence-recette) .................................................. 2 \\n2. [Produits soumis à certification ou allégations](#produits-soumis-à-certification-ou-allégations) ........ 3 \\n 2.1. [Produits « sans gluten »](#produits-sans-gluten) ................................................. 3 \\n 2.2. [Produits issus de l’agriculture biologique](#produits-issus-de-lagriculture-biologique) ............ 3 \\n3. [Exigences générales relatives au fournisseur](#exigences-générales-relatives-au-fournisseur) .......... 4 \\n4. [Exigences relatives aux sites de production](#exigences-relatives-aux-sites-de-production) ............ 4 \\n5. [Traçabilité](#traçabilité) ............................................................ 4 \\n6. [Suivi analytique](#suivi-analytique) .................................................. 5 \\n 6.1. [Suivi microbiologique](#suivi-microbiologique) .................................................. 5 \\n 6.2. [Suivi nutritionnel](#suivi-nutritionnel) ....................................................... 5 \\n 6.3. [Suivi organoleptique](#suivi-organoleptique) ................................................... 5 \\n7. [Non conformités](#non-conformités) .................................................... 5 \\n8. [Gestion de crise Coup de Pates](#gestion-de-crise-coup-de-pates) ........................ 6 \\n\\n**ANNEXE I**: Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle cancérogénicité ou une implication dans les pathologies lourdes ........ 7 \\n**ANNEXE II**: Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires ........ 10 \\n**ANNEXE III**: Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé ........ 11 \\n**ANNEXE IV**: Ingrédients controversés ........ 12 \\n\\n## Liste des abréviations\\n\\n- **AFDIAG** : Association Française Des Intolérants Au Gluten\\n- **AOECS** : Association of European Coeliac Societies\\n- **COFRAC** : Comité français d'accréditation\\n- **DGHM** : Deutschen Gesellschaft für Hygiene und Mikrobiologie\\n- **FCD** : Fédération du Commerce et de la Distribution\\n- **GFSI** : Global Food Safety Initiative\\n- **ILAC** : International Laboratory Accreditation Cooperation\\n- **NPD** : New Product Development\\n\\n---\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n## 1. 
Exigence recette\\n\\nPour le développement de nos produits, nous souhaitons favoriser une offre saine avec des recettes simples (sans colorant, sans arôme, sans conservateur), avec des ingrédients de qualité, en favorisant des produits locaux et labellisés.\\n\\nLe fournisseur s’engage à respecter la réglementation européenne et nationale ainsi que les codes d’usages professionnels applicables aux produits surgelés vendus à Coup de Pates.\\n\\nPour les produits commercialisés sous une marque appartenant à Coup de Pates, le fournisseur s’engage également à respecter les exigences spécifiques de cette même marque.\\n\\nDans ce cas, nos exigences recettes sont spécifiques à trois niveaux gammes : Entrée de gamme, Cœur de gamme, Haut de gamme.\\n\\nPour les produits développés en réponse à des demandes spécifiques de nos clients, il vous sera également demandé de prendre leurs exigences en considération.\\n\\n| Caractéristiques | Entrée de gamme | Cœur de gamme | Haut de Gamme |\\n|------------------|-----------------|---------------|---------------|\\n| Ingrédients soumis à déclaration OGM | INTERDIT | INTERDIT | INTERDIT |\\n| Traitement par ionisation | INTERDIT | INTERDIT | INTERDIT |\\n| Colorants azoïques (E102, E104, E110, E122, E124, E129) | INTERDIT | INTERDIT | INTERDIT |\\n| Nanoparticules (E170, E171, E172, E174, E152, E341, E551 et E552) | INTERDIT | INTERDIT | INTERDIT |\\n| Glutamates et exhausteurs de goût | INTERDIT | INTERDIT | INTERDIT |\\n| Œufs de poules élevées en cage | INTERDIT | INTERDIT | INTERDIT |\\n| Matières grasses partiellement hydrogénées | INTERDIT | INTERDIT | INTERDIT |\\n| Acides gras trans non naturellement présents | INTERDIT | INTERDIT | INTERDIT |\\n| Édulcorants de synthèse | INTERDIT | INTERDIT | INTERDIT |\\n| Viande Séparée Mécaniquement - VSM | INTERDIT | INTERDIT | INTERDIT |\\n| Cacao non certifié durable | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. | * INTERDIT pour tous les NPD et plan action pour remplacer le cacao non certifié dans l’existant. |\\n| Gélatine porcine | INTERDIT | INTERDIT | INTERDIT |\\n| Gélatine animale – (autre que porcine) | À ÉVITER | INTERDIT (tolérance dans les pâtisseries) | INTERDIT (tolérance dans les pâtisseries) |\\n| Huile de palme + palmiste non RSPO | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. | * INTERDIT pour tous les NPD et plan action pour retirer dans l’existant - (tolérée dans supports d’additifs) - En aucun cas, l’huile de palme non RSPO ne pourra être substituée par de l’huile de coprah ou coco. 
|\\n| Huile de palme + palmiste RSPO (certification « Segregated » demandée, à minima « Mass Balance » soumis à dérogation) | À ÉVITER | À ÉVITER | INTERDIT |\\n\\nPage 2 sur 15\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| | A ÉVITER | INTERDIT (tolérance dans les pâtisseries *sauf arômes de fumée et vanilline) | INTERDIT (sauf arômes de fumée) |\\n|--------------------------|----------|-----------------------------------------------------------------------------|---------------------------------|\\n| Arômes artificiels | A ÉVITER | INTERDIT (tolérance dans les pâtisseries *sauf arômes de fumée et vanilline) | INTERDIT (sauf arômes de fumée) |\\n| Colorants artificiels | A ÉVITER | Interdit dans les produits salés | INTERDIT |\\n| Ingrédients controversés (cf. Annexe IV) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Additifs rouges (cf. Annexe I) | A ÉVITER | INTERDIT (hors nitrites et polyphosphates) | INTERDIT (hors nitrites dans les produits de salaison) |\\n| Additifs Oranges (cf. Annexe II) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Nitrites (E250 à E252) | A ÉVITER | A ÉVITER | INTERDIT (Hors produits de salaison) |\\n| Polyphosphates (E450 à 452 - E339 à 341) | A ÉVITER | A ÉVITER | INTERDIT |\\n| Viande et volaille origine hors UE | A ÉVITER | A ÉVITER | INTERDIT |\\n\\nL’ensemble de ces critères est applicable à tous les produits vendus par Coup de Pates. Des dérogations peuvent être accordées au cas par cas, sur justificatifs fournis par le fournisseur et après validation par la direction qualité Coup de Pates.\\n\\n## 2. Produits soumis à certification ou allégations\\n\\nLe fournisseur se doit de communiquer tout document permettant de valider la certification ou allégation associée à un produit.\\n\\nEn vue de vérifier la véracité des critères déclarés, le fournisseur s’engage à transmettre sur demande expresse de Coup de Pates, tout document permettant de justifier la certification et/ou de l’allégation associée(s) au(x) produit(s).\\n\\n### 2.1. Produits « sans gluten »\\n\\nLe fournisseur doit confirmer annuellement à Coup de Pates que l’allégation « sans gluten » de son (ses) produit(s) est applicable, conformément au règlement européen n°828/2014. Pour cela, un bulletin d’analyse de quantification du taux de gluten dans le produit fini doit être communiqué au service qualité.\\n\\nSi le fournisseur possède un contrat de licence auprès d’une association de personnes cœliaques (AFDIAG, AOECS…), il en transmettra le numéro de licence à Coup de Pates et les rapports et/ou certificats d’audits selon le référentiel d’audit de l’AOECS.\\n\\n### 2.2. Produits issus de l’agriculture biologique\\n\\nLe fournisseur s’engage à transmettre sur demande expresse de Coup de Pates, les analyses pesticides sur produits finis pour répondre aux exigences de la réglementation européenne.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\nCodification : CDP_QUA_CHART_01\\n\\nVersion : 5\\n\\nDate d’application : 13/12/2023\\n\\n(Règlement (CE) N°834/2007 relatif à la production biologique et à l’étiquetage des produits biologiques).\\n\\nEn cas de déclassement produit, de non-renouvellement ou de perte de la certification « produit issu de l’agriculture biologique », le fournisseur doit immédiatement en informer le service qualité Coup de Pates afin d’organiser le blocage et le retrait de ces produits.\\n\\n## 3. 
Exigences générales relatives au fournisseur\\n\\nLe fournisseur se doit de disposer de moyens de contrôle et d'enregistrement permettant le respect de la chaîne du froid dans son stockage et son transport de denrées congelées/surgelées.\\n\\nTout envoi d’échantillon devra être accompagné de la « Fiche d’évolution produit » ou d’une fiche technique fournisseur, reprenant à minima les données techniques demandées dans le document précédent (composition, dimensions, DDM ...). Toute autre information jugée nécessaire par le service qualité Coup de Pates devra être communiquée sur demande. Le cahier des charges Coup de Pates devra être rempli dès que le référencement du produit aura été confirmé.\\n\\nIl revient au fournisseur d’appliquer la plus grande diligence dans le transfert exhaustif de ces données. Le dossier établi à l’issue du processus de référencement sera validé à la fois par le fournisseur et un représentant du service qualité Coup de Pates. Toute modification du dossier technique devra être validée en amont par le service qualité Coup de Pates. Si cela est jugé nécessaire, des échantillons (produit actuel / produit modifié) devront être envoyés au service qualité Coup de Pates. Le cahier des charges devra être revu dans son intégralité tous les 5 ans. Même s’il n’y a pas de modification, le cahier des charges sera de nouveau signé avec la nouvelle date.\\n\\n## 4. Exigences relatives aux sites de production\\n\\nLe fournisseur se doit de communiquer les certificats relatifs à son activité, en cours de validité, par exemple : IFS, BRC, FSSC 22000. Le service qualité Coup de Pates devra être informé de tout renouvellement ou perte de certification.\\n\\nLe fournisseur se doit de communiquer, sur demande de Coup de Pates, l’ensemble des documents permettant de justifier sa maîtrise des risques liés à son activité (étude HACCP par exemple).\\n\\nLa mise en place des mesures contre les actes malveillants en matière de protection de la chaine alimentaire/des produits sont de la responsabilité du fournisseur.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification :** \\nCDP_QUA_CHART_01\\n\\n**Version :** 5\\n\\n**Date d’application :** 13/12/2023\\n\\nLe fournisseur doit posséder un **numéro d’enregistrement** auprès des services officiels, permettant l’export de ses produits par Coup de Pates.\\n\\n## 5. Traçabilité\\n\\nLe fournisseur se doit d’assurer la traçabilité de ses produits, de la réception des matières premières jusqu’à l’expédition des produits finis. La traçabilité d’une référence doit pouvoir être assurée via la date de durabilité minimale (au format jour/mois/année).\\n\\nSur demande de Coup de Pates, le fournisseur s’engage à transmettre les fiches ingrédients, certificats et éléments de traçabilité liés à la nature des matières premières, des emballages et du produit fini ainsi que les bilans de matière dans les délais stipulés.\\n\\n## 6. Suivi analytique\\n\\n### 6.1. Suivi microbiologique\\n\\nLes analyses microbiologiques réalisées sur les produits finis doivent être en adéquation avec la réglementation européenne n°2073/2005 et les recommandations de la FCD en France, du DGHM en Allemagne et en Suisse, ou équivalent local au sein de l’Europe.\\n\\nUne analyse microbiologique devra être réalisée lors de chaque première fabrication. Cette analyse devra être réalisée par un laboratoire accrédité COFRAC ou équivalent du COFRAC reconnu par l’ILAC dans les pays concernés ou certifié ISO 17025. 
Les résultats doivent être transmis au service qualité Coup de Pates.\\n\\nL’ensemble des produits Coup de Pates doivent être inclus dans le plan de contrôle microbiologique du fournisseur, selon les critères FCD. Sur demande de Coup de Pates, un nouveau bulletin d’analyse devra être communiqué.\\n\\n### 6.2. Suivi nutritionnel\\n\\nLe fournisseur doit communiquer à Coup de Pates une analyse nutritionnelle réalisée par un laboratoire accrédité COFRAC ou équivalent du COFRAC reconnu par l’ILAC dans les pays concernés. Cette analyse doit être réalisée pour chaque nouveau produit référencé, afin de répondre aux exigences d’étiquetage européennes (avec quantification des acides gras trans et des fibres), et à chaque modification de matières premières et/ou de recette. La communication d’analyses nutritionnelles calculées à l’aide d’un logiciel consolidé est également acceptée.\\n\\nSur demande de Coup de Pates, un nouveau bulletin d’analyse devra être communiqué.\\n\\n### 6.3. Suivi organoleptique\\n\\nL’ensemble des produits Coup de Pates doivent être inclus dans le plan de contrôle organoleptique du fournisseur. Sur demande de Coup de Pates, les résultats de ces analyses devront être communiqués.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Codification : | CDP_QUA_CHART_01 |\\n|----------------|------------------|\\n| Version : | 5 |\\n| Date d’application : | 13/12/2023 |\\n\\n## 7. Non conformités\\n\\nEn cas de non-conformité produit, sanitaire ou réglementaire, le fournisseur s’engage à alerter immédiatement Coup de Pates et à communiquer les éléments de traçabilité nécessaires.\\n\\nEn cas de non-conformité détectée par le service qualité Coup de Pates ou un de ses clients, une notification est envoyée au fournisseur. Celui-ci s’engage à communiquer son analyse et son plan d’action dans les délais demandés.\\n\\n## 8. Gestion de crise Coup de Pates\\n\\nEn cas de crise, le fournisseur s’engage à suivre la procédure de gestion de crise/alerte qui lui a été communiquée par Coup de Pates. 
Un contact spécifique avec numéro d’astreinte doit être communiqué.\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE 1 : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle cancérogénicité ou une implication dans les pathologies lourdes\\n\\n| Additif | Code | Additif | Code |\\n|---------|------|---------|------|\\n| Tartrazine | E 102 | Acide propionique | E 280 |\\n| Jaune de quinoléine | E 104 | Propionate de sodium | E 281 |\\n| Sunset Yellow FCF/Jaune orange S | E 110 | Propionate de potassium | E 283 |\\n| Azorubine, carmoisine | E 122 | Acide borique | E 284 |\\n| Amarante | E 123 | Tétraborate de sodium (borax) | E 285 |\\n| Ponceau 4R, rouge cochenille A | E 124 | Acide fumarique | E 297 |\\n| Erythrosine | E 127 | Gamma-tocophérol | E 308 |\\n| Rouge allura AC | E 129 | Delta-tocophérol | E 309 |\\n| Indigotine, carmin d’indigo | E 132 | Gallate de propyle | E 310 |\\n| Bleu brillant FCF | E 133 | Acide érythorbique | E 315 |\\n| Vert S | E 142 | Butylhydro-quinone tertiaire (BHQT) | E 319 |\\n| Caramel ammoniacal | E 150c | Butylhydroxy-anisol (BHA) | E 320 |\\n| Caramel au sulfite d’ammonium | E 150d | Butylhydroxy-toluène (BHT) | E 321 |\\n| Noir brillant PN | E 151 | Tartrates de sodium | E 335 |\\n| Brun HT | E 155 | Tartrate double de sodium et de potassium | E 337 |\\n| Carbonate de calcium | E 170 | Acide phosphorique | E 338 |\\n| Dioxyde de titane | E 171 | Phosphates de sodium | E 339 |\\n| Oxyde et hydroxyde de fer | E 172 | Phosphates de potassium | E 340 |\\n| Aluminium | E 173 | Phosphates de calcium | E 341 |\\n| Argent | E 174 | Phosphates de magnésium | E 343 |\\n| Lithol-rubine BK | E 180 | Malates de sodium | E 350 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| **Benozoate de potassium** | **E 212** | **Malates de calcium** | **E 352** |\\n|----------------------------|-----------|------------------------|-----------|\\n| Benzoate de calcium | E 213 | Acide adipique | E 355 |\\n| p- hydroxybenzoate d’éthyle| E 214 | Adipate de sodium | E 356 |\\n| Dérivé sodique de l’ester éthylique de l’acide p-hydroxybenzoïque | E 215 | Adipate de potassium | E 357 |\\n| p-hydroxybenzoate de méthyle | E 218 | Acide succinique | E 363 |\\n| Dérivé sodique de l’ester méthylique de l’acide p-hydroxybenzoïque | E 219 | Citrate de triammonium | E 380 |\\n| Nisine | E 234 | Alginate de potassium | E 402 |\\n| Hexaméthylènetétramine | E 239 | Alginate d’ammonium | E 403 |\\n| Dicarbonate de diméthyle | E 242 | Mannitol | E 421 |\\n| Éthyl Lauroyl Arginate | E 243 | Gomme arabique modifiée à l’acide octénylsuccinique (OSA) | E 423 |\\n| Nitrite de potassium | E 249 | Konjac | E 425 |\\n| Nitrite de sodium | E 250 | Hémicellulose de soja | E 426 |\\n| Nitrate de sodium | E 251 | Stéarate de polyoxyéthylène (40) | E 431 |\\n| Nitrate de potassium | E 252 | Mono laurate de polyoxyéthylène de sorbitane (polysorbate 20) | E 432 |\\n| Monooléate de polyoxyéthylène de sorbitane (polysorbate 80) | E 433 | Dioxyde de silicium | E 551 |\\n| Monopalmitate de polyoxyéthylène de sorbitane (polysorbate 40) | E 434 | Silicate de calcium | E 552 |\\n| Monostéarate de polyoxyéthylène de sorbitane (polysorbate 60) | E 435 | Silicate de magnésium | E 553a |\\n| Tristéarate de polyoxyéthylène de sorbitane (polysorbate 65) | E 436 | Talc | E 553b |\\n| Phosphatides d’ammonium | E 442 | Silicate alumino-sodique | E 554 |\\n\\n**Codification**: 
CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d’application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| **Acétate isobutyrate de saccharose** | **E 444** |\\n|---------------------------------------|-----------|\\n| Esters glycériques de résine de bois | E 445 |\\n| Diphosphates | E 450 |\\n| Triphosphates | E 451 |\\n| Polyphosphates | E 452 |\\n| Polyaspartate de potassium | E 456 |\\n| Bêta-cyclodextrine | E 459 |\\n| Éthylcellulose | E 462 |\\n| Hydroxypropylcellulose faiblement substituée (L-HPC) | E 463a |\\n| Méthyléthylcellulose | E 465 |\\n| Carboxyméthylcellulose de sodium réticulée, gomme de cellulose réticulée | E 468 |\\n| Carboxyméthylcellulose hydrolysée de manière enzymatique, gomme de cellulose hydrolysée de manière enzymatique | E 469 |\\n| Sucroglycérides | E 474 |\\n| Huile de soja oxydée par chauffage ayant réagi avec des mono- et diglycérides d’acides gras | E 479b |\\n| Monostéarate de sorbitane | E 491 |\\n| Tristéarate de sorbitane | E 492 |\\n| Monolaurate de sorbitane | E 493 |\\n| Monooléate de sorbitane | E 494 |\\n| Monopalmitate de sorbitane | E 495 |\\n| Chlorure d’étain | E 512 |\\n| Silicate alumino-potassique | E 555 |\\n| 4-Hexylrésorcinol | E 586 |\\n| Acide glutamique | E 620 |\\n| Glutamate monosodique | E 621 |\\n| Glutamate monopotassique | E 622 |\\n| Diglutamate de calcium | E 623 |\\n| Glutamate d’ammonium | E 624 |\\n| Diglutamate de magnésium | E 625 |\\n| Acide guanylique | E 626 |\\n| Guanylate disodique | E 627 |\\n| Guanylate dipotassique | E 628 |\\n| Guanylate de calcium | E 629 |\\n| Acide inosinique | E 630 |\\n| Inosinate disodique | E 631 |\\n| Inosinate dipotassique | E 632 |\\n| Inosinate de calcium | E 633 |\\n| 5'-ribonucléotide calcique | E 634 |\\n| 5'-ribonucléotide disodique | E 635 |\\n| Glycine et son sel de sodium | E 640 |\\n| Acétate de zinc | E 650 |\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d’application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Sulfate d’aluminium | E 520 | Cire microcristalline | E 905 |\\n|------------------------------------------|--------|-----------------------------------------------------|--------|\\n| Sulfate d’aluminium sodique | E 521 | Poly-1-décène hydrogéné | E 907 |\\n| Sulfate d’aluminium potassique | E 522 | Cire de polyéthylène oxydée | E 914 |\\n| Sulfate d’aluminium ammonique | E 523 | Butane | E 943a |\\n| Hydroxyde d’ammonium | E 527 | Isobutane | E 943b |\\n| Oxyde de calcium | E 529 | Propane | E 944 |\\n| Oxyde de magnésium | E 530 | Acésulfame-K | E 950 |\\n| Ferrocyanure de calcium | E 538 | Aspartame | E 951 |\\n| Phosphate d’aluminium sodique acide | E 541 | Cyclamates | E 952 |\\n| Isomalt | E 953 | Polyvinylpolypyrrolidone | E 1202 |\\n| Saccharines | E 954 | Alcool polyvinylique (APV) | E 1203 |\\n| Sucralose | E 955 | Copolymère méthacrylate basique | E 1205 |\\n| Thaumatine | E 957 | Copolymère de méthacrylate neutre | E 1206 |\\n| Néotame | E 961 | Copolymère de méthacrylate anionique | E 1207 |\\n| Sel d’aspartame-acésulfame | E 962 | Copolymère d’acétate de vinyle et de polyvinylpyrrolidone | E 1208 |\\n| Sirop de polyglycitol | E 964 | Copolymère greffé d’alcool polyvinylique et de polyéthylèneglycol | E 1209 |\\n| Maltitols | E 965 | Octényl succinate d’amidon d’aluminium | E 1452 |\\n| Xylitol | E 967 | Diacétate de glycéryle (diacéitine) | E 1517 |\\n| Érythritol | E 968 | Alcool benzylique | E 1519 |\\n| Polyvinylpyrrolidone | E 1201 | Polyéthylène glycol | E 1521 |\\n\\n---\\n\\n# CHARTE 
PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires\\n\\n| Additif | Code | Additif | Code |\\n|---------|------|---------|------|\\n| Acide carminique, carmins | E 120 | Esters lactiques des mono- et diglycérides d’acides gras | E 472b |\\n| Bleu patenté V | E 131 | Esters citriques des mono- et diglycérides d’acides gras | E 472c |\\n| Caramel de sulfite caustique | E 150b | Esters tartriques des mono- et diglycérides d’acides gras | E 472d |\\n| Or | E 175 | Esters monoacétyltartriques et diacétyltartriques des mono- et diglycérides d’acides gras | E 472e |\\n| Acide benzoïque | E 210 | Esters mixtes acétiques et tartriques des mono- et diglycérides d’acides gras | E 472f |\\n| Benzoate de sodium | E 211 | Sucroesters d’acides gras | E 473 |\\n| Anhydride sulfureux | E 220 | Esters polyglycériques d’acides gras | E 475 |\\n| Sulfite de sodium | E 221 | Esters de propane-1,2-diol d’acides gras | E 477 |\\n| Sulfite acide de sodium | E 222 | Stéaroyl-2-lactylate de sodium | E 481 |\\n| Disulfite de sodium | E 223 | Stéaroyl-2-lactylate de calcium | E 482 |\\n| Disulfite de potassium | E 224 | Tartrate de stéaryle | E 483 |\\n| Sulfite de calcium | E 226 | Diméthylpolysiloxane | E 900 |\\n| Sulfite acide de calcium | E 227 | Advantame | E 969 |\\n| Sulfite acide de potassium | E 228 | Extraits de quillaia | E 999 |\\n| Natamycine | E 235 | Lysozyme | E 1105 |\\n| Éthylène-diamine-tétra-acétate de calcium disodium (calcium disodium EDTA) | E 385 | Amidon oxydé | E 1404 |\\n| Alginate de propane-1,2-diol | E 405 | Phosphate de monoamidon | E 1410 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Algues Euchema transformées | E 407a | Phosphate de diamidon | E 1412 |\\n|-----------------------------|--------|-----------------------|--------|\\n| Carraghénanes | E 407 | Phosphate de diamidon phosphaté | E 1413 |\\n| Cellulose | E 460 | Phosphate de diamidon acétylé | E 1414 |\\n| Hydroxypropylcellulose | E 463 | Amidon acétylé | E 1420 |\\n| Hydroxypropylméthylcellulose| E 464 | Adipate de diamidon acétylé | E 1422 |\\n| Carboxyméthyl-cellulose sodique, gomme cellulosique | E 466 | Amidon hydroxypropylé | E 1440 |\\n| Sels de sodium, de potassium, calcium d’acides gras, magnésium d’acides gras | E 470 | Phosphate de diamidon hydroxypropylé | E 1442 |\\n| Mono- et diglycérides d’acides gras | E 471 | Octényle succinate d’amidon sodique | E 1450 |\\n| Esters acétiques des mono- et diglycérides d’acides gras | E 472a | Amidon oxydé acétylé | E 1451 |\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n**Codification**: CDP_QUA_CHART_01 \\n**Version**: 5 \\n**Date d'application**: 13/12/2023 \\n\\n## ANNEXE III : Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé.\\n\\n| Additif | Code | Additif | Code |\\n|----------------------------------------------|-------|----------------------------------------------|-------|\\n| Curcumine | E 100 | Tartrates de potassium | E 336 |\\n| Riboflavines | E 101 | Malate de potassium | E 351 |\\n| Chlorophylles et chlorophyllines | E 140 | Acide métatartarique | E 353 |\\n| Complexes cuivre-chlorophylles et cuivre-chlorophyllines | E 141 | Tartrate de calcium | E 354 |\\n| Caramel ordinaire | E 150a| Extraits de romarin | E 392 |\\n| Charbon végétal médicinal | 
E 153 | Acide alginique | E 400 |\\n| Caroténoïdes | E 160a| Alginate de sodium | E 401 |\\n| Bixine de rocou / Norbixine de rocou | E 160b| Alginate de calcium | E 404 |\\n| Extrait de paprika, capsanthine, capsorubine| E 160c| Agar-agar | E 406 |\\n| Lycopène | E 160d| Farine de graines de caroube | E 410 |\\n| β- apocaroténal-8' (C 30) | E 160e| Gomme guar | E 412 |\\n| Lutéine | E 161b| Gomme adragante | E 413 |\\n| Rouge de betterave, bétanine | E 162 | Gomme arabique ou gomme d'acacia | E 414 |\\n| Anthocyanes | E 163 | Gomme xanthane | E 415 |\\n| Acide sorbique | E 200 | Gomme Karaya | E 416 |\\n| Sorbate de potassium | E 202 | Gomme Tara | E 417 |\\n| Acide acétique | E 260 | Gomme Gellane | E 418 |\\n| Acétates de potassium | E 261 | Sorbitols | E 420 |\\n| Acétates de sodium | E 262 | Glycérol | E 422 |\\n| Acétate de calcium | E 263 | Gomme cassia | E 427 |\\n| Acide lactique | E 270 | Pectines | E 440 |\\n| Propionate de calcium | E 282 | Méthylcellulose | E 461 |\\n| Dioxyde de carbone | E 290 | Sels de sodium, de potassium et de calcium d'acides gras | E 470a |\\n| Acide malique | E 296 | Sels de magnésium d'acides gras | E 470b |\\n| Acide ascorbique | E 300 | Polyglycérols de polyglycérol | E 476 |\\n| Ascorbate de sodium | E 301 | Phytostérols riches en stigmasterol | E 499 |\\n| Ascorbate de calcium | E 302 | Carbonates de sodium | E 500 |\\n| Esters d'acides gras de l'acide ascorbique | E 304 | Carbonates de potassium | E 501 |\\n| Extrait riche en tocophérols | E 306 | Carbonates d'ammonium | E 503 |\\n| Alpha-tocophérol | E 307 | Carbonates de magnésium | E 504 |\\n| Érythorbate de sodium | E 316 | Acide chlorhydrique | E 507 |\\n| Lécithines | E 322 | Chlorure de potassium | E 508 |\\n| Lactate de sodium | E 325 | Chlorure de calcium | E 509 |\\n| Lactate de potassium | E 326 | Chlorure de magnésium | E 511 |\\n| Lactate de calcium | E 327 | Acide sulfurique | E 513 |\\n| Acide citrique | E 330 | Sulfates de sodium | E 514 |\\n| Citrates de sodium | E 331 | Sulfates de potassium | E 515 |\\n| Citrates de potassium | E 332 | Sulfate de calcium | E 516 |\\n| Citrates de calcium | E 333 | Sulfate d'ammonium | E 517 |\\n| Acide tartrique [L (+)] | E 334 | Hydroxyde de sodium | E 524 |\\n\\n---\\n\\n# CHARTE PRODUITS COUP DE PATES\\n\\n| Hydroxyde de potassium | E 525 | Shellac | E 904 |\\n|------------------------------|--------|----------------------------------|--------|\\n| Hydroxyde de calcium | E 526 | L-cystéine | E 920 |\\n| Hydroxyde de magnésium | E 528 | Carbamide | E 927b |\\n| Tartrate de fer | E 534 | Argon | E 938 |\\n| Ferrocyanure de sodium | E 535 | Hélium | E 939 |\\n| Ferrocyanure de potassium | E 536 | Azote | E 941 |\\n| Acides gras | E 570 | Protoxyde d’azote | E 942 |\\n| Acide gluconique | E 574 | Oxygène | E 948 |\\n| Glucono-delta-lactone | E 575 | Hydrogène | E 949 |\\n| Gluconate de sodium | E 576 | Néo-hespéridine DC | E 959 |\\n| Gluconate de potassium | E 577 | Glycosides de stéviol | E 960 |\\n| Gluconate de calcium | E 578 | Lactitol | E 966 |\\n| Gluconate ferreux | E 579 | Invertase | E 1103 |\\n| Lactate ferreux | E 585 | Polydextrose | E 1200 |\\n| L-leucine | E 641 | Pullulan | E 1204 |\\n| Cire d’abeille blanche et jaune | E 901 | Citrate de triéthyle | E 1505 |\\n| Cire de candelilla | E 902 | Triacétate de glycéryle (triacétine) | E 1518 |\\n| Cire de carnauba | E 903 | Propanediol-1,2 (propylène glycol) | E 1520 |\\n\\n## ANNEXE IV : Ingrédients controversés : ingrédients faisant l’objet de rapports scientifiques controversés 
et/ou perçus négativement par le consommateur.\\n\\n| Ingrédient | Motif |\\n|---------------------------|-----------------------------------------------------------------------|\\n| Sirop de glucose-fructose | Niveau de transformation élevé + manque de transparence sur le niveau de sucre présent dans le produit |\\n| Maltodextrine | Ingrédient sans intérêt nutritionnel et organoleptique |\\n| Huile de coco/coprah | Contient 80% d’acides gras saturés dont l’excès augmente le risque de maladies cardiovasculaires |\\n| Sirop de maïs | Niveau de transformation élevé + manque de transparence sur le niveau de sucre présent dans le produit |\\n\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n" + ] + } + ], + "source": [ + "## Read PDF files\n", + "from typing import List\n", + "from llama_index.core.schema import Document\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "#GET LLAMA_CLOUD_API_KEY\n", + "import os\n", + "from llama_parse import LlamaParse\n", + "from llama_parse.utils import ResultType, Language\n", + "\n", + "api_key: str | None = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "print(api_key)\n", + "\n", + "parsing_instructions = \"Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables.\"\n", + "\n", + "parser = LlamaParse(\n", + " api_key=str(api_key), \n", + " result_type=ResultType.MD,\n", + " gpt4o_mode=True,\n", + " verbose=True,\n", + " language=Language.FRENCH,\n", + " parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction\n", + ")\n", + "# sync\n", + "documents: List[Document] = parser.load_data(\"../input/CDP_CHARTE_PRODUITS.pdf\")\n", + "print(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# with open(\"../output/CDP_CHARTE_PRODUITS__llamaParse.md\", \"w\") as f:\n", + "# f.write(documents[0].get_content())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "106\n", + "Found 76 unique paragraphs on 106 paragraphs.\n", + "Found 30 duplicate paragraphs.\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append('..')\n", + "from src.markdown_processor import MarkdownProcessor\n", + "\n", + "md_result: str = documents[0].get_content()\n", + "\n", + "output_path = \"../output/CDP_CHARTE_PRODUITS__llamaParse_cleaned.md\"\n", + "processor = MarkdownProcessor(md_result, strict=True, remove_pagination=True)\n", + "md_cleaned = processor.process()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "processor.save_cleaned_result(md_cleaned, output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started parsing the file under job_id b82bb4ce-18ac-4c84-a2c8-f48ab418aae0\n", + "106\n", + "Found 76 unique paragraphs on 106 paragraphs.\n", + "Found 30 duplicate paragraphs.\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "sys.path.append('..')\n", + "from src.markdown_processor import MarkdownProcessor\n", + "from src.converter import PDFConverter\n", + "import os \n", + "\n", + "api_key: str | None = 
os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "\n", + "converter = PDFConverter(api_key=str(api_key))\n", + "md_content = converter.convert(\"../input/CDP_CHARTE_PRODUITS.pdf\")\n", + "converter.save_md(md_content, Path(\"../output/CDP_CHARTE_PRODUITS.md\"))\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "QuivrParse-DS8JDGq8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/notebooks/pptx2md.ipynb b/backend/core/MegaParse/notebooks/pptx2md.ipynb new file mode 100644 index 000000000000..1d4102b5d254 --- /dev/null +++ b/backend/core/MegaParse/notebooks/pptx2md.ipynb @@ -0,0 +1,46 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from src.Converter import PPTXConverter\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "converter = PPTXConverter()\n", + "md = converter.convert(\"./input/Quivr_Monotype_Proposal.pptx\")\n", + "converter.save_md(md, Path(\"./output/Quivr_Monotype_Proposal.md\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "QuivrParse-DS8JDGq8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/notebooks/test.ipynb b/backend/core/MegaParse/notebooks/test.ipynb new file mode 100644 index 000000000000..bfa2a7738e35 --- /dev/null +++ b/backend/core/MegaParse/notebooks/test.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from megaparse.Converter import MegaParse\n", + "from IPython.display import display_markdown\n", + "import pdfminer\n", + "from pdfminer.image import ImageWriter\n", + "from pdfminer.high_level import extract_pages\n", + "\n", + "import fitz\n", + "import io\n", + "from PIL import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "file_path = \"megaparse/tests/input_tests/MegaFake_report.pdf\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "megaparse = MegaParse(file_path=file_path)\n", + "content = megaparse.convert()\n", + "megaparse.save_md(md_content=content, file_path=\"./content.md\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display_markdown(content, raw=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# def extract_images_from_pdf(pdf_file_path, output_dir):\n", + "# iw = ImageWriter(output_dir)\n", + "# image_count = 0\n", + "\n", + "# for page_num, page_layout in enumerate(extract_pages(pdf_file_path)):\n", + "# for image in get_images_from_page(page_layout):\n", + "# image_name = 
f\"image_{image_count}_page_{page_num}.png\"\n", + "# iw.export_image(image)\n", + "# image_count += 1\n", + "\n", + "\n", + "# def get_images_from_page(page_layout):\n", + "# if isinstance(page_layout, pdfminer.layout.LTImage):\n", + "# return [page_layout]\n", + "# if isinstance(page_layout, pdfminer.layout.LTContainer):\n", + "# img_list = []\n", + "# for child in page_layout:\n", + "# img_list += get_images_from_page(child)\n", + "# return img_list\n", + "# else:\n", + "# return []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_images_from_pdf(pdf_file_path: str, output_dir: str):\n", + " pdf_file = fitz.open(pdf_file_path)\n", + " for page_number in range(1, len(pdf_file)):\n", + " page = pdf_file[page_number]\n", + " for image_index, img in enumerate(page.get_images(), start=1):\n", + " xref = img[0]\n", + " base_image = pdf_file.extract_image(xref)\n", + " image_bytes = base_image[\"image\"]\n", + " image_ext = base_image[\"ext\"]\n", + " pil_image = Image.open(io.BytesIO(image_bytes))\n", + " image_path = (\n", + " f\"{output_dir}image_{image_index}_page_{page_number}.{image_ext}\"\n", + " )\n", + " pil_image.save(image_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "1\n", + "1\n", + "1\n", + "1\n" + ] + } + ], + "source": [ + "extract_images_from_pdf(pdf_file_path=file_path, output_dir=\"output/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ENV", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/notebooks/unstructured.ipynb b/backend/core/MegaParse/notebooks/unstructured.ipynb new file mode 100644 index 000000000000..6174d3341931 --- /dev/null +++ b/backend/core/MegaParse/notebooks/unstructured.ipynb @@ -0,0 +1,71 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mega Parse" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']\n", + "- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "sys.path.append('..')\n", + "from megaparse.unstructured import UnstructuredParser\n", + "import os \n", + "\n", + "unstructured = UnstructuredParser()\n", + "file_partitioned = unstructured.partition_pdf_file('../megaparse/tests/input_tests/MegaFake_report.pdf')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "unstructured2 = UnstructuredParser()\n", + "\n", + "\n", + "elements_dict = [el.to_dict() for el in file_partitioned]\n", + "markdown_content = unstructured2.convert_to_markdown(elements_dict)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "QuivrParse-DS8JDGq8", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/backend/core/MegaParse/pyproject.toml b/backend/core/MegaParse/pyproject.toml new file mode 100644 index 000000000000..20bdf1bb14de --- /dev/null +++ b/backend/core/MegaParse/pyproject.toml @@ -0,0 +1,53 @@ +[project] +name = "megaparse" +version = "0.0.31" +description = "Parse complex files (PDF,Docx,PPTX) for LLM consumption" +authors = [ + { name = "Stan Girard", email = "stan@quivr.app" }, + { name = "Chloé Daems", email = "chloe@quivr.app" } +] +readme = "README.md" +dependencies = [ + "python-docx>=1.1.0", + "mammoth>=1.8.0", + "python-pptx>=1.0.2", + "llama-parse>=0.4.0", + "pdf2docx>=0.5.0", + "unstructured[pdf]>=0.15.0", + "langchain>=0.2.0", + "langchain-community>=0.2.0", + "langchain-openai>=0.1.0", + "langchain-core>=0.2.0", + "python-dotenv>=1.0.0", + "pycryptodome>=3.20.0", + "llama-index>=0.10.0", + "pdfplumber>=0.11.0", +] +python = "^3.11" + + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.rye] +managed = true +universal = true +dev-dependencies = [ + "mypy>=1.11.1", + "pre-commit>=3.8.0", + "ipykernel>=6.29.5", + "ruff>=0.6.0", + "flake8>=7.1.1", + "flake8-black>=0.3.6", + "pytest-asyncio>=0.23.8", + "pytest>=8.3.2", + "pytest-xdist>=3.6.1", + "pytest-cov>=5.0.0", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["megaparse"] \ No newline at end of file diff --git a/backend/core/MegaParse/release-please-config.json b/backend/core/MegaParse/release-please-config.json new file mode 100644 index 000000000000..f954720b905a --- /dev/null +++ b/backend/core/MegaParse/release-please-config.json @@ -0,0 +1,11 @@ +{ + "packages": { + ".": { + "release-type": "python", + "package-name": "megaparse", + "bump-patch-for-minor-pre-major": true, + "changelog-notes-type": "github", + "include-v-in-tag": true + } + } +} diff --git a/backend/core/MegaParse/requirements-dev.lock b/backend/core/MegaParse/requirements-dev.lock new file mode 100644 index 000000000000..02ee108e3c32 --- /dev/null +++ b/backend/core/MegaParse/requirements-dev.lock @@ -0,0 +1,710 @@ +# 
generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: true +# with-sources: false +# generate-hashes: false +# universal: true + +-e file:. +aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +aiosignal==1.3.1 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.4.0 + # via httpx + # via openai +appnope==0.1.4 ; platform_system == 'Darwin' + # via ipykernel +asttokens==2.4.1 + # via stack-data +attrs==24.2.0 + # via aiohttp +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via llama-index-readers-file + # via unstructured +black==24.8.0 + # via flake8-black +cachetools==5.5.0 + # via google-auth +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests + # via unstructured-client +cffi==1.17.0 ; implementation_name == 'pypy' or platform_python_implementation != 'PyPy' + # via cryptography + # via pyzmq +cfgv==3.4.0 + # via pre-commit +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via pdfminer-six + # via requests + # via unstructured-client +click==8.1.7 + # via black + # via nltk +cobble==0.1.4 + # via mammoth +colorama==0.4.6 ; platform_system == 'Windows' or sys_platform == 'win32' + # via click + # via ipython + # via pytest + # via tqdm +coloredlogs==15.0.1 + # via onnxruntime +comm==0.2.2 + # via ipykernel +contourpy==1.2.1 + # via matplotlib +coverage==7.6.1 + # via pytest-cov +cryptography==43.0.0 + # via pdfminer-six +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community + # via llama-index-core + # via llama-index-legacy + # via unstructured + # via unstructured-client +debugpy==1.8.5 + # via ipykernel +decorator==5.1.1 + # via ipython +deepdiff==7.0.1 + # via unstructured-client +deprecated==1.2.14 + # via llama-index-core + # via llama-index-legacy + # via pikepdf +dirtyjson==1.0.8 + # via llama-index-core + # via llama-index-legacy +distlib==0.3.8 + # via virtualenv +distro==1.9.0 + # via openai +effdet==0.4.1 + # via unstructured +emoji==2.12.1 + # via unstructured +execnet==2.1.1 + # via pytest-xdist +executing==2.0.1 + # via stack-data +filelock==3.15.4 + # via huggingface-hub + # via torch + # via transformers + # via triton + # via virtualenv +filetype==1.2.0 + # via unstructured +fire==0.6.0 + # via pdf2docx +flake8==7.1.1 + # via flake8-black +flake8-black==0.3.6 +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.53.1 + # via matplotlib + # via pdf2docx +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub + # via llama-index-core + # via llama-index-legacy + # via torch +google-api-core==2.19.1 + # via google-cloud-vision +google-auth==2.34.0 + # via google-api-core + # via google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.63.2 + # via google-api-core + # via grpcio-status +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.65.5 + # via google-api-core + # via grpcio-status +grpcio-status==1.65.5 + # via google-api-core +h11==0.14.0 + # via httpcore +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via llama-cloud + # via llama-index-core + # via llama-index-legacy + # via openai + # via unstructured-client +huggingface-hub==0.24.6 + # via timm + # via tokenizers + # via transformers + # via unstructured-inference 
+humanfriendly==10.0 + # via coloredlogs +identify==2.6.0 + # via pre-commit +idna==3.7 + # via anyio + # via httpx + # via requests + # via unstructured-client + # via yarl +iniconfig==2.0.0 + # via pytest +iopath==0.1.10 + # via layoutparser +ipykernel==6.29.5 +ipython==8.26.0 + # via ipykernel +jedi==0.19.1 + # via ipython +jinja2==3.1.4 + # via torch +jiter==0.5.0 + # via openai +joblib==1.4.2 + # via nltk +jsonpatch==1.33 + # via langchain-core +jsonpath-python==1.0.6 + # via unstructured-client +jsonpointer==3.0.0 + # via jsonpatch +jupyter-client==8.6.2 + # via ipykernel +jupyter-core==5.7.2 + # via ipykernel + # via jupyter-client +kiwisolver==1.4.5 + # via matplotlib +langchain==0.2.14 + # via langchain-community + # via megaparse +langchain-community==0.2.12 + # via megaparse +langchain-core==0.2.33 + # via langchain + # via langchain-community + # via langchain-openai + # via langchain-text-splitters + # via megaparse +langchain-openai==0.1.22 + # via megaparse +langchain-text-splitters==0.2.2 + # via langchain +langdetect==1.0.9 + # via unstructured +langsmith==0.1.99 + # via langchain + # via langchain-community + # via langchain-core +layoutparser==0.3.4 + # via unstructured-inference +llama-cloud==0.0.13 + # via llama-index-indices-managed-llama-cloud +llama-index==0.10.67.post1 + # via megaparse +llama-index-agent-openai==0.2.9 + # via llama-index + # via llama-index-program-openai +llama-index-cli==0.1.13 + # via llama-index +llama-index-core==0.10.67 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-embeddings-openai + # via llama-index-indices-managed-llama-cloud + # via llama-index-llms-openai + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai + # via llama-index-readers-file + # via llama-index-readers-llama-parse + # via llama-parse +llama-index-embeddings-openai==0.1.11 + # via llama-index + # via llama-index-cli +llama-index-indices-managed-llama-cloud==0.2.7 + # via llama-index +llama-index-legacy==0.9.48.post3 + # via llama-index +llama-index-llms-openai==0.1.29 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai +llama-index-multi-modal-llms-openai==0.1.9 + # via llama-index +llama-index-program-openai==0.1.7 + # via llama-index + # via llama-index-question-gen-openai +llama-index-question-gen-openai==0.1.3 + # via llama-index +llama-index-readers-file==0.1.33 + # via llama-index +llama-index-readers-llama-parse==0.1.6 + # via llama-index +llama-parse==0.4.9 + # via llama-index-readers-llama-parse + # via megaparse +lxml==5.3.0 + # via pikepdf + # via python-docx + # via python-pptx + # via unstructured +mammoth==1.8.0 + # via megaparse +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.21.3 + # via dataclasses-json + # via unstructured-client +matplotlib==3.9.2 + # via pycocotools + # via unstructured-inference +matplotlib-inline==0.1.7 + # via ipykernel + # via ipython +mccabe==0.7.0 + # via flake8 +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +mypy==1.11.1 +mypy-extensions==1.0.0 + # via black + # via mypy + # via typing-inspect + # via unstructured-client +nest-asyncio==1.6.0 + # via ipykernel + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +networkx==3.3 + # via llama-index-core + # via llama-index-legacy + # via torch +nltk==3.9.1 + # via 
llama-index-core + # via llama-index-legacy + # via unstructured +nodeenv==1.9.1 + # via pre-commit +numpy==1.26.4 + # via contourpy + # via langchain + # via langchain-community + # via layoutparser + # via llama-index-core + # via llama-index-legacy + # via matplotlib + # via onnx + # via onnxruntime + # via opencv-python + # via opencv-python-headless + # via pandas + # via pdf2docx + # via pycocotools + # via scipy + # via torchvision + # via transformers + # via unstructured +nvidia-cublas-cu12==12.1.3.1 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cuda-runtime-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cufft-cu12==11.0.2.54 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-curand-cu12==10.3.2.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cusolver-cu12==11.4.5.107 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cusparse-cu12==12.1.0.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.20.5 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-nvjitlink-cu12==12.6.20 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +omegaconf==2.3.0 + # via effdet +onnx==1.16.2 + # via unstructured + # via unstructured-inference +onnxruntime==1.19.0 + # via unstructured-inference +openai==1.41.1 + # via langchain-openai + # via llama-index-agent-openai + # via llama-index-core + # via llama-index-legacy + # via llama-index-llms-openai +opencv-python==4.10.0.84 + # via layoutparser + # via unstructured-inference +opencv-python-headless==4.10.0.84 + # via pdf2docx +ordered-set==4.1.0 + # via deepdiff +orjson==3.10.7 + # via langsmith +packaging==24.1 + # via black + # via huggingface-hub + # via ipykernel + # via langchain-core + # via marshmallow + # via matplotlib + # via onnxruntime + # via pikepdf + # via pytest + # via transformers + # via unstructured-client + # via unstructured-pytesseract +pandas==2.2.2 + # via layoutparser + # via llama-index-core + # via llama-index-legacy +parso==0.8.4 + # via jedi +pathspec==0.12.1 + # via black +pdf2docx==0.5.8 + # via megaparse +pdf2image==1.17.0 + # via layoutparser + # via unstructured +pdfminer-six==20231228 + # via pdfplumber + # via unstructured +pdfplumber==0.11.4 + # via layoutparser + # via megaparse +pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' + # via ipython +pikepdf==9.1.1 + # via unstructured +pillow==10.4.0 + # via layoutparser + # via llama-index-core + # via matplotlib + # via pdf2image + # via pdfplumber + # via pikepdf + # via pillow-heif + # via python-pptx + # via torchvision + # via unstructured-pytesseract +pillow-heif==0.18.0 + # via unstructured +platformdirs==4.2.2 + # via black + # via jupyter-core + # via virtualenv +pluggy==1.5.0 + # via pytest 
+portalocker==2.10.1 + # via iopath +pre-commit==3.8.0 +prompt-toolkit==3.0.47 + # via ipython +proto-plus==1.24.0 + # via google-api-core + # via google-cloud-vision +protobuf==5.27.3 + # via google-api-core + # via google-cloud-vision + # via googleapis-common-protos + # via grpcio-status + # via onnx + # via onnxruntime + # via proto-plus +psutil==6.0.0 + # via ipykernel + # via unstructured +ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32' + # via pexpect +pure-eval==0.2.3 + # via stack-data +pyasn1==0.6.0 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.0 + # via google-auth +pycocotools==2.0.8 + # via effdet +pycodestyle==2.12.1 + # via flake8 +pycparser==2.22 ; implementation_name == 'pypy' or platform_python_implementation != 'PyPy' + # via cffi +pycryptodome==3.20.0 + # via megaparse +pydantic==2.8.2 + # via langchain + # via langchain-core + # via langsmith + # via llama-cloud + # via openai +pydantic-core==2.20.1 + # via pydantic +pyflakes==3.2.0 + # via flake8 +pygments==2.18.0 + # via ipython +pymupdf==1.24.9 + # via pdf2docx +pymupdfb==1.24.9 + # via pymupdf +pyparsing==3.1.2 + # via matplotlib +pypdf==4.3.1 + # via llama-index-readers-file + # via unstructured + # via unstructured-client +pypdfium2==4.30.0 + # via pdfplumber +pyreadline3==3.4.1 ; sys_platform == 'win32' + # via humanfriendly +pytest==8.3.2 + # via pytest-asyncio + # via pytest-cov + # via pytest-xdist +pytest-asyncio==0.23.8 +pytest-cov==5.0.0 +pytest-xdist==3.6.1 +python-dateutil==2.9.0.post0 + # via jupyter-client + # via matplotlib + # via pandas + # via unstructured-client +python-docx==1.1.2 + # via megaparse + # via pdf2docx +python-dotenv==1.0.1 + # via megaparse +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-multipart==0.0.9 + # via unstructured-inference +python-pptx==1.0.2 + # via megaparse +pytz==2024.1 + # via pandas +pywin32==306 ; platform_system == 'Windows' or (platform_python_implementation != 'PyPy' and sys_platform == 'win32') + # via jupyter-core + # via portalocker +pyyaml==6.0.2 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via layoutparser + # via llama-index-core + # via omegaconf + # via pre-commit + # via timm + # via transformers +pyzmq==26.1.1 + # via ipykernel + # via jupyter-client +rapidfuzz==3.9.6 + # via unstructured + # via unstructured-inference +regex==2024.7.24 + # via nltk + # via tiktoken + # via transformers +requests==2.32.3 + # via google-api-core + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via llama-index-core + # via llama-index-legacy + # via requests-toolbelt + # via tiktoken + # via transformers + # via unstructured + # via unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rsa==4.9 + # via google-auth +ruff==0.6.1 +safetensors==0.4.4 + # via timm + # via transformers +scipy==1.14.0 + # via layoutparser +setuptools==73.0.0 + # via torch +six==1.16.0 + # via asttokens + # via fire + # via langdetect + # via python-dateutil + # via unstructured-client +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +soupsieve==2.6 + # via beautifulsoup4 +sqlalchemy==2.0.32 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +stack-data==0.6.3 + # via ipython +striprtf==0.0.26 + # via llama-index-readers-file +sympy==1.13.2 + # via onnxruntime + # via torch +tabulate==0.9.0 + # via unstructured +tenacity==8.5.0 + # via 
langchain + # via langchain-community + # via langchain-core + # via llama-index-core + # via llama-index-legacy +termcolor==2.4.0 + # via fire +tiktoken==0.7.0 + # via langchain-openai + # via llama-index-core + # via llama-index-legacy +timm==1.0.8 + # via effdet + # via unstructured-inference +tokenizers==0.19.1 + # via transformers +torch==2.4.0 + # via effdet + # via timm + # via torchvision + # via unstructured-inference +torchvision==0.19.0 + # via effdet + # via timm +tornado==6.4.1 + # via ipykernel + # via jupyter-client +tqdm==4.66.5 + # via huggingface-hub + # via iopath + # via llama-index-core + # via nltk + # via openai + # via transformers + # via unstructured +traitlets==5.14.3 + # via comm + # via ipykernel + # via ipython + # via jupyter-client + # via jupyter-core + # via matplotlib-inline +transformers==4.44.0 + # via unstructured-inference +triton==3.0.0 ; python_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +typing-extensions==4.12.2 + # via emoji + # via huggingface-hub + # via iopath + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via mypy + # via openai + # via pydantic + # via pydantic-core + # via python-docx + # via python-pptx + # via sqlalchemy + # via torch + # via typing-inspect + # via unstructured + # via unstructured-client +typing-inspect==0.9.0 + # via dataclasses-json + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +tzdata==2024.1 + # via pandas +unstructured==0.15.5 + # via megaparse +unstructured-client==0.25.5 + # via unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==2.2.2 + # via requests + # via unstructured-client +virtualenv==20.26.3 + # via pre-commit +wcwidth==0.2.13 + # via prompt-toolkit +wrapt==1.16.0 + # via deprecated + # via llama-index-core + # via unstructured +xlsxwriter==3.2.0 + # via python-pptx +yarl==1.9.4 + # via aiohttp diff --git a/backend/core/MegaParse/requirements.lock b/backend/core/MegaParse/requirements.lock new file mode 100644 index 000000000000..9c58f6746245 --- /dev/null +++ b/backend/core/MegaParse/requirements.lock @@ -0,0 +1,594 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: true +# with-sources: false +# generate-hashes: false +# universal: true + +-e file:. 
+aiohappyeyeballs==2.4.0 + # via aiohttp +aiohttp==3.10.5 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +aiosignal==1.3.1 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via omegaconf +anyio==4.4.0 + # via httpx + # via openai +attrs==24.2.0 + # via aiohttp +backoff==2.2.1 + # via unstructured +beautifulsoup4==4.12.3 + # via llama-index-readers-file + # via unstructured +cachetools==5.5.0 + # via google-auth +certifi==2024.7.4 + # via httpcore + # via httpx + # via requests + # via unstructured-client +cffi==1.17.0 ; platform_python_implementation != 'PyPy' + # via cryptography +chardet==5.2.0 + # via unstructured +charset-normalizer==3.3.2 + # via pdfminer-six + # via requests + # via unstructured-client +click==8.1.7 + # via nltk +cobble==0.1.4 + # via mammoth +colorama==0.4.6 ; platform_system == 'Windows' + # via click + # via tqdm +coloredlogs==15.0.1 + # via onnxruntime +contourpy==1.2.1 + # via matplotlib +cryptography==43.0.0 + # via pdfminer-six +cycler==0.12.1 + # via matplotlib +dataclasses-json==0.6.7 + # via langchain-community + # via llama-index-core + # via llama-index-legacy + # via unstructured + # via unstructured-client +deepdiff==7.0.1 + # via unstructured-client +deprecated==1.2.14 + # via llama-index-core + # via llama-index-legacy + # via pikepdf +dirtyjson==1.0.8 + # via llama-index-core + # via llama-index-legacy +distro==1.9.0 + # via openai +effdet==0.4.1 + # via unstructured +emoji==2.12.1 + # via unstructured +filelock==3.15.4 + # via huggingface-hub + # via torch + # via transformers + # via triton +filetype==1.2.0 + # via unstructured +fire==0.6.0 + # via pdf2docx +flatbuffers==24.3.25 + # via onnxruntime +fonttools==4.53.1 + # via matplotlib + # via pdf2docx +frozenlist==1.4.1 + # via aiohttp + # via aiosignal +fsspec==2024.6.1 + # via huggingface-hub + # via llama-index-core + # via llama-index-legacy + # via torch +google-api-core==2.19.1 + # via google-cloud-vision +google-auth==2.34.0 + # via google-api-core + # via google-cloud-vision +google-cloud-vision==3.7.4 + # via unstructured +googleapis-common-protos==1.63.2 + # via google-api-core + # via grpcio-status +greenlet==3.0.3 + # via sqlalchemy +grpcio==1.65.5 + # via google-api-core + # via grpcio-status +grpcio-status==1.65.5 + # via google-api-core +h11==0.14.0 + # via httpcore +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via llama-cloud + # via llama-index-core + # via llama-index-legacy + # via openai + # via unstructured-client +huggingface-hub==0.24.6 + # via timm + # via tokenizers + # via transformers + # via unstructured-inference +humanfriendly==10.0 + # via coloredlogs +idna==3.7 + # via anyio + # via httpx + # via requests + # via unstructured-client + # via yarl +iopath==0.1.10 + # via layoutparser +jinja2==3.1.4 + # via torch +jiter==0.5.0 + # via openai +joblib==1.4.2 + # via nltk +jsonpatch==1.33 + # via langchain-core +jsonpath-python==1.0.6 + # via unstructured-client +jsonpointer==3.0.0 + # via jsonpatch +kiwisolver==1.4.5 + # via matplotlib +langchain==0.2.14 + # via langchain-community + # via megaparse +langchain-community==0.2.12 + # via megaparse +langchain-core==0.2.33 + # via langchain + # via langchain-community + # via langchain-openai + # via langchain-text-splitters + # via megaparse +langchain-openai==0.1.22 + # via megaparse +langchain-text-splitters==0.2.2 + # via langchain +langdetect==1.0.9 + # via unstructured +langsmith==0.1.99 + # via langchain + # via 
langchain-community + # via langchain-core +layoutparser==0.3.4 + # via unstructured-inference +llama-cloud==0.0.13 + # via llama-index-indices-managed-llama-cloud +llama-index==0.10.67.post1 + # via megaparse +llama-index-agent-openai==0.2.9 + # via llama-index + # via llama-index-program-openai +llama-index-cli==0.1.13 + # via llama-index +llama-index-core==0.10.67 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-embeddings-openai + # via llama-index-indices-managed-llama-cloud + # via llama-index-llms-openai + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai + # via llama-index-readers-file + # via llama-index-readers-llama-parse + # via llama-parse +llama-index-embeddings-openai==0.1.11 + # via llama-index + # via llama-index-cli +llama-index-indices-managed-llama-cloud==0.2.7 + # via llama-index +llama-index-legacy==0.9.48.post3 + # via llama-index +llama-index-llms-openai==0.1.29 + # via llama-index + # via llama-index-agent-openai + # via llama-index-cli + # via llama-index-multi-modal-llms-openai + # via llama-index-program-openai + # via llama-index-question-gen-openai +llama-index-multi-modal-llms-openai==0.1.9 + # via llama-index +llama-index-program-openai==0.1.7 + # via llama-index + # via llama-index-question-gen-openai +llama-index-question-gen-openai==0.1.3 + # via llama-index +llama-index-readers-file==0.1.33 + # via llama-index +llama-index-readers-llama-parse==0.1.6 + # via llama-index +llama-parse==0.4.9 + # via llama-index-readers-llama-parse + # via megaparse +lxml==5.3.0 + # via pikepdf + # via python-docx + # via python-pptx + # via unstructured +mammoth==1.8.0 + # via megaparse +markupsafe==2.1.5 + # via jinja2 +marshmallow==3.21.3 + # via dataclasses-json + # via unstructured-client +matplotlib==3.9.2 + # via pycocotools + # via unstructured-inference +mpmath==1.3.0 + # via sympy +multidict==6.0.5 + # via aiohttp + # via yarl +mypy-extensions==1.0.0 + # via typing-inspect + # via unstructured-client +nest-asyncio==1.6.0 + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +networkx==3.3 + # via llama-index-core + # via llama-index-legacy + # via torch +nltk==3.9.1 + # via llama-index-core + # via llama-index-legacy + # via unstructured +numpy==1.26.4 + # via contourpy + # via langchain + # via langchain-community + # via layoutparser + # via llama-index-core + # via llama-index-legacy + # via matplotlib + # via onnx + # via onnxruntime + # via opencv-python + # via opencv-python-headless + # via pandas + # via pdf2docx + # via pycocotools + # via scipy + # via torchvision + # via transformers + # via unstructured +nvidia-cublas-cu12==12.1.3.1 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cudnn-cu12 + # via nvidia-cusolver-cu12 + # via torch +nvidia-cuda-cupti-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cuda-runtime-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cudnn-cu12==9.1.0.70 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cufft-cu12==11.0.2.54 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-curand-cu12==10.3.2.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch 
+nvidia-cusolver-cu12==11.4.5.107 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-cusparse-cu12==12.1.0.106 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cusolver-cu12 + # via torch +nvidia-nccl-cu12==2.20.5 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +nvidia-nvjitlink-cu12==12.6.20 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via nvidia-cusolver-cu12 + # via nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 ; platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +omegaconf==2.3.0 + # via effdet +onnx==1.16.2 + # via unstructured + # via unstructured-inference +onnxruntime==1.19.0 + # via unstructured-inference +openai==1.41.1 + # via langchain-openai + # via llama-index-agent-openai + # via llama-index-core + # via llama-index-legacy + # via llama-index-llms-openai +opencv-python==4.10.0.84 + # via layoutparser + # via unstructured-inference +opencv-python-headless==4.10.0.84 + # via pdf2docx +ordered-set==4.1.0 + # via deepdiff +orjson==3.10.7 + # via langsmith +packaging==24.1 + # via huggingface-hub + # via langchain-core + # via marshmallow + # via matplotlib + # via onnxruntime + # via pikepdf + # via transformers + # via unstructured-client + # via unstructured-pytesseract +pandas==2.2.2 + # via layoutparser + # via llama-index-core + # via llama-index-legacy +pdf2docx==0.5.8 + # via megaparse +pdf2image==1.17.0 + # via layoutparser + # via unstructured +pdfminer-six==20231228 + # via pdfplumber + # via unstructured +pdfplumber==0.11.4 + # via layoutparser + # via megaparse +pikepdf==9.1.1 + # via unstructured +pillow==10.4.0 + # via layoutparser + # via llama-index-core + # via matplotlib + # via pdf2image + # via pdfplumber + # via pikepdf + # via pillow-heif + # via python-pptx + # via torchvision + # via unstructured-pytesseract +pillow-heif==0.18.0 + # via unstructured +portalocker==2.10.1 + # via iopath +proto-plus==1.24.0 + # via google-api-core + # via google-cloud-vision +protobuf==5.27.3 + # via google-api-core + # via google-cloud-vision + # via googleapis-common-protos + # via grpcio-status + # via onnx + # via onnxruntime + # via proto-plus +psutil==6.0.0 + # via unstructured +pyasn1==0.6.0 + # via pyasn1-modules + # via rsa +pyasn1-modules==0.4.0 + # via google-auth +pycocotools==2.0.8 + # via effdet +pycparser==2.22 ; platform_python_implementation != 'PyPy' + # via cffi +pycryptodome==3.20.0 + # via megaparse +pydantic==2.8.2 + # via langchain + # via langchain-core + # via langsmith + # via llama-cloud + # via openai +pydantic-core==2.20.1 + # via pydantic +pymupdf==1.24.9 + # via pdf2docx +pymupdfb==1.24.9 + # via pymupdf +pyparsing==3.1.2 + # via matplotlib +pypdf==4.3.1 + # via llama-index-readers-file + # via unstructured + # via unstructured-client +pypdfium2==4.30.0 + # via pdfplumber +pyreadline3==3.4.1 ; sys_platform == 'win32' + # via humanfriendly +python-dateutil==2.9.0.post0 + # via matplotlib + # via pandas + # via unstructured-client +python-docx==1.1.2 + # via megaparse + # via pdf2docx +python-dotenv==1.0.1 + # via megaparse +python-iso639==2024.4.27 + # via unstructured +python-magic==0.4.27 + # via unstructured +python-multipart==0.0.9 + # via unstructured-inference +python-pptx==1.0.2 + # via megaparse +pytz==2024.1 + # via pandas +pywin32==306 ; platform_system == 'Windows' + # via portalocker +pyyaml==6.0.2 + # via huggingface-hub + # via langchain + # via langchain-community + # via langchain-core + # via 
layoutparser + # via llama-index-core + # via omegaconf + # via timm + # via transformers +rapidfuzz==3.9.6 + # via unstructured + # via unstructured-inference +regex==2024.7.24 + # via nltk + # via tiktoken + # via transformers +requests==2.32.3 + # via google-api-core + # via huggingface-hub + # via langchain + # via langchain-community + # via langsmith + # via llama-index-core + # via llama-index-legacy + # via requests-toolbelt + # via tiktoken + # via transformers + # via unstructured + # via unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client +rsa==4.9 + # via google-auth +safetensors==0.4.4 + # via timm + # via transformers +scipy==1.14.0 + # via layoutparser +setuptools==73.0.0 + # via torch +six==1.16.0 + # via fire + # via langdetect + # via python-dateutil + # via unstructured-client +sniffio==1.3.1 + # via anyio + # via httpx + # via openai +soupsieve==2.6 + # via beautifulsoup4 +sqlalchemy==2.0.32 + # via langchain + # via langchain-community + # via llama-index-core + # via llama-index-legacy +striprtf==0.0.26 + # via llama-index-readers-file +sympy==1.13.2 + # via onnxruntime + # via torch +tabulate==0.9.0 + # via unstructured +tenacity==8.5.0 + # via langchain + # via langchain-community + # via langchain-core + # via llama-index-core + # via llama-index-legacy +termcolor==2.4.0 + # via fire +tiktoken==0.7.0 + # via langchain-openai + # via llama-index-core + # via llama-index-legacy +timm==1.0.8 + # via effdet + # via unstructured-inference +tokenizers==0.19.1 + # via transformers +torch==2.4.0 + # via effdet + # via timm + # via torchvision + # via unstructured-inference +torchvision==0.19.0 + # via effdet + # via timm +tqdm==4.66.5 + # via huggingface-hub + # via iopath + # via llama-index-core + # via nltk + # via openai + # via transformers + # via unstructured +transformers==4.44.0 + # via unstructured-inference +triton==3.0.0 ; python_version < '3.13' and platform_machine == 'x86_64' and platform_system == 'Linux' + # via torch +typing-extensions==4.12.2 + # via emoji + # via huggingface-hub + # via iopath + # via langchain-core + # via llama-index-core + # via llama-index-legacy + # via openai + # via pydantic + # via pydantic-core + # via python-docx + # via python-pptx + # via sqlalchemy + # via torch + # via typing-inspect + # via unstructured + # via unstructured-client +typing-inspect==0.9.0 + # via dataclasses-json + # via llama-index-core + # via llama-index-legacy + # via unstructured-client +tzdata==2024.1 + # via pandas +unstructured==0.15.5 + # via megaparse +unstructured-client==0.25.5 + # via unstructured +unstructured-inference==0.7.36 + # via unstructured +unstructured-pytesseract==0.3.13 + # via unstructured +urllib3==2.2.2 + # via requests + # via unstructured-client +wrapt==1.16.0 + # via deprecated + # via llama-index-core + # via unstructured +xlsxwriter==3.2.0 + # via python-pptx +yarl==1.9.4 + # via aiohttp diff --git a/backend/core/MegaParse/tests/__init__.py b/backend/core/MegaParse/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backend/core/MegaParse/tests/data/dummy.pdf b/backend/core/MegaParse/tests/data/dummy.pdf new file mode 100644 index 000000000000..8da27b526712 Binary files /dev/null and b/backend/core/MegaParse/tests/data/dummy.pdf differ diff --git a/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf b/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf new file mode 100644 index 000000000000..602ae67ecd19 Binary files /dev/null and 
b/backend/core/MegaParse/tests/data/input_tests/MegaFake_report.pdf differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample.docx b/backend/core/MegaParse/tests/data/input_tests/sample.docx new file mode 100644 index 000000000000..330bd5000310 Binary files /dev/null and b/backend/core/MegaParse/tests/data/input_tests/sample.docx differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample.pptx b/backend/core/MegaParse/tests/data/input_tests/sample.pptx new file mode 100644 index 000000000000..ea727948c20f Binary files /dev/null and b/backend/core/MegaParse/tests/data/input_tests/sample.pptx differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf b/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf new file mode 100644 index 000000000000..5dc1f2e3102c Binary files /dev/null and b/backend/core/MegaParse/tests/data/input_tests/sample_pdf.pdf differ diff --git a/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf b/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf new file mode 100644 index 000000000000..b8e1353851a0 Binary files /dev/null and b/backend/core/MegaParse/tests/data/input_tests/sample_table.pdf differ diff --git a/backend/core/MegaParse/tests/test_import.py b/backend/core/MegaParse/tests/test_import.py new file mode 100644 index 000000000000..72e196c3a9af --- /dev/null +++ b/backend/core/MegaParse/tests/test_import.py @@ -0,0 +1,10 @@ +import pytest + +from megaparse.Converter import MegaParse + + +@pytest.mark.skip("slow test") +def test_load(): + megaparse = MegaParse(file_path="./tests/data/dummy.pdf") + element = megaparse.load() + assert element.page_content.strip("\n") == "# Dummy PDF download" diff --git a/backend/core/examples/chat_llm.py b/backend/core/examples/chat_llm.py deleted file mode 100644 index 969f4b46655d..000000000000 --- a/backend/core/examples/chat_llm.py +++ /dev/null @@ -1,12 +0,0 @@ -from quivr_core import ChatLLM -from quivr_core.config import LLMEndpointConfig -from quivr_core.llm import LLMEndpoint - -if __name__ == "__main__": - llm_endpoint = LLMEndpoint.from_config(LLMEndpointConfig(model="gpt-4o-mini")) - chat_llm = ChatLLM( - llm=llm_endpoint, - ) - print(chat_llm.llm_endpoint.info()) - response = chat_llm.answer("Hello,what is your model?") - print(response) diff --git a/backend/core/examples/pdf_document_from_yaml.py b/backend/core/examples/pdf_document_from_yaml.py new file mode 100644 index 000000000000..02406931f264 --- /dev/null +++ b/backend/core/examples/pdf_document_from_yaml.py @@ -0,0 +1,146 @@ +import asyncio +import logging +import os +from pathlib import Path + +import dotenv +from quivr_core import Brain +from quivr_core.config import AssistantConfig +from rich.traceback import install as rich_install + +ConsoleOutputHandler = logging.StreamHandler() + +logger = logging.getLogger("quivr_core") +logger.setLevel(logging.DEBUG) +logger.addHandler(ConsoleOutputHandler) + + +logger = logging.getLogger("megaparse") +logger.setLevel(logging.DEBUG) +logger.addHandler(ConsoleOutputHandler) + + +# Install rich's traceback handler to automatically format tracebacks +rich_install() + + +async def main(): + file_path = [ + Path("data/YamEnterprises_Monotype Fonts Plan License.US.en 04.0 (BLP).pdf") + ] + file_path = [ + Path( + "data/YamEnterprises_Monotype Fonts Plan License.US.en 04.0 (BLP) reduced.pdf" + ) + ] + + config_file_name = ( + "/Users/jchevall/Coding/quivr/backend/core/tests/rag_config_workflow.yaml" + ) + + assistant_config = 
AssistantConfig.from_yaml(config_file_name) + # megaparse_config = find_nested_key(config, "megaparse_config") + megaparse_config = assistant_config.ingestion_config.parser_config.megaparse_config + megaparse_config.llama_parse_api_key = os.getenv("LLAMA_PARSE_API_KEY") + + processor_kwargs = { + "megaparse_config": megaparse_config, + "splitter_config": assistant_config.ingestion_config.parser_config.splitter_config, + } + + brain = await Brain.afrom_files( + name="test_brain", + file_paths=file_path, + processor_kwargs=processor_kwargs, + ) + + # # Check brain info + brain.print_info() + + questions = [ + "What is the contact name for Yam Enterprises?", + "What is the customer phone for Yam Enterprises?", + "What is the Production Fonts (maximum) for Yam Enterprises?", + "List the past use font software according to past use term for Yam Enterprises.", + "How many unique Font Name are there in the Add-On Font Software Section for Yam Enterprises?", + "What is the maximum number of Production Fonts allowed based on the license usage per term for Yam Enterprises?", + "What is the number of production fonts licensed by Yam Enterprises? List them one by one.", + "What is the number of Licensed Monthly Page Views for Yam Enterprises?", + "What is the monthly licensed impressions (Digital Marketing Communications) for Yam Enterprises?", + "What is the number of Licensed Applications for Yam Enterprises?", + "For Yam Enterprises what is the number of applications aggregate Registered users?", + "What is the number of licensed servers for Yam Enterprises?", + "When is swap of Production Fonts available in Yam Enterprises?", + "Who is the primary licensed monotype fonts user for Yam Enterprises?", + "What is the number of Licensed Commercial Electronic Documents for Yam Enterprises?", + "How many licensed monotype fonts users can Yam Enterprises have?", + "How many licensed desktop users can Yam Enterprises have?", + "Which contract type does Yam Enterprises follow?", + "What monotype fonts support does Yam Enterprises have?", + "Which monotype font services onboarding does Yam Enterprises have?", + "Which Font/User Management does Yam Enterprises have?", + "What Add-on inventory set did Yam Enterprises pick?", + "Does Yam Enterprises have Single sign on?", + "Is there Brand and Licence protection for Yam Enterprises?", + "Who is the Third Party Payor's contact in Yam Enterprises?", + "Does Yam Enterprises contract have Company Desktop License?", + "What is the Number of Swaps Allowed for Yam Enterprises?", + "When is swap of Production Fonts available in Yam Enterprises?", + ] + + answers = [ + "Haruko Yamamoto", + "81 90-1234-5603", + "300 Production Fonts", + "Helvetica Regular", + "7", + "300 Production Fonts", + "Yam Enterprises has licensed a total of 105 Production Fonts.", + "35,000,000", + "2,500,000", + "60", + "40", + "2", + "Once per quarter", + "Haruko Yamamoto", + "0", + "100", + "60", + "License", + "Premier", + "Premier", + "Premier", + "Plus", + "Yes", + "Yes", + """ + Name: Yami Enterprises + + Contact: Mei Mei + + Address: 20-22 Tsuki-Tsuki-dori, Tokyo, Japan + + Phone: +81 71-9336-54023 + + E-mail: mei.mei@example.com + """, + "Yes", + "One (1) swap per calendar quarter", + "The swap of Production Fonts will be available one (1) time per calendar quarter by removing Font Software as a Production Font and choosing other Font Software on the Monotype Fonts Platform.", + ] + + retrieval_config = assistant_config.retrieval_config + for i, (question, truth) in 
enumerate(zip(questions, answers, strict=False)): + chunk = brain.ask(question=question, retrieval_config=retrieval_config) + print( + "\n Question: ", question, "\n Answer: ", chunk.answer, "\n Truth: ", truth + ) + if i == 5: + break + + +if __name__ == "__main__": + dotenv.load_dotenv() + + # Run the main function in the existing event loop + asyncio.run(main()) diff --git a/backend/core/pyproject.toml b/backend/core/pyproject.toml index b8b07ba71e4b..0c5fa7cf9903 100644 --- a/backend/core/pyproject.toml +++ b/backend/core/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ "aiofiles>=23.1.0", "langchain-community>=0.2.12", "langchain-anthropic>=0.1.23", + "types-pyyaml>=6.0.12.20240808", + "transformers[sentencepiece]>=4.44.2", ] readme = "README.md" requires-python = ">= 3.11" @@ -25,7 +27,7 @@ all = [ "unstructured[epub,docx,odt,doc,pptx,ppt,xlsx,md]>=0.15.5", "faiss-cpu>=1.8.0.post1", "docx2txt>=0.8", - "megaparse>=0.0.29", + "megaparse" ] [build-system] @@ -62,3 +64,11 @@ markers = [ "tika: these tests require a tika server to be running", "unstructured: these tests require `unstructured` dependency", ] + +[[tool.mypy.overrides]] +module = "yaml" +ignore_missing_imports = true + +[[tool.rye.sources]] +name = "megaparse" +path = "./MegaParse" diff --git a/backend/core/quivr_core/__init__.py b/backend/core/quivr_core/__init__.py index 5ef621c0171d..1fcda28082d3 100644 --- a/backend/core/quivr_core/__init__.py +++ b/backend/core/quivr_core/__init__.py @@ -1,10 +1,9 @@ from importlib.metadata import entry_points from .brain import Brain -from .chat_llm import ChatLLM from .processor.registry import register_processor, registry -__all__ = ["Brain", "ChatLLM", "registry", "register_processor"] +__all__ = ["Brain", "registry", "register_processor"] def register_entries(): diff --git a/backend/core/quivr_core/base_config.py b/backend/core/quivr_core/base_config.py new file mode 100644 index 000000000000..0a2d11546fd5 --- /dev/null +++ b/backend/core/quivr_core/base_config.py @@ -0,0 +1,17 @@ +from pathlib import Path + +import yaml +from pydantic import BaseModel, ConfigDict + + +class QuivrBaseConfig(BaseModel): + model_config = ConfigDict(extra="forbid") + + @classmethod + def from_yaml(cls, file_path: str | Path): + # Load the YAML file + with open(file_path, "r") as stream: + config_data = yaml.safe_load(stream) + + # Instantiate the class using the YAML data + return cls(**config_data) diff --git a/backend/core/quivr_core/brain/brain.py b/backend/core/quivr_core/brain/brain.py index b1175b6be31d..3bbe7af02cff 100644 --- a/backend/core/quivr_core/brain/brain.py +++ b/backend/core/quivr_core/brain/brain.py @@ -23,10 +23,15 @@ TransparentStorageConfig, ) from quivr_core.chat import ChatHistory -from quivr_core.config import RAGConfig +from quivr_core.config import RetrievalConfig from quivr_core.files.file import load_qfile from quivr_core.llm import LLMEndpoint -from quivr_core.models import ParsedRAGChunkResponse, ParsedRAGResponse, SearchResult +from quivr_core.models import ( + ParsedRAGChunkResponse, + ParsedRAGResponse, + QuivrKnowledge, + SearchResult, +) from quivr_core.processor.registry import get_processor_class from quivr_core.quivr_rag import QuivrQARAG from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph @@ -71,10 +76,10 @@ def __init__( *, name: str, id: UUID, - vector_db: VectorStore, llm: LLMEndpoint, - embedder: Embeddings, - storage: StorageBase, + vector_db: VectorStore | None = None, + embedder: Embeddings | None = None, + storage: StorageBase | None = None, 
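A minimal sketch of how the QuivrBaseConfig.from_yaml classmethod introduced above might be used; the subclass name, field names, and YAML file name below are illustrative assumptions, not part of the diff:

from quivr_core.base_config import QuivrBaseConfig

class DemoConfig(QuivrBaseConfig):
    # extra="forbid" on QuivrBaseConfig means unknown YAML keys raise a validation error
    name: str
    max_files: int = 20

# demo_config.yaml is an assumed local file containing e.g. "name: demo"
config = DemoConfig.from_yaml("demo_config.yaml")
print(config.name)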
): self.id = id self.name = name @@ -110,11 +115,12 @@ def load(cls, folder_path: str | Path) -> Self: with open(os.path.join(folder_path, "config.json"), "r") as f: bserialized = BrainSerialized.model_validate_json(f.read()) + storage: StorageBase | None = None # Loading storage if bserialized.storage_config.storage_type == "transparent_storage": - storage: StorageBase = TransparentStorage.load(bserialized.storage_config) + storage = TransparentStorage.load(bserialized.storage_config) elif bserialized.storage_config.storage_type == "local_storage": - storage: StorageBase = LocalStorage.load(bserialized.storage_config) + storage = LocalStorage.load(bserialized.storage_config) else: raise ValueError("unknown storage") @@ -171,6 +177,7 @@ async def save(self, folder_path: str | Path): else: raise Exception("can't serialize embedder other than openai for now") + storage_config: Union[LocalStorageConfig, TransparentStorageConfig] # TODO : each instance should know how to serialize/deserialize itself if isinstance(self.storage, LocalStorage): serialized_files = { @@ -213,7 +220,7 @@ def info(self) -> BrainInfo: return BrainInfo( brain_id=self.id, brain_name=self.name, - files_info=self.storage.info(), + files_info=self.storage.info() if self.storage else None, chats_info=chats_info, llm_info=self.llm.info(), ) @@ -238,6 +245,7 @@ async def afrom_files( llm: LLMEndpoint | None = None, embedder: Embeddings | None = None, skip_file_error: bool = False, + processor_kwargs: dict[str, Any] | None = None, ): if llm is None: llm = default_llm() @@ -245,6 +253,8 @@ async def afrom_files( if embedder is None: embedder = default_embedder() + processor_kwargs = processor_kwargs or {} + brain_id = uuid4() # TODO: run in parallel using tasks @@ -259,6 +269,7 @@ async def afrom_files( docs = await process_files( storage=storage, skip_file_error=skip_file_error, + **processor_kwargs, ) # Building brain's vectordb @@ -289,6 +300,7 @@ def from_files( llm: LLMEndpoint | None = None, embedder: Embeddings | None = None, skip_file_error: bool = False, + processor_kwargs: dict[str, Any] | None = None, ) -> Self: loop = asyncio.get_event_loop() return loop.run_until_complete( @@ -300,6 +312,7 @@ def from_files( llm=llm, embedder=embedder, skip_file_error=skip_file_error, + processor_kwargs=processor_kwargs, ) ) @@ -344,6 +357,9 @@ async def asearch( filter: Callable | Dict[str, Any] | None = None, fetch_n_neighbors: int = 20, ) -> list[SearchResult]: + if not self.vector_db: + raise ValueError("No vector db configured for this brain") + result = await self.vector_db.asimilarity_search_with_score( query, k=n_results, filter=filter, fetch_k=fetch_n_neighbors ) @@ -362,28 +378,33 @@ def add_file(self) -> None: def ask( self, question: str, - rag_config: RAGConfig | None = None, + retrieval_config: RetrievalConfig | None = None, rag_pipeline: Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None = None, + list_files: list[QuivrKnowledge] | None = None, + chat_history: ChatHistory | None = None, ) -> ParsedRAGResponse: llm = self.llm # If you passed a different llm model we'll override the brain one - if rag_config: - if rag_config.llm_config != self.llm.get_config(): - llm = LLMEndpoint.from_config(config=rag_config.llm_config) + if retrieval_config: + if retrieval_config.llm_config != self.llm.get_config(): + llm = LLMEndpoint.from_config(config=retrieval_config.llm_config) else: - rag_config = RAGConfig(llm_config=self.llm.get_config()) + retrieval_config = RetrievalConfig(llm_config=self.llm.get_config()) if rag_pipeline 
is None: - rag_pipeline = QuivrQARAG + rag_pipeline = QuivrQARAGLangGraph rag_instance = rag_pipeline( - rag_config=rag_config, llm=llm, vector_store=self.vector_db + retrieval_config=retrieval_config, llm=llm, vector_store=self.vector_db ) - chat_history = self.default_chat + chat_history = self.default_chat if chat_history is None else chat_history + list_files = [] if list_files is None else list_files - parsed_response = rag_instance.answer(question, chat_history, []) + parsed_response = rag_instance.answer( + question=question, history=chat_history, list_files=list_files + ) chat_history.append(HumanMessage(content=question)) chat_history.append(AIMessage(content=parsed_response.answer)) @@ -394,30 +415,34 @@ def ask( async def ask_streaming( self, question: str, - rag_config: RAGConfig | None = None, + retrieval_config: RetrievalConfig | None = None, rag_pipeline: Type[Union[QuivrQARAG, QuivrQARAGLangGraph]] | None = None, + list_files: list[QuivrKnowledge] | None = None, + chat_history: ChatHistory | None = None, ) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: llm = self.llm # If you passed a different llm model we'll override the brain one - if rag_config: - if rag_config.llm_config != self.llm.get_config(): - llm = LLMEndpoint.from_config(config=rag_config.llm_config) + if retrieval_config: + if retrieval_config.llm_config != self.llm.get_config(): + llm = LLMEndpoint.from_config(config=retrieval_config.llm_config) else: - rag_config = RAGConfig(llm_config=self.llm.get_config()) + retrieval_config = RetrievalConfig(llm_config=self.llm.get_config()) if rag_pipeline is None: - rag_pipeline = QuivrQARAG + rag_pipeline = QuivrQARAGLangGraph rag_instance = rag_pipeline( - rag_config=rag_config, llm=llm, vector_store=self.vector_db + retrieval_config=retrieval_config, llm=llm, vector_store=self.vector_db ) - chat_history = self.default_chat + chat_history = self.default_chat if chat_history is None else chat_history + list_files = [] if list_files is None else list_files - # TODO: List of files full_answer = "" - async for response in rag_instance.answer_astream(question, chat_history, []): + async for response in rag_instance.answer_astream( + question=question, history=chat_history, list_files=list_files + ): # Format output to be correct servicedf;j if not response.last_chunk: yield response diff --git a/backend/core/quivr_core/brain/info.py b/backend/core/quivr_core/brain/info.py index bb0747f32eaa..b049bb7fa5f8 100644 --- a/backend/core/quivr_core/brain/info.py +++ b/backend/core/quivr_core/brain/info.py @@ -53,17 +53,18 @@ def add_to_tree(self, files_tree: Tree): class BrainInfo: brain_id: UUID brain_name: str - files_info: StorageInfo chats_info: ChatHistoryInfo llm_info: LLMInfo + files_info: StorageInfo | None = None def to_tree(self): tree = Tree("📊 Brain Information") tree.add(f"🆔 ID: [bold cyan]{self.brain_id}[/bold cyan]") tree.add(f"🧠 Brain Name: [bold green]{self.brain_name}[/bold green]") - files_tree = tree.add("📁 Files") - self.files_info.add_to_tree(files_tree) + if self.files_info: + files_tree = tree.add("📁 Files") + self.files_info.add_to_tree(files_tree) chats_tree = tree.add("💬 Chats") self.chats_info.add_to_tree(chats_tree) diff --git a/backend/core/quivr_core/chat.py b/backend/core/quivr_core/chat.py index 90697eecc8e4..b8d3b1057774 100644 --- a/backend/core/quivr_core/chat.py +++ b/backend/core/quivr_core/chat.py @@ -1,6 +1,7 @@ from datetime import datetime -from typing import Any, Generator, Tuple +from typing import Any, Generator, 
Tuple, List from uuid import UUID, uuid4 +from copy import deepcopy from langchain_core.messages import AIMessage, HumanMessage @@ -63,3 +64,21 @@ def iter_pairs(self) -> Generator[Tuple[HumanMessage, AIMessage], None, None]: ai_message.msg, AIMessage ), f"msg {human_message} is not AIMessage" yield (human_message.msg, ai_message.msg) + + def to_list(self) -> List[HumanMessage | AIMessage]: + """Format the chat history into a list of HumanMessage and AIMessage""" + return [_msg.msg for _msg in self._msgs] + + def __deepcopy__(self, memo): + """ + Support for deepcopy of ChatHistory. + This method ensures that mutable objects (like lists) are copied deeply. + """ + # Create a new instance of ChatHistory + new_copy = ChatHistory(self.id, deepcopy(self.brain_id, memo)) + + # Perform a deepcopy of the _msgs list + new_copy._msgs = deepcopy(self._msgs, memo) + + # Return the deep copied instance + return new_copy diff --git a/backend/core/quivr_core/chat_llm.py b/backend/core/quivr_core/chat_llm.py deleted file mode 100644 index 4824b3bed548..000000000000 --- a/backend/core/quivr_core/chat_llm.py +++ /dev/null @@ -1,138 +0,0 @@ -import logging -from operator import itemgetter -from typing import AsyncGenerator - -from langchain_core.messages import AIMessage, HumanMessage -from langchain_core.messages.ai import AIMessageChunk -from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder -from langchain_core.runnables import RunnableLambda, RunnablePassthrough - -from quivr_core.chat import ChatHistory -from quivr_core.llm import LLMEndpoint -from quivr_core.models import ( - ChatLLMMetadata, - ParsedRAGChunkResponse, - ParsedRAGResponse, - RAGResponseMetadata, -) -from quivr_core.utils import get_chunk_metadata, parse_response - -logger = logging.getLogger("quivr_core") - - -class ChatLLM: - def __init__(self, *, llm: LLMEndpoint): - self.llm_endpoint = llm - - def filter_history( - self, - chat_history: ChatHistory | None, - ): - """ - Filter out the chat history to only include the messages that are relevant to the current question - - Returns a filtered chat_history with in priority: first max_tokens, then max_history where a Human message and an AI message count as one pair - a token is 4 characters - """ - total_tokens = 0 - total_pairs = 0 - filtered_chat_history: list[AIMessage | HumanMessage] = [] - if chat_history is None: - return filtered_chat_history - - # Convert generator to list to allow reversing - pairs = list(chat_history.iter_pairs()) - # Iterate in reverse to prioritize the last messages - for human_message, ai_message in reversed(pairs): - # TODO: replace with tiktoken - message_tokens = (len(human_message.content) + len(ai_message.content)) // 4 - if ( - total_tokens + message_tokens > self.llm_endpoint._config.max_input - or total_pairs >= 20 - ): - break - filtered_chat_history.append(human_message) - filtered_chat_history.append(ai_message) - total_tokens += message_tokens - total_pairs += 1 - - return filtered_chat_history - - def build_chain(self): - loaded_memory = RunnablePassthrough.assign( - chat_history=RunnableLambda( - lambda x: self.filter_history(x["chat_history"]), - ), - question=lambda x: x["question"], - ) - logger.info(f"loaded_memory: {loaded_memory}") - prompt = ChatPromptTemplate.from_messages( - [ - ( - "system", - "You are Quivr. 
You are an assistant.", - ), - MessagesPlaceholder(variable_name="chat_history"), - ("human", "{question}"), - ] - ) - - final_inputs = { - "question": itemgetter("question"), - "chat_history": itemgetter("chat_history"), - } - llm = self.llm_endpoint._llm - - answer = {"answer": final_inputs | prompt | llm, "docs": lambda _: []} - - return loaded_memory | answer - - def answer( - self, question: str, history: ChatHistory | None = None - ) -> ParsedRAGResponse: - chain = self.build_chain() - raw_llm_response = chain.invoke({"question": question, "chat_history": history}) - - response = parse_response(raw_llm_response, self.llm_endpoint._config.model) - return response - - async def answer_astream( - self, question: str, history: ChatHistory | None = None - ) -> AsyncGenerator[ParsedRAGChunkResponse, ParsedRAGChunkResponse]: - chain = self.build_chain() - rolling_message = AIMessageChunk(content="") - prev_answer = "" - chunk_id = 0 - - async for chunk in chain.astream( - {"question": question, "chat_history": history} - ): - if "answer" in chunk: - answer_str = chunk["answer"].content - rolling_message += chunk["answer"] - if len(answer_str) > 0: - parsed_chunk = ParsedRAGChunkResponse( - answer=answer_str, - metadata=RAGResponseMetadata(), - ) - prev_answer += answer_str - - logger.debug( - f"answer_astream func_calling=True question={question} rolling_msg={rolling_message} chunk_id={chunk_id}, chunk={parsed_chunk}" - ) - yield parsed_chunk - - chunk_id += 1 - # Last chunk provides metadata - last_chunk = ParsedRAGChunkResponse( - answer=rolling_message.content, - metadata=get_chunk_metadata(rolling_message), - last_chunk=True, - ) - last_chunk.metadata.metadata_model = ChatLLMMetadata( - name=self.llm_endpoint._config.model, - ) - logger.debug( - f"answer_astream last_chunk={last_chunk} question={question} rolling_msg={rolling_message} chunk_id={chunk_id}" - ) - yield last_chunk diff --git a/backend/core/quivr_core/config.py b/backend/core/quivr_core/config.py index f197ecb06ce2..b974d3220ed7 100644 --- a/backend/core/quivr_core/config.py +++ b/backend/core/quivr_core/config.py @@ -1,18 +1,294 @@ -from pydantic import BaseModel +import os +from enum import Enum +from typing import Dict, List, Optional +from uuid import UUID +from sqlmodel import SQLModel +from megaparse.config import MegaparseConfig -class LLMEndpointConfig(BaseModel): +from quivr_core.base_config import QuivrBaseConfig +from quivr_core.processor.splitter import SplitterConfig +from quivr_core.prompts import CustomPromptsModel + + +class BrainConfig(QuivrBaseConfig): + brain_id: UUID | None = None + name: str + + @property + def id(self) -> UUID | None: + return self.brain_id + + +class DefaultRerankers(str, Enum): + COHERE = "cohere" + JINA = "jina" + + @property + def default_model(self) -> str: + # Mapping of suppliers to their default models + return { + self.COHERE: "rerank-multilingual-v3.0", + self.JINA: "jina-reranker-v2-base-multilingual", + }[self] + + +class DefaultModelSuppliers(str, Enum): + OPENAI = "openai" + AZURE = "azure" + ANTHROPIC = "anthropic" + META = "meta" + MISTRAL = "mistral" + GROQ = "groq" + + +class LLMConfig(QuivrBaseConfig): + context: int | None = None + tokenizer_hub: str | None = None + + +class LLMModelConfig: + _model_defaults: Dict[DefaultModelSuppliers, Dict[str, LLMConfig]] = { + DefaultModelSuppliers.OPENAI: { + "gpt-4o": LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4o"), + "gpt-4o-mini": LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4o"), + "gpt-4-turbo": 
LLMConfig(context=128000, tokenizer_hub="Xenova/gpt-4"), + "gpt-4": LLMConfig(context=8192, tokenizer_hub="Xenova/gpt-4"), + "gpt-3.5-turbo": LLMConfig( + context=16385, tokenizer_hub="Xenova/gpt-3.5-turbo" + ), + "text-embedding-3-large": LLMConfig( + context=8191, tokenizer_hub="Xenova/text-embedding-ada-002" + ), + "text-embedding-3-small": LLMConfig( + context=8191, tokenizer_hub="Xenova/text-embedding-ada-002" + ), + "text-embedding-ada-002": LLMConfig( + context=8191, tokenizer_hub="Xenova/text-embedding-ada-002" + ), + }, + DefaultModelSuppliers.ANTHROPIC: { + "claude-3-5-sonnet": LLMConfig( + context=200000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-3-opus": LLMConfig( + context=200000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-3-sonnet": LLMConfig( + context=200000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-3-haiku": LLMConfig( + context=200000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-2-1": LLMConfig( + context=200000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-2-0": LLMConfig( + context=100000, tokenizer_hub="Xenova/claude-tokenizer" + ), + "claude-instant-1-2": LLMConfig( + context=100000, tokenizer_hub="Xenova/claude-tokenizer" + ), + }, + DefaultModelSuppliers.META: { + "llama-3.1": LLMConfig( + context=128000, tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer" + ), + "llama-3": LLMConfig( + context=8192, tokenizer_hub="Xenova/llama3-tokenizer-new" + ), + "llama-2": LLMConfig(context=4096, tokenizer_hub="Xenova/llama2-tokenizer"), + "code-llama": LLMConfig( + context=16384, tokenizer_hub="Xenova/llama-code-tokenizer" + ), + }, + DefaultModelSuppliers.GROQ: { + "llama-3.1": LLMConfig( + context=128000, tokenizer_hub="Xenova/Meta-Llama-3.1-Tokenizer" + ), + "llama-3": LLMConfig( + context=8192, tokenizer_hub="Xenova/llama3-tokenizer-new" + ), + "llama-2": LLMConfig(context=4096, tokenizer_hub="Xenova/llama2-tokenizer"), + "code-llama": LLMConfig( + context=16384, tokenizer_hub="Xenova/llama-code-tokenizer" + ), + }, + DefaultModelSuppliers.MISTRAL: { + "mistral-large": LLMConfig( + context=128000, tokenizer_hub="Xenova/mistral-tokenizer-v3" + ), + "mistral-small": LLMConfig( + context=128000, tokenizer_hub="Xenova/mistral-tokenizer-v3" + ), + "mistral-nemo": LLMConfig( + context=128000, tokenizer_hub="Xenova/Mistral-Nemo-Instruct-Tokenizer" + ), + "codestral": LLMConfig( + context=32000, tokenizer_hub="Xenova/mistral-tokenizer-v3" + ), + }, + } + + @classmethod + def get_supplier_by_model_name(cls, model: str) -> DefaultModelSuppliers | None: + # Iterate over the suppliers and their models + for supplier, models in cls._model_defaults.items(): + # Check if the model name or a base part of the model name is in the supplier's models + for base_model_name in models: + if model.startswith(base_model_name): + return supplier + # Return None if no supplier matches the model name + return None + + @classmethod + def get_llm_model_config( + cls, supplier: DefaultModelSuppliers, model_name: str + ) -> Optional[LLMConfig]: + """Retrieve the LLMConfig (context and tokenizer_hub) for a given supplier and model.""" + supplier_defaults = cls._model_defaults.get(supplier) + if not supplier_defaults: + return None + + # Use startswith logic for matching model names + for key, config in supplier_defaults.items(): + if model_name.startswith(key): + return config + + return None + + +class LLMEndpointConfig(QuivrBaseConfig): + supplier: DefaultModelSuppliers = DefaultModelSuppliers.OPENAI model: str = "gpt-3.5-turbo-0125" + 
context_length: int | None = None + tokenizer_hub: str | None = None llm_base_url: str | None = None + env_variable_name: str = f"{supplier.upper()}_API_KEY" llm_api_key: str | None = None - max_input: int = 2000 - max_tokens: int = 2000 + max_input_tokens: int = 2000 + max_output_tokens: int = 2000 temperature: float = 0.7 streaming: bool = True + prompt: CustomPromptsModel | None = None + + _FALLBACK_TOKENIZER = "cl100k_base" + + @property + def fallback_tokenizer(self) -> str: + return self._FALLBACK_TOKENIZER + + def __init__(self, **data): + super().__init__(**data) + self.set_llm_model_config() + self.set_api_key() + + def set_api_key(self, force_reset: bool = False): + # Check if the corresponding API key environment variable is set + if not self.llm_api_key or force_reset: + self.llm_api_key = os.getenv(self.env_variable_name) + if not self.llm_api_key: + raise ValueError( + f"The API key for supplier '{self.supplier}' is not set. " + f"Please set the environment variable: {self.env_variable_name}" + ) -class RAGConfig(BaseModel): + def set_llm_model_config(self): + # Automatically set context_length and tokenizer_hub based on the supplier and model + llm_model_config = LLMModelConfig.get_llm_model_config( + self.supplier, self.model + ) + if llm_model_config: + self.context_length = llm_model_config.context + self.tokenizer_hub = llm_model_config.tokenizer_hub + + def set_llm_model(self, model: str): + supplier = LLMModelConfig.get_supplier_by_model_name(model) + if supplier is None: + raise ValueError( + f"Cannot find the corresponding supplier for model {model}" + ) + self.supplier = supplier + self.model = model + + self.set_llm_model_config() + self.set_api_key(force_reset=True) + + def set_from_sqlmodel(self, sqlmodel: SQLModel, mapping: Dict[str, str]): + """ + Set attributes in LLMEndpointConfig from Model attributes using a field mapping. + + :param model_instance: An instance of the Model class. + :param mapping: A dictionary that maps Model fields to LLMEndpointConfig fields. + Example: {"max_input": "max_input_tokens", "env_variable_name": "env_variable_name"} + """ + for model_field, llm_field in mapping.items(): + if hasattr(sqlmodel, model_field) and hasattr(self, llm_field): + setattr(self, llm_field, getattr(sqlmodel, model_field)) + else: + raise AttributeError( + f"Invalid mapping: {model_field} or {llm_field} does not exist." + ) + + +# Cannot use Pydantic v2 field_validator because of conflicts with pydantic v1 still in use in LangChain +class RerankerConfig(QuivrBaseConfig): + supplier: DefaultRerankers | None = None + model: str | None = None + top_n: int = 5 + api_key: str | None = None + + def __init__(self, **data): + super().__init__(**data) # Call Pydantic's BaseModel init + self.validate_model() # Automatically call external validation + + def validate_model(self): + # If model is not provided, get default model based on supplier + if self.model is None and self.supplier is not None: + self.model = self.supplier.default_model + + # Check if the corresponding API key environment variable is set + if self.supplier: + api_key_var = f"{self.supplier.upper()}_API_KEY" + self.api_key = os.getenv(api_key_var) + + if self.api_key is None: + raise ValueError( + f"The API key for supplier '{self.supplier}' is not set. 
" + f"Please set the environment variable: {api_key_var}" + ) + + +class NodeConfig(QuivrBaseConfig): + name: str + # config: QuivrBaseConfig # This can be any config like RerankerConfig or LLMEndpointConfig + edges: List[str] # List of names of other nodes this node links to + + +class WorkflowConfig(QuivrBaseConfig): + name: str + nodes: List[NodeConfig] + + +class RetrievalConfig(QuivrBaseConfig): + reranker_config: RerankerConfig = RerankerConfig() llm_config: LLMEndpointConfig = LLMEndpointConfig() max_history: int = 10 max_files: int = 20 prompt: str | None = None + workflow_config: WorkflowConfig | None = None + + +class ParserConfig(QuivrBaseConfig): + splitter_config: SplitterConfig = SplitterConfig() + megaparse_config: MegaparseConfig = MegaparseConfig() + + +class IngestionConfig(QuivrBaseConfig): + parser_config: ParserConfig = ParserConfig() + + +class AssistantConfig(QuivrBaseConfig): + retrieval_config: RetrievalConfig = RetrievalConfig() + ingestion_config: IngestionConfig = IngestionConfig() diff --git a/backend/core/quivr_core/llm/llm_endpoint.py b/backend/core/quivr_core/llm/llm_endpoint.py index 51b834195020..e26c0e6bf8be 100644 --- a/backend/core/quivr_core/llm/llm_endpoint.py +++ b/backend/core/quivr_core/llm/llm_endpoint.py @@ -1,13 +1,16 @@ import logging +import os +from typing import Union from urllib.parse import parse_qs, urlparse +import tiktoken from langchain_anthropic import ChatAnthropic from langchain_core.language_models.chat_models import BaseChatModel from langchain_openai import AzureChatOpenAI, ChatOpenAI from pydantic.v1 import SecretStr from quivr_core.brain.info import LLMInfo -from quivr_core.config import LLMEndpointConfig +from quivr_core.config import DefaultModelSuppliers, LLMEndpointConfig from quivr_core.utils import model_supports_function_calling logger = logging.getLogger("quivr_core") @@ -21,13 +24,39 @@ def __init__(self, llm_config: LLMEndpointConfig, llm: BaseChatModel): self._config.model ) + if llm_config.tokenizer_hub: + # To prevent the warning + # huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... 
+ os.environ["TOKENIZERS_PARALLELISM"] = ( + "false" + if not os.environ.get("TOKENIZERS_PARALLELISM") + else os.environ["TOKENIZERS_PARALLELISM"] + ) + try: + from transformers import AutoTokenizer + + self.tokenizer = AutoTokenizer.from_pretrained(llm_config.tokenizer_hub) + except OSError: # if we don't manage to connect to huggingface and/or no cached models are present + logger.warning( + f"Cannot acces the configured tokenizer from {llm_config.tokenizer_hub}, using the default tokenizer {llm_config.fallback_tokenizer}" + ) + self.tokenizer = tiktoken.get_encoding(llm_config.fallback_tokenizer) + else: + self.tokenizer = tiktoken.get_encoding(llm_config.fallback_tokenizer) + + def count_tokens(self, text: str) -> int: + # Tokenize the input text and return the token count + encoding = self.tokenizer.encode(text) + return len(encoding) + def get_config(self): return self._config @classmethod def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): + _llm: Union[AzureChatOpenAI, ChatOpenAI, ChatAnthropic] try: - if config.model.startswith("azure/"): + if config.supplier == DefaultModelSuppliers.AZURE: # Parse the URL parsed_url = urlparse(config.llm_base_url) deployment = parsed_url.path.split("/")[3] # type: ignore @@ -40,16 +69,25 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): if config.llm_api_key else None, azure_endpoint=azure_endpoint, - max_tokens=config.max_tokens + max_tokens=config.max_output_tokens, ) - elif config.model.startswith("claude"): + elif config.supplier == DefaultModelSuppliers.ANTHROPIC: _llm = ChatAnthropic( model_name=config.model, api_key=SecretStr(config.llm_api_key) if config.llm_api_key else None, base_url=config.llm_base_url, - max_tokens=config.max_tokens + max_tokens=config.max_output_tokens, + ) + elif config.supplier == DefaultModelSuppliers.OPENAI: + _llm = ChatOpenAI( + model=config.model, + api_key=SecretStr(config.llm_api_key) + if config.llm_api_key + else None, + base_url=config.llm_base_url, + max_tokens=config.max_output_tokens, ) else: _llm = ChatOpenAI( @@ -58,7 +96,7 @@ def from_config(cls, config: LLMEndpointConfig = LLMEndpointConfig()): if config.llm_api_key else None, base_url=config.llm_base_url, - max_tokens=config.max_tokens + max_tokens=config.max_output_tokens, ) return cls(llm=_llm, llm_config=config) @@ -77,6 +115,6 @@ def info(self) -> LLMInfo: self._config.llm_base_url if self._config.llm_base_url else "openai" ), temperature=self._config.temperature, - max_tokens=self._config.max_tokens, + max_tokens=self._config.max_output_tokens, supports_function_calling=self.supports_func_calling(), ) diff --git a/backend/core/quivr_core/models.py b/backend/core/quivr_core/models.py index 8ebf2bbe23b8..0dc304c67b77 100644 --- a/backend/core/quivr_core/models.py +++ b/backend/core/quivr_core/models.py @@ -92,9 +92,10 @@ class ParsedRAGChunkResponse(BaseModel): class QuivrKnowledge(BaseModel): id: UUID - brain_id: UUID - file_name: Optional[str] = None + file_name: str + brain_ids: list[UUID] | None = None url: Optional[str] = None + extension: str = ".txt" mime_type: str = "txt" status: KnowledgeStatus = KnowledgeStatus.PROCESSING source: Optional[str] = None diff --git a/backend/core/quivr_core/processor/implementations/megaparse_processor.py b/backend/core/quivr_core/processor/implementations/megaparse_processor.py index d4dbf7e05381..9a7c63f1d6c7 100644 --- a/backend/core/quivr_core/processor/implementations/megaparse_processor.py +++ 
b/backend/core/quivr_core/processor/implementations/megaparse_processor.py @@ -4,6 +4,7 @@ from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter from megaparse import MegaParse +from megaparse.config import MegaparseConfig from quivr_core.files.file import QuivrFile from quivr_core.processor.processor_base import ProcessorBase @@ -34,16 +35,12 @@ def __init__( self, splitter: TextSplitter | None = None, splitter_config: SplitterConfig = SplitterConfig(), - llama_parse_api_key: str | None = None, - strategy: str = "fast", + megaparse_config: MegaparseConfig = MegaparseConfig(), ) -> None: self.loader_cls = MegaParse self.enc = tiktoken.get_encoding("cl100k_base") self.splitter_config = splitter_config - self.megaparse_kwargs = { - "llama_parse_api_key": llama_parse_api_key, - "strategy": strategy, - } + self.megaparse_config = megaparse_config if splitter: self.text_splitter = splitter @@ -60,11 +57,14 @@ def processor_metadata(self): } async def process_file_inner(self, file: QuivrFile) -> list[Document]: - mega_parse = MegaParse(file_path=file.path, **self.megaparse_kwargs) # type: ignore + mega_parse = MegaParse(file_path=file.path, config=self.megaparse_config) # type: ignore document: Document = await mega_parse.aload() + print("\n\n document: ", document.page_content) if len(document.page_content) > self.splitter_config.chunk_size: docs = self.text_splitter.split_documents([document]) for doc in docs: + # if "Production Fonts (maximum)" in doc.page_content: + # print('Doc: ', doc.page_content) doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} return docs return [document] diff --git a/backend/core/quivr_core/prompts.py b/backend/core/quivr_core/prompts.py index eeb573fdd6fc..fa30cb5b8490 100644 --- a/backend/core/quivr_core/prompts.py +++ b/backend/core/quivr_core/prompts.py @@ -1,62 +1,119 @@ import datetime +from pydantic import ConfigDict, create_model +from langchain_core.prompts.base import BasePromptTemplate from langchain_core.prompts import ( ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate, SystemMessagePromptTemplate, + MessagesPlaceholder, ) -# First step is to create the Rephrasing Prompt -_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language. Keep as much details as possible from previous messages. Keep entity names and all. -Chat History: -{chat_history} -Follow Up Input: {question} -Standalone question:""" -CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template) +class CustomPromptsDict(dict): + def __init__(self, type, *args, **kwargs): + super().__init__(*args, **kwargs) + self._type = type -# Next is the answering prompt + def __setitem__(self, key, value): + # Automatically convert the value into a tuple (my_type, value) + super().__setitem__(key, (self._type, value)) -template_answer = """ -Context: -{context} -User Question: {question} -Answer: -""" +def _define_custom_prompts() -> CustomPromptsDict: + custom_prompts: CustomPromptsDict = CustomPromptsDict(type=BasePromptTemplate) -today_date = datetime.datetime.now().strftime("%B %d, %Y") + today_date = datetime.datetime.now().strftime("%B %d, %Y") -system_message_template = ( - f"Your name is Quivr. You're a helpful assistant. Today's date is {today_date}." 
-) + # --------------------------------------------------------------------------- + # Prompt for question rephrasing + # --------------------------------------------------------------------------- + _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language. Keep as much details as possible from previous messages. Keep entity names and all. -system_message_template += """ -When answering use markdown. -Use markdown code blocks for code snippets. -Answer in a concise and clear manner. -Use the following pieces of context from files provided by the user to answer the users. -Answer in the same language as the user question. -If you don't know the answer with the context provided from the files, just say that you don't know, don't try to make up an answer. -Don't cite the source id in the answer objects, but you can use the source to answer the question. -You have access to the files to answer the user question (limited to first 20 files): -{files} - -If not None, User instruction to follow to answer: {custom_instructions} -Don't cite the source id in the answer objects, but you can use the source to answer the question. -""" - - -ANSWER_PROMPT = ChatPromptTemplate.from_messages( - [ - SystemMessagePromptTemplate.from_template(system_message_template), - HumanMessagePromptTemplate.from_template(template_answer), - ] -) + Chat History: + {chat_history} + Follow Up Input: {question} + Standalone question:""" + + CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template) + custom_prompts["CONDENSE_QUESTION_PROMPT"] = CONDENSE_QUESTION_PROMPT + + # --------------------------------------------------------------------------- + # Prompt for RAG + # --------------------------------------------------------------------------- + system_message_template = ( + f"Your name is Quivr. You're a helpful assistant. Today's date is {today_date}." + ) + + system_message_template += """ + When answering use markdown. + Use markdown code blocks for code snippets. + Answer in a concise and clear manner. + Use the following pieces of context from files provided by the user to answer the users. + Answer in the same language as the user question. + If you don't know the answer with the context provided from the files, just say that you don't know, don't try to make up an answer. + Don't cite the source id in the answer objects, but you can use the source to answer the question. + You have access to the files to answer the user question (limited to first 20 files): + {files} + + If not None, User instruction to follow to answer: {custom_instructions} + Don't cite the source id in the answer objects, but you can use the source to answer the question. 
+ """ + + template_answer = """ + Context: + {context} + User Question: {question} + Answer: + """ -# How we format documents -DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template( - template="Source: {index} \n {page_content}" + RAG_ANSWER_PROMPT = ChatPromptTemplate.from_messages( + [ + SystemMessagePromptTemplate.from_template(system_message_template), + HumanMessagePromptTemplate.from_template(template_answer), + ] + ) + custom_prompts["RAG_ANSWER_PROMPT"] = RAG_ANSWER_PROMPT + + # --------------------------------------------------------------------------- + # Prompt for formatting documents + # --------------------------------------------------------------------------- + DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template( + template="Source: {index} \n {page_content}" + ) + custom_prompts["DEFAULT_DOCUMENT_PROMPT"] = DEFAULT_DOCUMENT_PROMPT + + # --------------------------------------------------------------------------- + # Prompt for chatting directly with LLMs, without any document retrieval stage + # --------------------------------------------------------------------------- + system_message_template = ( + f"Your name is Quivr. You're a helpful assistant. Today's date is {today_date}." + ) + system_message_template += """ + If not None, also follow these user instructions when answering: {custom_instructions} + """ + + template_answer = """ + User Question: {question} + Answer: + """ + CHAT_LLM_PROMPT = ChatPromptTemplate.from_messages( + [ + SystemMessagePromptTemplate.from_template(system_message_template), + MessagesPlaceholder(variable_name="chat_history"), + HumanMessagePromptTemplate.from_template(template_answer), + ] + ) + custom_prompts["CHAT_LLM_PROMPT"] = CHAT_LLM_PROMPT + + return custom_prompts + + +_custom_prompts = _define_custom_prompts() +CustomPromptsModel = create_model( + "CustomPromptsModel", **_custom_prompts, __config__=ConfigDict(extra="forbid") ) + +custom_prompts = CustomPromptsModel() diff --git a/backend/core/quivr_core/quivr_rag.py b/backend/core/quivr_core/quivr_rag.py index 9f3049bc85ce..a11b98bfcf7d 100644 --- a/backend/core/quivr_core/quivr_rag.py +++ b/backend/core/quivr_core/quivr_rag.py @@ -13,7 +13,7 @@ from langchain_core.vectorstores import VectorStore from quivr_core.chat import ChatHistory -from quivr_core.config import RAGConfig +from quivr_core.config import RetrievalConfig from quivr_core.llm import LLMEndpoint from quivr_core.models import ( ParsedRAGChunkResponse, @@ -22,7 +22,7 @@ RAGResponseMetadata, cited_answer, ) -from quivr_core.prompts import ANSWER_PROMPT, CONDENSE_QUESTION_PROMPT +from quivr_core.prompts import custom_prompts from quivr_core.utils import ( combine_documents, format_file_list, @@ -52,12 +52,12 @@ class QuivrQARAG: def __init__( self, *, - rag_config: RAGConfig, + retrieval_config: RetrievalConfig, llm: LLMEndpoint, vector_store: VectorStore, reranker: BaseDocumentCompressor | None = None, ): - self.rag_config = rag_config + self.retrieval_config = retrieval_config self.vector_store = vector_store self.llm_endpoint = llm self.reranker = reranker if reranker is not None else IdempotentCompressor() @@ -87,8 +87,9 @@ def filter_history( # TODO: replace with tiktoken message_tokens = (len(human_message.content) + len(ai_message.content)) // 4 if ( - total_tokens + message_tokens > self.rag_config.llm_config.max_tokens - or total_pairs >= self.rag_config.max_history + total_tokens + message_tokens + > self.retrieval_config.llm_config.max_output_tokens + or total_pairs >= self.retrieval_config.max_history ): 
                break
            filtered_chat_history.append(human_message)
@@ -118,7 +119,7 @@ def build_chain(self, files: str):
                 "question": lambda x: x["question"],
                 "chat_history": itemgetter("chat_history"),
             }
-            | CONDENSE_QUESTION_PROMPT
+            | custom_prompts.CONDENSE_QUESTION_PROMPT
             | self.llm_endpoint._llm
             | StrOutputParser(),
         }
@@ -127,7 +128,7 @@ def build_chain(self, files: str):
         retrieved_documents = {
             "docs": itemgetter("standalone_question") | compression_retriever,
             "question": lambda x: x["standalone_question"],
-            "custom_instructions": lambda x: self.rag_config.prompt,
+            "custom_instructions": lambda x: self.retrieval_config.prompt,
         }

         final_inputs = {
@@ -146,7 +147,7 @@ def build_chain(self, files: str):
         )

         answer = {
-            "answer": final_inputs | ANSWER_PROMPT | llm,
+            "answer": final_inputs | custom_prompts.RAG_ANSWER_PROMPT | llm,
             "docs": itemgetter("docs"),
         }
@@ -162,17 +163,21 @@ def answer(
         """
         Answers a question using the QuivrQA RAG synchronously.
         """
-        concat_list_files = format_file_list(list_files, self.rag_config.max_files)
+        concat_list_files = format_file_list(
+            list_files, self.retrieval_config.max_files
+        )
         conversational_qa_chain = self.build_chain(concat_list_files)
         raw_llm_response = conversational_qa_chain.invoke(
             {
                 "question": question,
                 "chat_history": history,
-                "custom_instructions": (self.rag_config.prompt),
+                "custom_instructions": (self.retrieval_config.prompt),
             },
             config={"metadata": metadata},
         )
-        response = parse_response(raw_llm_response, self.rag_config.llm_config.model)
+        response = parse_response(
+            raw_llm_response, self.retrieval_config.llm_config.model
+        )
         return response

     async def answer_astream(
@@ -185,7 +190,9 @@ async def answer_astream(
         """
         Answers a question using the QuivrQA RAG asynchronously.
         """
-        concat_list_files = format_file_list(list_files, self.rag_config.max_files)
+        concat_list_files = format_file_list(
+            list_files, self.retrieval_config.max_files
+        )
         conversational_qa_chain = self.build_chain(concat_list_files)

         rolling_message = AIMessageChunk(content="")
@@ -197,7 +204,7 @@ async def answer_astream(
             {
                 "question": question,
                 "chat_history": history,
-                "custom_personality": (self.rag_config.prompt),
+                "custom_personality": (self.retrieval_config.prompt),
             },
             config={"metadata": metadata},
         ):
diff --git a/backend/core/quivr_core/quivr_rag_langgraph.py b/backend/core/quivr_core/quivr_rag_langgraph.py
index f856e52ceac7..7a18f83a111c 100644
--- a/backend/core/quivr_core/quivr_rag_langgraph.py
+++ b/backend/core/quivr_core/quivr_rag_langgraph.py
@@ -1,18 +1,22 @@
 import logging
 from typing import Annotated, AsyncGenerator, Optional, Sequence, TypedDict
+from uuid import uuid4
+from enum import Enum

 # TODO(@aminediro): this is the only dependency to langchain package, we should remove it
 from langchain.retrievers import ContextualCompressionRetriever
+from langchain_cohere import CohereRerank
+from langchain_community.document_compressors import JinaRerank
 from langchain_core.callbacks import Callbacks
 from langchain_core.documents import BaseDocumentCompressor, Document
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.messages import BaseMessage
 from langchain_core.messages.ai import AIMessageChunk
 from langchain_core.vectorstores import VectorStore
-from langgraph.graph import END, StateGraph
+from langgraph.graph import START, END, StateGraph
 from langgraph.graph.message import add_messages

 from quivr_core.chat import ChatHistory
-from quivr_core.config import RAGConfig
+from quivr_core.config import
DefaultRerankers, RetrievalConfig from quivr_core.llm import LLMEndpoint from quivr_core.models import ( ParsedRAGChunkResponse, @@ -21,7 +25,7 @@ RAGResponseMetadata, cited_answer, ) -from quivr_core.prompts import ANSWER_PROMPT, CONDENSE_QUESTION_PROMPT +from quivr_core.prompts import custom_prompts from quivr_core.utils import ( combine_documents, format_file_list, @@ -33,14 +37,17 @@ logger = logging.getLogger("quivr_core") +class SpecialEdges(str, Enum): + START = "START" + END = "END" + + class AgentState(TypedDict): # The add_messages function defines how an update should be processed # Default is to replace. add_messages says "append" messages: Annotated[Sequence[BaseMessage], add_messages] chat_history: ChatHistory - filtered_chat_history: list[AIMessage | HumanMessage] docs: list[Document] - transformed_question: BaseMessage files: str final_response: dict @@ -65,28 +72,47 @@ class QuivrQARAGLangGraph: def __init__( self, *, - rag_config: RAGConfig, + retrieval_config: RetrievalConfig, llm: LLMEndpoint, - vector_store: VectorStore, + vector_store: VectorStore | None = None, reranker: BaseDocumentCompressor | None = None, ): """ Construct a QuivrQARAGLangGraph object. Args: - rag_config (RAGConfig): The configuration for the RAG model. + retrieval_config (RetrievalConfig): The configuration for the RAG model. llm (LLMEndpoint): The LLM to use for generating text. vector_store (VectorStore): The vector store to use for storing and retrieving documents. reranker (BaseDocumentCompressor | None): The document compressor to use for re-ranking documents. Defaults to IdempotentCompressor if not provided. """ - self.rag_config = rag_config + self.retrieval_config = retrieval_config self.vector_store = vector_store self.llm_endpoint = llm - self.reranker = reranker if reranker is not None else IdempotentCompressor() - self.compression_retriever = ContextualCompressionRetriever( - base_compressor=self.reranker, base_retriever=self.retriever - ) + self.graph = None + + if reranker is not None: + self.reranker = reranker + elif self.retrieval_config.reranker_config.supplier == DefaultRerankers.COHERE: + self.reranker = CohereRerank( + model=self.retrieval_config.reranker_config.model, + top_n=self.retrieval_config.reranker_config.top_n, + cohere_api_key=self.retrieval_config.reranker_config.api_key, + ) + elif self.retrieval_config.reranker_config.supplier == DefaultRerankers.JINA: + self.reranker = JinaRerank( + model=self.retrieval_config.reranker_config.model, + top_n=self.retrieval_config.reranker_config.top_n, + jina_api_key=self.retrieval_config.reranker_config.api_key, + ) + else: + self.reranker = IdempotentCompressor() + + if self.vector_store: + self.compression_retriever = ContextualCompressionRetriever( + base_compressor=self.reranker, base_retriever=self.retriever + ) @property def retriever(self): @@ -96,9 +122,12 @@ def retriever(self): Returns: VectorStoreRetriever: The retriever. 
""" - return self.vector_store.as_retriever() + if self.vector_store: + return self.vector_store.as_retriever() + else: + raise ValueError("No vector store provided") - def filter_history(self, state): + def filter_history(self, state: AgentState) -> dict: """ Filter out the chat history to only include the messages that are relevant to the current question @@ -114,21 +143,25 @@ def filter_history(self, state): chat_history = state["chat_history"] total_tokens = 0 total_pairs = 0 - filtered_chat_history: list[AIMessage | HumanMessage] = [] + _chat_id = uuid4() + _chat_history = ChatHistory(chat_id=_chat_id, brain_id=chat_history.brain_id) for human_message, ai_message in reversed(list(chat_history.iter_pairs())): # TODO: replace with tiktoken - message_tokens = (len(human_message.content) + len(ai_message.content)) // 4 + message_tokens = self.llm_endpoint.count_tokens( + human_message.content + ) + self.llm_endpoint.count_tokens(ai_message.content) if ( - total_tokens + message_tokens > self.rag_config.llm_config.max_tokens - or total_pairs >= self.rag_config.max_history + total_tokens + message_tokens + > self.retrieval_config.llm_config.max_output_tokens + or total_pairs >= self.retrieval_config.max_history ): break - filtered_chat_history.append(human_message) - filtered_chat_history.append(ai_message) + _chat_history.append(human_message) + _chat_history.append(ai_message) total_tokens += message_tokens total_pairs += 1 - return {"filtered_chat_history": filtered_chat_history} + return {"chat_history": _chat_history} ### Nodes def rewrite(self, state): @@ -143,14 +176,14 @@ def rewrite(self, state): """ # Grader - msg = CONDENSE_QUESTION_PROMPT.format( - chat_history=state["filtered_chat_history"], + msg = custom_prompts.CONDENSE_QUESTION_PROMPT.format( + chat_history=state["chat_history"], question=state["messages"][0].content, ) model = self.llm_endpoint._llm response = model.invoke(msg) - return {"transformed_question": response} + return {"messages": [response]} def retrieve(self, state): """ @@ -162,11 +195,11 @@ def retrieve(self, state): Returns: dict: The retrieved chunks """ - - docs = self.compression_retriever.invoke(state["transformed_question"].content) + question = state["messages"][-1].content + docs = self.compression_retriever.invoke(question) return {"docs": docs} - def generate(self, state): + def generate_rag(self, state): """ Generate answer @@ -177,20 +210,19 @@ def generate(self, state): dict: The updated state with re-phrased question """ messages = state["messages"] - question = messages[0].content + user_question = messages[0].content files = state["files"] docs = state["docs"] # Prompt - prompt = self.rag_config.prompt + prompt = self.retrieval_config.prompt - final_inputs = { - "context": combine_documents(docs), - "question": question, - "custom_instructions": prompt, - "files": files, - } + final_inputs = {} + final_inputs["context"] = combine_documents(docs) if docs else "None" + final_inputs["question"] = user_question + final_inputs["custom_instructions"] = prompt if prompt else "None" + final_inputs["files"] = files if files else "None" # LLM llm = self.llm_endpoint._llm @@ -201,7 +233,7 @@ def generate(self, state): ) # Chain - rag_chain = ANSWER_PROMPT | llm + rag_chain = custom_prompts.RAG_ANSWER_PROMPT | llm # Run response = rag_chain.invoke(final_inputs) @@ -211,14 +243,51 @@ def generate(self, state): } return {"messages": [response], "final_response": formatted_response} - def build_langgraph_chain(self): + def generate_chat_llm(self, state): + 
""" + Generate answer + + Args: + state (messages): The current state + + Returns: + dict: The updated state with re-phrased question + """ + messages = state["messages"] + user_question = messages[0].content + + # Prompt + prompt = self.retrieval_config.prompt + + final_inputs = {} + final_inputs["question"] = user_question + final_inputs["custom_instructions"] = prompt if prompt else "None" + final_inputs["chat_history"] = state["chat_history"].to_list() + + # LLM + llm = self.llm_endpoint._llm + + # Chain + rag_chain = custom_prompts.CHAT_LLM_PROMPT | llm + + # Run + response = rag_chain.invoke(final_inputs) + formatted_response = { + "answer": response, # Assuming the last message contains the final answer + } + return {"messages": [response], "final_response": formatted_response} + + def build_chain(self): """ Builds the langchain chain for the given configuration. Returns: Callable[[Dict], Dict]: The langchain chain. """ - return self.create_graph() + if not self.graph: + self.graph = self.create_graph() + + return self.graph def create_graph(self): """ @@ -243,19 +312,39 @@ def create_graph(self): """ workflow = StateGraph(AgentState) - # Define the nodes we will cycle between - workflow.add_node("filter_history", self.filter_history) - workflow.add_node("rewrite", self.rewrite) # Re-writing the question - workflow.add_node("retrieve", self.retrieve) # retrieval - workflow.add_node("generate", self.generate) - - # Add node for filtering history - - workflow.set_entry_point("filter_history") - workflow.add_edge("filter_history", "rewrite") - workflow.add_edge("rewrite", "retrieve") - workflow.add_edge("retrieve", "generate") - workflow.add_edge("generate", END) # Add edge from generate to format_response + if self.retrieval_config.workflow_config: + if SpecialEdges.START not in [ + node.name for node in self.retrieval_config.workflow_config.nodes + ]: + raise ValueError("The workflow should contain a 'START' node") + for node in self.retrieval_config.workflow_config.nodes: + if node.name not in SpecialEdges._value2member_map_: + workflow.add_node(node.name, getattr(self, node.name)) + + for node in self.retrieval_config.workflow_config.nodes: + for edge in node.edges: + if node.name == SpecialEdges.START: + workflow.add_edge(START, edge) + elif edge == SpecialEdges.END: + workflow.add_edge(node.name, END) + else: + workflow.add_edge(node.name, edge) + else: + # Define the nodes we will cycle between + workflow.add_node("filter_history", self.filter_history) + workflow.add_node("rewrite", self.rewrite) # Re-writing the question + workflow.add_node("retrieve", self.retrieve) # retrieval + workflow.add_node("generate", self.generate_rag) + + # Add node for filtering history + + workflow.set_entry_point("filter_history") + workflow.add_edge("filter_history", "rewrite") + workflow.add_edge("rewrite", "retrieve") + workflow.add_edge("retrieve", "generate") + workflow.add_edge( + "generate", END + ) # Add edge from generate to format_response # Compile graph = workflow.compile() @@ -280,8 +369,10 @@ def answer( Returns: ParsedRAGResponse: The answer to the question. 
""" - concat_list_files = format_file_list(list_files, self.rag_config.max_files) - conversational_qa_chain = self.build_langgraph_chain() + concat_list_files = format_file_list( + list_files, self.retrieval_config.max_files + ) + conversational_qa_chain = self.build_chain() inputs = { "messages": [ ("user", question), @@ -294,7 +385,7 @@ def answer( config={"metadata": metadata}, ) response = parse_response( - raw_llm_response["final_response"], self.rag_config.llm_config.model + raw_llm_response["final_response"], self.retrieval_config.llm_config.model ) return response @@ -317,11 +408,13 @@ async def answer_astream( Yields: ParsedRAGChunkResponse: Each chunk of the answer. """ - concat_list_files = format_file_list(list_files, self.rag_config.max_files) - conversational_qa_chain = self.build_langgraph_chain() + concat_list_files = format_file_list( + list_files, self.retrieval_config.max_files + ) + conversational_qa_chain = self.build_chain() rolling_message = AIMessageChunk(content="") - sources = [] + sources: list[Document] | None = None prev_answer = "" chunk_id = 0 @@ -337,7 +430,6 @@ async def answer_astream( config={"metadata": metadata}, ): kind = event["event"] - if ( not sources and "output" in event["data"] @@ -347,18 +439,19 @@ async def answer_astream( if ( kind == "on_chat_model_stream" - and event["metadata"]["langgraph_node"] == "generate" + and "generate" in event["metadata"]["langgraph_node"] ): chunk = event["data"]["chunk"] - rolling_message, answer_str = parse_chunk_response( rolling_message, chunk, self.llm_endpoint.supports_func_calling(), ) - if len(answer_str) > 0: - if self.llm_endpoint.supports_func_calling(): + if ( + self.llm_endpoint.supports_func_calling() + and rolling_message.tool_calls + ): diff_answer = answer_str[len(prev_answer) :] if len(diff_answer) > 0: parsed_chunk = ParsedRAGChunkResponse( diff --git a/backend/core/quivr_core/utils.py b/backend/core/quivr_core/utils.py index 38f8c51c54f6..0ff992ec4589 100644 --- a/backend/core/quivr_core/utils.py +++ b/backend/core/quivr_core/utils.py @@ -12,7 +12,7 @@ RAGResponseMetadata, RawRAGResponse, ) -from quivr_core.prompts import DEFAULT_DOCUMENT_PROMPT +from quivr_core.prompts import custom_prompts # TODO(@aminediro): define a types packages where we clearly define IO types # This should be used for serialization/deseriallization later @@ -56,10 +56,10 @@ def cited_answer_filter(tool): def get_chunk_metadata( - msg: AIMessageChunk, sources: list[Any] = [] + msg: AIMessageChunk, sources: list[Any] | None = None ) -> RAGResponseMetadata: # Initiate the source - metadata = {"sources": sources} + metadata = {"sources": sources} if sources else {"sources": []} if msg.tool_calls: cited_answer = next(x for x in msg.tool_calls if cited_answer_filter(x)) @@ -73,7 +73,7 @@ def get_chunk_metadata( followup_questions = gathered_args["followup_questions"] metadata["followup_questions"] = followup_questions - return RAGResponseMetadata(**metadata) + return RAGResponseMetadata(**metadata, metadata_model=None) def get_prev_message_str(msg: AIMessageChunk) -> str: @@ -101,36 +101,32 @@ def parse_chunk_response( answer = raw_chunk rolling_msg += answer - if supports_func_calling: - if rolling_msg.tool_calls: - cited_answer = next( - x for x in rolling_msg.tool_calls if cited_answer_filter(x) - ) - if "args" in cited_answer: - gathered_args = cited_answer["args"] - if "answer" in gathered_args: - # Only send the difference between answer and response_tokens which was the previous answer - answer_str = 
gathered_args["answer"] - return rolling_msg, answer_str - else: - return rolling_msg, answer.content + if supports_func_calling and rolling_msg.tool_calls: + cited_answer = next(x for x in rolling_msg.tool_calls if cited_answer_filter(x)) + if "args" in cited_answer and "answer" in cited_answer["args"]: + gathered_args = cited_answer["args"] + # Only send the difference between answer and response_tokens which was the previous answer + answer_str = gathered_args["answer"] + return rolling_msg, answer_str + + return rolling_msg, answer.content @no_type_check def parse_response(raw_response: RawRAGResponse, model_name: str) -> ParsedRAGResponse: answer = "" - sources = raw_response["docs"] or [] + sources = raw_response["docs"] if "docs" in raw_response else [] metadata = RAGResponseMetadata( sources=sources, metadata_model=ChatLLMMetadata(name=model_name) ) - if model_supports_function_calling(model_name): - if ( - "tool_calls" in raw_response["answer"] - and raw_response["answer"].tool_calls - and "citations" in raw_response["answer"].tool_calls[-1]["args"] - ): + if ( + model_supports_function_calling(model_name) + and "tool_calls" in raw_response["answer"] + and raw_response["answer"].tool_calls + ): + if "citations" in raw_response["answer"].tool_calls[-1]["args"]: citations = raw_response["answer"].tool_calls[-1]["args"]["citations"] metadata.citations = citations followup_questions = raw_response["answer"].tool_calls[-1]["args"][ @@ -149,7 +145,9 @@ def parse_response(raw_response: RawRAGResponse, model_name: str) -> ParsedRAGRe def combine_documents( - docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n" + docs, + document_prompt=custom_prompts.DEFAULT_DOCUMENT_PROMPT, + document_separator="\n\n", ): # for each docs, add an index in the metadata to be able to cite the sources for doc, index in zip(docs, range(len(docs)), strict=False): diff --git a/backend/core/tests/conftest.py b/backend/core/tests/conftest.py index cc4204b05c99..a6e262e77258 100644 --- a/backend/core/tests/conftest.py +++ b/backend/core/tests/conftest.py @@ -67,7 +67,7 @@ def chunks_stream_answer(): @pytest.fixture(autouse=True) def openai_api_key(): - os.environ["OPENAI_API_KEY"] = "abcd" + os.environ["OPENAI_API_KEY"] = "this-is-a-test-key" @pytest.fixture diff --git a/backend/core/tests/fixture_chunks.py b/backend/core/tests/fixture_chunks.py index 47f0e28d21c5..ae521f6eed19 100644 --- a/backend/core/tests/fixture_chunks.py +++ b/backend/core/tests/fixture_chunks.py @@ -6,36 +6,47 @@ from langchain_core.messages.ai import AIMessageChunk from langchain_core.vectorstores import InMemoryVectorStore from quivr_core.chat import ChatHistory -from quivr_core.config import LLMEndpointConfig, RAGConfig +from quivr_core.config import LLMEndpointConfig, RetrievalConfig from quivr_core.llm import LLMEndpoint -from quivr_core.quivr_rag import QuivrQARAG +from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph async def main(): - rag_config = RAGConfig(llm_config=LLMEndpointConfig(model="gpt-4o")) + retrieval_config = RetrievalConfig(llm_config=LLMEndpointConfig(model="gpt-4o")) embedder = DeterministicFakeEmbedding(size=20) vec = InMemoryVectorStore(embedder) - llm = LLMEndpoint.from_config(rag_config.llm_config) + llm = LLMEndpoint.from_config(retrieval_config.llm_config) chat_history = ChatHistory(uuid4(), uuid4()) - rag_pipeline = QuivrQARAG(rag_config=rag_config, llm=llm, vector_store=vec) + rag_pipeline = QuivrQARAGLangGraph( + retrieval_config=retrieval_config, llm=llm, vector_store=vec + ) 
- conversational_qa_chain = rag_pipeline.build_chain("") + conversational_qa_chain = rag_pipeline.build_chain() with open("response.jsonl", "w") as f: - async for chunk in conversational_qa_chain.astream( + async for event in conversational_qa_chain.astream_events( { - "question": "What is NLP, give a very long detailed answer", + "messages": [ + ("user", "What is NLP, give a very long detailed answer"), + ], "chat_history": chat_history, "custom_personality": None, }, + version="v1", config={"metadata": {}}, ): - dict_chunk = { - k: v.dict() if isinstance(v, AIMessageChunk) else v - for k, v in chunk.items() - } - f.write(json.dumps(dict_chunk) + "\n") + kind = event["event"] + if ( + kind == "on_chat_model_stream" + and event["metadata"]["langgraph_node"] == "generate" + ): + chunk = event["data"]["chunk"] + dict_chunk = { + k: v.dict() if isinstance(v, AIMessageChunk) else v + for k, v in chunk.items() + } + f.write(json.dumps(dict_chunk) + "\n") asyncio.run(main()) diff --git a/backend/core/tests/rag_config.yaml b/backend/core/tests/rag_config.yaml new file mode 100644 index 000000000000..3a4a5214ca27 --- /dev/null +++ b/backend/core/tests/rag_config.yaml @@ -0,0 +1,39 @@ +ingestion_config: + parser_config: + megaparse_config: + strategy: "fast" + pdf_parser: "unstructured" + splitter_config: + chunk_size: 400 + chunk_overlap: 100 + +retrieval_config: + # Maximum number of previous conversation iterations + # to include in the context of the answer + max_history: 10 + + max_files: 20 + reranker_config: + # The reranker supplier to use + supplier: "cohere" + + # The model to use for the reranker for the given supplier + model: "rerank-multilingual-v3.0" + + # Number of chunks returned by the reranker + top_n: 5 + llm_config: + # The LLM supplier to use + supplier: "openai" + + # The model to use for the LLM for the given supplier + model: "gpt-3.5-turbo-0125" + + max_input_tokens: 2000 + + # Maximum number of tokens to pass to the LLM + # as a context to generate the answer + max_output_tokens: 2000 + + temperature: 0.7 + streaming: true diff --git a/backend/core/tests/rag_config_workflow.yaml b/backend/core/tests/rag_config_workflow.yaml new file mode 100644 index 000000000000..a1750cf61554 --- /dev/null +++ b/backend/core/tests/rag_config_workflow.yaml @@ -0,0 +1,52 @@ +ingestion_config: + parser_config: + megaparse_config: + strategy: "fast" + pdf_parser: "unstructured" + splitter_config: + chunk_size: 400 + chunk_overlap: 100 + +retrieval_config: + workflow_config: + name: "standard RAG" + nodes: + - name: "START" + edges: ["filter_history"] + + - name: "filter_history" + edges: ["generate_chat_llm"] + + - name: "generate_chat_llm" # the name of the last node, from which we want to stream the answer to the user, should always start with "generate" + edges: ["END"] + # Maximum number of previous conversation iterations + # to include in the context of the answer + max_history: 10 + + #prompt: "my prompt" + + max_files: 20 + reranker_config: + # The reranker supplier to use + supplier: "cohere" + + # The model to use for the reranker for the given supplier + model: "rerank-multilingual-v3.0" + + # Number of chunks returned by the reranker + top_n: 5 + llm_config: + # The LLM supplier to use + supplier: "openai" + + # The model to use for the LLM for the given supplier + model: "gpt-3.5-turbo-0125" + + max_input_tokens: 2000 + + # Maximum number of tokens to pass to the LLM + # as a context to generate the answer + max_output_tokens: 2000 + + temperature: 0.7 + streaming: true diff 
--git a/backend/core/tests/test_chat_llm.py b/backend/core/tests/test_chat_llm.py deleted file mode 100644 index 0af31929496b..000000000000 --- a/backend/core/tests/test_chat_llm.py +++ /dev/null @@ -1,18 +0,0 @@ -import pytest -from quivr_core import ChatLLM - - -@pytest.mark.base -def test_chat_llm(fake_llm): - chat_llm = ChatLLM( - llm=fake_llm, - ) - answer = chat_llm.answer("Hello, how are you?") - - assert len(answer.answer) > 0 - assert answer.metadata is not None - assert answer.metadata.citations is None - assert answer.metadata.followup_questions is None - assert answer.metadata.sources == [] - assert answer.metadata.metadata_model is not None - assert answer.metadata.metadata_model.name is not None diff --git a/backend/core/tests/test_config.py b/backend/core/tests/test_config.py index 26593be0a1e6..6eeaca97dbdf 100644 --- a/backend/core/tests/test_config.py +++ b/backend/core/tests/test_config.py @@ -1,4 +1,4 @@ -from quivr_core.config import LLMEndpointConfig, RAGConfig +from quivr_core.config import LLMEndpointConfig, RetrievalConfig def test_default_llm_config(): @@ -10,16 +10,16 @@ def test_default_llm_config(): model="gpt-3.5-turbo-0125", llm_base_url=None, llm_api_key=None, - max_input=2000, - max_tokens=2000, + max_input_tokens=2000, + max_output_tokens=2000, temperature=0.7, streaming=True, ).model_dump() ) -def test_default_ragconfig(): - config = RAGConfig() +def test_default_retrievalconfig(): + config = RetrievalConfig() assert config.max_files == 20 assert config.prompt is None diff --git a/backend/core/tests/test_llm_endpoint.py b/backend/core/tests/test_llm_endpoint.py index d50f60222f96..04c5556aa355 100644 --- a/backend/core/tests/test_llm_endpoint.py +++ b/backend/core/tests/test_llm_endpoint.py @@ -13,7 +13,7 @@ def test_llm_endpoint_from_config_default(): del os.environ["OPENAI_API_KEY"] - with pytest.raises(ValidationError): + with pytest.raises((ValidationError, ValueError)): llm = LLMEndpoint.from_config(LLMEndpointConfig()) # Working default diff --git a/backend/core/tests/test_quivr_rag.py b/backend/core/tests/test_quivr_rag.py index e752888205f6..629d808b88f1 100644 --- a/backend/core/tests/test_quivr_rag.py +++ b/backend/core/tests/test_quivr_rag.py @@ -2,37 +2,41 @@ import pytest from quivr_core.chat import ChatHistory -from quivr_core.config import LLMEndpointConfig, RAGConfig +from quivr_core.config import LLMEndpointConfig, RetrievalConfig from quivr_core.llm import LLMEndpoint from quivr_core.models import ParsedRAGChunkResponse, RAGResponseMetadata -from quivr_core.quivr_rag import QuivrQARAG +from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph @pytest.fixture(scope="function") def mock_chain_qa_stream(monkeypatch, chunks_stream_answer): class MockQAChain: - async def astream(self, *args, **kwargs): + async def astream_events(self, *args, **kwargs): for c in chunks_stream_answer: - yield c + yield { + "event": "on_chat_model_stream", + "metadata": {"langgraph_node": "generate"}, + "data": {"chunk": c}, + } def mock_qa_chain(*args, **kwargs): return MockQAChain() - monkeypatch.setattr(QuivrQARAG, "build_chain", mock_qa_chain) + monkeypatch.setattr(QuivrQARAGLangGraph, "build_chain", mock_qa_chain) @pytest.mark.base @pytest.mark.asyncio -async def test_quivrqarag( +async def test_quivrqaraglanggraph( mem_vector_store, full_response, mock_chain_qa_stream, openai_api_key ): # Making sure the model llm_config = LLMEndpointConfig(model="gpt-4o") llm = LLMEndpoint.from_config(llm_config) - rag_config = RAGConfig(llm_config=llm_config) + 
retrieval_config = RetrievalConfig(llm_config=llm_config) chat_history = ChatHistory(uuid4(), uuid4()) - rag_pipeline = QuivrQARAG( - rag_config=rag_config, llm=llm, vector_store=mem_vector_store + rag_pipeline = QuivrQARAGLangGraph( + retrieval_config=retrieval_config, llm=llm, vector_store=mem_vector_store ) stream_responses: list[ParsedRAGChunkResponse] = [] diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 0f57a09db2d4..491e98cfe6f4 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -9,7 +9,8 @@ authors = [ { name = "Jacopo Chevallard", email = "jacopo@quivr.app" }, ] dependencies = [ - "packaging>=22.0" + "packaging>=22.0", + "langchain-anthropic>=0.1.23", ] readme = "README.md" requires-python = ">= 3.11" @@ -40,7 +41,7 @@ dev-dependencies = [ ] [tool.rye.workspace] -members = [".", "core", "worker", "api", "docs", "core/examples/chatbot"] +members = [".", "core", "worker", "api", "docs", "core/examples/chatbot", "core/MegaParse"] [tool.hatch.metadata] allow-direct-references = true diff --git a/backend/requirements-dev.lock b/backend/requirements-dev.lock index c6dec2b92f96..0623ba330324 100644 --- a/backend/requirements-dev.lock +++ b/backend/requirements-dev.lock @@ -18,6 +18,8 @@ -e file:core # via quivr-api # via quivr-worker +-e file:core/MegaParse + # via quivr-core -e file:worker aiofiles==23.2.1 # via chainlit @@ -404,6 +406,7 @@ langchain==0.2.14 # via quivr-core langchain-anthropic==0.1.23 # via quivr-core + # via quivr-monorepo langchain-cohere==0.2.2 # via quivr-api langchain-community==0.2.12 @@ -543,8 +546,6 @@ mdit-py-plugins==0.4.1 # via jupytext mdurl==0.1.2 # via markdown-it-py -megaparse==0.0.31 - # via quivr-core mergedeep==1.3.4 # via mkdocs # via mkdocs-get-deps @@ -800,6 +801,7 @@ protobuf==4.25.4 # via onnxruntime # via opentelemetry-proto # via proto-plus + # via transformers psutil==6.0.0 # via ipykernel # via unstructured @@ -1021,6 +1023,8 @@ safetensors==0.4.4 # via transformers scipy==1.14.1 # via layoutparser +sentencepiece==0.2.0 + # via transformers sentry-sdk==2.13.0 # via quivr-api setuptools==70.0.0 @@ -1149,8 +1153,11 @@ traitlets==5.14.3 # via nbclient # via nbconvert # via nbformat -transformers==4.44.1 +transformers==4.44.2 + # via quivr-core # via unstructured-inference +types-pyyaml==6.0.12.20240808 + # via quivr-core types-requests==2.31.0.6 # via cohere types-urllib3==1.26.25.14 diff --git a/backend/requirements.lock b/backend/requirements.lock index 23de6bdc3b63..3d8c76fcdb4b 100644 --- a/backend/requirements.lock +++ b/backend/requirements.lock @@ -18,6 +18,8 @@ -e file:core # via quivr-api # via quivr-worker +-e file:core/MegaParse + # via quivr-core -e file:worker aiofiles==24.1.0 # via quivr-core @@ -355,6 +357,7 @@ langchain==0.2.14 # via quivr-core langchain-anthropic==0.1.23 # via quivr-core + # via quivr-monorepo langchain-cohere==0.2.2 # via quivr-api langchain-community==0.2.12 @@ -488,8 +491,6 @@ mdit-py-plugins==0.4.1 # via jupytext mdurl==0.1.2 # via markdown-it-py -megaparse==0.0.31 - # via quivr-core mergedeep==1.3.4 # via mkdocs # via mkdocs-get-deps @@ -695,6 +696,7 @@ protobuf==5.27.3 # via onnx # via onnxruntime # via proto-plus + # via transformers psutil==6.0.0 # via ipykernel # via unstructured @@ -882,6 +884,8 @@ safetensors==0.4.4 # via transformers scipy==1.14.1 # via layoutparser +sentencepiece==0.2.0 + # via transformers sentry-sdk==2.13.0 # via quivr-api six==1.16.0 @@ -1000,8 +1004,11 @@ traitlets==5.14.3 # via nbclient # via nbconvert # via nbformat -transformers==4.44.1 
+transformers==4.44.2 + # via quivr-core # via unstructured-inference +types-pyyaml==6.0.12.20240808 + # via quivr-core types-requests==2.31.0.6 # via cohere types-urllib3==1.26.25.14
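
A minimal usage sketch of the new QuivrQARAGLangGraph entry point, assembled the same way as backend/core/tests/fixture_chunks.py. The fake embeddings, empty in-memory vector store, and gpt-4o model below are illustrative assumptions rather than part of this patch, and a real run also needs a valid OPENAI_API_KEY and a configured Cohere reranker or the IdempotentCompressor fallback:

    import asyncio
    from uuid import uuid4

    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.vectorstores import InMemoryVectorStore
    from quivr_core.chat import ChatHistory
    from quivr_core.config import LLMEndpointConfig, RetrievalConfig
    from quivr_core.llm import LLMEndpoint
    from quivr_core.quivr_rag_langgraph import QuivrQARAGLangGraph


    async def main():
        # Programmatic equivalent of the llm_config section in backend/core/tests/rag_config.yaml.
        retrieval_config = RetrievalConfig(llm_config=LLMEndpointConfig(model="gpt-4o"))
        llm = LLMEndpoint.from_config(retrieval_config.llm_config)

        # Illustrative store: deterministic fake embeddings in an in-memory vector store.
        vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=20))

        rag = QuivrQARAGLangGraph(
            retrieval_config=retrieval_config, llm=llm, vector_store=vector_store
        )

        # Without a workflow_config, build_chain() compiles the default graph:
        # filter_history -> rewrite -> retrieve -> generate_rag
        graph = rag.build_chain()

        chat_history = ChatHistory(uuid4(), uuid4())
        async for event in graph.astream_events(
            {
                "messages": [("user", "What is NLP, give a very long detailed answer")],
                "chat_history": chat_history,
                "custom_personality": None,
            },
            version="v1",
            config={"metadata": {}},
        ):
            # Stream only tokens emitted by the generate_* node, as answer_astream does.
            if (
                event["event"] == "on_chat_model_stream"
                and "generate" in event["metadata"]["langgraph_node"]
            ):
                print(event["data"]["chunk"].content, end="", flush=True)


    asyncio.run(main())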