diff --git a/README.md b/README.md
index 7248c6d..fc2d792 100644
--- a/README.md
+++ b/README.md
@@ -28,17 +28,11 @@ The core purpose of "No OCR" is to simplify AI-based PDF processing:
 - Perform text and/or visual queries using modern embeddings.
 - Use open source models for advanced question-answering on document-based diagrams, text, and more.
 
-Key technologies:
-- React-based front end (no-ocr-ui) for uploading, managing, and searching documents.
-- Python-based API (no-ocr-api) that coordinates ingestion, indexing, and searching.
-- Qdrant for efficient vector search and retrieval.
-- ColPali & Qwen2-VL handle inference tasks (both text and vision-based).
-
 ## Key Features
 
 - Create and manage PDF/document collections, also referred to as "cases".
 - Automated ingestion to build Hugging Face-style datasets (HF_Dataset).
-- Vector-based search over PDF pages (and relevant images) in Qdrant.
+- Vector-based search over PDF pages (and relevant images) in LanceDB.
 - Visual question-answering on images and diagrams via Qwen2-VL.
 - Deployable via Docker for both the backend (Python) and UI (React).
 
@@ -58,8 +52,8 @@ sequenceDiagram
     participant no-ocr-ui (CreateCase)
     participant no-ocr-api
     participant HF_Dataset
-    participant IngestClient
-    participant Qdrant
+    participant SearchClient
+    participant LanceDB
 
     User->>no-ocr-ui (CreateCase): Upload PDFs & specify case name
     no-ocr-ui (CreateCase)->>no-ocr-api: POST /create_case with PDFs
@@ -67,10 +61,10 @@ sequenceDiagram
     no-ocr-api->>no-ocr-api: Spawn background task (process_case)
     no-ocr-api->>HF_Dataset: Convert PDFs to HF dataset
     HF_Dataset-->>no-ocr-api: Return dataset
-    no-ocr-api->>IngestClient: Ingest dataset
-    IngestClient->>Qdrant: Create collection & upload points
-    Qdrant-->>IngestClient: Acknowledge ingestion
-    IngestClient-->>no-ocr-api: Done ingestion
+    no-ocr-api->>SearchClient: Ingest dataset
+    SearchClient->>LanceDB: Create collection & upload points
+    LanceDB-->>SearchClient: Acknowledge ingestion
+    SearchClient-->>no-ocr-api: Done ingestion
     no-ocr-api->>no-ocr-api: Mark case status as 'done'
     no-ocr-api-->>no-ocr-ui (CreateCase): Return creation response
     no-ocr-ui (CreateCase)-->>User: Display success message
@@ -83,14 +77,14 @@ sequenceDiagram
     participant User
     participant no-ocr-ui
     participant SearchClient
-    participant Qdrant
+    participant LanceDB
     participant HF_Dataset
     participant VLLM
 
     User->>no-ocr-ui: Enter search query and select case
     no-ocr-ui->>SearchClient: Search images by text
-    SearchClient->>Qdrant: Query collection with text embedding
-    Qdrant-->>SearchClient: Return search results
+    SearchClient->>LanceDB: Query collection with text embedding
+    LanceDB-->>SearchClient: Return search results
     SearchClient-->>no-ocr-ui: Provide search results
     no-ocr-ui->>HF_Dataset: Load dataset for collection
     HF_Dataset-->>no-ocr-ui: Return dataset
@@ -166,7 +160,3 @@ sequenceDiagram
    cd no-ocr-ui
    npm run dev
    ```
-5. (Qdrant) Run qdrant
-   ```bash
-   docker run -p 6333:6333 qdrant/qdrant:v1.12.5
-   ```
\ No newline at end of file
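
The setup step removed above is the practical payoff of this migration: LanceDB runs embedded in the API process and persists to plain files under the case directory, so there is no longer a vector-database server to start. As a minimal sketch (the path, table name, and 128-dim ColPali multivector column are illustrative, mirroring the schema introduced in `search.py` below, not an exact excerpt):

```python
import lancedb
import pyarrow as pa

# Embedded database: "connecting" just points at a directory on disk.
db = lancedb.connect("storage/user123/my_case")  # hypothetical STORAGE_DIR/user_id/case_name

# ColPali emits one 128-dim vector per page patch/token,
# hence a list-of-vectors ("multivector") column.
schema = pa.schema([
    pa.field("index", pa.int64()),
    pa.field("pdf_name", pa.string()),
    pa.field("pdf_page", pa.int64()),
    pa.field("vector", pa.list_(pa.list_(pa.float32(), 128))),
])
tbl = db.create_table("my_case", schema=schema)
```
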
diff --git a/docs/architecture.png b/docs/architecture.png
index e6f8106..13f85ad 100644
Binary files a/docs/architecture.png and b/docs/architecture.png differ
diff --git a/no-ocr-api/.env.example b/no-ocr-api/.env.example
index 24df422..b38380b 100644
--- a/no-ocr-api/.env.example
+++ b/no-ocr-api/.env.example
@@ -5,10 +5,4 @@ SEARCH_TOP_K=3
 COLPALI_TOKEN=
 VLLM_URL=
 COLPALI_BASE_URL=
-QDRANT_URI="localhost"
-QDRANT_PORT=6333
 VECTOR_SIZE=128
-INDEXING_THRESHOLD=100
-QUANTILE=0.99
-TOP_K=5
-QDRANT_HTTPS=False
\ No newline at end of file
diff --git a/no-ocr-api/Dockerfile b/no-ocr-api/Dockerfile
index f210860..076b2af 100644
--- a/no-ocr-api/Dockerfile
+++ b/no-ocr-api/Dockerfile
@@ -16,7 +16,10 @@ RUN pip install --upgrade pip
 COPY requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
+# TODO: move to a stable lancedb release in requirements.txt once one is published
+RUN pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ lancedb==0.18.1b1
+
 COPY . .
 
 ENV PYTHONPATH /app/
-CMD fastapi run --host 0.0.0.0 --port 8000 --workers 1 np_ocr/api.py
\ No newline at end of file
+CMD fastapi run --host 0.0.0.0 --port 8000 --workers 1 np_ocr/api.py
diff --git a/no-ocr-api/np_ocr/api.py b/no-ocr-api/np_ocr/api.py
index d153db9..a68bc43 100644
--- a/no-ocr-api/np_ocr/api.py
+++ b/no-ocr-api/np_ocr/api.py
@@ -16,7 +16,7 @@ from pydantic_settings import BaseSettings
 
 from np_ocr.data import pdfs_to_hf_dataset
-from np_ocr.search import IngestClient, SearchClient, call_vllm
+from np_ocr.search import SearchClient, call_vllm
 
 
 class CustomRailwayLogFormatter(logging.Formatter):
@@ -60,13 +60,7 @@ class Settings(BaseSettings):
     COLPALI_TOKEN: str
     VLLM_URL: str
     COLPALI_BASE_URL: str
-    QDRANT_URI: str
-    QDRANT_PORT: int
     VECTOR_SIZE: int = 128
-    INDEXING_THRESHOLD: int = 100
-    QUANTILE: float = 0.99
-    TOP_K: int = 5
-    QDRANT_HTTPS: bool = True
     VLLM_API_KEY: str
     VLLM_MODEL: str = "Qwen2-VL-7B-Instruct"
 
@@ -77,12 +71,20 @@ class Config:
 settings = Settings()
 
 
+class SearchResult(BaseModel):
+    score: float
+    pdf_name: str
+    pdf_page: int
+    image_base64: str
+
+class SearchResponse(BaseModel):
+    search_results: List[SearchResult]
+
 class ImageAnswer(BaseModel):
     answer: str
 
 class CaseInfo(BaseModel):
     name: str
-    unique_name: str
     status: str
     number_of_PDFs: int
     files: List[str]
@@ -97,8 +99,7 @@ def update_status(self, new_status: str):
         self.save()
 
 
-search_client = SearchClient(qdrant_uri=settings.QDRANT_URI, port=settings.QDRANT_PORT, https=settings.QDRANT_HTTPS, top_k=settings.TOP_K, base_url=settings.COLPALI_BASE_URL, token=settings.COLPALI_TOKEN)
-ingest_client = IngestClient(qdrant_uri=settings.QDRANT_URI, port=settings.QDRANT_PORT, https=settings.QDRANT_HTTPS, index_threshold=settings.INDEXING_THRESHOLD, vector_size=settings.VECTOR_SIZE, quantile=settings.QUANTILE, top_k=settings.TOP_K, base_url=settings.COLPALI_BASE_URL, token=settings.COLPALI_TOKEN)
+search_client = SearchClient(storage_dir=settings.STORAGE_DIR, vector_size=settings.VECTOR_SIZE, base_url=settings.COLPALI_BASE_URL, token=settings.COLPALI_TOKEN)
 
 
 @app.post("/vllm_call")
@@ -137,17 +138,6 @@ def vllm_call(
 
     return image_answer
 
-
-
-class SearchResult(BaseModel):
-    score: float
-    pdf_name: str
-    pdf_page: int
-    image_base64: str
-
-class SearchResponse(BaseModel):
-    search_results: List[SearchResult]
-
 @app.post("/search", response_model=SearchResponse)
 def ai_search(user_query: str = Form(...), user_id: str = Form(...), case_name: str = Form(...)):
     logger.info("start ai_search")
@@ -167,8 +157,7 @@ def ai_search(user_query: str = Form(...), user_id: str = Form(...), case_name:
     with open(case_info_path, "r") as json_file:
         _ = json.load(json_file)  # case_info is not used directly below
 
-    unique_name =f"{user_id}_{case_name}"
-    search_results = search_client.search_images_by_text(user_query, case_name=unique_name, top_k=settings.SEARCH_TOP_K)
+    search_results = search_client.search_images_by_text(user_query, case_name=case_name, user_id=user_id, top_k=settings.SEARCH_TOP_K)
     if not search_results:
         return {"message": "No results found."}
 
@@ -178,13 +167,14 @@ def ai_search(user_query: str = Form(...), user_id: str = Form(...), case_name:
     dataset = load_from_disk(dataset_path)
 
     search_results_data = []
-    for result in search_results.points:
-        payload = result.payload
-        logger.info(payload)
-        score = result.score
-        image_data = dataset[payload["index"]]["image"]
-        pdf_name = dataset[payload["index"]]["pdf_name"]
-        pdf_page = dataset[payload["index"]]["pdf_page"]
+    for point in search_results:
+        logger.info(point)
+        # LanceDB reports a cosine distance in `_distance` (lower = more similar)
+        score = point['_distance']
+        index = point['index']
+        image_data = dataset[index]["image"]
+        pdf_name = dataset[index]["pdf_name"]
+        pdf_page = dataset[index]["pdf_page"]
 
         # Convert image to base64 string
         buffered = BytesIO()
@@ -204,13 +194,13 @@ def ai_search(user_query: str = Form(...), user_id: str = Form(...), case_name:
 
     return SearchResponse(search_results=search_results_data)
 
-def process_case(case_info: CaseInfo):
+def process_case(case_info: CaseInfo, user_id: str):
     logger.info("start post_process_case")
     start_time = time.time()
 
     dataset = pdfs_to_hf_dataset(case_info.case_dir)
     dataset.save_to_disk(case_info.case_dir / settings.HF_DATASET_DIRNAME)
-    ingest_client.ingest(case_info.unique_name, dataset)
+    search_client.ingest(case_info.name, dataset, user_id)
 
     case_info.update_status("done")
 
@@ -247,7 +237,6 @@ def create_new_case(
 
     case_info = CaseInfo(
         name=case_name,
-        unique_name=f"{user_id}_{case_name}",
         status="processing",
         number_of_PDFs=len(files),
         files=file_names,
@@ -256,7 +245,7 @@ def create_new_case(
 
     case_info.save()
 
-    background_tasks.add_task(process_case, case_info=case_info)
+    background_tasks.add_task(process_case, case_info=case_info, user_id=user_id)
 
     end_time = time.time()
     logger.info(f"done create_new_case, total time {end_time - start_time}")
@@ -308,12 +297,6 @@ def get_cases(user_id: str):
 
 @app.get("/get_case/{case_name}")
 def get_case(user_id: str, case_name: str) -> CaseInfo:
-    logger.info("start get_case")
-    start_time = time.time()
-
-    """
-    Return the metadata of a specific case by its name for a specific user.
-    """
     case_info_path = os.path.join(settings.STORAGE_DIR, user_id, case_name, settings.CASE_INFO_FILENAME)
     if not os.path.exists(case_info_path):
         # Check common cases
@@ -323,11 +306,7 @@ def get_case(user_id: str, case_name: str) -> CaseInfo:
 
     with open(case_info_path, "r") as json_file:
         case_info = CaseInfo(**json.load(json_file))
-
-    end_time = time.time()
-    logger.info(f"done get_case, total time {end_time - start_time}")
-
-    return case_info.dict()
+    return case_info
 
 @app.delete("/delete_case/{case_name}")
 def delete_case(user_id: str, case_name: str):
@@ -344,12 +323,6 @@ def delete_case(user_id: str, case_name: str):
     else:
         raise HTTPException(status_code=404, detail="Case not found in storage.")
 
-    # Delete the case from Qdrant
-    try:
-        ingest_client.qdrant_client.delete_collection(case_name)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"An error occurred while deleting the case from Qdrant: {str(e)}")
-
     end_time = time.time()
     logger.info(f"done delete_case, total time {end_time - start_time}")
diff --git a/no-ocr-api/np_ocr/search.py b/no-ocr-api/np_ocr/search.py
index 18bfb1b..55260bc 100644
--- a/no-ocr-api/np_ocr/search.py
+++ b/no-ocr-api/np_ocr/search.py
@@ -11,7 +11,9 @@ import requests
 from openai import OpenAI
 from pydantic import BaseModel
-from qdrant_client import QdrantClient, models
+import lancedb
+import numpy as np
+import pyarrow as pa
 from tqdm import tqdm
 
 logger = logging.getLogger()
@@ -62,93 +64,65 @@ def process_pil_image(self, pil_image):
         response.raise_for_status()
         return response.json()
 
-
-class IngestClient:
-    def __init__(self, qdrant_uri: str, port: int, https: bool, index_threshold: int, vector_size: int, quantile: float, top_k: int, base_url: str, token: str):
-        self.qdrant_client = QdrantClient(qdrant_uri, port=port, https=https)
-        self.colpali_client = ColPaliClient(base_url, token)
-        self.index_threshold = index_threshold
+class SearchClient:
+    def __init__(self, storage_dir: str, vector_size: int, base_url: str, token: str):
+        self.storage_dir = storage_dir
         self.vector_size = vector_size
-        self.quantile = quantile
-        self.top_k = top_k
-
-    def ingest(self, case_name, dataset):
+        self.colpali_client = ColPaliClient(base_url, token)
+
+    def ingest(self, case_name: str, dataset, user_id: str):
         logger.info("start ingest")
         start_time = time.time()
 
-        self.qdrant_client.create_collection(
-            collection_name=case_name,
-            on_disk_payload=True,
-            optimizers_config=models.OptimizersConfigDiff(indexing_threshold=self.index_threshold),
-            vectors_config=models.VectorParams(
-                size=self.vector_size,
-                distance=models.Distance.COSINE,
-                multivector_config=models.MultiVectorConfig(comparator=models.MultiVectorComparator.MAX_SIM),
-                quantization_config=models.ScalarQuantization(
-                    scalar=models.ScalarQuantizationConfig(
-                        type=models.ScalarType.INT8,
-                        quantile=self.quantile,
-                        always_ram=True,
-                    ),
-                ),
-            ),
+        schema = pa.schema(
+            [
+                pa.field("index", pa.int64()),
+                pa.field("pdf_name", pa.string()),
+                pa.field("pdf_page", pa.int64()),
+                pa.field("vector", pa.list_(pa.list_(pa.float32(), self.vector_size))),
+            ]
         )
+        lance_client = lancedb.connect(f"{self.storage_dir}/{user_id}/{case_name}")
+        tbl = lance_client.create_table(case_name, schema=schema)
+
+        # TODO: ingest in batches
 
-        # Use tqdm to create a progress bar
         with tqdm(total=len(dataset), desc="Indexing Progress") as pbar:
             for i in range(len(dataset)):
-                # The images are already PIL Image objects, so we can use them directly
                 image = dataset[i]["image"]
-
-                # Process and encode image using ColPaliClient
                 response = self.colpali_client.process_pil_image(image)
                 image_embedding = response["embedding"]
 
-                # Prepare point for Qdrant
-                point = models.PointStruct(
-                    id=i,  # we just use the index as the ID
-                    vector=image_embedding,  # This is now a list of vectors
-                    payload={
-                        "index": dataset[i]["index"],
-                        "pdf_name": dataset[i]["pdf_name"],
-                        "pdf_page": dataset[i]["pdf_page"],
-                    },  # can also add other metadata/data
-                )
+                data = {
+                    "index": dataset[i]["index"],
+                    "pdf_name": dataset[i]["pdf_name"],
+                    "pdf_page": dataset[i]["pdf_page"],
+                    "vector": image_embedding,
+                }
 
                 try:
-                    self.qdrant_client.upsert(
-                        collection_name=case_name,
-                        points=[point],
-                        wait=False,
-                    )
+                    tbl.add([data])
                 except Exception as e:
                     logger.error(f"Error during upsert: {e}")
                     continue
                 pbar.update(1)
 
+        tbl.create_index(metric="cosine")
+
         logger.info("Indexing complete!")
         end_time = time.time()
         logger.info(f"done ingest, total time {end_time - start_time}")
 
-
-class SearchClient:
-    def __init__(self, qdrant_uri: str, port: int, https: bool, top_k: int, base_url: str, token: str):
-        self.qdrant_client = QdrantClient(qdrant_uri, port=port, https=https)
-        self.colpali_client = ColPaliClient(base_url=base_url, token=token)
-        self.top_k = top_k
-
-    def search_images_by_text(self, query_text, case_name: str, top_k: int):
+    def search_images_by_text(self, query_text, case_name: str, user_id: str, top_k: int):
         logger.info("start search_images_by_text")
         start_time = time.time()
 
-        # Use ColPaliClient to query text and get the embedding
-        query_embedding = self.colpali_client.query_text(query_text)
+        lance_client = lancedb.connect(f"{self.storage_dir}/{user_id}/{case_name}")
+        tbl = lance_client.open_table(case_name)
 
-        # Extract the embedding from the response
-        multivector_query = query_embedding["embedding"]
-
-        # Search in Qdrant
-        search_result = self.qdrant_client.query_points(collection_name=case_name, query=multivector_query, limit=top_k)
+        query_embedding = self.colpali_client.query_text(query_text)
+        multivector_query = np.array(query_embedding["embedding"])
+        search_result = tbl.search(multivector_query).limit(top_k).select(["index", "pdf_name", "pdf_page"]).to_list()
 
         end_time = time.time()
         logger.info(f"done search_images_by_text, total time {end_time - start_time}")
@@ -156,7 +130,6 @@ def search_images_by_text(self, query_text, case_name: str, top_k: int):
 
     return search_result
 
-
 def call_vllm(image_data: PIL.Image.Image, user_query: str, base_url: str, api_key: str, model: str) -> ImageAnswer:
     logger.info("start call_vllm")
     start_time = time.time()
diff --git a/no-ocr-api/requirements.txt b/no-ocr-api/requirements.txt
index 7b02934..d541715 100644
--- a/no-ocr-api/requirements.txt
+++ b/no-ocr-api/requirements.txt
@@ -1,4 +1,3 @@
-qdrant-client==1.12.1
 datasets==3.1.0
 pdf2image==1.16.3
 pypdf==5.0.1
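
The nested `pa.list_(pa.list_(pa.float32(), 128))` column above is what carries the late-interaction setup over from Qdrant: querying a list-of-vectors column with a 2-D query makes LanceDB rank rows by MaxSim, the role previously played by the removed `MultiVectorComparator.MAX_SIM` config. A self-contained sketch of that query pattern (path and shapes illustrative; a real query embedding comes from the ColPali server):

```python
import lancedb
import numpy as np

db = lancedb.connect("storage/user123/my_case")  # hypothetical case directory
tbl = db.open_table("my_case")

# Stand-in for a ColPali text embedding: one 128-dim vector per query token.
query = np.random.rand(20, 128).astype(np.float32)

# Each page is scored by MaxSim between query vectors and page vectors.
rows = tbl.search(query).limit(3).select(["index", "pdf_name", "pdf_page"]).to_list()
```
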
diff --git a/no-ocr-api/tests/lance_vs_qdrant.py b/no-ocr-api/tests/lance_vs_qdrant.py
new file mode 100644
index 0000000..88557fa
--- /dev/null
+++ b/no-ocr-api/tests/lance_vs_qdrant.py
@@ -0,0 +1,300 @@
+import io
+import json
+import logging
+import time
+
+import lancedb
+import numpy as np
+import pyarrow as pa
+import requests
+from PIL import Image
+from qdrant_client import QdrantClient, models
+from tqdm import tqdm
+
+
+class CustomRailwayLogFormatter(logging.Formatter):
+    def format(self, record):
+        log_record = {
+            "time": self.formatTime(record),
+            "level": record.levelname,
+            "message": record.getMessage()
+        }
+        return json.dumps(log_record)
+
+
+def get_logger():
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    # Remove pre-existing handlers first; creating the StreamHandler before
+    # this loop would let the loop variable shadow it and drop the new handler.
+    for handler in logger.handlers[:]:
+        logger.removeHandler(handler)
+    handler = logging.StreamHandler()
+    formatter = CustomRailwayLogFormatter()
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
+
+
+logger = get_logger()
+
+
+class ColPaliClient:
+    def __init__(self, base_url: str = "http://localhost:8000", token: str = "super-secret-token"):
+        self.base_url = base_url
+        self.headers = {"Authorization": f"Bearer {token}"}
+
+    def query_text(self, query_text: str):
+        response = requests.post(f"{self.base_url}/query", headers=self.headers, params={"query_text": query_text})
+        response.raise_for_status()
+        return response.json()
+
+    def process_image(self, image_path: str):
+        with open(image_path, "rb") as image_file:
+            files = {"image": image_file}
+            response = requests.post(f"{self.base_url}/process_image", files=files, headers=self.headers)
+            response.raise_for_status()
+            return response.json()
+
+    def process_pil_image(self, pil_image):
+        buffered = io.BytesIO()
+        pil_image.save(buffered, format="JPEG")
+        files = {"image": buffered.getvalue()}
+        response = requests.post(f"{self.base_url}/process_image", files=files, headers=self.headers)
+        response.raise_for_status()
+        return response.json()
+
+
+class IngestClientQdrant:
+    def __init__(self, qdrant_uri: str, port: int, https: bool, index_threshold: int, vector_size: int, quantile: float, top_k: int, base_url: str, token: str):
+        self.qdrant_client = QdrantClient(qdrant_uri, port=port, https=https)
+        self.colpali_client = ColPaliClient(base_url, token)
+        self.index_threshold = index_threshold
+        self.vector_size = vector_size
+        self.quantile = quantile
+        self.top_k = top_k
+
+    def ingest(self, case_name, dataset):
+        logger.info("start ingest")
+        start_time = time.time()
+
+        self.qdrant_client.create_collection(
+            collection_name=case_name,
+            on_disk_payload=True,
+            optimizers_config=models.OptimizersConfigDiff(indexing_threshold=self.index_threshold),
+            vectors_config=models.VectorParams(
+                size=self.vector_size,
+                distance=models.Distance.COSINE,
+                multivector_config=models.MultiVectorConfig(comparator=models.MultiVectorComparator.MAX_SIM),
+                quantization_config=models.ScalarQuantization(
+                    scalar=models.ScalarQuantizationConfig(
+                        type=models.ScalarType.INT8,
+                        quantile=self.quantile,
+                        always_ram=True,
+                    ),
+                ),
+            ),
+        )
+
+        # Use tqdm to create a progress bar
+        with tqdm(total=len(dataset), desc="Indexing Progress") as pbar:
+            for i in range(len(dataset)):
+                # The images are already PIL Image objects, so we can use them directly
+                image = dataset[i]["image"]
+
+                # Process and encode image using ColPaliClient
+                response = self.colpali_client.process_pil_image(image)
+                image_embedding = response["embedding"]
+
+                # Prepare point for Qdrant
+                point = models.PointStruct(
+                    id=i,  # we just use the index as the ID
+                    vector=image_embedding,  # This is now a list of vectors
+                    payload={
+                        "index": dataset[i]["index"],
+                        "pdf_name": dataset[i]["pdf_name"],
+                        "pdf_page": dataset[i]["pdf_page"],
+                    },  # can also add other metadata/data
+                )
+
+                try:
+                    self.qdrant_client.upsert(
+                        collection_name=case_name,
+                        points=[point],
+                        wait=False,
+                    )
+                except Exception as e:
+                    logger.error(f"Error during upsert: {e}")
+                    continue
+                pbar.update(1)
+
+        logger.info("Indexing complete!")
+        end_time = time.time()
+        logger.info(f"done ingest, total time {end_time - start_time}")
+
+
+class IngestClientLance:
+    def __init__(self, lance_uri: str, vector_size: int, base_url: str = "http://localhost:8000", token: str = "super-secret-token"):
+        self.lance_client = lancedb.connect(lance_uri)
+        self.vector_size = vector_size
+        self.colpali_client = ColPaliClient(base_url, token)
+
+    def ingest(self, case_name, dataset):
+        logger.info("start ingest")
+        start_time = time.time()
+
+        schema = pa.schema(
+            [
+                pa.field("index", pa.int64()),
+                pa.field("pdf_name", pa.string()),
+                pa.field("pdf_page", pa.int64()),
+                pa.field("vector", pa.list_(pa.list_(pa.float32(), self.vector_size))),
+            ]
+        )
+
+        tbl = self.lance_client.create_table(case_name, schema=schema)
+        with tqdm(total=len(dataset), desc="Indexing Progress") as pbar:
+            for i in range(len(dataset)):
+                image = dataset[i]["image"]
+                response = self.colpali_client.process_pil_image(image)
+                image_embedding = response["embedding"]
+
+                data = {
+                    "index": dataset[i]["index"],
+                    "pdf_name": dataset[i]["pdf_name"],
+                    "pdf_page": dataset[i]["pdf_page"],
+                    "vector": image_embedding,
+                }
+
+                try:
+                    tbl.add([data])
+                except Exception as e:
+                    logger.error(f"Error during upsert: {e}")
+                    continue
+                pbar.update(1)
+
+        tbl.create_index(metric="cosine")
+
+        logger.info("Indexing complete!")
+        end_time = time.time()
+        logger.info(f"done ingest, total time {end_time - start_time}")
+
+    def search_images_by_text(self, query_text, case_name: str, top_k: int):
+        logger.info("start search_images_by_text")
+        start_time = time.time()
+
+        query_embedding = self.colpali_client.query_text(query_text)
+        multivector_query = np.array(query_embedding["embedding"])
+        tbl = self.lance_client.open_table(case_name)
+        search_result = tbl.search(multivector_query).limit(top_k).select(["index", "pdf_name", "pdf_page"]).to_list()
+
+        end_time = time.time()
+        logger.info(f"done search_images_by_text, total time {end_time - start_time}")
+
+        return search_result
+
+
+class SearchClientQdrant:
+    def __init__(self, qdrant_uri: str, port: int, https: bool, top_k: int, base_url: str, token: str):
+        self.qdrant_client = QdrantClient(qdrant_uri, port=port, https=https)
+        self.colpali_client = ColPaliClient(base_url=base_url, token=token)
+        self.top_k = top_k
+
+    def search_images_by_text(self, query_text, case_name: str, top_k: int):
+        logger.info("start search_images_by_text")
+        start_time = time.time()
+
+        # Use ColPaliClient to query text and get the embedding
+        query_embedding = self.colpali_client.query_text(query_text)
+
+        # Extract the embedding from the response
+        multivector_query = query_embedding["embedding"]
+
+        # Search in Qdrant
+        search_result = self.qdrant_client.query_points(collection_name=case_name, query=multivector_query, limit=top_k)
+
+        end_time = time.time()
+        logger.info(f"done search_images_by_text, total time {end_time - start_time}")
+
+        return search_result
+
+
+def benchmark_ingest_clients():
+    # Create mock data
+    mock_data = [
+        {
+            "index": i,
+            "pdf_name": f"document_{i}.pdf",
+            "pdf_page": i % 10,
+            "image": Image.fromarray(np.random.randint(0, 255, (100, 100, 3), dtype=np.uint8))
+        }
+        for i in range(1024)
+    ]
+
+    # Initialize clients
+    qdrant_client = IngestClientQdrant(
+        qdrant_uri="http://localhost",
+        port=6333,
+        https=False,
+        index_threshold=100,
+        vector_size=128,
+        quantile=0.99,
+        top_k=10,
+        base_url="http://localhost:8000",
+        token="super-secret-token"
+    )
+
+    lance_client = IngestClientLance(
+        lance_uri="lance-db-data/qwe123",
+        vector_size=128,
+        base_url="http://localhost:8000",
+        token="super-secret-token"
+    )
+
+    # Benchmark Qdrant ingestion
+    # start_time = time.time()
+    # qdrant_client.ingest("test_case_qdrant", mock_data)
+    # qdrant_duration = time.time() - start_time
+    # logger.info(f"Qdrant ingestion time: {qdrant_duration} seconds")
+
+    # # Benchmark Lance ingestion
+    # start_time = time.time()
+    # lance_client.ingest("test_case_lance", mock_data)
+    # lance_duration = time.time() - start_time
+    # logger.info(f"Lance ingestion time: {lance_duration} seconds")
+
+    # Initialize search clients
+    qdrant_search_client = SearchClientQdrant(
+        qdrant_uri="http://localhost",
+        port=6333,
+        https=False,
+        top_k=10,
+        base_url="http://localhost:8000",
+        token="super-secret-token"
+    )
+
+    # Benchmark Qdrant search
+    start_time = time.time()
+    qdrant_search_client.search_images_by_text("example query", "test_case_qdrant", 10)
+    qdrant_search_duration = time.time() - start_time
+    logger.info(f"Qdrant search time: {qdrant_search_duration} seconds")
+
+    # Benchmark Lance search
+    start_time = time.time()
+    lance_client.search_images_by_text("example query", "test_case_lance", 10)
+    lance_search_duration = time.time() - start_time
+    logger.info(f"Lance search time: {lance_search_duration} seconds")
+
+    return {
+        # "qdrant_ingestion_duration": qdrant_duration,
+        # "lance_ingestion_duration": lance_duration,
+        "qdrant_search_duration": qdrant_search_duration,
+        "lance_search_duration": lance_search_duration
+    }
+
+
+if __name__ == "__main__":
+    benchmark_results = benchmark_ingest_clients()
+    print(benchmark_results)
diff --git a/no-ocr-api/tests/test_api.py b/no-ocr-api/tests/test_api.py
index eaf2a36..191102d 100644
--- a/no-ocr-api/tests/test_api.py
+++ b/no-ocr-api/tests/test_api.py
@@ -95,3 +95,12 @@ def test_end2end(client):
     vllm_result = response.json()
     assert "answer" in vllm_result
     print(f"VLLM result: {vllm_result['answer']}")
+
+    # Step 5: Delete the case
+    print(f"Deleting case '{case_name}' for user '{user_id}'")
+    response = client.delete(f"/delete_case/{case_name}", params={"user_id": user_id})
+    print(f"Response status code for delete_case: {response.status_code}")
+    assert response.status_code == 200
+    delete_result = response.json()
+    assert "message" in delete_result
+    print(f"Delete result: {delete_result['message']}")
diff --git a/no-ocr-ui/src/components/Search.tsx b/no-ocr-ui/src/components/Search.tsx
index e489fe5..e366cbb 100644
--- a/no-ocr-ui/src/components/Search.tsx
+++ b/no-ocr-ui/src/components/Search.tsx
@@ -192,7 +192,7 @@ export default function Search() {
                 />
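
One behavioural consequence worth calling out: `delete_case` no longer needs a vector-store call at all, because the table `SearchClient.ingest` creates lives inside the case directory itself. A sketch of the invariant the new code relies on (path illustrative):

```python
import shutil
from pathlib import Path

case_dir = Path("storage") / "user123" / "my_case"  # STORAGE_DIR/user_id/case_name
# The LanceDB table was created under this directory at ingest time,
# so removing the case directory also drops its vector index with it.
shutil.rmtree(case_dir)
assert not case_dir.exists()
```
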