diff --git a/app/services/embedding.py b/app/services/embedding.py index b148316..51ba24a 100644 --- a/app/services/embedding.py +++ b/app/services/embedding.py @@ -19,7 +19,12 @@ def generate_embedding(self, text: str) -> list[float]: Returns a list of 384 floats. """ # The embed_query method returns a list of floats - return self.embeddings.embed_query(text) + embedding = self.embeddings.embed_query(text) + if len(embedding) != 384: + raise ValueError( + f"Unexpected embedding dimension: {len(embedding)} (expected 384)" + ) + return embedding # Singleton instance embedding_service = EmbeddingService() diff --git a/app/services/llm.py b/app/services/llm.py index 51b2894..dc15573 100644 --- a/app/services/llm.py +++ b/app/services/llm.py @@ -8,7 +8,7 @@ genai.configure(api_key=settings.GEMINI_API_KEY) # Configure LangChain model -# We use gemini-1.5-pro or gemini-pro depending on availability and needs +# We use gemini-2.5-flash for faster and highly capable inference llm = ChatGoogleGenerativeAI( model="gemini-2.5-flash", google_api_key=settings.GEMINI_API_KEY, diff --git a/download_books.py b/download_books.py index fb5ef29..4347927 100644 --- a/download_books.py +++ b/download_books.py @@ -3,12 +3,13 @@ import time import urllib.request import urllib.error +import urllib.parse from html.parser import HTMLParser def get_html(url): req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'}) try: - with urllib.request.urlopen(req) as response: + with urllib.request.urlopen(req, timeout=20) as response: return response.read().decode('utf-8', errors='replace') except Exception as e: print(f"Error fetching {url}: {e}") @@ -98,7 +99,7 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count): if not href.startswith('/ebooks/'): return downloaded_count - book_url = base_url + href + book_url = urllib.parse.urljoin(base_url, href) title = book['title'].replace('\n', ' ').replace('\r', '') author = book['author'].replace('\n', ' 
').replace('\r', '') @@ -120,16 +121,16 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count): txt_url = parser.txt_url if txt_url: - if not txt_url.startswith('http'): - if txt_url.startswith('//'): - txt_url = 'https:' + txt_url - else: - txt_url = base_url + txt_url + txt_url = urllib.parse.urljoin(base_url, txt_url) + # Verify valid scheme to prevent SSRF + if not txt_url.startswith(('http://', 'https://')): + print(f"Skipping invalid URL scheme: {txt_url}") + return downloaded_count print(f"Downloading [{downloaded_count+1}/{target_count}]: {safe_title}") try: req = urllib.request.Request(txt_url, headers={'User-Agent': 'Mozilla/5.0'}) - with urllib.request.urlopen(req) as resp: + with urllib.request.urlopen(req, timeout=20) as resp: text = resp.read() text = text.decode('utf-8', errors='replace') with open(file_path, 'w', encoding='utf-8') as f: @@ -175,7 +176,10 @@ def main(): downloaded_count = download_book(book, data_dir, base_url, target_count, downloaded_count) if parser.next_page: - current_url = base_url + parser.next_page + current_url = urllib.parse.urljoin(base_url, parser.next_page) + if not current_url.startswith(('http://', 'https://')): + print(f"Invalid next page URL scheme: {current_url}") + current_url = None else: current_url = None diff --git a/requirements.txt b/requirements.txt index 616e26a..42046cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ sse-starlette pydantic>=2.7.0 pydantic-settings python-dotenv +langchain-community fastembed diff --git a/scripts/check_db.py b/scripts/check_db.py index 4496950..e2caf81 100644 --- a/scripts/check_db.py +++ b/scripts/check_db.py @@ -6,9 +6,14 @@ res = supabase_client.table('documents').select('id').limit(1).execute() print('Table exists') if res.data: - print(f'Found {len(res.data)} rows') + print('Found at least 1 row') else: print('Table is empty') except Exception as e: - print(f'Error or Table does not exist: {e}') - # No exit(1) because we 
want to know if it's just not there + error_msg = str(e).lower() + if 'relation "documents" does not exist' in error_msg or 'table missing' in error_msg or "code': '42p01'" in error_msg: + print("Table 'documents' does not exist yet. Please run migrations.") + sys.exit(0) + else: + print(f"Database connection or query error: {e}") + raise diff --git a/scripts/ingest_all_data.py b/scripts/ingest_all_data.py index 6301f34..1ca4e55 100644 --- a/scripts/ingest_all_data.py +++ b/scripts/ingest_all_data.py @@ -17,7 +17,7 @@ def main(): failed_files = [] - files = [f for f in os.listdir(data_dir) if f.endswith(".txt")] + files = sorted(f for f in os.listdir(data_dir) if f.endswith(".txt")) print(f"Found {len(files)} text files in {data_dir}.") for idx, filename in enumerate(files): @@ -38,7 +38,7 @@ def main(): print(f"\n[{idx+1}/{len(files)}] ==================================") print(f"Processing: {title} by {philosopher}") - print(f"==================================================") + print("==================================================") try: with open(filepath, 'r', encoding='utf-8') as f: diff --git a/scripts/ingest_data.py b/scripts/ingest_data.py index 9ce8213..72fb863 100644 --- a/scripts/ingest_data.py +++ b/scripts/ingest_data.py @@ -2,7 +2,6 @@ import sys import json import uuid -import hashlib import urllib.request import argparse import concurrent.futures @@ -16,6 +15,12 @@ from app.services.database import supabase_client from langchain.text_splitter import RecursiveCharacterTextSplitter +class IngestionError(Exception): + """Raised when data ingestion fails.""" + def __init__(self, failed_batches): + self.failed_batches = failed_batches + super().__init__(f"Ingestion incomplete. Failed batches: {failed_batches}") + def fetch_aladin_metadata(title: str, author: str) -> Dict: """ Dummy function for Aladin Open API. 
@@ -36,11 +41,29 @@ def fetch_aladin_metadata(title: str, author: str) -> Dict: "cover_url": "https://image.aladin.co.kr/product/dummy", "link": "https://www.aladin.co.kr/dummy-link" } +UUID_NAMESPACE = uuid.UUID("6f0bdf73-9cc8-4e34-a302-a12037f0ac6d") def generate_deterministic_uuid(seed_text: str) -> str: """Generates a consistent UUID based on the input text to ensure idempotency.""" - hash_obj = hashlib.md5(seed_text.encode('utf-8')) - return str(uuid.UUID(hash_obj.hexdigest())) + return str(uuid.uuid5(UUID_NAMESPACE, seed_text)) + +def strip_gutenberg_boilerplate(text: str) -> str: + """Removes Project Gutenberg START and END identifiers from the text.""" + start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK" + end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK" + + start_idx = text.find(start_marker) + if start_idx != -1: + # Move past the marker line + newline_idx = text.find("\n", start_idx) + if newline_idx != -1: + text = text[newline_idx+1:] + + end_idx = text.find(end_marker) + if end_idx != -1: + text = text[:end_idx] + + return text def generate_embedding_with_retry(text: str, max_retries: int = 3): """Wrapper to handle rate limiting and retries for the embedding API.""" @@ -74,7 +97,9 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l "book_info": book_info } - # 2. 
Chunk text (Meaning units + Metadata Injection + Boilerplate Stripping) + cleaned_text = strip_gutenberg_boilerplate(text) + text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, @@ -82,7 +107,7 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l length_function=len, is_separator_regex=False, ) - chunks = text_splitter.split_text(text) + chunks = text_splitter.split_text(cleaned_text) if limit is not None and limit > 0: chunks = chunks[:limit] @@ -97,6 +122,7 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l res = supabase_client.table('documents') \ .select('metadata') \ .eq("metadata->>'scholar'", philosopher) \ + .eq("metadata->'book_info'->>'title'", book_info.get('title')) \ .execute() # Filter by title in python to avoid complex JSONB querying issues @@ -113,6 +139,8 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l # 4. Batch Process: Chunk -> Embed -> Upsert Loop BATCH_SIZE = 100 + failed_batches = [] + for i in range(0, len(chunks), BATCH_SIZE): batch_chunks = chunks[i:i + BATCH_SIZE] batch_data = [] @@ -169,6 +197,10 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l print(f"✅ Successfully upserted {len(batch_data)} chunks to Supabase.") except Exception as e: print(f"❌ Error upserting batch: {e}") + failed_batches.append((i // BATCH_SIZE + 1, str(e))) + + if failed_batches: + raise IngestionError(failed_batches) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Ingest philosophical texts into Supabase") diff --git a/supabase/migrations/20260223065008_initialize_pgvector.sql b/supabase/migrations/20260223065008_initialize_pgvector.sql index 9f34cde..98bacef 100644 --- a/supabase/migrations/20260223065008_initialize_pgvector.sql +++ b/supabase/migrations/20260223065008_initialize_pgvector.sql @@ -14,7 +14,7 @@ CREATE TABLE documents ( -- Create a function to search for 
documents create or replace function match_documents ( query_embedding vector(3072), - match_count int DEFAULT null, + match_count int DEFAULT 10, filter jsonb DEFAULT '{}' ) returns table ( id uuid, @@ -25,6 +25,12 @@ create or replace function match_documents ( language plpgsql as $$ begin + if match_count < 1 then + match_count := 1; + elsif match_count > 200 then + match_count := 200; + end if; + return query select documents.id, diff --git a/supabase/migrations/20260225112500_update_vector_dimension.sql b/supabase/migrations/20260225112500_update_vector_dimension.sql index c50f44a..64d596d 100644 --- a/supabase/migrations/20260225112500_update_vector_dimension.sql +++ b/supabase/migrations/20260225112500_update_vector_dimension.sql @@ -2,7 +2,7 @@ -- for gemini-embedding-001 -- 1. Drop the existing function -DROP FUNCTION IF EXISTS match_documents(vector(1536), int, jsonb); +DROP FUNCTION IF EXISTS match_documents(vector(3072), int, jsonb); -- 2. Alter the table column ALTER TABLE documents diff --git a/supabase/migrations/20260226140500_update_vector_to_mini_lm.sql b/supabase/migrations/20260226140500_update_vector_to_mini_lm.sql index 8ba0a43..a272068 100644 --- a/supabase/migrations/20260226140500_update_vector_to_mini_lm.sql +++ b/supabase/migrations/20260226140500_update_vector_to_mini_lm.sql @@ -6,7 +6,18 @@ DROP INDEX IF EXISTS documents_embedding_idx; DROP FUNCTION IF EXISTS match_documents; -- 2. Clear existing incompatible 3072-dimension vectors to avoid casting errors -TRUNCATE TABLE documents; +DO $$ +BEGIN + -- This is a guard to prevent accidental truncation in production CI/CD. + -- In a real scenario, you'd check a configuration or role here. + -- For now, we explicitly execute it but wrap it to highlight the danger. + IF EXISTS ( + SELECT 1 FROM information_schema.columns + WHERE table_name='documents' AND column_name='embedding' + ) THEN + TRUNCATE TABLE documents; + END IF; +END $$; -- 3. 
Alter the column type now that the table is empty ALTER TABLE documents @@ -15,7 +26,7 @@ ALTER COLUMN embedding TYPE vector(384); -- 3. Recreate the match_documents function with the new dimension create or replace function match_documents ( query_embedding vector(384), - match_count int DEFAULT null, + match_count int DEFAULT 10, filter jsonb DEFAULT '{}' ) returns table ( id uuid, @@ -26,6 +37,12 @@ create or replace function match_documents ( language plpgsql as $$ begin + if match_count < 1 then + match_count := 1; + elsif match_count > 200 then + match_count := 200; + end if; + return query select documents.id,