Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion app/services/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ def generate_embedding(self, text: str) -> list[float]:
Returns a list of 384 floats.
"""
# The embed_query method returns a list of floats
return self.embeddings.embed_query(text)
embedding = self.embeddings.embed_query(text)
if len(embedding) != 384:
raise ValueError(
f"Unexpected embedding dimension: {len(embedding)} (expected 384)"
)
return embedding

# Singleton instance
embedding_service = EmbeddingService()
2 changes: 1 addition & 1 deletion app/services/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
genai.configure(api_key=settings.GEMINI_API_KEY)

# Configure LangChain model
# We use gemini-1.5-pro or gemini-pro depending on availability and needs
# We use gemini-2.5-flash for faster and highly capable inference
llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash",
google_api_key=settings.GEMINI_API_KEY,
Expand Down
22 changes: 13 additions & 9 deletions download_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser

def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 (bad bytes replaced).

    Returns None when the request fails for any reason; the error is printed
    rather than raised so the caller can simply skip that page.
    """
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # Timeout guards against hanging forever on an unresponsive server.
        with urllib.request.urlopen(req, timeout=20) as response:
            return response.read().decode('utf-8', errors='replace')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
Expand Down Expand Up @@ -98,7 +99,7 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count):
if not href.startswith('/ebooks/'):
return downloaded_count

book_url = base_url + href
book_url = urllib.parse.urljoin(base_url, href)
title = book['title'].replace('\n', ' ').replace('\r', '')
author = book['author'].replace('\n', ' ').replace('\r', '')

Expand All @@ -120,16 +121,16 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count):

txt_url = parser.txt_url
if txt_url:
if not txt_url.startswith('http'):
if txt_url.startswith('//'):
txt_url = 'https:' + txt_url
else:
txt_url = base_url + txt_url
txt_url = urllib.parse.urljoin(base_url, txt_url)
# Verify valid scheme to prevent SSRF
if not txt_url.startswith(('http://', 'https://')):
print(f"Skipping invalid URL scheme: {txt_url}")
return downloaded_count

print(f"Downloading [{downloaded_count+1}/{target_count}]: {safe_title}")
try:
req = urllib.request.Request(txt_url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
with urllib.request.urlopen(req, timeout=20) as resp:
Comment thread
coderabbitai[bot] marked this conversation as resolved.
text = resp.read()
text = text.decode('utf-8', errors='replace')
with open(file_path, 'w', encoding='utf-8') as f:
Expand Down Expand Up @@ -175,7 +176,10 @@ def main():
downloaded_count = download_book(book, data_dir, base_url, target_count, downloaded_count)

if parser.next_page:
current_url = base_url + parser.next_page
current_url = urllib.parse.urljoin(base_url, parser.next_page)
if not current_url.startswith(('http://', 'https://')):
print(f"Invalid next page URL scheme: {current_url}")
current_url = None
else:
current_url = None

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ sse-starlette
pydantic>=2.7.0
pydantic-settings
python-dotenv
langchain-community
fastembed
11 changes: 8 additions & 3 deletions scripts/check_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@
res = supabase_client.table('documents').select('id').limit(1).execute()
print('Table exists')
if res.data:
print(f'Found {len(res.data)} rows')
print('Found at least 1 row')
else:
print('Table is empty')
except Exception as e:
print(f'Error or Table does not exist: {e}')
# No exit(1) because we want to know if it's just not there
error_msg = str(e).lower()
if 'relation "documents" does not exist' in error_msg or 'table missing' in error_msg or "code': '42p01'" in error_msg:
print("Table 'documents' does not exist yet. Please run migrations.")
sys.exit(0)
else:
print(f"Database connection or query error: {e}")
raise
4 changes: 2 additions & 2 deletions scripts/ingest_all_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():

failed_files = []

files = [f for f in os.listdir(data_dir) if f.endswith(".txt")]
files = sorted(f for f in os.listdir(data_dir) if f.endswith(".txt"))
print(f"Found {len(files)} text files in {data_dir}.")

for idx, filename in enumerate(files):
Expand All @@ -38,7 +38,7 @@ def main():

print(f"\n[{idx+1}/{len(files)}] ==================================")
print(f"Processing: {title} by {philosopher}")
print(f"==================================================")
print("==================================================")

try:
with open(filepath, 'r', encoding='utf-8') as f:
Expand Down
42 changes: 37 additions & 5 deletions scripts/ingest_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
import json
import uuid
import hashlib
import urllib.request
import argparse
import concurrent.futures
Expand All @@ -16,6 +15,12 @@
from app.services.database import supabase_client
from langchain.text_splitter import RecursiveCharacterTextSplitter

class IngestionError(Exception):
    """Raised when data ingestion fails.

    Attributes:
        failed_batches: the (batch_number, error_message) pairs that could
            not be upserted, kept on the instance so callers can retry them.
    """

    def __init__(self, failed_batches):
        # Preserve the failure details beyond the formatted message.
        self.failed_batches = failed_batches
        message = "Ingestion incomplete. Failed batches: {}".format(failed_batches)
        super().__init__(message)

def fetch_aladin_metadata(title: str, author: str) -> Dict:
"""
Dummy function for Aladin Open API.
Expand All @@ -36,11 +41,29 @@ def fetch_aladin_metadata(title: str, author: str) -> Dict:
"cover_url": "https://image.aladin.co.kr/product/dummy",
"link": "https://www.aladin.co.kr/dummy-link"
}
# Fixed namespace so uuid5 output is stable across runs (idempotent upserts).
UUID_NAMESPACE = uuid.UUID("6f0bdf73-9cc8-4e34-a302-a12037f0ac6d")

def generate_deterministic_uuid(seed_text: str) -> str:
    """Generates a consistent UUID based on the input text to ensure idempotency.

    Uses name-based uuid5 (SHA-1 over UUID_NAMESPACE + seed_text) instead of
    the old md5-hex approach: `hashlib` was removed from this module's imports,
    so the leftover md5 lines would raise NameError and shadow this return.
    """
    return str(uuid.uuid5(UUID_NAMESPACE, seed_text))

def strip_gutenberg_boilerplate(text: str) -> str:
    """Strip the Project Gutenberg START/END boilerplate from *text*.

    Everything up to and including the line containing the START marker is
    removed, as is the END marker and everything after it.  Matching is
    case-insensitive; when a marker is absent, that side is left untouched.
    """
    upper = text.upper()

    marker_at = upper.find("*** START OF THE PROJECT GUTENBERG EBOOK")
    if marker_at >= 0:
        # Skip past the remainder of the marker line itself.
        eol = text.find("\n", marker_at)
        if eol >= 0:
            text = text[eol + 1:]
            upper = upper[eol + 1:]

    marker_at = upper.find("*** END OF THE PROJECT GUTENBERG EBOOK")
    if marker_at >= 0:
        text = text[:marker_at]

    return text

def generate_embedding_with_retry(text: str, max_retries: int = 3):
"""Wrapper to handle rate limiting and retries for the embedding API."""
Expand Down Expand Up @@ -74,15 +97,17 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
"book_info": book_info
}

# 2. Chunk text (Meaning units + Metadata Injection)
# 2. Chunk text (Meaning units + Metadata Injection + Boilerplate Stripping)
cleaned_text = strip_gutenberg_boilerplate(text)

text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
separators=["\n\n", "\n", " ", ""],
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(text)
chunks = text_splitter.split_text(cleaned_text)

if limit is not None and limit > 0:
chunks = chunks[:limit]
Expand All @@ -97,6 +122,7 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
res = supabase_client.table('documents') \
.select('metadata') \
.eq("metadata->>'scholar'", philosopher) \
.eq("metadata->'book_info'->>'title'", book_info.get('title')) \
.execute()

# Filter by title in python to avoid complex JSONB querying issues
Expand All @@ -113,6 +139,8 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l

# 4. Batch Process: Chunk -> Embed -> Upsert Loop
BATCH_SIZE = 100
failed_batches = []

for i in range(0, len(chunks), BATCH_SIZE):
batch_chunks = chunks[i:i + BATCH_SIZE]
batch_data = []
Expand Down Expand Up @@ -169,6 +197,10 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
print(f"✅ Successfully upserted {len(batch_data)} chunks to Supabase.")
except Exception as e:
print(f"❌ Error upserting batch: {e}")
failed_batches.append((i // BATCH_SIZE + 1, str(e)))

if failed_batches:
raise IngestionError(failed_batches)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Ingest philosophical texts into Supabase")
Expand Down
8 changes: 7 additions & 1 deletion supabase/migrations/20260223065008_initialize_pgvector.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ CREATE TABLE documents (
-- Create a function to search for documents
create or replace function match_documents (
query_embedding vector(3072),
match_count int DEFAULT null,
match_count int DEFAULT 10,
filter jsonb DEFAULT '{}'
) returns table (
id uuid,
Expand All @@ -25,6 +25,12 @@ create or replace function match_documents (
language plpgsql
as $$
begin
if match_count < 1 then
match_count := 1;
elsif match_count > 200 then
match_count := 200;
end if;

return query
select
documents.id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
-- for gemini-embedding-001

-- 1. Drop the existing function
DROP FUNCTION IF EXISTS match_documents(vector(1536), int, jsonb);
DROP FUNCTION IF EXISTS match_documents(vector(3072), int, jsonb);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find all migration files and examine match_documents signatures

echo "=== Finding migration files ==="
git ls-files supabase/migrations/*.sql | sort

echo ""
echo "=== Migration files with match_documents ==="
fd -e sql . supabase/migrations | sort | while read f; do
  if rg -q "match_documents" "$f"; then
    echo "--- $f ---"
    cat -n "$f"
  fi
done

Repository: SanghunYun95/philo-rag

Length of output: 6754


match_count 기본값 및 범위 검사 제거 - 동작 변경 확인

DROP 문의 시그니처는 정확히 20260223 마이그레이션과 일치하므로 올바릅니다. 다만 함수 재생성 시 두 가지 문제가 있습니다:

  1. match_count DEFAULT null: 이전 마이그레이션(20260223)에서는 DEFAULT 10이었으나 이 마이그레이션에서 DEFAULT null로 변경됩니다. SQL LIMIT 절에서 null 값은 예상치 못한 동작을 일으킬 수 있습니다.

  2. 범위 검사 제거: 이전 함수에는 match_count를 1~200 범위로 제한하는 검증 로직이 있었으나, 재생성된 함수에서는 완전히 제거되었습니다.

이러한 변경이 의도적인지 확인하세요. 만약 의도되지 않은 경우, 20260223의 동작을 유지하거나 일관된 정책을 20260225141500 마이그레이션에도 적용해야 합니다.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@supabase/migrations/20260225112500_update_vector_dimension.sql` at line 5,
The migration changes the recreated function match_documents(vector(3072), int,
jsonb) to set match_count DEFAULT NULL and removes the 1..200 range validation,
which deviates from migration 20260223; restore the original behavior by setting
the parameter default back to DEFAULT 10 (or the intended non-null default) and
reintroduce the range check logic that enforces match_count between 1 and 200
(e.g., validate and clamp or raise an error) inside the function body so LIMIT
uses a safe integer and the same validation policy is preserved across
migrations.


-- 2. Alter the table column
ALTER TABLE documents
Expand Down
21 changes: 19 additions & 2 deletions supabase/migrations/20260226140500_update_vector_to_mini_lm.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,18 @@ DROP INDEX IF EXISTS documents_embedding_idx;
DROP FUNCTION IF EXISTS match_documents;

-- 2. Clear existing incompatible 3072-dimension vectors to avoid casting errors
TRUNCATE TABLE documents;
DO $$
BEGIN
-- This is a guard to prevent accidental truncation in production CI/CD.
-- In a real scenario, you'd check a configuration or role here.
-- For now, we explicitly execute it but wrap it to highlight the danger.
IF EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='documents' AND column_name='embedding'
) THEN
TRUNCATE TABLE documents;
END IF;
END $$;

-- 3. Alter the column type now that the table is empty
ALTER TABLE documents
Expand All @@ -15,7 +26,7 @@ ALTER COLUMN embedding TYPE vector(384);
-- 3. Recreate the match_documents function with the new dimension
create or replace function match_documents (
query_embedding vector(384),
match_count int DEFAULT null,
match_count int DEFAULT 10,
filter jsonb DEFAULT '{}'
) returns table (
id uuid,
Expand All @@ -26,6 +37,12 @@ create or replace function match_documents (
language plpgsql
as $$
begin
if match_count < 1 then
match_count := 1;
elsif match_count > 200 then
match_count := 200;
end if;

return query
select
documents.id,
Expand Down