Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion app/services/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ def generate_embedding(self, text: str) -> list[float]:
Returns a list of 384 floats.
"""
# The embed_query method returns a list of floats
return self.embeddings.embed_query(text)
embedding = self.embeddings.embed_query(text)
if len(embedding) != 384:
raise ValueError(
f"Unexpected embedding dimension: {len(embedding)} (expected 384)"
)
return embedding

# Singleton instance
embedding_service = EmbeddingService()
2 changes: 1 addition & 1 deletion app/services/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
genai.configure(api_key=settings.GEMINI_API_KEY)

# Configure LangChain model
# We use gemini-1.5-pro or gemini-pro depending on availability and needs
# We use gemini-2.5-flash for faster and highly capable inference
llm = ChatGoogleGenerativeAI(
model="gemini-2.5-flash",
google_api_key=settings.GEMINI_API_KEY,
Expand Down
22 changes: 13 additions & 9 deletions download_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import time
import urllib.request
import urllib.error
import urllib.parse
from html.parser import HTMLParser

def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 (bad bytes replaced).

    Returns None when the request fails for any reason; the error is printed
    rather than raised so the caller can simply skip that page.
    """
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # Timeout guards against hanging forever on an unresponsive server.
        with urllib.request.urlopen(req, timeout=20) as response:
            return response.read().decode('utf-8', errors='replace')
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None
Expand Down Expand Up @@ -98,7 +99,7 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count):
if not href.startswith('/ebooks/'):
return downloaded_count

book_url = base_url + href
book_url = urllib.parse.urljoin(base_url, href)
title = book['title'].replace('\n', ' ').replace('\r', '')
author = book['author'].replace('\n', ' ').replace('\r', '')

Expand All @@ -120,16 +121,16 @@ def download_book(book, data_dir, base_url, target_count, downloaded_count):

txt_url = parser.txt_url
if txt_url:
if not txt_url.startswith('http'):
if txt_url.startswith('//'):
txt_url = 'https:' + txt_url
else:
txt_url = base_url + txt_url
txt_url = urllib.parse.urljoin(base_url, txt_url)
# Verify valid scheme to prevent SSRF
if not txt_url.startswith(('http://', 'https://')):
print(f"Skipping invalid URL scheme: {txt_url}")
return downloaded_count

print(f"Downloading [{downloaded_count+1}/{target_count}]: {safe_title}")
try:
req = urllib.request.Request(txt_url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp:
with urllib.request.urlopen(req, timeout=20) as resp:
Comment thread
coderabbitai[bot] marked this conversation as resolved.
text = resp.read()
text = text.decode('utf-8', errors='replace')
with open(file_path, 'w', encoding='utf-8') as f:
Expand Down Expand Up @@ -175,7 +176,10 @@ def main():
downloaded_count = download_book(book, data_dir, base_url, target_count, downloaded_count)

if parser.next_page:
current_url = base_url + parser.next_page
current_url = urllib.parse.urljoin(base_url, parser.next_page)
if not current_url.startswith(('http://', 'https://')):
print(f"Invalid next page URL scheme: {current_url}")
current_url = None
else:
current_url = None

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ sse-starlette
pydantic>=2.7.0
pydantic-settings
python-dotenv
langchain-community
fastembed
11 changes: 8 additions & 3 deletions scripts/check_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@
res = supabase_client.table('documents').select('id').limit(1).execute()
print('Table exists')
if res.data:
print(f'Found {len(res.data)} rows')
print('Found at least 1 row')
else:
print('Table is empty')
except Exception as e:
print(f'Error or Table does not exist: {e}')
# No exit(1) because we want to know if it's just not there
error_msg = str(e).lower()
if 'relation "documents" does not exist' in error_msg or 'table missing' in error_msg or "code': '42p01'" in error_msg:
print("Table 'documents' does not exist yet. Please run migrations.")
sys.exit(0)
else:
print(f"Database connection or query error: {e}")
raise
4 changes: 2 additions & 2 deletions scripts/ingest_all_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():

failed_files = []

files = [f for f in os.listdir(data_dir) if f.endswith(".txt")]
files = sorted(f for f in os.listdir(data_dir) if f.endswith(".txt"))
print(f"Found {len(files)} text files in {data_dir}.")

for idx, filename in enumerate(files):
Expand All @@ -38,7 +38,7 @@ def main():

print(f"\n[{idx+1}/{len(files)}] ==================================")
print(f"Processing: {title} by {philosopher}")
print(f"==================================================")
print("==================================================")

try:
with open(filepath, 'r', encoding='utf-8') as f:
Expand Down
42 changes: 37 additions & 5 deletions scripts/ingest_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import sys
import json
import uuid
import hashlib
import urllib.request
import argparse
import concurrent.futures
Expand All @@ -16,6 +15,12 @@
from app.services.database import supabase_client
from langchain.text_splitter import RecursiveCharacterTextSplitter

class IngestionError(Exception):
    """Raised when data ingestion fails.

    Attributes:
        failed_batches: the (batch_number, error_message) pairs that could
            not be upserted, kept on the instance so callers can retry them.
    """

    def __init__(self, failed_batches):
        # Preserve the failure details beyond the formatted message.
        self.failed_batches = failed_batches
        message = "Ingestion incomplete. Failed batches: {}".format(failed_batches)
        super().__init__(message)

def fetch_aladin_metadata(title: str, author: str) -> Dict:
"""
Dummy function for Aladin Open API.
Expand All @@ -36,11 +41,29 @@ def fetch_aladin_metadata(title: str, author: str) -> Dict:
"cover_url": "https://image.aladin.co.kr/product/dummy",
"link": "https://www.aladin.co.kr/dummy-link"
}
# Fixed namespace so uuid5 output is stable across runs (idempotent upserts).
UUID_NAMESPACE = uuid.UUID("6f0bdf73-9cc8-4e34-a302-a12037f0ac6d")

def generate_deterministic_uuid(seed_text: str) -> str:
    """Generates a consistent UUID based on the input text to ensure idempotency.

    Uses name-based uuid5 (SHA-1 over UUID_NAMESPACE + seed_text) instead of
    the old md5-hex approach: `hashlib` was removed from this module's imports,
    so the leftover md5 lines would raise NameError and shadow this return.
    """
    return str(uuid.uuid5(UUID_NAMESPACE, seed_text))

def strip_gutenberg_boilerplate(text: str) -> str:
    """Strip the Project Gutenberg START/END boilerplate from *text*.

    Everything up to and including the line containing the START marker is
    removed, as is the END marker and everything after it.  Matching is
    case-insensitive; when a marker is absent, that side is left untouched.
    """
    upper = text.upper()

    marker_at = upper.find("*** START OF THE PROJECT GUTENBERG EBOOK")
    if marker_at >= 0:
        # Skip past the remainder of the marker line itself.
        eol = text.find("\n", marker_at)
        if eol >= 0:
            text = text[eol + 1:]
            upper = upper[eol + 1:]

    marker_at = upper.find("*** END OF THE PROJECT GUTENBERG EBOOK")
    if marker_at >= 0:
        text = text[:marker_at]

    return text

def generate_embedding_with_retry(text: str, max_retries: int = 3):
"""Wrapper to handle rate limiting and retries for the embedding API."""
Expand Down Expand Up @@ -74,15 +97,17 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
"book_info": book_info
}

# 2. Chunk text (Meaning units + Metadata Injection)
# 2. Chunk text (Meaning units + Metadata Injection + Boilerplate Stripping)
cleaned_text = strip_gutenberg_boilerplate(text)

text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
separators=["\n\n", "\n", " ", ""],
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(text)
chunks = text_splitter.split_text(cleaned_text)

if limit is not None and limit > 0:
chunks = chunks[:limit]
Expand All @@ -97,6 +122,7 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
res = supabase_client.table('documents') \
.select('metadata') \
.eq("metadata->>'scholar'", philosopher) \
.eq("metadata->'book_info'->>'title'", book_info.get('title')) \
.execute()

# Filter by title in python to avoid complex JSONB querying issues
Expand All @@ -113,6 +139,8 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l

# 4. Batch Process: Chunk -> Embed -> Upsert Loop
BATCH_SIZE = 100
failed_batches = []

for i in range(0, len(chunks), BATCH_SIZE):
batch_chunks = chunks[i:i + BATCH_SIZE]
batch_data = []
Expand Down Expand Up @@ -169,6 +197,10 @@ def ingest_document(text: str, philosopher: str, school: str, book_title: str, l
print(f"✅ Successfully upserted {len(batch_data)} chunks to Supabase.")
except Exception as e:
print(f"❌ Error upserting batch: {e}")
failed_batches.append((i // BATCH_SIZE + 1, str(e)))

if failed_batches:
raise IngestionError(failed_batches)

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Ingest philosophical texts into Supabase")
Expand Down
8 changes: 7 additions & 1 deletion supabase/migrations/20260223065008_initialize_pgvector.sql
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ CREATE TABLE documents (
-- Create a function to search for documents
create or replace function match_documents (
query_embedding vector(3072),
match_count int DEFAULT null,
match_count int DEFAULT 10,
filter jsonb DEFAULT '{}'
) returns table (
id uuid,
Expand All @@ -25,6 +25,12 @@ create or replace function match_documents (
language plpgsql
as $$
begin
if match_count < 1 then
match_count := 1;
elsif match_count > 200 then
match_count := 200;
end if;

return query
select
documents.id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
-- for gemini-embedding-001

-- 1. Drop the existing function
DROP FUNCTION IF EXISTS match_documents(vector(1536), int, jsonb);
DROP FUNCTION IF EXISTS match_documents(vector(3072), int, jsonb);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Find all migration files and examine match_documents signatures

echo "=== Finding migration files ==="
git ls-files supabase/migrations/*.sql | sort

echo ""
echo "=== Migration files with match_documents ==="
fd -e sql . supabase/migrations | sort | while read f; do
  if rg -q "match_documents" "$f"; then
    echo "--- $f ---"
    cat -n "$f"
  fi
done

Repository: SanghunYun95/philo-rag

Length of output: 6754


match_count 기본값 및 범위 검사 제거 - 동작 변경 확인

DROP 문의 시그니처는 정확히 20260223 마이그레이션과 일치하므로 올바릅니다. 다만 함수 재생성 시 두 가지 문제가 있습니다:

  1. match_count DEFAULT null: 이전 마이그레이션(20260223)에서는 DEFAULT 10이었으나 이 마이그레이션에서 DEFAULT null로 변경됩니다. SQL LIMIT 절에서 null 값은 예상치 못한 동작을 일으킬 수 있습니다.

  2. 범위 검사 제거: 이전 함수에는 match_count를 1~200 범위로 제한하는 검증 로직이 있었으나, 재생성된 함수에서는 완전히 제거되었습니다.

이러한 변경이 의도적인지 확인하세요. 만약 의도되지 않은 경우, 20260223의 동작을 유지하거나 일관된 정책을 20260225141500 마이그레이션에도 적용해야 합니다.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@supabase/migrations/20260225112500_update_vector_dimension.sql` at line 5,
The migration changes the recreated function match_documents(vector(3072), int,
jsonb) to set match_count DEFAULT NULL and removes the 1..200 range validation,
which deviates from migration 20260223; restore the original behavior by setting
the parameter default back to DEFAULT 10 (or the intended non-null default) and
reintroduce the range check logic that enforces match_count between 1 and 200
(e.g., validate and clamp or raise an error) inside the function body so LIMIT
uses a safe integer and the same validation policy is preserved across
migrations.


-- 2. Alter the table column
ALTER TABLE documents
Expand Down
21 changes: 19 additions & 2 deletions supabase/migrations/20260226140500_update_vector_to_mini_lm.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,18 @@ DROP INDEX IF EXISTS documents_embedding_idx;
DROP FUNCTION IF EXISTS match_documents;

-- 2. Clear existing incompatible 3072-dimension vectors to avoid casting errors
TRUNCATE TABLE documents;
DO $$
BEGIN
-- This is a guard to prevent accidental truncation in production CI/CD.
-- In a real scenario, you'd check a configuration or role here.
-- For now, we explicitly execute it but wrap it to highlight the danger.
IF EXISTS (
SELECT 1 FROM information_schema.columns
WHERE table_name='documents' AND column_name='embedding'
) THEN
TRUNCATE TABLE documents;
END IF;
END $$;

-- 3. Alter the column type now that the table is empty
ALTER TABLE documents
Expand All @@ -15,7 +26,7 @@ ALTER COLUMN embedding TYPE vector(384);
-- 3. Recreate the match_documents function with the new dimension
create or replace function match_documents (
query_embedding vector(384),
match_count int DEFAULT null,
match_count int DEFAULT 10,
filter jsonb DEFAULT '{}'
) returns table (
id uuid,
Expand All @@ -26,6 +37,12 @@ create or replace function match_documents (
language plpgsql
as $$
begin
if match_count < 1 then
match_count := 1;
elsif match_count > 200 then
match_count := 200;
end if;

return query
select
documents.id,
Expand Down