From f50fccee3586fe1d99ff57c69c5be9fc8ae572e4 Mon Sep 17 00:00:00 2001 From: HillviewCap Date: Tue, 1 Apr 2025 14:31:44 -0400 Subject: [PATCH 1/5] fix: Pass embedding config to processing script --- streamlit_pages/documentation.py | 54 +++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/streamlit_pages/documentation.py b/streamlit_pages/documentation.py index ccfd793c2e..25556a7256 100644 --- a/streamlit_pages/documentation.py +++ b/streamlit_pages/documentation.py @@ -273,6 +273,28 @@ def update_progress(status): # Use a unique key for the status context if needed, or rely on rerun clearing it with process_placeholder.status(f"Processing uploaded file: {st.session_state.uploaded_file_original_name}...", expanded=True) as status: try: + # --- START MODIFICATION: Pass Environment --- + current_env = os.environ.copy() + llm_provider = get_env_var("LLM_PROVIDER", "openai") + current_env["LLM_PROVIDER"] = llm_provider + current_env["EMBEDDING_MODEL"] = get_env_var("EMBEDDING_MODEL", "") + current_env["LLM_API_KEY"] = get_env_var("LLM_API_KEY", "") + + if llm_provider.lower() != "openai": + current_env["LLM_BASE_URL"] = get_env_var("LLM_BASE_URL", "") + elif "LLM_BASE_URL" in current_env: + del current_env["LLM_BASE_URL"] + + current_env["SUPABASE_URL"] = get_env_var("SUPABASE_URL", "") + current_env["SUPABASE_SERVICE_KEY"] = get_env_var("SUPABASE_SERVICE_KEY", "") + current_env["DOCS_RETRIEVAL_TABLE"] = get_env_var("DOCS_RETRIEVAL_TABLE", "") + + status.write(f"Using Embedding Model: {current_env.get('EMBEDDING_MODEL', 'N/A')}") + status.write(f"Using LLM Provider: {current_env.get('LLM_PROVIDER', 'N/A')}") + if "LLM_BASE_URL" in current_env: + status.write(f"Using Base URL: {current_env.get('LLM_BASE_URL')}") + # --- END MODIFICATION --- + status.write(f"Running command: `{' '.join(command)}`") result = subprocess.run( command, @@ -280,7 +302,8 @@ def update_progress(status): text=True, check=False, # Handle non-zero exit codes manually encoding='utf-8', - errors='replace' # Handle potential encoding issues in output + errors='replace', # Handle potential encoding issues in output + env=current_env # Pass the modified environment ) # Store output/error in session state for display later @@ -403,15 +426,38 @@ def sanitize_filename(source_identifier: str) -> str: if command and persistent_file_path: with process_placeholder.status("Processing documentation...", expanded=True) as status: try: - st.write(f"Running command: `{' '.join(command)}`") - # Use subprocess.run + # --- START MODIFICATION: Pass Environment --- + current_env = os.environ.copy() + llm_provider = get_env_var("LLM_PROVIDER", "openai") + current_env["LLM_PROVIDER"] = llm_provider + current_env["EMBEDDING_MODEL"] = get_env_var("EMBEDDING_MODEL", "") + current_env["LLM_API_KEY"] = get_env_var("LLM_API_KEY", "") + + if llm_provider.lower() != "openai": + current_env["LLM_BASE_URL"] = get_env_var("LLM_BASE_URL", "") + elif "LLM_BASE_URL" in current_env: + del current_env["LLM_BASE_URL"] + + current_env["SUPABASE_URL"] = get_env_var("SUPABASE_URL", "") + current_env["SUPABASE_SERVICE_KEY"] = get_env_var("SUPABASE_SERVICE_KEY", "") + current_env["DOCS_RETRIEVAL_TABLE"] = get_env_var("DOCS_RETRIEVAL_TABLE", "") + + status.write(f"Using Embedding Model: {current_env.get('EMBEDDING_MODEL', 'N/A')}") + status.write(f"Using LLM Provider: {current_env.get('LLM_PROVIDER', 'N/A')}") + if "LLM_BASE_URL" in current_env: + status.write(f"Using Base URL: {current_env.get('LLM_BASE_URL')}") + # --- END 
MODIFICATION --- + + status.write(f"Running command: `{' '.join(command)}`") + # Use subprocess.run with the modified environment result = subprocess.run( command, capture_output=True, text=True, check=False, # Don't raise exception on non-zero exit code encoding='utf-8', - errors='replace' # Handle potential encoding errors + errors='replace', # Handle potential encoding errors + env=current_env # Pass the modified environment ) st.session_state.llms_processing_output = result.stdout From bbfa0bceabcb98beffb5bf112ec2b41aef3fb646 Mon Sep 17 00:00:00 2001 From: HillviewCap Date: Tue, 1 Apr 2025 14:35:36 -0400 Subject: [PATCH 2/5] fix: Handle NoneType error for LLM_PROVIDER in doc processing --- streamlit_pages/documentation.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/streamlit_pages/documentation.py b/streamlit_pages/documentation.py index 25556a7256..b27eb11dec 100644 --- a/streamlit_pages/documentation.py +++ b/streamlit_pages/documentation.py @@ -275,16 +275,23 @@ def update_progress(status): try: # --- START MODIFICATION: Pass Environment --- current_env = os.environ.copy() - llm_provider = get_env_var("LLM_PROVIDER", "openai") - current_env["LLM_PROVIDER"] = llm_provider + llm_provider = get_env_var("LLM_PROVIDER", "openai") # Get provider, default to openai + + # Ensure llm_provider is treated as a string for comparison, defaulting to 'openai' if None + provider_check = llm_provider if llm_provider else "openai" + current_env["EMBEDDING_MODEL"] = get_env_var("EMBEDDING_MODEL", "") current_env["LLM_API_KEY"] = get_env_var("LLM_API_KEY", "") - if llm_provider.lower() != "openai": + if provider_check.lower() != "openai": current_env["LLM_BASE_URL"] = get_env_var("LLM_BASE_URL", "") elif "LLM_BASE_URL" in current_env: + # Ensure base_url is removed if provider is openai and it exists in inherited env del current_env["LLM_BASE_URL"] + # Also ensure the actual value passed in the env is not None + current_env["LLM_PROVIDER"] = provider_check + current_env["SUPABASE_URL"] = get_env_var("SUPABASE_URL", "") current_env["SUPABASE_SERVICE_KEY"] = get_env_var("SUPABASE_SERVICE_KEY", "") current_env["DOCS_RETRIEVAL_TABLE"] = get_env_var("DOCS_RETRIEVAL_TABLE", "") @@ -428,16 +435,23 @@ def sanitize_filename(source_identifier: str) -> str: try: # --- START MODIFICATION: Pass Environment --- current_env = os.environ.copy() - llm_provider = get_env_var("LLM_PROVIDER", "openai") - current_env["LLM_PROVIDER"] = llm_provider + llm_provider = get_env_var("LLM_PROVIDER", "openai") # Get provider, default to openai + + # Ensure llm_provider is treated as a string for comparison, defaulting to 'openai' if None + provider_check = llm_provider if llm_provider else "openai" + current_env["EMBEDDING_MODEL"] = get_env_var("EMBEDDING_MODEL", "") current_env["LLM_API_KEY"] = get_env_var("LLM_API_KEY", "") - if llm_provider.lower() != "openai": + if provider_check.lower() != "openai": current_env["LLM_BASE_URL"] = get_env_var("LLM_BASE_URL", "") elif "LLM_BASE_URL" in current_env: + # Ensure base_url is removed if provider is openai and it exists in inherited env del current_env["LLM_BASE_URL"] + # Also ensure the actual value passed in the env is not None + current_env["LLM_PROVIDER"] = provider_check + current_env["SUPABASE_URL"] = get_env_var("SUPABASE_URL", "") current_env["SUPABASE_SERVICE_KEY"] = get_env_var("SUPABASE_SERVICE_KEY", "") current_env["DOCS_RETRIEVAL_TABLE"] = get_env_var("DOCS_RETRIEVAL_TABLE", "") From e1381245455554fcd53604b8785728fea6763d43 
Mon Sep 17 00:00:00 2001
From: HillviewCap
Date: Tue, 1 Apr 2025 17:08:39 -0400
Subject: [PATCH 3/5] fix: Resolve Ollama embedding 404 error

- Install langchain-ollama package
- Update embedding_manager.py to use langchain_ollama.embeddings
- Correct EMBEDDING_BASE_URL in env_vars.json (remove /v1)
- Add detailed error logging in embedding_manager.py
- Comment out verbose node insertion failure message in run_processing.py
---
 .../llms_txt/vector_db/embedding_manager.py | 257 ++++++++++++------
 requirements.txt                            |   1 +
 run_processing.py                           |   6 +-
 3 files changed, 177 insertions(+), 87 deletions(-)

diff --git a/archon/llms_txt/vector_db/embedding_manager.py b/archon/llms_txt/vector_db/embedding_manager.py
index 5577151a46..a47091d5e0 100644
--- a/archon/llms_txt/vector_db/embedding_manager.py
+++ b/archon/llms_txt/vector_db/embedding_manager.py
@@ -1,46 +1,111 @@
 from typing import Dict, List, Any, Optional # Added Optional
 import os # Added os
-# Ensure openai library is installed: pip install openai
+# Ensure necessary libraries are installed: pip install openai langchain-ollama
 try:
-    from openai import OpenAI, APIError # Added APIError for specific exception handling
+    from openai import OpenAI, APIError
 except ImportError:
-    raise ImportError("OpenAI library not found. Please install it using: pip install openai")
+    # Allow running without OpenAI if Ollama is the provider
+    OpenAI = None
+    APIError = None # Define APIError as None if openai isn't installed
+    print("Warning: OpenAI library not found. OpenAI provider will not be available.")
+
+try:
+    from langchain_ollama.embeddings import OllamaEmbeddings
+except ImportError:
+    OllamaEmbeddings = None
+    print("Warning: langchain-ollama library not found. Ollama provider will not be available.")

 # Corrected import path assuming vector_db is sibling to utils
 from ..utils.env_loader import EnvironmentLoader

-class OpenAIEmbeddingGenerator:
-    """Generate embeddings using OpenAI's API"""
+class EmbeddingManager:
+    """Manages embedding generation using configured provider (OpenAI or Ollama)."""

     def __init__(self, env_loader: Optional[EnvironmentLoader] = None):
-        """Initialize the OpenAI API client"""
-        # Allow passing an existing env_loader or create a new one
+        """Initialize the embedding client based on configuration."""
         self.env_loader = env_loader or EnvironmentLoader(env_file_path="../../workbench/env_vars.json") # Adjusted path
-        self.openai_config = self.env_loader.get_openai_config()
+        self.config = self.env_loader.config # Access the config directly

-        # Validate required config
-        if not self.openai_config.get("api_key"):
-            raise ValueError("OpenAI API Key ('LLM_API_KEY') is missing in configuration.")
-        if not self.openai_config.get("embedding_model"):
-            raise ValueError("OpenAI Embedding Model ('EMBEDDING_MODEL') is missing in configuration.")
+        self.provider = self.config.get("EMBEDDING_PROVIDER", "OpenAI").lower() # Default to OpenAI
+        self.client = None
+        self.embedding_model = None
+        self.embedding_dim = None # Store embedding dimension

-        # Initialize OpenAI client
-        try:
-            self.client = OpenAI(
-                api_key=self.openai_config["api_key"],
-                # Pass base_url only if it exists in the config
-                base_url=self.openai_config.get("base_url")
-            )
-            print("OpenAI client initialized successfully.")
-        except Exception as e:
-            print(f"Error initializing OpenAI client: {e}")
-            raise
+        print(f"Selected embedding provider: {self.provider}")
+
+        if self.provider == "ollama":
+            if OllamaEmbeddings is None:
+                raise ImportError("Ollama provider selected, but langchain-ollama library is not installed.")
+
+            ollama_base_url = self.config.get("EMBEDDING_BASE_URL")
+            self.embedding_model = self.config.get("EMBEDDING_MODEL")
+
+            if not self.embedding_model:
+                raise ValueError("Ollama Embedding Model ('EMBEDDING_MODEL') is missing in configuration.")
+            if not ollama_base_url:
+                print("Warning: Ollama Base URL ('EMBEDDING_BASE_URL') not found, using default.")
+                # OllamaEmbeddings might have a default, or handle None
+
+            try:
+                self.client = OllamaEmbeddings(
+                    base_url=ollama_base_url,
+                    model=self.embedding_model
+                )
+                # Note: OllamaEmbeddings doesn't expose dimension easily, may need to infer or hardcode common values
+                print(f"Ollama client initialized successfully. Model: {self.embedding_model}, Base URL: {ollama_base_url or 'Default'}")
+                # Attempt a dummy embedding to get dimension (or use common defaults)
+                try:
+                    dummy_embedding = self.client.embed_query("test")
+                    self.embedding_dim = len(dummy_embedding)
+                    print(f"Inferred Ollama embedding dimension: {self.embedding_dim}")
+                except Exception as e:
+                    print(f"Warning: Could not determine Ollama embedding dimension via test query: {e}. Falling back to default.")
+                    # Fallback based on common Ollama model dimensions if needed
+                    self.embedding_dim = 768 # Common default, adjust if necessary
+
+            except Exception as e:
+                print(f"Error initializing Ollama client: {e}")
+                raise
+
+        elif self.provider == "openai":
+            if OpenAI is None:
+                raise ImportError("OpenAI provider selected, but openai library is not installed.")
+
+            openai_api_key = self.config.get("EMBEDDING_API_KEY") or self.config.get("LLM_API_KEY") # Try embedding-specific key first
+            openai_base_url = self.config.get("EMBEDDING_BASE_URL") or self.config.get("BASE_URL") # Try embedding-specific URL first
+            self.embedding_model = self.config.get("EMBEDDING_MODEL")
+
+            if not openai_api_key:
+                raise ValueError("OpenAI API Key ('LLM_API_KEY') is missing in configuration.")
+            if not self.embedding_model:
+                raise ValueError("OpenAI Embedding Model ('EMBEDDING_MODEL') is missing in configuration.")
+
+            try:
+                self.client = OpenAI(
+                    api_key=openai_api_key,
+                    base_url=openai_base_url # Pass base_url only if it exists
+                )
+                print(f"OpenAI client initialized successfully. Model: {self.embedding_model}, Base URL: {openai_base_url or 'Default'}")
+                # Infer dimension from model name (common OpenAI models)
+                if "1536" in self.embedding_model or "small" in self.embedding_model or "large" in self.embedding_model or "3" in self.embedding_model:
+                    self.embedding_dim = 1536
+                elif "ada" in self.embedding_model: # Older model
+                    self.embedding_dim = 1024
+                else:
+                    self.embedding_dim = 768 # Fallback/other models
+                print(f"Inferred OpenAI embedding dimension: {self.embedding_dim}")

-        self.embedding_model = self.openai_config["embedding_model"]
-        print(f"Using OpenAI embedding model: {self.embedding_model}")
+            except Exception as e:
+                print(f"Error initializing OpenAI client: {e}")
+                raise
+        else:
+            raise ValueError(f"Unsupported embedding provider: {self.provider}. Choose 'OpenAI' or 'Ollama'.")
+
+        if not self.client:
+            raise RuntimeError("Embedding client failed to initialize.")
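
The commit message above mentions correcting EMBEDDING_BASE_URL in env_vars.json (removing the /v1 suffix), but that file is not part of the diff. As a point of reference, here is a minimal sketch of the Ollama-related entries __init__ expects in workbench/env_vars.json; the key names come from the code, while the values are illustrative (Ollama's native API listens on port 11434 and takes no /v1 suffix):

{
    "EMBEDDING_PROVIDER": "Ollama",
    "EMBEDDING_BASE_URL": "http://localhost:11434",
    "EMBEDDING_MODEL": "nomic-embed-text"
}
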
     def generate_embedding(self, text: str) -> List[float]:
-        """Generate an embedding for a single text string.
+        """Generate an embedding for a single text string using the configured provider.

         Args:
             text: Text to generate embedding for.

         Returns:
             List of floats representing the embedding vector.
Raises: - APIError: If the OpenAI API call fails. - Exception: For other unexpected errors. + ValueError: If the input text is invalid. + Exception: If the API call to the provider fails or other errors occur. """ if not text: print("Warning: Attempting to generate embedding for empty text. Returning zero vector.") - # Determine embedding dimension based on model (common case: 1536 for text-embedding-3-small) - # This is a simplification; ideally, fetch dimension from model info if possible. - # For now, assume 1536 if model name suggests it. - dim = 1536 if "1536" in self.embedding_model or "small" in self.embedding_model or "large" in self.embedding_model else 768 # Default fallback - return [0.0] * dim + # Use the stored dimension + if self.embedding_dim is None: + print("Error: Embedding dimension not determined during initialization.") + # Fallback dimension if not set + return [0.0] * 768 + return [0.0] * self.embedding_dim try: - response = self.client.embeddings.create( - model=self.embedding_model, - input=text, - encoding_format="float" # Explicitly request float format - ) - - # Check if response data is valid and contains embeddings - if response.data and len(response.data) > 0 and response.data[0].embedding: - return response.data[0].embedding + if self.provider == "openai": + if APIError is None: # Check if OpenAI client could be initialized + raise RuntimeError("OpenAI client not available.") + response = self.client.embeddings.create( + model=self.embedding_model, + input=text, + encoding_format="float" + ) + if response.data and len(response.data) > 0 and response.data[0].embedding: + return response.data[0].embedding + else: + raise ValueError("Invalid response received from OpenAI API: No embedding data found.") + elif self.provider == "ollama": + embedding = self.client.embed_query(text) + if embedding and isinstance(embedding, list): + return embedding + else: + raise ValueError("Invalid response received from Ollama API: No embedding data found.") else: - raise ValueError("Invalid response received from OpenAI API: No embedding data found.") + # This case should ideally not be reached due to __init__ validation + raise RuntimeError(f"Invalid provider '{self.provider}' encountered during embedding generation.") - except APIError as e: + except APIError as e: # Specific to OpenAI print(f"OpenAI API error generating single embedding: {e}") raise # Re-raise the specific API error except Exception as e: - print(f"Unexpected error generating single embedding: {e}") + # Catch general exceptions which might come from Ollama or other issues + print(f"Error generating single embedding with {self.provider.upper()} provider: {e}") raise # Re-raise other exceptions def generate_embeddings(self, texts: List[str]) -> List[List[float]]: - """Generate embeddings for a list of text strings (handles batching). + """Generate embeddings for a list of text strings using the configured provider. Args: texts: List of texts to generate embeddings for. @@ -90,8 +167,8 @@ def generate_embeddings(self, texts: List[str]) -> List[List[float]]: List of embedding vectors (list of lists of floats). Returns empty list if input is empty. Raises: - APIError: If the OpenAI API call fails. - Exception: For other unexpected errors. + ValueError: If the input texts are invalid or API response is malformed. + Exception: If the API call to the provider fails or other errors occur. 
""" if not texts: return [] @@ -100,43 +177,55 @@ def generate_embeddings(self, texts: List[str]) -> List[List[float]]: processed_texts = [text if text else " " for text in texts] all_embeddings = [] - batch_size = 2000 # OpenAI limit is often 2048, use 2000 for safety - - for i in range(0, len(processed_texts), batch_size): - batch = processed_texts[i:i + batch_size] - batch_index_info = f"(Indices {i} to {i + len(batch) - 1})" # For logging + if self.provider == "openai": + if APIError is None: raise RuntimeError("OpenAI client not available.") + batch_size = 2000 # OpenAI batch size limit + for i in range(0, len(processed_texts), batch_size): + batch = processed_texts[i:i + batch_size] + batch_index_info = f"(Indices {i} to {i + len(batch) - 1})" + try: + print(f"Generating OpenAI embeddings for batch {batch_index_info} of size {len(batch)}...") + response = self.client.embeddings.create( + model=self.embedding_model, + input=batch, + encoding_format="float" + ) + if response.data and len(response.data) == len(batch): + batch_embeddings = [item.embedding for item in response.data] + if not all(e is not None for e in batch_embeddings): # Check for None embeddings + print(f"Warning: Missing embedding data in OpenAI response for batch {batch_index_info}.") + raise ValueError(f"Invalid response from OpenAI API: Missing embedding data in batch {batch_index_info}.") + all_embeddings.extend(batch_embeddings) + print(f"Successfully processed OpenAI batch {batch_index_info}.") + else: + raise ValueError(f"Invalid response from OpenAI API for batch {batch_index_info}: Mismatch in batch size or missing data. Expected {len(batch)}, got {len(response.data) if response.data else 0}.") + except APIError as e: + print(f"OpenAI API error generating embeddings for batch {batch_index_info}: {e}") + raise + except Exception as e: + print(f"Unexpected error generating OpenAI embeddings for batch {batch_index_info}: {e}") + raise + + elif self.provider == "ollama": + # Ollama's embed_documents handles batching internally (usually) try: - print(f"Generating embeddings for batch {batch_index_info} of size {len(batch)}...") # Added print statement - response = self.client.embeddings.create( - model=self.embedding_model, - input=batch, - encoding_format="float" - ) - - if response.data and len(response.data) == len(batch): - batch_embeddings = [item.embedding for item in response.data] - if not all(batch_embeddings): - # Log which batch failed if possible - print(f"Warning: Missing embedding data in response for batch {batch_index_info}.") - # Handle missing embeddings - Option: fill with zero vectors of correct dimension - # For now, we'll raise an error as before, but the error is now batch-specific. - raise ValueError(f"Invalid response received from OpenAI API: Missing embedding data in batch {batch_index_info}.") - all_embeddings.extend(batch_embeddings) - print(f"Successfully processed batch {batch_index_info}.") # Added success print - else: - raise ValueError(f"Invalid response received from OpenAI API for batch {batch_index_info}: Mismatch in batch size or missing data. 
Expected {len(batch)}, got {len(response.data) if response.data else 0}.") - - except APIError as e: - print(f"OpenAI API error generating embeddings for batch {batch_index_info}: {e}") - # Option 1: Re-raise immediately, stopping the process - raise - # Option 2: Log error and continue, potentially skipping this batch (results in incomplete data) - # print(f"Skipping batch {batch_index_info} due to API error.") - # continue # This would require careful handling of indices later + print(f"Generating Ollama embeddings for {len(processed_texts)} texts...") + all_embeddings = self.client.embed_documents(processed_texts) + if len(all_embeddings) != len(processed_texts): + raise ValueError(f"Ollama API response length mismatch. Expected {len(processed_texts)}, got {len(all_embeddings)}.") + if not all(e is not None for e in all_embeddings): # Check for None embeddings + raise ValueError("Ollama API returned None for one or more embeddings.") + print(f"Successfully generated Ollama embeddings for {len(processed_texts)} texts.") except Exception as e: - print(f"Unexpected error generating embeddings for batch {batch_index_info}: {e}") - raise # Re-raise other exceptions + # More detailed error logging + print(f"Error generating Ollama embeddings: Type={type(e).__name__}, Message={e}") + # Optionally print traceback if needed: + # import traceback + # traceback.print_exc() + raise + else: + raise RuntimeError(f"Invalid provider '{self.provider}' encountered during batch embedding generation.") # Final check: Ensure the number of embeddings matches the number of processed texts if len(all_embeddings) != len(processed_texts): @@ -182,7 +271,7 @@ def generate_node_embeddings(self, nodes: List[Dict[str, Any]]) -> List[Dict[str try: embeddings = self.generate_embeddings(texts_to_embed) except Exception as e: - print(f"Error generating batch embeddings for nodes: {e}. Proceeding without embeddings for affected nodes.") + print(f"Error generating batch embeddings for nodes using {self.provider.upper()}: {e}. 
Proceeding without embeddings for affected nodes.")
             # In case of error, embeddings list will be empty or incomplete

         # Add embeddings back to the corresponding nodes
diff --git a/requirements.txt b/requirements.txt
index de63a0dcd3..6e340bd138 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -70,6 +70,7 @@ langgraph-checkpoint==2.0.10
 langgraph-cli==0.1.71
 langgraph-sdk==0.1.51
 langsmith==0.3.6
+langchain-ollama
 litellm==1.57.8
 logfire==3.1.0
 logfire-api==3.1.0
diff --git a/run_processing.py b/run_processing.py
index ca01ca4730..a6f4a29bcf 100644
--- a/run_processing.py
+++ b/run_processing.py
@@ -16,7 +16,7 @@
     # Phase 4 Components & Utilities
     # Assuming these files exist based on previous main.py imports
     from archon.llms_txt.vector_db.supabase_manager import SupabaseManager
-    from archon.llms_txt.vector_db.embedding_manager import OpenAIEmbeddingGenerator
+    from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager
     from archon.llms_txt.vector_db.query_manager import HierarchicalQueryManager
     from archon.llms_txt.utils.env_loader import EnvironmentLoader # Used implicitly by managers

@@ -73,7 +73,7 @@ def process_document(file_path: str, document_id: Optional[str] = None) -> Optio
         chunker = HierarchicalChunker()
         enricher = MetadataEnricher()
         db = SupabaseManager() # Uses default env loader path (workbench/env_vars.json relative to root)
-        embedder = OpenAIEmbeddingGenerator() # Uses default env loader path
+        embedder = EmbeddingManager() # Uses default env loader path
         print("Components initialized successfully.")
         # Perform a quick check of DB connection if desired
         # db._check_tables() # Optional: Check tables exist before proceeding
@@ -285,7 +285,7 @@ def process_document(file_path: str, document_id: Optional[str] = None) -> Optio

             # Only insert nodes for which embedding was successful (or if allowing nodes without embeddings)
             if not node.get("metadata", {}).get("embedding_generated", False):
-                print(f"Skipping insertion for node {original_id} because embedding generation failed.")
+                # print(f"Skipping insertion for node {original_id} because embedding generation failed.") # Commented out to reduce noise
                 failed_count += 1
                 continue
             # OR: If allowing nodes without embeddings, remove the check but handle potential DB constraints

From 21273796fb57b82e4e8799e64c4ef432498f9d76 Mon Sep 17 00:00:00 2001
From: HillviewCap
Date: Tue, 1 Apr 2025 17:09:01 -0400
Subject: [PATCH 4/5] ollama embeddings resolved
---
 archon/llms_txt/retrieval/query_processor.py | 8 ++++----
 archon/llms_txt/vector_db/query_manager.py   | 6 +++---
 run_retrieval.py                             | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/archon/llms_txt/retrieval/query_processor.py b/archon/llms_txt/retrieval/query_processor.py
index a2b6d30ce2..71d87523c7 100644
--- a/archon/llms_txt/retrieval/query_processor.py
+++ b/archon/llms_txt/retrieval/query_processor.py
@@ -1,5 +1,5 @@
 import re
-from archon.llms_txt.vector_db.embedding_manager import OpenAIEmbeddingGenerator
+from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager
 from typing import Any, Dict, List, Optional, Tuple

 class QueryProcessor:
@@ -13,10 +13,10 @@ def __init__(self):
         """
         Initializes the QueryProcessor and the embedding generator.
""" try: - self.embedder = OpenAIEmbeddingGenerator() - print("QueryProcessor: OpenAIEmbeddingGenerator initialized.") + self.embedder = EmbeddingManager() + print("QueryProcessor: EmbeddingManager initialized.") except Exception as e: - print(f"Error initializing OpenAIEmbeddingGenerator in QueryProcessor: {e}") + print(f"Error initializing EmbeddingManager in QueryProcessor: {e}") self.embedder = None # Handle initialization failure def extract_contextual_info(self, query: str) -> Dict[str, Any]: diff --git a/archon/llms_txt/vector_db/query_manager.py b/archon/llms_txt/vector_db/query_manager.py index d5ec943858..3dc0d8e551 100644 --- a/archon/llms_txt/vector_db/query_manager.py +++ b/archon/llms_txt/vector_db/query_manager.py @@ -2,7 +2,7 @@ # Corrected import paths assuming query_manager is in vector_db directory from .supabase_manager import SupabaseManager -from .embedding_manager import OpenAIEmbeddingGenerator +from .embedding_manager import EmbeddingManager # Import EnvironmentLoader to allow creating default managers if not provided from ..utils.env_loader import EnvironmentLoader @@ -12,7 +12,7 @@ class HierarchicalQueryManager: def __init__( self, supabase_manager: Optional[SupabaseManager] = None, - embedding_generator: Optional[OpenAIEmbeddingGenerator] = None, + embedding_generator: Optional[EmbeddingManager] = None, env_loader: Optional[EnvironmentLoader] = None # Allow passing env_loader ): """Initialize with database and embedding managers. @@ -26,7 +26,7 @@ def __init__( # Use provided managers or instantiate them using the env_loader self.db = supabase_manager or SupabaseManager(env_loader=_env_loader) - self.embedder = embedding_generator or OpenAIEmbeddingGenerator(env_loader=_env_loader) + self.embedder = embedding_generator or EmbeddingManager(env_loader=_env_loader) print("HierarchicalQueryManager initialized.") def search( diff --git a/run_retrieval.py b/run_retrieval.py index 1523158c58..45b5e31f0a 100644 --- a/run_retrieval.py +++ b/run_retrieval.py @@ -18,7 +18,7 @@ from archon.llms_txt.retrieval.response_builder import ResponseBuilder from archon.llms_txt.retrieval.retrieval_manager import RetrievalManager from archon.llms_txt.vector_db.supabase_manager import SupabaseManager - # EnvironmentLoader is used implicitly by SupabaseManager and OpenAIEmbeddingGenerator (in QueryProcessor) + # EnvironmentLoader is used implicitly by SupabaseManager and EmbeddingManager (in QueryProcessor) # No need to explicitly import if components handle their own loading # from archon.llms_txt.utils.env_loader import EnvironmentLoader except ImportError as e: From 507a917720bde2505360ee008279fee409fcf0ce Mon Sep 17 00:00:00 2001 From: HillviewCap Date: Tue, 1 Apr 2025 19:28:05 -0400 Subject: [PATCH 5/5] Fix: Resolve embedding environment configuration issues --- archon/agent_tools.py | 45 +++++--------- archon/archon_graph.py | 11 +++- .../llms_txt/vector_db/embedding_manager.py | 60 +++++++++---------- archon/llms_txt/vector_db/supabase_manager.py | 1 + archon/pydantic_ai_coder.py | 7 ++- archon/refiner_agents/agent_refiner_agent.py | 2 +- archon/refiner_agents/tools_refiner_agent.py | 2 +- streamlit_pages/environment.py | 14 +++++ utils/llms_txt.sql | 4 +- 9 files changed, 77 insertions(+), 69 deletions(-) diff --git a/archon/agent_tools.py b/archon/agent_tools.py index f31ce6ddeb..29c25f0c16 100644 --- a/archon/agent_tools.py +++ b/archon/agent_tools.py @@ -12,35 +12,24 @@ # Imports for hierarchical retrieval try: - from archon.llms_txt.retrieval.query_processor import 
QueryProcessor - # from archon.llms_txt.retrieval.ranking import HierarchicalRanker # Not needed for direct search - # from archon.llms_txt.retrieval.response_builder import ResponseBuilder # Not needed for direct search - # from archon.llms_txt.retrieval.retrieval_manager import RetrievalManager # Not needed for direct search + # from archon.llms_txt.retrieval.query_processor import QueryProcessor # Handled by EmbeddingManager now + # from archon.llms_txt.retrieval.ranking import HierarchicalRanker + # from archon.llms_txt.retrieval.response_builder import ResponseBuilder + # from archon.llms_txt.retrieval.retrieval_manager import RetrievalManager from archon.llms_txt.vector_db.supabase_manager import SupabaseManager + from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager # Import EmbeddingManager HIERARCHICAL_IMPORTS_AVAILABLE = True except ImportError as e: print(f"Warning: Failed to import hierarchical retrieval components: {e}. Hierarchical retrieval disabled.") HIERARCHICAL_IMPORTS_AVAILABLE = False # Define dummy classes if imports fail to prevent NameErrors later - class QueryProcessor: pass + class QueryProcessor: pass # Keep dummy for now if needed elsewhere class SupabaseManager: pass + class EmbeddingManager: pass # Add dummy for EmbeddingManager +# Remove the old get_embedding function and embedding_model global variable -embedding_model = get_env_var('EMBEDDING_MODEL') or 'text-embedding-3-small' - -async def get_embedding(text: str, embedding_client: AsyncOpenAI) -> List[float]: - """Get embedding vector from OpenAI.""" - try: - response = await embedding_client.embeddings.create( - model=embedding_model, - input=text - ) - return response.data[0].embedding - except Exception as e: - print(f"Error getting embedding: {e}") - return [0] * 1536 # Return zero vector on error - -async def retrieve_relevant_documentation_tool(supabase: Client, embedding_client: AsyncOpenAI, user_query: str) -> str: +async def retrieve_relevant_documentation_tool(supabase: Client, embedding_manager: EmbeddingManager, user_query: str) -> str: # Changed embedding_client to embedding_manager """ Retrieve relevant documentation chunks based on the query. Conditionally uses either standard RAG from 'site_pages' or hierarchical RAG @@ -58,14 +47,11 @@ async def retrieve_relevant_documentation_tool(supabase: Client, embedding_clien try: # 1. Initialize Hierarchical Components # SupabaseManager handles its own env loading and client creation - db_manager = SupabaseManager() - # QueryProcessor handles its own env loading and embedder creation - # We need to ensure QueryProcessor uses the *same* embedding client/model - # Let's reuse the existing get_embedding function for simplicity and consistency - # query_processor = QueryProcessor(embedding_client=embedding_client) # Pass existing client + db_manager = SupabaseManager() # Assumes SupabaseManager uses its own env loader correctly - # 2. Process Query to get Embedding - query_embedding = await get_embedding(user_query, embedding_client) + # 2. Process Query to get Embedding using EmbeddingManager + print(f"DEBUG: Generating embedding for query '{user_query}' using {embedding_manager.provider} provider.") + query_embedding = embedding_manager.generate_embedding(user_query) # Use manager's method if not query_embedding or len(query_embedding) < 10: # Basic check return "Error: Failed to generate embedding for hierarchical search." 
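
Note: EmbeddingManager.generate_embedding is synchronous, so the call above blocks the event loop inside this async tool while the provider responds. If that becomes a problem, a minimal non-blocking wrapper could look like the sketch below (the helper is an assumption, not part of this series):

import asyncio

from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager

async def generate_embedding_async(manager: EmbeddingManager, text: str) -> list[float]:
    # Run the synchronous OpenAI/Ollama call in a worker thread so the
    # asyncio event loop stays responsive while the embedding is computed.
    return await asyncio.to_thread(manager.generate_embedding, text)
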
@@ -108,8 +94,9 @@ async def retrieve_relevant_documentation_tool(supabase: Client, embedding_clien else: # Default to 'site_pages' print(f"--- Performing Standard Retrieval ('site_pages') for: '{user_query}' ---") - # Get the embedding for the query - query_embedding = await get_embedding(user_query, embedding_client) + # Get the embedding for the query using EmbeddingManager + print(f"DEBUG: Generating embedding for query '{user_query}' using {embedding_manager.provider} provider.") + query_embedding = embedding_manager.generate_embedding(user_query) # Use manager's method # Query Supabase for relevant documents using the existing RPC result = supabase.rpc( diff --git a/archon/archon_graph.py b/archon/archon_graph.py index bc2e01c6e8..9c5fcbbb1c 100644 --- a/archon/archon_graph.py +++ b/archon/archon_graph.py @@ -22,6 +22,7 @@ # Add the parent directory to Python path sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from archon.pydantic_ai_coder import pydantic_ai_coder, PydanticAIDeps +from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager # Import EmbeddingManager from archon.refiner_agents.prompt_refiner_agent import prompt_refiner_agent from archon.refiner_agents.tools_refiner_agent import tools_refiner_agent, ToolsRefinerDeps from archon.refiner_agents.agent_refiner_agent import agent_refiner_agent, AgentRefinerDeps @@ -114,11 +115,15 @@ async def define_scope_with_reasoner(state: AgentState): return {"scope": scope} # Coding Node with Feedback Handling -async def coder_agent(state: AgentState, writer): - # Prepare dependencies +async def coder_agent(state: AgentState, writer): + # Initialize EmbeddingManager (it loads its own env vars) + # Note: Consider initializing this once outside the node if possible for efficiency + embedding_manager = EmbeddingManager() + + # Prepare dependencies, passing the embedding_manager deps = PydanticAIDeps( supabase=supabase, - embedding_client=embedding_client, + embedding_manager=embedding_manager, # Pass the initialized EmbeddingManager reasoner_output=state['scope'] ) diff --git a/archon/llms_txt/vector_db/embedding_manager.py b/archon/llms_txt/vector_db/embedding_manager.py index a47091d5e0..80cd1ed9f4 100644 --- a/archon/llms_txt/vector_db/embedding_manager.py +++ b/archon/llms_txt/vector_db/embedding_manager.py @@ -29,9 +29,16 @@ def __init__(self, env_loader: Optional[EnvironmentLoader] = None): self.provider = self.config.get("EMBEDDING_PROVIDER", "OpenAI").lower() # Default to OpenAI self.client = None self.embedding_model = None - self.embedding_dim = None # Store embedding dimension + # Read dimension from config, default to None if not found + self.embedding_dim = self.config.get("EMBEDDING_DIMENSION") + if self.embedding_dim: + try: + self.embedding_dim = int(self.embedding_dim) + except ValueError: + print(f"Warning: Invalid EMBEDDING_DIMENSION value '{self.embedding_dim}'. Must be an integer. Ignoring.") + self.embedding_dim = None + # else: # No need for else if only print was removed - print(f"Selected embedding provider: {self.provider}") if self.provider == "ollama": if OllamaEmbeddings is None: @@ -52,17 +59,14 @@ def __init__(self, env_loader: Optional[EnvironmentLoader] = None): model=self.embedding_model ) # Note: OllamaEmbeddings doesn't expose dimension easily, may need to infer or hardcode common values - print(f"Ollama client initialized successfully. 
Model: {self.embedding_model}, Base URL: {ollama_base_url or 'Default'}")
-                # Attempt a dummy embedding to get dimension (or use common defaults)
-                try:
-                    dummy_embedding = self.client.embed_query("test")
-                    self.embedding_dim = len(dummy_embedding)
-                    print(f"Inferred Ollama embedding dimension: {self.embedding_dim}")
-                except Exception as e:
-                    print(f"Warning: Could not determine Ollama embedding dimension via test query: {e}. Falling back to default.")
-                    # Fallback based on common Ollama model dimensions if needed
-                    self.embedding_dim = 768 # Common default, adjust if necessary
-
+                # Only infer dimension if not provided in config
+                if self.embedding_dim is None:
+                    try:
+                        dummy_embedding = self.client.embed_query("test")
+                        self.embedding_dim = len(dummy_embedding)
+                    except Exception as e:
+                        print(f"Warning: Could not determine Ollama embedding dimension via test query: {e}. Using fallback dimension 768.")
+                        self.embedding_dim = 768 # Fallback if inference fails and not configured
             except Exception as e:
                 print(f"Error initializing Ollama client: {e}")
                 raise
@@ -85,16 +89,18 @@ def __init__(self, env_loader: Optional[EnvironmentLoader] = None):
                 api_key=openai_api_key,
                 base_url=openai_base_url # Pass base_url only if it exists
             )
-            print(f"OpenAI client initialized successfully. Model: {self.embedding_model}, Base URL: {openai_base_url or 'Default'}")
-            # Infer dimension from model name (common OpenAI models)
-            if "1536" in self.embedding_model or "small" in self.embedding_model or "large" in self.embedding_model or "3" in self.embedding_model:
-                self.embedding_dim = 1536
-            elif "ada" in self.embedding_model: # Older model
-                self.embedding_dim = 1024
-            else:
-                self.embedding_dim = 768 # Fallback/other models
-            print(f"Inferred OpenAI embedding dimension: {self.embedding_dim}")
-
+            # Only infer dimension if not provided in config
+            if self.embedding_dim is None:
+                if "3-large" in self.embedding_model:
+                    self.embedding_dim = 3072 # text-embedding-3-large returns 3072-dimensional vectors by default
+                elif "ada-002" in self.embedding_model or "3-small" in self.embedding_model:
+                    self.embedding_dim = 1536
+                elif "ada-001" in self.embedding_model: # Older model? Check specific name
+                    self.embedding_dim = 1024
+                else:
+                    # Default fallback if model name doesn't match common patterns
+                    self.embedding_dim = 1536 # Defaulting to common OpenAI dimension
+                    print(f"Warning: Could not infer dimension for model '{self.embedding_model}'. Using fallback: {self.embedding_dim}")
             except Exception as e:
                 print(f"Error initializing OpenAI client: {e}")
                 raise
@@ -119,10 +123,10 @@ def generate_embedding(self, text: str) -> List[float]:
         """
         if not text:
             print("Warning: Attempting to generate embedding for empty text. Returning zero vector.")
-            # Use the stored dimension
+            # Use the stored dimension, ensure it's set
             if self.embedding_dim is None:
-                print("Error: Embedding dimension not determined during initialization.")
-                # Fallback dimension if not set
+                # This should ideally not happen if initialization worked
+                print("Error: Embedding dimension not determined. 
Using fallback 768.") return [0.0] * 768 return [0.0] * self.embedding_dim @@ -185,7 +189,6 @@ def generate_embeddings(self, texts: List[str]) -> List[List[float]]: batch = processed_texts[i:i + batch_size] batch_index_info = f"(Indices {i} to {i + len(batch) - 1})" try: - print(f"Generating OpenAI embeddings for batch {batch_index_info} of size {len(batch)}...") response = self.client.embeddings.create( model=self.embedding_model, input=batch, @@ -197,7 +200,6 @@ def generate_embeddings(self, texts: List[str]) -> List[List[float]]: print(f"Warning: Missing embedding data in OpenAI response for batch {batch_index_info}.") raise ValueError(f"Invalid response from OpenAI API: Missing embedding data in batch {batch_index_info}.") all_embeddings.extend(batch_embeddings) - print(f"Successfully processed OpenAI batch {batch_index_info}.") else: raise ValueError(f"Invalid response from OpenAI API for batch {batch_index_info}: Mismatch in batch size or missing data. Expected {len(batch)}, got {len(response.data) if response.data else 0}.") except APIError as e: @@ -210,13 +212,11 @@ def generate_embeddings(self, texts: List[str]) -> List[List[float]]: elif self.provider == "ollama": # Ollama's embed_documents handles batching internally (usually) try: - print(f"Generating Ollama embeddings for {len(processed_texts)} texts...") all_embeddings = self.client.embed_documents(processed_texts) if len(all_embeddings) != len(processed_texts): raise ValueError(f"Ollama API response length mismatch. Expected {len(processed_texts)}, got {len(all_embeddings)}.") if not all(e is not None for e in all_embeddings): # Check for None embeddings raise ValueError("Ollama API returned None for one or more embeddings.") - print(f"Successfully generated Ollama embeddings for {len(processed_texts)} texts.") except Exception as e: # More detailed error logging print(f"Error generating Ollama embeddings: Type={type(e).__name__}, Message={e}") diff --git a/archon/llms_txt/vector_db/supabase_manager.py b/archon/llms_txt/vector_db/supabase_manager.py index 1c0cc6acc3..b11a2798a0 100644 --- a/archon/llms_txt/vector_db/supabase_manager.py +++ b/archon/llms_txt/vector_db/supabase_manager.py @@ -157,6 +157,7 @@ def vector_search( # Remove None values from params as RPC might expect missing keys vs null values params = {k: v for k, v in params.items() if v is not None} + print(f"DEBUG: Calling match_hierarchical_nodes with params: {json.dumps(params, indent=2, default=lambda x: '')}") # Log params, hiding embedding for brevity try: # Call the RPC function response = self.client.rpc( diff --git a/archon/pydantic_ai_coder.py b/archon/pydantic_ai_coder.py index ac1cb814ef..eae75b0df9 100644 --- a/archon/pydantic_ai_coder.py +++ b/archon/pydantic_ai_coder.py @@ -13,8 +13,9 @@ from pydantic_ai import Agent, ModelRetry, RunContext from pydantic_ai.models.anthropic import AnthropicModel from pydantic_ai.models.openai import OpenAIModel -from openai import AsyncOpenAI +from openai import AsyncOpenAI # Keep for now if needed elsewhere, though likely removable later from supabase import Client +from archon.llms_txt.vector_db.embedding_manager import EmbeddingManager # Import EmbeddingManager # Add the parent directory to sys.path to allow importing from the parent directory sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -40,7 +41,7 @@ @dataclass class PydanticAIDeps: supabase: Client - embedding_client: AsyncOpenAI + embedding_manager: EmbeddingManager # Replace embedding_client with embedding_manager 
     reasoner_output: str

 pydantic_ai_coder = Agent(
@@ -70,7 +71,7 @@ async def retrieve_relevant_documentation(ctx: RunContext[PydanticAIDeps], user_
     Returns:
         A formatted string containing the top 4 most relevant documentation chunks
     """
-    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_client, user_query)
+    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_manager, user_query) # Use embedding_manager

 @pydantic_ai_coder.tool
 async def list_documentation_pages(ctx: RunContext[PydanticAIDeps]) -> List[str]:
diff --git a/archon/refiner_agents/agent_refiner_agent.py b/archon/refiner_agents/agent_refiner_agent.py
index cc535abd6f..a72d3a1168 100644
--- a/archon/refiner_agents/agent_refiner_agent.py
+++ b/archon/refiner_agents/agent_refiner_agent.py
@@ -63,7 +63,7 @@ async def retrieve_relevant_documentation(ctx: RunContext[AgentRefinerDeps], que
     Returns:
         A formatted string containing the top 4 most relevant documentation chunks
     """
-    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_client, query)
+    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_manager, query) # Use embedding_manager

 @agent_refiner_agent.tool
 async def list_documentation_pages(ctx: RunContext[AgentRefinerDeps]) -> List[str]:
diff --git a/archon/refiner_agents/tools_refiner_agent.py b/archon/refiner_agents/tools_refiner_agent.py
index a3afcd7515..537c65772c 100644
--- a/archon/refiner_agents/tools_refiner_agent.py
+++ b/archon/refiner_agents/tools_refiner_agent.py
@@ -63,7 +63,7 @@ async def retrieve_relevant_documentation(ctx: RunContext[ToolsRefinerDeps], que
     Returns:
         A formatted string containing the top 4 most relevant documentation chunks
     """
-    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_client, query)
+    return await retrieve_relevant_documentation_tool(ctx.deps.supabase, ctx.deps.embedding_manager, query) # Use embedding_manager

 @tools_refiner_agent.tool
 async def list_documentation_pages(ctx: RunContext[ToolsRefinerDeps]) -> List[str]:
diff --git a/streamlit_pages/environment.py b/streamlit_pages/environment.py
index 37204ba89f..ddb18ff9ba 100644
--- a/streamlit_pages/environment.py
+++ b/streamlit_pages/environment.py
@@ -312,6 +312,20 @@ def environment_tab():
         key="input_EMBEDDING_MODEL"
     )
     updated_values["EMBEDDING_MODEL"] = embedding_model
+
+    # EMBEDDING_DIMENSION (read by EmbeddingManager as EMBEDDING_DIMENSION)
+    embedding_dimension_help = "The dimension of the vectors generated by your embedding model.\n\n" + \
+        "Example for nomic-embed-text: 768\n\n" + \
+        "Example for text-embedding-3-small: 1536"
+
+    embedding_dimension = st.text_input(
+        "EMBEDDING_DIMENSION:",
+        value=profile_env_vars.get("EMBEDDING_DIMENSION", ""),
+        help=embedding_dimension_help,
+        key="input_EMBEDDING_DIMENSION"
+    )
+    updated_values["EMBEDDING_DIMENSION"] = embedding_dimension
+
     st.markdown("---")
diff --git a/utils/llms_txt.sql b/utils/llms_txt.sql
index 3274005367..0e5d181b45 100644
--- a/utils/llms_txt.sql
+++ b/utils/llms_txt.sql
@@ -18,7 +18,7 @@ CREATE TABLE hierarchical_nodes (
     section_type VARCHAR,     -- e.g. "API documentation", "Concepts", etc.
     content_type VARCHAR,     -- e.g. 
"link_list", "descriptive_text", "code_example" document_position FLOAT, -- Normalized position in document (0-1) - embedding VECTOR(1536), -- Same dimension as your current embeddings + embedding VECTOR(768), -- Dimension for nomic-embed-text created_at TIMESTAMP WITH TIME ZONE DEFAULT TIMEZONE('utc'::text, now()) NOT NULL, updated_at TIMESTAMP WITH TIME ZONE DEFAULT TIMEZONE('utc'::text, now()) NOT NULL, metadata JSONB NOT NULL DEFAULT '{}'::JSONB -- Additional metadata @@ -60,7 +60,7 @@ CREATE INDEX idx_hierarchical_references_type ON hierarchical_references(referen -- 4. Create vector search function for hierarchical nodes CREATE OR REPLACE FUNCTION match_hierarchical_nodes ( - query_embedding VECTOR(1536), + query_embedding VECTOR(768), match_count INT DEFAULT 10, filter JSONB DEFAULT '{}'::JSONB, section_filter VARCHAR DEFAULT NULL,