diff --git a/Dockerfile b/Dockerfile
index ba4e29a32..9cb4fba85 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,13 +29,6 @@ RUN pip install --no-cache-dir -r requirements.txt && \
 # Stage 3: Final stage with non-root user and app
 FROM gpt-researcher-install AS gpt-researcher
 
-# Use environment variables for API keys (defaults can be overridden at runtime)
-ARG OPENAI_API_KEY
-ARG TAVILY_API_KEY
-
-ENV OPENAI_API_KEY=${OPENAI_API_KEY}
-ENV TAVILY_API_KEY=${TAVILY_API_KEY}
-
 # Create a non-root user for security
 RUN useradd -ms /bin/bash gpt-researcher && \
     chown -R gpt-researcher:gpt-researcher /usr/src/app
diff --git a/gpt_researcher/llm_provider/generic/base.py b/gpt_researcher/llm_provider/generic/base.py
index 7e64a8254..e7c8e2814 100644
--- a/gpt_researcher/llm_provider/generic/base.py
+++ b/gpt_researcher/llm_provider/generic/base.py
@@ -68,7 +68,7 @@ def from_provider(cls, provider: str, **kwargs: Any):
             llm = ChatFireworks(**kwargs)
         elif provider == "ollama":
             _check_pkg("langchain_community")
-            from langchain_community.chat_models import ChatOllama
+            from langchain_ollama import ChatOllama
 
             llm = ChatOllama(base_url=os.environ["OLLAMA_BASE_URL"], **kwargs)
         elif provider == "together":
diff --git a/gpt_researcher/memory/embeddings.py b/gpt_researcher/memory/embeddings.py
index 0e917ed4e..5bfcc8cb5 100644
--- a/gpt_researcher/memory/embeddings.py
+++ b/gpt_researcher/memory/embeddings.py
@@ -17,7 +17,7 @@ def __init__(self, embedding_provider: str, model: str, **embdding_kwargs: Any):
         _embeddings = None
         match embedding_provider:
             case "ollama":
-                from langchain_community.embeddings import OllamaEmbeddings
+                from langchain_ollama import OllamaEmbeddings
 
                 _embeddings = OllamaEmbeddings(
                     model=model,
diff --git a/gpt_researcher/retrievers/searx/searx.py b/gpt_researcher/retrievers/searx/searx.py
index 46bc10cb6..912ea27d0 100644
--- a/gpt_researcher/retrievers/searx/searx.py
+++ b/gpt_researcher/retrievers/searx/searx.py
@@ -1,45 +1,78 @@
-# Tavily API Retriever
-
-# libraries
 import os
-from langchain_community.utilities import SearxSearchWrapper
+import json
+import requests
+from typing import List, Dict
+from urllib.parse import urljoin
 
 
 class SearxSearch():
     """
-    Tavily API Retriever
+    SearxNG API Retriever
     """
-    def __init__(self, query):
+    def __init__(self, query: str):
         """
-        Initializes the TavilySearch object
+        Initializes the SearxSearch object
         Args:
-            query:
+            query: Search query string
         """
         self.query = query
-        self.api_key = self.get_api_key()
+        self.base_url = self.get_searxng_url()
 
-    def get_api_key(self):
+    def get_searxng_url(self) -> str:
         """
-        Gets the Tavily API key
+        Gets the SearxNG instance URL from environment variables
         Returns:
-
+            str: Base URL of SearxNG instance
         """
-        # Get the API key
         try:
-            api_key = os.environ["SEARX_URL"]
-        except:
-            raise Exception("Searx URL key not found. Please set the SEARX_URL environment variable. "
-                            "You can get your key from https://searx.space/")
-        return api_key
+            base_url = os.environ["SEARX_URL"]
+            if not base_url.endswith('/'):
+                base_url += '/'
+            return base_url
+        except KeyError:
+            raise Exception(
+                "SearxNG URL not found. Please set the SEARX_URL environment variable. "
+                "You can find public instances at https://searx.space/"
+            )
 
-    def search(self, max_results=7):
+    def search(self, max_results: int = 10) -> List[Dict[str, str]]:
         """
-        Searches the query
+        Searches the query using SearxNG API
+        Args:
+            max_results: Maximum number of results to return
         Returns:
-
+            List of dictionaries containing search results
         """
-        searx = SearxSearchWrapper(searx_host=os.environ["SEARX_URL"])
-        results = searx.results(self.query, max_results)
-        # Normalizing results to match the format of the other search APIs
-        search_response = [{"href": obj["link"], "body": obj["snippet"]} for obj in results]
-        return search_response
+        search_url = urljoin(self.base_url, "search")
+
+        params = {
+            # The search query.
+            'q': self.query,
+            # Output format of results. Format needs to be activated in searxng config.
+            'format': 'json'
+        }
+
+        try:
+            response = requests.get(
+                search_url,
+                params=params,
+                headers={'Accept': 'application/json'},
+                timeout=30  # avoid hanging forever on an unresponsive instance
+            )
+            response.raise_for_status()
+            results = response.json()
+
+            # Normalize results to match the expected format
+            search_response = []
+            for result in results.get('results', [])[:max_results]:
+                search_response.append({
+                    "href": result.get('url', ''),
+                    "body": result.get('content', '')
+                })
+
+            return search_response
+
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Error querying SearxNG: {str(e)}")
+        except json.JSONDecodeError:
+            raise Exception("Error parsing SearxNG response")
diff --git a/gpt_researcher/scraper/arxiv/arxiv.py b/gpt_researcher/scraper/arxiv/arxiv.py
index e3c902830..2af550417 100644
--- a/gpt_researcher/scraper/arxiv/arxiv.py
+++ b/gpt_researcher/scraper/arxiv/arxiv.py
@@ -18,5 +18,5 @@ def scrape(self):
         """
         query = self.link.split("/")[-1]
         retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
-        docs = retriever.get_relevant_documents(query=query)
+        docs = retriever.invoke(query)
         return docs[0].page_content
diff --git a/requirements.txt b/requirements.txt
index c7575b04f..626f683ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ markdown
 langchain
 langchain_community
 langchain-openai
+langchain-ollama
 langgraph
 tiktoken
 gpt-researcher
diff --git a/tests/gptr-logs-handler.py b/tests/gptr-logs-handler.py
index db84af0a1..fb05694ce 100644
--- a/tests/gptr-logs-handler.py
+++ b/tests/gptr-logs-handler.py
@@ -1,27 +1,35 @@
-from typing import Dict, Any
+import logging
+from typing import List, Dict, Any
 import asyncio
 from gpt_researcher import GPTResearcher
 
 class CustomLogsHandler:
     """A custom Logs handler class to handle JSON data."""
 
     def __init__(self):
-        self.logs = []  # Initialize logs to store data
+        self.logs: List[Dict[str, Any]] = []  # Initialize logs to store data
+        logging.basicConfig(level=logging.INFO)  # Set up logging configuration
 
     async def send_json(self, data: Dict[str, Any]) -> None:
-        """Send JSON data and log it."""
-        self.logs.append(data)  # Append data to logs
-        print(f"My custom Log: {data}")  # For demonstration, print the log
-
-async def run():
-    # Define the necessary parameters with sample values
-
+        """Send JSON data and log it, with error handling."""
+        try:
+            self.logs.append(data)  # Append data to logs
+            logging.info(f"My custom Log: {data}")  # Use logging instead of print
+        except Exception as e:
+            logging.error(f"Error logging data: {e}")  # Log any errors
+
+    def clear_logs(self) -> None:
+        """Clear the logs."""
+        self.logs.clear()  # Clear the logs list
+        logging.info("Logs cleared.")  # Log the clearing action
+
+async def run() -> None:
+    """Run the research process and generate a report."""
     query = "What happened in the latest burning man floods?"
-    report_type = "research_report"  # Type of report to generate
-    report_source = "online"  # Could specify source like 'online', 'books', etc.
-    tone = "informative"  # Tone of the report ('informative', 'casual', etc.)
-    config_path = None  # Path to a config file, if needed
+    report_type = "research_report"
+    report_source = "online"
+    tone = "informative"
+    config_path = None
 
-    # Initialize researcher with a custom WebSocket
     custom_logs_handler = CustomLogsHandler()
     researcher = GPTResearcher(
@@ -35,6 +43,7 @@ async def run():
         config_path=config_path
     )
 
     await researcher.conduct_research()  # Conduct the research
     report = await researcher.write_report()  # Write the research report
+    logging.info("Report generated successfully.")  # Log report generation
     return report