From dff2f8afd1395acffebe194ade284ea474ffacc4 Mon Sep 17 00:00:00 2001 From: Ivaylo Gochkov Date: Mon, 28 Oct 2024 09:40:26 +0100 Subject: [PATCH 1/4] SearxNG API Retriever implementation --- gpt_researcher/retrievers/searx/searx.py | 86 +++++++++++++++++------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/gpt_researcher/retrievers/searx/searx.py b/gpt_researcher/retrievers/searx/searx.py index 46bc10cb6..852bf3e33 100644 --- a/gpt_researcher/retrievers/searx/searx.py +++ b/gpt_researcher/retrievers/searx/searx.py @@ -1,45 +1,79 @@ -# Tavily API Retriever - -# libraries import os -from langchain_community.utilities import SearxSearchWrapper +import json +import requests +from typing import List, Dict +from urllib.parse import urljoin class SearxSearch(): """ - Tavily API Retriever + SearxNG API Retriever """ - def __init__(self, query): + def __init__(self, query: str): """ - Initializes the TavilySearch object + Initializes the SearxSearch object Args: - query: + query: Search query string """ self.query = query - self.api_key = self.get_api_key() + self.base_url = self.get_searxng_url() - def get_api_key(self): + def get_searxng_url(self) -> str: """ - Gets the Tavily API key + Gets the SearxNG instance URL from environment variables Returns: - + str: Base URL of SearxNG instance """ - # Get the API key try: - api_key = os.environ["SEARX_URL"] - except: - raise Exception("Searx URL key not found. Please set the SEARX_URL environment variable. " - "You can get your key from https://searx.space/") - return api_key + base_url = os.environ["SEARX_URL"] + if not base_url.endswith('/'): + base_url += '/' + return base_url + except KeyError: + raise Exception( + "SearxNG URL not found. Please set the SEARX_URL environment variable. " + "You can find public instances at https://searx.space/" + ) - def search(self, max_results=7): + def search(self, max_results: int = 7) -> List[Dict[str, str]]: """ - Searches the query + Searches the query using SearxNG API + Args: + max_results: Maximum number of results to return Returns: - + List of dictionaries containing search results """ - searx = SearxSearchWrapper(searx_host=os.environ["SEARX_URL"]) - results = searx.results(self.query, max_results) - # Normalizing results to match the format of the other search APIs - search_response = [{"href": obj["link"], "body": obj["snippet"]} for obj in results] - return search_response + search_url = urljoin(self.base_url, "search") + + params = { + 'q': self.query, + 'format': 'json', + 'pageno': 1, + 'categories': 'general', + 'engines': 'google,bing,duckduckgo', # TODO: Add environment variable to customize the engines + 'results': max_results + } + + try: + response = requests.get( + search_url, + params=params, + headers={'Accept': 'application/json'} + ) + response.raise_for_status() + results = response.json() + + # Normalize results to match the expected format + search_response = [] + for result in results.get('results', [])[:max_results]: + search_response.append({ + "href": result.get('url', ''), + "body": result.get('content', '') + }) + + return search_response + + except requests.exceptions.RequestException as e: + raise Exception(f"Error querying SearxNG: {str(e)}") + except json.JSONDecodeError: + raise Exception("Error parsing SearxNG response") From c54003dada9a2ba7c194729065cf3807a39a78c5 Mon Sep 17 00:00:00 2001 From: Ivaylo Gochkov Date: Mon, 28 Oct 2024 21:05:43 +0100 Subject: [PATCH 2/4] Simplified parameters --- gpt_researcher/retrievers/searx/searx.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gpt_researcher/retrievers/searx/searx.py b/gpt_researcher/retrievers/searx/searx.py index 852bf3e33..912ea27d0 100644 --- a/gpt_researcher/retrievers/searx/searx.py +++ b/gpt_researcher/retrievers/searx/searx.py @@ -35,7 +35,7 @@ def get_searxng_url(self) -> str: "You can find public instances at https://searx.space/" ) - def search(self, max_results: int = 7) -> List[Dict[str, str]]: + def search(self, max_results: int = 10) -> List[Dict[str, str]]: """ Searches the query using SearxNG API Args: @@ -46,12 +46,10 @@ def search(self, max_results: int = 7) -> List[Dict[str, str]]: search_url = urljoin(self.base_url, "search") params = { - 'q': self.query, - 'format': 'json', - 'pageno': 1, - 'categories': 'general', - 'engines': 'google,bing,duckduckgo', # TODO: Add environment variable to customize the engines - 'results': max_results + # The search query. + 'q': self.query, + # Output format of results. Format needs to be activated in searxng config. + 'format': 'json' } try: From e5df45ac0f7a3f333db0d63487a81eb574500357 Mon Sep 17 00:00:00 2001 From: Ivaylo Gochkov Date: Mon, 28 Oct 2024 22:09:34 +0100 Subject: [PATCH 3/4] test --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c7575b04f..626f683ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ markdown langchain langchain_community langchain-openai +langchain-ollama langgraph tiktoken gpt-researcher From cbb986b5a2b65fed8ee257722eb86a088c3b7ff1 Mon Sep 17 00:00:00 2001 From: Ivaylo Gochkov Date: Mon, 28 Oct 2024 22:19:10 +0100 Subject: [PATCH 4/4] fixed warnings --- Dockerfile | 7 ------- gpt_researcher/llm_provider/generic/base.py | 2 +- gpt_researcher/memory/embeddings.py | 2 +- gpt_researcher/scraper/arxiv/arxiv.py | 2 +- 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index ba4e29a32..9cb4fba85 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,13 +29,6 @@ RUN pip install --no-cache-dir -r requirements.txt && \ # Stage 3: Final stage with non-root user and app FROM gpt-researcher-install AS gpt-researcher -# Use environment variables for API keys (defaults can be overridden at runtime) -ARG OPENAI_API_KEY -ARG TAVILY_API_KEY - -ENV OPENAI_API_KEY=${OPENAI_API_KEY} -ENV TAVILY_API_KEY=${TAVILY_API_KEY} - # Create a non-root user for security RUN useradd -ms /bin/bash gpt-researcher && \ chown -R gpt-researcher:gpt-researcher /usr/src/app diff --git a/gpt_researcher/llm_provider/generic/base.py b/gpt_researcher/llm_provider/generic/base.py index 7e64a8254..e7c8e2814 100644 --- a/gpt_researcher/llm_provider/generic/base.py +++ b/gpt_researcher/llm_provider/generic/base.py @@ -68,7 +68,7 @@ def from_provider(cls, provider: str, **kwargs: Any): llm = ChatFireworks(**kwargs) elif provider == "ollama": _check_pkg("langchain_community") - from langchain_community.chat_models import ChatOllama + from langchain_ollama import ChatOllama llm = ChatOllama(base_url=os.environ["OLLAMA_BASE_URL"], **kwargs) elif provider == "together": diff --git a/gpt_researcher/memory/embeddings.py b/gpt_researcher/memory/embeddings.py index 0e917ed4e..5bfcc8cb5 100644 --- a/gpt_researcher/memory/embeddings.py +++ b/gpt_researcher/memory/embeddings.py @@ -17,7 +17,7 @@ def __init__(self, embedding_provider: str, model: str, **embdding_kwargs: Any): _embeddings = None match embedding_provider: case "ollama": - from langchain_community.embeddings import OllamaEmbeddings + from langchain_ollama import OllamaEmbeddings _embeddings = OllamaEmbeddings( model=model, diff --git a/gpt_researcher/scraper/arxiv/arxiv.py b/gpt_researcher/scraper/arxiv/arxiv.py index e3c902830..2af550417 100644 --- a/gpt_researcher/scraper/arxiv/arxiv.py +++ b/gpt_researcher/scraper/arxiv/arxiv.py @@ -18,5 +18,5 @@ def scrape(self): """ query = self.link.split("/")[-1] retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) - docs = retriever.get_relevant_documents(query=query) + docs = retriever.invoke(query=query) return docs[0].page_content