Skip to content

Commit

Permalink
Deduplicate online references returned by chat API to clients
Browse files Browse the repository at this point in the history
This will ensure only unique online references are shown in all
clients.

The duplication issue was exacerbated in research mode as even with
different online search queries, you can get previously seen results.

This change does a global deduplication across all online results seen
across research iterations before returning them in the client response.
  • Loading branch information
debanjum committed Nov 11, 2024
1 parent 137687e commit 7468f6a
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
22 changes: 22 additions & 0 deletions src/khoj/processor/tools/online_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,3 +367,25 @@ async def search_with_jina(query: str, location: LocationData) -> Tuple[str, Dic
for item in response_json["data"]
]
return query, {"organic": parsed_response}


def deduplicate_organic_results(online_results: dict) -> dict:
    """Deduplicate organic search results by link across all queries.

    Queries are walked in the mapping's iteration order and only the first
    occurrence of each link is kept, so a link already returned for an
    earlier query is removed from every later query's results.

    Args:
        online_results: Mapping of search query to that query's results.
            Each value is expected to be a dict that may hold a list of
            result dicts under the "organic" key.

    Returns:
        A new dict with the same queries. Each value is a shallow copy of
        the original results whose "organic" list holds only first-seen
        links. Organic results without a truthy "link" are dropped, and
        every entry gets an "organic" key even if the input had none. The
        input mapping and its lists are not mutated.
    """
    # Links already emitted for any earlier query (or earlier in this one)
    seen_links: set = set()
    deduplicated_results = {}

    for query, results in online_results.items():
        filtered_organic = []
        # `or []` also covers an explicit `"organic": None`; the `.get`
        # default alone only applies when the key is missing entirely.
        for result in results.get("organic") or []:
            link = result.get("link")
            if link and link not in seen_links:
                seen_links.add(link)
                filtered_organic.append(result)

        # Preserve the non-organic fields, swapping in the filtered list
        deduplicated_results[query] = {**results, "organic": filtered_organic}

    return deduplicated_results
9 changes: 7 additions & 2 deletions src/khoj/routers/api_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@
from khoj.processor.conversation.utils import defilter_query, save_to_conversation_log
from khoj.processor.image.generate import text_to_image
from khoj.processor.speech.text_to_speech import generate_text_to_speech
from khoj.processor.tools.online_search import read_webpages, search_online
from khoj.processor.tools.online_search import (
deduplicate_organic_results,
read_webpages,
search_online,
)
from khoj.processor.tools.run_code import run_code
from khoj.routers.api import extract_references_and_questions
from khoj.routers.email import send_query_feedback
Expand Down Expand Up @@ -1026,12 +1030,13 @@ def collect_telemetry():
)

## Send Gathered References
unique_online_results = deduplicate_organic_results(online_results)
async for result in send_event(
ChatEvent.REFERENCES,
{
"inferredQueries": inferred_queries,
"context": compiled_references,
"onlineContext": online_results,
"onlineContext": unique_online_results,
"codeContext": code_results,
},
):
Expand Down

0 comments on commit 7468f6a

Please sign in to comment.