google_search.py
import serpapi
from utils import load_file, save_file_jsonl

serpapi_api_key = "<Your SerpApi API key>"


def call_search_engine(query):
    """Query the SerpApi Google Search engine and return the raw response."""
    params = {
        "q": query,
        "engine": "google",  # Use the Google Search API engine
        # "location": "California, United States",
        "hl": "en",  # Language to use for the Google search
        "gl": "au",  # Country to use for the Google search
        "google_domain": "google.com",  # Google domain to use
        "api_key": serpapi_api_key,
    }
    results = serpapi.search(**params)
    return results


def parse_google_research_results(results, retrieved_num=5):
    """Extract title/snippet pairs from a SerpApi Google search response."""
    retrieved_docs = []
    # Answer box has higher priority
    if "answer_box" in results:
        parsed_item = {}
        answer_box = results["answer_box"]
        if "title" in answer_box:
            parsed_item["title"] = answer_box["title"]
        if "snippet" in answer_box:
            parsed_item["text"] = answer_box["snippet"]
        print(f"answer_box: {parsed_item}")
        retrieved_docs.append(parsed_item)
    if "organic_results" in results:
        items = results["organic_results"]
        print(f"# organic_results: {len(items)}")
        if len(items) < retrieved_num:
            retrieved_num = len(items)
        for item in items[:retrieved_num]:
            parsed_item = {}
            if "title" in item:
                parsed_item["title"] = item["title"]
            # if "snippet_highlighted_words" in item:
            #     highlights = item["snippet_highlighted_words"]
            if "snippet" in item:
                parsed_item["text"] = item["snippet"]
            # if "link" in item:
            #     link = item["link"]
            retrieved_docs.append(parsed_item)
    return retrieved_docs


def main():
    """Attach Google search results as context to realtimeqa/freshqa questions."""
    input_file_path = "./data/retrievalqa.jsonl"

    # load input data
    input_data = load_file(input_file_path)
    print(f"input data: {input_file_path}, #: {len(input_data)}")
    print(input_data[0])

    # only using Google search for questions from realtimeqa and freshqa
    data_sources = ["realtimeqa", "freshqa"]
    query_count = 0
    for item in input_data:
        if item["data_source"] not in data_sources:
            continue
        query_count += 1
        query = item["question"]
        results = call_search_engine(query)
        retrieved_docs = parse_google_research_results(results, retrieved_num=5)
        item["context"] = retrieved_docs
        print(f"query: {query}")
        print(f"source: {item['data_source']}, # context: {len(item['context'])}")
        print(item["context"])
    print(f"total query times: {query_count}")

    # sanity check: every item should end up with a non-empty context
    count_of_empty_context = sum(1 for item in input_data if len(item["context"]) == 0)
    assert count_of_empty_context == 0

    save_file_jsonl(input_data, "./data/retrieved_docs.jsonl")


if __name__ == "__main__":
    main()
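

The helpers load_file and save_file_jsonl are imported from the repo's utils module, which is not shown on this page. Below is a minimal sketch of what they are assumed to do with JSONL files; the actual implementations in utils.py may differ.

import json

def load_file(path):
    # Read a JSON Lines file into a list of dicts, one record per line.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def save_file_jsonl(data, path):
    # Write a list of dicts back out as JSON Lines, one record per line.
    with open(path, "w", encoding="utf-8") as f:
        for record in data:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")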