# octopus_internet_research_agent.py
import os
# Pinned third-party packages. They are installed with pip at start-up, before
# the third-party imports that follow, so the order of these statements matters.
# NOTE(review): presumably the hosting platform may also read this list from
# the source text - confirm before changing how the literal is written.
dependencies = [
    'pip install -q beautifulsoup4==4.12.3',
    'pip install -q Flask==3.1.0',
    'pip install -q openai==1.60.2',
    'pip install -q requests==2.32.3',
]

# Run every install command through the shell, one after another. The exit
# status returned by os.system is not inspected.
for command in dependencies:
    os.system(command)
import json, re, requests, textwrap, time
from bs4 import BeautifulSoup
from flask import Flask, jsonify, request
from openai import AzureOpenAI, OpenAI
# Plugin metadata as a JSON document: a device map, the required Python build
# tag, the chat model name used by every completion call in this file, and two
# function descriptors whose "name" values match the Flask routes
# /v1/internet_research_agent and /v1/internet_research_urls defined below.
# NOTE(review): presumably the hosting platform also reads this literal from
# the source text - confirm before reformatting the string.
config_str = '''{
"device_map": {
"cpu": "15GiB"
},
"required_python_version": "cp311",
"models": {
"model": "gpt-4o-mini-2024-07-18"
},
"functions": [
{
"name": "internet_research_agent",
"display_name": "Internet research agent",
"description": "The Internet Research Agent is a robust tool designed to conduct comprehensive online research by automatically generating and executing detailed search queries based on a user’s input. It efficiently gathers, filters, and analyzes information from various web sources, providing a structured and thorough response. This function is particularly valuable for tasks that require deep dives into specific topics, such as comparing products, identifying the latest industry research, or exploring market trends. This function is triggered when the user explicitly requests detailed internet research. Examples include: (1) comparing features of different industrial printers, (2) finding the latest studies on coating technologies, or (3) gathering market trends in the printing industry. It should not be triggered for tasks such as (1) retrieving simple facts, (2) answering general questions, or (3) handling requests that do not involve web-based information gathering.",
"parameters": {
"type": "object",
"properties": {
"full_prompt": { "type": "string", "description": "Full prompt created by the user" }
},
"required": ["full_prompt"]
},
"input_type": "application/json",
"return_type": "application/json"
},
{
"name": "internet_research_urls",
"display_name": "Internet research urls",
"description": "The Internet Research URLs function is a powerful tool that enables you to perform specific, user-defined actions on designated web pages. It allows you to input URLs along with precise tasks, such as extracting data, summarizing content, or conducting detailed analyses directly from those sites. This function is triggered when you need to perform multiple, detailed operations across different web resources, such as {extracting insights from} {url1}, {summarizing key points from} {url2}, and {analyzing market trends in} {url3}. Examples include: (1) extracting information from specific research articles on coating technologies, (2) summarizing content from multiple blog posts about industrial printing, or (3) performing an in-depth analysis of market data from provided links. This function should not be triggered when (1) broad internet research is required without specific URLs, (2) the task involves simple fact-checking or general information retrieval.",
"parameters": {
"type": "object",
"properties": {
"full_prompt_with_urls": { "type": "string", "description": "Full prompt created by the user with urls" }
},
"required": ["full_prompt_with_urls"]
},
"input_type": "application/json",
"return_type": "application/json"
}
]}'''
# Parsed form of the metadata; the functions below read config["models"]["model"].
config = json.loads(config_str)
# Flask application that exposes the /v1/... routes defined further down.
app = Flask(__name__)

# Azure OpenAI settings, all taken from the environment.
AZURE_OPENAI_API_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_DEPLOYMENT_ID = os.getenv('AZURE_OPENAI_DEPLOYMENT_ID')
AZURE_OPENAI_URL = os.getenv('AZURE_OPENAI_URL')

# Parse the on/off flag defensively. An unset variable means "disabled"
# (calling .lower() on None used to crash at start-up), and values that are
# not valid JSON (for example "yes") no longer raise. "1" and "true" are
# accepted in any letter case, as before; "yes" and "on" are accepted as well.
_azure_flag = (os.getenv('AZURE_OPENAI_ENABLED') or "").strip().lower()
azure_enabled = _azure_flag in ("1", "true", "yes", "on")
# Keep the module-level name, normalized to the "True"/"False" spelling the
# original code produced for the "1"/"0" inputs.
AZURE_OPENAI_ENABLED = "True" if azure_enabled else "False"

print("azure_enabled")
print(azure_enabled)

# Shared chat-completions client used by every step function in this file.
if azure_enabled:
    client = AzureOpenAI(
        api_key=AZURE_OPENAI_API_KEY,
        api_version="2024-05-01-preview",
        azure_endpoint=AZURE_OPENAI_URL,
        azure_deployment=AZURE_OPENAI_DEPLOYMENT_ID,
    )
else:
    # No arguments are passed here; credentials are left to the OpenAI
    # client's own configuration.
    client = OpenAI()
def ira_step1(prompt: str, *, max_attempts: int = 5, retry_delay: float = 2.0) -> str:
    """Ask the chat model for search-engine queries that fit *prompt*.

    The model is told to answer with the search input only and may return
    several queries separated by ``;``.

    Args:
        prompt: The user's research request.
        max_attempts: How many times the completion call is tried before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The message content of the first completion choice.

    Raises:
        Exception: Whatever the last failed completion call raised, once all
            attempts are used up.
    """
    print("ira_step1")
    print("prompt")
    print(prompt)
    # Built once, outside the retry loop: a malformed prompt is a caller error
    # and must surface immediately instead of being retried like an API fault.
    content = str("I want to check the internet for the following thing. What is the best google search input to get the best results? Give me the google search input, nothing else. If user has a complex research task for example compare things, you can suggest a few different prompts separated with semicolon ; character to cover different information sources. Here is the thing: " + prompt)
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # The original retried forever without a pause; permanent failures
            # (bad credentials, oversized input) then hung the request.
            print("some OpenAI problem")
            print(error)
            if attempt == attempts:
                raise
            time.sleep(retry_delay)
def ira_step2(prompt: str, website_infos: list[dict], *, max_attempts: int = 5, retry_delay: float = 2.0) -> str:
    """Ask the chat model for a plan describing how to answer *prompt*.

    Args:
        prompt: The user's research request (the "subject of interest").
        website_infos: Accepted for interface compatibility; it is not
            included in the request sent to the model.
        max_attempts: How many times the completion call is tried before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The message content of the first completion choice.

    Raises:
        Exception: Whatever the last failed completion call raised, once all
            attempts are used up.
    """
    print("ira_step2")
    content = str("I will give you summaries of homepages that eventually provide useful information for the subject of interest: \"" + prompt + "\" Make a plan how to come to a good answer to the subject of interest. ")
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # Bounded retry with a pause instead of an endless tight loop.
            print("some OpenAI problem")
            print(error)
            if attempt == attempts:
                raise
            time.sleep(retry_delay)
def ira_step2scrape(prompt: str, *, request_timeout: float = 120.0) -> list[dict]:
    """Run the search queries in *prompt* and scrape every result link.

    *prompt* is split on ``;``. Each non-empty query is sent to the local
    scraper-search-service, which is expected to answer with a JSON list of
    links. Every distinct link is then fetched through the local
    scraper-service and reduced to plain text.

    Args:
        prompt: One or more search queries separated by ``;``.
        request_timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        One dict per successfully scraped link with the keys ``link`` (the
        result link), ``text`` (page text with newlines removed and tabs
        replaced by spaces) and ``url`` (the scraper-service URL requested).
    """
    # Local import: urlencode is not among the file-level imports.
    from urllib.parse import urlencode

    print("ira_step2scrape")
    search_endpoint = "http://localhost:8080/api/v1/scraper-search-service"
    scrape_endpoint = "http://localhost:8080/api/v1/scraper-service"
    max_tries = 2
    links = []
    website_infos = []

    for raw_query in prompt.split(";"):
        # Queries separated by "; " carry a leading blank; empty pieces (for
        # example after a trailing ";") are not worth a request.
        query = raw_query.strip()
        if not query:
            continue
        # urlencode writes spaces as "+", like the original replace(" ", "+"),
        # and also escapes characters such as "&" and "#" that would
        # otherwise cut the query short.
        url = search_endpoint + "?" + urlencode({"prompt": query})
        for attempt in range(max_tries):
            try:
                print("request")
                print("url")
                print(url)
                html_result = requests.get(url, timeout=request_timeout)
                links_json = html_result.json()
                print(type(links_json))
                if not isinstance(links_json, list):
                    raise ValueError('Wrong data!')
                for link_json in links_json:
                    # Only strings can be appended to the scrape URL below.
                    if isinstance(link_json, str) and link_json not in links:
                        links.append(link_json)
                break
            except Exception as e:
                print(f"Error while gettting googlesearchresults:\n{e}")
                if attempt == max_tries - 1:
                    print(f"Warning: Googlesearch found no websites for prompt:\n{query}")
                else:
                    # Pause only when another attempt actually follows.
                    print("sleeping")
                    time.sleep(5)

    print("links")
    print(links)
    for link in links:
        # The link travels as a query parameter, so it has to be escaped; an
        # unescaped "&" or "#" inside it would truncate the target URL.
        url = scrape_endpoint + "?" + urlencode({"url": link})
        print(url)
        try:
            html_result = requests.get(url, timeout=request_timeout)
        except requests.RequestException as error:
            # One unreachable page must not discard the pages already scraped.
            print(f"Warning: could not scrape {link}:\n{error}")
            continue
        soup = BeautifulSoup(html_result.text, "html.parser")
        text = soup.get_text().replace("\n", "")
        text = text.replace("\t", " ")
        website_infos.append({
            "link": link,
            "text": text,
            "url": url,
        })
    return website_infos
def ira_step3(prompt: str, website_infos: list[dict]) -> list[dict]:
    """Have the chat model write a report on each scraped page for *prompt*.

    Pages are processed independently and on a best-effort basis: when the
    request for one page fails, the error is printed and that page is left
    out of the result.

    Args:
        prompt: The user's research request (the "subject of interest").
        website_infos: Dicts with the keys ``link``, ``text`` and ``url``.

    Returns:
        For every page that was processed successfully, a dict with the input
        keys plus ``summary`` (the message content of the first choice).
    """
    print("ira_step3")
    result_website_infos = []
    for website_info in website_infos:
        try:
            # NOTE(review): the accessibility notice that follows "Homepage:"
            # looks like pasted website boilerplate - confirm it is intended.
            # It is reproduced unchanged.
            content = str("I will give you a subject of interest and the text of a homepage and you filter out marketing claims and spam. Make me detailed report of all quantitative or qualitative information that are useful for the subject of interest or to answer the question in the subject. don't get distracted from the subject. Only use the information from the provided homepage. When the homepage is marketing or spam, mark it clearly in the report, then this information is not very helpful. Just give me the detailed report, nothing else. Subject of interest: " + prompt + " Homepage: Please note: This website includes an accessibility system. Press Control-F11 to adjust the website to the visually impaired who are using a screen reader; Press Control-F10 to open an accessibility menu Accessibility. " + website_info["text"])
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            summary = chat_completion.choices[0].message.content
            result_website_infos.append({
                "link": website_info["link"],
                "text": website_info["text"],
                "summary": summary,
                "url": website_info["url"],
            })
        except Exception as error:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; the cause is printed for diagnosis.
            print("some OpenAI problem")
            print(error)
    print("result_website_infos")
    print(result_website_infos)
    return result_website_infos
def ira_step4(prompt: str, strategy: str, website_infos: list[dict], *, max_attempts: int = 5, retry_delay: float = 2.0) -> str:
    """Produce the final answer from the strategy and the page summaries.

    Args:
        prompt: The user's research request (the "subject of interest").
        strategy: The plan text to place under the "Strategy:" heading.
        website_infos: Dicts providing at least ``link`` and ``summary``.
        max_attempts: How many times the completion call is tried before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The message content of the first completion choice.

    Raises:
        Exception: Whatever the last failed completion call raised, once all
            attempts are used up.
    """
    print("ira_step4")
    parts = [
        "I give you a subject of interest, a strategy and several source of information. \n\nGive me an optimal answer to the subject of interest without relativizing by using the provided source of information with academic footnotes like [1] followed by the corresponding URL. Just give me the conclusion and make the answer very clear and simple without referring the strategy again. Don't explain yourself. \n\nSubject of interest:\n " + prompt + "\n\nStrategy:\n",
        # The original announced a strategy but never inserted it, leaving the
        # "Strategy:" heading empty and the parameter unused.
        strategy or "",
    ]
    for index, website_info in enumerate(website_infos, start=1):
        link = website_info["link"]
        summary = website_info["summary"] or ""
        parts.append(f" \n\n\n\nURL{index} \n{link} \n\nSummary of URL{index} \n{summary}")
    content = "".join(parts)
    print("content")
    print(content)
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "Reply in the language the user question is asked. Make sure you provide links in your response. Try to create a very long report.",
                    },
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # Bounded retry with a pause instead of an endless tight loop.
            print("some OpenAI problem")
            print(error)
            if attempt == attempts:
                raise
            time.sleep(retry_delay)
def _parse_iru_tasks(reply: str) -> list[dict]:
    """Decode the model reply into a list of ``{what_to_do, url}`` dicts."""
    # Take the contents of the first Markdown code fence when there is one,
    # wherever it sits in the reply. str.lstrip("```json") removes a *set of
    # characters*, not a prefix, and did nothing when text preceded the fence.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*(?:```|\Z)", reply, re.DOTALL | re.IGNORECASE)
    payload = fenced.group(1) if fenced else reply
    tasks = json.loads(payload.strip())
    if isinstance(tasks, dict):
        # A single task returned as a bare object instead of a one-item array.
        tasks = [tasks]
    well_formed = isinstance(tasks, list) and all(
        isinstance(task, dict) and "url" in task and "what_to_do" in task
        for task in tasks
    )
    if not well_formed:
        raise ValueError("model reply is not a list of objects with keys what_to_do and url")
    return tasks


def iru_step1(prompt: str, *, max_attempts: int = 5, retry_delay: float = 2.0) -> list[dict]:
    """Turn a free-text request into a list of per-URL tasks via the chat model.

    Args:
        prompt: User text naming URLs and what to do with each of them.
        max_attempts: How many times the request-and-parse cycle is tried
            before the last error is re-raised (values below 1 are treated
            as 1).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A list of dicts, each holding at least ``what_to_do`` and ``url``.

    Raises:
        Exception: The last error (API failure, invalid JSON or unexpected
            shape) once all attempts are used up.
    """
    print("iru_step1")
    content = str("User provides in prompt what he want to do with certain website and url. Return this data as formated json of array of objects with keys: what_to_do, url. Here is the prompt: " + prompt)
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            # Parsing stays inside the try: a malformed reply is worth a new
            # request because the model output varies between calls.
            return _parse_iru_tasks(chat_completion.choices[0].message.content)
        except Exception as error:
            print("some OpenAI problem")
            print(error)
            if attempt == attempts:
                raise
            time.sleep(retry_delay)
def iru_step2scrape(websites: list[dict], *, request_timeout: float = 120.0) -> list[dict]:
    """Fetch the text of every task URL through the local scraper-service.

    Args:
        websites: Task dicts with the keys ``url`` and ``what_to_do``.
        request_timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        One dict per successfully scraped task with the keys ``link`` (the
        task URL), ``text`` (page text with newlines removed and tabs replaced
        by spaces), ``url`` (the scraper-service URL requested) and
        ``what_to_do``. Malformed tasks and failed requests are reported and
        skipped.
    """
    # Local import: urlencode is not among the file-level imports.
    from urllib.parse import urlencode

    print("iru_step2scrape")
    scrape_endpoint = "http://localhost:8080/api/v1/scraper-service"
    website_infos = []
    print("websites")
    print(websites)
    for website in websites:
        link = website.get("url") if isinstance(website, dict) else None
        if not isinstance(link, str) or not link:
            # The task list is model output; one bad entry should not abort
            # the remaining tasks.
            print(f"Warning: skipping task without a usable url:\n{website}")
            continue
        # The link travels as a query parameter, so it has to be escaped; an
        # unescaped "&" or "#" inside it would truncate the target URL.
        url = scrape_endpoint + "?" + urlencode({"url": link})
        print(url)
        try:
            html_result = requests.get(url, timeout=request_timeout)
        except requests.RequestException as error:
            print(f"Warning: could not scrape {link}:\n{error}")
            continue
        soup = BeautifulSoup(html_result.text, "html.parser")
        text = soup.get_text().replace("\n", "")
        text = text.replace("\t", " ")
        website_infos.append({
            "link": link,
            "text": text,
            "url": url,
            # Later steps concatenate this value into a prompt string.
            "what_to_do": str(website.get("what_to_do", "")),
        })
    return website_infos
def iru_step3(website_infos: list[dict]) -> list[dict]:
    """Run each page's user command against its text with the chat model.

    Pages are processed independently and on a best-effort basis: when the
    request for one page fails, the error is printed and that page is left
    out of the result.

    Args:
        website_infos: Dicts with the keys ``link``, ``text``, ``url`` and
            ``what_to_do``.

    Returns:
        For every page that was processed successfully, a dict with the input
        keys plus ``result`` (the message content of the first choice).
    """
    print("iru_step3")
    result_website_infos = []
    for website_info in website_infos:
        try:
            content = str("I will give you a user command and website text and you will provide the answer. User command: " + website_info["what_to_do"] + " Website text: " + website_info["text"])
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            result = chat_completion.choices[0].message.content
            result_website_infos.append({
                "link": website_info["link"],
                "text": website_info["text"],
                "result": result,
                "url": website_info["url"],
                "what_to_do": website_info["what_to_do"],
            })
        except Exception as error:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; the cause is printed for diagnosis.
            print("some OpenAI problem")
            print(error)
    print("result_website_infos")
    print(result_website_infos)
    return result_website_infos
def iru_step4(website_infos: list[dict], *, max_attempts: int = 5, retry_delay: float = 2.0) -> str:
    """Combine all per-URL results into one long report via the chat model.

    Args:
        website_infos: Dicts providing ``link``, ``text``, ``what_to_do`` and
            ``result``.
        max_attempts: How many times the completion call is tried before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The message content of the first completion choice.

    Raises:
        Exception: Whatever the last failed completion call raised, once all
            attempts are used up.
    """
    print("iru_step4")
    parts = ["I give you a list of websites. Website link, website text, user command and result of user command from previous GPT call. Return a long report."]
    for index, website_info in enumerate(website_infos, start=1):
        link = website_info["link"]
        text = website_info["text"]
        command = website_info["what_to_do"]
        previous = website_info["result"] or ""
        parts.append(
            f" \n\n\n\nURL{index} \n{link}"
            f" \n\nText of URL{index} \n{text}"
            f" \n\nUser command of URL{index} \n{command}"
            f" \n\nPrevious GPT call result of URL{index} \n{previous}"
        )
    content = "".join(parts)
    print("content")
    print(content)
    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "Reply in the language the user question is asked. Make sure you provide links in your response. Try to create a very long report.",
                    },
                    {
                        "role": "user",
                        "content": content,
                    }
                ],
                model=config["models"]["model"],
            )
            return chat_completion.choices[0].message.content
        except Exception as error:
            # The full page texts are part of the request, so a permanent
            # failure such as an oversized input is plausible here; retrying
            # it forever would hang the request.
            print("some OpenAI problem")
            print(error)
            if attempt == attempts:
                raise
            time.sleep(retry_delay)
@app.route('/v1/internet_research_agent', methods=['POST'])
def internet_research_agent():
    """Handle POST /v1/internet_research_agent.

    Reads ``full_prompt`` from the JSON body (empty string when absent), runs
    the four research steps in order and returns the final text under the
    ``response`` key with HTTP status 201.
    """
    print("internet_research_agent")
    payload = request.json
    prompt = payload.get("full_prompt", "")
    print("prompt")
    print(prompt)

    # Step 1: derive the search queries.
    search_queries = ira_step1(prompt)
    print("ira_step1result")
    print(search_queries)

    # Step 2: scrape the search results, then ask for an answering plan.
    scraped_pages = ira_step2scrape(search_queries)
    plan = ira_step2(prompt, scraped_pages)

    # Steps 3 and 4: summarize each page, then write the final answer.
    summarized_pages = ira_step3(prompt, scraped_pages)
    final_answer = ira_step4(prompt, plan, summarized_pages)

    return jsonify({"response": final_answer}), 201
@app.route('/v1/internet_research_urls', methods=['POST'])
def internet_research_urls():
    """Handle POST /v1/internet_research_urls.

    Reads ``full_prompt_with_urls`` from the JSON body (empty string when
    absent), runs the four URL-research steps in order and returns the final
    text under the ``response`` key with HTTP status 201.
    """
    print("internet_research_urls")
    payload = request.json
    prompt = payload.get("full_prompt_with_urls", "")

    # Extract the tasks, scrape their pages, run each command, then report.
    tasks = iru_step1(prompt)
    scraped_pages = iru_step2scrape(tasks)
    answered_pages = iru_step3(scraped_pages)
    report = iru_step4(answered_pages)

    return jsonify({"response": report}), 201
@app.route("/v1/setup", methods=["POST"])
def setup():
    """Handle POST /v1/setup: does no work and reports ``Performed`` with status 201."""
    return jsonify({"setup": "Performed"}), 201
# Start the Flask server on all network interfaces with threaded request
# handling. No port is passed, so Flask's default applies.
app.run(host="0.0.0.0", threaded=True)