competitor_finder_and_crawl.py
import json
import os
from typing import List

import requests
from dotenv import load_dotenv
from exa_py import Exa
from phi.agent import Agent
from phi.model.groq import Groq
from phi.tools.tavily import TavilyTools
from pydantic import BaseModel, Field

load_dotenv()
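# All API keys are read from a local .env file via load_dotenv(). A minimal
# sketch of that file (EXA_API_KEY and JINA_API_KEY are the names the code
# below reads; GROQ_API_KEY and TAVILY_API_KEY are assumed names introduced
# here to avoid hardcoding secrets in the script):
#
#   GROQ_API_KEY=gsk_...
#   TAVILY_API_KEY=tvly-...
#   EXA_API_KEY=...
#   JINA_API_KEY=...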
# Groq models (for reference):
# llama-3.1-70b-versatile
# llama-3.3-70b-versatile
# llama-3.2-11b-vision-preview
# llama-3.2-90b-vision-preview
# llama-3.3-70b-specdec
user_startup = "100xEngineering" # Replace with the actual user startup name
user_startup_website = "https://www.100xengineers.com/" # Replace with the actual startup website
def get_data_folder(user_startup: str) -> str:
    data_folder = f'data_{user_startup}'
    os.makedirs(data_folder, exist_ok=True)
    return data_folder
data_folder = get_data_folder(user_startup)
# ================================================================
class CompetitorList(BaseModel):
    competitor: List[str] = Field(..., description="List of all the competitors of the startup.")
    competitor_website: List[str] = Field(..., description="List of the corresponding competitor websites.")
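# Illustrative shape of the structured output the agent is asked to produce
# (hypothetical values, not real agent output):
#   CompetitorList(
#       competitor=["Competitor A", "Competitor B", "Competitor C"],
#       competitor_website=["https://a.example", "https://b.example", "https://c.example"],
#   )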
# Initialize the agent with the specified instructions
company_analysis_agent = Agent(
    model=Groq(id="llama-3.3-70b-versatile", api_key=os.getenv("GROQ_API_KEY")),
    description=f'You are a business analyst analyzing the company {user_startup}. Explain in detail.',
    instructions=[f'Identify and find the websites of competitors of {user_startup} using Tracxn, a business intelligence platform. Select only the 3 most direct competitors.'],
    tools=[TavilyTools(
        api_key=os.getenv("TAVILY_API_KEY"),
        include_answer=False,
        search_depth='basic',
    )],
    response_model=CompetitorList,  # drives structured (JSON) parsing of the reply
)
# Run the agent once and capture the structured response
response = company_analysis_agent.run(f"Find the top 3 direct competitors of {user_startup} and their websites.")
# Parse the response - it should be a CompetitorList instance
if isinstance(response.content, str):
    print("Unexpected string response:", response.content)
    raise ValueError("Agent returned string instead of structured data")
# Access the competitor data
competitor_list = response.content
print("Competitor Names:", competitor_list.competitor)
print("Competitor Websites:", competitor_list.competitor_website)
# ================================================================
# Copy the competitor lists so appending the user startup does not mutate the response
company_names = competitor_list.competitor[:] # Create a copy of the list
company_websites = competitor_list.competitor_website[:] # Create a copy of the list
# Add the user startup to the lists
company_names.append(user_startup)
company_websites.append(user_startup_website)
# Create a dictionary of companies and their websites
companies = dict(zip(company_names, company_websites))
print("Companies:", companies)
print("Company Names:", company_names)
print("Company Websites:", company_websites)
save_path = os.path.join(data_folder, 'companies.json')
with open(save_path, 'w') as f:
    json.dump(companies, f)
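# Illustrative contents of companies.json (competitor entries are
# hypothetical; the user startup is always appended last):
#   {"Competitor A": "https://a.example", "Competitor B": "https://b.example",
#    "Competitor C": "https://c.example", "100xEngineering": "https://www.100xengineers.com/"}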
exa = Exa(api_key=os.getenv('EXA_API_KEY'))
headers = {
    'Authorization': f"Bearer {os.getenv('JINA_API_KEY')}",
    'X-Return-Format': 'screenshot'
}
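# r.jina.ai is the Jina Reader endpoint: prefixing a page URL fetches a
# rendered version of that page, and the 'X-Return-Format: screenshot'
# header requests a screenshot rendering. The loop below assumes the
# response body is the raw image bytes and writes it straight to a PNG.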
# Fetch and save a screenshot for each company, with basic error handling
for company, website in companies.items():
    try:
        url = 'https://r.jina.ai/' + website
        print(f"Processing screenshot for {company} with URL: {url}")
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        save_path = os.path.join(data_folder, f'{company}_ss.png')
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Screenshot saved for {company} at {save_path}")
    except Exception as e:
        print(f"Error processing screenshot for {company}: {str(e)}")
# Exa crawl with error handling (left disabled)
# for company, website in companies.items():
#     try:
#         print(f"Processing crawl for {company} with URL: {website}")
#         result = exa.get_contents(
#             [website],
#             text=True,
#             subpages=10,
#             subpage_target=["about", "product", "pricing", "Teams", "Individual",
#                             "docs", "company", "Who We Are", "contact us",
#                             "resources", "community", "Investor Relations"]
#         )
#         # Save crawl results to text file
#         crawl_file_path = os.path.join(data_folder, f'{company}_crawl.txt')
#         with open(crawl_file_path, 'w', encoding='utf-8') as f:
#             f.write(f"Crawl results for {company} ({website}):\n\n")
#             f.write(str(result))
#         print(f"Crawl result saved to {crawl_file_path}")
#     except Exception as e:
#         print(f"Error crawling {company}: {str(e)}")
# ================================================================