match_software.py

import psycopg2
import requests
from bs4 import BeautifulSoup
import pandas as pd


def connect_to_db():
    """Open a connection to the local Postgres database."""
    return psycopg2.connect(
        dbname='postgres',
        user='YOUR USER NAME',
        password='YOUR PASSWORD',
        host='localhost',
        port='5432'
    )


def get_software_keywords():
    """Load keyword -> software provider pairs from the softwares table."""
    conn = connect_to_db()
    cur = conn.cursor()
    # Get keywords and corresponding software providers
    cur.execute("SELECT keyword, software_provider FROM softwares")
    keyword_data = cur.fetchall()
    cur.close()
    conn.close()
    # Map lowercased keywords to providers for case-insensitive matching
    keyword_dict = {keyword.lower(): provider for keyword, provider in keyword_data}
    return keyword_dict


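# The softwares table is assumed to look roughly like this (hypothetical
# schema, not defined anywhere in this script):
#
#   CREATE TABLE softwares (
#       keyword           TEXT,  -- e.g. 'wp-content'
#       software_provider TEXT   -- e.g. 'WordPress'
#   );

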
def analyze_website(url, keyword_dict):
    """Fetch a page and return the first matching software provider, or None."""
    try:
        # Add scheme if not present
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        # Fetch the page with a browser-like User-Agent to avoid trivial blocks
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Search both the visible text and the raw HTML source
        page_text = soup.get_text().lower()
        page_source = response.text.lower()
        # Keys in keyword_dict are already lowercased by get_software_keywords()
        for keyword, provider in keyword_dict.items():
            if keyword in page_text or keyword in page_source:
                return provider
        return None
    except Exception as e:
        print(f"Error analyzing {url}: {e}")
        return None


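# Minimal standalone usage sketch (hypothetical keywords, not pulled from the
# database), handy for testing the matcher without a Postgres connection:
#
#   keyword_dict = {'wp-content': 'WordPress', 'cdn.shopify.com': 'Shopify'}
#   print(analyze_website('example.com', keyword_dict))

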
def main():
    conn = None
    cur = None
    try:
        conn = connect_to_db()
        cur = conn.cursor()
        # Get software keywords
        keyword_dict = get_software_keywords()
        print(f"Loaded {len(keyword_dict)} keywords to search for")
        # Get the first 5 links for testing
        cur.execute("""
            SELECT url, link
            FROM website_scraping
            WHERE link IS NOT NULL
              AND length(link) > 0
            LIMIT 5
        """)
        links = cur.fetchall()
        print(f"\nAnalyzing {len(links)} websites...")
        results = []
        for url, link in links:
            print(f"\nProcessing: {url}")
            software = analyze_website(link, keyword_dict)
            if software:
                print(f"✓ Found software: {software}")
                # Record the match in the database
                cur.execute("""
                    UPDATE website_scraping
                    SET software = %s
                    WHERE url = %s
                """, (software, url))
                conn.commit()
                results.append({
                    'url': url,
                    'link': link,
                    'software': software
                })
            else:
                print("✗ No matching software found")
        # Print summary
        print("\n=== Results ===")
        if results:
            df = pd.DataFrame(results)
            print("\nMatched Software:")
            print(df.to_string(index=False))
        else:
            print("No software matches found in the test sample")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Close the cursor and connection even if an error occurred
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
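
# The website_scraping table is assumed to contain at least these columns
# (hypothetical schema, inferred from the queries above):
#
#   CREATE TABLE website_scraping (
#       url      TEXT,  -- site identifier used as the update key
#       link     TEXT,  -- page actually fetched and scanned
#       software TEXT   -- matched provider, filled in by this script
#   );
#
# After setting real credentials in connect_to_db(), run with:
#   python match_software.py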