webpage.py
import re
import requests
import numpy as np
from bs4 import BeautifulSoup
from typing import List
from urllib.parse import urlparse, urljoin, urldefrag


class Webpage:
    """Stores an individual webpage and provides helpers for fetching,
    cleaning, and extracting links from its HTML."""

    @staticmethod
    def fetch_html(url: str) -> str:
        """Return the HTML source of `url`, raising on HTTP errors."""
        response = requests.get(url)
        response.raise_for_status()
        return response.text

    @staticmethod
    def compare_domain(url, base_domain) -> bool:
        """Return True if the domain of `url` equals `base_domain` or is a
        subdomain of it."""
        base_domain_len = len(base_domain.split('.'))
        parsed_url = urlparse(url)
        domain_parts = parsed_url.netloc.split('.')
        if len(domain_parts) < base_domain_len:
            return False
        # Compare only the trailing components, so subdomains still match
        url_domain = ".".join(domain_parts[-base_domain_len:])
        return url_domain == base_domain

    @staticmethod
    def normalize_url(url: str) -> str:
        """Normalize a URL by removing the fragment and any trailing slash."""
        url = urldefrag(url)[0]  # Remove the fragment
        if url.endswith('/'):
            url = url[:-1]
        return url

    @staticmethod
    def get_clean_content(html: str) -> str:
        """Parse the HTML and return the main textual content, making it easier
        for a model to understand what is on the page."""
        soup = BeautifulSoup(html, 'html.parser')
        # Tag names that often wrap the main content
        content_tags = ['article', 'main', 'section', 'div']
        # Collect elements whose class or id suggests they hold the main content
        content = []
        for tag in content_tags:
            for element in soup.find_all(tag):
                if element.get('class') and 'content' in element.get('class'):
                    content.append(element)
                elif element.get('id') and 'content' in element.get('id'):
                    content.append(element)
                elif tag == 'div' and ('post' in element.get('class', []) or 'entry' in element.get('class', [])):
                    content.append(element)
        # Join the extracted elements' HTML and re-parse it for cleaning
        main_content_html = ''.join(str(element) for element in content)
        clean_soup = BeautifulSoup(main_content_html, 'html.parser')
        # Remove tags that carry no useful text
        for unwanted in clean_soup(['script', 'style', 'nav', 'footer', 'header', 'noscript']):
            unwanted.decompose()
        # Extract the text and collapse repeated whitespace
        clean_text = clean_soup.get_text(separator='\n')
        clean_text = re.sub(r'\n+', '\n', clean_text)
        clean_text = re.sub(r' +', ' ', clean_text)
        return clean_text.strip()

    def __init__(self, link: str):
        self.link = link
        self.html = None
        self.embeddings = None

    def set_embedding(self, embedding: List[np.ndarray]) -> None:
        self.embeddings = embedding

    def get_text(self) -> str:
        """Return the cleaned text of the webpage, fetching the HTML if needed."""
        if self.html is None:
            self.html = Webpage.fetch_html(self.link)
        return Webpage.get_clean_content(self.html)

    def get_connects(self) -> List[str]:
        """Return a sorted, de-duplicated list of same-domain links found on the page."""
        html = Webpage.fetch_html(self.link)
        self.html = html
        soup = BeautifulSoup(html, "html.parser")
        base_domain = urlparse(self.link).netloc
        connections = []
        links = soup.find_all('a')
        # Ensure the base URL ends with a slash so relative hrefs resolve beneath it
        temp_link = self.link
        if temp_link[-1] != '/':
            temp_link += "/"
        for link in links:
            href = link.get('href')
            if href:
                full_url = urljoin(temp_link, href)
                if Webpage.compare_domain(full_url, base_domain):
                    connections.append(full_url)
        if len(connections) == 0:
            return []
        for i in range(len(connections)):
            connections[i] = Webpage.normalize_url(connections[i])
        # Sort so duplicates are adjacent, then keep only the first of each run
        connections.sort(key=lambda x: (len(x), x))
        result = [connections[0]]
        for i in range(1, len(connections)):
            if connections[i] != connections[i - 1]:
                result.append(connections[i])
        return result
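

# A minimal usage sketch. Assumption: "https://example.com" is only a
# placeholder URL for illustration; it is not referenced elsewhere in this
# module. The sketch fetches a page, previews its cleaned text, and lists the
# same-domain links found on it.
if __name__ == "__main__":
    page = Webpage("https://example.com")
    text = page.get_text()           # fetches the HTML lazily, then cleans it
    print(text[:500])                # preview of the extracted content
    for url in page.get_connects():  # sorted, de-duplicated same-domain links
        print(url)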