from io import BytesIO
from PyPDF2 import PdfReader
import requests
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException, Config
from requests_html import HTMLSession
import random
from lxml import html
from querent.lib.logger import logger

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
]

class WebpageExtractor:

    def __init__(self, num_extracts=3):
        """
        Initialize the WebpageExtractor class.
        """
        self.num_extracts = num_extracts

    def extract_with_3k(self, url):
        """
        Extract the text from a webpage using the newspaper3k method.

        Args:
            url (str): The URL of the webpage to extract from.

        Returns:
            str: The extracted text.
        """
        try:
            if url.lower().endswith(".pdf"):
                response = requests.get(url, timeout=10)
                response.raise_for_status()

                with BytesIO(response.content) as pdf_data:
                    reader = PdfReader(pdf_data)
                    # PyPDF2 >= 2.0 exposes pages directly; getPage()/getNumPages()
                    # belong to the legacy PdfFileReader API.
                    content = " ".join(page.extract_text() or "" for page in reader.pages)

            else:
                config = Config()
                config.browser_user_agent = random.choice(USER_AGENTS)
                config.request_timeout = 10
                session = HTMLSession()

                # Render the page first so JavaScript-generated markup is included.
                response = session.get(url)
                response.html.render(timeout=config.request_timeout)
                html_content = response.html.html

                article = Article(url, config=config)
                article.set_html(html_content)
                article.parse()
                content = article.text.replace('\t', ' ').replace('\n', ' ').strip()

            return content[:1500]

        except ArticleException as ae:
            logger.error(f"Error while extracting text from HTML (newspaper3k): {str(ae)}")
            return f"Error while extracting text from HTML (newspaper3k): {str(ae)}"

        except RequestException as req_err:
            # Named req_err so it does not shadow the `re` module imported above.
            logger.error(f"Error while making the request to the URL (newspaper3k): {str(req_err)}")
            return f"Error while making the request to the URL (newspaper3k): {str(req_err)}"

        except Exception as e:
            logger.error(f"Unknown error while extracting text from HTML (newspaper3k): {str(e)}")
            return ""

    def extract_with_bs4(self, url):
        """
        Extract the text from a webpage using the BeautifulSoup4 method.

        Args:
            url (str): The URL of the webpage to extract from.

        Returns:
            str: The extracted text.
        """
        headers = {
            "User-Agent": random.choice(USER_AGENTS)
        }

        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Drop non-content tags before extracting text.
                for tag in soup(['script', 'style', 'nav', 'footer', 'head', 'link', 'meta', 'noscript']):
                    tag.decompose()

                # Heuristic: the container with the most text is treated as the main content.
                main_content_areas = soup.find_all(['main', 'article', 'section', 'div'])
                if main_content_areas:
                    main_content = max(main_content_areas, key=lambda x: len(x.text))
                    content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
                    content = ' '.join([tag.text.strip() for tag in main_content.find_all(content_tags)])
                else:
                    content = ' '.join([tag.text.strip() for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])

                content = re.sub(r'\t', ' ', content)
                content = re.sub(r'\s+', ' ', content)
                return content
            elif response.status_code == 404:
                return "Error: 404. URL is invalid or does not exist. Try a valid URL."
            else:
                logger.error(f"Error while extracting text from HTML (bs4): {response.status_code}")
                return f"Error while extracting text from HTML (bs4): {response.status_code}"

        except Exception as e:
            logger.error(f"Unknown error while extracting text from HTML (bs4): {str(e)}")
            return ""

    def extract_with_lxml(self, url):
        """
        Extract the text from a webpage using the lxml method.

        Args:
            url (str): The URL of the webpage to extract from.

        Returns:
            str: The extracted text.
        """
        try:
            config = Config()
            config.browser_user_agent = random.choice(USER_AGENTS)
            config.request_timeout = 10
            session = HTMLSession()

            # Render the page first so JavaScript-generated markup is included.
            response = session.get(url)
            response.html.render(timeout=config.request_timeout)
            html_content = response.html.html

            tree = html.fromstring(html_content)
            paragraphs = tree.cssselect('p, h1, h2, h3, h4, h5, h6')
            content = ' '.join([para.text_content() for para in paragraphs if para.text_content()])
            content = content.replace('\t', ' ').replace('\n', ' ').strip()

            return content

        except RequestException as req_err:
            # Named req_err so it does not shadow the `re` module imported above.
            logger.error(f"Error while making the request to the URL (lxml): {str(req_err)}")
            return ""

        except Exception as e:
            logger.error(f"Unknown error while extracting text from HTML (lxml): {str(e)}")
            return ""
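
# A minimal usage sketch, not part of the original module: the URL below is
# purely illustrative. Each extractor returns plain text on success and an
# error string or "" on failure, so a caller can fall back from the cheap
# bs4 path to the heavier JavaScript-rendering extractors.
if __name__ == "__main__":
    extractor = WebpageExtractor()
    url = "https://example.com"  # hypothetical target URL
    text = extractor.extract_with_bs4(url)
    if not text or text.startswith("Error"):
        # Fall back to the rendering-based extractors.
        text = extractor.extract_with_3k(url) or extractor.extract_with_lxml(url)
    print(text[:500])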