
Commit d12fcac

Merge pull request #17 from Querent-ai/setup_webscaper_collector

Modifying interfaces and setting up the scraper.

2 parents b01fe16 + da54eed, commit d12fcac

8 files changed, +226 -14 lines

querent/collectors/collector_factory.py

+3 -3

@@ -3,7 +3,7 @@
 from enum import Enum
 from querent.collectors.collector_base import Collector
 from querent.collectors.collector_errors import CollectorResolverError, CollectorErrorKind
-from querent.config.collector_config import CollectorBackend
+from querent.config.collector_config import CollectConfig, CollectorBackend


 class CollectorFactory(ABC):
@@ -12,7 +12,7 @@ def backend(self) -> CollectorBackend:
         pass

     @abstractmethod
-    async def resolve(self, uri: str) -> Optional[CollectorBackend]:
+    async def resolve(self, uri: str, config: CollectConfig) -> Optional[CollectorBackend]:
         pass


@@ -21,7 +21,7 @@ def __init__(self, backend: CollectorBackend, message: str):
         self.backend = backend
         self.message = message

-    async def resolve(self, uri: str) -> Optional[Collector]:
+    async def resolve(self, uri: str, config: CollectConfig) -> Optional[Collector]:
         raise CollectorResolverError(
             CollectorErrorKind.NotSupported, self.backend, self.message
         )
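
The interface change above threads a caller-supplied CollectConfig through factory resolution instead of having each factory derive its configuration from the URI. A minimal sketch of a factory conforming to the new signature (DummyCollectorFactory and its return value are illustrative, not part of this PR):

from typing import Optional

from querent.collectors.collector_base import Collector
from querent.collectors.collector_factory import CollectorFactory
from querent.config.collector_config import CollectConfig, CollectorBackend

class DummyCollectorFactory(CollectorFactory):
    def backend(self) -> CollectorBackend:
        return CollectorBackend.LocalFile

    async def resolve(self, uri: str, config: CollectConfig) -> Optional[Collector]:
        # The config now arrives from the caller rather than being built here.
        return None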

querent/collectors/collector_resolver.py

+5 -3

@@ -1,6 +1,7 @@
 from typing import Optional
 from querent.collectors.fs.fs_collector import FSCollectorFactory
-from querent.config.collector_config import CollectorBackend
+from querent.collectors.webscaper.web_scraper_collector import WebScraperFactory
+from querent.config.collector_config import CollectConfig, CollectorBackend
 from querent.collectors.collector_base import Collector
 from querent.collectors.collector_errors import CollectorResolverError, CollectorErrorKind
 from querent.common.uri import Protocol, Uri
@@ -9,15 +10,16 @@ class CollectorResolver:
     def __init__(self):
         self.collector_factories = {
             CollectorBackend.LocalFile: FSCollectorFactory(),
+            CollectorBackend.WebScraper: WebScraperFactory(),
             # Add other collector factories as needed
         }

-    def resolve(self, uri: Uri) -> Optional[Collector]:
+    def resolve(self, uri: Uri, config: CollectConfig) -> Optional[Collector]:
         backend = self._determine_backend(uri.protocol)

         if backend in self.collector_factories:
             factory = self.collector_factories[backend]
-            return factory.resolve(uri)
+            return factory.resolve(uri, config)
         else:
             raise CollectorResolverError(
                 CollectorErrorKind.NotSupported, backend, "Unsupported backend"
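
With the resolver updated, callers build the backend-specific config themselves and pass it alongside the URI. An illustrative call (the path is a placeholder):

from querent.collectors.collector_resolver import CollectorResolver
from querent.common.uri import Uri
from querent.config.collector_config import FSCollectorConfig

resolver = CollectorResolver()
uri = Uri("file:///tmp/data")
config = FSCollectorConfig(root_path=uri.path)  # config built by the caller
collector = resolver.resolve(uri, config)       # and passed explicitly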

querent/collectors/fs/fs_collector.py

+4 -4

@@ -40,15 +40,15 @@ async def walk_files(self, root: Path) -> AsyncGenerator[Path, None]:
         for item in root.iterdir():
             if item.is_file():
                 yield item
-
-
+            elif item.is_dir():
+                async for file_path in self.walk_files(item):
+                    yield file_path
 class FSCollectorFactory(CollectorFactory):
     def __init__(self):
         pass

     def backend(self) -> CollectorBackend:
         return CollectorBackend.LocalFile

-    def resolve(self, uri: Uri) -> Collector:
-        config = FSCollectorConfig(root_path=uri.path)
+    def resolve(self, uri: Uri, config: FSCollectorConfig) -> Collector:
         return FSCollector(config)
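
The first hunk makes walk_files recurse into subdirectories; the second moves config construction out of the factory. A small sketch of driving the recursive walk directly (the directory path is hypothetical):

import asyncio
from pathlib import Path

from querent.collectors.fs.fs_collector import FSCollector
from querent.config.collector_config import FSCollectorConfig

async def list_all_files(root: str):
    collector = FSCollector(FSCollectorConfig(root_path=root))
    async for path in collector.walk_files(Path(root)):
        print(path)  # files in nested directories are now yielded too

asyncio.run(list_all_files("/tmp/data"))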
querent/collectors/webscaper/web_scraper_collector.py

+34

@@ -0,0 +1,34 @@
+from querent.collectors.collector_base import Collector
+from querent.collectors.collector_factory import CollectorFactory
+from querent.collectors.collector_result import CollectorResult
+from querent.config.collector_config import CollectorBackend, WebScraperConfig
+from querent.tools.web_page_extractor import WebpageExtractor
+
+class WebScraperCollector(Collector):
+    def __init__(self, config: WebScraperConfig):
+        self.website_url = config.website_url
+
+    async def connect(self):
+        pass  # Any setup logic before scraping
+
+    async def disconnect(self):
+        pass  # Any cleanup logic after scraping
+
+    async def poll(self):
+        content = await self.scrape_website(self.website_url)
+        yield CollectorResult(content)
+
+    async def scrape_website(self, website_url: str):
+        content = WebpageExtractor().extract_with_bs4(website_url)
+        max_length = len(' '.join(content.split(" ")[:600]))
+        return content[:max_length]
+
+class WebScraperFactory(CollectorFactory):
+    def __init__(self):
+        pass
+
+    def backend(self) -> CollectorBackend:
+        return CollectorBackend.WebScraper
+
+    def resolve(self, config: WebScraperConfig) -> Collector:
+        return WebScraperCollector(config)
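
An illustrative use of the new collector (the URL is a placeholder; poll() is an async generator that yields one CollectorResult per scrape):

import asyncio

from querent.collectors.webscaper.web_scraper_collector import WebScraperCollector
from querent.config.collector_config import WebScraperConfig

async def scrape_once():
    collector = WebScraperCollector(WebScraperConfig(website_url="https://example.com"))
    await collector.connect()
    async for result in collector.poll():
        print(result)  # CollectorResult wrapping the truncated page text
    await collector.disconnect()

asyncio.run(scrape_once())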

querent/config/collector_config.py

+9 -1

@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Optional
-from pydantic import BaseModel
+from pydantic import BaseModel, Field


 class CollectorBackend(str, Enum):
@@ -35,6 +35,10 @@ class GcsCollectConfig(BaseModel):
     access_key: str
     secret_key: str

+class WebScraperConfig(BaseModel):
+    website_url: str = Field(
+        ..., description="The URL of the website to scrape."
+    )

 class CollectConfigWrapper(BaseModel):
     backend: CollectorBackend
@@ -50,5 +54,9 @@ def from_collect_config(cls, collect_config: CollectConfig):
             return cls(backend=CollectorBackend.S3, config=S3CollectConfig())
         elif collect_config.backend == CollectorBackend.Gcs:
             return cls(backend=CollectorBackend.Gcs, config=GcsCollectConfig())
+        elif collect_config.backend == CollectorBackend.WebScraper:
+            return cls(
+                backend=CollectorBackend.WebScraper, config=WebScraperConfig()
+            )
         else:
             raise ValueError(f"Unsupported collector backend: {collect_config.backend}")
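
For reference, website_url is declared with Field(...), which makes it a required field. A standalone sketch of the validation behaviour (the example URL is a placeholder):

from pydantic import ValidationError

from querent.config.collector_config import WebScraperConfig

config = WebScraperConfig(website_url="https://example.com")  # valid

try:
    WebScraperConfig()  # raises: website_url is required
except ValidationError as err:
    print(err)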

querent/tools/__init__.py

Whitespace-only changes.

querent/tools/web_page_extractor.py

+166

@@ -0,0 +1,166 @@
+from io import BytesIO
+from PyPDF2 import PdfFileReader
+from PyPDF2 import PdfReader
+import requests
+import re
+from requests.exceptions import RequestException
+from bs4 import BeautifulSoup
+from newspaper import Article, ArticleException, Config
+from requests_html import HTMLSession
+import time
+import random
+from lxml import html
+from querent.lib.logger import logger
+
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.0",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
+]
+
+class WebpageExtractor:
+
+    def __init__(self, num_extracts=3):
+        """
+        Initialize the WebpageExtractor class.
+        """
+        self.num_extracts = num_extracts
+
+    def extract_with_3k(self, url):
+        """
+        Extract the text from a webpage using the 3k method.
+
+        Args:
+            url (str): The URL of the webpage to extract from.
+
+        Returns:
+            str: The extracted text.
+        """
+        try:
+            if url.lower().endswith(".pdf"):
+                response = requests.get(url)
+                response.raise_for_status()
+
+                with BytesIO(response.content) as pdf_data:
+                    reader = PdfReader(pdf_data)
+                    content = " ".join([reader.getPage(i).extract_text() for i in range(reader.getNumPages())])
+
+            else:
+                config = Config()
+                config.browser_user_agent = random.choice(USER_AGENTS)
+                config.request_timeout = 10
+                session = HTMLSession()
+
+                response = session.get(url)
+                response.html.render(timeout=config.request_timeout)
+                html_content = response.html.html
+
+                article = Article(url, config=config)
+                article.set_html(html_content)
+                article.parse()
+                content = article.text.replace('\t', ' ').replace('\n', ' ').strip()
+
+            return content[:1500]
+
+        except ArticleException as ae:
+            logger.error(f"Error while extracting text from HTML (newspaper3k): {str(ae)}")
+            return f"Error while extracting text from HTML (newspaper3k): {str(ae)}"
+
+        except RequestException as re:
+            logger.error(f"Error while making the request to the URL (newspaper3k): {str(re)}")
+            return f"Error while making the request to the URL (newspaper3k): {str(re)}"
+
+        except Exception as e:
+            logger.error(f"Unknown error while extracting text from HTML (newspaper3k): {str(e)}")
+            return ""
+
+    def extract_with_bs4(self, url):
+        """
+        Extract the text from a webpage using the BeautifulSoup4 method.
+
+        Args:
+            url (str): The URL of the webpage to extract from.
+
+        Returns:
+            str: The extracted text.
+        """
+        headers = {
+            "User-Agent": random.choice(USER_AGENTS)
+        }
+
+        try:
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                for tag in soup(['script', 'style', 'nav', 'footer', 'head', 'link', 'meta', 'noscript']):
+                    tag.decompose()
+
+                main_content_areas = soup.find_all(['main', 'article', 'section', 'div'])
+                if main_content_areas:
+                    main_content = max(main_content_areas, key=lambda x: len(x.text))
+                    content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+                    content = ' '.join([tag.text.strip() for tag in main_content.find_all(content_tags)])
+                else:
+                    content = ' '.join([tag.text.strip() for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+
+                content = re.sub(r'\t', ' ', content)
+                content = re.sub(r'\s+', ' ', content)
+                return content
+            elif response.status_code == 404:
+                return f"Error: 404. Url is invalid or does not exist. Try with valid url..."
+            else:
+                logger.error(f"Error while extracting text from HTML (bs4): {response.status_code}")
+                return f"Error while extracting text from HTML (bs4): {response.status_code}"
+
+        except Exception as e:
+            logger.error(f"Unknown error while extracting text from HTML (bs4): {str(e)}")
+            return ""
+
+    def extract_with_lxml(self, url):
+        """
+        Extract the text from a webpage using the lxml method.
+
+        Args:
+            url (str): The URL of the webpage to extract from.
+
+        Returns:
+            str: The extracted text.
+        """
+        try:
+            config = Config()
+            config.browser_user_agent = random.choice(USER_AGENTS)
+            config.request_timeout = 10
+            session = HTMLSession()
+
+            response = session.get(url)
+            response.html.render(timeout=config.request_timeout)
+            html_content = response.html.html
+
+            tree = html.fromstring(html_content)
+            paragraphs = tree.cssselect('p, h1, h2, h3, h4, h5, h6')
+            content = ' '.join([para.text_content() for para in paragraphs if para.text_content()])
+            content = content.replace('\t', ' ').replace('\n', ' ').strip()
+
+            return content
+
+        except ArticleException as ae:
+            logger.error("Error while extracting text from HTML (lxml): {str(ae)}")
+            return ""
+
+        except RequestException as re:
+            logger.error(f"Error while making the request to the URL (lxml): {str(re)}")
+            return ""
+
+        except Exception as e:
+            logger.error(f"Unknown error while extracting text from HTML (lxml): {str(e)}")
+            return ""
+
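
An illustrative call into the extractor (the URL is a placeholder). extract_with_bs4 uses plain requests plus BeautifulSoup, while extract_with_3k and extract_with_lxml additionally render the page via requests-html, which downloads a headless Chromium on first use:

from querent.tools.web_page_extractor import WebpageExtractor

extractor = WebpageExtractor()
text = extractor.extract_with_bs4("https://example.com")
print(text[:200])  # cleaned, whitespace-normalised page text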

tests/test_local_collector.py

+5 -3

@@ -6,7 +6,7 @@
 import pytest

 from querent.common.uri import Uri
-from querent.config.collector_config import CollectorBackend
+from querent.config.collector_config import CollectorBackend, FSCollectorConfig


 @pytest.fixture
@@ -19,7 +19,8 @@ def temp_dir():
 def test_fs_collector(temp_dir):
     uri = Uri("file://" + temp_dir)
     resolver = CollectorResolver()
-    collector = resolver.resolve(uri)
+    fileConfig = FSCollectorConfig(root_path=uri.path)
+    collector = resolver.resolve(uri, fileConfig)
     assert collector is not None


@@ -35,7 +36,8 @@ def test_add_files_read_via_collector(temp_dir):
         file.write(b"test_add_files_read_via_collector")
     uri = Uri("file://" + temp_dir)
     resolver = CollectorResolver()
-    collector = resolver.resolve(uri)
+    fileConfig = FSCollectorConfig(root_path=uri.path)
+    collector = resolver.resolve(uri, fileConfig)
     assert collector is not None

     async def poll_and_print():
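
The body of poll_and_print lies outside the changed lines, but a collector returned by the resolver could be drained roughly like this (helper name and structure are hypothetical, not taken from the test):

import asyncio

async def drain(collector):
    await collector.connect()
    async for result in collector.poll():
        print(result)
    await collector.disconnect()

# e.g. asyncio.run(drain(collector))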
