
Commit b297156

Authored by Hiroya Matsubara

implement subdomain focus feature in data-prep-connector (#725)

* implement subdomain focus feature in data-prep-connector
  Signed-off-by: Hiroya Matsubara <[email protected]>
* refactoring
  Signed-off-by: Hiroya Matsubara <[email protected]>
* bump version
  Signed-off-by: Hiroya Matsubara <[email protected]>

---------

Signed-off-by: Hiroya Matsubara <[email protected]>
1 parent bd2b6dc commit b297156

6 files changed, +59 -10 lines changed

data-connector-lib/pyproject.toml

+1 -1

@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.2.dev0"
+version = "0.2.2.dev1"
 requires-python = ">=3.10"
 keywords = [
     "data",

data-connector-lib/src/dpk_connector/core/crawler.py

+6

@@ -74,6 +74,7 @@ def async_crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
         seed_urls=seed_urls,
         callback=on_downloaded,
         allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
         path_focus=path_focus,
         allow_mime_types=allow_mime_types,
         disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def on_completed(result: Any):
         user_agent,
         headers,
         allow_domains,
+        subdomain_focus,
         path_focus,
         allow_mime_types,
         disallow_mime_types,
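
For context, here is a minimal usage sketch of the new flag (not part of the commit). It assumes `crawl` takes the seed URLs and a downloaded-page callback as its first two arguments, as suggested by the `seed_urls=` / `callback=on_downloaded` wiring above; the callback name and signature are assumptions for illustration only.

from dpk_connector.core.crawler import crawl


# Hypothetical callback for illustration; the exact signature is an assumption.
def on_downloaded(url: str, body: bytes, headers: dict) -> None:
    print(f"downloaded: {url} ({len(body)} bytes)")


# With subdomain_focus=True and no allow_domains, link extraction stays on
# blog.example.com instead of widening to the registered domain example.com.
crawl(
    ["http://blog.example.com/"],
    on_downloaded,
    subdomain_focus=True,
)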

data-connector-lib/src/dpk_connector/core/spiders/sitemap.py

+14 -6

@@ -28,6 +28,7 @@
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     is_allowed_path,
     urlparse_cached,
 )
@@ -42,6 +43,7 @@ def __init__(
         self,
         seed_urls: Collection[str],
         allow_domains: Collection[str] = (),
+        subdomain_focus: bool = False,
         path_focus: bool = False,
         allow_mime_types: Collection[str] = (),
         disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ def __init__(
                 self.focus_paths.add(path)
 
         # Domains and mime types filtering
-        self.allowed_domains = set(
-            allow_domains
-            if len(allow_domains) > 0
-            else [get_etld1(url) for url in seed_urls]
-        )
+        if allow_domains:
+            self.allowed_domains = set(allow_domains)
+        elif subdomain_focus:
+            self.allowed_domains = set()
+            for url in seed_urls:
+                if fqdn := get_fqdn(url):
+                    self.allowed_domains.add(fqdn)
+        else:
+            self.allowed_domains = set(get_etld1(url) for url in seed_urls)
         self.allow_mime_types = set(
             [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
         )
@@ -155,7 +161,9 @@ def start_requests(self):
             )
 
     def _parse_sitemap(self, response: Response):
-        yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
+        yield ConnectorItem(
+            dropped=False, downloaded=False, system_request=True, sitemap=True
+        )
 
         seed_url = response.meta["seed_url"]
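The new branching changes what ends up in `allowed_domains`: an explicit `allow_domains` still takes precedence, `subdomain_focus` keeps the full host (FQDN) of each seed URL, and the default remains the registered domain (eTLD+1). A small illustration of the difference using `tldextract` directly, the library behind `get_etld1` and `get_fqdn`; this snippet is not part of the commit.

import tldextract

ext = tldextract.extract("http://blog.example.com/posts/1")
print(ext.fqdn)               # "blog.example.com" -> crawl scope with subdomain_focus=True
print(ext.registered_domain)  # "example.com"      -> eTLD+1, the default scope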
data-connector-lib/src/dpk_connector/core/utils.py

+5

@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
     return f"{ext.domain}.{ext.suffix}"
 
 
+def get_fqdn(url: str) -> str:
+    ext = tldextract.extract(url)
+    return ext.fqdn
+
+
 def get_focus_path(url: str) -> str | None:
     parts = urlparse_cached(url)
     if len(parts.path.split("/")) > 2:
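
One detail worth noting, visible in the tests further below: `tldextract` yields an empty FQDN for hosts without a public suffix such as `localhost`, which is why the spider guards the result with `if fqdn := get_fqdn(url)`. A quick check (not part of the commit):

from dpk_connector.core.utils import get_fqdn

print(get_fqdn("http://www.sub.example.com:8080/"))  # "www.sub.example.com"
print(repr(get_fqdn("http://localhost:8080/")))      # '' -> no public suffix, so the seed is skipped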

data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py

+17 -3

@@ -1,13 +1,12 @@
 from pathlib import Path
 
 import pytest
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.http import HtmlResponse
 
-from dpk_connector.core.item import ConnectorItem
-from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
-
 
 @pytest.fixture
 def crawler() -> Crawler:
@@ -22,6 +21,21 @@ def crawler() -> Crawler:
     return crawler
 
 
+def test_init_subdomain_focus():
+    spider = BaseSitemapSpider(
+        seed_urls=(
+            "http://blog.example.com/",
+            "http://contents.example.com/",
+        ),
+        subdomain_focus=True,
+    )
+    assert spider.seed_urls == {
+        "http://blog.example.com/",
+        "http://contents.example.com/",
+    }
+    assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
+
+
 def test_init_path_focus():
     spider = BaseSitemapSpider(
         seed_urls=(

data-connector-lib/test/dpk_connector/core/test_utils.py

+16

@@ -7,6 +7,7 @@
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     get_header_value,
     get_mime_type,
     is_allowed_path,
@@ -83,6 +84,21 @@ def test_get_etld1(url: str, expected: str):
     assert get_etld1(url) == expected
 
 
+@pytest.mark.parametrize(
+    "url,expected",
+    [
+        ("http://www.example.com", "www.example.com"),
+        ("https://www.example.co.uk", "www.example.co.uk"),
+        ("http://www.example.com/path?query=string#fragment", "www.example.com"),
+        ("http://localhost:8080/", ""),
+        ("http://www.example.com:8080/", "www.example.com"),
+        ("http://www.sub.example.com:8080/", "www.sub.example.com"),
+    ],
+)
+def test_get_fqdn(url: str, expected: str):
+    assert get_fqdn(url) == expected
+
+
 @pytest.mark.parametrize(
     "url,expected",
     [
