
Commit b297156

Authored by Hiroya Matsubara

implement subdomain focus feature in data-prep-connector (#725)

* implement subdomain focus feature in data-prep-connector
  Signed-off-by: Hiroya Matsubara <[email protected]>
* refactoring
  Signed-off-by: Hiroya Matsubara <[email protected]>
* bump version
  Signed-off-by: Hiroya Matsubara <[email protected]>

---------

Signed-off-by: Hiroya Matsubara <[email protected]>
1 parent bd2b6dc commit b297156

6 files changed, +59 -10 lines changed

data-connector-lib/pyproject.toml

+1 -1

@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.2.dev0"
+version = "0.2.2.dev1"
 requires-python = ">=3.10"
 keywords = [
     "data",

data-connector-lib/src/dpk_connector/core/crawler.py

+6

@@ -74,6 +74,7 @@ def async_crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
         seed_urls=seed_urls,
         callback=on_downloaded,
         allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
         path_focus=path_focus,
         allow_mime_types=allow_mime_types,
         disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def on_completed(result: Any):
         user_agent,
         headers,
         allow_domains,
+        subdomain_focus,
         path_focus,
         allow_mime_types,
         disallow_mime_types,
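
For context, here is a minimal usage sketch of the new flag (not part of the commit). It assumes `crawl` takes the seed URLs and a downloaded-page callback as its first two arguments, as suggested by the `seed_urls=` / `callback=on_downloaded` wiring above; the callback name and signature are assumptions for illustration only.

from dpk_connector.core.crawler import crawl


# Hypothetical callback for illustration; the exact signature is an assumption.
def on_downloaded(url: str, body: bytes, headers: dict) -> None:
    print(f"downloaded: {url} ({len(body)} bytes)")


# With subdomain_focus=True and no allow_domains, link extraction stays on
# blog.example.com instead of widening to the registered domain example.com.
crawl(
    ["http://blog.example.com/"],
    on_downloaded,
    subdomain_focus=True,
)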

data-connector-lib/src/dpk_connector/core/spiders/sitemap.py

+14 -6

@@ -28,6 +28,7 @@
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     is_allowed_path,
     urlparse_cached,
 )
@@ -42,6 +43,7 @@ def __init__(
         self,
         seed_urls: Collection[str],
         allow_domains: Collection[str] = (),
+        subdomain_focus: bool = False,
         path_focus: bool = False,
         allow_mime_types: Collection[str] = (),
         disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ def __init__(
                 self.focus_paths.add(path)
 
         # Domains and mime types filtering
-        self.allowed_domains = set(
-            allow_domains
-            if len(allow_domains) > 0
-            else [get_etld1(url) for url in seed_urls]
-        )
+        if allow_domains:
+            self.allowed_domains = set(allow_domains)
+        elif subdomain_focus:
+            self.allowed_domains = set()
+            for url in seed_urls:
+                if fqdn := get_fqdn(url):
+                    self.allowed_domains.add(fqdn)
+        else:
+            self.allowed_domains = set(get_etld1(url) for url in seed_urls)
         self.allow_mime_types = set(
             [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
         )
@@ -155,7 +161,9 @@ def start_requests(self):
             )
 
     def _parse_sitemap(self, response: Response):
-        yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
+        yield ConnectorItem(
+            dropped=False, downloaded=False, system_request=True, sitemap=True
+        )
 
         seed_url = response.meta["seed_url"]
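The new branching changes what ends up in `allowed_domains`: an explicit `allow_domains` still takes precedence, `subdomain_focus` keeps the full host (FQDN) of each seed URL, and the default remains the registered domain (eTLD+1). A small illustration of the difference using `tldextract` directly, the library behind `get_etld1` and `get_fqdn`; this snippet is not part of the commit.

import tldextract

ext = tldextract.extract("http://blog.example.com/posts/1")
print(ext.fqdn)               # "blog.example.com" -> crawl scope with subdomain_focus=True
print(ext.registered_domain)  # "example.com"      -> eTLD+1, the default scope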
data-connector-lib/src/dpk_connector/core/utils.py

+5

@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
     return f"{ext.domain}.{ext.suffix}"
 
 
+def get_fqdn(url: str) -> str:
+    ext = tldextract.extract(url)
+    return ext.fqdn
+
+
 def get_focus_path(url: str) -> str | None:
     parts = urlparse_cached(url)
     if len(parts.path.split("/")) > 2:
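
One detail worth noting, visible in the tests further below: `tldextract` yields an empty FQDN for hosts without a public suffix such as `localhost`, which is why the spider guards the result with `if fqdn := get_fqdn(url)`. A quick check (not part of the commit):

from dpk_connector.core.utils import get_fqdn

print(get_fqdn("http://www.sub.example.com:8080/"))  # "www.sub.example.com"
print(repr(get_fqdn("http://localhost:8080/")))      # '' -> no public suffix, so the seed is skipped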

data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py

+17 -3

@@ -1,13 +1,12 @@
 from pathlib import Path
 
 import pytest
+from dpk_connector.core.item import ConnectorItem
+from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.http import HtmlResponse
 
-from dpk_connector.core.item import ConnectorItem
-from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
-
 
 @pytest.fixture
 def crawler() -> Crawler:
@@ -22,6 +21,21 @@ def crawler() -> Crawler:
     return crawler
 
 
+def test_init_subdomain_focus():
+    spider = BaseSitemapSpider(
+        seed_urls=(
+            "http://blog.example.com/",
+            "http://contents.example.com/",
+        ),
+        subdomain_focus=True,
+    )
+    assert spider.seed_urls == {
+        "http://blog.example.com/",
+        "http://contents.example.com/",
+    }
+    assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
+
+
 def test_init_path_focus():
     spider = BaseSitemapSpider(
         seed_urls=(

data-connector-lib/test/dpk_connector/core/test_utils.py

+16

@@ -7,6 +7,7 @@
     get_content_type,
     get_etld1,
     get_focus_path,
+    get_fqdn,
     get_header_value,
     get_mime_type,
     is_allowed_path,
@@ -83,6 +84,21 @@ def test_get_etld1(url: str, expected: str):
     assert get_etld1(url) == expected
 
 
+@pytest.mark.parametrize(
+    "url,expected",
+    [
+        ("http://www.example.com", "www.example.com"),
+        ("https://www.example.co.uk", "www.example.co.uk"),
+        ("http://www.example.com/path?query=string#fragment", "www.example.com"),
+        ("http://localhost:8080/", ""),
+        ("http://www.example.com:8080/", "www.example.com"),
+        ("http://www.sub.example.com:8080/", "www.sub.example.com"),
+    ],
+)
+def test_get_fqdn(url: str, expected: str):
+    assert get_fqdn(url) == expected
+
+
 @pytest.mark.parametrize(
     "url,expected",
     [
