data-connector-lib/src/dpk_connector/core/crawler.py (+6 lines)
@@ -74,6 +74,7 @@ def async_crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
         seed_urls=seed_urls,
         callback=on_downloaded,
         allow_domains=allow_domains,
+        subdomain_focus=subdomain_focus,
         path_focus=path_focus,
         allow_mime_types=allow_mime_types,
         disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
     user_agent: str = "",
     headers: dict[str, str] = {},
     allow_domains: Collection[str] = (),
+    subdomain_focus: bool = False,
     path_focus: bool = False,
     allow_mime_types: Collection[str] = (
         "application/pdf",
@@ -177,6 +181,7 @@ def crawl(
         user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
         headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
         allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
+        subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
         path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
         allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
         disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
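For reference, a minimal usage sketch of the new flag via the synchronous `crawl` entry point (`async_crawl` accepts the same keyword argument, as the hunks above show). The seed URL and the callback body are illustrative; the `(url, body, headers)` callback signature and the `shutdown()` helper follow the library's README, so verify them against the installed version:

```python
# Minimal sketch, not the canonical example: assumes dpk_connector exposes
# crawl() and shutdown() at the package root and that the download callback
# receives (url, body, headers), per the project README.
from dpk_connector import crawl, shutdown


def on_downloaded(url: str, body: bytes, headers: dict) -> None:
    # Invoked once per fetched page; here we only log the URL and size.
    print(f"downloaded {url} ({len(body)} bytes)")


crawl(
    ["https://docs.example.com/"],  # hypothetical seed URL
    on_downloaded,
    subdomain_focus=True,  # ignored if allow_domains is also passed
)
shutdown()  # stop the crawler runtime when finished
```

With `subdomain_focus=True` the crawler extracts only links under the seed URL's own subdomain rather than the whole registered domain; per the new docstring, passing `allow_domains` explicitly disables the flag.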