Skip to content

Commit 44d5316

Browse files
mrwbarg and VMRuiz
authored
Support for list Field Coverage (#391)
* Add logic to count list fields * Docs and tests * formatting * Remove unused * Lint * Update logic and add setting * Docs * lint docs --------- Co-authored-by: mauricio.barg <> Co-authored-by: Víctor Ruiz <[email protected]>
1 parent 0cf783f commit 44d5316

File tree

6 files changed

+504
-7
lines changed

6 files changed

+504
-7
lines changed

docs/source/settings.rst

+82
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,85 @@ If this setting is not provided or set to ``False``, spider statistics will be:
182182
'spidermon_item_scraped_count/dict/field_2': 2,
183183
'spidermon_field_coverage/dict/field_1': 1, # Did not ignore None value
184184
'spidermon_item_scraped_count/dict/field_2': 1,
185+
186+
SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS
187+
-------------------------------------
188+
Default: ``0``
189+
190+
If larger than 0, field coverage will be computed for items inside fields that are lists.
191+
The number represents how deep in the object tree the coverage is computed.
192+
Be aware that enabling this might have a significant impact on performance.
193+
194+
Considering your spider returns the following items:
195+
196+
.. code-block:: python
197+
198+
[
199+
{
200+
"field_1": None,
201+
"field_2": [{"nested_field1": "value", "nested_field2": "value"}],
202+
},
203+
{
204+
"field_1": "value",
205+
"field_2": [
206+
{"nested_field2": "value", "nested_field3": {"deeper_field1": "value"}}
207+
],
208+
},
209+
{
210+
"field_1": "value",
211+
"field_2": [
212+
{
213+
"nested_field2": "value",
214+
"nested_field4": [
215+
{"deeper_field41": "value"},
216+
{"deeper_field41": "value"},
217+
],
218+
}
219+
],
220+
},
221+
]
222+
223+
If this setting is not provided or set to ``0``, spider statistics will be:
224+
225+
.. code-block:: python
226+
227+
'item_scraped_count': 3,
228+
'spidermon_item_scraped_count': 3,
229+
'spidermon_item_scraped_count/dict': 3,
230+
'spidermon_item_scraped_count/dict/field_1': 3,
231+
'spidermon_item_scraped_count/dict/field_2': 3
232+
233+
If set to ``1``, spider statistics will be:
234+
235+
.. code-block:: python
236+
237+
'item_scraped_count': 3,
238+
'spidermon_item_scraped_count': 3,
239+
'spidermon_item_scraped_count/dict': 3,
240+
'spidermon_item_scraped_count/dict/field_1': 3,
241+
'spidermon_item_scraped_count/dict/field_2': 3,
242+
'spidermon_item_scraped_count/dict/field_2/_items': 3,
243+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field1': 1,
244+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field2': 3,
245+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3': 1,
246+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3/deeper_field1': 1,
247+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4': 1
248+
249+
If set to ``2``, spider statistics will be:
250+
251+
.. code-block:: python
252+
253+
'item_scraped_count': 3,
254+
'spidermon_item_scraped_count': 3,
255+
'spidermon_item_scraped_count/dict': 3,
256+
'spidermon_item_scraped_count/dict/field_1': 3,
257+
'spidermon_item_scraped_count/dict/field_2': 3,
258+
'spidermon_item_scraped_count/dict/field_2/_items': 3,
259+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field1': 1,
260+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field2': 3,
261+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3': 1,
262+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3/deeper_field1': 1,
263+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4': 1,
264+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4/_items': 2,
265+
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4/_items/deeper_field41': 2
266+

spidermon/contrib/scrapy/extensions.py

+34-2
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ def from_crawler(cls, crawler):
108108
crawler.signals.connect(ext.engine_stopped, signal=signals.engine_stopped)
109109

110110
has_field_coverage = crawler.settings.getbool("SPIDERMON_ADD_FIELD_COVERAGE")
111+
111112
if has_field_coverage:
112113
crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
113114

@@ -132,7 +133,14 @@ def engine_stopped(self):
132133
spider = self.crawler.spider
133134
self._run_suites(spider, self.engine_stopped_suites)
134135

135-
def _count_item(self, item, skip_none_values, item_count_stat=None):
136+
def _count_item(
137+
self,
138+
item,
139+
skip_none_values,
140+
item_count_stat=None,
141+
max_list_nesting_level=0,
142+
nesting_level=0,
143+
):
136144
if item_count_stat is None:
137145
item_type = type(item).__name__
138146
item_count_stat = f"spidermon_item_scraped_count/{item_type}"
@@ -149,6 +157,24 @@ def _count_item(self, item, skip_none_values, item_count_stat=None):
149157
self._count_item(value, skip_none_values, field_item_count_stat)
150158
continue
151159

160+
if (
161+
isinstance(value, list)
162+
and max_list_nesting_level > 0
163+
and nesting_level < max_list_nesting_level
164+
):
165+
items_count_stat = f"{field_item_count_stat}/_items"
166+
for list_item in value:
167+
self.crawler.stats.inc_value(items_count_stat)
168+
if isinstance(list_item, dict):
169+
self._count_item(
170+
list_item,
171+
skip_none_values,
172+
items_count_stat,
173+
max_list_nesting_level=max_list_nesting_level,
174+
nesting_level=nesting_level + 1,
175+
)
176+
continue
177+
152178
def _add_field_coverage_to_stats(self):
153179
stats = self.crawler.stats.get_stats()
154180
coverage_stats = calculate_field_coverage(stats)
@@ -158,8 +184,14 @@ def item_scraped(self, item, response, spider):
158184
skip_none_values = spider.crawler.settings.getbool(
159185
"SPIDERMON_FIELD_COVERAGE_SKIP_NONE", False
160186
)
187+
list_field_coverage_levels = spider.crawler.settings.getint(
188+
"SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS", 0
189+
)
190+
161191
self.crawler.stats.inc_value("spidermon_item_scraped_count")
162-
self._count_item(item, skip_none_values)
192+
self._count_item(
193+
item, skip_none_values, max_list_nesting_level=list_field_coverage_levels
194+
)
163195

164196
def _run_periodic_suites(self, spider, suites):
165197
suites = [self.load_suite(s) for s in suites]

spidermon/contrib/scrapy/monitors/monitors.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,27 @@ class FieldCoverageMonitor(BaseScrapyMonitor):
380380
You are not obligated to set rules for every field, just for the ones in which you are interested.
381381
Also, you can monitor nested fields if available in your returned items.
382382
383+
If a field returned by your spider is a list of dicts (or objects) and you want to check their
384+
coverage, that is also possible. You need to set the ``SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS``
385+
setting. This value represents how many levels inside the list the coverage will be computed
386+
(if the objects inside the list also have fields that are objects/lists).
387+
The coverage for list fields is computed in two ways: with
388+
respect to the total items scraped (these values can be greater than 1) and with respect to the
389+
total of items in the list. The stats are in the following form:
390+
391+
.. code-block:: python
392+
393+
{
394+
"spidermon_field_coverage/dict/field2/_items/nested_field1": "some_value",
395+
"spidermon_field_coverage/dict/field2/nested_field1": "other_value",
396+
}
397+
398+
The stat containing `_items` means it is calculated based on the total list items, while the
399+
other, based on the total number of scraped items.
400+
401+
If the objects in the list also contain another list field, that coverage is also computed in
402+
both ways, with the total list items considered for the `_items` stat being that of the innermost list.
403+
383404
In case you have a job without items scraped, and you want to skip this test, you have to enable the
384405
``SPIDERMON_FIELD_COVERAGE_SKIP_IF_NO_ITEM`` setting to avoid the field coverage monitor error.
385406
@@ -410,7 +431,9 @@ class MyCustomItem(scrapy.Item):
410431
SPIDERMON_FIELD_COVERAGE_RULES = {
411432
"MyCustomItem/field_1": 0.4,
412433
"MyCustomItem/field_2": 1.0,
413-
}"""
434+
}
435+
436+
"""
414437

415438
def run(self, result):
416439
add_field_coverage_set = self.crawler.settings.getbool(

spidermon/utils/field_coverage.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,33 @@ def calculate_field_coverage(stats):
1515
item_key = item_type_m.group(2)
1616

1717
item_type_total = stats.get(f"spidermon_item_scraped_count/{item_type}")
18-
field_coverage = value / item_type_total
1918

20-
coverage[
21-
f"spidermon_field_coverage/{item_type}/{item_key}"
22-
] = field_coverage
19+
if "_items" in item_key:
20+
if item_key.endswith("_items"):
21+
continue
22+
23+
levels = item_key.split("/_items/")
24+
25+
root_field_type_total = stats.get(
26+
f"spidermon_item_scraped_count/{item_type}/{'/_items/'.join(levels[:-1])}/_items"
27+
)
28+
29+
item_field_coverage = value / root_field_type_total
30+
global_field_coverage = value / item_type_total
31+
32+
coverage[
33+
f"spidermon_field_coverage/{item_type}/{'/'.join(levels)}"
34+
] = global_field_coverage
35+
36+
coverage[
37+
f"spidermon_field_coverage/{item_type}/{'/_items/'.join(levels)}"
38+
] = item_field_coverage
39+
40+
else:
41+
field_coverage = value / item_type_total
42+
43+
coverage[
44+
f"spidermon_field_coverage/{item_type}/{item_key}"
45+
] = field_coverage
2346

2447
return coverage

0 commit comments

Comments
 (0)