Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for list Field Coverage #391

Merged
merged 10 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions docs/source/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,85 @@ If this setting is not provided or set to ``False``, spider statistics will be:
'spidermon_item_scraped_count/dict/field_2': 2,
'spidermon_field_coverage/dict/field_1': 1, # Did not ignore None value
'spidermon_item_scraped_count/dict/field_2': 1,

SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS
-------------------------------------
Default: ``0``

If larger than 0, field coverage will be computed for items inside fields that are lists.
The number represents how deep in the objects tree the coverage is computed.
Be aware that enabling this might have a significant impact on performance.

Considering your spider returns the following items:

.. code-block:: python

[
{
"field_1": None,
"field_2": [{"nested_field1": "value", "nested_field2": "value"}],
},
{
"field_1": "value",
"field_2": [
{"nested_field2": "value", "nested_field3": {"deeper_field1": "value"}}
],
},
{
"field_1": "value",
"field_2": [
{
"nested_field2": "value",
"nested_field4": [
{"deeper_field41": "value"},
{"deeper_field41": "value"},
],
}
],
},
]

If this setting is not provided or set to ``0``, spider statistics will be:

.. code-block:: python

'item_scraped_count': 3,
'spidermon_item_scraped_count': 3,
'spidermon_item_scraped_count/dict': 3,
'spidermon_item_scraped_count/dict/field_1': 3,
'spidermon_item_scraped_count/dict/field_2': 3

If set to ``1``, spider statistics will be:

.. code-block:: python

'item_scraped_count': 3,
'spidermon_item_scraped_count': 3,
'spidermon_item_scraped_count/dict': 3,
'spidermon_item_scraped_count/dict/field_1': 3,
'spidermon_item_scraped_count/dict/field_2': 3,
'spidermon_item_scraped_count/dict/field_2/_items': 3,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field1': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field2': 3,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3/deeper_field1': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4': 1

If set to ``2``, spider statistics will be:

.. code-block:: python

'item_scraped_count': 3,
'spidermon_item_scraped_count': 3,
'spidermon_item_scraped_count/dict': 3,
'spidermon_item_scraped_count/dict/field_1': 3,
'spidermon_item_scraped_count/dict/field_2': 3,
'spidermon_item_scraped_count/dict/field_2/_items': 3,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field1': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field2': 3,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field3/deeper_field1': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4': 1,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4/_items': 2,
'spidermon_item_scraped_count/dict/field_2/_items/nested_field4/_items/deeper_field41': 2

36 changes: 34 additions & 2 deletions spidermon/contrib/scrapy/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def from_crawler(cls, crawler):
crawler.signals.connect(ext.engine_stopped, signal=signals.engine_stopped)

has_field_coverage = crawler.settings.getbool("SPIDERMON_ADD_FIELD_COVERAGE")

if has_field_coverage:
crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)

Expand All @@ -132,7 +133,14 @@ def engine_stopped(self):
spider = self.crawler.spider
self._run_suites(spider, self.engine_stopped_suites)

def _count_item(self, item, skip_none_values, item_count_stat=None):
def _count_item(
self,
item,
skip_none_values,
item_count_stat=None,
max_list_nesting_level=0,
nesting_level=0,
):
if item_count_stat is None:
item_type = type(item).__name__
item_count_stat = f"spidermon_item_scraped_count/{item_type}"
Expand All @@ -149,6 +157,24 @@ def _count_item(self, item, skip_none_values, item_count_stat=None):
self._count_item(value, skip_none_values, field_item_count_stat)
continue

if (
isinstance(value, list)
and max_list_nesting_level > 0
and nesting_level < max_list_nesting_level
):
items_count_stat = f"{field_item_count_stat}/_items"
for list_item in value:
self.crawler.stats.inc_value(items_count_stat)
if isinstance(list_item, dict):
self._count_item(
list_item,
skip_none_values,
items_count_stat,
max_list_nesting_level=max_list_nesting_level,
nesting_level=nesting_level + 1,
)
continue

def _add_field_coverage_to_stats(self):
stats = self.crawler.stats.get_stats()
coverage_stats = calculate_field_coverage(stats)
Expand All @@ -158,8 +184,14 @@ def item_scraped(self, item, response, spider):
skip_none_values = spider.crawler.settings.getbool(
"SPIDERMON_FIELD_COVERAGE_SKIP_NONE", False
)
list_field_coverage_levels = spider.crawler.settings.getint(
"SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS", 0
)

self.crawler.stats.inc_value("spidermon_item_scraped_count")
self._count_item(item, skip_none_values)
self._count_item(
item, skip_none_values, max_list_nesting_level=list_field_coverage_levels
)

def _run_periodic_suites(self, spider, suites):
suites = [self.load_suite(s) for s in suites]
Expand Down
25 changes: 24 additions & 1 deletion spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,27 @@ class FieldCoverageMonitor(BaseScrapyMonitor):
You are not obligated to set rules for every field, just for the ones in which you are interested.
Also, you can monitor nested fields if available in your returned items.

If a field returned by your spider is a list of dicts (or objects) and you want to check their
coverage, that is also possible. You need to set the ``SPIDERMON_LIST_FIELDS_COVERAGE_LEVELS``
setting. This value represents how many levels deep inside the list the coverage will be computed
(if the objects inside the list also have fields that are objects/lists).
The coverage for list fields is computed in two ways: with
respect to the total number of items scraped (these values can be greater than 1) and with respect
to the total number of items in the list. The stats are in the following form:

.. code-block:: python

{
"spidermon_field_coverage/dict/field2/_items/nested_field1": "some_value",
"spidermon_field_coverage/dict/field2/nested_field1": "other_value",
}

The stat containing ``_items`` means it is calculated based on the total number of list items,
while the other is based on the total number of scraped items.

If the objects in the list also contain another list field, that coverage is also computed in
both ways, with the ``_items`` stat computed relative to the total number of items in the
innermost list.

In case you have a job without items scraped, and you want to skip this test, you have to enable the
``SPIDERMON_FIELD_COVERAGE_SKIP_IF_NO_ITEM`` setting to avoid the field coverage monitor error.

Expand Down Expand Up @@ -410,7 +431,9 @@ class MyCustomItem(scrapy.Item):
SPIDERMON_FIELD_COVERAGE_RULES = {
"MyCustomItem/field_1": 0.4,
"MyCustomItem/field_2": 1.0,
}"""
}

"""

def run(self, result):
add_field_coverage_set = self.crawler.settings.getbool(
Expand Down
31 changes: 27 additions & 4 deletions spidermon/utils/field_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,33 @@ def calculate_field_coverage(stats):
item_key = item_type_m.group(2)

item_type_total = stats.get(f"spidermon_item_scraped_count/{item_type}")
field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{item_key}"
] = field_coverage
if "_items" in item_key:
if item_key.endswith("_items"):
continue

levels = item_key.split("/_items/")

root_field_type_total = stats.get(
f"spidermon_item_scraped_count/{item_type}/{'/_items/'.join(levels[:-1])}/_items"
)

item_field_coverage = value / root_field_type_total
global_field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{'/'.join(levels)}"
] = global_field_coverage

coverage[
f"spidermon_field_coverage/{item_type}/{'/_items/'.join(levels)}"
] = item_field_coverage

else:
field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{item_key}"
] = field_coverage

return coverage
Loading