Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for list Field Coverage #391

Merged
merged 10 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions spidermon/contrib/scrapy/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ def _count_item(self, item, skip_none_values, item_count_stat=None):
self._count_item(value, skip_none_values, field_item_count_stat)
continue

if isinstance(value, list):
items_count_stat = f"{field_item_count_stat}/_items"
for list_item in value:
self.crawler.stats.inc_value(items_count_stat)
if isinstance(list_item, dict):
self._count_item(list_item, skip_none_values, items_count_stat)
continue

def _add_field_coverage_to_stats(self):
stats = self.crawler.stats.get_stats()
coverage_stats = calculate_field_coverage(stats)
Expand Down
18 changes: 18 additions & 0 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,24 @@ class FieldCoverageMonitor(BaseScrapyMonitor):
You are not obligated to set rules for every field, just for the ones in which you are interested.
Also, you can monitor nested fields if available in your returned items.

If a field returned by your spider is a list of dicts (or objects), you can also check the coverage
of the fields inside those list entries. Coverage for such fields is computed in two ways: relative
to the total number of items scraped (these values can be greater than 1) and relative to the
total number of entries in the list. The stats have the following form:

.. code-block:: python

{
"spidermon_field_coverage/dict/field2/_items/nested_field1": "some_value",
"spidermon_field_coverage/dict/field2/nested_field1": "other_value",
}

The stat whose key contains `_items` is calculated relative to the total number of list entries,
while the other one is calculated relative to the total number of scraped items.

If the objects in the list themselves contain another list field, its coverage is also computed in
both ways; for the `_items` stat, the total used is the number of entries in the innermost list.

.. warning::

Rules for nested fields will be validated against the total number of items returned.
Expand Down
34 changes: 30 additions & 4 deletions spidermon/utils/field_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@


def calculate_field_coverage(stats):
def handle_list_fields(stats, coverage, base_path):
root_field, _, nested_key = item_key.split("/", 2)

coverage = {}
for key, value in stats.items():
if not key.startswith("spidermon_item_scraped_count"):
Expand All @@ -15,10 +18,33 @@ def calculate_field_coverage(stats):
item_key = item_type_m.group(2)

item_type_total = stats.get(f"spidermon_item_scraped_count/{item_type}")
field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{item_key}"
] = field_coverage
if "_items" in item_key:
if item_key.endswith("_items"):
continue

levels = item_key.split("/_items/")

root_field_type_total = stats.get(
f"spidermon_item_scraped_count/{item_type}/{'/_items/'.join(levels[:-1])}/_items"
)

item_field_coverage = value / root_field_type_total
global_field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{'/'.join(levels)}"
] = global_field_coverage

coverage[
f"spidermon_field_coverage/{item_type}/{'/_items/'.join(levels)}"
] = item_field_coverage

else:
field_coverage = value / item_type_total

coverage[
f"spidermon_field_coverage/{item_type}/{item_key}"
] = field_coverage

return coverage
85 changes: 85 additions & 0 deletions tests/test_item_scraped_signal.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,88 @@ def test_item_scraped_count_do_not_ignore_none_values_by_default(spider):

assert stats.get("spidermon_item_scraped_count/dict/field1") == 2
assert stats.get("spidermon_item_scraped_count/dict/field2") == 2


def test_item_scraped_count_list_of_dicts(spider):
    """Check the scraped-count stats recorded for fields holding lists of dicts.

    Two items are sent through the ``item_scraped`` signal. Their ``field2``
    value is a list of dicts, some of which contain a further list of dicts
    (``nested_field3``) or a plain nested dict (``nested_field4``). The test
    then verifies the ``_items`` counters and the per-field counters found in
    the crawler stats.
    """
    first_item = {
        "field1": 1,
        "field2": [
            {
                "nested_field1": 1,
                "nested_field2": 1,
                "nested_field3": [
                    {"deep_field1": 1},
                    {"deep_field1": 1},
                    {"deep_field2": 1},
                ],
            },
            {"nested_field2": 1},
        ],
    }
    second_item = {
        "field1": 1,
        "field2": [
            {"nested_field1": 1},
            {
                "nested_field1": 1,
                "nested_field4": {"deep_field1": 1, "deep_field2": 1},
            },
            {"nested_field1": 1, "nested_field2": 1},
        ],
    }

    for scraped in (first_item, second_item):
        spider.crawler.signals.send_catch_log_deferred(
            signal=signals.item_scraped,
            item=scraped,
            response="",
            spider=spider,
        )

    collected = spider.crawler.stats.get_stats()

    # Expected counters, checked in the same order as they are listed here.
    expected_counts = {
        # 2 list entries in the first item + 3 in the second.
        "spidermon_item_scraped_count/dict/field2/_items": 5,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field1": 4,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field2": 3,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3": 1,
        # Entries of the inner list found under nested_field3.
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items": 3,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items/deep_field1": 2,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items/deep_field2": 1,
        # nested_field4 is a plain dict, so there is no "_items" level below it.
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4": 1,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4/deep_field1": 1,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4/deep_field2": 1,
    }

    for stat_name, expected in expected_counts.items():
        assert collected.get(stat_name) == expected
45 changes: 45 additions & 0 deletions tests/utils/test_field_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,48 @@ def test_calculate_field_coverage_from_stats():
coverage = calculate_field_coverage(spider_stats)

assert coverage == expected_coverage


def test_calculate_field_coverage_from_stats_with_nested_fields():
    """Check coverage computed from stats that include list (``_items``) counters.

    Each field found inside a list is expected under two keys: one that keeps
    the ``_items`` path segments (count divided by the number of entries in
    the enclosing list) and one without them (count divided by the number of
    scraped items, which may exceed 1).
    """
    raw_stats = {
        "finish_reason": "finished",
        "spidermon_item_scraped_count": 100,
        "spidermon_item_scraped_count/dict": 100,
        "spidermon_item_scraped_count/dict/field1": 100,
        "spidermon_item_scraped_count/dict/field2": 90,
        "spidermon_item_scraped_count/dict/field2/_items": 1000,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field1": 550,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field2": 1000,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3": 300,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items": 500,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items/deep_field1": 500,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field3/_items/deep_field2": 250,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4": 500,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4/deep_field1": 500,
        "spidermon_item_scraped_count/dict/field2/_items/nested_field4/deep_field2": 250,
    }

    # Coverage relative to the enclosing list's entry count ("_items" kept in the key).
    per_list_entry = {
        "spidermon_field_coverage/dict/field2/_items/nested_field1": 0.55,
        "spidermon_field_coverage/dict/field2/_items/nested_field2": 1.0,
        "spidermon_field_coverage/dict/field2/_items/nested_field3": 0.3,
        "spidermon_field_coverage/dict/field2/_items/nested_field3/_items/deep_field1": 1.0,
        "spidermon_field_coverage/dict/field2/_items/nested_field3/_items/deep_field2": 0.5,
        "spidermon_field_coverage/dict/field2/_items/nested_field4": 0.5,
        "spidermon_field_coverage/dict/field2/_items/nested_field4/deep_field1": 0.5,
        "spidermon_field_coverage/dict/field2/_items/nested_field4/deep_field2": 0.25,
    }

    # Coverage relative to the 100 scraped items ("_items" dropped from the key).
    per_scraped_item = {
        "spidermon_field_coverage/dict/field1": 1.0,
        "spidermon_field_coverage/dict/field2": 0.9,
        "spidermon_field_coverage/dict/field2/nested_field1": 5.5,
        "spidermon_field_coverage/dict/field2/nested_field2": 10.0,
        "spidermon_field_coverage/dict/field2/nested_field3": 3.0,
        "spidermon_field_coverage/dict/field2/nested_field3/deep_field1": 5.0,
        "spidermon_field_coverage/dict/field2/nested_field3/deep_field2": 2.5,
        "spidermon_field_coverage/dict/field2/nested_field4": 5.0,
        "spidermon_field_coverage/dict/field2/nested_field4/deep_field1": 5.0,
        "spidermon_field_coverage/dict/field2/nested_field4/deep_field2": 2.5,
    }

    wanted = {**per_scraped_item, **per_list_entry}

    assert calculate_field_coverage(raw_stats) == wanted