Skip to content

Commit ea21cee

Browse files
authored
Fix ItemValidationPipeline using __setitem__ without ItemAdapter (#415)
* Fix ItemValidationPipeline using __setitem__ without ItemAdapter * Using typing Dict for python3.8 compatibility * Refactor to avoid casting to list before len * Add test case to check when validation field is None * Simplify check for validation field in item or None * Increase coverage with additional tests
1 parent e070dd8 commit ea21cee

File tree

2 files changed

+94
-25
lines changed

2 files changed

+94
-25
lines changed

Diff for: spidermon/contrib/scrapy/pipelines.py

+10-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from collections import defaultdict
2+
from typing import Dict
3+
24
from itemadapter import ItemAdapter
35

46
from scrapy.exceptions import DropItem, NotConfigured
@@ -103,15 +105,16 @@ def process_item(self, item, _):
103105
# No validators match this specific item type
104106
return item
105107

106-
data = self._convert_item_to_dict(item)
108+
item_adapter = ItemAdapter(item)
109+
item_dict = item_adapter.asdict()
107110
self.stats.add_item()
108-
self.stats.add_fields(len(list(data.keys())))
111+
self.stats.add_fields(len(item_dict.keys()))
109112
for validator in validators:
110-
ok, errors = validator.validate(data)
113+
ok, errors = validator.validate(item_dict)
111114
if not ok:
112115
self._add_error_stats(errors)
113116
if self.add_errors_to_items:
114-
self._add_errors_to_item(item, errors)
117+
self._add_errors_to_item(item_adapter, errors)
115118
if self.drop_items_with_errors:
116119
self._drop_item(item, errors)
117120
return item
@@ -120,16 +123,12 @@ def find_validators(self, item):
120123
find = lambda x: self.validators.get(x.__name__, [])
121124
return find(item.__class__) or find(Item)
122125

123-
def _convert_item_to_dict(self, item):
124-
return ItemAdapter(item).asdict()
125-
126-
def _add_errors_to_item(self, item, errors):
127-
data = ItemAdapter(item)
128-
if self.errors_field not in data:
126+
def _add_errors_to_item(self, item: ItemAdapter, errors: Dict[str, str]):
127+
if item.get(self.errors_field, None) is None:
129128
item[self.errors_field] = defaultdict(list)
130129

131130
for field_name, messages in errors.items():
132-
data[self.errors_field][field_name] += messages
131+
item[self.errors_field][field_name] += messages
133132

134133
def _drop_item(self, item, errors):
135134
"""

Diff for: tests/contrib/validation/test_item_validation_pipeline.py

+84-14
Original file line numberDiff line numberDiff line change
@@ -98,13 +98,19 @@ def test_validation_errors_field(dummy_schema):
9898
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
9999
}
100100

101-
item = {"no": "schema"}
102-
103101
crawler = get_crawler(settings_dict=settings)
104102
pipeline = ItemValidationPipeline.from_crawler(crawler)
103+
104+
# Instantiate validation field if not defined
105+
item = {"no": "schema"}
105106
item = pipeline.process_item(item, None)
106107
assert "custom_validation_field" in item
107108

109+
# Instantiate validation field if None
110+
item = {"no": "schema", "custom_validation_field": None}
111+
item = pipeline.process_item(item, None)
112+
assert item["custom_validation_field"] is not None
113+
108114

109115
def test_add_error_to_items_undefined_validation_field(dummy_schema):
110116
settings = {
@@ -138,20 +144,84 @@ class DataclassItem:
138144
foo: str
139145

140146
item = DataclassItem(foo="invalid")
141-
# Does not support item assignment
147+
# Supports item assignment but does not support field
148+
with pytest.raises(KeyError, match="custom_validation_field"):
149+
item = pipeline.process_item(item, None)
150+
151+
152+
def test_not_configured():
153+
# No validators
154+
settings = {
155+
"SPIDERMON_ENABLED": True,
156+
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
157+
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
158+
}
159+
crawler = get_crawler(settings_dict=settings)
142160
with pytest.raises(
143-
TypeError, match="'DataclassItem' object does not support item assignment"
161+
scrapy.exceptions.NotConfigured, match="No validators were found"
144162
):
145-
item = pipeline.process_item(item, None)
163+
ItemValidationPipeline.from_crawler(crawler)
146164

147-
@dataclass
148-
class DataclassItemWithItemAssignment:
149-
foo: str
165+
# Invalid validator type
166+
settings = {
167+
"SPIDERMON_ENABLED": True,
168+
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
169+
"SPIDERMON_VALIDATION_SCHEMAS": object(),
170+
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
171+
}
172+
crawler = get_crawler(settings_dict=settings)
173+
with pytest.raises(
174+
scrapy.exceptions.NotConfigured,
175+
match=r"Invalid <.*> type for <.*> settings",
176+
):
177+
ItemValidationPipeline.from_crawler(crawler)
150178

151-
def __setitem__(self, key, value):
152-
setattr(self, key, value)
179+
# Invalid schema type
180+
settings = {
181+
"SPIDERMON_ENABLED": True,
182+
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
183+
"SPIDERMON_VALIDATION_SCHEMAS": [False],
184+
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
185+
}
186+
crawler = get_crawler(settings_dict=settings)
187+
with pytest.raises(
188+
scrapy.exceptions.NotConfigured,
189+
match=r"Invalid schema, jsonschemas must be defined as:.*",
190+
):
191+
ItemValidationPipeline.from_crawler(crawler)
153192

154-
item = DataclassItemWithItemAssignment(foo="invalid")
155-
# Supports item assignment but does not support field
156-
with pytest.raises(KeyError, match="custom_validation_field"):
157-
item = pipeline.process_item(item, None)
193+
194+
def test_drop_invalid_item(dummy_schema):
195+
settings = {
196+
"SPIDERMON_ENABLED": True,
197+
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
198+
"SPIDERMON_VALIDATION_SCHEMAS": [dummy_schema],
199+
"SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
200+
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
201+
}
202+
203+
crawler = get_crawler(settings_dict=settings)
204+
pipeline = ItemValidationPipeline.from_crawler(crawler)
205+
206+
item = {"foo": "invalid"}
207+
with pytest.raises(scrapy.exceptions.DropItem):
208+
pipeline.process_item(item, None)
209+
210+
211+
def test_ignore_classes_without_schema(dummy_schema):
212+
settings = {
213+
"SPIDERMON_ENABLED": True,
214+
"SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS": True,
215+
"SPIDERMON_VALIDATION_SCHEMAS": {scrapy.Item: dummy_schema},
216+
"SPIDERMON_VALIDATION_DROP_ITEMS_WITH_ERRORS": True,
217+
"SPIDERMON_VALIDATION_ERRORS_FIELD": "custom_validation_field",
218+
}
219+
crawler = get_crawler(settings_dict=settings)
220+
pipeline = ItemValidationPipeline.from_crawler(crawler)
221+
222+
@dataclass
223+
class DummyItem:
224+
foo: str = "bar"
225+
226+
item = DummyItem()
227+
pipeline.process_item(item, None)

0 commit comments

Comments
 (0)