Consider using @dataclass in items.py #995
I tried to switch to Pydantic per open-contracting/software-development-handbook#52, but I couldn't get rid of the remaining errors. Note that Scrapy doesn't yet support Pydantic 2, so this switch would have to be updated again once it does: scrapy/itemadapter#72
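For reference, `itemadapter` is the layer Scrapy uses to treat dataclasses, attrs classes, and Pydantic (v1) models uniformly, which is why scrapy/itemadapter#72 blocks Pydantic 2. A minimal sketch (the `Item` model here is illustrative, not the repository's):

```python
# A minimal sketch of itemadapter's dict-like access over a Pydantic v1 model;
# the Item class is illustrative only.
import pydantic
from itemadapter import ItemAdapter


class Item(pydantic.BaseModel):
    file_name: str


adapter = ItemAdapter(Item(file_name="a.json"))
print(adapter["file_name"])  # 'a.json'
```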
I also had to add some `seek(0)` calls. The full diff:

```diff
diff --git a/kingfisher_scrapy/base_spiders/compressed_file_spider.py b/kingfisher_scrapy/base_spiders/compressed_file_spider.py
index ceeb7a12..9c0bf047 100644
--- a/kingfisher_scrapy/base_spiders/compressed_file_spider.py
+++ b/kingfisher_scrapy/base_spiders/compressed_file_spider.py
@@ -58,6 +58,7 @@ class CompressedFileSpider(BaseSpider):
def parse(self, response):
archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])
+ # NOTE: If support is added for additional archive formats, remember to update the `data` field in `items.py`.
if archive_format == 'zip':
cls = ZipFile
elif archive_format == 'rar':
diff --git a/kingfisher_scrapy/item_schema/File.json b/kingfisher_scrapy/item_schema/File.json
deleted file mode 100644
index 2e2edc43..00000000
--- a/kingfisher_scrapy/item_schema/File.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
- "$schema": "http://json-schema.org/draft-04/schema#",
- "allOf": [
- {
- "$ref": "urn:item#/definitions/KingfisherFileItem"
- }
- ],
- "type": "object",
- "properties": {
- "path": {
- "type": "string"
- }
- }
-}
diff --git a/kingfisher_scrapy/item_schema/FileError.json b/kingfisher_scrapy/item_schema/FileError.json
deleted file mode 100644
index fc7e076a..00000000
--- a/kingfisher_scrapy/item_schema/FileError.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "$schema": "http://json-schema.org/draft-04/schema#",
- "allOf": [
- {
- "$ref": "urn:item#/definitions/KingfisherItem"
- }
- ],
- "type": "object",
- "properties": {
- "errors": {
- "minLength": 1
- }
- },
- "required": [
- "errors"
- ]
-}
diff --git a/kingfisher_scrapy/item_schema/FileItem.json b/kingfisher_scrapy/item_schema/FileItem.json
deleted file mode 100644
index 922852d2..00000000
--- a/kingfisher_scrapy/item_schema/FileItem.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "$schema": "http://json-schema.org/draft-04/schema#",
- "allOf": [
- {
- "$ref": "urn:item#/definitions/KingfisherFileItem"
- }
- ],
- "type": "object",
- "properties": {
- "number": {
- "type": "integer",
- "minimum": 1
- }
- },
- "required": [
- "number"
- ]
-}
diff --git a/kingfisher_scrapy/item_schema/item.json b/kingfisher_scrapy/item_schema/item.json
deleted file mode 100644
index 821ba31f..00000000
--- a/kingfisher_scrapy/item_schema/item.json
+++ /dev/null
@@ -1,48 +0,0 @@
-{
- "$schema": "http://json-schema.org/draft-04/schema#",
- "definitions": {
- "KingfisherItem": {
- "type": "object",
- "properties": {
- "file_name": {
- "type": "string",
- "pattern": "^[^/]+$"
- },
- "url": {
- "type": "string",
- "format": "uri"
- }
- },
- "required": [
- "file_name",
- "url"
- ]
- },
- "KingfisherFileItem": {
- "allOf": [
- {
- "$ref": "#/definitions/KingfisherItem"
- }
- ],
- "type": "object",
- "properties": {
- "data_type": {
- "type": "string",
- "enum": [
- "record",
- "release",
- "record_package",
- "release_package"
- ]
- },
- "data": {
- "minLength": 1
- }
- },
- "required": [
- "data_type",
- "data"
- ]
- }
- }
-}
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index 849112b4..a3714d83 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -1,37 +1,51 @@
+import enum
import typing
+import zipfile
-import scrapy
-from dataclasses import dataclass
+import pydantic
+import rarfile
+
+# CompressedFileSpider
+# zipfile.ZipExtFile: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.open
+# rarfile.RarExtFile: https://rarfile.readthedocs.io/api.html#rarfile.RarFile.open
+# DigiwhistBase
+# io.BufferedReader: https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile
+Data = bytes | dict | list | typing.Any | rarfile.RarExtFile | zipfile.ZipExtFile
+
+kwargs = {'arbitrary_types_allowed': True, 'use_enum_values': True}
-@dataclass
-class Item:
- file_name: str
- url: str
+class DataType(str, enum.Enum):
+ record = "record"
+ release = "release"
+ record_package = "record_package"
+ release_package = "release_package"
-@dataclass
-class File(Item):
- data_type: str
- data: typing.Any
+class Base(pydantic.BaseModel):
+ file_name: pydantic.constr(regex=r"^[^/]+$")
+ url: pydantic.AnyUrl
+
+
+class File(Base, **kwargs):
+ data: Data
+ data_type: DataType
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path: str = ""
-@dataclass
-class FileItem(Item):
- data_type: str
- data: typing.Any
- number: int
+# Do not inherit from File, as otherwise ``isinstance(item, File)`` passes for FileItem items.
+class FileItem(Base, **kwargs):
+ data: Data
+ data_type: DataType
+ number: pydantic.PositiveInt
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path: str = ""
-@dataclass
-class FileError(Item):
+class FileError(Base):
errors: dict
-@dataclass
-class PluckedItem:
+class PluckedItem(pydantic.BaseModel):
value: typing.Any
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 9160307c..7f7774d7 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -9,44 +9,25 @@ import warnings
import ijson
import jsonpointer
from flattentool import unflatten
-from jsonschema import FormatChecker
-from jsonschema.validators import Draft4Validator
from ocdsmerge.util import get_release_schema_url, get_tags
-from referencing import Registry, Resource
from scrapy.exceptions import DropItem, NotSupported
from kingfisher_scrapy.items import File, FileItem, PluckedItem
from kingfisher_scrapy.util import transcode
-def _json_loads(basename):
- return json.loads(pkgutil.get_data('kingfisher_scrapy', f'item_schema/{basename}.json'))
-
# https://docs.scrapy.org/en/latest/topics/item-pipeline.html#duplicates-filter
class Validate:
"""
Drops duplicate files based on ``file_name`` and file items based on ``file_name`` and ``number``.
-
- :raises jsonschema.ValidationError: if the item is invalid
"""
def __init__(self):
- self.validators = {}
self.files = set()
self.file_items = set()
- schema = Resource.from_contents(_json_loads('item'))
- registry = Registry().with_resource('urn:item', schema)
- checker = FormatChecker()
- for item in ('File', 'FileError', 'FileItem'):
- self.validators[item] = Draft4Validator(_json_loads(item), registry=registry, format_checker=checker)
-
def process_item(self, item, spider):
- validator = self.validators.get(item.__class__.__name__)
- if validator:
- validator.validate(item.__dict__)
-
if isinstance(item, FileItem):
key = (item.file_name, item.number)
if key in self.file_items:
diff --git a/kingfisher_scrapy/spidermiddlewares.py b/kingfisher_scrapy/spidermiddlewares.py
index 330a1866..ff0a2247 100644
--- a/kingfisher_scrapy/spidermiddlewares.py
+++ b/kingfisher_scrapy/spidermiddlewares.py
@@ -65,7 +65,9 @@ class LineDelimitedMiddleware:
data = item.data
# Data can be bytes or a file-like object.
- if isinstance(data, bytes):
+ if hasattr(data, 'read'):
+ data.seek(0)
+ elif isinstance(data, bytes):
data = data.splitlines(True)
for number, line in enumerate(data, 1):
@@ -174,6 +176,7 @@ class AddPackageMiddleware:
data = item.data
if hasattr(data, 'read'):
+ data.seek(0)
data = data.read()
# If the spider's ``root_path`` class attribute is non-empty, then the JSON data is already parsed.
@@ -262,9 +265,12 @@ class ReadDataMiddleware:
yield item
continue
+ item.data.seek(0)
data = item.data.read()
+
item.data.close()
item.data = data
+
yield item
diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index d64adfb1..20544575 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -284,6 +284,8 @@ class TranscodeFile:
self.file = file
self.encoding = encoding
+ self.file.seek(0)
+
def read(self, buf_size):
"""
Re-encodes bytes read from the file to UTF-8.
diff --git a/requirements.in b/requirements.in
index 97796633..f5b4e11c 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,12 +1,11 @@
flattentool
ijson
jsonpointer
-jsonschema
ocdskit[perf]
ocdsmerge
psycopg2
+pydantic<2
rarfile
-referencing
requests
rfc3986-validator
scrapy
diff --git a/requirements.txt b/requirements.txt
index ee3d0ae4..387d9167 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,8 +7,6 @@
attrs==22.2.0
# via
# automat
- # jsonschema
- # referencing
# service-identity
# twisted
automat==0.8.0
@@ -84,10 +82,6 @@ jsonref==1.0.1
# ocdsextensionregistry
# ocdskit
# ocdsmerge
-jsonschema==4.18.0a9
- # via -r requirements.in
-jsonschema-specifications==2023.5.1
- # via jsonschema
lxml==4.9.2
# via
# flattentool
@@ -135,6 +129,8 @@ pyasn1-modules==0.2.7
# via service-identity
pycparser==2.19
# via cffi
+pydantic==1.10.15
+ # via -r requirements.in
pydispatcher==2.0.6
# via scrapy
pyopenssl==24.0.0
@@ -145,11 +141,6 @@ queuelib==1.6.2
# via scrapy
rarfile==3.1
# via -r requirements.in
-referencing==0.28.5
- # via
- # -r requirements.in
- # jsonschema
- # jsonschema-specifications
requests==2.31.0
# via
# -r requirements.in
@@ -165,10 +156,6 @@ requests-file==1.5.1
# via tldextract
rfc3986-validator==0.1.1
# via -r requirements.in
-rpds-py==0.7.1
- # via
- # jsonschema
- # referencing
schema==0.7.2
# via flattentool
scrapy==2.11.1
@@ -202,7 +189,9 @@ twisted==23.10.0
# scrapy
# scrapyd
typing-extensions==4.8.0
- # via twisted
+ # via
+ # pydantic
+ # twisted
uberegg==0.1.1
# via scrapyd-client
url-normalize==1.4.3
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 85dd9313..f0a8ce45 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -8,8 +8,6 @@ attrs==22.2.0
# via
# -r requirements.txt
# automat
- # jsonschema
- # referencing
# service-identity
# twisted
automat==0.8.0
@@ -151,12 +149,6 @@ jsonref==1.0.1
# ocdsextensionregistry
# ocdskit
# ocdsmerge
-jsonschema==4.18.0a9
- # via -r requirements.txt
-jsonschema-specifications==2023.5.1
- # via
- # -r requirements.txt
- # jsonschema
lxml==4.9.2
# via
# -r requirements.txt
@@ -238,6 +230,8 @@ pycparser==2.19
# via
# -r requirements.txt
# cffi
+pydantic==1.10.15
+ # via -r requirements.txt
pydispatcher==2.0.6
# via
# -r requirements.txt
@@ -275,11 +269,6 @@ queuelib==1.6.2
# scrapy
rarfile==3.1
# via -r requirements.txt
-referencing==0.28.5
- # via
- # -r requirements.txt
- # jsonschema
- # jsonschema-specifications
requests==2.31.0
# via
# -r requirements.txt
@@ -300,11 +289,6 @@ requests-file==1.5.1
# tldextract
rfc3986-validator==0.1.1
# via -r requirements.txt
-rpds-py==0.7.1
- # via
- # -r requirements.txt
- # jsonschema
- # referencing
schema==0.7.2
# via
# -r requirements.txt
@@ -357,6 +341,7 @@ twisted==23.10.0
typing-extensions==4.8.0
# via
# -r requirements.txt
+ # pydantic
# twisted
uberegg==0.1.1
# via
diff --git a/setup.py b/setup.py
index 61a8bbbe..2c0ca8e8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,10 +5,6 @@ from setuptools import find_packages, setup
setup(
packages=find_packages(exclude=['tests', 'tests.*']),
- package_data={
- 'kingfisher_scrapy': ['item_schema/*.json'],
- },
- include_package_data=True,
entry_points={
'scrapy': [
'settings = kingfisher_scrapy.settings',
```
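With this diff, validation that previously ran in the `Validate` pipeline (via jsonschema) happens when an item is constructed. A minimal sketch, assuming the Pydantic v1 models defined in the diff:

```python
# A sketch assuming the Pydantic v1 models from the diff above; constructing an
# invalid item raises pydantic.ValidationError instead of jsonschema.ValidationError.
import pydantic

from kingfisher_scrapy.items import File

try:
    # file_name contains a slash, which the constr(regex=r"^[^/]+$") field rejects.
    File(file_name="a/b.json", url="http://example.com", data=b"{}", data_type="release")
except pydantic.ValidationError as e:
    print(e)
```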
I did a bit more testing, and Pydantic v1 and v2 both have issues if the field is allowed to be a dict (v1) or a list (v2) in addition to a file-like object.

Edit: I think it's related to union validation: trying the dict or list candidate first iterates (and so consumes) the file-like object.

```python
import pydantic
from pathlib import Path
from typing import Any


class AnyData(pydantic.BaseModel, arbitrary_types_allowed=True):
    data: Any


class DictOrAnyData(pydantic.BaseModel, arbitrary_types_allowed=True):
    data: dict | Any


class ListOrAnyData(pydantic.BaseModel, arbitrary_types_allowed=True):
    data: list | Any


path = Path("test.txt")
path.write_text("foo\nbar\nbaz\nbzz\nzzz")

t = AnyData(data=path.open("rb"))
print(t.data.read())  # prints the file contents

t = DictOrAnyData(data=path.open("rb"))
print(t.data.read())  # on Pydantic v1, prints b'' (dict validation consumed the stream)

t = ListOrAnyData(data=path.open("rb"))
print(t.data.read())  # on Pydantic v2, prints b'' (list validation consumed the stream)
```
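If that diagnosis is right, it also explains the `seek(0)` calls in the diff above: rewinding after construction makes the stream readable again. A hypothetical continuation of the script:

```python
# Hypothetical continuation of the script above: rewind after validation,
# mirroring the seek(0) calls added in the diff.
t = DictOrAnyData(data=path.open("rb"))
t.data.seek(0)  # validation may have advanced the read position
print(t.data.read())  # prints the file contents again
```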
https://docs.scrapy.org/en/latest/topics/items.html
Once we deploy to Python >= 3.7.
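For reference, the dataclass version this issue proposed (and which the diff in the comment above shows being replaced by Pydantic) is along these lines:

```python
# A minimal sketch of the dataclass items (matching the removed lines in the
# diff above).
import typing
from dataclasses import dataclass


@dataclass
class Item:
    file_name: str
    url: str


@dataclass
class File(Item):
    data_type: str
    data: typing.Any
    # Added by the FilesStore extension.
    path: str = ""
```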