Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
cli: marc21json cli function to use split json schemas
Browse files Browse the repository at this point in the history
* Fixes the marc21json cli function to work properly with json schema files with $refs.

Co-Authored-by: Peter Weber <peter.weber@rero.ch>
rerowep committed Jul 31, 2020
1 parent 444d6fd commit 1142f2e
Showing 3 changed files with 183 additions and 115 deletions.
63 changes: 25 additions & 38 deletions rero_ils/modules/cli.py
Original file line number Diff line number Diff line change
@@ -34,7 +34,6 @@
from glob import glob

import click
import jsonref
import polib
import pycountry
import requests
@@ -48,6 +47,7 @@
from invenio_accounts.cli import commit, users
from invenio_app.factory import static_folder
from invenio_db import db
from invenio_jsonschemas.proxies import current_jsonschemas
from invenio_pidstore.models import PersistentIdentifier, PIDStatus
from invenio_records.api import Record
from invenio_records_rest.utils import obj_or_import_string
@@ -56,7 +56,6 @@
from jsonschema import validate
from jsonschema.exceptions import ValidationError
from lxml import etree
from pkg_resources import resource_string
from werkzeug.local import LocalProxy

from .api import IlsRecordsIndexer
@@ -68,8 +67,10 @@
from .tasks import process_bulk_queue
from .utils import read_json_record
from ..modules.providers import append_fixtures_new_identifiers
from ..modules.utils import get_schema_for_resource

_datastore = LocalProxy(lambda: current_app.extensions['security'].datastore)
_records_state = LocalProxy(lambda: current_app.extensions['invenio-records'])


def abort_if_false(ctx, param, value):
@@ -555,23 +556,20 @@ def test_license(file, extension, license_lines, verbose):

@utils.command('validate')
@click.argument('jsonfile', type=click.File('r'))
@click.argument('type', default='documents')
@click.argument('schema', default='document-v0.0.1.json')
@click.argument('type', default='doc')
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-e', '--error_file', 'error_file', type=click.File('w'),
default=None)
@click.option('-o', '--ok_file', 'ok_file', type=click.File('w'), default=None)
def check_validate(jsonfile, type, schema, verbose, error_file, ok_file):
@with_appcontext
def check_validate(jsonfile, type, verbose, error_file, ok_file):
"""Check record validation."""
click.secho('Testing json schema for file', fg='green')
schema_in_bytes = resource_string(
'rero_ils.modules.{type}.jsonschemas'.format(type=type),
'{type}/{schema}'.format(
type=type,
schema=schema
)
)
schema = jsonref.loads(schema_in_bytes.decode('utf8'))

path = current_jsonschemas.url_to_path(get_schema_for_resource(type))
schema = current_jsonschemas.get_schema(path=path)
schema = _records_state.replace_refs(schema)

datas = json.load(jsonfile)
count = 0
for data in datas:
@@ -599,27 +597,8 @@ def check_validate(jsonfile, type, schema, verbose, error_file, ok_file):
click.secho(str(err))


@utils.command('compile_json')
@click.argument('src_jsonfile', type=click.File('r'))
@click.option('-o', '--output', 'output', type=click.File('w'), default=None)
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
def compile_json(src_jsonfile, output, verbose):
"""Compile source json file (resolve $ref)."""
click.secho('Compile json file (resolve $ref): ', fg='green', nl=False)
click.secho(src_jsonfile.name)
data = jsonref.load(src_jsonfile)
if not output:
output = sys.stdout
json.dump(data, fp=output, indent=2)


def do_worker(marc21records, results, pid_required, debug):
def do_worker(marc21records, results, pid_required, debug, schema=None):
"""Worker for marc21 to json transformation."""
schema_in_bytes = resource_string(
'rero_ils.modules.documents.jsonschemas',
'documents/document-v0.0.1.json'
)
schema = jsonref.loads(schema_in_bytes.decode('utf8'))
for data in marc21records:
data_json = data['json']
pid = data_json.get('001', '???')
@@ -632,7 +611,8 @@ def do_worker(marc21records, results, pid_required, debug):
if not record.get("pid"):
# create dummy pid in data
record["pid"] = 'dummy'
validate(record, schema)
if schema:
validate(record, schema)
if record["$schema"] == 'dummy':
del record["$schema"]
if not pid_required:
@@ -660,11 +640,12 @@ class Marc21toJson():
__slots__ = ['xml_file', 'json_file_ok', 'xml_file_error', 'parallel',
'chunk', 'verbose', 'debug', 'pid_required',
'count', 'count_ok', 'count_ko', 'ctx',
'results', 'active_buffer', 'buffer', 'first_result']
'results', 'active_buffer', 'buffer', 'first_result',
'schema']

def __init__(self, xml_file, json_file_ok, xml_file_error,
parallel=8, chunk=5000,
verbose=False, debug=False, pid_required=False):
verbose=False, debug=False, pid_required=False, schema=None):
"""Constructor."""
self.count = 0
self.count_ok = 0
@@ -675,6 +656,7 @@ def __init__(self, xml_file, json_file_ok, xml_file_error,
self.parallel = parallel
self.chunk = chunk
self.verbose = verbose
self.schema = schema
self.first_result = True
if verbose:
click.echo('Main process pid: {pid}'.format(
@@ -746,7 +728,7 @@ def start_new_process(self):
new_process = self.ctx.Process(
target=do_worker,
args=(self.active_records, self.results, self.pid_required,
self.debug)
self.debug, self.schema)
)
self.wait_free_process()
new_process.start()
@@ -829,6 +811,7 @@ def active_records(self):
@click.option('-d', '--debug', 'debug', is_flag=True, default=False)
@click.option('-r', '--pidrequired', 'pid_required', is_flag=True,
default=False)
@with_appcontext
def marc21json(xml_file, json_file_ok, xml_file_error, parallel, chunk,
verbose, debug, pid_required):
"""Convert xml file to json with dojson."""
@@ -837,8 +820,12 @@ def marc21json(xml_file, json_file_ok, xml_file_error, parallel, chunk,
click.secho(' (validation tests pid) ', nl=False)
click.secho(xml_file.name)

path = current_jsonschemas.url_to_path(get_schema_for_resource(type))
schema = current_jsonschemas.get_schema(path=path)
schema = _records_state.replace_refs(schema)
transform = Marc21toJson(xml_file, json_file_ok, xml_file_error,
parallel, chunk, verbose, debug, pid_required)
parallel, chunk, verbose, debug, pid_required,
schema)

count, count_ok, count_ko = transform.counts()

194 changes: 117 additions & 77 deletions tests/data/documents.json
Original file line number Diff line number Diff line change
@@ -1,183 +1,223 @@
[
{
"type": "book",
"issuance": {
"main_type": "rdami:1001",
"subtype": "materialUnit"
},
"pid": "1",
"language": [
{
"value": "ita",
"value": "fre",
"type": "bf:Language"
}
],
"identifiedBy": [
{
"value": "9788898983056",
"type": "bf:Isbn"
},
{
"value": "R008400428",
"value": "0812781",
"type": "bf:Local",
"source": "RERO"
},
{
"source": "OCoLC",
"value": "ocn945401320",
"type": "bf:Local"
}
],
"authors": [
{
"type": "person",
"$ref": "https://mef.rero.ch/api/idref/20109313"
},
"responsibilityStatement": [
[
{
"value": "[\u00e9d.] Hans E. Bachmann"
}
],
[
{
"value": "trad. Henri Perrin"
}
]
],
"title": [
{
"type": "person",
"$ref": "https://mef.rero.ch/api/gnd/25552024"
"mainTitle": [
{
"value": "La norme S.I.A. 118 et l'actualit\u00e9 juridique en mati\u00e8re de construction"
}
],
"subtitle": [
{
"value": "un ouvrage pratique pour tous les entrepreneurs en rapport avec la construction, avec \u00e9tudes de cas, check-lists, exemples de contrats et de lettres relatifs au contrat d'entreprise"
}
],
"type": "bf:Title"
}
],
"title": "Le due tensioni : appunti per una ideologia della letteratura",
"provisionActivity": [
{
"type": "bf:Publication",
"place": [
{
"type": "bf:Place",
"country": "it"
}
],
"statement": [
{
"type": "bf:Place",
"label": [
{
"value": "Matelica (MC)"
"value": "Z\u00fcrich"
}
]
},
{
"type": "bf:Agent",
"label": [
{
"value": "Hacca"
"value": "Ed. Weka"
}
]
},
{
"type": "Date",
"label": [
{
"value": "2016"
"value": "1987->"
}
]
],
"type": "Date"
}
],
"startDate": 2016
"startDate": 1987,
"place": [
{
"country": "sz",
"type": "bf:Place"
}
]
}
],
"extent": "380 pages",
"formats": [
"21 cm"
],
"series": [
"extent": "8 classeurs",
"note": [
{
"name": "Novecento.0",
"number": "68"
"noteType": "otherPhysicalDetails",
"label": "ill."
},
{
"noteType": "general",
"label": "Publication \u00e0 feuillets mobiles avec mises \u00e0 jour p\u00e9riodiques"
}
],
"notes": [
"Collected writings",
"Includes preface (pages 9-22) and postface (pages 347-357)",
"Includes writings, published for the first time"
"illustrativeContent": [
"illustrations"
],
"dimensions": [
"23 cm"
],
"subjects": [
"Litt\u00e9rature",
"Culture",
"[Notes, esquisses, etc.]"
"contrat de construction",
"Suisse"
],
"partOf": [
"authors": [
{
"document": {
"$ref": "https://ils.rero.ch/api/documents/12"
},
"numbering": [
{
"volume": 25
}
]
"type": "person",
"$ref": "https://mef.rero.ch/api/idref/074755978"
},
{
"type": "person",
"$ref": "https://mef.rero.ch/api/rero/A003683610"
}
],
"titlesProper": [
"La norme SIA 118 et l'actualit\u00e9 juridique en mati\u00e8re de construction"
]
},
{
"type": "book",
"issuance": {
"main_type": "rdami:1001",
"subtype": "materialUnit"
},
"pid": "2",
"language": [
{
"value": "fre",
"value": "ger",
"type": "bf:Language"
}
],
"identifiedBy": [
{
"value": "R006039425",
"value": "9783503057221",
"type": "bf:Isbn"
},
{
"value": "R270072860",
"type": "bf:Local",
"source": "RERO"
}
],
"authors": [
{
"type": "person",
"$ref": "https://mef.rero.ch/api/mef/19985648"
"$ref": "https://mef.rero.ch/api/rero/A006010680"
}
],
"title": "Sukkwan island : roman",
"provisionActivity": [
"responsibilityStatement": [
[
{
"value": "von Erwin Zacharias"
}
]
],
"title": [
{
"type": "bf:Publication",
"place": [
"mainTitle": [
{
"type": "bf:Place",
"country": "fr"
"value": "Going Public einer Fussball-Kapitalgesellschaft"
}
],
"subtitle": [
{
"value": "rechtliche, betriebswirtschaftliche und strategische Konzepte bei der Vorbereitung der B\u00f6rseneinf\u00fchrung eines Fussball-Bundesligavereins"
}
],
"type": "bf:Title"
}
],
"provisionActivity": [
{
"type": "bf:Publication",
"statement": [
{
"type": "bf:Place",
"label": [
{
"value": "Paris"
"value": "Bielefeld"
}
]
},
{
"type": "bf:Agent",
"label": [
{
"value": "Gallmeister"
"value": "Erich Schmidt"
}
]
},
{
"type": "Date",
"label": [
{
"value": "2009"
"value": "1999"
}
]
],
"type": "Date"
}
],
"startDate": 2009
"startDate": 1999,
"place": [
{
"country": "gw",
"type": "bf:Place"
}
]
}
],
"extent": "191 p.",
"formats": [
"21 cm"
],
"series": [
"extent": "617 S.",
"note": [
{
"name": "Nature writing"
"noteType": "otherPhysicalDetails",
"label": "Taf."
}
],
"abstracts": [
"Une \u00eele sauvage du Sud de l'Alaska, accessible uniquement par bateau ou par hydravion, tout en for\u00eats humides et montagnes escarp\u00e9es. C'est dans ce d\u00e9cor que Jim d\u00e9cide d'emmener son fils de treize ans pour y vivre dans une cabane isol\u00e9e, une ann\u00e9e durant. Apr\u00e8s une succession d'\u00e9checs personnels, il voit l\u00e0 l'occasion de prendre un nouveau d\u00e9part et de renouer avec ce gar\u00e7on qu'il conna\u00eet si mal. La rigueur de cette vie et les d\u00e9faillances du p\u00e8re ne tardent pas \u00e0 transformer ce s\u00e9jour en cauchemar, et la situation devient vite incontr\u00f4lable. Jusqu'au drame violent et impr\u00e9visible qui scellera leur destin. Sukkwan Island est une histoire au suspense insoutenable. Avec ce roman qui nous entra\u00eene au coeur des t\u00e9n\u00e8bres de l'\u00e2me humaine, David Vann s'installe d'embl\u00e9e parmi les jeunes auteurs am\u00e9ricains de tout premier plan."
"dimensions": [
"21 cm"
]
}
]
41 changes: 41 additions & 0 deletions tests/unit/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2019 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Test cli."""

from os.path import dirname, join

from click.testing import CliRunner

from rero_ils.modules.cli import check_validate


def test_cli_validate(app, script_info):
    """Check that the validate cli reports each record of the fixture file."""
    # The fixture is resolved relative to this test module's directory.
    documents_path = join(dirname(__file__), '../data/documents.json')
    cli_args = [documents_path, 'doc', '-v']

    result = CliRunner().invoke(check_validate, cli_args, obj=script_info)

    # One header line followed by one line per record tested in verbose mode.
    expected_lines = [
        'Testing json schema for file',
        '\tTest record: 1',
        '\tTest record: 2'
    ]
    output_lines = result.output.strip().split('\n')
    assert output_lines == expected_lines

0 comments on commit 1142f2e

Please sign in to comment.