Skip to content

Commit b48e771

Browse files
committed
documents: add files support
- Uses rero-invenio-files to add file support to documents.
- Pushes the full text into the document index.
- Creates a jsonschema store to support both the local:// and the bib.rero.ch prefixes.
- Adds a full-text boosting field.
- Creates random files during the setup.

Co-Authored-by: Johnny Mariéthoz <[email protected]>
1 parent 06a39c9 commit b48e771

File tree

15 files changed

+920
-76
lines changed

15 files changed

+920
-76
lines changed

poetry.lock

+538-5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ jsonresolver = "*"
115115
urllib3 = "<2.0.0"
116116
pyparsing = "^3.1.1"
117117
flask-wiki = "^0.3.1"
118+
rero-invenio-files = {git = "https://github.com/rero/rero-invenio-files.git"}
118119

119120
[tool.poetry.dev-dependencies]
120121
## Python packages development dependencies (order matters)

rero_ils/config.py

+11
Original file line numberDiff line numberDiff line change
@@ -2366,6 +2366,8 @@ def _(x):
23662366
'documents': {
23672367
'autocomplete_title': 3,
23682368
'title\.*': 3,
2369+
'fulltext': 3,
2370+
'fulltext\.*': 6,
23692371
'contribution.entity.authorized_access_point_*': 2,
23702372
'contribution.entity.authorized_access_point': 2,
23712373
'publicationYearText': 2,
@@ -3974,3 +3976,12 @@ def search_type(field):
39743976
}
39753977
},
39763978
]
3979+
3980+
3981+
# Files storage classes: a single class, keyed "L" and labelled "Local".
FILES_REST_STORAGE_CLASS_LIST = dict(L="Local")

# Storage class used by default; must be a key of the list above.
FILES_REST_DEFAULT_STORAGE_CLASS = "L"

# JSON reference resolution: the resolver class and the import path of the
# schema store (see ``rero_ils.modules.utils.refresolver_store``).
RECORDS_REFRESOLVER_CLS = "invenio_records.resolver.InvenioRefResolver"
RECORDS_REFRESOLVER_STORE = "rero_ils.modules.utils.refresolver_store"

rero_ils/modules/cli/fixtures.py

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from werkzeug.local import LocalProxy
4040

4141
from ..collections.cli import create_collections
42+
from ..files.cli import create_files
4243
from ..holdings.cli import create_patterns
4344
from ..ill_requests.cli import create_ill_requests
4445
from ..items.cli import create_items, reindex_items
@@ -66,6 +67,7 @@ def fixtures():
6667
fixtures.add_command(create_items)
6768
fixtures.add_command(reindex_items)
6869
fixtures.add_command(create_loans)
70+
fixtures.add_command(create_files)
6971
fixtures.add_command(load_virtua_transactions)
7072
fixtures.add_command(create_patterns)
7173
fixtures.add_command(create_ill_requests)

rero_ils/modules/documents/dumpers/indexer.py

+26
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"""Indexing dumper."""
2020

2121
from flask import current_app
22+
from invenio_access.permissions import system_identity
2223
from invenio_records.dumpers import Dumper
2324

2425
from ..extensions import TitleExtension
@@ -227,6 +228,30 @@ def _process_provision_activity(record, data):
227228
data['sort_date_new'] = end_date or start_date
228229
data['sort_date_old'] = start_date
229230

231+
def _process_files(self, record, data):
    """Add full text from files.

    Looks up the file records whose ``metadata.links`` contains
    ``doc_<record.pid>``, reads the content of every attached file whose
    metadata type is ``fulltext`` and stores the collected entries under
    ``data['files']``.

    :param record: the document record being dumped (its ``pid`` is used to
        find the linked file records).
    :param data: dict - the dumped data, updated in place. The ``files`` key
        is only set when at least one full text file has been found.
    """
    ext = current_app.extensions['rero-invenio-files']
    sfr = ext.records_service
    search = sfr.search_request(
        system_identity, dict(size=1), sfr.record_cls, sfr.config.search
    )
    # only the uuid is needed: the file record is loaded with it below
    search = search.source('uuid')\
        .filter('term', metadata__links=f'doc_{record.pid}')
    files = []
    for rec in search.scan():
        record_file = sfr.record_cls.get_record(rec.uuid)
        for file_name in record_file.files:
            file = record_file.files[file_name]
            metadata = file.get('metadata', {})
            if metadata.get('type') != 'fulltext':
                continue
            # always release the stream, even if reading fails, so that
            # indexing many documents does not leak file handles
            stream = file.get_stream('r')
            try:
                text = stream.read()
            finally:
                stream.close()
            files.append(dict(
                file_name=metadata.get('fulltext_for', file_name),
                text=text,
                rec_id=record_file.pid.pid_value
            ))
    if files:
        data['files'] = files
254+
230255
def dump(self, record, data):
231256
"""Dump a document instance with basic document information's.
232257
@@ -240,6 +265,7 @@ def dump(self, record, data):
240265
self._process_sort_title(record, data)
241266
self._process_host_document(record, data)
242267
self._process_provision_activity(record, data)
268+
self._process_files(record, data)
243269
# import pytz
244270
# from datetime import datetime
245271
# iso_now = pytz.utc.localize(datetime.utcnow()).isoformat()

rero_ils/modules/documents/mappings/v7/documents/document-v0.0.1.json

+43-1
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,55 @@
3131
"mappings": {
3232
"date_detection": false,
3333
"numeric_detection": false,
34+
"_source": {
35+
"excludes": [
36+
"files.text"
37+
]
38+
},
3439
"properties": {
3540
"$schema": {
3641
"type": "keyword"
3742
},
3843
"pid": {
3944
"type": "keyword"
4045
},
46+
"files": {
47+
"type": "object",
48+
"properties": {
49+
"text": {
50+
"type": "text",
51+
"index": false,
52+
"copy_to": "fulltext"
53+
},
54+
"file_name": {
55+
"type": "keyword"
56+
},
57+
"rec_id": {
58+
"type": "keyword"
59+
}
60+
}
61+
},
62+
"fulltext": {
63+
"type": "text",
64+
"fields": {
65+
"eng": {
66+
"type": "text",
67+
"analyzer": "english"
68+
},
69+
"fre": {
70+
"type": "text",
71+
"analyzer": "french"
72+
},
73+
"ger": {
74+
"type": "text",
75+
"analyzer": "german"
76+
},
77+
"ita": {
78+
"type": "text",
79+
"analyzer": "italian"
80+
}
81+
}
82+
},
4183
"title": {
4284
"type": "object",
4385
"properties": {
@@ -2156,4 +2198,4 @@
21562198
}
21572199
}
21582200
}
2159-
}
2201+
}

rero_ils/modules/ext.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818

1919
"""RERO ILS invenio module declaration."""
2020

21-
from __future__ import absolute_import, print_function
22-
2321
import logging
2422

2523
import jinja2
@@ -145,12 +143,25 @@ def load_actions(sender, app):
145143
for action in app.config.get('RERO_ILS_PERMISSIONS_ACTIONS', []):
146144
access_ext.register_action(obj_or_import_string(action))
147145

146+
# add jsonschema resolution from local:// and bib.rero.ch
147+
data = app.extensions["invenio-jsonschemas"].refresolver_store()
148+
app.extensions['rero-ils'].jsonschema_store = dict(
149+
**data,
150+
**{
151+
k.replace('local://', 'https://bib.rero.ch/schemas/'): v
152+
for k, v in data.items()
153+
}
154+
)
155+
148156

149157
class REROILSAPP(object):
150158
"""rero-ils extension."""
151159

152160
def __init__(self, app=None):
153161
"""RERO ILS App module."""
162+
# jsonschema store
163+
# SEE: RECORDS_REFRESOLVER_STORE for more details
164+
self.jsonschema_store = {}
154165
if app:
155166
self.init_app(app)
156167
# force to load ils template before others

rero_ils/modules/files/__init__.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2019-2024 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""Files module."""

rero_ils/modules/files/cli.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# RERO ILS
4+
# Copyright (C) 2019-2022 RERO
5+
#
6+
# This program is free software: you can redistribute it and/or modify
7+
# it under the terms of the GNU Affero General Public License as published by
8+
# the Free Software Foundation, version 3 of the License.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU Affero General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU Affero General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
18+
"""Click command-line interface for file record management."""
19+
20+
from io import BytesIO
21+
from random import choice, shuffle
22+
23+
import click
24+
from flask import current_app
25+
from flask.cli import with_appcontext
26+
from invenio_access.permissions import system_identity
27+
from invenio_db import db
28+
from invenio_search import current_search
29+
from rero_invenio_files.pdf import PDFGenerator
30+
31+
from rero_ils.modules.documents.api import Document
32+
from rero_ils.modules.documents.dojson.contrib.jsontodc import dublincore
33+
from rero_ils.modules.libraries.api import Library
34+
35+
36+
def create_pdf_file(document):
    """Create a pdf file from a document record.

    The document is converted to its dublin core representation; the titles,
    the contributors and the descriptions found there are rendered into the
    PDF.

    :param document: Document - a given bibliographic document
    :returns: a byte stream of the pdf content
    """
    # get the dublin core format of the given document
    dc = dublincore.do(document, "english")
    data = dict(header=f"Document ({document.pid})")
    if titles := dc.get("titles"):
        data["title"] = "\n".join(titles)
    if contributors := dc.get("contributors"):
        data["authors"] = contributors
    # Some fields are not well converted
    # TODO: remove this when the dc conversion will be fixed
    if descriptions := dc.get("descriptions"):
        try:
            data["summary"] = "\n".join(descriptions)
        except TypeError:
            # badly converted descriptions (non string values) cannot be
            # joined: the summary is skipped rather than failing the whole
            # PDF generation
            pass
    generator = PDFGenerator(data)
    generator.render()
    return generator.output()
59+
60+
61+
def create_pdf_record_files(document, metadata, flush=False):
    """Create a file record holding a generated PDF linked to a document.

    :param document: Document - the document record.
    :param metadata: dict - file metadata. It is updated in place: a
        ``doc_<pid>`` entry is appended to its ``links`` list.
    :param flush: bool - if True, flush and refresh the file record index
        right after the new record has been indexed.
    """
    # link the file record to the document
    links = metadata.setdefault("links", [])
    links.append(f"doc_{document.pid}")

    # services provided by the rero-invenio-files extension
    files_ext = current_app.extensions["rero-invenio-files"]
    rec_service = files_ext.records_service
    rec_files_service = files_ext.records_files_service

    # PDF content generated from the document
    pdf_stream = BytesIO(create_pdf_file(document))

    # create, commit and index the file record
    file_record = rec_service.record_cls.create(data={"metadata": metadata})
    file_record.commit()
    record_id = file_record["id"]
    rec_service.indexer.index_by_id(file_record.id)
    if flush:
        index_name = rec_service.record_cls.index._name
        current_search.flush_and_refresh(index_name)

    # attach the PDF file to the file record
    pdf_name = f"doc_{document.pid}.pdf"
    rec_files_service.init_files(
        system_identity, record_id, [{"key": pdf_name}]
    )
    rec_files_service.set_file_content(
        system_identity, record_id, pdf_name, pdf_stream
    )
    rec_files_service.commit_file(system_identity, record_id, pdf_name)
    db.session.commit()
89+
90+
91+
@click.command("create_files")
@click.argument("number", type=int)
@with_appcontext
def create_files(number):
    """Create attached files.

    For each file a document and a library are picked at random among the
    existing ones; the same document can be picked several times.

    :param number: integer - number of the files to generate
    :raises click.ClickException: if files are requested but no document or
        no library exists.
    """
    collections = ["col1", "col2", "col3"]
    doc_pids = list(Document.get_all_pids())
    lib_pids = list(Library.get_all_pids())
    # fail with a readable message instead of the IndexError raised by
    # `choice` on an empty sequence
    if number > 0 and not (doc_pids and lib_pids):
        raise click.ClickException(
            "Cannot create files: at least one document and one library "
            "are required."
        )
    shuffle(doc_pids)

    for _ in range(number):
        pid = choice(doc_pids)
        click.echo(f"Create file for {pid}")
        doc = Document.get_record_by_pid(pid)
        lib_pid = choice(lib_pids)
        metadata = dict(
            collections=[choice(collections)],
            owners=[f"lib_{lib_pid}"]
        )
        create_pdf_record_files(document=doc, metadata=metadata)

rero_ils/modules/utils.py

+7
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,13 @@
5050
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
5151
from requests.adapters import HTTPAdapter
5252
from requests.packages.urllib3.util.retry import Retry
53+
from werkzeug.local import LocalProxy
54+
55+
def _current_jsonschema_store():
    """Return the jsonschema store held by the ``rero-ils`` extension."""
    return current_app.extensions['rero-ils'].jsonschema_store


# jsonschema resolver
# SEE: RECORDS_REFRESOLVER_STORE for more details
# The proxy defers the lookup until the store is actually accessed.
refresolver_store = LocalProxy(_current_jsonschema_store)
5360

5461

5562
def get_mef_url(entity_type):

scripts/setup

+7
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,10 @@ eval ${PREFIX} invenio reroils index queue init
235235
info_msg "Delete invenio_circulations index"
236236
eval ${PREFIX} invenio index delete loans-loan-v1.0.0 --force --yes-i-know
237237

238+
info_msg "Initialize files location."
239+
eval rm -fr data/files
240+
eval ${PREFIX} invenio files location create --default fixtures "./data/files"
241+
238242
if ${ES_MAPPING}
239243
then
240244
info_msg "ES mappings:"
@@ -421,6 +425,9 @@ eval ${PREFIX} invenio reroils fixtures create --pid_type hold --schema 'https:/
421425
info_msg "- Items: ${ITEMS} ${CREATE_LAZY} ${DONT_STOP}"
422426
eval ${PREFIX} invenio reroils fixtures create --pid_type item --schema 'https://bib.rero.ch/schemas/items/item-v0.0.1.json' ${ITEMS} --append ${CREATE_LAZY} ${DONT_STOP}
423427

428+
info_msg "- Generate files"
429+
eval ${PREFIX} invenio reroils fixtures create_files 500
430+
424431
# index items
425432
eval ${PREFIX} invenio reroils index reindex -t item --yes-i-know
426433
if [ ${INDEX_PARALLEL} -gt 0 ]; then

0 commit comments

Comments
 (0)