Skip to content

Commit 5a32ea3

Browse files
committed
check whether PAGE input has words, toggleable, OCR-D/ocrd_calamari#63
1 parent 1fdcaf0 commit 5a32ea3

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

ocrd_page_to_alto/cli.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
from ocrd_utils import initLogging
44

55
@click.command()
6+
@click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not')
67
@click.argument('filename')
7-
def main(filename):
8+
def main(check_words, filename):
89
"""
910
Convert PAGE to ALTO
1011
"""
1112
initLogging()
12-
converter = OcrdPageAltoConverter(page_filename=filename)
13+
converter = OcrdPageAltoConverter(page_filename=filename, check_words=check_words)
1314
converter.convert()
1415
print(converter)
1516

ocrd_page_to_alto/convert.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# pylint: disable=no-member, c-extension-no-member
22
from lxml import etree as ET
3-
from ocrd_models.ocrd_page import (parse, parseString)
3+
from ocrd_models.ocrd_page import parse, parseString, to_xml
44
from ocrd_models.constants import NAMESPACES as NAMESPACES_
55
from ocrd_utils import getLogger, xywh_from_points
66

@@ -36,7 +36,11 @@
3636

3737
class OcrdPageAltoConverter():
3838

39-
def __init__(self, *, page_filename=None, page_etree=None, pcgts=None, logger=None):
39+
def __init__(self, *, check_words=True, page_filename=None, page_etree=None, pcgts=None, logger=None):
40+
"""
41+
Keyword Args:
42+
check_words (boolean): Whether to check if PAGE-XML contains any words before conversion and fail if not
43+
"""
4044
if not (page_filename or page_etree or pcgts):
4145
raise ValueError("Must pass either pcgts, page_etree or page_filename to constructor")
4246
self.logger = logger if logger else getLogger('page-to-alto')
@@ -47,6 +51,10 @@ def __init__(self, *, page_filename=None, page_etree=None, pcgts=None, logger=No
4751
else:
4852
self.page_pcgts = parse(page_filename)
4953
self.page_page = self.page_pcgts.get_Page()
54+
if check_words:
55+
xml_ = ET.fromstring(to_xml(self.page_pcgts).encode('utf-8'))
56+
if xml_.find('.//page:Word', NAMESPACES) is None:
57+
raise ValueError("The PAGE-XML to transform contains no Words, hence nothing to convert.")
5058
# TODO self.check_for_words()
5159
self.alto_alto, self.alto_description, self.alto_styles, self.alto_tags, self.alto_page = self.create_alto()
5260
self.alto_printspace = self.convert_border()

tests/test_convert.py

+4
Original file line numberDiff line numberDiff line change
@@ -35,5 +35,9 @@ def test_convert3():
3535
print(c)
3636
# assert 0
3737

38+
def test_convert_no_words():
39+
with raises(ValueError) as e:
40+
OcrdPageAltoConverter(page_filename='tests/assets/SBB0000F29300010000/data/OCR-D-GT-PAGE/FILE_0001_FULLTEXT.xml')
41+
3842
if __name__ == "__main__":
3943
main([__file__])

0 commit comments

Comments
 (0)