Skip to content

Commit

Permalink
toggleable behavior for empty lines
Browse files (browse the repository at this point in the history)
  • Loading branch information
kba committed Apr 6, 2021
1 parent eb256b7 commit cd0bb8d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
10 changes: 8 additions & 2 deletions ocrd_page_to_alto/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,19 @@
# CLI entry point: takes a FILENAME argument, builds an OcrdPageAltoConverter
# for it, runs convert() and prints the converter object.
# NOTE(review): this listing is a diff rendered without +/- markers and without
# indentation, so lines removed and lines added by the commit appear side by side.
@click.command()
@click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not')
@click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace')
# Boolean flag pair; default=False means --no-skip-empty-lines unless the user opts in.
@click.option('--skip-empty-lines/--no-skip-empty-lines', default=False, help='Whether to omit or keep empty lines in PAGE-XML')
@click.argument('filename')
# NOTE(review): the next line appears to be the pre-commit signature (one of the
# 2 deletions reported for cli.py); the signature after it, which also accepts
# skip_empty_lines, supersedes it.
def main(check_words, check_border, filename):
def main(check_words, check_border, skip_empty_lines, filename):
"""
Convert PAGE to ALTO
"""
initLogging()
# NOTE(review): the single-line constructor call below appears to be the removed
# pre-commit version; the multi-line call that follows replaces it and
# additionally forwards skip_empty_lines to the converter.
converter = OcrdPageAltoConverter(page_filename=filename, check_words=check_words, check_border=check_border)
converter = OcrdPageAltoConverter(
page_filename=filename,
check_words=check_words,
check_border=check_border,
skip_empty_lines=skip_empty_lines,
)
converter.convert()
# Writes the converter's string form to stdout.
# presumably the serialized ALTO document — confirm against the class's __str__
print(converter)

Expand Down
16 changes: 9 additions & 7 deletions ocrd_page_to_alto/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,16 @@

class OcrdPageAltoConverter():

def __init__(self, *, check_words=True, check_border=True, page_filename=None, page_etree=None, pcgts=None, logger=None):
def __init__(self, *, check_words=True, check_border=True, skip_empty_lines=False, page_filename=None, page_etree=None, pcgts=None, logger=None):
"""
Keyword Args:
check_words (boolean): Whether to check if PAGE-XML contains any words before conversion and fail if not
check_border (boolean): Whether to abort if neither Border nor PrintSpace is defined
skip_empty_lines (boolean): Whether to omit empty lines completely (True) or create a placeholder empty String in ALTO (False)
"""
if not (page_filename or page_etree or pcgts):
raise ValueError("Must pass either pcgts, page_etree or page_filename to constructor")
self.skip_empty_lines = skip_empty_lines
self.logger = logger if logger else getLogger('page-to-alto')
if pcgts:
self.page_pcgts = pcgts
Expand Down Expand Up @@ -161,17 +163,17 @@ def convert_metadata(self):

def _convert_textlines(self, reg_alto, reg_page):
for line_page in reg_page.get_TextLine():
is_empty_line = not(line_page.get_TextEquiv() and line_page.get_TextEquiv()[0].get_Unicode())
if is_empty_line and self.skip_empty_lines:
return
line_alto = ET.SubElement(reg_alto, 'TextLine')
if is_empty_line:
word_alto_empty = ET.SubElement(line_alto, 'String')
word_alto_empty.set('CONTENT', '')
set_alto_id_from_page_id(line_alto, line_page)
set_alto_xywh_from_coords(line_alto, line_page)
set_alto_shape_from_coords(line_alto, line_page)
is_empty_line = not(line_page.get_TextEquiv() and line_page.get_TextEquiv()[0].get_Unicode())
if not line_page.get_Word() and not is_empty_line:
raise ValueError("pc:TextLine '%s' has no pc:Word" % line_page.id)
# XXX ALTO does not allow TextLine without at least one String
if is_empty_line:
word_alto_empty = ET.SubElement(line_alto, 'String')
word_alto_empty.set('CONTENT', '')
for word_page in line_page.get_Word():
word_alto = ET.SubElement(line_alto, 'String')
set_alto_id_from_page_id(word_alto, word_page)
Expand Down

0 comments on commit cd0bb8d

Please sign in to comment.