1
1
# pylint: disable=no-member, c-extension-no-member
2
2
from lxml import etree as ET
3
- from ocrd_models .ocrd_page import ( parse , parseString )
3
+ from ocrd_models .ocrd_page import parse , parseString , to_xml
4
4
from ocrd_models .constants import NAMESPACES as NAMESPACES_
5
5
from ocrd_utils import getLogger , xywh_from_points
6
6
36
36
37
37
class OcrdPageAltoConverter ():
38
38
39
- def __init__ (self , * , page_filename = None , page_etree = None , pcgts = None , logger = None ):
39
+ def __init__ (self , * , check_words = True , page_filename = None , page_etree = None , pcgts = None , logger = None ):
40
+ """
41
+ Keyword Args:
42
+ check_words (boolean): Whether to check that the PAGE-XML contains at least one Word before conversion and raise a ValueError if it contains none
43
+ """
40
44
if not (page_filename or page_etree or pcgts ):
41
45
raise ValueError ("Must pass either pcgts, page_etree or page_filename to constructor" )
42
46
self .logger = logger if logger else getLogger ('page-to-alto' )
@@ -47,6 +51,10 @@ def __init__(self, *, page_filename=None, page_etree=None, pcgts=None, logger=No
47
51
else :
48
52
self .page_pcgts = parse (page_filename )
49
53
self .page_page = self .page_pcgts .get_Page ()
54
+ if check_words :
55
+ xml_ = ET .fromstring (to_xml (self .page_pcgts ).encode ('utf-8' ))
56
+ if xml_ .find ('.//page:Word' , NAMESPACES ) is None :
57
+ raise ValueError ("The PAGE-XML to transform contains no Words, hence nothing to convert." )
50
58
# TODO self.check_for_words()
51
59
self .alto_alto , self .alto_description , self .alto_styles , self .alto_tags , self .alto_page = self .create_alto ()
52
60
self .alto_printspace = self .convert_border ()
0 commit comments