Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt to utils moved to core, #49 #66

Merged
merged 6 commits into from
Aug 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 8 additions & 13 deletions ocrd_tesserocr/binarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
image_from_page,
image_from_segment,
save_image_file,
membername
)

TOOL = 'ocrd-tesserocr-binarize'
LOG = getLogger('processor.TesserocrBinarize')
Expand All @@ -52,6 +46,7 @@ def process(self):

Produce a new output file by serialising the resulting hierarchy.
"""
# pylint: disable=attribute-defined-outside-init
try:
self.page_grp, self.image_grp = self.output_file_grp.split(',')
except ValueError:
Expand All @@ -77,16 +72,16 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image, page_xywh, _ = image_from_page(
self.workspace, page, page_id)
page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

regions = page.get_TextRegion() + page.get_TableRegion()
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_xywh = image_from_segment(
self.workspace, region, page_image, page_xywh)
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
if oplevel == 'region':
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
self._process_segment(tessapi, RIL.BLOCK, region, region_image, region_xywh,
Expand All @@ -98,8 +93,8 @@ def process(self):
LOG.warning("Page '%s' region '%s' contains no text lines",
page_id, region.id)
for line in lines:
line_image, line_xywh = image_from_segment(
self.workspace, line, region_image, region_xywh)
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh)
tessapi.SetPageSegMode(PSM.SINGLE_LINE)
self._process_segment(tessapi, RIL.TEXTLINE, line, line_image, line_xywh,
"line '%s'" % line.id, input_file.pageId,
Expand Down Expand Up @@ -129,7 +124,7 @@ def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, f
LOG.error('Cannot binarize %s', where)
return
# update METS (add the image file):
file_path = save_image_file(self.workspace, image_bin,
file_path = self.workspace.save_image_file(image_bin,
file_id,
page_id=page_id,
file_grp=self.image_grp)
Expand Down
413 changes: 0 additions & 413 deletions ocrd_tesserocr/common.py

This file was deleted.

9 changes: 3 additions & 6 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tesserocr
from ocrd_utils import (
getLogger, concat_padded,
bbox_from_points, points_from_bbox, bbox_from_xywh,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
Expand All @@ -18,10 +19,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
bbox_from_points, points_from_bbox,
bbox_from_xywh, save_image_file
)

TOOL = 'ocrd-tesserocr-crop'
LOG = getLogger('processor.TesserocrCrop')
Expand Down Expand Up @@ -113,7 +110,7 @@ def process(self):
# iterate over all text blocks and compare their
# bbox extent to the running min and max values
for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
image, xywh, index, para = component
image, xywh, index, _ = component
#
# the region reference in the reading order element
#
Expand Down Expand Up @@ -163,7 +160,7 @@ def process(self):
file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
if file_id == input_file.ID:
file_id = concat_padded(FILEGRP_IMG, n)
file_path = save_image_file(self.workspace, page_image,
file_path = self.workspace.save_image_file(page_image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
25 changes: 10 additions & 15 deletions ocrd_tesserocr/deskew.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ocrd_utils import (
getLogger, concat_padded,
membername,
MIMETYPE_PAGE
)
from ocrd_modelfactory import page_from_file
Expand All @@ -25,12 +26,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
image_from_page,
image_from_segment,
save_image_file,
membername
)

TOOL = 'ocrd-tesserocr-deskew'
LOG = getLogger('processor.TesserocrDeskew')
Expand All @@ -45,19 +40,19 @@ def __init__(self, *args, **kwargs):

def process(self):
"""Perform deskewing of the page / region with Tesseract on the workspace.

Open and deserialise PAGE input files and their respective images,
then iterate over the element hierarchy down to the region level
for all text and table regions.

Set up Tesseract to recognise the region image's orientation, skew
and script (with both OSD and AnalyseLayout). Rotate the image
accordingly, and annotate the angle, readingDirection and textlineOrder.

Create a corresponding image file, and reference it as AlternativeImage
in the region element and as file with a fileGrp USE `OCR-D-IMG-DESKEW`
in the workspace.

Produce a new output file by serialising the resulting hierarchy.
"""
oplevel = self.parameter['operation_level']
Expand All @@ -84,8 +79,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image, page_xywh, page_image_info = image_from_page(
self.workspace, page, page_id)
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand All @@ -102,8 +97,8 @@ def process(self):
if not regions:
LOG.warning("Page '%s' contains no text regions", page_id)
for region in regions:
region_image, region_xywh = image_from_segment(
self.workspace, region, page_image, page_xywh)
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
self._process_segment(tessapi, region, region_image, region_xywh,
"region '%s'" % region.id, input_file.pageId,
file_id + '_' + region.id)
Expand Down Expand Up @@ -269,7 +264,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i
# points = points_from_x0y0x1y1(list(baseline[0]) + list(baseline[1]))
# segment.add_Baseline(BaselineType(points=points))
# update METS (add the image file):
file_path = save_image_file(self.workspace, image,
file_path = self.workspace.save_image_file(image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
39 changes: 17 additions & 22 deletions ocrd_tesserocr/recognize.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
PyTessBaseAPI, get_languages)

from ocrd_utils import (
getLogger, concat_padded,
points_from_x0y0x1y1,
xywh_from_points, points_from_xywh,
MIMETYPE_PAGE)
getLogger,
concat_padded,
points_from_polygon,
polygon_from_x0y0x1y1,
coordinates_for_segment,
MIMETYPE_PAGE
)
from ocrd_models.ocrd_page import (
CoordsType,
GlyphType, WordType,
Expand All @@ -21,14 +24,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
points_from_polygon,
xywh_from_polygon,
polygon_from_x0y0x1y1,
coordinates_for_segment,
image_from_page,
image_from_segment
)

TOOL = 'ocrd-tesserocr-recognize'
LOG = getLogger('processor.TesserocrRecognize')
Expand Down Expand Up @@ -133,8 +128,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image, page_xywh, page_image_info = image_from_page(
self.workspace, page, page_id)
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand Down Expand Up @@ -165,8 +160,8 @@ def process(self):

def _process_regions(self, tessapi, regions, page_image, page_xywh):
for region in regions:
region_image, region_xywh = image_from_segment(
self.workspace, region, page_image, page_xywh)
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
if self.parameter['textequiv_level'] == 'region':
tessapi.SetImage(region_image)
tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
Expand All @@ -191,8 +186,8 @@ def _process_lines(self, tessapi, textlines, region_image, region_xywh):
for line in textlines:
if self.parameter['overwrite_words']:
line.set_Word([])
line_image, line_xywh = image_from_segment(
self.workspace, line, region_image, region_xywh)
line_image, line_xywh = self.workspace.image_from_segment(
line, region_image, region_xywh)
# TODO: Tesseract works better if the line images have a 5px margin everywhere
tessapi.SetImage(line_image)
# RAW_LINE fails with pre-LSTM models, but sometimes better with LSTM models
Expand Down Expand Up @@ -268,8 +263,8 @@ def _process_words_in_line(self, result_it, line, line_xywh):

def _process_existing_words(self, tessapi, words, line_image, line_xywh):
for word in words:
word_image, word_xywh = image_from_segment(
self.workspace, word, line_image, line_xywh)
word_image, word_xywh = self.workspace.image_from_segment(
word, line_image, line_xywh)
tessapi.SetImage(word_image)
tessapi.SetPageSegMode(PSM.SINGLE_WORD)
if self.parameter['textequiv_level'] == 'word':
Expand All @@ -296,8 +291,8 @@ def _process_existing_words(self, tessapi, words, line_image, line_xywh):

def _process_existing_glyphs(self, tessapi, glyphs, word_image, word_xywh):
for glyph in glyphs:
glyph_image, glyph_xywh = image_from_segment(
self.workspace, glyph, word_image, word_xywh)
glyph_image, _ = self.workspace.image_from_segment(
glyph, word_image, word_xywh)
tessapi.SetImage(glyph_image)
tessapi.SetPageSegMode(PSM.SINGLE_CHAR)
LOG.debug("Recognizing text in glyph '%s'", glyph.id)
Expand Down
12 changes: 4 additions & 8 deletions ocrd_tesserocr/segment_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
)

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
image_from_page,
image_from_segment
)

TOOL = 'ocrd-tesserocr-segment-line'
LOG = getLogger('processor.TesserocrSegmentLine')
Expand Down Expand Up @@ -70,8 +66,8 @@ def process(self):
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
page_image, page_xywh, page_image_info = image_from_page(
self.workspace, page, page_id)
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand All @@ -86,8 +82,8 @@ def process(self):
else:
LOG.warning('keeping existing TextLines in region "%s"', region.id)
LOG.debug("Detecting lines in region '%s'", region.id)
region_image, region_xywh = image_from_segment(
self.workspace, region, page_image, page_xywh)
region_image, region_xywh = self.workspace.image_from_segment(
region, page_image, page_xywh)
tessapi.SetImage(region_image)
for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
line_id = '%s_line%04d' % (region.id, line_no)
Expand Down
27 changes: 13 additions & 14 deletions ocrd_tesserocr/segment_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@
)

from ocrd_utils import (
getLogger, concat_padded,
getLogger,
concat_padded,
points_from_x0y0x1y1,
points_from_xywh, xywh_from_points,
MIMETYPE_PAGE)
points_from_xywh,
xywh_from_points,
MIMETYPE_PAGE,
points_from_polygon,
membername
)
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
MetadataItemType,
Expand All @@ -29,12 +34,6 @@
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL
from .common import (
image_from_page,
save_image_file,
points_from_polygon,
membername
)

TOOL = 'ocrd-tesserocr-segment-region'
LOG = getLogger('processor.TesserocrSegmentRegion')
Expand Down Expand Up @@ -106,7 +105,7 @@ def process(self):
page.set_TextRegion([])
else:
LOG.warning('keeping existing TextRegions')
# todo: also make non-text regions protected?
# TODO: also make non-text regions protected?
page.set_AdvertRegion([])
page.set_ChartRegion([])
page.set_ChemRegion([])
Expand All @@ -126,8 +125,8 @@ def process(self):
page.set_ReadingOrder([])
else:
LOG.warning('keeping existing ReadingOrder')
page_image, page_xywh, page_image_info = image_from_page(
self.workspace, page, page_id)
page_image, page_xywh, page_image_info = self.workspace.image_from_page(
page, page_id)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
Expand Down Expand Up @@ -259,9 +258,9 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id):
# GetBinaryImage).
# You have been warned!
# get the raw image (masked by white space along the block polygon):
region_image, top, left = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image)
region_image, _, _ = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image)
# update METS (add the image file):
file_path = save_image_file(self.workspace, region_image,
file_path = self.workspace.save_image_file(region_image,
file_id + '_' + ID,
page_id=page_id,
file_grp=FILEGRP_IMG)
Expand Down
Loading