# gt-binarize-page-olena-sauvola-denoise-ocropy-deskew-page-ocropy-clip-deskew-region-tesseract-resegment-dewarp-ocr-ocropy-tesseract-extract-lines.mk
# Install by copying (or symlinking) makefiles into a directory
# where all OCR-D workspaces (unpacked BagIts) reside. Then
# chdir to that location.
# Call via:
# `make -f WORKFLOW-CONFIG.mk WORKSPACE-DIRS` or
# `make -f WORKFLOW-CONFIG.mk all` or just
# `make -f WORKFLOW-CONFIG.mk`
# To rebuild partially, you must pass -W to recursive make:
# `make -f WORKFLOW-CONFIG.mk EXTRA_MAKEFLAGS="-W FILEGRP"`
###
# From here on, custom configuration begins.
# Print a short, human-readable summary of what this workflow does.
# Declared phony so that a file or directory named "info" in the working
# directory can never make this target look up to date.
# Every recipe line must start with a hard TAB.
.PHONY: info
info:
	@echo "Read GT line segmentation,"
	@echo "then binarize+denoise+deskew pages,"
	@echo "then clip+deskew regions,"
	@echo "then resegment+dewarp lines,"
	@echo "then recognize lines with various Ocropus+Tesseract models,"
	@echo "and finally extract line images and line texts"
	@echo "(both the GT and OCR versions) into one directory,"
	@echo "with conventional filename suffixes for OCR/post-correction training."
# Name of the file group the whole chain starts from.
INPUT = OCR-D-GT-SEG-LINE

# Runs `ocrd workspace find ... --download` for the input file group ($@)
# and then, "just in case", for OCR-D-IMG as well.
# Each recipe line starts with a hard TAB and runs in its own shell.
$(INPUT):
	ocrd workspace find -G $@ --download
	ocrd workspace find -G OCR-D-IMG --download
# Page binarization step, chained after $(INPUT).
# TOOL and PARAMS are target-specific settings; they are presumably
# consumed by the generic recipe of the included Makefile (not visible here).
BIN := $(INPUT)-BINPAGE-sauvola
$(BIN): TOOL := ocrd-olena-binarize
$(BIN): PARAMS := "impl": "sauvola-ms-split"
$(BIN): $(INPUT)
# Page-level denoising step, chained after $(BIN).
DEN := $(BIN)-DENOISE-ocropy
$(DEN): TOOL := ocrd-cis-ocropy-denoise
$(DEN): PARAMS := "level-of-operation": "page", "noise_maxsize": 3.0
$(DEN): $(BIN)
# Page-level deskewing with the Tesseract-based tool, chained after $(DEN).
FLIP := $(DEN)-DESKEW-tesseract
$(FLIP): TOOL := ocrd-tesserocr-deskew
$(FLIP): PARAMS := "operation_level": "page"
$(FLIP): $(DEN)
# Page-level deskewing with the Ocropy-based tool (maxskew 5),
# chained after $(FLIP).
DESK := $(FLIP)-DESKEW-ocropy
$(DESK): TOOL := ocrd-cis-ocropy-deskew
$(DESK): PARAMS := "level-of-operation": "page", "maxskew": 5
$(DESK): $(FLIP)
# Clipping step, chained after $(DESK). No PARAMS are set for this tool.
CLIP := $(DESK)-CLIP
$(CLIP): TOOL := ocrd-cis-ocropy-clip
$(CLIP): $(DESK)
# Region-level deskewing with the Tesseract-based tool, chained after $(CLIP).
FLIP2 := $(CLIP)-DESKEW-tesseract
$(FLIP2): TOOL := ocrd-tesserocr-deskew
$(FLIP2): PARAMS := "operation_level": "region"
$(FLIP2): $(CLIP)
# Region-level deskewing with the Ocropy-based tool, chained after $(FLIP2).
DESK2 := $(FLIP2)-DESKEW-ocropy
$(DESK2): TOOL := ocrd-cis-ocropy-deskew
$(DESK2): PARAMS := "level-of-operation": "region"
$(DESK2): $(FLIP2)
# Resegmentation step, chained after $(DESK2). No PARAMS are set for this tool.
RESEG := $(DESK2)-RESEG
$(RESEG): TOOL := ocrd-cis-ocropy-resegment
$(RESEG): $(DESK2)
# Dewarping step, chained after $(RESEG). No PARAMS are set for this tool.
# $(DEW) is the last preprocessing result and the input of all recognizers.
DEW := $(RESEG)-DEWARP
$(DEW): TOOL := ocrd-cis-ocropy-dewarp
$(DEW): $(RESEG)
# File group names of the eight recognition results.
# Each name is $(DEW) with its leading "$(INPUT)-" replaced by an
# "OCR-D-OCR-<engine>-<model>-" prefix.
OCR1 := $(patsubst $(INPUT)-%,OCR-D-OCR-OCRO-fraktur-%,$(DEW))
OCR2 := $(patsubst $(INPUT)-%,OCR-D-OCR-OCRO-frakturjze-%,$(DEW))
OCR3 := $(patsubst $(INPUT)-%,OCR-D-OCR-TESS-Fraktur-%,$(DEW))
OCR4 := $(patsubst $(INPUT)-%,OCR-D-OCR-TESS-Fraktur-Latin-%,$(DEW))
OCR5 := $(patsubst $(INPUT)-%,OCR-D-OCR-TESS-frk-%,$(DEW))
OCR6 := $(patsubst $(INPUT)-%,OCR-D-OCR-TESS-frk-deu-%,$(DEW))
OCR7 := $(patsubst $(INPUT)-%,OCR-D-OCR-TESS-gt4histocr-%,$(DEW))
OCR8 := $(patsubst $(INPUT)-%,OCR-D-OCR-CALA-gt4histocr-%,$(DEW))

# Every recognition result is built from the dewarped lines.
$(OCR1) $(OCR2) $(OCR3) $(OCR4) \
$(OCR5) $(OCR6) $(OCR7) $(OCR8): $(DEW)
# Ocropy recognizer runs: same tool, one model each.
$(OCR1): TOOL := ocrd-cis-ocropy-recognize
$(OCR1): PARAMS := "textequiv_level": "glyph", "model": "fraktur.pyrnn"
$(OCR2): TOOL := ocrd-cis-ocropy-recognize
$(OCR2): PARAMS := "textequiv_level": "glyph", "model": "fraktur-jze.pyrnn"
# Tesseract recognizer runs: same tool, one model (combination) each.
$(OCR3): TOOL := ocrd-tesserocr-recognize
$(OCR3): PARAMS := "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur"
$(OCR4): TOOL := ocrd-tesserocr-recognize
$(OCR4): PARAMS := "textequiv_level" : "glyph", "overwrite_words": true, "model" : "script/Fraktur+script/Latin"
$(OCR5): TOOL := ocrd-tesserocr-recognize
$(OCR5): PARAMS := "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk"
$(OCR6): TOOL := ocrd-tesserocr-recognize
$(OCR6): PARAMS := "textequiv_level" : "glyph", "overwrite_words": true, "model" : "frk+deu"
$(OCR7): TOOL := ocrd-tesserocr-recognize
$(OCR7): PARAMS := "textequiv_level" : "glyph", "overwrite_words": true, "model" : "GT4HistOCR_2000000+GT4HistOCR_300000+GT4HistOCR_100000"
# Calamari recognizer run.
# PARAMS stays a recursive (=) assignment on purpose: it references
# $(VIRTUAL_ENV), which must be resolved when the recipe runs, not while
# this line is parsed.
# NOTE(review): GPU = 1 presumably marks this step as needing a GPU for the
# included Makefile -- confirm there.
$(OCR8): GPU := 1
$(OCR8): TOOL := ocrd-calamari-recognize
$(OCR8): PARAMS = "checkpoint" : "$(VIRTUAL_ENV)/share/calamari/GT4HistOCR/*.ckpt.json"
# Line extraction: one "OCR-D-IMG-LINES-<group>" file group for the dewarped
# GT lines and for each recognition result. $(DEW) must stay first in the
# list, because the $(OUTPUT) recipe treats the first entry specially.
LINES := $(addprefix OCR-D-IMG-LINES-,$(DEW) $(OCR1) $(OCR2) $(OCR3) $(OCR4) $(OCR5) $(OCR6) $(OCR7) $(OCR8))
$(LINES): TOOL := ocrd-segment-extract-lines
$(LINES): PARAMS := "transparency": true
# Static pattern rule: each extraction depends on the file group it is named after.
$(LINES): OCR-D-IMG-LINES-%: %
# Final step: gather all extracted line files in the single directory $(OUTPUT).
#
# - The first prerequisite ($<) is the line extraction of $(DEW); all of its
#   files are symlinked into $(OUTPUT) as they are.
# - Every other prerequisite is the line extraction of one recognition
#   result. Each of its *.gt.txt files is symlinked into $(OUTPUT) under the
#   name of the corresponding file of $<, with ".gt.txt" replaced by
#   ".<OCR file group prefix>.txt".
#
# Error handling: cleanup is done by an EXIT trap instead of
# `for ...; done || { ... }`. In a `||` list the shell ignores `set -e` for
# everything on the left-hand side, so failing `ln` calls inside the loop
# went unnoticed. With the trap, any failing command aborts the recipe and
# removes the partially filled $(OUTPUT).
#
# NOTE(review): ${var/pattern/replacement} is a bash extension; presumably
# the included Makefile sets SHELL to bash -- confirm.
OUTPUT = OCR-D-IMG-LINES
$(OUTPUT): $(LINES)
	@mkdir -p $@
	set -e; \
	trap 'rc=$$?; test $$rc -eq 0 || rm -fr $@; exit $$rc' EXIT; \
	ln -frs $</* $@; \
	suffix=$(<:OCR-D-IMG-LINES-$(INPUT)-%=%); \
	for grp in $(filter-out $<,$^); do \
		ocr=$${grp%-$$suffix}; \
		ocr=$${ocr#OCR-D-IMG-LINES-}; \
		for file in $$grp/*.gt.txt; do \
			test -e "$$file" || continue; \
			newfile=$${file/$$grp\/$$grp/$@\/$<}; \
			newfile=$${newfile/.gt.txt/.$$ocr.txt}; \
			ln -frs "$$file" "$$newfile"; \
		done; \
	done
.DEFAULT_GOAL = $(OUTPUT)
.PHONY: $(OUTPUT)
# Down here, custom configuration ends.
###
include Makefile