# htr-united.yml
---
# Record header: schema reference, dataset title, and repository/project links.
schema: https://htr-united.github.io/schema/2021-10-15/schema.json
title: Caroline Minuscule by Rescribe
url: https://github.com/rescribe/carolineminuscule-groundtruth
# Plain scalar so that no quote character or trailing whitespace ends up in
# the value.
project-name: Rescribe
project-website: https://rescribe.xyz/
authors:
- name: White
surname: Nick
roles:
- transcriber
- project-manager
- name: "Cl\xE9rice"
surname: Thibault
roles:
- aligner
- name: Karaisl
surname: Antonia
roles:
- transcriber
- project-manager
# Free-text summary of the repository. Folded block scalar: the wrapped lines
# are joined with single spaces and no trailing whitespace is kept.
description: >-
  This ground truth repository is a work in process; it currently accounts
  for a part of our complete Caroline Minuscule training pool of around 70 manuscripts
  used for our OCRopus Caroline Minuscule model (see ocropus-models repository).
# Language and script of the transcribed material.
language:
  - lat
script:
  - Latn
script-type: only-manuscript
# Date bounds are quoted so they stay strings rather than being read as
# integers.
time:
  notBefore: '800'
  notAfter: '1199'
# Scribal-hand information; `count` and `precision` belong to `hands`, not to
# the top level.
hands:
  count: 1-per-file
  precision: exact
# Licensing: a list of name/url pairs.
license:
  - name: CC-BY 4.0
    url: https://creativecommons.org/licenses/by/4.0/
format: Alto-XML
# Dataset size: one list item per metric, each pairing a metric with its
# count.
volume:
  - metric: characters
    count: 17155
  - metric: files
    count: 17
  - metric: lines
    count: 457
  - metric: regions
    count: 46
# Transcription rules as one double-quoted string. A trailing backslash joins
# the next line without inserting anything; a leading "\ " supplies the
# single space between words; "\n" is a real line break in the value.
transcription-guidelines: "In general this meant deciding between diplomatic transcription\
  \ (i.e. sticking to what it says on the page) and gently modernized features (i.e.\
  \ reinterpreting medieval signs into modern equivalents) with a view to specific\
  \ categories. Read on for a summary of the rules and the respective rationale behind\
  \ them.\nSUMMARY\nPUNCTUATION\n\n Modern: medieval punctuation is transcribed\
  \ with modern equivalents; punctus elevatus transcribed as semicolon\n\nCAPITALIZATION\n\
  \n Diplomatic: Original capitalization retained\n\nABBREVIATIONS\n\n Diplomatic\
  \ where possible: Retain abbreviations and render glyphs as opposed to expanded\
  \ versions where possible\n \"*\" where original character isn't served: OCRopus\
  \ (at the point in time of transcription) could not handle some of the medieval\
  \ glyphs, even where a Unicode version was present. Abbreviations not in OCRopus\
  \ are uniformly transcribed as \"*\", in the case of a combined character (such\
  \ as a consonant with a macron) as the base character followed by \"*\" (e.g. \"\
  t*\"). The list of accepted characters in OCRopus can be found in this repository,\
  \ and downloaded and used as codec in the OCRopus training process.\n\nSPACING\n\
  \n Diplomatic: Preserve manuscript spacing, i.e. give diplomatic transcription\n\
  \nNUMBERS\n\n Diplomatic: retain original version of both Roman and Arabic numerals"
# Character inventory: the normalization mode and the list of individual
# members.
characters:
  mode: NFD
  # Members that are YAML indicators, digits, or boolean-like letters
  # (n / N / y under YAML 1.1) are quoted so every entry loads as a
  # one-character string. Combining marks and other non-ASCII code points
  # are written as escapes so they remain visible in an editor.
  members:
    - i
    - e
    - t
    - u
    - a
    - s
    - 'n'
    - o
    - r
    - m
    - c
    - d
    - l
    - p
    - .
    - b
    - q
    - g
    - '*'
    - h
    - ;
    - "\u0303"
    - f
    - x
    - I
    - "\u0304"
    - E
    - 'N'
    - "\u0328"
    - ':'
    - '&'
    - S
    - "\uA751"
    - C
    - A
    - "\u0111"
    - D
    - U
    - T
    - "\uA753"
    - Q
    - v
    - ','
    - O
    - R
    - P
    - L
    - M
    - "\xE6"
    - H
    - F
    - '?'
    - '1'
    - 'y'
    - "\uA75D"
    - "\uA759"
    - V
    - '4'
    - B
    - z
    - '5'
    - X
    - '6'
    - "\uA75B"
    - /
    - ''''
    - '0'
    - '2'
    - '9'
    - K
    - '-'