-
Notifications
You must be signed in to change notification settings - Fork 39
/
annotate.go
198 lines (174 loc) · 5.36 KB
/
annotate.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
package sentences
import (
"strings"
)
/*
AnnotateTokens is an interface used for the sentence tokenizer to add properties to
any given token during tokenization.
*/
type AnnotateTokens interface {
Annotate([]*Token) []*Token
}
/*
TypeBasedAnnotation performs the first pass of annotation, which makes decisions
based purely based on the word type of each word:
* '?', '!', and '.' are marked as sentence breaks.
* sequences of two or more periods are marked as ellipsis.
* any word ending in '.' that's a known abbreviation is marked as an abbreviation.
* any other word ending in '.' is marked as a sentence break.
Return these annotations as a tuple of three sets:
* sentbreak_toks: The indices of all sentence breaks.
* abbrev_toks: The indices of all abbreviations.
* ellipsis_toks: The indices of all ellipsis marks.
*/
type TypeBasedAnnotation struct {
*Storage
PunctStrings
TokenExistential
}
// NewTypeBasedAnnotation creates an instance of the TypeBasedAnnotation struct
func NewTypeBasedAnnotation(s *Storage, p PunctStrings, e TokenExistential) *TypeBasedAnnotation {
return &TypeBasedAnnotation{
Storage: s,
PunctStrings: p,
TokenExistential: e,
}
}
// NewAnnotations is the default AnnotateTokens struct that the tokenizer uses
func NewAnnotations(s *Storage, p PunctStrings, word WordTokenizer) []AnnotateTokens {
return []AnnotateTokens{
&TypeBasedAnnotation{s, p, word},
&TokenBasedAnnotation{s, p, word, &DefaultTokenGrouper{}, &OrthoContext{
s, p, word, word,
}},
}
}
// Annotate iterates over all tokens and applies the type annotation on them
func (a *TypeBasedAnnotation) Annotate(tokens []*Token) []*Token {
for _, augTok := range tokens {
a.typeAnnotation(augTok)
}
return tokens
}
func (a *TypeBasedAnnotation) typeAnnotation(token *Token) {
chars := []rune(token.Tok)
if a.HasSentEndChars(token) {
token.SentBreak = true
} else if a.HasPeriodFinal(token) && !strings.HasSuffix(token.Tok, "..") {
tokNoPeriod := strings.ToLower(token.Tok[:len(chars)-1])
tokNoPeriodHypen := strings.Split(tokNoPeriod, "-")
tokLastHyphEl := string(tokNoPeriodHypen[len(tokNoPeriodHypen)-1])
if a.IsAbbr(tokNoPeriod, tokLastHyphEl) {
token.Abbr = true
} else {
token.SentBreak = true
}
}
}
/*
TokenBasedAnnotation performs a token-based classification (section 4) over the given
tokens, making use of the orthographic heuristic (4.1.1), collocation
heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
*/
type TokenBasedAnnotation struct {
*Storage
PunctStrings
TokenParser
TokenGrouper
Ortho
}
// Annotate iterates groups tokens in pairs of two and then iterates over them to apply token annotation
func (a *TokenBasedAnnotation) Annotate(tokens []*Token) []*Token {
for _, tokPair := range a.TokenGrouper.Group(tokens) {
a.tokenAnnotation(tokPair[0], tokPair[1])
}
return tokens
}
func (a *TokenBasedAnnotation) tokenAnnotation(tokOne, tokTwo *Token) {
if tokTwo == nil {
return
}
if !a.TokenParser.HasPeriodFinal(tokOne) {
return
}
typ := a.TokenParser.TypeNoPeriod(tokOne)
nextTyp := a.TokenParser.TypeNoSentPeriod(tokTwo)
tokIsInitial := a.TokenParser.IsInitial(tokOne)
/*
[4.1.2. Collocation Heuristic] If there's a
collocation between the word before and after the
period, then label tok as an abbreviation and NOT
a sentence break. Note that collocations with
frequent sentence starters as their second word are
excluded in training.
*/
collocation := strings.Join([]string{typ, nextTyp}, ",")
if a.Collocations[collocation] != 0 {
tokOne.SentBreak = false
tokOne.Abbr = true
return
}
/*
[4.2. Token-Based Reclassification of Abbreviations] If
the token is an abbreviation or an ellipsis, then decide
whether we should *also* classify it as a sentbreak.
*/
if (tokOne.Abbr || a.TokenParser.IsEllipsis(tokOne)) && !tokIsInitial {
/*
[4.1.1. Orthographic Heuristic] Check if there's
orthogrpahic evidence about whether the next word
starts a sentence or not.
*/
isSentStarter := a.Ortho.Heuristic(tokTwo)
if isSentStarter == 1 {
tokOne.SentBreak = true
return
}
/*
[4.1.3. Frequent Sentence Starter Heruistic] If the
next word is capitalized, and is a member of the
frequent-sentence-starters list, then label tok as a
sentence break.
*/
if a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[nextTyp] != 0 {
tokOne.SentBreak = true
return
}
}
/*
Sometimes there are two consecutive tokens with a lone "."
which probably means it is part of a spaced ellipsis ". . ."
so set those tokens and not sentence breaks
*/
if tokOne.Tok == "." && tokTwo.Tok == "." {
tokOne.SentBreak = false
tokTwo.SentBreak = false
return
}
/*
[4.3. Token-Based Detection of Initials and Ordinals]
Check if any initials or ordinals tokens that are marked
as sentbreaks should be reclassified as abbreviations.
*/
if tokIsInitial || typ == "##number##" {
isSentStarter := a.Ortho.Heuristic(tokTwo)
if isSentStarter == 0 {
tokOne.SentBreak = false
tokOne.Abbr = true
return
}
/*
Special heuristic for initials: if orthogrpahic
heuristc is unknown, and next word is always
capitalized, then mark as abbrev (eg: J. Bach).
*/
if isSentStarter == -1 &&
tokIsInitial &&
a.TokenParser.FirstUpper(tokTwo) &&
a.OrthoContext[nextTyp]&orthoLc == 0 {
tokOne.SentBreak = false
tokOne.Abbr = true
return
}
}
}