-
Notifications
You must be signed in to change notification settings - Fork 39
/
ortho.go
89 lines (80 loc) · 2.39 KB
/
ortho.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package sentences
/*
Orthographic context flags. Each flag names one combination of sentence
position (Beg=beginning, Mid=middle, Unk=unknown) and first-letter case
(Uc=upper case, Lc=lower case) in which a word can occur. Every flag
occupies its own bit, so several of them can be combined in a single value
and tested with a bitwise AND.
*/
const (
	// Upper case at the beginning of a sentence. The sequence starts at
	// bit 1 (value 2); bit 0 is left unused.
	orthoBegUc = 1 << (iota + 1)
	// Upper case in the middle of a sentence.
	orthoMidUc
	// Upper case at an unknown position in a sentence.
	orthoUnkUc
	// Lower case at the beginning of a sentence.
	orthoBegLc
	// Lower case in the middle of a sentence.
	orthoMidLc
	// Lower case at an unknown position in a sentence.
	orthoUnkLc

	// Mask covering every upper-case flag, whatever the position.
	orthoUc = orthoBegUc | orthoMidUc | orthoUnkUc
	// Mask covering every lower-case flag, whatever the position.
	orthoLc = orthoBegLc | orthoMidLc | orthoUnkLc
)
/*
orthoMap translates a {position, case} pair into the orthographic context
flag that describes it. The position is one of "initial", "internal" or
"unknown"; the case is "upper" or "lower" and refers to the first letter
of the word.
*/
var orthoMap = map[[2]string]int{
	{"initial", "upper"}:  orthoBegUc,
	{"initial", "lower"}:  orthoBegLc,
	{"internal", "upper"}: orthoMidUc,
	{"internal", "lower"}: orthoMidLc,
	{"unknown", "upper"}:  orthoUnkUc,
	{"unknown", "lower"}:  orthoUnkLc,
}
// Ortho is the contract for types that can apply an orthographic heuristic
// to a token. The meaning of the returned int is defined by the
// implementation.
type Ortho interface {
	Heuristic(token *Token) int
}
// OrthoContext determines whether a token is capitalized, sentence starter, etc.
// It is assembled purely from embedded values; its Heuristic method draws on
// each of them as noted below.
type OrthoContext struct {
// Storage supplies the OrthoContext table of recorded orthographic flags,
// which Heuristic indexes by the value TypeNoSentPeriod returns.
*Storage
// PunctStrings supplies Punctuation(), the marks a token is compared against.
PunctStrings
// TokenType supplies TypeNoSentPeriod, used to derive the lookup key for a token.
TokenType
// TokenFirst supplies the FirstUpper and FirstLower tests on a token.
TokenFirst
}
// Heuristic applies the orthographic heuristic to token and reports whether
// it is the first token in a sentence:
//
//	 1  the token is judged to be a sentence starter
//	 0  the token is judged not to be one; also returned for a nil token
//	    and for a token equal to one of the punctuation marks
//	-1  neither rule applies, so the heuristic cannot decide
func (o *OrthoContext) Heuristic(token *Token) int {
	if token == nil {
		return 0
	}

	// A token that is exactly one of the punctuation marks is reported as
	// not being a sentence starter.
	for _, mark := range o.PunctStrings.Punctuation() {
		if string(mark) == token.Tok {
			return 0
		}
	}

	// Flags recorded for this token's type, split into the individual
	// observations the two rules below rely on.
	seen := o.Storage.OrthoContext[o.TokenType.TypeNoSentPeriod(token)]
	seenLower := seen&orthoLc != 0
	seenUpper := seen&orthoUc != 0
	seenUpperInternal := seen&orthoMidUc != 0
	seenLowerInitial := seen&orthoBegLc != 0

	switch {
	// Capitalized, known to occur with a lower case first letter, and never
	// seen with an upper case first letter sentence-internally: a sentence
	// starter.
	case o.TokenFirst.FirstUpper(token) && seenLower && !seenUpperInternal:
		return 1

	// Lower case, and either seen with upper case elsewhere or never seen
	// sentence-initially with lower case: not a sentence starter.
	case o.TokenFirst.FirstLower(token) && (seenUpper || !seenLowerInitial):
		return 0

	default:
		return -1
	}
}