sentence_tokenizer.go

package sentences

import "fmt"

// SentenceTokenizer is the interface used by the Tokenize function. It can be
// implemented to correct sentence boundaries that the punkt algorithm misses.
type SentenceTokenizer interface {
AnnotateTokens([]*Token, ...AnnotateTokens) []*Token
Tokenize(string) []*Sentence
}
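
// A custom SentenceTokenizer can embed DefaultSentenceTokenizer and wrap
// Tokenize to patch boundaries the punkt model gets wrong. A minimal sketch
// (mergeAbbrevSplits is a hypothetical helper, not part of this package):
//
//	type fixedTokenizer struct {
//		*DefaultSentenceTokenizer
//	}
//
//	func (t *fixedTokenizer) Tokenize(text string) []*Sentence {
//		// hypothetical: re-join sentences split after a known abbreviation
//		return mergeAbbrevSplits(t.DefaultSentenceTokenizer.Tokenize(text))
//	}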

// DefaultSentenceTokenizer is a sentence tokenizer that uses an unsupervised
// algorithm to build a model of abbreviations, collocations, and words that
// start sentences, and then uses that model to find sentence boundaries.
type DefaultSentenceTokenizer struct {
*Storage
WordTokenizer
PunctStrings
Annotations []AnnotateTokens
}

// NewSentenceTokenizer creates a sentence tokenizer with sane defaults.
func NewSentenceTokenizer(s *Storage) *DefaultSentenceTokenizer {
lang := NewPunctStrings()
word := NewWordTokenizer(lang)
annotations := NewAnnotations(s, lang, word)
tokenizer := &DefaultSentenceTokenizer{
Storage: s,
PunctStrings: lang,
WordTokenizer: word,
Annotations: annotations,
}
return tokenizer
}
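
// Typical usage (a sketch; assumes storage is a *Storage already loaded from
// trained model data, such as the English training data shipped with this
// project):
//
//	tokenizer := NewSentenceTokenizer(storage)
//	for _, sentence := range tokenizer.Tokenize("Hi there. How are you?") {
//		fmt.Println(sentence.Text)
//	}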

// NewTokenizer wraps DefaultSentenceTokenizer construction, doing the wiring
// needed to customize the tokenizer with a different word tokenizer or
// punctuation set.
func NewTokenizer(s *Storage, word WordTokenizer, lang PunctStrings) *DefaultSentenceTokenizer {
annotations := NewAnnotations(s, lang, word)
tokenizer := &DefaultSentenceTokenizer{
Storage: s,
PunctStrings: lang,
WordTokenizer: word,
Annotations: annotations,
}
return tokenizer
}
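
// For example, to supply language-specific punctuation while reusing the
// default word tokenizer (a sketch; myPunct is a hypothetical PunctStrings
// implementation, not part of this package):
//
//	punct := &myPunct{} // hypothetical PunctStrings implementation
//	word := NewWordTokenizer(punct)
//	tokenizer := NewTokenizer(storage, word, punct)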

/*
AnnotateTokens, given a set of tokens augmented with markers for line-start and
paragraph-start, returns those tokens fully annotated, including predicted
sentence breaks.
*/
func (s *DefaultSentenceTokenizer) AnnotateTokens(tokens []*Token, annotate ...AnnotateTokens) []*Token {
for _, ann := range annotate {
tokens = ann.Annotate(tokens)
}
return tokens
}
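
// The annotators run in order, each pass transforming the token slice produced
// by the previous one, so extra passes can be appended (a sketch; extraPass is
// a hypothetical AnnotateTokens implementation):
//
//	// extraPass is hypothetical, e.g. a pass that fixes known bad breaks
//	tokens = tokenizer.AnnotateTokens(tokens, append(tokenizer.Annotations, extraPass)...)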

/*
AnnotatedTokens returns the fully annotated word tokens for text. This allows
for ad hoc adjustments to the tokens.
*/
func (s *DefaultSentenceTokenizer) AnnotatedTokens(text string) []*Token {
	// Use the default word tokenizer, but only grab the tokens that
	// relate to sentence-ending punctuation; that is, grab the word
	// before and after the punctuation.
tokens := s.WordTokenizer.Tokenize(text, false)
if len(tokens) == 0 {
return nil
}
return s.AnnotateTokens(tokens, s.Annotations...)
}

/*
SentencePositions returns an array of sentence-boundary character positions
instead of an array of sentences.
*/
func (s *DefaultSentenceTokenizer) SentencePositions(text string) []int {
annotatedTokens := s.AnnotatedTokens(text)
positions := make([]int, 0, len(annotatedTokens))
for _, token := range annotatedTokens {
if !token.SentBreak {
continue
}
positions = append(positions, token.Position)
}
lastChar := len(text)
positions = append(positions, lastChar)
return positions
}
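
// Each reported position is a breaking token's Position (used by Tokenize as a
// sentence end offset), and len(text) is always appended as the final entry.
// A quick sketch:
//
//	ends := tokenizer.SentencePositions(text)
//	// ends[len(ends)-1] == len(text); earlier entries are predicted breaks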

/*
Sentence is a container for a single sentence, providing its character
positions as well as its text.
*/
type Sentence struct {
Start int `json:"start"`
End int `json:"end"`
Text string `json:"text"`
}

func (s Sentence) String() string {
return fmt.Sprintf("<Sentence [%d:%d] '%s'>", s.Start, s.End, s.Text)
}

// Tokenize splits text input into sentence tokens.
func (s *DefaultSentenceTokenizer) Tokenize(text string) []*Sentence {
annotatedTokens := s.AnnotatedTokens(text)
lastBreak := 0
sentences := make([]*Sentence, 0, len(annotatedTokens))
for _, token := range annotatedTokens {
if !token.SentBreak {
continue
}
sentence := &Sentence{lastBreak, token.Position, text[lastBreak:token.Position]}
sentences = append(sentences, sentence)
lastBreak = token.Position
}
if lastBreak != len(text) {
lastChar := len(text)
sentence := &Sentence{lastBreak, lastChar, text[lastBreak:lastChar]}
sentences = append(sentences, sentence)
}
return sentences
}
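
// Because each sentence spans text[lastBreak:token.Position] and the tail past
// the final break is appended, the returned sentences partition the input
// exactly: concatenating every Sentence.Text in order reconstructs text. A
// quick check (sketch):
//
//	var rebuilt string
//	for _, sent := range tokenizer.Tokenize(text) {
//		rebuilt += sent.Text
//	}
//	// rebuilt == text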