@@ -2,11 +2,6 @@ package chroma
 
 import (
 	"fmt"
-	"regexp"
-	"strings"
-	"sync"
-
-	"github.com/dlclark/regexp2"
 )
 
 var (
@@ -117,243 +112,3 @@ func (l Lexers) Less(i, j int) bool { return l[i].Config().Name < l[j].Config().
 type Analyser interface {
 	AnalyseText(text string) float32
 }
-
-type Rule struct {
-	Pattern string
-	Type    Emitter
-	Mutator Mutator
-}
-
-// An Emitter takes group matches and returns tokens.
-type Emitter interface {
-	// Emit tokens for the given regex groups.
-	Emit(groups []string, lexer Lexer, out func(*Token))
-}
-
-// EmitterFunc is a function that is an Emitter.
-type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
-
-// Emit tokens for groups.
-func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
-
-// ByGroups emits a token for each matching group in the rule's regex.
-func ByGroups(emitters ...Emitter) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
-		// NOTE: If this line panics, there is a mismatch with groups. Uncomment the following line to debug.
-		// fmt.Printf("%s %#v\n", emitters, groups[1:])
-		for i, group := range groups[1:] {
-			emitters[i].Emit([]string{group}, lexer, out)
-		}
-		return
-	})
-}
-
-// Using returns an Emitter that uses a given Lexer for parsing and emitting.
-func Using(lexer Lexer, options *TokeniseOptions) Emitter {
-	return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
-		if err := lexer.Tokenise(options, groups[0], out); err != nil {
-			panic(err)
-		}
-	})
-}
-
-// UsingSelf is like Using, but uses the current Lexer.
-func UsingSelf(state string) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
-		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
-			panic(err)
-		}
-	})
-}
-
-// Words creates a regex that matches any of the given literal words.
-func Words(prefix, suffix string, words ...string) string {
-	for i, word := range words {
-		words[i] = regexp.QuoteMeta(word)
-	}
-	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
-}
-
-// Rules maps from state to a sequence of Rules.
-type Rules map[string][]Rule
-
-// MustNewLexer creates a new Lexer or panics.
-func MustNewLexer(config *Config, rules Rules) *RegexLexer {
-	lexer, err := NewLexer(config, rules)
-	if err != nil {
-		panic(err)
-	}
-	return lexer
-}
-
-// NewLexer creates a new regex-based Lexer.
-//
-// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
-// that match input, optionally modify lexer state, and output tokens.
-func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
-	if config == nil {
-		config = &Config{}
-	}
-	if _, ok := rules["root"]; !ok {
-		return nil, fmt.Errorf("no \"root\" state")
-	}
-	compiledRules := map[string][]CompiledRule{}
-	for state, rules := range rules {
-		for _, rule := range rules {
-			flags := ""
-			if !config.NotMultiline {
-				flags += "m"
-			}
-			if config.CaseInsensitive {
-				flags += "i"
-			}
-			if config.DotAll {
-				flags += "s"
-			}
-			compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
-		}
-	}
-	return &RegexLexer{
-		config: config,
-		rules:  compiledRules,
-	}, nil
-}
-
-// A CompiledRule is a Rule with a pre-compiled regex.
-//
-// Note that regular expressions are lazily compiled on first use of the lexer.
-type CompiledRule struct {
-	Rule
-	Regexp *regexp2.Regexp
-	flags  string
-}
-
-type CompiledRules map[string][]CompiledRule
-
-type LexerState struct {
-	Text  []rune
-	Pos   int
-	Rules map[string][]CompiledRule
-	Stack []string
-	State string
-	Rule  int
-	// Group matches.
-	Groups []string
-	// Custom context for mutators.
-	MutatorContext map[interface{}]interface{}
-}
-
-func (l *LexerState) Set(key interface{}, value interface{}) {
-	l.MutatorContext[key] = value
-}
-
-func (l *LexerState) Get(key interface{}) interface{} {
-	return l.MutatorContext[key]
-}
-
-type RegexLexer struct {
-	config   *Config
-	analyser func(text string) float32
-
-	mu       sync.Mutex
-	compiled bool
-	rules    map[string][]CompiledRule
-}
-
-// SetAnalyser sets the analyser function used to perform content inspection.
-func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
-	r.analyser = analyser
-	return r
-}
-
-func (r *RegexLexer) AnalyseText(text string) float32 {
-	if r.analyser != nil {
-		return r.analyser(text)
-	}
-	return 0.0
-}
-
-func (r *RegexLexer) Config() *Config {
-	return r.config
-}
-
-// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
-func (r *RegexLexer) maybeCompile() (err error) {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	if r.compiled {
-		return nil
-	}
-	for state, rules := range r.rules {
-		for i, rule := range rules {
-			if rule.Regexp == nil {
-				rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
-				if err != nil {
-					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
-				}
-			}
-			rules[i] = rule
-		}
-	}
-	r.compiled = true
-	return nil
-}
-
-func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
-	if err := r.maybeCompile(); err != nil {
-		return err
-	}
-	if options == nil {
-		options = defaultOptions
-	}
-	state := &LexerState{
-		Text:           []rune(text),
-		Stack:          []string{options.State},
-		Rules:          r.rules,
-		MutatorContext: map[interface{}]interface{}{},
-	}
-	for state.Pos < len(state.Text) && len(state.Stack) > 0 {
-		state.State = state.Stack[len(state.Stack)-1]
-		ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
-		// No match.
-		if groups == nil {
-			out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
-			state.Pos++
-			continue
-		}
-		state.Rule = ruleIndex
-
-		state.Groups = groups
-		state.Pos += len(groups[0])
-		if rule.Mutator != nil {
-			if err := rule.Mutator.Mutate(state); err != nil {
-				return err
-			}
-		}
-		if rule.Type != nil {
-			rule.Type.Emit(state.Groups, r, out)
-		}
-	}
-	out(&Token{Type: EOF})
-	return nil
-}
-
-// Tokenise text using lexer, returning tokens as a slice.
-func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
-	out := []*Token{}
-	return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
-}
-
-func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
-	for i, rule := range rules {
-		match, err := rule.Regexp.FindRunesMatch(text)
-		if match != nil && err == nil {
-			groups := []string{}
-			for _, g := range match.Groups() {
-				groups = append(groups, g.String())
-			}
-			return i, rule, groups
-		}
-	}
-	return 0, CompiledRule{}, nil
-}
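
For readers skimming the removal, here is a minimal sketch of how the deleted pieces composed, written as if inside this package before the change. The lexer name, the rules, and the asValue emitter are invented for illustration, and Error is reused as a placeholder token type; a real lexer would emit proper token types.

// asValue is a hypothetical Emitter built from EmitterFunc. It forwards the
// whole match as a single token, reusing Error as a stand-in token type.
var asValue = EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
	out(&Token{Error, groups[0]})
})

// boolLexer is an invented example. Words quotes each literal before joining,
// so the first pattern matches exactly `true` or `false`.
var boolLexer = MustNewLexer(&Config{Name: "bools"}, Rules{
	// NewLexer rejects rule maps that lack a "root" state.
	"root": {
		{Pattern: Words(``, `\b`, "true", "false"), Type: asValue},
		// ByGroups pairs one Emitter with each capture group.
		{Pattern: `(\w+)(=)`, Type: ByGroups(asValue, asValue)},
		{Pattern: `\s+`, Type: asValue},
	},
})

func exampleUsage() ([]*Token, error) {
	// nil options fall back to the package-level defaultOptions. Per
	// maybeCompile above, the regexes are compiled lazily on this first call,
	// so declaring lexers in package-level vars stays cheap.
	return Tokenise(boolLexer, nil, "true false")
}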