Commit 44b23f9

Split Regexp lexer into its own file.

1 parent: a5637e6

2 files changed (+250, −245 lines)

Diff for: lexer.go (−245 lines)
@@ -2,11 +2,6 @@ package chroma
 
 import (
 	"fmt"
-	"regexp"
-	"strings"
-	"sync"
-
-	"github.com/dlclark/regexp2"
 )
 
 var (
@@ -117,243 +112,3 @@ func (l Lexers) Less(i, j int) bool { return l[i].Config().Name < l[j].Config().Name }
 type Analyser interface {
 	AnalyseText(text string) float32
 }
-
-type Rule struct {
-	Pattern string
-	Type    Emitter
-	Mutator Mutator
-}
-
-// An Emitter takes group matches and returns tokens.
-type Emitter interface {
-	// Emit tokens for the given regex groups.
-	Emit(groups []string, lexer Lexer, out func(*Token))
-}
-
-// EmitterFunc is a function that is an Emitter.
-type EmitterFunc func(groups []string, lexer Lexer, out func(*Token))
-
-// Emit tokens for groups.
-func (e EmitterFunc) Emit(groups []string, lexer Lexer, out func(*Token)) { e(groups, lexer, out) }
-
-// ByGroups emits a token for each matching group in the rule's regex.
-func ByGroups(emitters ...Emitter) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
-		// NOTE: If this line panics, there is a mismatch with groups. Uncomment the following line to debug.
-		// fmt.Printf("%s %#v\n", emitters, groups[1:])
-		for i, group := range groups[1:] {
-			emitters[i].Emit([]string{group}, lexer, out)
-		}
-		return
-	})
-}
-
-// Using returns an Emitter that uses a given Lexer for parsing and emitting.
-func Using(lexer Lexer, options *TokeniseOptions) Emitter {
-	return EmitterFunc(func(groups []string, _ Lexer, out func(*Token)) {
-		if err := lexer.Tokenise(options, groups[0], out); err != nil {
-			panic(err)
-		}
-	})
-}
-
-// UsingSelf is like Using, but uses the current Lexer.
-func UsingSelf(state string) Emitter {
-	return EmitterFunc(func(groups []string, lexer Lexer, out func(*Token)) {
-		if err := lexer.Tokenise(&TokeniseOptions{State: state}, groups[0], out); err != nil {
-			panic(err)
-		}
-	})
-}
-
-// Words creates a regex that matches any of the given literal words.
-func Words(prefix, suffix string, words ...string) string {
-	for i, word := range words {
-		words[i] = regexp.QuoteMeta(word)
-	}
-	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
-}
-
-// Rules maps from state to a sequence of Rules.
-type Rules map[string][]Rule
-
-// MustNewLexer creates a new Lexer or panics.
-func MustNewLexer(config *Config, rules Rules) *RegexLexer {
-	lexer, err := NewLexer(config, rules)
-	if err != nil {
-		panic(err)
-	}
-	return lexer
-}
-
-// NewLexer creates a new regex-based Lexer.
-//
-// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
-// that match input, optionally modify lexer state, and output tokens.
-func NewLexer(config *Config, rules Rules) (*RegexLexer, error) {
-	if config == nil {
-		config = &Config{}
-	}
-	if _, ok := rules["root"]; !ok {
-		return nil, fmt.Errorf("no \"root\" state")
-	}
-	compiledRules := map[string][]CompiledRule{}
-	for state, rules := range rules {
-		for _, rule := range rules {
-			flags := ""
-			if !config.NotMultiline {
-				flags += "m"
-			}
-			if config.CaseInsensitive {
-				flags += "i"
-			}
-			if config.DotAll {
-				flags += "s"
-			}
-			compiledRules[state] = append(compiledRules[state], CompiledRule{Rule: rule, flags: flags})
-		}
-	}
-	return &RegexLexer{
-		config: config,
-		rules:  compiledRules,
-	}, nil
-}
-
-// A CompiledRule is a Rule with a pre-compiled regex.
-//
-// Note that regular expressions are lazily compiled on first use of the lexer.
-type CompiledRule struct {
-	Rule
-	Regexp *regexp2.Regexp
-	flags  string
-}
-
-type CompiledRules map[string][]CompiledRule
-
-type LexerState struct {
-	Text  []rune
-	Pos   int
-	Rules map[string][]CompiledRule
-	Stack []string
-	State string
-	Rule  int
-	// Group matches.
-	Groups []string
-	// Custom context for mutators.
-	MutatorContext map[interface{}]interface{}
-}
-
-func (l *LexerState) Set(key interface{}, value interface{}) {
-	l.MutatorContext[key] = value
-}
-
-func (l *LexerState) Get(key interface{}) interface{} {
-	return l.MutatorContext[key]
-}
-
-type RegexLexer struct {
-	config   *Config
-	analyser func(text string) float32
-
-	mu       sync.Mutex
-	compiled bool
-	rules    map[string][]CompiledRule
-}
-
-// SetAnalyser sets the analyser function used to perform content inspection.
-func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) *RegexLexer {
-	r.analyser = analyser
-	return r
-}
-
-func (r *RegexLexer) AnalyseText(text string) float32 {
-	if r.analyser != nil {
-		return r.analyser(text)
-	}
-	return 0.0
-}
-
-func (r *RegexLexer) Config() *Config {
-	return r.config
-}
-
-// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
-func (r *RegexLexer) maybeCompile() (err error) {
-	r.mu.Lock()
-	defer r.mu.Unlock()
-	if r.compiled {
-		return nil
-	}
-	for state, rules := range r.rules {
-		for i, rule := range rules {
-			if rule.Regexp == nil {
-				rule.Regexp, err = regexp2.Compile("^(?"+rule.flags+")(?:"+rule.Pattern+")", 0)
-				if err != nil {
-					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
-				}
-			}
-			rules[i] = rule
-		}
-	}
-	r.compiled = true
-	return nil
-}
-
-func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string, out func(*Token)) error {
-	if err := r.maybeCompile(); err != nil {
-		return err
-	}
-	if options == nil {
-		options = defaultOptions
-	}
-	state := &LexerState{
-		Text:           []rune(text),
-		Stack:          []string{options.State},
-		Rules:          r.rules,
-		MutatorContext: map[interface{}]interface{}{},
-	}
-	for state.Pos < len(state.Text) && len(state.Stack) > 0 {
-		state.State = state.Stack[len(state.Stack)-1]
-		ruleIndex, rule, groups := matchRules(state.Text[state.Pos:], state.Rules[state.State])
-		// No match.
-		if groups == nil {
-			out(&Token{Error, string(state.Text[state.Pos : state.Pos+1])})
-			state.Pos++
-			continue
-		}
-		state.Rule = ruleIndex
-
-		state.Groups = groups
-		state.Pos += len(groups[0])
-		if rule.Mutator != nil {
-			if err := rule.Mutator.Mutate(state); err != nil {
-				return err
-			}
-		}
-		if rule.Type != nil {
-			rule.Type.Emit(state.Groups, r, out)
-		}
-	}
-	out(&Token{Type: EOF})
-	return nil
-}
-
-// Tokenise text using lexer, returning tokens as a slice.
-func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]*Token, error) {
-	out := []*Token{}
-	return out, lexer.Tokenise(options, text, func(token *Token) { out = append(out, token) })
-}
-
-func matchRules(text []rune, rules []CompiledRule) (int, CompiledRule, []string) {
-	for i, rule := range rules {
-		match, err := rule.Regexp.FindRunesMatch(text)
-		if match != nil && err == nil {
-			groups := []string{}
-			for _, g := range match.Groups() {
-				groups = append(groups, g.String())
-			}
-			return i, rule, groups
-		}
-	}
-	return 0, CompiledRule{}, nil
-}
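
For context on the code being moved: a Rules map keyed by state name drives a regex state machine. NewLexer validates that a "root" state exists, maybeCompile lazily compiles each rule's pattern on first use, and Tokenise repeatedly matches the rule list for the state on top of the stack, emitting tokens as it goes. Below is a minimal sketch of how this API is used from the outside. The import path (github.com/alecthomas/chroma), the token types Keyword/Whitespace/Name, and the Token.Value field name are assumptions not visible in this diff; treat it as an illustration, not documented usage.

package main

import (
	"fmt"

	"github.com/alecthomas/chroma" // assumed import path
)

// A single-state lexer built with the API in this diff. "root" is the
// required entry state; Words builds a quoted alternation of keywords.
// Token types such as chroma.Keyword are assumed to satisfy Emitter.
var example = chroma.MustNewLexer(&chroma.Config{Name: "example"}, chroma.Rules{
	"root": {
		{Pattern: chroma.Words(``, `\b`, `if`, `else`), Type: chroma.Keyword},
		{Pattern: `\s+`, Type: chroma.Whitespace},
		{Pattern: `\w+`, Type: chroma.Name},
	},
})

func main() {
	// nil options fall back to defaultOptions, i.e. the "root" state.
	tokens, err := chroma.Tokenise(example, nil, "if foo else bar")
	if err != nil {
		panic(err)
	}
	for _, t := range tokens {
		fmt.Printf("%s %q\n", t.Type, t.Value) // e.g. Keyword "if"
	}
}

Because maybeCompile defers regexp2 compilation until the first Tokenise call, constructing lexers like this at package level stays cheap even when many lexers are registered.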
