source: code/trunk/vendor/github.com/alecthomas/chroma/v2/regexp.go@67

package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
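
// Illustrative usage (not part of the upstream file): words are sorted longest
// first, so longer alternatives are not shadowed by their prefixes:
//
//	pattern := Words(`\b`, `\b`, "if", "else", "elseif")
//	// pattern == `\b(elseif|else|if)\b`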

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
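
// A minimal sketch of collecting every token at once (someLexer is a
// hypothetical Lexer obtained elsewhere, e.g. from the lexers sub-package):
//
//	tokens, err := Tokenise(someLexer, nil, "package main")
//	if err != nil {
//		// handle error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}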

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
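
// Note that Merge replaces whole states rather than appending to them.
// Illustrative sketch (base and overrides are hypothetical):
//
//	merged := base.Merge(Rules{"string": overrides})
//	// merged["string"] is exactly overrides; base's "string" rules are gone.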

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}
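
// Illustrative sketch of defining a lexer (hypothetical language; Whitespace,
// Keyword and Name are Chroma token types, which implement Emitter):
//
//	var myLexer = MustNewLexer(
//		&Config{Name: "MyLang", Filenames: []string{"*.mylang"}},
//		func() Rules {
//			return Rules{
//				"root": {
//					{`\s+`, Whitespace, nil},
//					{Words(`\b`, `\b`, "if", "else"), Keyword, nil},
//					{`\w+`, Name, nil},
//				},
//			}
//		},
//	)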

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
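
// Illustrative sketch: a custom Mutator (hypothetical, not part of this file)
// can use Set/Get to carry state between rule matches:
//
//	func (m *myMutator) Mutate(s *LexerState) error {
//		s.Set("inString", true)
//		if v, ok := s.Get("inString").(bool); ok && v {
//			// e.g. push or pop states on s.Stack here
//		}
//		return nil
//	}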

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules returns the Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
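				// \G anchors the match at the search start position, so when
				// matchRules searches from l.Pos a rule can only match there,
				// never further along in the text.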
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
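			// Map lexer-wide Config options onto standard inline regex flags
			// (m = multiline, i = case-insensitive, s = dot matches newline);
			// maybeCompile prepends them to the pattern as "(?...)".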
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
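
// The returned Iterator is a plain function; call it repeatedly until it
// yields EOF. Illustrative sketch (someLexer and source are hypothetical):
//
//	it, err := someLexer.Tokenise(nil, source)
//	if err != nil {
//		// handle error
//	}
//	for t := it(); t != EOF; t = it() {
//		fmt.Printf("%s: %q\n", t.Type, t.Value)
//	}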

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF normalises line endings, replacing both "\r\n" and lone "\r" with "\n".
// Same effect as chained strings.ReplaceAll calls, but in a single pass.
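// For example, ensureLF("a\r\nb\rc") returns "a\nb\nc".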
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}