package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
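
// exampleWords is an illustrative sketch (not part of the original file).
// Words sorts the words longest-first so the alternation prefers the longest
// literal match; the `\b` anchors here are hypothetical caller choices.
func exampleWords() string {
	return Words(`\b`, `\b`, "in", "interface") // => `\b(interface|in)\b`
}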

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
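
// exampleTokenise is an illustrative sketch (not part of the original file):
// collect every token for a snippet using some previously obtained lexer.
// Passing nil options falls back to the lexer's defaults.
func exampleTokenise(lexer Lexer) ([]Token, error) {
	return Tokenise(lexer, nil, `print("hello")`)
}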

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
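
// exampleDeriveRules is an illustrative sketch (not part of the original
// file): derive a new rule set from a base one without mutating it. The
// state names and the catch-all rule are hypothetical; note Merge replaces
// whole states rather than merging rule-by-rule.
func exampleDeriveRules(base Rules) Rules {
	return base.Rename("root", "expr").Merge(Rules{
		"root": {{Pattern: `.`, Type: Text, Mutator: nil}},
	})
}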

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}
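
// exampleLexer is an illustrative sketch (not part of the original file): a
// minimal lexer with a single root state, built with deferred rules. The
// name, glob, and patterns are all hypothetical.
var exampleLexer = MustNewLexer(
	&Config{
		Name:      "example",
		Filenames: []string{"*.example"},
	},
	func() Rules {
		return Rules{
			"root": {
				{`\s+`, Whitespace, nil},
				{`.`, Text, nil},
			},
		}
	},
)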

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps a state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
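
// exampleMutatorContext is an illustrative sketch (not part of the original
// file): a mutator can stash per-lex state, e.g. a heredoc delimiter captured
// by a match group. The key name and group index are hypothetical.
func exampleMutatorContext(l *LexerState) {
	l.Set("delimiter", l.Groups[1])
	_ = l.Get("delimiter") // retrieved later by a companion mutator
}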

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules returns the rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText returns the analyser's score for the text, or 0.0 if no analyser is set.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}
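
// exampleSetAnalyser is an illustrative sketch (not part of the original
// file): attach a simple content scorer for lexer auto-detection. The
// shebang heuristic here is hypothetical.
func exampleSetAnalyser(r *RegexLexer) Lexer {
	return r.SetAnalyser(func(text string) float32 {
		if strings.HasPrefix(text, "#!") {
			return 1.0
		}
		return 0.0
	})
}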

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}
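
// exampleCompiledPattern is an illustrative sketch (not part of the original
// file) of the shape maybeCompile builds: the flags prefix the non-capturing
// group, and \G anchors matching at the current scan position. The `\d+`
// pattern and "im" flags are hypothetical.
func exampleCompiledPattern() (*regexp2.Regexp, error) {
	return regexp2.Compile(`\G(?im)(?:\d+)`, regexp2.RE2)
}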

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise text using this lexer, returning an iterator over its tokens.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
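
// exampleTokeniseMethod is an illustrative sketch (not part of the original
// file): drive the returned iterator until the EOF sentinel. The start state
// and input are hypothetical.
func exampleTokeniseMethod(r *RegexLexer) error {
	it, err := r.Tokenise(&TokeniseOptions{State: "root"}, "input text\n")
	if err != nil {
		return err
	}
	for t := it(); t != EOF; t = it() {
		fmt.Printf("%s: %q\n", t.Type, t.Value)
	}
	return nil
}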

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

// matchRules returns the index, rule, positional groups and named groups of
// the first rule matching at pos, or nil groups if no rule matches there.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r\n and lone \r with \n in a single pass.
// Equivalent to successive strings.ReplaceAll calls, but with a single allocation.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}
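
// exampleEnsureLF is an illustrative sketch (not part of the original file):
// both CRLF and lone CR collapse to LF.
func exampleEnsureLF() string {
	return ensureLF("a\r\nb\rc") // "a\nb\nc"
}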