package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
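
// exampleWords is an illustrative sketch (not part of the original file).
// Words sorts the words longest-first so the alternation prefers the longest
// literal match; the `\b` anchors here are hypothetical caller choices.
func exampleWords() string {
	return Words(`\b`, `\b`, "in", "interface") // => `\b(interface|in)\b`
}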

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
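
// exampleTokenise is an illustrative sketch (not part of the original file):
// collect every token for a snippet using some previously obtained lexer.
// Passing nil options falls back to the lexer's defaults.
func exampleTokenise(lexer Lexer) ([]Token, error) {
	return Tokenise(lexer, nil, `print("hello")`)
}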

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
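
// exampleDeriveRules is an illustrative sketch (not part of the original
// file): derive a new rule set from a base one without mutating it. The
// state names and the catch-all rule are hypothetical; note Merge replaces
// whole states rather than merging rule-by-rule.
func exampleDeriveRules(base Rules) Rules {
	return base.Rename("root", "expr").Merge(Rules{
		"root": {{Pattern: `.`, Type: Text, Mutator: nil}},
	})
}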

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}
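
// exampleLexer is an illustrative sketch (not part of the original file): a
// minimal lexer with a single root state, built with deferred rules. The
// name, glob, and patterns are all hypothetical.
var exampleLexer = MustNewLexer(
	&Config{
		Name:      "example",
		Filenames: []string{"*.example"},
	},
	func() Rules {
		return Rules{
			"root": {
				{`\s+`, Whitespace, nil},
				{`.`, Text, nil},
			},
		}
	},
)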

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps a state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
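
// exampleMutatorContext is an illustrative sketch (not part of the original
// file): a mutator can stash per-lex state, e.g. a heredoc delimiter captured
// by a match group. The key name and group index are hypothetical.
func exampleMutatorContext(l *LexerState) {
	l.Set("delimiter", l.Groups[1])
	_ = l.Get("delimiter") // retrieved later by a companion mutator
}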

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules returns the rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText returns the analyser's score for the text, or 0.0 if no analyser is set.
func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}
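
// exampleSetAnalyser is an illustrative sketch (not part of the original
// file): attach a simple content scorer for lexer auto-detection. The
// shebang heuristic here is hypothetical.
func exampleSetAnalyser(r *RegexLexer) Lexer {
	return r.SetAnalyser(func(text string) float32 {
		if strings.HasPrefix(text, "#!") {
			return 1.0
		}
		return 0.0
	})
}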

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}
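
// exampleCompiledPattern is an illustrative sketch (not part of the original
// file) of the shape maybeCompile builds: the flags prefix the non-capturing
// group, and \G anchors matching at the current scan position. The `\d+`
// pattern and "im" flags are hypothetical.
func exampleCompiledPattern() (*regexp2.Regexp, error) {
	return regexp2.Compile(`\G(?im)(?:\d+)`, regexp2.RE2)
}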

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise text using this lexer, returning an iterator over its tokens.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
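
// exampleTokeniseMethod is an illustrative sketch (not part of the original
// file): drive the returned iterator until the EOF sentinel. The start state
// and input are hypothetical.
func exampleTokeniseMethod(r *RegexLexer) error {
	it, err := r.Tokenise(&TokeniseOptions{State: "root"}, "input text\n")
	if err != nil {
		return err
	}
	for t := it(); t != EOF; t = it() {
		fmt.Printf("%s: %q\n", t.Type, t.Value)
	}
	return nil
}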

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

// matchRules returns the index, rule, positional groups and named groups of
// the first rule matching at pos, or nil groups if no rule matches there.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r\n and lone \r with \n in a single pass.
// Equivalent to successive strings.ReplaceAll calls, but with a single allocation.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}
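
// exampleEnsureLF is an illustrative sketch (not part of the original file):
// both CRLF and lone CR collapse to LF.
func exampleEnsureLF() string {
	return ensureLF("a\r\nb\rc") // "a\nb\nc"
}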