Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/parser.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 49.1 KB

Line
1	package syntax
2
3	import (
4	"fmt"
5	"math"
6	"os"
7	"sort"
8	"strconv"
9	"unicode"
10	)
11
12	type RegexOptions int32
13
14	const (
15	IgnoreCase RegexOptions = 0x0001 // "i"
16	Multiline = 0x0002 // "m"
17	ExplicitCapture = 0x0004 // "n"
18	Compiled = 0x0008 // "c"
19	Singleline = 0x0010 // "s"
20	IgnorePatternWhitespace = 0x0020 // "x"
21	RightToLeft = 0x0040 // "r"
22	Debug = 0x0080 // "d"
23	ECMAScript = 0x0100 // "e"
24	RE2 = 0x0200 // RE2 compat mode
25	)
26
27	func optionFromCode(ch rune) RegexOptions {
28	// case-insensitive
29	switch ch {
30	case 'i', 'I':
31	return IgnoreCase
32	case 'r', 'R':
33	return RightToLeft
34	case 'm', 'M':
35	return Multiline
36	case 'n', 'N':
37	return ExplicitCapture
38	case 's', 'S':
39	return Singleline
40	case 'x', 'X':
41	return IgnorePatternWhitespace
42	case 'd', 'D':
43	return Debug
44	case 'e', 'E':
45	return ECMAScript
46	default:
47	return 0
48	}
49	}
50
51	// An Error describes a failure to parse a regular expression
52	// and gives the offending expression.
53	type Error struct {
54	Code ErrorCode
55	Expr string
56	Args []interface{}
57	}
58
59	func (e *Error) Error() string {
60	if len(e.Args) == 0 {
61	return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
62	}
63	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
64	}
65
66	// An ErrorCode describes a failure to parse a regular expression.
67	type ErrorCode string
68
69	const (
70	// internal issue
71	ErrInternalError ErrorCode = "regexp/syntax: internal error"
72	// Parser errors
73	ErrUnterminatedComment = "unterminated comment"
74	ErrInvalidCharRange = "invalid character class range"
75	ErrInvalidRepeatSize = "invalid repeat count"
76	ErrInvalidUTF8 = "invalid UTF-8"
77	ErrCaptureGroupOutOfRange = "capture group number out of range"
78	ErrUnexpectedParen = "unexpected )"
79	ErrMissingParen = "missing closing )"
80	ErrMissingBrace = "missing closing }"
81	ErrInvalidRepeatOp = "invalid nested repetition operator"
82	ErrMissingRepeatArgument = "missing argument to repetition operator"
83	ErrConditionalExpression = "illegal conditional (?(...)) expression"
84	ErrTooManyAlternates = "too many \| in (?()\|)"
85	ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
86	ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
87	ErrCapNumNotZero = "capture number cannot be zero"
88	ErrUndefinedBackRef = "reference to undefined group number %v"
89	ErrUndefinedNameRef = "reference to undefined group name %v"
90	ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
91	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
92	ErrMalformedReference = "(?(%v) ) malformed"
93	ErrUndefinedReference = "(?(%v) ) reference to undefined group"
94	ErrIllegalEndEscape = "illegal \\ at end of pattern"
95	ErrMalformedSlashP = "malformed \\p{X} character escape"
96	ErrIncompleteSlashP = "incomplete \\p{X} character escape"
97	ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
98	ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
99	ErrMissingControl = "missing control character"
100	ErrUnrecognizedControl = "unrecognized control character"
101	ErrTooFewHex = "insufficient hexadecimal digits"
102	ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
103	ErrMalformedNameRef = "malformed \\k<...> named back reference"
104	ErrBadClassInCharRange = "cannot include class \\%v in character range"
105	ErrUnterminatedBracket = "unterminated [] set"
106	ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
107	ErrReversedCharRange = "[x-y] range in reverse order"
108	)
109
110	func (e ErrorCode) String() string {
111	return string(e)
112	}
113
114	type parser struct {
115	stack *regexNode
116	group *regexNode
117	alternation *regexNode
118	concatenation *regexNode
119	unit *regexNode
120
121	patternRaw string
122	pattern []rune
123
124	currentPos int
125	specialCase *unicode.SpecialCase
126
127	autocap int
128	capcount int
129	captop int
130	capsize int
131
132	caps map[int]int
133	capnames map[string]int
134
135	capnumlist []int
136	capnamelist []string
137
138	options RegexOptions
139	optionsStack []RegexOptions
140	ignoreNextParen bool
141	}
142
143	const (
144	maxValueDiv10 int = math.MaxInt32 / 10
145	maxValueMod10 = math.MaxInt32 % 10
146	)
147
148	// Parse converts a regex string into a parse tree
149	func Parse(re string, op RegexOptions) (*RegexTree, error) {
150	p := parser{
151	options: op,
152	caps: make(map[int]int),
153	}
154	p.setPattern(re)
155
156	if err := p.countCaptures(); err != nil {
157	return nil, err
158	}
159
160	p.reset(op)
161	root, err := p.scanRegex()
162
163	if err != nil {
164	return nil, err
165	}
166	tree := &RegexTree{
167	root: root,
168	caps: p.caps,
169	capnumlist: p.capnumlist,
170	captop: p.captop,
171	Capnames: p.capnames,
172	Caplist: p.capnamelist,
173	options: op,
174	}
175
176	if tree.options&Debug > 0 {
177	os.Stdout.WriteString(tree.Dump())
178	}
179
180	return tree, nil
181	}
182
183	func (p *parser) setPattern(pattern string) {
184	p.patternRaw = pattern
185	p.pattern = make([]rune, 0, len(pattern))
186
187	//populate our rune array to handle utf8 encoding
188	for _, r := range pattern {
189	p.pattern = append(p.pattern, r)
190	}
191	}
192	func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
193	return &Error{Code: code, Expr: p.patternRaw, Args: args}
194	}
195
196	func (p *parser) noteCaptureSlot(i, pos int) {
197	if _, ok := p.caps[i]; !ok {
198	// the rhs of the hashtable isn't used in the parser
199	p.caps[i] = pos
200	p.capcount++
201
202	if p.captop <= i {
203	if i == math.MaxInt32 {
204	p.captop = i
205	} else {
206	p.captop = i + 1
207	}
208	}
209	}
210	}
211
212	func (p *parser) noteCaptureName(name string, pos int) {
213	if p.capnames == nil {
214	p.capnames = make(map[string]int)
215	}
216
217	if _, ok := p.capnames[name]; !ok {
218	p.capnames[name] = pos
219	p.capnamelist = append(p.capnamelist, name)
220	}
221	}
222
223	func (p *parser) assignNameSlots() {
224	if p.capnames != nil {
225	for _, name := range p.capnamelist {
226	for p.isCaptureSlot(p.autocap) {
227	p.autocap++
228	}
229	pos := p.capnames[name]
230	p.capnames[name] = p.autocap
231	p.noteCaptureSlot(p.autocap, pos)
232
233	p.autocap++
234	}
235	}
236
237	// if the caps array has at least one gap, construct the list of used slots
238	if p.capcount < p.captop {
239	p.capnumlist = make([]int, p.capcount)
240	i := 0
241
242	for k := range p.caps {
243	p.capnumlist[i] = k
244	i++
245	}
246
247	sort.Ints(p.capnumlist)
248	}
249
250	// merge capsnumlist into capnamelist
251	if p.capnames != nil \|\| p.capnumlist != nil {
252	var oldcapnamelist []string
253	var next int
254	var k int
255
256	if p.capnames == nil {
257	oldcapnamelist = nil
258	p.capnames = make(map[string]int)
259	p.capnamelist = []string{}
260	next = -1
261	} else {
262	oldcapnamelist = p.capnamelist
263	p.capnamelist = []string{}
264	next = p.capnames[oldcapnamelist[0]]
265	}
266
267	for i := 0; i < p.capcount; i++ {
268	j := i
269	if p.capnumlist != nil {
270	j = p.capnumlist[i]
271	}
272
273	if next == j {
274	p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
275	k++
276
277	if k == len(oldcapnamelist) {
278	next = -1
279	} else {
280	next = p.capnames[oldcapnamelist[k]]
281	}
282
283	} else {
284	//feature: culture?
285	str := strconv.Itoa(j)
286	p.capnamelist = append(p.capnamelist, str)
287	p.capnames[str] = j
288	}
289	}
290	}
291	}
292
293	func (p *parser) consumeAutocap() int {
294	r := p.autocap
295	p.autocap++
296	return r
297	}
298
299	// CountCaptures is a prescanner for deducing the slots used for
300	// captures by doing a partial tokenization of the pattern.
301	func (p *parser) countCaptures() error {
302	var ch rune
303
304	p.noteCaptureSlot(0, 0)
305
306	p.autocap = 1
307
308	for p.charsRight() > 0 {
309	pos := p.textpos()
310	ch = p.moveRightGetChar()
311	switch ch {
312	case '\\':
313	if p.charsRight() > 0 {
314	p.scanBackslash(true)
315	}
316
317	case '#':
318	if p.useOptionX() {
319	p.moveLeft()
320	p.scanBlank()
321	}
322
323	case '[':
324	p.scanCharSet(false, true)
325
326	case ')':
327	if !p.emptyOptionsStack() {
328	p.popOptions()
329	}
330
331	case '(':
332	if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
333	p.moveLeft()
334	p.scanBlank()
335	} else {
336	p.pushOptions()
337	if p.charsRight() > 0 && p.rightChar(0) == '?' {
338	// we have (?...
339	p.moveRight(1)
340
341	if p.charsRight() > 1 && (p.rightChar(0) == '<' \|\| p.rightChar(0) == '\'') {
342	// named group: (?<... or (?'...
343
344	p.moveRight(1)
345	ch = p.rightChar(0)
346
347	if ch != '0' && IsWordChar(ch) {
348	if ch >= '1' && ch <= '9' {
349	dec, err := p.scanDecimal()
350	if err != nil {
351	return err
352	}
353	p.noteCaptureSlot(dec, pos)
354	} else {
355	p.noteCaptureName(p.scanCapname(), pos)
356	}
357	}
358	} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
359	// RE2-compat (?P<)
360	p.moveRight(2)
361	ch = p.rightChar(0)
362	if IsWordChar(ch) {
363	p.noteCaptureName(p.scanCapname(), pos)
364	}
365
366	} else {
367	// (?...
368
369	// get the options if it's an option construct (?cimsx-cimsx...)
370	p.scanOptions()
371
372	if p.charsRight() > 0 {
373	if p.rightChar(0) == ')' {
374	// (?cimsx-cimsx)
375	p.moveRight(1)
376	p.popKeepOptions()
377	} else if p.rightChar(0) == '(' {
378	// alternation construct: (?(foo)yes\|no)
379	// ignore the next paren so we don't capture the condition
380	p.ignoreNextParen = true
381
382	// break from here so we don't reset ignoreNextParen
383	continue
384	}
385	}
386	}
387	} else {
388	if !p.useOptionN() && !p.ignoreNextParen {
389	p.noteCaptureSlot(p.consumeAutocap(), pos)
390	}
391	}
392	}
393
394	p.ignoreNextParen = false
395
396	}
397	}
398
399	p.assignNameSlots()
400	return nil
401	}
402
403	func (p *parser) reset(topopts RegexOptions) {
404	p.currentPos = 0
405	p.autocap = 1
406	p.ignoreNextParen = false
407
408	if len(p.optionsStack) > 0 {
409	p.optionsStack = p.optionsStack[:0]
410	}
411
412	p.options = topopts
413	p.stack = nil
414	}
415
416	func (p parser) scanRegex() (regexNode, error) {
417	ch := '@' // nonspecial ch, means at beginning
418	isQuant := false
419
420	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
421
422	for p.charsRight() > 0 {
423	wasPrevQuantifier := isQuant
424	isQuant = false
425
426	if err := p.scanBlank(); err != nil {
427	return nil, err
428	}
429
430	startpos := p.textpos()
431
432	// move past all of the normal characters. We'll stop when we hit some kind of control character,
433	// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
434	if p.useOptionX() {
435	for p.charsRight() > 0 {
436	ch = p.rightChar(0)
437	//UGLY: clean up, this is ugly
438	if !(!isStopperX(ch) \|\| (ch == '{' && !p.isTrueQuantifier())) {
439	break
440	}
441	p.moveRight(1)
442	}
443	} else {
444	for p.charsRight() > 0 {
445	ch = p.rightChar(0)
446	if !(!isSpecial(ch) \|\| ch == '{' && !p.isTrueQuantifier()) {
447	break
448	}
449	p.moveRight(1)
450	}
451	}
452
453	endpos := p.textpos()
454
455	p.scanBlank()
456
457	if p.charsRight() == 0 {
458	ch = '!' // nonspecial, means at end
459	} else if ch = p.rightChar(0); isSpecial(ch) {
460	isQuant = isQuantifier(ch)
461	p.moveRight(1)
462	} else {
463	ch = ' ' // nonspecial, means at ordinary char
464	}
465
466	if startpos < endpos {
467	cchUnquantified := endpos - startpos
468	if isQuant {
469	cchUnquantified--
470	}
471	wasPrevQuantifier = false
472
473	if cchUnquantified > 0 {
474	p.addToConcatenate(startpos, cchUnquantified, false)
475	}
476
477	if isQuant {
478	p.addUnitOne(p.charAt(endpos - 1))
479	}
480	}
481
482	switch ch {
483	case '!':
484	goto BreakOuterScan
485
486	case ' ':
487	goto ContinueOuterScan
488
489	case '[':
490	cc, err := p.scanCharSet(p.useOptionI(), false)
491	if err != nil {
492	return nil, err
493	}
494	p.addUnitSet(cc)
495
496	case '(':
497	p.pushOptions()
498
499	if grouper, err := p.scanGroupOpen(); err != nil {
500	return nil, err
501	} else if grouper == nil {
502	p.popKeepOptions()
503	} else {
504	p.pushGroup()
505	p.startGroup(grouper)
506	}
507
508	continue
509
510	case '\|':
511	p.addAlternate()
512	goto ContinueOuterScan
513
514	case ')':
515	if p.emptyStack() {
516	return nil, p.getErr(ErrUnexpectedParen)
517	}
518
519	if err := p.addGroup(); err != nil {
520	return nil, err
521	}
522	if err := p.popGroup(); err != nil {
523	return nil, err
524	}
525	p.popOptions()
526
527	if p.unit == nil {
528	goto ContinueOuterScan
529	}
530
531	case '\\':
532	n, err := p.scanBackslash(false)
533	if err != nil {
534	return nil, err
535	}
536	p.addUnitNode(n)
537
538	case '^':
539	if p.useOptionM() {
540	p.addUnitType(ntBol)
541	} else {
542	p.addUnitType(ntBeginning)
543	}
544
545	case '$':
546	if p.useOptionM() {
547	p.addUnitType(ntEol)
548	} else {
549	p.addUnitType(ntEndZ)
550	}
551
552	case '.':
553	if p.useOptionE() {
554	p.addUnitSet(ECMAAnyClass())
555	} else if p.useOptionS() {
556	p.addUnitSet(AnyClass())
557	} else {
558	p.addUnitNotone('\n')
559	}
560
561	case '{', '*', '+', '?':
562	if p.unit == nil {
563	if wasPrevQuantifier {
564	return nil, p.getErr(ErrInvalidRepeatOp)
565	} else {
566	return nil, p.getErr(ErrMissingRepeatArgument)
567	}
568	}
569	p.moveLeft()
570
571	default:
572	return nil, p.getErr(ErrInternalError)
573	}
574
575	if err := p.scanBlank(); err != nil {
576	return nil, err
577	}
578
579	if p.charsRight() > 0 {
580	isQuant = p.isTrueQuantifier()
581	}
582	if p.charsRight() == 0 \|\| !isQuant {
583	//maintain odd C# assignment order -- not sure if required, could clean up?
584	p.addConcatenate()
585	goto ContinueOuterScan
586	}
587
588	ch = p.moveRightGetChar()
589
590	// Handle quantifiers
591	for p.unit != nil {
592	var min, max int
593	var lazy bool
594
595	switch ch {
596	case '*':
597	min = 0
598	max = math.MaxInt32
599
600	case '?':
601	min = 0
602	max = 1
603
604	case '+':
605	min = 1
606	max = math.MaxInt32
607
608	case '{':
609	{
610	var err error
611	startpos = p.textpos()
612	if min, err = p.scanDecimal(); err != nil {
613	return nil, err
614	}
615	max = min
616	if startpos < p.textpos() {
617	if p.charsRight() > 0 && p.rightChar(0) == ',' {
618	p.moveRight(1)
619	if p.charsRight() == 0 \|\| p.rightChar(0) == '}' {
620	max = math.MaxInt32
621	} else {
622	if max, err = p.scanDecimal(); err != nil {
623	return nil, err
624	}
625	}
626	}
627	}
628
629	if startpos == p.textpos() \|\| p.charsRight() == 0 \|\| p.moveRightGetChar() != '}' {
630	p.addConcatenate()
631	p.textto(startpos - 1)
632	goto ContinueOuterScan
633	}
634	}
635
636	default:
637	return nil, p.getErr(ErrInternalError)
638	}
639
640	if err := p.scanBlank(); err != nil {
641	return nil, err
642	}
643
644	if p.charsRight() == 0 \|\| p.rightChar(0) != '?' {
645	lazy = false
646	} else {
647	p.moveRight(1)
648	lazy = true
649	}
650
651	if min > max {
652	return nil, p.getErr(ErrInvalidRepeatSize)
653	}
654
655	p.addConcatenate3(lazy, min, max)
656	}
657
658	ContinueOuterScan:
659	}
660
661	BreakOuterScan:
662	;
663
664	if !p.emptyStack() {
665	return nil, p.getErr(ErrMissingParen)
666	}
667
668	if err := p.addGroup(); err != nil {
669	return nil, err
670	}
671
672	return p.unit, nil
673
674	}
675
676	/*
677	* Simple parsing for replacement patterns
678	*/
679	func (p parser) scanReplacement() (regexNode, error) {
680	var c, startpos int
681
682	p.concatenation = newRegexNode(ntConcatenate, p.options)
683
684	for {
685	c = p.charsRight()
686	if c == 0 {
687	break
688	}
689
690	startpos = p.textpos()
691
692	for c > 0 && p.rightChar(0) != '$' {
693	p.moveRight(1)
694	c--
695	}
696
697	p.addToConcatenate(startpos, p.textpos()-startpos, true)
698
699	if c > 0 {
700	if p.moveRightGetChar() == '$' {
701	n, err := p.scanDollar()
702	if err != nil {
703	return nil, err
704	}
705	p.addUnitNode(n)
706	}
707	p.addConcatenate()
708	}
709	}
710
711	return p.concatenation, nil
712	}
713
714	/*
715	* Scans $ patterns recognized within replacement patterns
716	*/
717	func (p parser) scanDollar() (regexNode, error) {
718	if p.charsRight() == 0 {
719	return newRegexNodeCh(ntOne, p.options, '$'), nil
720	}
721
722	ch := p.rightChar(0)
723	angled := false
724	backpos := p.textpos()
725	lastEndPos := backpos
726
727	// Note angle
728
729	if ch == '{' && p.charsRight() > 1 {
730	angled = true
731	p.moveRight(1)
732	ch = p.rightChar(0)
733	}
734
735	// Try to parse backreference: \1 or \{1} or \{cap}
736
737	if ch >= '0' && ch <= '9' {
738	if !angled && p.useOptionE() {
739	capnum := -1
740	newcapnum := int(ch - '0')
741	p.moveRight(1)
742	if p.isCaptureSlot(newcapnum) {
743	capnum = newcapnum
744	lastEndPos = p.textpos()
745	}
746
747	for p.charsRight() > 0 {
748	ch = p.rightChar(0)
749	if ch < '0' \|\| ch > '9' {
750	break
751	}
752	digit := int(ch - '0')
753	if newcapnum > maxValueDiv10 \|\| (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
754	return nil, p.getErr(ErrCaptureGroupOutOfRange)
755	}
756
757	newcapnum = newcapnum*10 + digit
758
759	p.moveRight(1)
760	if p.isCaptureSlot(newcapnum) {
761	capnum = newcapnum
762	lastEndPos = p.textpos()
763	}
764	}
765	p.textto(lastEndPos)
766	if capnum >= 0 {
767	return newRegexNodeM(ntRef, p.options, capnum), nil
768	}
769	} else {
770	capnum, err := p.scanDecimal()
771	if err != nil {
772	return nil, err
773	}
774	if !angled \|\| p.charsRight() > 0 && p.moveRightGetChar() == '}' {
775	if p.isCaptureSlot(capnum) {
776	return newRegexNodeM(ntRef, p.options, capnum), nil
777	}
778	}
779	}
780	} else if angled && IsWordChar(ch) {
781	capname := p.scanCapname()
782
783	if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
784	if p.isCaptureName(capname) {
785	return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
786	}
787	}
788	} else if !angled {
789	capnum := 1
790
791	switch ch {
792	case '$':
793	p.moveRight(1)
794	return newRegexNodeCh(ntOne, p.options, '$'), nil
795	case '&':
796	capnum = 0
797	case '`':
798	capnum = replaceLeftPortion
799	case '\'':
800	capnum = replaceRightPortion
801	case '+':
802	capnum = replaceLastGroup
803	case '_':
804	capnum = replaceWholeString
805	}
806
807	if capnum != 1 {
808	p.moveRight(1)
809	return newRegexNodeM(ntRef, p.options, capnum), nil
810	}
811	}
812
813	// unrecognized $: literalize
814
815	p.textto(backpos)
816	return newRegexNodeCh(ntOne, p.options, '$'), nil
817	}
818
819	// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
820	// a RegexNode for the type of group scanned, or nil if the group
821	// simply changed options (?cimsx-cimsx) or was a comment (#...).
822	func (p parser) scanGroupOpen() (regexNode, error) {
823	var ch rune
824	var nt nodeType
825	var err error
826	close := '>'
827	start := p.textpos()
828
829	// just return a RegexNode if we have:
830	// 1. "(" followed by nothing
831	// 2. "(x" where x != ?
832	// 3. "(?)"
833	if p.charsRight() == 0 \|\| p.rightChar(0) != '?' \|\| (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
834	if p.useOptionN() \|\| p.ignoreNextParen {
835	p.ignoreNextParen = false
836	return newRegexNode(ntGroup, p.options), nil
837	}
838	return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
839	}
840
841	p.moveRight(1)
842
843	for {
844	if p.charsRight() == 0 {
845	break
846	}
847
848	switch ch = p.moveRightGetChar(); ch {
849	case ':':
850	nt = ntGroup
851
852	case '=':
853	p.options &= ^RightToLeft
854	nt = ntRequire
855
856	case '!':
857	p.options &= ^RightToLeft
858	nt = ntPrevent
859
860	case '>':
861	nt = ntGreedy
862
863	case '\'':
864	close = '\''
865	fallthrough
866
867	case '<':
868	if p.charsRight() == 0 {
869	goto BreakRecognize
870	}
871
872	switch ch = p.moveRightGetChar(); ch {
873	case '=':
874	if close == '\'' {
875	goto BreakRecognize
876	}
877
878	p.options \|= RightToLeft
879	nt = ntRequire
880
881	case '!':
882	if close == '\'' {
883	goto BreakRecognize
884	}
885
886	p.options \|= RightToLeft
887	nt = ntPrevent
888
889	default:
890	p.moveLeft()
891	capnum := -1
892	uncapnum := -1
893	proceed := false
894
895	// grab part before -
896
897	if ch >= '0' && ch <= '9' {
898	if capnum, err = p.scanDecimal(); err != nil {
899	return nil, err
900	}
901
902	if !p.isCaptureSlot(capnum) {
903	capnum = -1
904	}
905
906	// check if we have bogus characters after the number
907	if p.charsRight() > 0 && !(p.rightChar(0) == close \|\| p.rightChar(0) == '-') {
908	return nil, p.getErr(ErrInvalidGroupName)
909	}
910	if capnum == 0 {
911	return nil, p.getErr(ErrCapNumNotZero)
912	}
913	} else if IsWordChar(ch) {
914	capname := p.scanCapname()
915
916	if p.isCaptureName(capname) {
917	capnum = p.captureSlotFromName(capname)
918	}
919
920	// check if we have bogus character after the name
921	if p.charsRight() > 0 && !(p.rightChar(0) == close \|\| p.rightChar(0) == '-') {
922	return nil, p.getErr(ErrInvalidGroupName)
923	}
924	} else if ch == '-' {
925	proceed = true
926	} else {
927	// bad group name - starts with something other than a word character and isn't a number
928	return nil, p.getErr(ErrInvalidGroupName)
929	}
930
931	// grab part after - if any
932
933	if (capnum != -1 \|\| proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
934	p.moveRight(1)
935
936	//no more chars left, no closing char, etc
937	if p.charsRight() == 0 {
938	return nil, p.getErr(ErrInvalidGroupName)
939	}
940
941	ch = p.rightChar(0)
942	if ch >= '0' && ch <= '9' {
943	if uncapnum, err = p.scanDecimal(); err != nil {
944	return nil, err
945	}
946
947	if !p.isCaptureSlot(uncapnum) {
948	return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
949	}
950
951	// check if we have bogus characters after the number
952	if p.charsRight() > 0 && p.rightChar(0) != close {
953	return nil, p.getErr(ErrInvalidGroupName)
954	}
955	} else if IsWordChar(ch) {
956	uncapname := p.scanCapname()
957
958	if !p.isCaptureName(uncapname) {
959	return nil, p.getErr(ErrUndefinedNameRef, uncapname)
960	}
961	uncapnum = p.captureSlotFromName(uncapname)
962
963	// check if we have bogus character after the name
964	if p.charsRight() > 0 && p.rightChar(0) != close {
965	return nil, p.getErr(ErrInvalidGroupName)
966	}
967	} else {
968	// bad group name - starts with something other than a word character and isn't a number
969	return nil, p.getErr(ErrInvalidGroupName)
970	}
971	}
972
973	// actually make the node
974
975	if (capnum != -1 \|\| uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
976	return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
977	}
978	goto BreakRecognize
979	}
980
981	case '(':
982	// alternation construct (?(...) \| )
983
984	parenPos := p.textpos()
985	if p.charsRight() > 0 {
986	ch = p.rightChar(0)
987
988	// check if the alternation condition is a backref
989	if ch >= '0' && ch <= '9' {
990	var capnum int
991	if capnum, err = p.scanDecimal(); err != nil {
992	return nil, err
993	}
994	if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
995	if p.isCaptureSlot(capnum) {
996	return newRegexNodeM(ntTestref, p.options, capnum), nil
997	}
998	return nil, p.getErr(ErrUndefinedReference, capnum)
999	}
1000
1001	return nil, p.getErr(ErrMalformedReference, capnum)
1002
1003	} else if IsWordChar(ch) {
1004	capname := p.scanCapname()
1005
1006	if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1007	return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1008	}
1009	}
1010	}
1011	// not a backref
1012	nt = ntTestgroup
1013	p.textto(parenPos - 1) // jump to the start of the parentheses
1014	p.ignoreNextParen = true // but make sure we don't try to capture the insides
1015
1016	charsRight := p.charsRight()
1017	if charsRight >= 3 && p.rightChar(1) == '?' {
1018	rightchar2 := p.rightChar(2)
1019	// disallow comments in the condition
1020	if rightchar2 == '#' {
1021	return nil, p.getErr(ErrAlternationCantHaveComment)
1022	}
1023
1024	// disallow named capture group (?<..>..) in the condition
1025	if rightchar2 == '\'' {
1026	return nil, p.getErr(ErrAlternationCantCapture)
1027	}
1028
1029	if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1030	return nil, p.getErr(ErrAlternationCantCapture)
1031	}
1032	}
1033
1034	case 'P':
1035	if p.useRE2() {
1036	// support for P<name> syntax
1037	if p.charsRight() < 3 {
1038	goto BreakRecognize
1039	}
1040
1041	ch = p.moveRightGetChar()
1042	if ch != '<' {
1043	goto BreakRecognize
1044	}
1045
1046	ch = p.moveRightGetChar()
1047	p.moveLeft()
1048
1049	if IsWordChar(ch) {
1050	capnum := -1
1051	capname := p.scanCapname()
1052
1053	if p.isCaptureName(capname) {
1054	capnum = p.captureSlotFromName(capname)
1055	}
1056
1057	// check if we have bogus character after the name
1058	if p.charsRight() > 0 && p.rightChar(0) != '>' {
1059	return nil, p.getErr(ErrInvalidGroupName)
1060	}
1061
1062	// actually make the node
1063
1064	if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1065	return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1066	}
1067	goto BreakRecognize
1068
1069	} else {
1070	// bad group name - starts with something other than a word character and isn't a number
1071	return nil, p.getErr(ErrInvalidGroupName)
1072	}
1073	}
1074	// if we're not using RE2 compat mode then
1075	// we just behave like normal
1076	fallthrough
1077
1078	default:
1079	p.moveLeft()
1080
1081	nt = ntGroup
1082	// disallow options in the children of a testgroup node
1083	if p.group.t != ntTestgroup {
1084	p.scanOptions()
1085	}
1086	if p.charsRight() == 0 {
1087	goto BreakRecognize
1088	}
1089
1090	if ch = p.moveRightGetChar(); ch == ')' {
1091	return nil, nil
1092	}
1093
1094	if ch != ':' {
1095	goto BreakRecognize
1096	}
1097
1098	}
1099
1100	return newRegexNode(nt, p.options), nil
1101	}
1102
1103	BreakRecognize:
1104
1105	// break Recognize comes here
1106
1107	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1108	}
1109
1110	// scans backslash specials and basics
1111	func (p parser) scanBackslash(scanOnly bool) (regexNode, error) {
1112
1113	if p.charsRight() == 0 {
1114	return nil, p.getErr(ErrIllegalEndEscape)
1115	}
1116
1117	switch ch := p.rightChar(0); ch {
1118	case 'b', 'B', 'A', 'G', 'Z', 'z':
1119	p.moveRight(1)
1120	return newRegexNode(p.typeFromCode(ch), p.options), nil
1121
1122	case 'w':
1123	p.moveRight(1)
1124	if p.useOptionE() {
1125	return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1126	}
1127	return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1128
1129	case 'W':
1130	p.moveRight(1)
1131	if p.useOptionE() {
1132	return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1133	}
1134	return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1135
1136	case 's':
1137	p.moveRight(1)
1138	if p.useOptionE() {
1139	return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140	}
1141	return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1142
1143	case 'S':
1144	p.moveRight(1)
1145	if p.useOptionE() {
1146	return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1147	}
1148	return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1149
1150	case 'd':
1151	p.moveRight(1)
1152	if p.useOptionE() {
1153	return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1154	}
1155	return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1156
1157	case 'D':
1158	p.moveRight(1)
1159	if p.useOptionE() {
1160	return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1161	}
1162	return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1163
1164	case 'p', 'P':
1165	p.moveRight(1)
1166	prop, err := p.parseProperty()
1167	if err != nil {
1168	return nil, err
1169	}
1170	cc := &CharSet{}
1171	cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1172	if p.useOptionI() {
1173	cc.addLowercase()
1174	}
1175
1176	return newRegexNodeSet(ntSet, p.options, cc), nil
1177
1178	default:
1179	return p.scanBasicBackslash(scanOnly)
1180	}
1181	}
1182
1183	// Scans \-style backreferences and character escapes
1184	func (p parser) scanBasicBackslash(scanOnly bool) (regexNode, error) {
1185	if p.charsRight() == 0 {
1186	return nil, p.getErr(ErrIllegalEndEscape)
1187	}
1188	angled := false
1189	close := '\x00'
1190
1191	backpos := p.textpos()
1192	ch := p.rightChar(0)
1193
1194	// allow \k<foo> instead of \<foo>, which is now deprecated
1195
1196	if ch == 'k' {
1197	if p.charsRight() >= 2 {
1198	p.moveRight(1)
1199	ch = p.moveRightGetChar()
1200
1201	if ch == '<' \|\| ch == '\'' {
1202	angled = true
1203	if ch == '\'' {
1204	close = '\''
1205	} else {
1206	close = '>'
1207	}
1208	}
1209	}
1210
1211	if !angled \|\| p.charsRight() <= 0 {
1212	return nil, p.getErr(ErrMalformedNameRef)
1213	}
1214
1215	ch = p.rightChar(0)
1216
1217	} else if (ch == '<' \|\| ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1218	angled = true
1219	if ch == '\'' {
1220	close = '\''
1221	} else {
1222	close = '>'
1223	}
1224
1225	p.moveRight(1)
1226	ch = p.rightChar(0)
1227	}
1228
1229	// Try to parse backreference: \<1> or \<cap>
1230
1231	if angled && ch >= '0' && ch <= '9' {
1232	capnum, err := p.scanDecimal()
1233	if err != nil {
1234	return nil, err
1235	}
1236
1237	if p.charsRight() > 0 && p.moveRightGetChar() == close {
1238	if p.isCaptureSlot(capnum) {
1239	return newRegexNodeM(ntRef, p.options, capnum), nil
1240	}
1241	return nil, p.getErr(ErrUndefinedBackRef, capnum)
1242	}
1243	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1244	capnum, err := p.scanDecimal()
1245	if err != nil {
1246	return nil, err
1247	}
1248
1249	if scanOnly {
1250	return nil, nil
1251	}
1252
1253	if p.isCaptureSlot(capnum) {
1254	return newRegexNodeM(ntRef, p.options, capnum), nil
1255	}
1256	if capnum <= 9 && !p.useOptionE() {
1257	return nil, p.getErr(ErrUndefinedBackRef, capnum)
1258	}
1259
1260	} else if angled && IsWordChar(ch) {
1261	capname := p.scanCapname()
1262
1263	if p.charsRight() > 0 && p.moveRightGetChar() == close {
1264	if p.isCaptureName(capname) {
1265	return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1266	}
1267	return nil, p.getErr(ErrUndefinedNameRef, capname)
1268	}
1269	}
1270
1271	// Not backreference: must be char code
1272
1273	p.textto(backpos)
1274	ch, err := p.scanCharEscape()
1275	if err != nil {
1276	return nil, err
1277	}
1278
1279	if p.useOptionI() {
1280	ch = unicode.ToLower(ch)
1281	}
1282
1283	return newRegexNodeCh(ntOne, p.options, ch), nil
1284	}
1285
1286	// Scans X for \p{X} or \P{X}
1287	func (p *parser) parseProperty() (string, error) {
1288	if p.charsRight() < 3 {
1289	return "", p.getErr(ErrIncompleteSlashP)
1290	}
1291	ch := p.moveRightGetChar()
1292	if ch != '{' {
1293	return "", p.getErr(ErrMalformedSlashP)
1294	}
1295
1296	startpos := p.textpos()
1297	for p.charsRight() > 0 {
1298	ch = p.moveRightGetChar()
1299	if !(IsWordChar(ch) \|\| ch == '-') {
1300	p.moveLeft()
1301	break
1302	}
1303	}
1304	capname := string(p.pattern[startpos:p.textpos()])
1305
1306	if p.charsRight() == 0 \|\| p.moveRightGetChar() != '}' {
1307	return "", p.getErr(ErrIncompleteSlashP)
1308	}
1309
1310	if !isValidUnicodeCat(capname) {
1311	return "", p.getErr(ErrUnknownSlashP, capname)
1312	}
1313
1314	return capname, nil
1315	}
1316
1317	// Returns ReNode type for zero-length assertions with a \ code.
1318	func (p *parser) typeFromCode(ch rune) nodeType {
1319	switch ch {
1320	case 'b':
1321	if p.useOptionE() {
1322	return ntECMABoundary
1323	}
1324	return ntBoundary
1325	case 'B':
1326	if p.useOptionE() {
1327	return ntNonECMABoundary
1328	}
1329	return ntNonboundary
1330	case 'A':
1331	return ntBeginning
1332	case 'G':
1333	return ntStart
1334	case 'Z':
1335	return ntEndZ
1336	case 'z':
1337	return ntEnd
1338	default:
1339	return ntNothing
1340	}
1341	}
1342
1343	// Scans whitespace or x-mode comments.
1344	func (p *parser) scanBlank() error {
1345	if p.useOptionX() {
1346	for {
1347	for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1348	p.moveRight(1)
1349	}
1350
1351	if p.charsRight() == 0 {
1352	break
1353	}
1354
1355	if p.rightChar(0) == '#' {
1356	for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1357	p.moveRight(1)
1358	}
1359	} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1360	p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1361	for p.charsRight() > 0 && p.rightChar(0) != ')' {
1362	p.moveRight(1)
1363	}
1364	if p.charsRight() == 0 {
1365	return p.getErr(ErrUnterminatedComment)
1366	}
1367	p.moveRight(1)
1368	} else {
1369	break
1370	}
1371	}
1372	} else {
1373	for {
1374	if p.charsRight() < 3 \|\| p.rightChar(2) != '#' \|\|
1375	p.rightChar(1) != '?' \|\| p.rightChar(0) != '(' {
1376	return nil
1377	}
1378
1379	for p.charsRight() > 0 && p.rightChar(0) != ')' {
1380	p.moveRight(1)
1381	}
1382	if p.charsRight() == 0 {
1383	return p.getErr(ErrUnterminatedComment)
1384	}
1385	p.moveRight(1)
1386	}
1387	}
1388	return nil
1389	}
1390
1391	func (p *parser) scanCapname() string {
1392	startpos := p.textpos()
1393
1394	for p.charsRight() > 0 {
1395	if !IsWordChar(p.moveRightGetChar()) {
1396	p.moveLeft()
1397	break
1398	}
1399	}
1400
1401	return string(p.pattern[startpos:p.textpos()])
1402	}
1403
1404	//Scans contents of [] (not including []'s), and converts to a set.
1405	func (p parser) scanCharSet(caseInsensitive, scanOnly bool) (CharSet, error) {
1406	ch := '\x00'
1407	chPrev := '\x00'
1408	inRange := false
1409	firstChar := true
1410	closed := false
1411
1412	var cc *CharSet
1413	if !scanOnly {
1414	cc = &CharSet{}
1415	}
1416
1417	if p.charsRight() > 0 && p.rightChar(0) == '^' {
1418	p.moveRight(1)
1419	if !scanOnly {
1420	cc.negate = true
1421	}
1422	}
1423
1424	for ; p.charsRight() > 0; firstChar = false {
1425	fTranslatedChar := false
1426	ch = p.moveRightGetChar()
1427	if ch == ']' {
1428	if !firstChar {
1429	closed = true
1430	break
1431	} else if p.useOptionE() {
1432	if !scanOnly {
1433	cc.addRanges(NoneClass().ranges)
1434	}
1435	closed = true
1436	break
1437	}
1438
1439	} else if ch == '\\' && p.charsRight() > 0 {
1440	switch ch = p.moveRightGetChar(); ch {
1441	case 'D', 'd':
1442	if !scanOnly {
1443	if inRange {
1444	return nil, p.getErr(ErrBadClassInCharRange, ch)
1445	}
1446	cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1447	}
1448	continue
1449
1450	case 'S', 's':
1451	if !scanOnly {
1452	if inRange {
1453	return nil, p.getErr(ErrBadClassInCharRange, ch)
1454	}
1455	cc.addSpace(p.useOptionE(), ch == 'S')
1456	}
1457	continue
1458
1459	case 'W', 'w':
1460	if !scanOnly {
1461	if inRange {
1462	return nil, p.getErr(ErrBadClassInCharRange, ch)
1463	}
1464
1465	cc.addWord(p.useOptionE(), ch == 'W')
1466	}
1467	continue
1468
1469	case 'p', 'P':
1470	if !scanOnly {
1471	if inRange {
1472	return nil, p.getErr(ErrBadClassInCharRange, ch)
1473	}
1474	prop, err := p.parseProperty()
1475	if err != nil {
1476	return nil, err
1477	}
1478	cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1479	} else {
1480	p.parseProperty()
1481	}
1482
1483	continue
1484
1485	case '-':
1486	if !scanOnly {
1487	cc.addRange(ch, ch)
1488	}
1489	continue
1490
1491	default:
1492	p.moveLeft()
1493	var err error
1494	ch, err = p.scanCharEscape() // non-literal character
1495	if err != nil {
1496	return nil, err
1497	}
1498	fTranslatedChar = true
1499	break // this break will only break out of the switch
1500	}
1501	} else if ch == '[' {
1502	// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1503	// It currently doesn't do anything other than skip the whole thing!
1504	if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1505	savePos := p.textpos()
1506
1507	p.moveRight(1)
1508	negate := false
1509	if p.charsRight() > 1 && p.rightChar(0) == '^' {
1510	negate = true
1511	p.moveRight(1)
1512	}
1513
1514	nm := p.scanCapname() // snag the name
1515	if !scanOnly && p.useRE2() {
1516	// look up the name since these are valid for RE2
1517	// add the group based on the name
1518	if ok := cc.addNamedASCII(nm, negate); !ok {
1519	return nil, p.getErr(ErrInvalidCharRange)
1520	}
1521	}
1522	if p.charsRight() < 2 \|\| p.moveRightGetChar() != ':' \|\| p.moveRightGetChar() != ']' {
1523	p.textto(savePos)
1524	} else if p.useRE2() {
1525	// move on
1526	continue
1527	}
1528	}
1529	}
1530
1531	if inRange {
1532	inRange = false
1533	if !scanOnly {
1534	if ch == '[' && !fTranslatedChar && !firstChar {
1535	// We thought we were in a range, but we're actually starting a subtraction.
1536	// In that case, we'll add chPrev to our char class, skip the opening [, and
1537	// scan the new character class recursively.
1538	cc.addChar(chPrev)
1539	sub, err := p.scanCharSet(caseInsensitive, false)
1540	if err != nil {
1541	return nil, err
1542	}
1543	cc.addSubtraction(sub)
1544
1545	if p.charsRight() > 0 && p.rightChar(0) != ']' {
1546	return nil, p.getErr(ErrSubtractionMustBeLast)
1547	}
1548	} else {
1549	// a regular range, like a-z
1550	if chPrev > ch {
1551	return nil, p.getErr(ErrReversedCharRange)
1552	}
1553	cc.addRange(chPrev, ch)
1554	}
1555	}
1556	} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1557	// this could be the start of a range
1558	chPrev = ch
1559	inRange = true
1560	p.moveRight(1)
1561	} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1562	// we aren't in a range, and now there is a subtraction. Usually this happens
1563	// only when a subtraction follows a range, like [a-z-[b]]
1564	if !scanOnly {
1565	p.moveRight(1)
1566	sub, err := p.scanCharSet(caseInsensitive, false)
1567	if err != nil {
1568	return nil, err
1569	}
1570	cc.addSubtraction(sub)
1571
1572	if p.charsRight() > 0 && p.rightChar(0) != ']' {
1573	return nil, p.getErr(ErrSubtractionMustBeLast)
1574	}
1575	} else {
1576	p.moveRight(1)
1577	p.scanCharSet(caseInsensitive, true)
1578	}
1579	} else {
1580	if !scanOnly {
1581	cc.addRange(ch, ch)
1582	}
1583	}
1584	}
1585
1586	if !closed {
1587	return nil, p.getErr(ErrUnterminatedBracket)
1588	}
1589
1590	if !scanOnly && caseInsensitive {
1591	cc.addLowercase()
1592	}
1593
1594	return cc, nil
1595	}
1596
1597	// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1598	func (p *parser) scanDecimal() (int, error) {
1599	i := 0
1600	var d int
1601
1602	for p.charsRight() > 0 {
1603	d = int(p.rightChar(0) - '0')
1604	if d < 0 \|\| d > 9 {
1605	break
1606	}
1607	p.moveRight(1)
1608
1609	if i > maxValueDiv10 \|\| (i == maxValueDiv10 && d > maxValueMod10) {
1610	return 0, p.getErr(ErrCaptureGroupOutOfRange)
1611	}
1612
1613	i *= 10
1614	i += d
1615	}
1616
1617	return int(i), nil
1618	}
1619
1620	// Returns true for options allowed only at the top level
1621	func isOnlyTopOption(option RegexOptions) bool {
1622	return option == RightToLeft \|\| option == ECMAScript \|\| option == RE2
1623	}
1624
1625	// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1626	func (p *parser) scanOptions() {
1627
1628	for off := false; p.charsRight() > 0; p.moveRight(1) {
1629	ch := p.rightChar(0)
1630
1631	if ch == '-' {
1632	off = true
1633	} else if ch == '+' {
1634	off = false
1635	} else {
1636	option := optionFromCode(ch)
1637	if option == 0 \|\| isOnlyTopOption(option) {
1638	return
1639	}
1640
1641	if off {
1642	p.options &= ^option
1643	} else {
1644	p.options \|= option
1645	}
1646	}
1647	}
1648	}
1649
1650	// Scans \ code for escape codes that map to single unicode chars.
1651	func (p *parser) scanCharEscape() (r rune, err error) {
1652
1653	ch := p.moveRightGetChar()
1654
1655	if ch >= '0' && ch <= '7' {
1656	p.moveLeft()
1657	return p.scanOctal(), nil
1658	}
1659
1660	pos := p.textpos()
1661
1662	switch ch {
1663	case 'x':
1664	// support for \x{HEX} syntax from Perl and PCRE
1665	if p.charsRight() > 0 && p.rightChar(0) == '{' {
1666	if p.useOptionE() {
1667	return ch, nil
1668	}
1669	p.moveRight(1)
1670	return p.scanHexUntilBrace()
1671	} else {
1672	r, err = p.scanHex(2)
1673	}
1674	case 'u':
1675	r, err = p.scanHex(4)
1676	case 'a':
1677	return '\u0007', nil
1678	case 'b':
1679	return '\b', nil
1680	case 'e':
1681	return '\u001B', nil
1682	case 'f':
1683	return '\f', nil
1684	case 'n':
1685	return '\n', nil
1686	case 'r':
1687	return '\r', nil
1688	case 't':
1689	return '\t', nil
1690	case 'v':
1691	return '\u000B', nil
1692	case 'c':
1693	r, err = p.scanControl()
1694	default:
1695	if !p.useOptionE() && IsWordChar(ch) {
1696	return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1697	}
1698	return ch, nil
1699	}
1700	if err != nil && p.useOptionE() {
1701	p.textto(pos)
1702	return ch, nil
1703	}
1704	return
1705	}
1706
1707	// Grabs and converts an ascii control character
1708	func (p *parser) scanControl() (rune, error) {
1709	if p.charsRight() <= 0 {
1710	return 0, p.getErr(ErrMissingControl)
1711	}
1712
1713	ch := p.moveRightGetChar()
1714
1715	// \ca interpreted as \cA
1716
1717	if ch >= 'a' && ch <= 'z' {
1718	ch = (ch - ('a' - 'A'))
1719	}
1720	ch = (ch - '@')
1721	if ch >= 0 && ch < ' ' {
1722	return ch, nil
1723	}
1724
1725	return 0, p.getErr(ErrUnrecognizedControl)
1726
1727	}
1728
1729	// Scan hex digits until we hit a closing brace.
1730	// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1731	func (p *parser) scanHexUntilBrace() (rune, error) {
1732	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1733	// so we can enforce that
1734	i := 0
1735	hasContent := false
1736
1737	for p.charsRight() > 0 {
1738	ch := p.moveRightGetChar()
1739	if ch == '}' {
1740	// hit our close brace, we're done here
1741	// prevent \x{}
1742	if !hasContent {
1743	return 0, p.getErr(ErrTooFewHex)
1744	}
1745	return rune(i), nil
1746	}
1747	hasContent = true
1748	// no brace needs to be hex digit
1749	d := hexDigit(ch)
1750	if d < 0 {
1751	return 0, p.getErr(ErrMissingBrace)
1752	}
1753
1754	i *= 0x10
1755	i += d
1756
1757	if i > unicode.MaxRune {
1758	return 0, p.getErr(ErrInvalidHex)
1759	}
1760	}
1761
1762	// we only make it here if we run out of digits without finding the brace
1763	return 0, p.getErr(ErrMissingBrace)
1764	}
1765
1766	// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1767	func (p *parser) scanHex(c int) (rune, error) {
1768
1769	i := 0
1770
1771	if p.charsRight() >= c {
1772	for c > 0 {
1773	d := hexDigit(p.moveRightGetChar())
1774	if d < 0 {
1775	break
1776	}
1777	i *= 0x10
1778	i += d
1779	c--
1780	}
1781	}
1782
1783	if c > 0 {
1784	return 0, p.getErr(ErrTooFewHex)
1785	}
1786
1787	return rune(i), nil
1788	}
1789
1790	// Returns n <= 0xF for a hex digit.
1791	func hexDigit(ch rune) int {
1792
1793	if d := uint(ch - '0'); d <= 9 {
1794	return int(d)
1795	}
1796
1797	if d := uint(ch - 'a'); d <= 5 {
1798	return int(d + 0xa)
1799	}
1800
1801	if d := uint(ch - 'A'); d <= 5 {
1802	return int(d + 0xa)
1803	}
1804
1805	return -1
1806	}
1807
1808	// Scans up to three octal digits (stops before exceeding 0377).
1809	func (p *parser) scanOctal() rune {
1810	// Consume octal chars only up to 3 digits and value 0377
1811
1812	c := 3
1813
1814	if c > p.charsRight() {
1815	c = p.charsRight()
1816	}
1817
1818	//we know the first char is good because the caller had to check
1819	i := 0
1820	d := int(p.rightChar(0) - '0')
1821	for c > 0 && d <= 7 && d >= 0 {
1822	if i >= 0x20 && p.useOptionE() {
1823	break
1824	}
1825	i *= 8
1826	i += d
1827	c--
1828
1829	p.moveRight(1)
1830	if !p.rightMost() {
1831	d = int(p.rightChar(0) - '0')
1832	}
1833	}
1834
1835	// Octal codes only go up to 255. Any larger and the behavior that Perl follows
1836	// is simply to truncate the high bits.
1837	i &= 0xFF
1838
1839	return rune(i)
1840	}
1841
1842	// Returns the current parsing position.
1843	func (p *parser) textpos() int {
1844	return p.currentPos
1845	}
1846
1847	// Zaps to a specific parsing position.
1848	func (p *parser) textto(pos int) {
1849	p.currentPos = pos
1850	}
1851
1852	// Returns the char at the right of the current parsing position and advances to the right.
1853	func (p *parser) moveRightGetChar() rune {
1854	ch := p.pattern[p.currentPos]
1855	p.currentPos++
1856	return ch
1857	}
1858
1859	// Moves the current position to the right.
1860	func (p *parser) moveRight(i int) {
1861	// default would be 1
1862	p.currentPos += i
1863	}
1864
1865	// Moves the current parsing position one to the left.
1866	func (p *parser) moveLeft() {
1867	p.currentPos--
1868	}
1869
1870	// Returns the char left of the current parsing position.
1871	func (p *parser) charAt(i int) rune {
1872	return p.pattern[i]
1873	}
1874
1875	// Returns the char i chars right of the current parsing position.
1876	func (p *parser) rightChar(i int) rune {
1877	// default would be 0
1878	return p.pattern[p.currentPos+i]
1879	}
1880
1881	// Number of characters to the right of the current parsing position.
1882	func (p *parser) charsRight() int {
1883	return len(p.pattern) - p.currentPos
1884	}
1885
1886	func (p *parser) rightMost() bool {
1887	return p.currentPos == len(p.pattern)
1888	}
1889
1890	// Looks up the slot number for a given name
1891	func (p *parser) captureSlotFromName(capname string) int {
1892	return p.capnames[capname]
1893	}
1894
1895	// True if the capture slot was noted
1896	func (p *parser) isCaptureSlot(i int) bool {
1897	if p.caps != nil {
1898	_, ok := p.caps[i]
1899	return ok
1900	}
1901
1902	return (i >= 0 && i < p.capsize)
1903	}
1904
1905	// Looks up the slot number for a given name
1906	func (p *parser) isCaptureName(capname string) bool {
1907	if p.capnames == nil {
1908	return false
1909	}
1910
1911	_, ok := p.capnames[capname]
1912	return ok
1913	}
1914
1915	// option shortcuts
1916
1917	// True if N option disabling '(' autocapture is on.
1918	func (p *parser) useOptionN() bool {
1919	return (p.options & ExplicitCapture) != 0
1920	}
1921
1922	// True if I option enabling case-insensitivity is on.
1923	func (p *parser) useOptionI() bool {
1924	return (p.options & IgnoreCase) != 0
1925	}
1926
1927	// True if M option altering meaning of $ and ^ is on.
1928	func (p *parser) useOptionM() bool {
1929	return (p.options & Multiline) != 0
1930	}
1931
1932	// True if S option altering meaning of . is on.
1933	func (p *parser) useOptionS() bool {
1934	return (p.options & Singleline) != 0
1935	}
1936
1937	// True if X option enabling whitespace/comment mode is on.
1938	func (p *parser) useOptionX() bool {
1939	return (p.options & IgnorePatternWhitespace) != 0
1940	}
1941
1942	// True if E option enabling ECMAScript behavior on.
1943	func (p *parser) useOptionE() bool {
1944	return (p.options & ECMAScript) != 0
1945	}
1946
1947	// true to use RE2 compatibility parsing behavior.
1948	func (p *parser) useRE2() bool {
1949	return (p.options & RE2) != 0
1950	}
1951
1952	// True if options stack is empty.
1953	func (p *parser) emptyOptionsStack() bool {
1954	return len(p.optionsStack) == 0
1955	}
1956
1957	// Finish the current quantifiable (when a quantifier is not found or is not possible)
1958	func (p *parser) addConcatenate() {
1959	// The first (\| inside a Testgroup group goes directly to the group
1960	p.concatenation.addChild(p.unit)
1961	p.unit = nil
1962	}
1963
1964	// Finish the current quantifiable (when a quantifier is found)
1965	func (p *parser) addConcatenate3(lazy bool, min, max int) {
1966	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
1967	p.unit = nil
1968	}
1969
1970	// Sets the current unit to a single char node
1971	func (p *parser) addUnitOne(ch rune) {
1972	if p.useOptionI() {
1973	ch = unicode.ToLower(ch)
1974	}
1975
1976	p.unit = newRegexNodeCh(ntOne, p.options, ch)
1977	}
1978
1979	// Sets the current unit to a single inverse-char node
1980	func (p *parser) addUnitNotone(ch rune) {
1981	if p.useOptionI() {
1982	ch = unicode.ToLower(ch)
1983	}
1984
1985	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
1986	}
1987
1988	// Sets the current unit to a single set node
1989	func (p parser) addUnitSet(set CharSet) {
1990	p.unit = newRegexNodeSet(ntSet, p.options, set)
1991	}
1992
1993	// Sets the current unit to a subtree
1994	func (p parser) addUnitNode(node regexNode) {
1995	p.unit = node
1996	}
1997
1998	// Sets the current unit to an assertion of the specified type
1999	func (p *parser) addUnitType(t nodeType) {
2000	p.unit = newRegexNode(t, p.options)
2001	}
2002
2003	// Finish the current group (in response to a ')' or end)
2004	func (p *parser) addGroup() error {
2005	if p.group.t == ntTestgroup \|\| p.group.t == ntTestref {
2006	p.group.addChild(p.concatenation.reverseLeft())
2007	if (p.group.t == ntTestref && len(p.group.children) > 2) \|\| len(p.group.children) > 3 {
2008	return p.getErr(ErrTooManyAlternates)
2009	}
2010	} else {
2011	p.alternation.addChild(p.concatenation.reverseLeft())
2012	p.group.addChild(p.alternation)
2013	}
2014
2015	p.unit = p.group
2016	return nil
2017	}
2018
2019	// Pops the option stack, but keeps the current options unchanged.
2020	func (p *parser) popKeepOptions() {
2021	lastIdx := len(p.optionsStack) - 1
2022	p.optionsStack = p.optionsStack[:lastIdx]
2023	}
2024
2025	// Recalls options from the stack.
2026	func (p *parser) popOptions() {
2027	lastIdx := len(p.optionsStack) - 1
2028	// get the last item on the stack and then remove it by reslicing
2029	p.options = p.optionsStack[lastIdx]
2030	p.optionsStack = p.optionsStack[:lastIdx]
2031	}
2032
2033	// Saves options on a stack.
2034	func (p *parser) pushOptions() {
2035	p.optionsStack = append(p.optionsStack, p.options)
2036	}
2037
2038	// Add a string to the last concatenate.
2039	func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2040	var node *regexNode
2041
2042	if cch == 0 {
2043	return
2044	}
2045
2046	if cch > 1 {
2047	str := p.pattern[pos : pos+cch]
2048
2049	if p.useOptionI() && !isReplacement {
2050	// We do the ToLower character by character for consistency. With surrogate chars, doing
2051	// a ToLower on the entire string could actually change the surrogate pair. This is more correct
2052	// linguistically, but since Regex doesn't support surrogates, it's more important to be
2053	// consistent.
2054	for i := 0; i < len(str); i++ {
2055	str[i] = unicode.ToLower(str[i])
2056	}
2057	}
2058
2059	node = newRegexNodeStr(ntMulti, p.options, str)
2060	} else {
2061	ch := p.charAt(pos)
2062
2063	if p.useOptionI() && !isReplacement {
2064	ch = unicode.ToLower(ch)
2065	}
2066
2067	node = newRegexNodeCh(ntOne, p.options, ch)
2068	}
2069
2070	p.concatenation.addChild(node)
2071	}
2072
2073	// Push the parser state (in response to an open paren)
2074	func (p *parser) pushGroup() {
2075	p.group.next = p.stack
2076	p.alternation.next = p.group
2077	p.concatenation.next = p.alternation
2078	p.stack = p.concatenation
2079	}
2080
2081	// Remember the pushed state (in response to a ')')
2082	func (p *parser) popGroup() error {
2083	p.concatenation = p.stack
2084	p.alternation = p.concatenation.next
2085	p.group = p.alternation.next
2086	p.stack = p.group.next
2087
2088	// The first () inside a Testgroup group goes directly to the group
2089	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2090	if p.unit == nil {
2091	return p.getErr(ErrConditionalExpression)
2092	}
2093
2094	p.group.addChild(p.unit)
2095	p.unit = nil
2096	}
2097	return nil
2098	}
2099
2100	// True if the group stack is empty.
2101	func (p *parser) emptyStack() bool {
2102	return p.stack == nil
2103	}
2104
2105	// Start a new round for the parser state (in response to an open paren or string start)
2106	func (p parser) startGroup(openGroup regexNode) {
2107	p.group = openGroup
2108	p.alternation = newRegexNode(ntAlternate, p.options)
2109	p.concatenation = newRegexNode(ntConcatenate, p.options)
2110	}
2111
2112	// Finish the current concatenation (in response to a \|)
2113	func (p *parser) addAlternate() {
2114	// The \| parts inside a Testgroup group go directly to the group
2115
2116	if p.group.t == ntTestgroup \|\| p.group.t == ntTestref {
2117	p.group.addChild(p.concatenation.reverseLeft())
2118	} else {
2119	p.alternation.addChild(p.concatenation.reverseLeft())
2120	}
2121
2122	p.concatenation = newRegexNode(ntConcatenate, p.options)
2123	}
2124
2125	// For categorizing ascii characters.
2126
2127	const (
2128	Q byte = 5 // quantifier
2129	S = 4 // ordinary stopper
2130	Z = 3 // ScanBlank stopper
2131	X = 2 // whitespace
2132	E = 1 // should be escaped
2133	)
2134
2135	var _category = []byte{
2136	//01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2137	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2138	// ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2139	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2140	//@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2141	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2142	//'a b c d e f g h i j k l m n o p q r s t u v w x y z { \| } ~
2143	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2144	}
2145
2146	func isSpace(ch rune) bool {
2147	return (ch <= ' ' && _category[ch] == X)
2148	}
2149
2150	// Returns true for those characters that terminate a string of ordinary chars.
2151	func isSpecial(ch rune) bool {
2152	return (ch <= '\|' && _category[ch] >= S)
2153	}
2154
2155	// Returns true for those characters that terminate a string of ordinary chars.
2156	func isStopperX(ch rune) bool {
2157	return (ch <= '\|' && _category[ch] >= X)
2158	}
2159
2160	// Returns true for those characters that begin a quantifier.
2161	func isQuantifier(ch rune) bool {
2162	return (ch <= '{' && _category[ch] >= Q)
2163	}
2164
2165	func (p *parser) isTrueQuantifier() bool {
2166	nChars := p.charsRight()
2167	if nChars == 0 {
2168	return false
2169	}
2170
2171	startpos := p.textpos()
2172	ch := p.charAt(startpos)
2173	if ch != '{' {
2174	return ch <= '{' && _category[ch] >= Q
2175	}
2176
2177	//UGLY: this is ugly -- the original code was ugly too
2178	pos := startpos
2179	for {
2180	nChars--
2181	if nChars <= 0 {
2182	break
2183	}
2184	pos++
2185	ch = p.charAt(pos)
2186	if ch < '0' \|\| ch > '9' {
2187	break
2188	}
2189	}
2190
2191	if nChars == 0 \|\| pos-startpos == 1 {
2192	return false
2193	}
2194	if ch == '}' {
2195	return true
2196	}
2197	if ch != ',' {
2198	return false
2199	}
2200	for {
2201	nChars--
2202	if nChars <= 0 {
2203	break
2204	}
2205	pos++
2206	ch = p.charAt(pos)
2207	if ch < '0' \|\| ch > '9' {
2208	break
2209	}
2210	}
2211
2212	return nChars > 0 && ch == '}'
2213	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: