source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/parser.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 49.1 KB
Line 
1package syntax
2
3import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10)
11
12type RegexOptions int32
13
14const (
15 IgnoreCase RegexOptions = 0x0001 // "i"
16 Multiline = 0x0002 // "m"
17 ExplicitCapture = 0x0004 // "n"
18 Compiled = 0x0008 // "c"
19 Singleline = 0x0010 // "s"
20 IgnorePatternWhitespace = 0x0020 // "x"
21 RightToLeft = 0x0040 // "r"
22 Debug = 0x0080 // "d"
23 ECMAScript = 0x0100 // "e"
24 RE2 = 0x0200 // RE2 compat mode
25)
26
27func optionFromCode(ch rune) RegexOptions {
28 // case-insensitive
29 switch ch {
30 case 'i', 'I':
31 return IgnoreCase
32 case 'r', 'R':
33 return RightToLeft
34 case 'm', 'M':
35 return Multiline
36 case 'n', 'N':
37 return ExplicitCapture
38 case 's', 'S':
39 return Singleline
40 case 'x', 'X':
41 return IgnorePatternWhitespace
42 case 'd', 'D':
43 return Debug
44 case 'e', 'E':
45 return ECMAScript
46 default:
47 return 0
48 }
49}
50
51// An Error describes a failure to parse a regular expression
52// and gives the offending expression.
53type Error struct {
54 Code ErrorCode
55 Expr string
56 Args []interface{}
57}
58
59func (e *Error) Error() string {
60 if len(e.Args) == 0 {
61 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
62 }
63 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
64}
65
66// An ErrorCode describes a failure to parse a regular expression.
67type ErrorCode string
68
69const (
70 // internal issue
71 ErrInternalError ErrorCode = "regexp/syntax: internal error"
72 // Parser errors
73 ErrUnterminatedComment = "unterminated comment"
74 ErrInvalidCharRange = "invalid character class range"
75 ErrInvalidRepeatSize = "invalid repeat count"
76 ErrInvalidUTF8 = "invalid UTF-8"
77 ErrCaptureGroupOutOfRange = "capture group number out of range"
78 ErrUnexpectedParen = "unexpected )"
79 ErrMissingParen = "missing closing )"
80 ErrMissingBrace = "missing closing }"
81 ErrInvalidRepeatOp = "invalid nested repetition operator"
82 ErrMissingRepeatArgument = "missing argument to repetition operator"
83 ErrConditionalExpression = "illegal conditional (?(...)) expression"
84 ErrTooManyAlternates = "too many | in (?()|)"
85 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
86 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
87 ErrCapNumNotZero = "capture number cannot be zero"
88 ErrUndefinedBackRef = "reference to undefined group number %v"
89 ErrUndefinedNameRef = "reference to undefined group name %v"
90 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
91 ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
92 ErrMalformedReference = "(?(%v) ) malformed"
93 ErrUndefinedReference = "(?(%v) ) reference to undefined group"
94 ErrIllegalEndEscape = "illegal \\ at end of pattern"
95 ErrMalformedSlashP = "malformed \\p{X} character escape"
96 ErrIncompleteSlashP = "incomplete \\p{X} character escape"
97 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
98 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
99 ErrMissingControl = "missing control character"
100 ErrUnrecognizedControl = "unrecognized control character"
101 ErrTooFewHex = "insufficient hexadecimal digits"
102 ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
103 ErrMalformedNameRef = "malformed \\k<...> named back reference"
104 ErrBadClassInCharRange = "cannot include class \\%v in character range"
105 ErrUnterminatedBracket = "unterminated [] set"
106 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
107 ErrReversedCharRange = "[x-y] range in reverse order"
108)
109
110func (e ErrorCode) String() string {
111 return string(e)
112}
113
114type parser struct {
115 stack *regexNode
116 group *regexNode
117 alternation *regexNode
118 concatenation *regexNode
119 unit *regexNode
120
121 patternRaw string
122 pattern []rune
123
124 currentPos int
125 specialCase *unicode.SpecialCase
126
127 autocap int
128 capcount int
129 captop int
130 capsize int
131
132 caps map[int]int
133 capnames map[string]int
134
135 capnumlist []int
136 capnamelist []string
137
138 options RegexOptions
139 optionsStack []RegexOptions
140 ignoreNextParen bool
141}
142
143const (
144 maxValueDiv10 int = math.MaxInt32 / 10
145 maxValueMod10 = math.MaxInt32 % 10
146)
147
148// Parse converts a regex string into a parse tree
149func Parse(re string, op RegexOptions) (*RegexTree, error) {
150 p := parser{
151 options: op,
152 caps: make(map[int]int),
153 }
154 p.setPattern(re)
155
156 if err := p.countCaptures(); err != nil {
157 return nil, err
158 }
159
160 p.reset(op)
161 root, err := p.scanRegex()
162
163 if err != nil {
164 return nil, err
165 }
166 tree := &RegexTree{
167 root: root,
168 caps: p.caps,
169 capnumlist: p.capnumlist,
170 captop: p.captop,
171 Capnames: p.capnames,
172 Caplist: p.capnamelist,
173 options: op,
174 }
175
176 if tree.options&Debug > 0 {
177 os.Stdout.WriteString(tree.Dump())
178 }
179
180 return tree, nil
181}
182
183func (p *parser) setPattern(pattern string) {
184 p.patternRaw = pattern
185 p.pattern = make([]rune, 0, len(pattern))
186
187 //populate our rune array to handle utf8 encoding
188 for _, r := range pattern {
189 p.pattern = append(p.pattern, r)
190 }
191}
192func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
193 return &Error{Code: code, Expr: p.patternRaw, Args: args}
194}
195
196func (p *parser) noteCaptureSlot(i, pos int) {
197 if _, ok := p.caps[i]; !ok {
198 // the rhs of the hashtable isn't used in the parser
199 p.caps[i] = pos
200 p.capcount++
201
202 if p.captop <= i {
203 if i == math.MaxInt32 {
204 p.captop = i
205 } else {
206 p.captop = i + 1
207 }
208 }
209 }
210}
211
212func (p *parser) noteCaptureName(name string, pos int) {
213 if p.capnames == nil {
214 p.capnames = make(map[string]int)
215 }
216
217 if _, ok := p.capnames[name]; !ok {
218 p.capnames[name] = pos
219 p.capnamelist = append(p.capnamelist, name)
220 }
221}
222
223func (p *parser) assignNameSlots() {
224 if p.capnames != nil {
225 for _, name := range p.capnamelist {
226 for p.isCaptureSlot(p.autocap) {
227 p.autocap++
228 }
229 pos := p.capnames[name]
230 p.capnames[name] = p.autocap
231 p.noteCaptureSlot(p.autocap, pos)
232
233 p.autocap++
234 }
235 }
236
237 // if the caps array has at least one gap, construct the list of used slots
238 if p.capcount < p.captop {
239 p.capnumlist = make([]int, p.capcount)
240 i := 0
241
242 for k := range p.caps {
243 p.capnumlist[i] = k
244 i++
245 }
246
247 sort.Ints(p.capnumlist)
248 }
249
250 // merge capsnumlist into capnamelist
251 if p.capnames != nil || p.capnumlist != nil {
252 var oldcapnamelist []string
253 var next int
254 var k int
255
256 if p.capnames == nil {
257 oldcapnamelist = nil
258 p.capnames = make(map[string]int)
259 p.capnamelist = []string{}
260 next = -1
261 } else {
262 oldcapnamelist = p.capnamelist
263 p.capnamelist = []string{}
264 next = p.capnames[oldcapnamelist[0]]
265 }
266
267 for i := 0; i < p.capcount; i++ {
268 j := i
269 if p.capnumlist != nil {
270 j = p.capnumlist[i]
271 }
272
273 if next == j {
274 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
275 k++
276
277 if k == len(oldcapnamelist) {
278 next = -1
279 } else {
280 next = p.capnames[oldcapnamelist[k]]
281 }
282
283 } else {
284 //feature: culture?
285 str := strconv.Itoa(j)
286 p.capnamelist = append(p.capnamelist, str)
287 p.capnames[str] = j
288 }
289 }
290 }
291}
292
293func (p *parser) consumeAutocap() int {
294 r := p.autocap
295 p.autocap++
296 return r
297}
298
299// CountCaptures is a prescanner for deducing the slots used for
300// captures by doing a partial tokenization of the pattern.
301func (p *parser) countCaptures() error {
302 var ch rune
303
304 p.noteCaptureSlot(0, 0)
305
306 p.autocap = 1
307
308 for p.charsRight() > 0 {
309 pos := p.textpos()
310 ch = p.moveRightGetChar()
311 switch ch {
312 case '\\':
313 if p.charsRight() > 0 {
314 p.scanBackslash(true)
315 }
316
317 case '#':
318 if p.useOptionX() {
319 p.moveLeft()
320 p.scanBlank()
321 }
322
323 case '[':
324 p.scanCharSet(false, true)
325
326 case ')':
327 if !p.emptyOptionsStack() {
328 p.popOptions()
329 }
330
331 case '(':
332 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
333 p.moveLeft()
334 p.scanBlank()
335 } else {
336 p.pushOptions()
337 if p.charsRight() > 0 && p.rightChar(0) == '?' {
338 // we have (?...
339 p.moveRight(1)
340
341 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
342 // named group: (?<... or (?'...
343
344 p.moveRight(1)
345 ch = p.rightChar(0)
346
347 if ch != '0' && IsWordChar(ch) {
348 if ch >= '1' && ch <= '9' {
349 dec, err := p.scanDecimal()
350 if err != nil {
351 return err
352 }
353 p.noteCaptureSlot(dec, pos)
354 } else {
355 p.noteCaptureName(p.scanCapname(), pos)
356 }
357 }
358 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
359 // RE2-compat (?P<)
360 p.moveRight(2)
361 ch = p.rightChar(0)
362 if IsWordChar(ch) {
363 p.noteCaptureName(p.scanCapname(), pos)
364 }
365
366 } else {
367 // (?...
368
369 // get the options if it's an option construct (?cimsx-cimsx...)
370 p.scanOptions()
371
372 if p.charsRight() > 0 {
373 if p.rightChar(0) == ')' {
374 // (?cimsx-cimsx)
375 p.moveRight(1)
376 p.popKeepOptions()
377 } else if p.rightChar(0) == '(' {
378 // alternation construct: (?(foo)yes|no)
379 // ignore the next paren so we don't capture the condition
380 p.ignoreNextParen = true
381
382 // break from here so we don't reset ignoreNextParen
383 continue
384 }
385 }
386 }
387 } else {
388 if !p.useOptionN() && !p.ignoreNextParen {
389 p.noteCaptureSlot(p.consumeAutocap(), pos)
390 }
391 }
392 }
393
394 p.ignoreNextParen = false
395
396 }
397 }
398
399 p.assignNameSlots()
400 return nil
401}
402
403func (p *parser) reset(topopts RegexOptions) {
404 p.currentPos = 0
405 p.autocap = 1
406 p.ignoreNextParen = false
407
408 if len(p.optionsStack) > 0 {
409 p.optionsStack = p.optionsStack[:0]
410 }
411
412 p.options = topopts
413 p.stack = nil
414}
415
416func (p *parser) scanRegex() (*regexNode, error) {
417 ch := '@' // nonspecial ch, means at beginning
418 isQuant := false
419
420 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
421
422 for p.charsRight() > 0 {
423 wasPrevQuantifier := isQuant
424 isQuant = false
425
426 if err := p.scanBlank(); err != nil {
427 return nil, err
428 }
429
430 startpos := p.textpos()
431
432 // move past all of the normal characters. We'll stop when we hit some kind of control character,
433 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
434 if p.useOptionX() {
435 for p.charsRight() > 0 {
436 ch = p.rightChar(0)
437 //UGLY: clean up, this is ugly
438 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
439 break
440 }
441 p.moveRight(1)
442 }
443 } else {
444 for p.charsRight() > 0 {
445 ch = p.rightChar(0)
446 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
447 break
448 }
449 p.moveRight(1)
450 }
451 }
452
453 endpos := p.textpos()
454
455 p.scanBlank()
456
457 if p.charsRight() == 0 {
458 ch = '!' // nonspecial, means at end
459 } else if ch = p.rightChar(0); isSpecial(ch) {
460 isQuant = isQuantifier(ch)
461 p.moveRight(1)
462 } else {
463 ch = ' ' // nonspecial, means at ordinary char
464 }
465
466 if startpos < endpos {
467 cchUnquantified := endpos - startpos
468 if isQuant {
469 cchUnquantified--
470 }
471 wasPrevQuantifier = false
472
473 if cchUnquantified > 0 {
474 p.addToConcatenate(startpos, cchUnquantified, false)
475 }
476
477 if isQuant {
478 p.addUnitOne(p.charAt(endpos - 1))
479 }
480 }
481
482 switch ch {
483 case '!':
484 goto BreakOuterScan
485
486 case ' ':
487 goto ContinueOuterScan
488
489 case '[':
490 cc, err := p.scanCharSet(p.useOptionI(), false)
491 if err != nil {
492 return nil, err
493 }
494 p.addUnitSet(cc)
495
496 case '(':
497 p.pushOptions()
498
499 if grouper, err := p.scanGroupOpen(); err != nil {
500 return nil, err
501 } else if grouper == nil {
502 p.popKeepOptions()
503 } else {
504 p.pushGroup()
505 p.startGroup(grouper)
506 }
507
508 continue
509
510 case '|':
511 p.addAlternate()
512 goto ContinueOuterScan
513
514 case ')':
515 if p.emptyStack() {
516 return nil, p.getErr(ErrUnexpectedParen)
517 }
518
519 if err := p.addGroup(); err != nil {
520 return nil, err
521 }
522 if err := p.popGroup(); err != nil {
523 return nil, err
524 }
525 p.popOptions()
526
527 if p.unit == nil {
528 goto ContinueOuterScan
529 }
530
531 case '\\':
532 n, err := p.scanBackslash(false)
533 if err != nil {
534 return nil, err
535 }
536 p.addUnitNode(n)
537
538 case '^':
539 if p.useOptionM() {
540 p.addUnitType(ntBol)
541 } else {
542 p.addUnitType(ntBeginning)
543 }
544
545 case '$':
546 if p.useOptionM() {
547 p.addUnitType(ntEol)
548 } else {
549 p.addUnitType(ntEndZ)
550 }
551
552 case '.':
553 if p.useOptionE() {
554 p.addUnitSet(ECMAAnyClass())
555 } else if p.useOptionS() {
556 p.addUnitSet(AnyClass())
557 } else {
558 p.addUnitNotone('\n')
559 }
560
561 case '{', '*', '+', '?':
562 if p.unit == nil {
563 if wasPrevQuantifier {
564 return nil, p.getErr(ErrInvalidRepeatOp)
565 } else {
566 return nil, p.getErr(ErrMissingRepeatArgument)
567 }
568 }
569 p.moveLeft()
570
571 default:
572 return nil, p.getErr(ErrInternalError)
573 }
574
575 if err := p.scanBlank(); err != nil {
576 return nil, err
577 }
578
579 if p.charsRight() > 0 {
580 isQuant = p.isTrueQuantifier()
581 }
582 if p.charsRight() == 0 || !isQuant {
583 //maintain odd C# assignment order -- not sure if required, could clean up?
584 p.addConcatenate()
585 goto ContinueOuterScan
586 }
587
588 ch = p.moveRightGetChar()
589
590 // Handle quantifiers
591 for p.unit != nil {
592 var min, max int
593 var lazy bool
594
595 switch ch {
596 case '*':
597 min = 0
598 max = math.MaxInt32
599
600 case '?':
601 min = 0
602 max = 1
603
604 case '+':
605 min = 1
606 max = math.MaxInt32
607
608 case '{':
609 {
610 var err error
611 startpos = p.textpos()
612 if min, err = p.scanDecimal(); err != nil {
613 return nil, err
614 }
615 max = min
616 if startpos < p.textpos() {
617 if p.charsRight() > 0 && p.rightChar(0) == ',' {
618 p.moveRight(1)
619 if p.charsRight() == 0 || p.rightChar(0) == '}' {
620 max = math.MaxInt32
621 } else {
622 if max, err = p.scanDecimal(); err != nil {
623 return nil, err
624 }
625 }
626 }
627 }
628
629 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
630 p.addConcatenate()
631 p.textto(startpos - 1)
632 goto ContinueOuterScan
633 }
634 }
635
636 default:
637 return nil, p.getErr(ErrInternalError)
638 }
639
640 if err := p.scanBlank(); err != nil {
641 return nil, err
642 }
643
644 if p.charsRight() == 0 || p.rightChar(0) != '?' {
645 lazy = false
646 } else {
647 p.moveRight(1)
648 lazy = true
649 }
650
651 if min > max {
652 return nil, p.getErr(ErrInvalidRepeatSize)
653 }
654
655 p.addConcatenate3(lazy, min, max)
656 }
657
658 ContinueOuterScan:
659 }
660
661BreakOuterScan:
662 ;
663
664 if !p.emptyStack() {
665 return nil, p.getErr(ErrMissingParen)
666 }
667
668 if err := p.addGroup(); err != nil {
669 return nil, err
670 }
671
672 return p.unit, nil
673
674}
675
676/*
677 * Simple parsing for replacement patterns
678 */
679func (p *parser) scanReplacement() (*regexNode, error) {
680 var c, startpos int
681
682 p.concatenation = newRegexNode(ntConcatenate, p.options)
683
684 for {
685 c = p.charsRight()
686 if c == 0 {
687 break
688 }
689
690 startpos = p.textpos()
691
692 for c > 0 && p.rightChar(0) != '$' {
693 p.moveRight(1)
694 c--
695 }
696
697 p.addToConcatenate(startpos, p.textpos()-startpos, true)
698
699 if c > 0 {
700 if p.moveRightGetChar() == '$' {
701 n, err := p.scanDollar()
702 if err != nil {
703 return nil, err
704 }
705 p.addUnitNode(n)
706 }
707 p.addConcatenate()
708 }
709 }
710
711 return p.concatenation, nil
712}
713
714/*
715 * Scans $ patterns recognized within replacement patterns
716 */
717func (p *parser) scanDollar() (*regexNode, error) {
718 if p.charsRight() == 0 {
719 return newRegexNodeCh(ntOne, p.options, '$'), nil
720 }
721
722 ch := p.rightChar(0)
723 angled := false
724 backpos := p.textpos()
725 lastEndPos := backpos
726
727 // Note angle
728
729 if ch == '{' && p.charsRight() > 1 {
730 angled = true
731 p.moveRight(1)
732 ch = p.rightChar(0)
733 }
734
735 // Try to parse backreference: \1 or \{1} or \{cap}
736
737 if ch >= '0' && ch <= '9' {
738 if !angled && p.useOptionE() {
739 capnum := -1
740 newcapnum := int(ch - '0')
741 p.moveRight(1)
742 if p.isCaptureSlot(newcapnum) {
743 capnum = newcapnum
744 lastEndPos = p.textpos()
745 }
746
747 for p.charsRight() > 0 {
748 ch = p.rightChar(0)
749 if ch < '0' || ch > '9' {
750 break
751 }
752 digit := int(ch - '0')
753 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
754 return nil, p.getErr(ErrCaptureGroupOutOfRange)
755 }
756
757 newcapnum = newcapnum*10 + digit
758
759 p.moveRight(1)
760 if p.isCaptureSlot(newcapnum) {
761 capnum = newcapnum
762 lastEndPos = p.textpos()
763 }
764 }
765 p.textto(lastEndPos)
766 if capnum >= 0 {
767 return newRegexNodeM(ntRef, p.options, capnum), nil
768 }
769 } else {
770 capnum, err := p.scanDecimal()
771 if err != nil {
772 return nil, err
773 }
774 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
775 if p.isCaptureSlot(capnum) {
776 return newRegexNodeM(ntRef, p.options, capnum), nil
777 }
778 }
779 }
780 } else if angled && IsWordChar(ch) {
781 capname := p.scanCapname()
782
783 if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
784 if p.isCaptureName(capname) {
785 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
786 }
787 }
788 } else if !angled {
789 capnum := 1
790
791 switch ch {
792 case '$':
793 p.moveRight(1)
794 return newRegexNodeCh(ntOne, p.options, '$'), nil
795 case '&':
796 capnum = 0
797 case '`':
798 capnum = replaceLeftPortion
799 case '\'':
800 capnum = replaceRightPortion
801 case '+':
802 capnum = replaceLastGroup
803 case '_':
804 capnum = replaceWholeString
805 }
806
807 if capnum != 1 {
808 p.moveRight(1)
809 return newRegexNodeM(ntRef, p.options, capnum), nil
810 }
811 }
812
813 // unrecognized $: literalize
814
815 p.textto(backpos)
816 return newRegexNodeCh(ntOne, p.options, '$'), nil
817}
818
819// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
820// a RegexNode for the type of group scanned, or nil if the group
821// simply changed options (?cimsx-cimsx) or was a comment (#...).
822func (p *parser) scanGroupOpen() (*regexNode, error) {
823 var ch rune
824 var nt nodeType
825 var err error
826 close := '>'
827 start := p.textpos()
828
829 // just return a RegexNode if we have:
830 // 1. "(" followed by nothing
831 // 2. "(x" where x != ?
832 // 3. "(?)"
833 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
834 if p.useOptionN() || p.ignoreNextParen {
835 p.ignoreNextParen = false
836 return newRegexNode(ntGroup, p.options), nil
837 }
838 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
839 }
840
841 p.moveRight(1)
842
843 for {
844 if p.charsRight() == 0 {
845 break
846 }
847
848 switch ch = p.moveRightGetChar(); ch {
849 case ':':
850 nt = ntGroup
851
852 case '=':
853 p.options &= ^RightToLeft
854 nt = ntRequire
855
856 case '!':
857 p.options &= ^RightToLeft
858 nt = ntPrevent
859
860 case '>':
861 nt = ntGreedy
862
863 case '\'':
864 close = '\''
865 fallthrough
866
867 case '<':
868 if p.charsRight() == 0 {
869 goto BreakRecognize
870 }
871
872 switch ch = p.moveRightGetChar(); ch {
873 case '=':
874 if close == '\'' {
875 goto BreakRecognize
876 }
877
878 p.options |= RightToLeft
879 nt = ntRequire
880
881 case '!':
882 if close == '\'' {
883 goto BreakRecognize
884 }
885
886 p.options |= RightToLeft
887 nt = ntPrevent
888
889 default:
890 p.moveLeft()
891 capnum := -1
892 uncapnum := -1
893 proceed := false
894
895 // grab part before -
896
897 if ch >= '0' && ch <= '9' {
898 if capnum, err = p.scanDecimal(); err != nil {
899 return nil, err
900 }
901
902 if !p.isCaptureSlot(capnum) {
903 capnum = -1
904 }
905
906 // check if we have bogus characters after the number
907 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
908 return nil, p.getErr(ErrInvalidGroupName)
909 }
910 if capnum == 0 {
911 return nil, p.getErr(ErrCapNumNotZero)
912 }
913 } else if IsWordChar(ch) {
914 capname := p.scanCapname()
915
916 if p.isCaptureName(capname) {
917 capnum = p.captureSlotFromName(capname)
918 }
919
920 // check if we have bogus character after the name
921 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
922 return nil, p.getErr(ErrInvalidGroupName)
923 }
924 } else if ch == '-' {
925 proceed = true
926 } else {
927 // bad group name - starts with something other than a word character and isn't a number
928 return nil, p.getErr(ErrInvalidGroupName)
929 }
930
931 // grab part after - if any
932
933 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
934 p.moveRight(1)
935
936 //no more chars left, no closing char, etc
937 if p.charsRight() == 0 {
938 return nil, p.getErr(ErrInvalidGroupName)
939 }
940
941 ch = p.rightChar(0)
942 if ch >= '0' && ch <= '9' {
943 if uncapnum, err = p.scanDecimal(); err != nil {
944 return nil, err
945 }
946
947 if !p.isCaptureSlot(uncapnum) {
948 return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
949 }
950
951 // check if we have bogus characters after the number
952 if p.charsRight() > 0 && p.rightChar(0) != close {
953 return nil, p.getErr(ErrInvalidGroupName)
954 }
955 } else if IsWordChar(ch) {
956 uncapname := p.scanCapname()
957
958 if !p.isCaptureName(uncapname) {
959 return nil, p.getErr(ErrUndefinedNameRef, uncapname)
960 }
961 uncapnum = p.captureSlotFromName(uncapname)
962
963 // check if we have bogus character after the name
964 if p.charsRight() > 0 && p.rightChar(0) != close {
965 return nil, p.getErr(ErrInvalidGroupName)
966 }
967 } else {
968 // bad group name - starts with something other than a word character and isn't a number
969 return nil, p.getErr(ErrInvalidGroupName)
970 }
971 }
972
973 // actually make the node
974
975 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
976 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
977 }
978 goto BreakRecognize
979 }
980
981 case '(':
982 // alternation construct (?(...) | )
983
984 parenPos := p.textpos()
985 if p.charsRight() > 0 {
986 ch = p.rightChar(0)
987
988 // check if the alternation condition is a backref
989 if ch >= '0' && ch <= '9' {
990 var capnum int
991 if capnum, err = p.scanDecimal(); err != nil {
992 return nil, err
993 }
994 if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
995 if p.isCaptureSlot(capnum) {
996 return newRegexNodeM(ntTestref, p.options, capnum), nil
997 }
998 return nil, p.getErr(ErrUndefinedReference, capnum)
999 }
1000
1001 return nil, p.getErr(ErrMalformedReference, capnum)
1002
1003 } else if IsWordChar(ch) {
1004 capname := p.scanCapname()
1005
1006 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1007 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1008 }
1009 }
1010 }
1011 // not a backref
1012 nt = ntTestgroup
1013 p.textto(parenPos - 1) // jump to the start of the parentheses
1014 p.ignoreNextParen = true // but make sure we don't try to capture the insides
1015
1016 charsRight := p.charsRight()
1017 if charsRight >= 3 && p.rightChar(1) == '?' {
1018 rightchar2 := p.rightChar(2)
1019 // disallow comments in the condition
1020 if rightchar2 == '#' {
1021 return nil, p.getErr(ErrAlternationCantHaveComment)
1022 }
1023
1024 // disallow named capture group (?<..>..) in the condition
1025 if rightchar2 == '\'' {
1026 return nil, p.getErr(ErrAlternationCantCapture)
1027 }
1028
1029 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1030 return nil, p.getErr(ErrAlternationCantCapture)
1031 }
1032 }
1033
1034 case 'P':
1035 if p.useRE2() {
1036 // support for P<name> syntax
1037 if p.charsRight() < 3 {
1038 goto BreakRecognize
1039 }
1040
1041 ch = p.moveRightGetChar()
1042 if ch != '<' {
1043 goto BreakRecognize
1044 }
1045
1046 ch = p.moveRightGetChar()
1047 p.moveLeft()
1048
1049 if IsWordChar(ch) {
1050 capnum := -1
1051 capname := p.scanCapname()
1052
1053 if p.isCaptureName(capname) {
1054 capnum = p.captureSlotFromName(capname)
1055 }
1056
1057 // check if we have bogus character after the name
1058 if p.charsRight() > 0 && p.rightChar(0) != '>' {
1059 return nil, p.getErr(ErrInvalidGroupName)
1060 }
1061
1062 // actually make the node
1063
1064 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1065 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1066 }
1067 goto BreakRecognize
1068
1069 } else {
1070 // bad group name - starts with something other than a word character and isn't a number
1071 return nil, p.getErr(ErrInvalidGroupName)
1072 }
1073 }
1074 // if we're not using RE2 compat mode then
1075 // we just behave like normal
1076 fallthrough
1077
1078 default:
1079 p.moveLeft()
1080
1081 nt = ntGroup
1082 // disallow options in the children of a testgroup node
1083 if p.group.t != ntTestgroup {
1084 p.scanOptions()
1085 }
1086 if p.charsRight() == 0 {
1087 goto BreakRecognize
1088 }
1089
1090 if ch = p.moveRightGetChar(); ch == ')' {
1091 return nil, nil
1092 }
1093
1094 if ch != ':' {
1095 goto BreakRecognize
1096 }
1097
1098 }
1099
1100 return newRegexNode(nt, p.options), nil
1101 }
1102
1103BreakRecognize:
1104
1105 // break Recognize comes here
1106
1107 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1108}
1109
1110// scans backslash specials and basics
1111func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1112
1113 if p.charsRight() == 0 {
1114 return nil, p.getErr(ErrIllegalEndEscape)
1115 }
1116
1117 switch ch := p.rightChar(0); ch {
1118 case 'b', 'B', 'A', 'G', 'Z', 'z':
1119 p.moveRight(1)
1120 return newRegexNode(p.typeFromCode(ch), p.options), nil
1121
1122 case 'w':
1123 p.moveRight(1)
1124 if p.useOptionE() {
1125 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1126 }
1127 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1128
1129 case 'W':
1130 p.moveRight(1)
1131 if p.useOptionE() {
1132 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1133 }
1134 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1135
1136 case 's':
1137 p.moveRight(1)
1138 if p.useOptionE() {
1139 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140 }
1141 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1142
1143 case 'S':
1144 p.moveRight(1)
1145 if p.useOptionE() {
1146 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1147 }
1148 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1149
1150 case 'd':
1151 p.moveRight(1)
1152 if p.useOptionE() {
1153 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1156
1157 case 'D':
1158 p.moveRight(1)
1159 if p.useOptionE() {
1160 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1163
1164 case 'p', 'P':
1165 p.moveRight(1)
1166 prop, err := p.parseProperty()
1167 if err != nil {
1168 return nil, err
1169 }
1170 cc := &CharSet{}
1171 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1172 if p.useOptionI() {
1173 cc.addLowercase()
1174 }
1175
1176 return newRegexNodeSet(ntSet, p.options, cc), nil
1177
1178 default:
1179 return p.scanBasicBackslash(scanOnly)
1180 }
1181}
1182
1183// Scans \-style backreferences and character escapes
1184func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1185 if p.charsRight() == 0 {
1186 return nil, p.getErr(ErrIllegalEndEscape)
1187 }
1188 angled := false
1189 close := '\x00'
1190
1191 backpos := p.textpos()
1192 ch := p.rightChar(0)
1193
1194 // allow \k<foo> instead of \<foo>, which is now deprecated
1195
1196 if ch == 'k' {
1197 if p.charsRight() >= 2 {
1198 p.moveRight(1)
1199 ch = p.moveRightGetChar()
1200
1201 if ch == '<' || ch == '\'' {
1202 angled = true
1203 if ch == '\'' {
1204 close = '\''
1205 } else {
1206 close = '>'
1207 }
1208 }
1209 }
1210
1211 if !angled || p.charsRight() <= 0 {
1212 return nil, p.getErr(ErrMalformedNameRef)
1213 }
1214
1215 ch = p.rightChar(0)
1216
1217 } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1218 angled = true
1219 if ch == '\'' {
1220 close = '\''
1221 } else {
1222 close = '>'
1223 }
1224
1225 p.moveRight(1)
1226 ch = p.rightChar(0)
1227 }
1228
1229 // Try to parse backreference: \<1> or \<cap>
1230
1231 if angled && ch >= '0' && ch <= '9' {
1232 capnum, err := p.scanDecimal()
1233 if err != nil {
1234 return nil, err
1235 }
1236
1237 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1238 if p.isCaptureSlot(capnum) {
1239 return newRegexNodeM(ntRef, p.options, capnum), nil
1240 }
1241 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1242 }
1243 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1244 capnum, err := p.scanDecimal()
1245 if err != nil {
1246 return nil, err
1247 }
1248
1249 if scanOnly {
1250 return nil, nil
1251 }
1252
1253 if p.isCaptureSlot(capnum) {
1254 return newRegexNodeM(ntRef, p.options, capnum), nil
1255 }
1256 if capnum <= 9 && !p.useOptionE() {
1257 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1258 }
1259
1260 } else if angled && IsWordChar(ch) {
1261 capname := p.scanCapname()
1262
1263 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1264 if p.isCaptureName(capname) {
1265 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1266 }
1267 return nil, p.getErr(ErrUndefinedNameRef, capname)
1268 }
1269 }
1270
1271 // Not backreference: must be char code
1272
1273 p.textto(backpos)
1274 ch, err := p.scanCharEscape()
1275 if err != nil {
1276 return nil, err
1277 }
1278
1279 if p.useOptionI() {
1280 ch = unicode.ToLower(ch)
1281 }
1282
1283 return newRegexNodeCh(ntOne, p.options, ch), nil
1284}
1285
1286// Scans X for \p{X} or \P{X}
1287func (p *parser) parseProperty() (string, error) {
1288 if p.charsRight() < 3 {
1289 return "", p.getErr(ErrIncompleteSlashP)
1290 }
1291 ch := p.moveRightGetChar()
1292 if ch != '{' {
1293 return "", p.getErr(ErrMalformedSlashP)
1294 }
1295
1296 startpos := p.textpos()
1297 for p.charsRight() > 0 {
1298 ch = p.moveRightGetChar()
1299 if !(IsWordChar(ch) || ch == '-') {
1300 p.moveLeft()
1301 break
1302 }
1303 }
1304 capname := string(p.pattern[startpos:p.textpos()])
1305
1306 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1307 return "", p.getErr(ErrIncompleteSlashP)
1308 }
1309
1310 if !isValidUnicodeCat(capname) {
1311 return "", p.getErr(ErrUnknownSlashP, capname)
1312 }
1313
1314 return capname, nil
1315}
1316
1317// Returns ReNode type for zero-length assertions with a \ code.
1318func (p *parser) typeFromCode(ch rune) nodeType {
1319 switch ch {
1320 case 'b':
1321 if p.useOptionE() {
1322 return ntECMABoundary
1323 }
1324 return ntBoundary
1325 case 'B':
1326 if p.useOptionE() {
1327 return ntNonECMABoundary
1328 }
1329 return ntNonboundary
1330 case 'A':
1331 return ntBeginning
1332 case 'G':
1333 return ntStart
1334 case 'Z':
1335 return ntEndZ
1336 case 'z':
1337 return ntEnd
1338 default:
1339 return ntNothing
1340 }
1341}
1342
1343// Scans whitespace or x-mode comments.
1344func (p *parser) scanBlank() error {
1345 if p.useOptionX() {
1346 for {
1347 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1348 p.moveRight(1)
1349 }
1350
1351 if p.charsRight() == 0 {
1352 break
1353 }
1354
1355 if p.rightChar(0) == '#' {
1356 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1357 p.moveRight(1)
1358 }
1359 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1360 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1361 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1362 p.moveRight(1)
1363 }
1364 if p.charsRight() == 0 {
1365 return p.getErr(ErrUnterminatedComment)
1366 }
1367 p.moveRight(1)
1368 } else {
1369 break
1370 }
1371 }
1372 } else {
1373 for {
1374 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1375 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1376 return nil
1377 }
1378
1379 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1380 p.moveRight(1)
1381 }
1382 if p.charsRight() == 0 {
1383 return p.getErr(ErrUnterminatedComment)
1384 }
1385 p.moveRight(1)
1386 }
1387 }
1388 return nil
1389}
1390
1391func (p *parser) scanCapname() string {
1392 startpos := p.textpos()
1393
1394 for p.charsRight() > 0 {
1395 if !IsWordChar(p.moveRightGetChar()) {
1396 p.moveLeft()
1397 break
1398 }
1399 }
1400
1401 return string(p.pattern[startpos:p.textpos()])
1402}
1403
1404//Scans contents of [] (not including []'s), and converts to a set.
1405func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1406 ch := '\x00'
1407 chPrev := '\x00'
1408 inRange := false
1409 firstChar := true
1410 closed := false
1411
1412 var cc *CharSet
1413 if !scanOnly {
1414 cc = &CharSet{}
1415 }
1416
1417 if p.charsRight() > 0 && p.rightChar(0) == '^' {
1418 p.moveRight(1)
1419 if !scanOnly {
1420 cc.negate = true
1421 }
1422 }
1423
1424 for ; p.charsRight() > 0; firstChar = false {
1425 fTranslatedChar := false
1426 ch = p.moveRightGetChar()
1427 if ch == ']' {
1428 if !firstChar {
1429 closed = true
1430 break
1431 } else if p.useOptionE() {
1432 if !scanOnly {
1433 cc.addRanges(NoneClass().ranges)
1434 }
1435 closed = true
1436 break
1437 }
1438
1439 } else if ch == '\\' && p.charsRight() > 0 {
1440 switch ch = p.moveRightGetChar(); ch {
1441 case 'D', 'd':
1442 if !scanOnly {
1443 if inRange {
1444 return nil, p.getErr(ErrBadClassInCharRange, ch)
1445 }
1446 cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1447 }
1448 continue
1449
1450 case 'S', 's':
1451 if !scanOnly {
1452 if inRange {
1453 return nil, p.getErr(ErrBadClassInCharRange, ch)
1454 }
1455 cc.addSpace(p.useOptionE(), ch == 'S')
1456 }
1457 continue
1458
1459 case 'W', 'w':
1460 if !scanOnly {
1461 if inRange {
1462 return nil, p.getErr(ErrBadClassInCharRange, ch)
1463 }
1464
1465 cc.addWord(p.useOptionE(), ch == 'W')
1466 }
1467 continue
1468
1469 case 'p', 'P':
1470 if !scanOnly {
1471 if inRange {
1472 return nil, p.getErr(ErrBadClassInCharRange, ch)
1473 }
1474 prop, err := p.parseProperty()
1475 if err != nil {
1476 return nil, err
1477 }
1478 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1479 } else {
1480 p.parseProperty()
1481 }
1482
1483 continue
1484
1485 case '-':
1486 if !scanOnly {
1487 cc.addRange(ch, ch)
1488 }
1489 continue
1490
1491 default:
1492 p.moveLeft()
1493 var err error
1494 ch, err = p.scanCharEscape() // non-literal character
1495 if err != nil {
1496 return nil, err
1497 }
1498 fTranslatedChar = true
1499 break // this break will only break out of the switch
1500 }
1501 } else if ch == '[' {
1502 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1503 // It currently doesn't do anything other than skip the whole thing!
1504 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1505 savePos := p.textpos()
1506
1507 p.moveRight(1)
1508 negate := false
1509 if p.charsRight() > 1 && p.rightChar(0) == '^' {
1510 negate = true
1511 p.moveRight(1)
1512 }
1513
1514 nm := p.scanCapname() // snag the name
1515 if !scanOnly && p.useRE2() {
1516 // look up the name since these are valid for RE2
1517 // add the group based on the name
1518 if ok := cc.addNamedASCII(nm, negate); !ok {
1519 return nil, p.getErr(ErrInvalidCharRange)
1520 }
1521 }
1522 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1523 p.textto(savePos)
1524 } else if p.useRE2() {
1525 // move on
1526 continue
1527 }
1528 }
1529 }
1530
1531 if inRange {
1532 inRange = false
1533 if !scanOnly {
1534 if ch == '[' && !fTranslatedChar && !firstChar {
1535 // We thought we were in a range, but we're actually starting a subtraction.
1536 // In that case, we'll add chPrev to our char class, skip the opening [, and
1537 // scan the new character class recursively.
1538 cc.addChar(chPrev)
1539 sub, err := p.scanCharSet(caseInsensitive, false)
1540 if err != nil {
1541 return nil, err
1542 }
1543 cc.addSubtraction(sub)
1544
1545 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1546 return nil, p.getErr(ErrSubtractionMustBeLast)
1547 }
1548 } else {
1549 // a regular range, like a-z
1550 if chPrev > ch {
1551 return nil, p.getErr(ErrReversedCharRange)
1552 }
1553 cc.addRange(chPrev, ch)
1554 }
1555 }
1556 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1557 // this could be the start of a range
1558 chPrev = ch
1559 inRange = true
1560 p.moveRight(1)
1561 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1562 // we aren't in a range, and now there is a subtraction. Usually this happens
1563 // only when a subtraction follows a range, like [a-z-[b]]
1564 if !scanOnly {
1565 p.moveRight(1)
1566 sub, err := p.scanCharSet(caseInsensitive, false)
1567 if err != nil {
1568 return nil, err
1569 }
1570 cc.addSubtraction(sub)
1571
1572 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1573 return nil, p.getErr(ErrSubtractionMustBeLast)
1574 }
1575 } else {
1576 p.moveRight(1)
1577 p.scanCharSet(caseInsensitive, true)
1578 }
1579 } else {
1580 if !scanOnly {
1581 cc.addRange(ch, ch)
1582 }
1583 }
1584 }
1585
1586 if !closed {
1587 return nil, p.getErr(ErrUnterminatedBracket)
1588 }
1589
1590 if !scanOnly && caseInsensitive {
1591 cc.addLowercase()
1592 }
1593
1594 return cc, nil
1595}
1596
1597// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1598func (p *parser) scanDecimal() (int, error) {
1599 i := 0
1600 var d int
1601
1602 for p.charsRight() > 0 {
1603 d = int(p.rightChar(0) - '0')
1604 if d < 0 || d > 9 {
1605 break
1606 }
1607 p.moveRight(1)
1608
1609 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1610 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1611 }
1612
1613 i *= 10
1614 i += d
1615 }
1616
1617 return int(i), nil
1618}
1619
1620// Returns true for options allowed only at the top level
1621func isOnlyTopOption(option RegexOptions) bool {
1622 return option == RightToLeft || option == ECMAScript || option == RE2
1623}
1624
1625// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1626func (p *parser) scanOptions() {
1627
1628 for off := false; p.charsRight() > 0; p.moveRight(1) {
1629 ch := p.rightChar(0)
1630
1631 if ch == '-' {
1632 off = true
1633 } else if ch == '+' {
1634 off = false
1635 } else {
1636 option := optionFromCode(ch)
1637 if option == 0 || isOnlyTopOption(option) {
1638 return
1639 }
1640
1641 if off {
1642 p.options &= ^option
1643 } else {
1644 p.options |= option
1645 }
1646 }
1647 }
1648}
1649
1650// Scans \ code for escape codes that map to single unicode chars.
1651func (p *parser) scanCharEscape() (r rune, err error) {
1652
1653 ch := p.moveRightGetChar()
1654
1655 if ch >= '0' && ch <= '7' {
1656 p.moveLeft()
1657 return p.scanOctal(), nil
1658 }
1659
1660 pos := p.textpos()
1661
1662 switch ch {
1663 case 'x':
1664 // support for \x{HEX} syntax from Perl and PCRE
1665 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1666 if p.useOptionE() {
1667 return ch, nil
1668 }
1669 p.moveRight(1)
1670 return p.scanHexUntilBrace()
1671 } else {
1672 r, err = p.scanHex(2)
1673 }
1674 case 'u':
1675 r, err = p.scanHex(4)
1676 case 'a':
1677 return '\u0007', nil
1678 case 'b':
1679 return '\b', nil
1680 case 'e':
1681 return '\u001B', nil
1682 case 'f':
1683 return '\f', nil
1684 case 'n':
1685 return '\n', nil
1686 case 'r':
1687 return '\r', nil
1688 case 't':
1689 return '\t', nil
1690 case 'v':
1691 return '\u000B', nil
1692 case 'c':
1693 r, err = p.scanControl()
1694 default:
1695 if !p.useOptionE() && IsWordChar(ch) {
1696 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1697 }
1698 return ch, nil
1699 }
1700 if err != nil && p.useOptionE() {
1701 p.textto(pos)
1702 return ch, nil
1703 }
1704 return
1705}
1706
1707// Grabs and converts an ascii control character
1708func (p *parser) scanControl() (rune, error) {
1709 if p.charsRight() <= 0 {
1710 return 0, p.getErr(ErrMissingControl)
1711 }
1712
1713 ch := p.moveRightGetChar()
1714
1715 // \ca interpreted as \cA
1716
1717 if ch >= 'a' && ch <= 'z' {
1718 ch = (ch - ('a' - 'A'))
1719 }
1720 ch = (ch - '@')
1721 if ch >= 0 && ch < ' ' {
1722 return ch, nil
1723 }
1724
1725 return 0, p.getErr(ErrUnrecognizedControl)
1726
1727}
1728
1729// Scan hex digits until we hit a closing brace.
1730// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1731func (p *parser) scanHexUntilBrace() (rune, error) {
1732 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1733 // so we can enforce that
1734 i := 0
1735 hasContent := false
1736
1737 for p.charsRight() > 0 {
1738 ch := p.moveRightGetChar()
1739 if ch == '}' {
1740 // hit our close brace, we're done here
1741 // prevent \x{}
1742 if !hasContent {
1743 return 0, p.getErr(ErrTooFewHex)
1744 }
1745 return rune(i), nil
1746 }
1747 hasContent = true
1748 // no brace needs to be hex digit
1749 d := hexDigit(ch)
1750 if d < 0 {
1751 return 0, p.getErr(ErrMissingBrace)
1752 }
1753
1754 i *= 0x10
1755 i += d
1756
1757 if i > unicode.MaxRune {
1758 return 0, p.getErr(ErrInvalidHex)
1759 }
1760 }
1761
1762 // we only make it here if we run out of digits without finding the brace
1763 return 0, p.getErr(ErrMissingBrace)
1764}
1765
1766// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1767func (p *parser) scanHex(c int) (rune, error) {
1768
1769 i := 0
1770
1771 if p.charsRight() >= c {
1772 for c > 0 {
1773 d := hexDigit(p.moveRightGetChar())
1774 if d < 0 {
1775 break
1776 }
1777 i *= 0x10
1778 i += d
1779 c--
1780 }
1781 }
1782
1783 if c > 0 {
1784 return 0, p.getErr(ErrTooFewHex)
1785 }
1786
1787 return rune(i), nil
1788}
1789
1790// Returns n <= 0xF for a hex digit.
1791func hexDigit(ch rune) int {
1792
1793 if d := uint(ch - '0'); d <= 9 {
1794 return int(d)
1795 }
1796
1797 if d := uint(ch - 'a'); d <= 5 {
1798 return int(d + 0xa)
1799 }
1800
1801 if d := uint(ch - 'A'); d <= 5 {
1802 return int(d + 0xa)
1803 }
1804
1805 return -1
1806}
1807
1808// Scans up to three octal digits (stops before exceeding 0377).
1809func (p *parser) scanOctal() rune {
1810 // Consume octal chars only up to 3 digits and value 0377
1811
1812 c := 3
1813
1814 if c > p.charsRight() {
1815 c = p.charsRight()
1816 }
1817
1818 //we know the first char is good because the caller had to check
1819 i := 0
1820 d := int(p.rightChar(0) - '0')
1821 for c > 0 && d <= 7 && d >= 0 {
1822 if i >= 0x20 && p.useOptionE() {
1823 break
1824 }
1825 i *= 8
1826 i += d
1827 c--
1828
1829 p.moveRight(1)
1830 if !p.rightMost() {
1831 d = int(p.rightChar(0) - '0')
1832 }
1833 }
1834
1835 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1836 // is simply to truncate the high bits.
1837 i &= 0xFF
1838
1839 return rune(i)
1840}
1841
1842// Returns the current parsing position.
1843func (p *parser) textpos() int {
1844 return p.currentPos
1845}
1846
1847// Zaps to a specific parsing position.
1848func (p *parser) textto(pos int) {
1849 p.currentPos = pos
1850}
1851
1852// Returns the char at the right of the current parsing position and advances to the right.
1853func (p *parser) moveRightGetChar() rune {
1854 ch := p.pattern[p.currentPos]
1855 p.currentPos++
1856 return ch
1857}
1858
1859// Moves the current position to the right.
1860func (p *parser) moveRight(i int) {
1861 // default would be 1
1862 p.currentPos += i
1863}
1864
1865// Moves the current parsing position one to the left.
1866func (p *parser) moveLeft() {
1867 p.currentPos--
1868}
1869
1870// Returns the char left of the current parsing position.
1871func (p *parser) charAt(i int) rune {
1872 return p.pattern[i]
1873}
1874
1875// Returns the char i chars right of the current parsing position.
1876func (p *parser) rightChar(i int) rune {
1877 // default would be 0
1878 return p.pattern[p.currentPos+i]
1879}
1880
1881// Number of characters to the right of the current parsing position.
1882func (p *parser) charsRight() int {
1883 return len(p.pattern) - p.currentPos
1884}
1885
1886func (p *parser) rightMost() bool {
1887 return p.currentPos == len(p.pattern)
1888}
1889
1890// Looks up the slot number for a given name
1891func (p *parser) captureSlotFromName(capname string) int {
1892 return p.capnames[capname]
1893}
1894
1895// True if the capture slot was noted
1896func (p *parser) isCaptureSlot(i int) bool {
1897 if p.caps != nil {
1898 _, ok := p.caps[i]
1899 return ok
1900 }
1901
1902 return (i >= 0 && i < p.capsize)
1903}
1904
1905// Looks up the slot number for a given name
1906func (p *parser) isCaptureName(capname string) bool {
1907 if p.capnames == nil {
1908 return false
1909 }
1910
1911 _, ok := p.capnames[capname]
1912 return ok
1913}
1914
1915// option shortcuts
1916
1917// True if N option disabling '(' autocapture is on.
1918func (p *parser) useOptionN() bool {
1919 return (p.options & ExplicitCapture) != 0
1920}
1921
1922// True if I option enabling case-insensitivity is on.
1923func (p *parser) useOptionI() bool {
1924 return (p.options & IgnoreCase) != 0
1925}
1926
1927// True if M option altering meaning of $ and ^ is on.
1928func (p *parser) useOptionM() bool {
1929 return (p.options & Multiline) != 0
1930}
1931
1932// True if S option altering meaning of . is on.
1933func (p *parser) useOptionS() bool {
1934 return (p.options & Singleline) != 0
1935}
1936
1937// True if X option enabling whitespace/comment mode is on.
1938func (p *parser) useOptionX() bool {
1939 return (p.options & IgnorePatternWhitespace) != 0
1940}
1941
1942// True if E option enabling ECMAScript behavior on.
1943func (p *parser) useOptionE() bool {
1944 return (p.options & ECMAScript) != 0
1945}
1946
1947// true to use RE2 compatibility parsing behavior.
1948func (p *parser) useRE2() bool {
1949 return (p.options & RE2) != 0
1950}
1951
1952// True if options stack is empty.
1953func (p *parser) emptyOptionsStack() bool {
1954 return len(p.optionsStack) == 0
1955}
1956
1957// Finish the current quantifiable (when a quantifier is not found or is not possible)
1958func (p *parser) addConcatenate() {
1959 // The first (| inside a Testgroup group goes directly to the group
1960 p.concatenation.addChild(p.unit)
1961 p.unit = nil
1962}
1963
1964// Finish the current quantifiable (when a quantifier is found)
1965func (p *parser) addConcatenate3(lazy bool, min, max int) {
1966 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
1967 p.unit = nil
1968}
1969
1970// Sets the current unit to a single char node
1971func (p *parser) addUnitOne(ch rune) {
1972 if p.useOptionI() {
1973 ch = unicode.ToLower(ch)
1974 }
1975
1976 p.unit = newRegexNodeCh(ntOne, p.options, ch)
1977}
1978
1979// Sets the current unit to a single inverse-char node
1980func (p *parser) addUnitNotone(ch rune) {
1981 if p.useOptionI() {
1982 ch = unicode.ToLower(ch)
1983 }
1984
1985 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
1986}
1987
1988// Sets the current unit to a single set node
1989func (p *parser) addUnitSet(set *CharSet) {
1990 p.unit = newRegexNodeSet(ntSet, p.options, set)
1991}
1992
1993// Sets the current unit to a subtree
1994func (p *parser) addUnitNode(node *regexNode) {
1995 p.unit = node
1996}
1997
1998// Sets the current unit to an assertion of the specified type
1999func (p *parser) addUnitType(t nodeType) {
2000 p.unit = newRegexNode(t, p.options)
2001}
2002
2003// Finish the current group (in response to a ')' or end)
2004func (p *parser) addGroup() error {
2005 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2006 p.group.addChild(p.concatenation.reverseLeft())
2007 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2008 return p.getErr(ErrTooManyAlternates)
2009 }
2010 } else {
2011 p.alternation.addChild(p.concatenation.reverseLeft())
2012 p.group.addChild(p.alternation)
2013 }
2014
2015 p.unit = p.group
2016 return nil
2017}
2018
2019// Pops the option stack, but keeps the current options unchanged.
2020func (p *parser) popKeepOptions() {
2021 lastIdx := len(p.optionsStack) - 1
2022 p.optionsStack = p.optionsStack[:lastIdx]
2023}
2024
2025// Recalls options from the stack.
2026func (p *parser) popOptions() {
2027 lastIdx := len(p.optionsStack) - 1
2028 // get the last item on the stack and then remove it by reslicing
2029 p.options = p.optionsStack[lastIdx]
2030 p.optionsStack = p.optionsStack[:lastIdx]
2031}
2032
2033// Saves options on a stack.
2034func (p *parser) pushOptions() {
2035 p.optionsStack = append(p.optionsStack, p.options)
2036}
2037
2038// Add a string to the last concatenate.
2039func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2040 var node *regexNode
2041
2042 if cch == 0 {
2043 return
2044 }
2045
2046 if cch > 1 {
2047 str := p.pattern[pos : pos+cch]
2048
2049 if p.useOptionI() && !isReplacement {
2050 // We do the ToLower character by character for consistency. With surrogate chars, doing
2051 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
2052 // linguistically, but since Regex doesn't support surrogates, it's more important to be
2053 // consistent.
2054 for i := 0; i < len(str); i++ {
2055 str[i] = unicode.ToLower(str[i])
2056 }
2057 }
2058
2059 node = newRegexNodeStr(ntMulti, p.options, str)
2060 } else {
2061 ch := p.charAt(pos)
2062
2063 if p.useOptionI() && !isReplacement {
2064 ch = unicode.ToLower(ch)
2065 }
2066
2067 node = newRegexNodeCh(ntOne, p.options, ch)
2068 }
2069
2070 p.concatenation.addChild(node)
2071}
2072
2073// Push the parser state (in response to an open paren)
2074func (p *parser) pushGroup() {
2075 p.group.next = p.stack
2076 p.alternation.next = p.group
2077 p.concatenation.next = p.alternation
2078 p.stack = p.concatenation
2079}
2080
2081// Remember the pushed state (in response to a ')')
2082func (p *parser) popGroup() error {
2083 p.concatenation = p.stack
2084 p.alternation = p.concatenation.next
2085 p.group = p.alternation.next
2086 p.stack = p.group.next
2087
2088 // The first () inside a Testgroup group goes directly to the group
2089 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2090 if p.unit == nil {
2091 return p.getErr(ErrConditionalExpression)
2092 }
2093
2094 p.group.addChild(p.unit)
2095 p.unit = nil
2096 }
2097 return nil
2098}
2099
2100// True if the group stack is empty.
2101func (p *parser) emptyStack() bool {
2102 return p.stack == nil
2103}
2104
2105// Start a new round for the parser state (in response to an open paren or string start)
2106func (p *parser) startGroup(openGroup *regexNode) {
2107 p.group = openGroup
2108 p.alternation = newRegexNode(ntAlternate, p.options)
2109 p.concatenation = newRegexNode(ntConcatenate, p.options)
2110}
2111
2112// Finish the current concatenation (in response to a |)
2113func (p *parser) addAlternate() {
2114 // The | parts inside a Testgroup group go directly to the group
2115
2116 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2117 p.group.addChild(p.concatenation.reverseLeft())
2118 } else {
2119 p.alternation.addChild(p.concatenation.reverseLeft())
2120 }
2121
2122 p.concatenation = newRegexNode(ntConcatenate, p.options)
2123}
2124
2125// For categorizing ascii characters.
2126
2127const (
2128 Q byte = 5 // quantifier
2129 S = 4 // ordinary stopper
2130 Z = 3 // ScanBlank stopper
2131 X = 2 // whitespace
2132 E = 1 // should be escaped
2133)
2134
2135var _category = []byte{
2136 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2137 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2138 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2139 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2140 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2142 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
2143 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2144}
2145
2146func isSpace(ch rune) bool {
2147 return (ch <= ' ' && _category[ch] == X)
2148}
2149
2150// Returns true for those characters that terminate a string of ordinary chars.
2151func isSpecial(ch rune) bool {
2152 return (ch <= '|' && _category[ch] >= S)
2153}
2154
2155// Returns true for those characters that terminate a string of ordinary chars.
2156func isStopperX(ch rune) bool {
2157 return (ch <= '|' && _category[ch] >= X)
2158}
2159
2160// Returns true for those characters that begin a quantifier.
2161func isQuantifier(ch rune) bool {
2162 return (ch <= '{' && _category[ch] >= Q)
2163}
2164
2165func (p *parser) isTrueQuantifier() bool {
2166 nChars := p.charsRight()
2167 if nChars == 0 {
2168 return false
2169 }
2170
2171 startpos := p.textpos()
2172 ch := p.charAt(startpos)
2173 if ch != '{' {
2174 return ch <= '{' && _category[ch] >= Q
2175 }
2176
2177 //UGLY: this is ugly -- the original code was ugly too
2178 pos := startpos
2179 for {
2180 nChars--
2181 if nChars <= 0 {
2182 break
2183 }
2184 pos++
2185 ch = p.charAt(pos)
2186 if ch < '0' || ch > '9' {
2187 break
2188 }
2189 }
2190
2191 if nChars == 0 || pos-startpos == 1 {
2192 return false
2193 }
2194 if ch == '}' {
2195 return true
2196 }
2197 if ch != ',' {
2198 return false
2199 }
2200 for {
2201 nChars--
2202 if nChars <= 0 {
2203 break
2204 }
2205 pos++
2206 ch = p.charAt(pos)
2207 if ch < '0' || ch > '9' {
2208 break
2209 }
2210 }
2211
2212 return nChars > 0 && ch == '}'
2213}
Note: See TracBrowser for help on using the repository browser.