source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/tree.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 15.6 KB
Line 
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "math"
7 "strconv"
8)
9
10type RegexTree struct {
11 root *regexNode
12 caps map[int]int
13 capnumlist []int
14 captop int
15 Capnames map[string]int
16 Caplist []string
17 options RegexOptions
18}
19
20// It is built into a parsed tree for a regular expression.
21
22// Implementation notes:
23//
24// Since the node tree is a temporary data structure only used
25// during compilation of the regexp to integer codes, it's
26// designed for clarity and convenience rather than
27// space efficiency.
28//
29// RegexNodes are built into a tree, linked by the n.children list.
30// Each node also has a n.parent and n.ichild member indicating
31// its parent and which child # it is in its parent's list.
32//
33// RegexNodes come in as many types as there are constructs in
34// a regular expression, for example, "concatenate", "alternate",
35// "one", "rept", "group". There are also node types for basic
36// peephole optimizations, e.g., "onerep", "notsetrep", etc.
37//
38// Because perl 5 allows "lookback" groups that scan backwards,
39// each node also gets a "direction". Normally the value of
40// boolean n.backward = false.
41//
42// During parsing, top-level nodes are also stacked onto a parse
43// stack (a stack of trees). For this purpose we have a n.next
44// pointer. [Note that to save a few bytes, we could overload the
45// n.parent pointer instead.]
46//
47// On the parse stack, each tree has a "role" - basically, the
48// nonterminal in the grammar that the parser has currently
49// assigned to the tree. That code is stored in n.role.
50//
51// Finally, some of the different kinds of nodes have data.
52// Two integers (for the looping constructs) are stored in
53// n.operands, an an object (either a string or a set)
54// is stored in n.data
55type regexNode struct {
56 t nodeType
57 children []*regexNode
58 str []rune
59 set *CharSet
60 ch rune
61 m int
62 n int
63 options RegexOptions
64 next *regexNode
65}
66
67type nodeType int32
68
69const (
70 // The following are leaves, and correspond to primitive operations
71
72 ntOnerep nodeType = 0 // lef,back char,min,max a {n}
73 ntNotonerep = 1 // lef,back char,min,max .{n}
74 ntSetrep = 2 // lef,back set,min,max [\d]{n}
75 ntOneloop = 3 // lef,back char,min,max a {,n}
76 ntNotoneloop = 4 // lef,back char,min,max .{,n}
77 ntSetloop = 5 // lef,back set,min,max [\d]{,n}
78 ntOnelazy = 6 // lef,back char,min,max a {,n}?
79 ntNotonelazy = 7 // lef,back char,min,max .{,n}?
80 ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
81 ntOne = 9 // lef char a
82 ntNotone = 10 // lef char [^a]
83 ntSet = 11 // lef set [a-z\s] \w \s \d
84 ntMulti = 12 // lef string abcd
85 ntRef = 13 // lef group \#
86 ntBol = 14 // ^
87 ntEol = 15 // $
88 ntBoundary = 16 // \b
89 ntNonboundary = 17 // \B
90 ntBeginning = 18 // \A
91 ntStart = 19 // \G
92 ntEndZ = 20 // \Z
93 ntEnd = 21 // \Z
94
95 // Interior nodes do not correspond to primitive operations, but
96 // control structures compositing other operations
97
98 // Concat and alternate take n children, and can run forward or backwards
99
100 ntNothing = 22 // []
101 ntEmpty = 23 // ()
102 ntAlternate = 24 // a|b
103 ntConcatenate = 25 // ab
104 ntLoop = 26 // m,x * + ? {,}
105 ntLazyloop = 27 // m,x *? +? ?? {,}?
106 ntCapture = 28 // n ()
107 ntGroup = 29 // (?:)
108 ntRequire = 30 // (?=) (?<=)
109 ntPrevent = 31 // (?!) (?<!)
110 ntGreedy = 32 // (?>) (?<)
111 ntTestref = 33 // (?(n) | )
112 ntTestgroup = 34 // (?(...) | )
113
114 ntECMABoundary = 41 // \b
115 ntNonECMABoundary = 42 // \B
116)
117
118func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
119 return &regexNode{
120 t: t,
121 options: opt,
122 }
123}
124
125func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
126 return &regexNode{
127 t: t,
128 options: opt,
129 ch: ch,
130 }
131}
132
133func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
134 return &regexNode{
135 t: t,
136 options: opt,
137 str: str,
138 }
139}
140
141func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
142 return &regexNode{
143 t: t,
144 options: opt,
145 set: set,
146 }
147}
148
149func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
150 return &regexNode{
151 t: t,
152 options: opt,
153 m: m,
154 }
155}
156func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
157 return &regexNode{
158 t: t,
159 options: opt,
160 m: m,
161 n: n,
162 }
163}
164
165func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
166 for i := 0; i < len(n.str); i++ {
167 buf.WriteRune(n.str[i])
168 }
169}
170
171func (n *regexNode) addChild(child *regexNode) {
172 reduced := child.reduce()
173 n.children = append(n.children, reduced)
174 reduced.next = n
175}
176
177func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
178 newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
179 n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
180}
181
182// removes children including the start but not the end index
183func (n *regexNode) removeChildren(startIndex, endIndex int) {
184 n.children = append(n.children[:startIndex], n.children[endIndex:]...)
185}
186
187// Pass type as OneLazy or OneLoop
188func (n *regexNode) makeRep(t nodeType, min, max int) {
189 n.t += (t - ntOne)
190 n.m = min
191 n.n = max
192}
193
194func (n *regexNode) reduce() *regexNode {
195 switch n.t {
196 case ntAlternate:
197 return n.reduceAlternation()
198
199 case ntConcatenate:
200 return n.reduceConcatenation()
201
202 case ntLoop, ntLazyloop:
203 return n.reduceRep()
204
205 case ntGroup:
206 return n.reduceGroup()
207
208 case ntSet, ntSetloop:
209 return n.reduceSet()
210
211 default:
212 return n
213 }
214}
215
216// Basic optimization. Single-letter alternations can be replaced
217// by faster set specifications, and nested alternations with no
218// intervening operators can be flattened:
219//
220// a|b|c|def|g|h -> [a-c]|def|[gh]
221// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
222func (n *regexNode) reduceAlternation() *regexNode {
223 if len(n.children) == 0 {
224 return newRegexNode(ntNothing, n.options)
225 }
226
227 wasLastSet := false
228 lastNodeCannotMerge := false
229 var optionsLast RegexOptions
230 var i, j int
231
232 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
233 at := n.children[i]
234
235 if j < i {
236 n.children[j] = at
237 }
238
239 for {
240 if at.t == ntAlternate {
241 for k := 0; k < len(at.children); k++ {
242 at.children[k].next = n
243 }
244 n.insertChildren(i+1, at.children)
245
246 j--
247 } else if at.t == ntSet || at.t == ntOne {
248 // Cannot merge sets if L or I options differ, or if either are negated.
249 optionsAt := at.options & (RightToLeft | IgnoreCase)
250
251 if at.t == ntSet {
252 if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
253 wasLastSet = true
254 lastNodeCannotMerge = !at.set.IsMergeable()
255 optionsLast = optionsAt
256 break
257 }
258 } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
259 wasLastSet = true
260 lastNodeCannotMerge = false
261 optionsLast = optionsAt
262 break
263 }
264
265 // The last node was a Set or a One, we're a Set or One and our options are the same.
266 // Merge the two nodes.
267 j--
268 prev := n.children[j]
269
270 var prevCharClass *CharSet
271 if prev.t == ntOne {
272 prevCharClass = &CharSet{}
273 prevCharClass.addChar(prev.ch)
274 } else {
275 prevCharClass = prev.set
276 }
277
278 if at.t == ntOne {
279 prevCharClass.addChar(at.ch)
280 } else {
281 prevCharClass.addSet(*at.set)
282 }
283
284 prev.t = ntSet
285 prev.set = prevCharClass
286 } else if at.t == ntNothing {
287 j--
288 } else {
289 wasLastSet = false
290 lastNodeCannotMerge = false
291 }
292 break
293 }
294 }
295
296 if j < i {
297 n.removeChildren(j, i)
298 }
299
300 return n.stripEnation(ntNothing)
301}
302
303// Basic optimization. Adjacent strings can be concatenated.
304//
305// (?:abc)(?:def) -> abcdef
306func (n *regexNode) reduceConcatenation() *regexNode {
307 // Eliminate empties and concat adjacent strings/chars
308
309 var optionsLast RegexOptions
310 var optionsAt RegexOptions
311 var i, j int
312
313 if len(n.children) == 0 {
314 return newRegexNode(ntEmpty, n.options)
315 }
316
317 wasLastString := false
318
319 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
320 var at, prev *regexNode
321
322 at = n.children[i]
323
324 if j < i {
325 n.children[j] = at
326 }
327
328 if at.t == ntConcatenate &&
329 ((at.options & RightToLeft) == (n.options & RightToLeft)) {
330 for k := 0; k < len(at.children); k++ {
331 at.children[k].next = n
332 }
333
334 //insert at.children at i+1 index in n.children
335 n.insertChildren(i+1, at.children)
336
337 j--
338 } else if at.t == ntMulti || at.t == ntOne {
339 // Cannot merge strings if L or I options differ
340 optionsAt = at.options & (RightToLeft | IgnoreCase)
341
342 if !wasLastString || optionsLast != optionsAt {
343 wasLastString = true
344 optionsLast = optionsAt
345 continue
346 }
347
348 j--
349 prev = n.children[j]
350
351 if prev.t == ntOne {
352 prev.t = ntMulti
353 prev.str = []rune{prev.ch}
354 }
355
356 if (optionsAt & RightToLeft) == 0 {
357 if at.t == ntOne {
358 prev.str = append(prev.str, at.ch)
359 } else {
360 prev.str = append(prev.str, at.str...)
361 }
362 } else {
363 if at.t == ntOne {
364 // insert at the front by expanding our slice, copying the data over, and then setting the value
365 prev.str = append(prev.str, 0)
366 copy(prev.str[1:], prev.str)
367 prev.str[0] = at.ch
368 } else {
369 //insert at the front...this one we'll make a new slice and copy both into it
370 merge := make([]rune, len(prev.str)+len(at.str))
371 copy(merge, at.str)
372 copy(merge[len(at.str):], prev.str)
373 prev.str = merge
374 }
375 }
376 } else if at.t == ntEmpty {
377 j--
378 } else {
379 wasLastString = false
380 }
381 }
382
383 if j < i {
384 // remove indices j through i from the children
385 n.removeChildren(j, i)
386 }
387
388 return n.stripEnation(ntEmpty)
389}
390
391// Nested repeaters just get multiplied with each other if they're not
392// too lumpy
393func (n *regexNode) reduceRep() *regexNode {
394
395 u := n
396 t := n.t
397 min := n.m
398 max := n.n
399
400 for {
401 if len(u.children) == 0 {
402 break
403 }
404
405 child := u.children[0]
406
407 // multiply reps of the same type only
408 if child.t != t {
409 childType := child.t
410
411 if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
412 childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
413 break
414 }
415 }
416
417 // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
418 // [but things like (a {2,})+ are not too lumpy...]
419 if u.m == 0 && child.m > 1 || child.n < child.m*2 {
420 break
421 }
422
423 u = child
424 if u.m > 0 {
425 if (math.MaxInt32-1)/u.m < min {
426 u.m = math.MaxInt32
427 } else {
428 u.m = u.m * min
429 }
430 }
431 if u.n > 0 {
432 if (math.MaxInt32-1)/u.n < max {
433 u.n = math.MaxInt32
434 } else {
435 u.n = u.n * max
436 }
437 }
438 }
439
440 if math.MaxInt32 == min {
441 return newRegexNode(ntNothing, n.options)
442 }
443 return u
444
445}
446
447// Simple optimization. If a concatenation or alternation has only
448// one child strip out the intermediate node. If it has zero children,
449// turn it into an empty.
450func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
451 switch len(n.children) {
452 case 0:
453 return newRegexNode(emptyType, n.options)
454 case 1:
455 return n.children[0]
456 default:
457 return n
458 }
459}
460
461func (n *regexNode) reduceGroup() *regexNode {
462 u := n
463
464 for u.t == ntGroup {
465 u = u.children[0]
466 }
467
468 return u
469}
470
471// Simple optimization. If a set is a singleton, an inverse singleton,
472// or empty, it's transformed accordingly.
473func (n *regexNode) reduceSet() *regexNode {
474 // Extract empty-set, one and not-one case as special
475
476 if n.set == nil {
477 n.t = ntNothing
478 } else if n.set.IsSingleton() {
479 n.ch = n.set.SingletonChar()
480 n.set = nil
481 n.t += (ntOne - ntSet)
482 } else if n.set.IsSingletonInverse() {
483 n.ch = n.set.SingletonChar()
484 n.set = nil
485 n.t += (ntNotone - ntSet)
486 }
487
488 return n
489}
490
491func (n *regexNode) reverseLeft() *regexNode {
492 if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
493 //reverse children order
494 for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
495 n.children[left], n.children[right] = n.children[right], n.children[left]
496 }
497 }
498
499 return n
500}
501
502func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
503 if min == 0 && max == 0 {
504 return newRegexNode(ntEmpty, n.options)
505 }
506
507 if min == 1 && max == 1 {
508 return n
509 }
510
511 switch n.t {
512 case ntOne, ntNotone, ntSet:
513 if lazy {
514 n.makeRep(Onelazy, min, max)
515 } else {
516 n.makeRep(Oneloop, min, max)
517 }
518 return n
519
520 default:
521 var t nodeType
522 if lazy {
523 t = ntLazyloop
524 } else {
525 t = ntLoop
526 }
527 result := newRegexNodeMN(t, n.options, min, max)
528 result.addChild(n)
529 return result
530 }
531}
532
533// debug functions
534
535var typeStr = []string{
536 "Onerep", "Notonerep", "Setrep",
537 "Oneloop", "Notoneloop", "Setloop",
538 "Onelazy", "Notonelazy", "Setlazy",
539 "One", "Notone", "Set",
540 "Multi", "Ref",
541 "Bol", "Eol", "Boundary", "Nonboundary",
542 "Beginning", "Start", "EndZ", "End",
543 "Nothing", "Empty",
544 "Alternate", "Concatenate",
545 "Loop", "Lazyloop",
546 "Capture", "Group", "Require", "Prevent", "Greedy",
547 "Testref", "Testgroup",
548 "Unknown", "Unknown", "Unknown",
549 "Unknown", "Unknown", "Unknown",
550 "ECMABoundary", "NonECMABoundary",
551}
552
553func (n *regexNode) description() string {
554 buf := &bytes.Buffer{}
555
556 buf.WriteString(typeStr[n.t])
557
558 if (n.options & ExplicitCapture) != 0 {
559 buf.WriteString("-C")
560 }
561 if (n.options & IgnoreCase) != 0 {
562 buf.WriteString("-I")
563 }
564 if (n.options & RightToLeft) != 0 {
565 buf.WriteString("-L")
566 }
567 if (n.options & Multiline) != 0 {
568 buf.WriteString("-M")
569 }
570 if (n.options & Singleline) != 0 {
571 buf.WriteString("-S")
572 }
573 if (n.options & IgnorePatternWhitespace) != 0 {
574 buf.WriteString("-X")
575 }
576 if (n.options & ECMAScript) != 0 {
577 buf.WriteString("-E")
578 }
579
580 switch n.t {
581 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
582 buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
583 break
584 case ntCapture:
585 buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
586 break
587 case ntRef, ntTestref:
588 buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
589 break
590 case ntMulti:
591 fmt.Fprintf(buf, "(String = %s)", string(n.str))
592 break
593 case ntSet, ntSetloop, ntSetlazy:
594 buf.WriteString("(Set = " + n.set.String() + ")")
595 break
596 }
597
598 switch n.t {
599 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
600 buf.WriteString("(Min = ")
601 buf.WriteString(strconv.Itoa(n.m))
602 buf.WriteString(", Max = ")
603 if n.n == math.MaxInt32 {
604 buf.WriteString("inf")
605 } else {
606 buf.WriteString(strconv.Itoa(n.n))
607 }
608 buf.WriteString(")")
609
610 break
611 }
612
613 return buf.String()
614}
615
616var padSpace = []byte(" ")
617
618func (t *RegexTree) Dump() string {
619 return t.root.dump()
620}
621
622func (n *regexNode) dump() string {
623 var stack []int
624 CurNode := n
625 CurChild := 0
626
627 buf := bytes.NewBufferString(CurNode.description())
628 buf.WriteRune('\n')
629
630 for {
631 if CurNode.children != nil && CurChild < len(CurNode.children) {
632 stack = append(stack, CurChild+1)
633 CurNode = CurNode.children[CurChild]
634 CurChild = 0
635
636 Depth := len(stack)
637 if Depth > 32 {
638 Depth = 32
639 }
640 buf.Write(padSpace[:Depth])
641 buf.WriteString(CurNode.description())
642 buf.WriteRune('\n')
643 } else {
644 if len(stack) == 0 {
645 break
646 }
647
648 CurChild = stack[len(stack)-1]
649 stack = stack[:len(stack)-1]
650 CurNode = CurNode.next
651 }
652 }
653 return buf.String()
654}
Note: See TracBrowser for help on using the repository browser.