Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/tree.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 15.6 KB

Line
1	package syntax
2
3	import (
4	"bytes"
5	"fmt"
6	"math"
7	"strconv"
8	)
9
10	type RegexTree struct {
11	root *regexNode
12	caps map[int]int
13	capnumlist []int
14	captop int
15	Capnames map[string]int
16	Caplist []string
17	options RegexOptions
18	}
19
20	// It is built into a parsed tree for a regular expression.
21
22	// Implementation notes:
23	//
24	// Since the node tree is a temporary data structure only used
25	// during compilation of the regexp to integer codes, it's
26	// designed for clarity and convenience rather than
27	// space efficiency.
28	//
29	// RegexNodes are built into a tree, linked by the n.children list.
30	// Each node also has a n.parent and n.ichild member indicating
31	// its parent and which child # it is in its parent's list.
32	//
33	// RegexNodes come in as many types as there are constructs in
34	// a regular expression, for example, "concatenate", "alternate",
35	// "one", "rept", "group". There are also node types for basic
36	// peephole optimizations, e.g., "onerep", "notsetrep", etc.
37	//
38	// Because perl 5 allows "lookback" groups that scan backwards,
39	// each node also gets a "direction". Normally the value of
40	// boolean n.backward = false.
41	//
42	// During parsing, top-level nodes are also stacked onto a parse
43	// stack (a stack of trees). For this purpose we have a n.next
44	// pointer. [Note that to save a few bytes, we could overload the
45	// n.parent pointer instead.]
46	//
47	// On the parse stack, each tree has a "role" - basically, the
48	// nonterminal in the grammar that the parser has currently
49	// assigned to the tree. That code is stored in n.role.
50	//
51	// Finally, some of the different kinds of nodes have data.
52	// Two integers (for the looping constructs) are stored in
53	// n.operands, an an object (either a string or a set)
54	// is stored in n.data
55	type regexNode struct {
56	t nodeType
57	children []*regexNode
58	str []rune
59	set *CharSet
60	ch rune
61	m int
62	n int
63	options RegexOptions
64	next *regexNode
65	}
66
67	type nodeType int32
68
69	const (
70	// The following are leaves, and correspond to primitive operations
71
72	ntOnerep nodeType = 0 // lef,back char,min,max a {n}
73	ntNotonerep = 1 // lef,back char,min,max .{n}
74	ntSetrep = 2 // lef,back set,min,max [\d]{n}
75	ntOneloop = 3 // lef,back char,min,max a {,n}
76	ntNotoneloop = 4 // lef,back char,min,max .{,n}
77	ntSetloop = 5 // lef,back set,min,max [\d]{,n}
78	ntOnelazy = 6 // lef,back char,min,max a {,n}?
79	ntNotonelazy = 7 // lef,back char,min,max .{,n}?
80	ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
81	ntOne = 9 // lef char a
82	ntNotone = 10 // lef char [^a]
83	ntSet = 11 // lef set [a-z\s] \w \s \d
84	ntMulti = 12 // lef string abcd
85	ntRef = 13 // lef group \#
86	ntBol = 14 // ^
87	ntEol = 15 // $
88	ntBoundary = 16 // \b
89	ntNonboundary = 17 // \B
90	ntBeginning = 18 // \A
91	ntStart = 19 // \G
92	ntEndZ = 20 // \Z
93	ntEnd = 21 // \Z
94
95	// Interior nodes do not correspond to primitive operations, but
96	// control structures compositing other operations
97
98	// Concat and alternate take n children, and can run forward or backwards
99
100	ntNothing = 22 // []
101	ntEmpty = 23 // ()
102	ntAlternate = 24 // a\|b
103	ntConcatenate = 25 // ab
104	ntLoop = 26 // m,x * + ? {,}
105	ntLazyloop = 27 // m,x *? +? ?? {,}?
106	ntCapture = 28 // n ()
107	ntGroup = 29 // (?:)
108	ntRequire = 30 // (?=) (?<=)
109	ntPrevent = 31 // (?!) (?<!)
110	ntGreedy = 32 // (?>) (?<)
111	ntTestref = 33 // (?(n) \| )
112	ntTestgroup = 34 // (?(...) \| )
113
114	ntECMABoundary = 41 // \b
115	ntNonECMABoundary = 42 // \B
116	)
117
118	func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
119	return &regexNode{
120	t: t,
121	options: opt,
122	}
123	}
124
125	func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
126	return &regexNode{
127	t: t,
128	options: opt,
129	ch: ch,
130	}
131	}
132
133	func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
134	return &regexNode{
135	t: t,
136	options: opt,
137	str: str,
138	}
139	}
140
141	func newRegexNodeSet(t nodeType, opt RegexOptions, set CharSet) regexNode {
142	return &regexNode{
143	t: t,
144	options: opt,
145	set: set,
146	}
147	}
148
149	func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
150	return &regexNode{
151	t: t,
152	options: opt,
153	m: m,
154	}
155	}
156	func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
157	return &regexNode{
158	t: t,
159	options: opt,
160	m: m,
161	n: n,
162	}
163	}
164
165	func (n regexNode) writeStrToBuf(buf bytes.Buffer) {
166	for i := 0; i < len(n.str); i++ {
167	buf.WriteRune(n.str[i])
168	}
169	}
170
171	func (n regexNode) addChild(child regexNode) {
172	reduced := child.reduce()
173	n.children = append(n.children, reduced)
174	reduced.next = n
175	}
176
177	func (n regexNode) insertChildren(afterIndex int, nodes []regexNode) {
178	newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
179	n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
180	}
181
182	// removes children including the start but not the end index
183	func (n *regexNode) removeChildren(startIndex, endIndex int) {
184	n.children = append(n.children[:startIndex], n.children[endIndex:]...)
185	}
186
187	// Pass type as OneLazy or OneLoop
188	func (n *regexNode) makeRep(t nodeType, min, max int) {
189	n.t += (t - ntOne)
190	n.m = min
191	n.n = max
192	}
193
194	func (n regexNode) reduce() regexNode {
195	switch n.t {
196	case ntAlternate:
197	return n.reduceAlternation()
198
199	case ntConcatenate:
200	return n.reduceConcatenation()
201
202	case ntLoop, ntLazyloop:
203	return n.reduceRep()
204
205	case ntGroup:
206	return n.reduceGroup()
207
208	case ntSet, ntSetloop:
209	return n.reduceSet()
210
211	default:
212	return n
213	}
214	}
215
216	// Basic optimization. Single-letter alternations can be replaced
217	// by faster set specifications, and nested alternations with no
218	// intervening operators can be flattened:
219	//
220	// a\|b\|c\|def\|g\|h -> [a-c]\|def\|[gh]
221	// apple\|(?:orange\|pear)\|grape -> apple\|orange\|pear\|grape
222	func (n regexNode) reduceAlternation() regexNode {
223	if len(n.children) == 0 {
224	return newRegexNode(ntNothing, n.options)
225	}
226
227	wasLastSet := false
228	lastNodeCannotMerge := false
229	var optionsLast RegexOptions
230	var i, j int
231
232	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
233	at := n.children[i]
234
235	if j < i {
236	n.children[j] = at
237	}
238
239	for {
240	if at.t == ntAlternate {
241	for k := 0; k < len(at.children); k++ {
242	at.children[k].next = n
243	}
244	n.insertChildren(i+1, at.children)
245
246	j--
247	} else if at.t == ntSet \|\| at.t == ntOne {
248	// Cannot merge sets if L or I options differ, or if either are negated.
249	optionsAt := at.options & (RightToLeft \| IgnoreCase)
250
251	if at.t == ntSet {
252	if !wasLastSet \|\| optionsLast != optionsAt \|\| lastNodeCannotMerge \|\| !at.set.IsMergeable() {
253	wasLastSet = true
254	lastNodeCannotMerge = !at.set.IsMergeable()
255	optionsLast = optionsAt
256	break
257	}
258	} else if !wasLastSet \|\| optionsLast != optionsAt \|\| lastNodeCannotMerge {
259	wasLastSet = true
260	lastNodeCannotMerge = false
261	optionsLast = optionsAt
262	break
263	}
264
265	// The last node was a Set or a One, we're a Set or One and our options are the same.
266	// Merge the two nodes.
267	j--
268	prev := n.children[j]
269
270	var prevCharClass *CharSet
271	if prev.t == ntOne {
272	prevCharClass = &CharSet{}
273	prevCharClass.addChar(prev.ch)
274	} else {
275	prevCharClass = prev.set
276	}
277
278	if at.t == ntOne {
279	prevCharClass.addChar(at.ch)
280	} else {
281	prevCharClass.addSet(*at.set)
282	}
283
284	prev.t = ntSet
285	prev.set = prevCharClass
286	} else if at.t == ntNothing {
287	j--
288	} else {
289	wasLastSet = false
290	lastNodeCannotMerge = false
291	}
292	break
293	}
294	}
295
296	if j < i {
297	n.removeChildren(j, i)
298	}
299
300	return n.stripEnation(ntNothing)
301	}
302
303	// Basic optimization. Adjacent strings can be concatenated.
304	//
305	// (?:abc)(?:def) -> abcdef
306	func (n regexNode) reduceConcatenation() regexNode {
307	// Eliminate empties and concat adjacent strings/chars
308
309	var optionsLast RegexOptions
310	var optionsAt RegexOptions
311	var i, j int
312
313	if len(n.children) == 0 {
314	return newRegexNode(ntEmpty, n.options)
315	}
316
317	wasLastString := false
318
319	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
320	var at, prev *regexNode
321
322	at = n.children[i]
323
324	if j < i {
325	n.children[j] = at
326	}
327
328	if at.t == ntConcatenate &&
329	((at.options & RightToLeft) == (n.options & RightToLeft)) {
330	for k := 0; k < len(at.children); k++ {
331	at.children[k].next = n
332	}
333
334	//insert at.children at i+1 index in n.children
335	n.insertChildren(i+1, at.children)
336
337	j--
338	} else if at.t == ntMulti \|\| at.t == ntOne {
339	// Cannot merge strings if L or I options differ
340	optionsAt = at.options & (RightToLeft \| IgnoreCase)
341
342	if !wasLastString \|\| optionsLast != optionsAt {
343	wasLastString = true
344	optionsLast = optionsAt
345	continue
346	}
347
348	j--
349	prev = n.children[j]
350
351	if prev.t == ntOne {
352	prev.t = ntMulti
353	prev.str = []rune{prev.ch}
354	}
355
356	if (optionsAt & RightToLeft) == 0 {
357	if at.t == ntOne {
358	prev.str = append(prev.str, at.ch)
359	} else {
360	prev.str = append(prev.str, at.str...)
361	}
362	} else {
363	if at.t == ntOne {
364	// insert at the front by expanding our slice, copying the data over, and then setting the value
365	prev.str = append(prev.str, 0)
366	copy(prev.str[1:], prev.str)
367	prev.str[0] = at.ch
368	} else {
369	//insert at the front...this one we'll make a new slice and copy both into it
370	merge := make([]rune, len(prev.str)+len(at.str))
371	copy(merge, at.str)
372	copy(merge[len(at.str):], prev.str)
373	prev.str = merge
374	}
375	}
376	} else if at.t == ntEmpty {
377	j--
378	} else {
379	wasLastString = false
380	}
381	}
382
383	if j < i {
384	// remove indices j through i from the children
385	n.removeChildren(j, i)
386	}
387
388	return n.stripEnation(ntEmpty)
389	}
390
391	// Nested repeaters just get multiplied with each other if they're not
392	// too lumpy
393	func (n regexNode) reduceRep() regexNode {
394
395	u := n
396	t := n.t
397	min := n.m
398	max := n.n
399
400	for {
401	if len(u.children) == 0 {
402	break
403	}
404
405	child := u.children[0]
406
407	// multiply reps of the same type only
408	if child.t != t {
409	childType := child.t
410
411	if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop \|\|
412	childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
413	break
414	}
415	}
416
417	// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
418	// [but things like (a {2,})+ are not too lumpy...]
419	if u.m == 0 && child.m > 1 \|\| child.n < child.m*2 {
420	break
421	}
422
423	u = child
424	if u.m > 0 {
425	if (math.MaxInt32-1)/u.m < min {
426	u.m = math.MaxInt32
427	} else {
428	u.m = u.m * min
429	}
430	}
431	if u.n > 0 {
432	if (math.MaxInt32-1)/u.n < max {
433	u.n = math.MaxInt32
434	} else {
435	u.n = u.n * max
436	}
437	}
438	}
439
440	if math.MaxInt32 == min {
441	return newRegexNode(ntNothing, n.options)
442	}
443	return u
444
445	}
446
447	// Simple optimization. If a concatenation or alternation has only
448	// one child strip out the intermediate node. If it has zero children,
449	// turn it into an empty.
450	func (n regexNode) stripEnation(emptyType nodeType) regexNode {
451	switch len(n.children) {
452	case 0:
453	return newRegexNode(emptyType, n.options)
454	case 1:
455	return n.children[0]
456	default:
457	return n
458	}
459	}
460
461	func (n regexNode) reduceGroup() regexNode {
462	u := n
463
464	for u.t == ntGroup {
465	u = u.children[0]
466	}
467
468	return u
469	}
470
471	// Simple optimization. If a set is a singleton, an inverse singleton,
472	// or empty, it's transformed accordingly.
473	func (n regexNode) reduceSet() regexNode {
474	// Extract empty-set, one and not-one case as special
475
476	if n.set == nil {
477	n.t = ntNothing
478	} else if n.set.IsSingleton() {
479	n.ch = n.set.SingletonChar()
480	n.set = nil
481	n.t += (ntOne - ntSet)
482	} else if n.set.IsSingletonInverse() {
483	n.ch = n.set.SingletonChar()
484	n.set = nil
485	n.t += (ntNotone - ntSet)
486	}
487
488	return n
489	}
490
491	func (n regexNode) reverseLeft() regexNode {
492	if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
493	//reverse children order
494	for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
495	n.children[left], n.children[right] = n.children[right], n.children[left]
496	}
497	}
498
499	return n
500	}
501
502	func (n regexNode) makeQuantifier(lazy bool, min, max int) regexNode {
503	if min == 0 && max == 0 {
504	return newRegexNode(ntEmpty, n.options)
505	}
506
507	if min == 1 && max == 1 {
508	return n
509	}
510
511	switch n.t {
512	case ntOne, ntNotone, ntSet:
513	if lazy {
514	n.makeRep(Onelazy, min, max)
515	} else {
516	n.makeRep(Oneloop, min, max)
517	}
518	return n
519
520	default:
521	var t nodeType
522	if lazy {
523	t = ntLazyloop
524	} else {
525	t = ntLoop
526	}
527	result := newRegexNodeMN(t, n.options, min, max)
528	result.addChild(n)
529	return result
530	}
531	}
532
533	// debug functions
534
535	var typeStr = []string{
536	"Onerep", "Notonerep", "Setrep",
537	"Oneloop", "Notoneloop", "Setloop",
538	"Onelazy", "Notonelazy", "Setlazy",
539	"One", "Notone", "Set",
540	"Multi", "Ref",
541	"Bol", "Eol", "Boundary", "Nonboundary",
542	"Beginning", "Start", "EndZ", "End",
543	"Nothing", "Empty",
544	"Alternate", "Concatenate",
545	"Loop", "Lazyloop",
546	"Capture", "Group", "Require", "Prevent", "Greedy",
547	"Testref", "Testgroup",
548	"Unknown", "Unknown", "Unknown",
549	"Unknown", "Unknown", "Unknown",
550	"ECMABoundary", "NonECMABoundary",
551	}
552
553	func (n *regexNode) description() string {
554	buf := &bytes.Buffer{}
555
556	buf.WriteString(typeStr[n.t])
557
558	if (n.options & ExplicitCapture) != 0 {
559	buf.WriteString("-C")
560	}
561	if (n.options & IgnoreCase) != 0 {
562	buf.WriteString("-I")
563	}
564	if (n.options & RightToLeft) != 0 {
565	buf.WriteString("-L")
566	}
567	if (n.options & Multiline) != 0 {
568	buf.WriteString("-M")
569	}
570	if (n.options & Singleline) != 0 {
571	buf.WriteString("-S")
572	}
573	if (n.options & IgnorePatternWhitespace) != 0 {
574	buf.WriteString("-X")
575	}
576	if (n.options & ECMAScript) != 0 {
577	buf.WriteString("-E")
578	}
579
580	switch n.t {
581	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
582	buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
583	break
584	case ntCapture:
585	buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
586	break
587	case ntRef, ntTestref:
588	buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
589	break
590	case ntMulti:
591	fmt.Fprintf(buf, "(String = %s)", string(n.str))
592	break
593	case ntSet, ntSetloop, ntSetlazy:
594	buf.WriteString("(Set = " + n.set.String() + ")")
595	break
596	}
597
598	switch n.t {
599	case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
600	buf.WriteString("(Min = ")
601	buf.WriteString(strconv.Itoa(n.m))
602	buf.WriteString(", Max = ")
603	if n.n == math.MaxInt32 {
604	buf.WriteString("inf")
605	} else {
606	buf.WriteString(strconv.Itoa(n.n))
607	}
608	buf.WriteString(")")
609
610	break
611	}
612
613	return buf.String()
614	}
615
616	var padSpace = []byte(" ")
617
618	func (t *RegexTree) Dump() string {
619	return t.root.dump()
620	}
621
622	func (n *regexNode) dump() string {
623	var stack []int
624	CurNode := n
625	CurChild := 0
626
627	buf := bytes.NewBufferString(CurNode.description())
628	buf.WriteRune('\n')
629
630	for {
631	if CurNode.children != nil && CurChild < len(CurNode.children) {
632	stack = append(stack, CurChild+1)
633	CurNode = CurNode.children[CurChild]
634	CurChild = 0
635
636	Depth := len(stack)
637	if Depth > 32 {
638	Depth = 32
639	}
640	buf.Write(padSpace[:Depth])
641	buf.WriteString(CurNode.description())
642	buf.WriteRune('\n')
643	} else {
644	if len(stack) == 0 {
645	break
646	}
647
648	CurChild = stack[len(stack)-1]
649	stack = stack[:len(stack)-1]
650	CurNode = CurNode.next
651	}
652	}
653	return buf.String()
654	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: