source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/writer.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 10.8 KB
Line 
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "math"
7 "os"
8)
9
10func Write(tree *RegexTree) (*Code, error) {
11 w := writer{
12 intStack: make([]int, 0, 32),
13 emitted: make([]int, 2),
14 stringhash: make(map[string]int),
15 sethash: make(map[string]int),
16 }
17
18 code, err := w.codeFromTree(tree)
19
20 if tree.options&Debug > 0 && code != nil {
21 os.Stdout.WriteString(code.Dump())
22 os.Stdout.WriteString("\n")
23 }
24
25 return code, err
26}
27
28type writer struct {
29 emitted []int
30
31 intStack []int
32 curpos int
33 stringhash map[string]int
34 stringtable [][]rune
35 sethash map[string]int
36 settable []*CharSet
37 counting bool
38 count int
39 trackcount int
40 caps map[int]int
41}
42
43const (
44 beforeChild nodeType = 64
45 afterChild = 128
46 //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
47 MaxPrefixSize = 50
48)
49
50// The top level RegexCode generator. It does a depth-first walk
51// through the tree and calls EmitFragment to emits code before
52// and after each child of an interior node, and at each leaf.
53//
54// It runs two passes, first to count the size of the generated
55// code, and second to generate the code.
56//
57// We should time it against the alternative, which is
58// to just generate the code and grow the array as we go.
59func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
60 var (
61 curNode *regexNode
62 curChild int
63 capsize int
64 )
65 // construct sparse capnum mapping if some numbers are unused
66
67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
68 capsize = tree.captop
69 w.caps = nil
70 } else {
71 capsize = len(tree.capnumlist)
72 w.caps = tree.caps
73 for i := 0; i < len(tree.capnumlist); i++ {
74 w.caps[tree.capnumlist[i]] = i
75 }
76 }
77
78 w.counting = true
79
80 for {
81 if !w.counting {
82 w.emitted = make([]int, w.count)
83 }
84
85 curNode = tree.root
86 curChild = 0
87
88 w.emit1(Lazybranch, 0)
89
90 for {
91 if len(curNode.children) == 0 {
92 w.emitFragment(curNode.t, curNode, 0)
93 } else if curChild < len(curNode.children) {
94 w.emitFragment(curNode.t|beforeChild, curNode, curChild)
95
96 curNode = curNode.children[curChild]
97
98 w.pushInt(curChild)
99 curChild = 0
100 continue
101 }
102
103 if w.emptyStack() {
104 break
105 }
106
107 curChild = w.popInt()
108 curNode = curNode.next
109
110 w.emitFragment(curNode.t|afterChild, curNode, curChild)
111 curChild++
112 }
113
114 w.patchJump(0, w.curPos())
115 w.emit(Stop)
116
117 if !w.counting {
118 break
119 }
120
121 w.counting = false
122 }
123
124 fcPrefix := getFirstCharsPrefix(tree)
125 prefix := getPrefix(tree)
126 rtl := (tree.options & RightToLeft) != 0
127
128 var bmPrefix *BmPrefix
129 //TODO: benchmark string prefixes
130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
131 if len(prefix.PrefixStr) > MaxPrefixSize {
132 // limit prefix changes to 10k
133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
134 }
135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
136 } else {
137 bmPrefix = nil
138 }
139
140 return &Code{
141 Codes: w.emitted,
142 Strings: w.stringtable,
143 Sets: w.settable,
144 TrackCount: w.trackcount,
145 Caps: w.caps,
146 Capsize: capsize,
147 FcPrefix: fcPrefix,
148 BmPrefix: bmPrefix,
149 Anchors: getAnchors(tree),
150 RightToLeft: rtl,
151 }, nil
152}
153
154// The main RegexCode generator. It does a depth-first walk
155// through the tree and calls EmitFragment to emits code before
156// and after each child of an interior node, and at each leaf.
157func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
158 bits := InstOp(0)
159
160 if nodetype <= ntRef {
161 if (node.options & RightToLeft) != 0 {
162 bits |= Rtl
163 }
164 if (node.options & IgnoreCase) != 0 {
165 bits |= Ci
166 }
167 }
168 ntBits := nodeType(bits)
169
170 switch nodetype {
171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
172 break
173
174 case ntAlternate | beforeChild:
175 if curIndex < len(node.children)-1 {
176 w.pushInt(w.curPos())
177 w.emit1(Lazybranch, 0)
178 }
179
180 case ntAlternate | afterChild:
181 if curIndex < len(node.children)-1 {
182 lbPos := w.popInt()
183 w.pushInt(w.curPos())
184 w.emit1(Goto, 0)
185 w.patchJump(lbPos, w.curPos())
186 } else {
187 for i := 0; i < curIndex; i++ {
188 w.patchJump(w.popInt(), w.curPos())
189 }
190 }
191 break
192
193 case ntTestref | beforeChild:
194 if curIndex == 0 {
195 w.emit(Setjump)
196 w.pushInt(w.curPos())
197 w.emit1(Lazybranch, 0)
198 w.emit1(Testref, w.mapCapnum(node.m))
199 w.emit(Forejump)
200 }
201
202 case ntTestref | afterChild:
203 if curIndex == 0 {
204 branchpos := w.popInt()
205 w.pushInt(w.curPos())
206 w.emit1(Goto, 0)
207 w.patchJump(branchpos, w.curPos())
208 w.emit(Forejump)
209 if len(node.children) <= 1 {
210 w.patchJump(w.popInt(), w.curPos())
211 }
212 } else if curIndex == 1 {
213 w.patchJump(w.popInt(), w.curPos())
214 }
215
216 case ntTestgroup | beforeChild:
217 if curIndex == 0 {
218 w.emit(Setjump)
219 w.emit(Setmark)
220 w.pushInt(w.curPos())
221 w.emit1(Lazybranch, 0)
222 }
223
224 case ntTestgroup | afterChild:
225 if curIndex == 0 {
226 w.emit(Getmark)
227 w.emit(Forejump)
228 } else if curIndex == 1 {
229 Branchpos := w.popInt()
230 w.pushInt(w.curPos())
231 w.emit1(Goto, 0)
232 w.patchJump(Branchpos, w.curPos())
233 w.emit(Getmark)
234 w.emit(Forejump)
235 if len(node.children) <= 2 {
236 w.patchJump(w.popInt(), w.curPos())
237 }
238 } else if curIndex == 2 {
239 w.patchJump(w.popInt(), w.curPos())
240 }
241
242 case ntLoop | beforeChild, ntLazyloop | beforeChild:
243
244 if node.n < math.MaxInt32 || node.m > 1 {
245 if node.m == 0 {
246 w.emit1(Nullcount, 0)
247 } else {
248 w.emit1(Setcount, 1-node.m)
249 }
250 } else if node.m == 0 {
251 w.emit(Nullmark)
252 } else {
253 w.emit(Setmark)
254 }
255
256 if node.m == 0 {
257 w.pushInt(w.curPos())
258 w.emit1(Goto, 0)
259 }
260 w.pushInt(w.curPos())
261
262 case ntLoop | afterChild, ntLazyloop | afterChild:
263
264 startJumpPos := w.curPos()
265 lazy := (nodetype - (ntLoop | afterChild))
266
267 if node.n < math.MaxInt32 || node.m > 1 {
268 if node.n == math.MaxInt32 {
269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
270 } else {
271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
272 }
273 } else {
274 w.emit1(InstOp(Branchmark+lazy), w.popInt())
275 }
276
277 if node.m == 0 {
278 w.patchJump(w.popInt(), startJumpPos)
279 }
280
281 case ntGroup | beforeChild, ntGroup | afterChild:
282
283 case ntCapture | beforeChild:
284 w.emit(Setmark)
285
286 case ntCapture | afterChild:
287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
288
289 case ntRequire | beforeChild:
290 // NOTE: the following line causes lookahead/lookbehind to be
291 // NON-BACKTRACKING. It can be commented out with (*)
292 w.emit(Setjump)
293
294 w.emit(Setmark)
295
296 case ntRequire | afterChild:
297 w.emit(Getmark)
298
299 // NOTE: the following line causes lookahead/lookbehind to be
300 // NON-BACKTRACKING. It can be commented out with (*)
301 w.emit(Forejump)
302
303 case ntPrevent | beforeChild:
304 w.emit(Setjump)
305 w.pushInt(w.curPos())
306 w.emit1(Lazybranch, 0)
307
308 case ntPrevent | afterChild:
309 w.emit(Backjump)
310 w.patchJump(w.popInt(), w.curPos())
311 w.emit(Forejump)
312
313 case ntGreedy | beforeChild:
314 w.emit(Setjump)
315
316 case ntGreedy | afterChild:
317 w.emit(Forejump)
318
319 case ntOne, ntNotone:
320 w.emit1(InstOp(node.t|ntBits), int(node.ch))
321
322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
323 if node.m > 0 {
324 if node.t == ntOneloop || node.t == ntOnelazy {
325 w.emit2(Onerep|bits, int(node.ch), node.m)
326 } else {
327 w.emit2(Notonerep|bits, int(node.ch), node.m)
328 }
329 }
330 if node.n > node.m {
331 if node.n == math.MaxInt32 {
332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
333 } else {
334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
335 }
336 }
337
338 case ntSetloop, ntSetlazy:
339 if node.m > 0 {
340 w.emit2(Setrep|bits, w.setCode(node.set), node.m)
341 }
342 if node.n > node.m {
343 if node.n == math.MaxInt32 {
344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
345 } else {
346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
347 }
348 }
349
350 case ntMulti:
351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
352
353 case ntSet:
354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
355
356 case ntRef:
357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
358
359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
360 w.emit(InstOp(node.t))
361
362 default:
363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
364 }
365
366 return nil
367}
368
369// To avoid recursion, we use a simple integer stack.
370// This is the push.
371func (w *writer) pushInt(i int) {
372 w.intStack = append(w.intStack, i)
373}
374
375// Returns true if the stack is empty.
376func (w *writer) emptyStack() bool {
377 return len(w.intStack) == 0
378}
379
380// This is the pop.
381func (w *writer) popInt() int {
382 //get our item
383 idx := len(w.intStack) - 1
384 i := w.intStack[idx]
385 //trim our slice
386 w.intStack = w.intStack[:idx]
387 return i
388}
389
390// Returns the current position in the emitted code.
391func (w *writer) curPos() int {
392 return w.curpos
393}
394
395// Fixes up a jump instruction at the specified offset
396// so that it jumps to the specified jumpDest.
397func (w *writer) patchJump(offset, jumpDest int) {
398 w.emitted[offset+1] = jumpDest
399}
400
401// Returns an index in the set table for a charset
402// uses a map to eliminate duplicates.
403func (w *writer) setCode(set *CharSet) int {
404 if w.counting {
405 return 0
406 }
407
408 buf := &bytes.Buffer{}
409
410 set.mapHashFill(buf)
411 hash := buf.String()
412 i, ok := w.sethash[hash]
413 if !ok {
414 i = len(w.sethash)
415 w.sethash[hash] = i
416 w.settable = append(w.settable, set)
417 }
418 return i
419}
420
421// Returns an index in the string table for a string.
422// uses a map to eliminate duplicates.
423func (w *writer) stringCode(str []rune) int {
424 if w.counting {
425 return 0
426 }
427
428 hash := string(str)
429 i, ok := w.stringhash[hash]
430 if !ok {
431 i = len(w.stringhash)
432 w.stringhash[hash] = i
433 w.stringtable = append(w.stringtable, str)
434 }
435
436 return i
437}
438
439// When generating code on a regex that uses a sparse set
440// of capture slots, we hash them to a dense set of indices
441// for an array of capture slots. Instead of doing the hash
442// at match time, it's done at compile time, here.
443func (w *writer) mapCapnum(capnum int) int {
444 if capnum == -1 {
445 return -1
446 }
447
448 if w.caps != nil {
449 return w.caps[capnum]
450 }
451
452 return capnum
453}
454
455// Emits a zero-argument operation. Note that the emit
456// functions all run in two modes: they can emit code, or
457// they can just count the size of the code.
458func (w *writer) emit(op InstOp) {
459 if w.counting {
460 w.count++
461 if opcodeBacktracks(op) {
462 w.trackcount++
463 }
464 return
465 }
466 w.emitted[w.curpos] = int(op)
467 w.curpos++
468}
469
470// Emits a one-argument operation.
471func (w *writer) emit1(op InstOp, opd1 int) {
472 if w.counting {
473 w.count += 2
474 if opcodeBacktracks(op) {
475 w.trackcount++
476 }
477 return
478 }
479 w.emitted[w.curpos] = int(op)
480 w.curpos++
481 w.emitted[w.curpos] = opd1
482 w.curpos++
483}
484
485// Emits a two-argument operation.
486func (w *writer) emit2(op InstOp, opd1, opd2 int) {
487 if w.counting {
488 w.count += 3
489 if opcodeBacktracks(op) {
490 w.trackcount++
491 }
492 return
493 }
494 w.emitted[w.curpos] = int(op)
495 w.curpos++
496 w.emitted[w.curpos] = opd1
497 w.curpos++
498 w.emitted[w.curpos] = opd2
499 w.curpos++
500}
Note: See TracBrowser for help on using the repository browser.