1 | package syntax
|
---|
2 |
|
---|
3 | import (
|
---|
4 | "bytes"
|
---|
5 | "fmt"
|
---|
6 | "math"
|
---|
7 | "os"
|
---|
8 | )
|
---|
9 |
|
---|
10 | func Write(tree *RegexTree) (*Code, error) {
|
---|
11 | w := writer{
|
---|
12 | intStack: make([]int, 0, 32),
|
---|
13 | emitted: make([]int, 2),
|
---|
14 | stringhash: make(map[string]int),
|
---|
15 | sethash: make(map[string]int),
|
---|
16 | }
|
---|
17 |
|
---|
18 | code, err := w.codeFromTree(tree)
|
---|
19 |
|
---|
20 | if tree.options&Debug > 0 && code != nil {
|
---|
21 | os.Stdout.WriteString(code.Dump())
|
---|
22 | os.Stdout.WriteString("\n")
|
---|
23 | }
|
---|
24 |
|
---|
25 | return code, err
|
---|
26 | }
|
---|
27 |
|
---|
28 | type writer struct {
|
---|
29 | emitted []int
|
---|
30 |
|
---|
31 | intStack []int
|
---|
32 | curpos int
|
---|
33 | stringhash map[string]int
|
---|
34 | stringtable [][]rune
|
---|
35 | sethash map[string]int
|
---|
36 | settable []*CharSet
|
---|
37 | counting bool
|
---|
38 | count int
|
---|
39 | trackcount int
|
---|
40 | caps map[int]int
|
---|
41 | }
|
---|
42 |
|
---|
43 | const (
|
---|
44 | beforeChild nodeType = 64
|
---|
45 | afterChild = 128
|
---|
46 | //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
|
---|
47 | MaxPrefixSize = 50
|
---|
48 | )
|
---|
49 |
|
---|
50 | // The top level RegexCode generator. It does a depth-first walk
|
---|
51 | // through the tree and calls EmitFragment to emits code before
|
---|
52 | // and after each child of an interior node, and at each leaf.
|
---|
53 | //
|
---|
54 | // It runs two passes, first to count the size of the generated
|
---|
55 | // code, and second to generate the code.
|
---|
56 | //
|
---|
57 | // We should time it against the alternative, which is
|
---|
58 | // to just generate the code and grow the array as we go.
|
---|
59 | func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
|
---|
60 | var (
|
---|
61 | curNode *regexNode
|
---|
62 | curChild int
|
---|
63 | capsize int
|
---|
64 | )
|
---|
65 | // construct sparse capnum mapping if some numbers are unused
|
---|
66 |
|
---|
67 | if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
|
---|
68 | capsize = tree.captop
|
---|
69 | w.caps = nil
|
---|
70 | } else {
|
---|
71 | capsize = len(tree.capnumlist)
|
---|
72 | w.caps = tree.caps
|
---|
73 | for i := 0; i < len(tree.capnumlist); i++ {
|
---|
74 | w.caps[tree.capnumlist[i]] = i
|
---|
75 | }
|
---|
76 | }
|
---|
77 |
|
---|
78 | w.counting = true
|
---|
79 |
|
---|
80 | for {
|
---|
81 | if !w.counting {
|
---|
82 | w.emitted = make([]int, w.count)
|
---|
83 | }
|
---|
84 |
|
---|
85 | curNode = tree.root
|
---|
86 | curChild = 0
|
---|
87 |
|
---|
88 | w.emit1(Lazybranch, 0)
|
---|
89 |
|
---|
90 | for {
|
---|
91 | if len(curNode.children) == 0 {
|
---|
92 | w.emitFragment(curNode.t, curNode, 0)
|
---|
93 | } else if curChild < len(curNode.children) {
|
---|
94 | w.emitFragment(curNode.t|beforeChild, curNode, curChild)
|
---|
95 |
|
---|
96 | curNode = curNode.children[curChild]
|
---|
97 |
|
---|
98 | w.pushInt(curChild)
|
---|
99 | curChild = 0
|
---|
100 | continue
|
---|
101 | }
|
---|
102 |
|
---|
103 | if w.emptyStack() {
|
---|
104 | break
|
---|
105 | }
|
---|
106 |
|
---|
107 | curChild = w.popInt()
|
---|
108 | curNode = curNode.next
|
---|
109 |
|
---|
110 | w.emitFragment(curNode.t|afterChild, curNode, curChild)
|
---|
111 | curChild++
|
---|
112 | }
|
---|
113 |
|
---|
114 | w.patchJump(0, w.curPos())
|
---|
115 | w.emit(Stop)
|
---|
116 |
|
---|
117 | if !w.counting {
|
---|
118 | break
|
---|
119 | }
|
---|
120 |
|
---|
121 | w.counting = false
|
---|
122 | }
|
---|
123 |
|
---|
124 | fcPrefix := getFirstCharsPrefix(tree)
|
---|
125 | prefix := getPrefix(tree)
|
---|
126 | rtl := (tree.options & RightToLeft) != 0
|
---|
127 |
|
---|
128 | var bmPrefix *BmPrefix
|
---|
129 | //TODO: benchmark string prefixes
|
---|
130 | if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
|
---|
131 | if len(prefix.PrefixStr) > MaxPrefixSize {
|
---|
132 | // limit prefix changes to 10k
|
---|
133 | prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
|
---|
134 | }
|
---|
135 | bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
|
---|
136 | } else {
|
---|
137 | bmPrefix = nil
|
---|
138 | }
|
---|
139 |
|
---|
140 | return &Code{
|
---|
141 | Codes: w.emitted,
|
---|
142 | Strings: w.stringtable,
|
---|
143 | Sets: w.settable,
|
---|
144 | TrackCount: w.trackcount,
|
---|
145 | Caps: w.caps,
|
---|
146 | Capsize: capsize,
|
---|
147 | FcPrefix: fcPrefix,
|
---|
148 | BmPrefix: bmPrefix,
|
---|
149 | Anchors: getAnchors(tree),
|
---|
150 | RightToLeft: rtl,
|
---|
151 | }, nil
|
---|
152 | }
|
---|
153 |
|
---|
154 | // The main RegexCode generator. It does a depth-first walk
|
---|
155 | // through the tree and calls EmitFragment to emits code before
|
---|
156 | // and after each child of an interior node, and at each leaf.
|
---|
157 | func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
|
---|
158 | bits := InstOp(0)
|
---|
159 |
|
---|
160 | if nodetype <= ntRef {
|
---|
161 | if (node.options & RightToLeft) != 0 {
|
---|
162 | bits |= Rtl
|
---|
163 | }
|
---|
164 | if (node.options & IgnoreCase) != 0 {
|
---|
165 | bits |= Ci
|
---|
166 | }
|
---|
167 | }
|
---|
168 | ntBits := nodeType(bits)
|
---|
169 |
|
---|
170 | switch nodetype {
|
---|
171 | case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
|
---|
172 | break
|
---|
173 |
|
---|
174 | case ntAlternate | beforeChild:
|
---|
175 | if curIndex < len(node.children)-1 {
|
---|
176 | w.pushInt(w.curPos())
|
---|
177 | w.emit1(Lazybranch, 0)
|
---|
178 | }
|
---|
179 |
|
---|
180 | case ntAlternate | afterChild:
|
---|
181 | if curIndex < len(node.children)-1 {
|
---|
182 | lbPos := w.popInt()
|
---|
183 | w.pushInt(w.curPos())
|
---|
184 | w.emit1(Goto, 0)
|
---|
185 | w.patchJump(lbPos, w.curPos())
|
---|
186 | } else {
|
---|
187 | for i := 0; i < curIndex; i++ {
|
---|
188 | w.patchJump(w.popInt(), w.curPos())
|
---|
189 | }
|
---|
190 | }
|
---|
191 | break
|
---|
192 |
|
---|
193 | case ntTestref | beforeChild:
|
---|
194 | if curIndex == 0 {
|
---|
195 | w.emit(Setjump)
|
---|
196 | w.pushInt(w.curPos())
|
---|
197 | w.emit1(Lazybranch, 0)
|
---|
198 | w.emit1(Testref, w.mapCapnum(node.m))
|
---|
199 | w.emit(Forejump)
|
---|
200 | }
|
---|
201 |
|
---|
202 | case ntTestref | afterChild:
|
---|
203 | if curIndex == 0 {
|
---|
204 | branchpos := w.popInt()
|
---|
205 | w.pushInt(w.curPos())
|
---|
206 | w.emit1(Goto, 0)
|
---|
207 | w.patchJump(branchpos, w.curPos())
|
---|
208 | w.emit(Forejump)
|
---|
209 | if len(node.children) <= 1 {
|
---|
210 | w.patchJump(w.popInt(), w.curPos())
|
---|
211 | }
|
---|
212 | } else if curIndex == 1 {
|
---|
213 | w.patchJump(w.popInt(), w.curPos())
|
---|
214 | }
|
---|
215 |
|
---|
216 | case ntTestgroup | beforeChild:
|
---|
217 | if curIndex == 0 {
|
---|
218 | w.emit(Setjump)
|
---|
219 | w.emit(Setmark)
|
---|
220 | w.pushInt(w.curPos())
|
---|
221 | w.emit1(Lazybranch, 0)
|
---|
222 | }
|
---|
223 |
|
---|
224 | case ntTestgroup | afterChild:
|
---|
225 | if curIndex == 0 {
|
---|
226 | w.emit(Getmark)
|
---|
227 | w.emit(Forejump)
|
---|
228 | } else if curIndex == 1 {
|
---|
229 | Branchpos := w.popInt()
|
---|
230 | w.pushInt(w.curPos())
|
---|
231 | w.emit1(Goto, 0)
|
---|
232 | w.patchJump(Branchpos, w.curPos())
|
---|
233 | w.emit(Getmark)
|
---|
234 | w.emit(Forejump)
|
---|
235 | if len(node.children) <= 2 {
|
---|
236 | w.patchJump(w.popInt(), w.curPos())
|
---|
237 | }
|
---|
238 | } else if curIndex == 2 {
|
---|
239 | w.patchJump(w.popInt(), w.curPos())
|
---|
240 | }
|
---|
241 |
|
---|
242 | case ntLoop | beforeChild, ntLazyloop | beforeChild:
|
---|
243 |
|
---|
244 | if node.n < math.MaxInt32 || node.m > 1 {
|
---|
245 | if node.m == 0 {
|
---|
246 | w.emit1(Nullcount, 0)
|
---|
247 | } else {
|
---|
248 | w.emit1(Setcount, 1-node.m)
|
---|
249 | }
|
---|
250 | } else if node.m == 0 {
|
---|
251 | w.emit(Nullmark)
|
---|
252 | } else {
|
---|
253 | w.emit(Setmark)
|
---|
254 | }
|
---|
255 |
|
---|
256 | if node.m == 0 {
|
---|
257 | w.pushInt(w.curPos())
|
---|
258 | w.emit1(Goto, 0)
|
---|
259 | }
|
---|
260 | w.pushInt(w.curPos())
|
---|
261 |
|
---|
262 | case ntLoop | afterChild, ntLazyloop | afterChild:
|
---|
263 |
|
---|
264 | startJumpPos := w.curPos()
|
---|
265 | lazy := (nodetype - (ntLoop | afterChild))
|
---|
266 |
|
---|
267 | if node.n < math.MaxInt32 || node.m > 1 {
|
---|
268 | if node.n == math.MaxInt32 {
|
---|
269 | w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
|
---|
270 | } else {
|
---|
271 | w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
|
---|
272 | }
|
---|
273 | } else {
|
---|
274 | w.emit1(InstOp(Branchmark+lazy), w.popInt())
|
---|
275 | }
|
---|
276 |
|
---|
277 | if node.m == 0 {
|
---|
278 | w.patchJump(w.popInt(), startJumpPos)
|
---|
279 | }
|
---|
280 |
|
---|
281 | case ntGroup | beforeChild, ntGroup | afterChild:
|
---|
282 |
|
---|
283 | case ntCapture | beforeChild:
|
---|
284 | w.emit(Setmark)
|
---|
285 |
|
---|
286 | case ntCapture | afterChild:
|
---|
287 | w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
|
---|
288 |
|
---|
289 | case ntRequire | beforeChild:
|
---|
290 | // NOTE: the following line causes lookahead/lookbehind to be
|
---|
291 | // NON-BACKTRACKING. It can be commented out with (*)
|
---|
292 | w.emit(Setjump)
|
---|
293 |
|
---|
294 | w.emit(Setmark)
|
---|
295 |
|
---|
296 | case ntRequire | afterChild:
|
---|
297 | w.emit(Getmark)
|
---|
298 |
|
---|
299 | // NOTE: the following line causes lookahead/lookbehind to be
|
---|
300 | // NON-BACKTRACKING. It can be commented out with (*)
|
---|
301 | w.emit(Forejump)
|
---|
302 |
|
---|
303 | case ntPrevent | beforeChild:
|
---|
304 | w.emit(Setjump)
|
---|
305 | w.pushInt(w.curPos())
|
---|
306 | w.emit1(Lazybranch, 0)
|
---|
307 |
|
---|
308 | case ntPrevent | afterChild:
|
---|
309 | w.emit(Backjump)
|
---|
310 | w.patchJump(w.popInt(), w.curPos())
|
---|
311 | w.emit(Forejump)
|
---|
312 |
|
---|
313 | case ntGreedy | beforeChild:
|
---|
314 | w.emit(Setjump)
|
---|
315 |
|
---|
316 | case ntGreedy | afterChild:
|
---|
317 | w.emit(Forejump)
|
---|
318 |
|
---|
319 | case ntOne, ntNotone:
|
---|
320 | w.emit1(InstOp(node.t|ntBits), int(node.ch))
|
---|
321 |
|
---|
322 | case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
|
---|
323 | if node.m > 0 {
|
---|
324 | if node.t == ntOneloop || node.t == ntOnelazy {
|
---|
325 | w.emit2(Onerep|bits, int(node.ch), node.m)
|
---|
326 | } else {
|
---|
327 | w.emit2(Notonerep|bits, int(node.ch), node.m)
|
---|
328 | }
|
---|
329 | }
|
---|
330 | if node.n > node.m {
|
---|
331 | if node.n == math.MaxInt32 {
|
---|
332 | w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
|
---|
333 | } else {
|
---|
334 | w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
|
---|
335 | }
|
---|
336 | }
|
---|
337 |
|
---|
338 | case ntSetloop, ntSetlazy:
|
---|
339 | if node.m > 0 {
|
---|
340 | w.emit2(Setrep|bits, w.setCode(node.set), node.m)
|
---|
341 | }
|
---|
342 | if node.n > node.m {
|
---|
343 | if node.n == math.MaxInt32 {
|
---|
344 | w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
|
---|
345 | } else {
|
---|
346 | w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
|
---|
347 | }
|
---|
348 | }
|
---|
349 |
|
---|
350 | case ntMulti:
|
---|
351 | w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
|
---|
352 |
|
---|
353 | case ntSet:
|
---|
354 | w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
|
---|
355 |
|
---|
356 | case ntRef:
|
---|
357 | w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
|
---|
358 |
|
---|
359 | case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
---|
360 | w.emit(InstOp(node.t))
|
---|
361 |
|
---|
362 | default:
|
---|
363 | return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
|
---|
364 | }
|
---|
365 |
|
---|
366 | return nil
|
---|
367 | }
|
---|
368 |
|
---|
369 | // To avoid recursion, we use a simple integer stack.
|
---|
370 | // This is the push.
|
---|
371 | func (w *writer) pushInt(i int) {
|
---|
372 | w.intStack = append(w.intStack, i)
|
---|
373 | }
|
---|
374 |
|
---|
375 | // Returns true if the stack is empty.
|
---|
376 | func (w *writer) emptyStack() bool {
|
---|
377 | return len(w.intStack) == 0
|
---|
378 | }
|
---|
379 |
|
---|
380 | // This is the pop.
|
---|
381 | func (w *writer) popInt() int {
|
---|
382 | //get our item
|
---|
383 | idx := len(w.intStack) - 1
|
---|
384 | i := w.intStack[idx]
|
---|
385 | //trim our slice
|
---|
386 | w.intStack = w.intStack[:idx]
|
---|
387 | return i
|
---|
388 | }
|
---|
389 |
|
---|
390 | // Returns the current position in the emitted code.
|
---|
391 | func (w *writer) curPos() int {
|
---|
392 | return w.curpos
|
---|
393 | }
|
---|
394 |
|
---|
395 | // Fixes up a jump instruction at the specified offset
|
---|
396 | // so that it jumps to the specified jumpDest.
|
---|
397 | func (w *writer) patchJump(offset, jumpDest int) {
|
---|
398 | w.emitted[offset+1] = jumpDest
|
---|
399 | }
|
---|
400 |
|
---|
401 | // Returns an index in the set table for a charset
|
---|
402 | // uses a map to eliminate duplicates.
|
---|
403 | func (w *writer) setCode(set *CharSet) int {
|
---|
404 | if w.counting {
|
---|
405 | return 0
|
---|
406 | }
|
---|
407 |
|
---|
408 | buf := &bytes.Buffer{}
|
---|
409 |
|
---|
410 | set.mapHashFill(buf)
|
---|
411 | hash := buf.String()
|
---|
412 | i, ok := w.sethash[hash]
|
---|
413 | if !ok {
|
---|
414 | i = len(w.sethash)
|
---|
415 | w.sethash[hash] = i
|
---|
416 | w.settable = append(w.settable, set)
|
---|
417 | }
|
---|
418 | return i
|
---|
419 | }
|
---|
420 |
|
---|
421 | // Returns an index in the string table for a string.
|
---|
422 | // uses a map to eliminate duplicates.
|
---|
423 | func (w *writer) stringCode(str []rune) int {
|
---|
424 | if w.counting {
|
---|
425 | return 0
|
---|
426 | }
|
---|
427 |
|
---|
428 | hash := string(str)
|
---|
429 | i, ok := w.stringhash[hash]
|
---|
430 | if !ok {
|
---|
431 | i = len(w.stringhash)
|
---|
432 | w.stringhash[hash] = i
|
---|
433 | w.stringtable = append(w.stringtable, str)
|
---|
434 | }
|
---|
435 |
|
---|
436 | return i
|
---|
437 | }
|
---|
438 |
|
---|
439 | // When generating code on a regex that uses a sparse set
|
---|
440 | // of capture slots, we hash them to a dense set of indices
|
---|
441 | // for an array of capture slots. Instead of doing the hash
|
---|
442 | // at match time, it's done at compile time, here.
|
---|
443 | func (w *writer) mapCapnum(capnum int) int {
|
---|
444 | if capnum == -1 {
|
---|
445 | return -1
|
---|
446 | }
|
---|
447 |
|
---|
448 | if w.caps != nil {
|
---|
449 | return w.caps[capnum]
|
---|
450 | }
|
---|
451 |
|
---|
452 | return capnum
|
---|
453 | }
|
---|
454 |
|
---|
455 | // Emits a zero-argument operation. Note that the emit
|
---|
456 | // functions all run in two modes: they can emit code, or
|
---|
457 | // they can just count the size of the code.
|
---|
458 | func (w *writer) emit(op InstOp) {
|
---|
459 | if w.counting {
|
---|
460 | w.count++
|
---|
461 | if opcodeBacktracks(op) {
|
---|
462 | w.trackcount++
|
---|
463 | }
|
---|
464 | return
|
---|
465 | }
|
---|
466 | w.emitted[w.curpos] = int(op)
|
---|
467 | w.curpos++
|
---|
468 | }
|
---|
469 |
|
---|
470 | // Emits a one-argument operation.
|
---|
471 | func (w *writer) emit1(op InstOp, opd1 int) {
|
---|
472 | if w.counting {
|
---|
473 | w.count += 2
|
---|
474 | if opcodeBacktracks(op) {
|
---|
475 | w.trackcount++
|
---|
476 | }
|
---|
477 | return
|
---|
478 | }
|
---|
479 | w.emitted[w.curpos] = int(op)
|
---|
480 | w.curpos++
|
---|
481 | w.emitted[w.curpos] = opd1
|
---|
482 | w.curpos++
|
---|
483 | }
|
---|
484 |
|
---|
485 | // Emits a two-argument operation.
|
---|
486 | func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
---|
487 | if w.counting {
|
---|
488 | w.count += 3
|
---|
489 | if opcodeBacktracks(op) {
|
---|
490 | w.trackcount++
|
---|
491 | }
|
---|
492 | return
|
---|
493 | }
|
---|
494 | w.emitted[w.curpos] = int(op)
|
---|
495 | w.curpos++
|
---|
496 | w.emitted[w.curpos] = opd1
|
---|
497 | w.curpos++
|
---|
498 | w.emitted[w.curpos] = opd2
|
---|
499 | w.curpos++
|
---|
500 | }
|
---|