1 | package syntax
|
---|
2 |
|
---|
3 | import (
|
---|
4 | "bytes"
|
---|
5 | "fmt"
|
---|
6 | "strconv"
|
---|
7 | "unicode"
|
---|
8 | "unicode/utf8"
|
---|
9 | )
|
---|
10 |
|
---|
11 | type Prefix struct {
|
---|
12 | PrefixStr []rune
|
---|
13 | PrefixSet CharSet
|
---|
14 | CaseInsensitive bool
|
---|
15 | }
|
---|
16 |
|
---|
17 | // It takes a RegexTree and computes the set of chars that can start it.
|
---|
18 | func getFirstCharsPrefix(tree *RegexTree) *Prefix {
|
---|
19 | s := regexFcd{
|
---|
20 | fcStack: make([]regexFc, 32),
|
---|
21 | intStack: make([]int, 32),
|
---|
22 | }
|
---|
23 | fc := s.regexFCFromRegexTree(tree)
|
---|
24 |
|
---|
25 | if fc == nil || fc.nullable || fc.cc.IsEmpty() {
|
---|
26 | return nil
|
---|
27 | }
|
---|
28 | fcSet := fc.getFirstChars()
|
---|
29 | return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
|
---|
30 | }
|
---|
31 |
|
---|
32 | type regexFcd struct {
|
---|
33 | intStack []int
|
---|
34 | intDepth int
|
---|
35 | fcStack []regexFc
|
---|
36 | fcDepth int
|
---|
37 | skipAllChildren bool // don't process any more children at the current level
|
---|
38 | skipchild bool // don't process the current child.
|
---|
39 | failed bool
|
---|
40 | }
|
---|
41 |
|
---|
42 | /*
|
---|
43 | * The main FC computation. It does a shortcutted depth-first walk
|
---|
44 | * through the tree and calls CalculateFC to emits code before
|
---|
45 | * and after each child of an interior node, and at each leaf.
|
---|
46 | */
|
---|
47 | func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
|
---|
48 | curNode := tree.root
|
---|
49 | curChild := 0
|
---|
50 |
|
---|
51 | for {
|
---|
52 | if len(curNode.children) == 0 {
|
---|
53 | // This is a leaf node
|
---|
54 | s.calculateFC(curNode.t, curNode, 0)
|
---|
55 | } else if curChild < len(curNode.children) && !s.skipAllChildren {
|
---|
56 | // This is an interior node, and we have more children to analyze
|
---|
57 | s.calculateFC(curNode.t|beforeChild, curNode, curChild)
|
---|
58 |
|
---|
59 | if !s.skipchild {
|
---|
60 | curNode = curNode.children[curChild]
|
---|
61 | // this stack is how we get a depth first walk of the tree.
|
---|
62 | s.pushInt(curChild)
|
---|
63 | curChild = 0
|
---|
64 | } else {
|
---|
65 | curChild++
|
---|
66 | s.skipchild = false
|
---|
67 | }
|
---|
68 | continue
|
---|
69 | }
|
---|
70 |
|
---|
71 | // This is an interior node where we've finished analyzing all the children, or
|
---|
72 | // the end of a leaf node.
|
---|
73 | s.skipAllChildren = false
|
---|
74 |
|
---|
75 | if s.intIsEmpty() {
|
---|
76 | break
|
---|
77 | }
|
---|
78 |
|
---|
79 | curChild = s.popInt()
|
---|
80 | curNode = curNode.next
|
---|
81 |
|
---|
82 | s.calculateFC(curNode.t|afterChild, curNode, curChild)
|
---|
83 | if s.failed {
|
---|
84 | return nil
|
---|
85 | }
|
---|
86 |
|
---|
87 | curChild++
|
---|
88 | }
|
---|
89 |
|
---|
90 | if s.fcIsEmpty() {
|
---|
91 | return nil
|
---|
92 | }
|
---|
93 |
|
---|
94 | return s.popFC()
|
---|
95 | }
|
---|
96 |
|
---|
97 | // To avoid recursion, we use a simple integer stack.
|
---|
98 | // This is the push.
|
---|
99 | func (s *regexFcd) pushInt(I int) {
|
---|
100 | if s.intDepth >= len(s.intStack) {
|
---|
101 | expanded := make([]int, s.intDepth*2)
|
---|
102 | copy(expanded, s.intStack)
|
---|
103 | s.intStack = expanded
|
---|
104 | }
|
---|
105 |
|
---|
106 | s.intStack[s.intDepth] = I
|
---|
107 | s.intDepth++
|
---|
108 | }
|
---|
109 |
|
---|
110 | // True if the stack is empty.
|
---|
111 | func (s *regexFcd) intIsEmpty() bool {
|
---|
112 | return s.intDepth == 0
|
---|
113 | }
|
---|
114 |
|
---|
115 | // This is the pop.
|
---|
116 | func (s *regexFcd) popInt() int {
|
---|
117 | s.intDepth--
|
---|
118 | return s.intStack[s.intDepth]
|
---|
119 | }
|
---|
120 |
|
---|
121 | // We also use a stack of RegexFC objects.
|
---|
122 | // This is the push.
|
---|
123 | func (s *regexFcd) pushFC(fc regexFc) {
|
---|
124 | if s.fcDepth >= len(s.fcStack) {
|
---|
125 | expanded := make([]regexFc, s.fcDepth*2)
|
---|
126 | copy(expanded, s.fcStack)
|
---|
127 | s.fcStack = expanded
|
---|
128 | }
|
---|
129 |
|
---|
130 | s.fcStack[s.fcDepth] = fc
|
---|
131 | s.fcDepth++
|
---|
132 | }
|
---|
133 |
|
---|
134 | // True if the stack is empty.
|
---|
135 | func (s *regexFcd) fcIsEmpty() bool {
|
---|
136 | return s.fcDepth == 0
|
---|
137 | }
|
---|
138 |
|
---|
139 | // This is the pop.
|
---|
140 | func (s *regexFcd) popFC() *regexFc {
|
---|
141 | s.fcDepth--
|
---|
142 | return &s.fcStack[s.fcDepth]
|
---|
143 | }
|
---|
144 |
|
---|
145 | // This is the top.
|
---|
146 | func (s *regexFcd) topFC() *regexFc {
|
---|
147 | return &s.fcStack[s.fcDepth-1]
|
---|
148 | }
|
---|
149 |
|
---|
150 | // Called in Beforechild to prevent further processing of the current child
|
---|
151 | func (s *regexFcd) skipChild() {
|
---|
152 | s.skipchild = true
|
---|
153 | }
|
---|
154 |
|
---|
155 | // FC computation and shortcut cases for each node type
|
---|
156 | func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
|
---|
157 | //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
|
---|
158 | ci := false
|
---|
159 | rtl := false
|
---|
160 |
|
---|
161 | if nt <= ntRef {
|
---|
162 | if (node.options & IgnoreCase) != 0 {
|
---|
163 | ci = true
|
---|
164 | }
|
---|
165 | if (node.options & RightToLeft) != 0 {
|
---|
166 | rtl = true
|
---|
167 | }
|
---|
168 | }
|
---|
169 |
|
---|
170 | switch nt {
|
---|
171 | case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
|
---|
172 | break
|
---|
173 |
|
---|
174 | case ntTestgroup | beforeChild:
|
---|
175 | if CurIndex == 0 {
|
---|
176 | s.skipChild()
|
---|
177 | }
|
---|
178 | break
|
---|
179 |
|
---|
180 | case ntEmpty:
|
---|
181 | s.pushFC(regexFc{nullable: true})
|
---|
182 | break
|
---|
183 |
|
---|
184 | case ntConcatenate | afterChild:
|
---|
185 | if CurIndex != 0 {
|
---|
186 | child := s.popFC()
|
---|
187 | cumul := s.topFC()
|
---|
188 |
|
---|
189 | s.failed = !cumul.addFC(*child, true)
|
---|
190 | }
|
---|
191 |
|
---|
192 | fc := s.topFC()
|
---|
193 | if !fc.nullable {
|
---|
194 | s.skipAllChildren = true
|
---|
195 | }
|
---|
196 | break
|
---|
197 |
|
---|
198 | case ntTestgroup | afterChild:
|
---|
199 | if CurIndex > 1 {
|
---|
200 | child := s.popFC()
|
---|
201 | cumul := s.topFC()
|
---|
202 |
|
---|
203 | s.failed = !cumul.addFC(*child, false)
|
---|
204 | }
|
---|
205 | break
|
---|
206 |
|
---|
207 | case ntAlternate | afterChild, ntTestref | afterChild:
|
---|
208 | if CurIndex != 0 {
|
---|
209 | child := s.popFC()
|
---|
210 | cumul := s.topFC()
|
---|
211 |
|
---|
212 | s.failed = !cumul.addFC(*child, false)
|
---|
213 | }
|
---|
214 | break
|
---|
215 |
|
---|
216 | case ntLoop | afterChild, ntLazyloop | afterChild:
|
---|
217 | if node.m == 0 {
|
---|
218 | fc := s.topFC()
|
---|
219 | fc.nullable = true
|
---|
220 | }
|
---|
221 | break
|
---|
222 |
|
---|
223 | case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
|
---|
224 | break
|
---|
225 |
|
---|
226 | case ntRequire | beforeChild, ntPrevent | beforeChild:
|
---|
227 | s.skipChild()
|
---|
228 | s.pushFC(regexFc{nullable: true})
|
---|
229 | break
|
---|
230 |
|
---|
231 | case ntRequire | afterChild, ntPrevent | afterChild:
|
---|
232 | break
|
---|
233 |
|
---|
234 | case ntOne, ntNotone:
|
---|
235 | s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
|
---|
236 | break
|
---|
237 |
|
---|
238 | case ntOneloop, ntOnelazy:
|
---|
239 | s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
|
---|
240 | break
|
---|
241 |
|
---|
242 | case ntNotoneloop, ntNotonelazy:
|
---|
243 | s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
|
---|
244 | break
|
---|
245 |
|
---|
246 | case ntMulti:
|
---|
247 | if len(node.str) == 0 {
|
---|
248 | s.pushFC(regexFc{nullable: true})
|
---|
249 | } else if !rtl {
|
---|
250 | s.pushFC(newRegexFc(node.str[0], false, false, ci))
|
---|
251 | } else {
|
---|
252 | s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
|
---|
253 | }
|
---|
254 | break
|
---|
255 |
|
---|
256 | case ntSet:
|
---|
257 | s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
|
---|
258 | break
|
---|
259 |
|
---|
260 | case ntSetloop, ntSetlazy:
|
---|
261 | s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
|
---|
262 | break
|
---|
263 |
|
---|
264 | case ntRef:
|
---|
265 | s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
|
---|
266 | break
|
---|
267 |
|
---|
268 | case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
---|
269 | s.pushFC(regexFc{nullable: true})
|
---|
270 | break
|
---|
271 |
|
---|
272 | default:
|
---|
273 | panic(fmt.Sprintf("unexpected op code: %v", nt))
|
---|
274 | }
|
---|
275 | }
|
---|
276 |
|
---|
277 | type regexFc struct {
|
---|
278 | cc CharSet
|
---|
279 | nullable bool
|
---|
280 | caseInsensitive bool
|
---|
281 | }
|
---|
282 |
|
---|
283 | func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
|
---|
284 | r := regexFc{
|
---|
285 | caseInsensitive: caseInsensitive,
|
---|
286 | nullable: nullable,
|
---|
287 | }
|
---|
288 | if not {
|
---|
289 | if ch > 0 {
|
---|
290 | r.cc.addRange('\x00', ch-1)
|
---|
291 | }
|
---|
292 | if ch < 0xFFFF {
|
---|
293 | r.cc.addRange(ch+1, utf8.MaxRune)
|
---|
294 | }
|
---|
295 | } else {
|
---|
296 | r.cc.addRange(ch, ch)
|
---|
297 | }
|
---|
298 | return r
|
---|
299 | }
|
---|
300 |
|
---|
301 | func (r *regexFc) getFirstChars() CharSet {
|
---|
302 | if r.caseInsensitive {
|
---|
303 | r.cc.addLowercase()
|
---|
304 | }
|
---|
305 |
|
---|
306 | return r.cc
|
---|
307 | }
|
---|
308 |
|
---|
309 | func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
|
---|
310 | if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
|
---|
311 | return false
|
---|
312 | }
|
---|
313 |
|
---|
314 | if concatenate {
|
---|
315 | if !r.nullable {
|
---|
316 | return true
|
---|
317 | }
|
---|
318 |
|
---|
319 | if !fc.nullable {
|
---|
320 | r.nullable = false
|
---|
321 | }
|
---|
322 | } else {
|
---|
323 | if fc.nullable {
|
---|
324 | r.nullable = true
|
---|
325 | }
|
---|
326 | }
|
---|
327 |
|
---|
328 | r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
|
---|
329 | r.cc.addSet(fc.cc)
|
---|
330 |
|
---|
331 | return true
|
---|
332 | }
|
---|
333 |
|
---|
334 | // This is a related computation: it takes a RegexTree and computes the
|
---|
335 | // leading substring if it sees one. It's quite trivial and gives up easily.
|
---|
336 | func getPrefix(tree *RegexTree) *Prefix {
|
---|
337 | var concatNode *regexNode
|
---|
338 | nextChild := 0
|
---|
339 |
|
---|
340 | curNode := tree.root
|
---|
341 |
|
---|
342 | for {
|
---|
343 | switch curNode.t {
|
---|
344 | case ntConcatenate:
|
---|
345 | if len(curNode.children) > 0 {
|
---|
346 | concatNode = curNode
|
---|
347 | nextChild = 0
|
---|
348 | }
|
---|
349 |
|
---|
350 | case ntGreedy, ntCapture:
|
---|
351 | curNode = curNode.children[0]
|
---|
352 | concatNode = nil
|
---|
353 | continue
|
---|
354 |
|
---|
355 | case ntOneloop, ntOnelazy:
|
---|
356 | if curNode.m > 0 {
|
---|
357 | return &Prefix{
|
---|
358 | PrefixStr: repeat(curNode.ch, curNode.m),
|
---|
359 | CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
---|
360 | }
|
---|
361 | }
|
---|
362 | return nil
|
---|
363 |
|
---|
364 | case ntOne:
|
---|
365 | return &Prefix{
|
---|
366 | PrefixStr: []rune{curNode.ch},
|
---|
367 | CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
---|
368 | }
|
---|
369 |
|
---|
370 | case ntMulti:
|
---|
371 | return &Prefix{
|
---|
372 | PrefixStr: curNode.str,
|
---|
373 | CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
---|
374 | }
|
---|
375 |
|
---|
376 | case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
|
---|
377 | ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
|
---|
378 |
|
---|
379 | default:
|
---|
380 | return nil
|
---|
381 | }
|
---|
382 |
|
---|
383 | if concatNode == nil || nextChild >= len(concatNode.children) {
|
---|
384 | return nil
|
---|
385 | }
|
---|
386 |
|
---|
387 | curNode = concatNode.children[nextChild]
|
---|
388 | nextChild++
|
---|
389 | }
|
---|
390 | }
|
---|
391 |
|
---|
392 | // repeat the rune r, c times... up to the max of MaxPrefixSize
|
---|
393 | func repeat(r rune, c int) []rune {
|
---|
394 | if c > MaxPrefixSize {
|
---|
395 | c = MaxPrefixSize
|
---|
396 | }
|
---|
397 |
|
---|
398 | ret := make([]rune, c)
|
---|
399 |
|
---|
400 | // binary growth using copy for speed
|
---|
401 | ret[0] = r
|
---|
402 | bp := 1
|
---|
403 | for bp < len(ret) {
|
---|
404 | copy(ret[bp:], ret[:bp])
|
---|
405 | bp *= 2
|
---|
406 | }
|
---|
407 |
|
---|
408 | return ret
|
---|
409 | }
|
---|
410 |
|
---|
411 | // BmPrefix precomputes the Boyer-Moore
|
---|
412 | // tables for fast string scanning. These tables allow
|
---|
413 | // you to scan for the first occurrence of a string within
|
---|
414 | // a large body of text without examining every character.
|
---|
415 | // The performance of the heuristic depends on the actual
|
---|
416 | // string and the text being searched, but usually, the longer
|
---|
417 | // the string that is being searched for, the fewer characters
|
---|
418 | // need to be examined.
|
---|
419 | type BmPrefix struct {
|
---|
420 | positive []int
|
---|
421 | negativeASCII []int
|
---|
422 | negativeUnicode [][]int
|
---|
423 | pattern []rune
|
---|
424 | lowASCII rune
|
---|
425 | highASCII rune
|
---|
426 | rightToLeft bool
|
---|
427 | caseInsensitive bool
|
---|
428 | }
|
---|
429 |
|
---|
430 | func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
|
---|
431 |
|
---|
432 | b := &BmPrefix{
|
---|
433 | rightToLeft: rightToLeft,
|
---|
434 | caseInsensitive: caseInsensitive,
|
---|
435 | pattern: pattern,
|
---|
436 | }
|
---|
437 |
|
---|
438 | if caseInsensitive {
|
---|
439 | for i := 0; i < len(b.pattern); i++ {
|
---|
440 | // We do the ToLower character by character for consistency. With surrogate chars, doing
|
---|
441 | // a ToLower on the entire string could actually change the surrogate pair. This is more correct
|
---|
442 | // linguistically, but since Regex doesn't support surrogates, it's more important to be
|
---|
443 | // consistent.
|
---|
444 |
|
---|
445 | b.pattern[i] = unicode.ToLower(b.pattern[i])
|
---|
446 | }
|
---|
447 | }
|
---|
448 |
|
---|
449 | var beforefirst, last, bump int
|
---|
450 | var scan, match int
|
---|
451 |
|
---|
452 | if !rightToLeft {
|
---|
453 | beforefirst = -1
|
---|
454 | last = len(b.pattern) - 1
|
---|
455 | bump = 1
|
---|
456 | } else {
|
---|
457 | beforefirst = len(b.pattern)
|
---|
458 | last = 0
|
---|
459 | bump = -1
|
---|
460 | }
|
---|
461 |
|
---|
462 | // PART I - the good-suffix shift table
|
---|
463 | //
|
---|
464 | // compute the positive requirement:
|
---|
465 | // if char "i" is the first one from the right that doesn't match,
|
---|
466 | // then we know the matcher can advance by _positive[i].
|
---|
467 | //
|
---|
468 | // This algorithm is a simplified variant of the standard
|
---|
469 | // Boyer-Moore good suffix calculation.
|
---|
470 |
|
---|
471 | b.positive = make([]int, len(b.pattern))
|
---|
472 |
|
---|
473 | examine := last
|
---|
474 | ch := b.pattern[examine]
|
---|
475 | b.positive[examine] = bump
|
---|
476 | examine -= bump
|
---|
477 |
|
---|
478 | Outerloop:
|
---|
479 | for {
|
---|
480 | // find an internal char (examine) that matches the tail
|
---|
481 |
|
---|
482 | for {
|
---|
483 | if examine == beforefirst {
|
---|
484 | break Outerloop
|
---|
485 | }
|
---|
486 | if b.pattern[examine] == ch {
|
---|
487 | break
|
---|
488 | }
|
---|
489 | examine -= bump
|
---|
490 | }
|
---|
491 |
|
---|
492 | match = last
|
---|
493 | scan = examine
|
---|
494 |
|
---|
495 | // find the length of the match
|
---|
496 | for {
|
---|
497 | if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
|
---|
498 | // at the end of the match, note the difference in _positive
|
---|
499 | // this is not the length of the match, but the distance from the internal match
|
---|
500 | // to the tail suffix.
|
---|
501 | if b.positive[match] == 0 {
|
---|
502 | b.positive[match] = match - scan
|
---|
503 | }
|
---|
504 |
|
---|
505 | // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
|
---|
506 |
|
---|
507 | break
|
---|
508 | }
|
---|
509 |
|
---|
510 | scan -= bump
|
---|
511 | match -= bump
|
---|
512 | }
|
---|
513 |
|
---|
514 | examine -= bump
|
---|
515 | }
|
---|
516 |
|
---|
517 | match = last - bump
|
---|
518 |
|
---|
519 | // scan for the chars for which there are no shifts that yield a different candidate
|
---|
520 |
|
---|
521 | // The inside of the if statement used to say
|
---|
522 | // "_positive[match] = last - beforefirst;"
|
---|
523 | // This is slightly less aggressive in how much we skip, but at worst it
|
---|
524 | // should mean a little more work rather than skipping a potential match.
|
---|
525 | for match != beforefirst {
|
---|
526 | if b.positive[match] == 0 {
|
---|
527 | b.positive[match] = bump
|
---|
528 | }
|
---|
529 |
|
---|
530 | match -= bump
|
---|
531 | }
|
---|
532 |
|
---|
533 | // PART II - the bad-character shift table
|
---|
534 | //
|
---|
535 | // compute the negative requirement:
|
---|
536 | // if char "ch" is the reject character when testing position "i",
|
---|
537 | // we can slide up by _negative[ch];
|
---|
538 | // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
|
---|
539 | //
|
---|
540 | // the lookup table is divided into ASCII and Unicode portions;
|
---|
541 | // only those parts of the Unicode 16-bit code set that actually
|
---|
542 | // appear in the string are in the table. (Maximum size with
|
---|
543 | // Unicode is 65K; ASCII only case is 512 bytes.)
|
---|
544 |
|
---|
545 | b.negativeASCII = make([]int, 128)
|
---|
546 |
|
---|
547 | for i := 0; i < len(b.negativeASCII); i++ {
|
---|
548 | b.negativeASCII[i] = last - beforefirst
|
---|
549 | }
|
---|
550 |
|
---|
551 | b.lowASCII = 127
|
---|
552 | b.highASCII = 0
|
---|
553 |
|
---|
554 | for examine = last; examine != beforefirst; examine -= bump {
|
---|
555 | ch = b.pattern[examine]
|
---|
556 |
|
---|
557 | switch {
|
---|
558 | case ch < 128:
|
---|
559 | if b.lowASCII > ch {
|
---|
560 | b.lowASCII = ch
|
---|
561 | }
|
---|
562 |
|
---|
563 | if b.highASCII < ch {
|
---|
564 | b.highASCII = ch
|
---|
565 | }
|
---|
566 |
|
---|
567 | if b.negativeASCII[ch] == last-beforefirst {
|
---|
568 | b.negativeASCII[ch] = last - examine
|
---|
569 | }
|
---|
570 | case ch <= 0xffff:
|
---|
571 | i, j := ch>>8, ch&0xFF
|
---|
572 |
|
---|
573 | if b.negativeUnicode == nil {
|
---|
574 | b.negativeUnicode = make([][]int, 256)
|
---|
575 | }
|
---|
576 |
|
---|
577 | if b.negativeUnicode[i] == nil {
|
---|
578 | newarray := make([]int, 256)
|
---|
579 |
|
---|
580 | for k := 0; k < len(newarray); k++ {
|
---|
581 | newarray[k] = last - beforefirst
|
---|
582 | }
|
---|
583 |
|
---|
584 | if i == 0 {
|
---|
585 | copy(newarray, b.negativeASCII)
|
---|
586 | //TODO: this line needed?
|
---|
587 | b.negativeASCII = newarray
|
---|
588 | }
|
---|
589 |
|
---|
590 | b.negativeUnicode[i] = newarray
|
---|
591 | }
|
---|
592 |
|
---|
593 | if b.negativeUnicode[i][j] == last-beforefirst {
|
---|
594 | b.negativeUnicode[i][j] = last - examine
|
---|
595 | }
|
---|
596 | default:
|
---|
597 | // we can't do the filter because this algo doesn't support
|
---|
598 | // unicode chars >0xffff
|
---|
599 | return nil
|
---|
600 | }
|
---|
601 | }
|
---|
602 |
|
---|
603 | return b
|
---|
604 | }
|
---|
605 |
|
---|
606 | func (b *BmPrefix) String() string {
|
---|
607 | return string(b.pattern)
|
---|
608 | }
|
---|
609 |
|
---|
610 | // Dump returns the contents of the filter as a human readable string
|
---|
611 | func (b *BmPrefix) Dump(indent string) string {
|
---|
612 | buf := &bytes.Buffer{}
|
---|
613 |
|
---|
614 | fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
|
---|
615 | for i := 0; i < len(b.positive); i++ {
|
---|
616 | buf.WriteString(strconv.Itoa(b.positive[i]))
|
---|
617 | buf.WriteRune(' ')
|
---|
618 | }
|
---|
619 | buf.WriteRune('\n')
|
---|
620 |
|
---|
621 | if b.negativeASCII != nil {
|
---|
622 | buf.WriteString(indent)
|
---|
623 | buf.WriteString("Negative table\n")
|
---|
624 | for i := 0; i < len(b.negativeASCII); i++ {
|
---|
625 | if b.negativeASCII[i] != len(b.pattern) {
|
---|
626 | fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
|
---|
627 | }
|
---|
628 | }
|
---|
629 | }
|
---|
630 |
|
---|
631 | return buf.String()
|
---|
632 | }
|
---|
633 |
|
---|
634 | // Scan uses the Boyer-Moore algorithm to find the first occurrence
|
---|
635 | // of the specified string within text, beginning at index, and
|
---|
636 | // constrained within beglimit and endlimit.
|
---|
637 | //
|
---|
638 | // The direction and case-sensitivity of the match is determined
|
---|
639 | // by the arguments to the RegexBoyerMoore constructor.
|
---|
640 | func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
|
---|
641 | var (
|
---|
642 | defadv, test, test2 int
|
---|
643 | match, startmatch, endmatch int
|
---|
644 | bump, advance int
|
---|
645 | chTest rune
|
---|
646 | unicodeLookup []int
|
---|
647 | )
|
---|
648 |
|
---|
649 | if !b.rightToLeft {
|
---|
650 | defadv = len(b.pattern)
|
---|
651 | startmatch = len(b.pattern) - 1
|
---|
652 | endmatch = 0
|
---|
653 | test = index + defadv - 1
|
---|
654 | bump = 1
|
---|
655 | } else {
|
---|
656 | defadv = -len(b.pattern)
|
---|
657 | startmatch = 0
|
---|
658 | endmatch = -defadv - 1
|
---|
659 | test = index + defadv
|
---|
660 | bump = -1
|
---|
661 | }
|
---|
662 |
|
---|
663 | chMatch := b.pattern[startmatch]
|
---|
664 |
|
---|
665 | for {
|
---|
666 | if test >= endlimit || test < beglimit {
|
---|
667 | return -1
|
---|
668 | }
|
---|
669 |
|
---|
670 | chTest = text[test]
|
---|
671 |
|
---|
672 | if b.caseInsensitive {
|
---|
673 | chTest = unicode.ToLower(chTest)
|
---|
674 | }
|
---|
675 |
|
---|
676 | if chTest != chMatch {
|
---|
677 | if chTest < 128 {
|
---|
678 | advance = b.negativeASCII[chTest]
|
---|
679 | } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
---|
680 | unicodeLookup = b.negativeUnicode[chTest>>8]
|
---|
681 | if len(unicodeLookup) > 0 {
|
---|
682 | advance = unicodeLookup[chTest&0xFF]
|
---|
683 | } else {
|
---|
684 | advance = defadv
|
---|
685 | }
|
---|
686 | } else {
|
---|
687 | advance = defadv
|
---|
688 | }
|
---|
689 |
|
---|
690 | test += advance
|
---|
691 | } else { // if (chTest == chMatch)
|
---|
692 | test2 = test
|
---|
693 | match = startmatch
|
---|
694 |
|
---|
695 | for {
|
---|
696 | if match == endmatch {
|
---|
697 | if b.rightToLeft {
|
---|
698 | return test2 + 1
|
---|
699 | } else {
|
---|
700 | return test2
|
---|
701 | }
|
---|
702 | }
|
---|
703 |
|
---|
704 | match -= bump
|
---|
705 | test2 -= bump
|
---|
706 |
|
---|
707 | chTest = text[test2]
|
---|
708 |
|
---|
709 | if b.caseInsensitive {
|
---|
710 | chTest = unicode.ToLower(chTest)
|
---|
711 | }
|
---|
712 |
|
---|
713 | if chTest != b.pattern[match] {
|
---|
714 | advance = b.positive[match]
|
---|
715 | if (chTest & 0xFF80) == 0 {
|
---|
716 | test2 = (match - startmatch) + b.negativeASCII[chTest]
|
---|
717 | } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
---|
718 | unicodeLookup = b.negativeUnicode[chTest>>8]
|
---|
719 | if len(unicodeLookup) > 0 {
|
---|
720 | test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
|
---|
721 | } else {
|
---|
722 | test += advance
|
---|
723 | break
|
---|
724 | }
|
---|
725 | } else {
|
---|
726 | test += advance
|
---|
727 | break
|
---|
728 | }
|
---|
729 |
|
---|
730 | if b.rightToLeft {
|
---|
731 | if test2 < advance {
|
---|
732 | advance = test2
|
---|
733 | }
|
---|
734 | } else if test2 > advance {
|
---|
735 | advance = test2
|
---|
736 | }
|
---|
737 |
|
---|
738 | test += advance
|
---|
739 | break
|
---|
740 | }
|
---|
741 | }
|
---|
742 | }
|
---|
743 | }
|
---|
744 | }
|
---|
745 |
|
---|
746 | // When a regex is anchored, we can do a quick IsMatch test instead of a Scan
|
---|
747 | func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
|
---|
748 | if !b.rightToLeft {
|
---|
749 | if index < beglimit || endlimit-index < len(b.pattern) {
|
---|
750 | return false
|
---|
751 | }
|
---|
752 |
|
---|
753 | return b.matchPattern(text, index)
|
---|
754 | } else {
|
---|
755 | if index > endlimit || index-beglimit < len(b.pattern) {
|
---|
756 | return false
|
---|
757 | }
|
---|
758 |
|
---|
759 | return b.matchPattern(text, index-len(b.pattern))
|
---|
760 | }
|
---|
761 | }
|
---|
762 |
|
---|
763 | func (b *BmPrefix) matchPattern(text []rune, index int) bool {
|
---|
764 | if len(text)-index < len(b.pattern) {
|
---|
765 | return false
|
---|
766 | }
|
---|
767 |
|
---|
768 | if b.caseInsensitive {
|
---|
769 | for i := 0; i < len(b.pattern); i++ {
|
---|
770 | //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
|
---|
771 | if unicode.ToLower(text[index+i]) != b.pattern[i] {
|
---|
772 | return false
|
---|
773 | }
|
---|
774 | }
|
---|
775 | return true
|
---|
776 | } else {
|
---|
777 | for i := 0; i < len(b.pattern); i++ {
|
---|
778 | if text[index+i] != b.pattern[i] {
|
---|
779 | return false
|
---|
780 | }
|
---|
781 | }
|
---|
782 | return true
|
---|
783 | }
|
---|
784 | }
|
---|
785 |
|
---|
786 | type AnchorLoc int16
|
---|
787 |
|
---|
788 | // where the regex can be pegged
|
---|
789 | const (
|
---|
790 | AnchorBeginning AnchorLoc = 0x0001
|
---|
791 | AnchorBol = 0x0002
|
---|
792 | AnchorStart = 0x0004
|
---|
793 | AnchorEol = 0x0008
|
---|
794 | AnchorEndZ = 0x0010
|
---|
795 | AnchorEnd = 0x0020
|
---|
796 | AnchorBoundary = 0x0040
|
---|
797 | AnchorECMABoundary = 0x0080
|
---|
798 | )
|
---|
799 |
|
---|
800 | func getAnchors(tree *RegexTree) AnchorLoc {
|
---|
801 |
|
---|
802 | var concatNode *regexNode
|
---|
803 | nextChild, result := 0, AnchorLoc(0)
|
---|
804 |
|
---|
805 | curNode := tree.root
|
---|
806 |
|
---|
807 | for {
|
---|
808 | switch curNode.t {
|
---|
809 | case ntConcatenate:
|
---|
810 | if len(curNode.children) > 0 {
|
---|
811 | concatNode = curNode
|
---|
812 | nextChild = 0
|
---|
813 | }
|
---|
814 |
|
---|
815 | case ntGreedy, ntCapture:
|
---|
816 | curNode = curNode.children[0]
|
---|
817 | concatNode = nil
|
---|
818 | continue
|
---|
819 |
|
---|
820 | case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
|
---|
821 | ntStart, ntEndZ, ntEnd:
|
---|
822 | return result | anchorFromType(curNode.t)
|
---|
823 |
|
---|
824 | case ntEmpty, ntRequire, ntPrevent:
|
---|
825 |
|
---|
826 | default:
|
---|
827 | return result
|
---|
828 | }
|
---|
829 |
|
---|
830 | if concatNode == nil || nextChild >= len(concatNode.children) {
|
---|
831 | return result
|
---|
832 | }
|
---|
833 |
|
---|
834 | curNode = concatNode.children[nextChild]
|
---|
835 | nextChild++
|
---|
836 | }
|
---|
837 | }
|
---|
838 |
|
---|
839 | func anchorFromType(t nodeType) AnchorLoc {
|
---|
840 | switch t {
|
---|
841 | case ntBol:
|
---|
842 | return AnchorBol
|
---|
843 | case ntEol:
|
---|
844 | return AnchorEol
|
---|
845 | case ntBoundary:
|
---|
846 | return AnchorBoundary
|
---|
847 | case ntECMABoundary:
|
---|
848 | return AnchorECMABoundary
|
---|
849 | case ntBeginning:
|
---|
850 | return AnchorBeginning
|
---|
851 | case ntStart:
|
---|
852 | return AnchorStart
|
---|
853 | case ntEndZ:
|
---|
854 | return AnchorEndZ
|
---|
855 | case ntEnd:
|
---|
856 | return AnchorEnd
|
---|
857 | default:
|
---|
858 | return 0
|
---|
859 | }
|
---|
860 | }
|
---|
861 |
|
---|
862 | // anchorDescription returns a human-readable description of the anchors
|
---|
863 | func (anchors AnchorLoc) String() string {
|
---|
864 | buf := &bytes.Buffer{}
|
---|
865 |
|
---|
866 | if 0 != (anchors & AnchorBeginning) {
|
---|
867 | buf.WriteString(", Beginning")
|
---|
868 | }
|
---|
869 | if 0 != (anchors & AnchorStart) {
|
---|
870 | buf.WriteString(", Start")
|
---|
871 | }
|
---|
872 | if 0 != (anchors & AnchorBol) {
|
---|
873 | buf.WriteString(", Bol")
|
---|
874 | }
|
---|
875 | if 0 != (anchors & AnchorBoundary) {
|
---|
876 | buf.WriteString(", Boundary")
|
---|
877 | }
|
---|
878 | if 0 != (anchors & AnchorECMABoundary) {
|
---|
879 | buf.WriteString(", ECMABoundary")
|
---|
880 | }
|
---|
881 | if 0 != (anchors & AnchorEol) {
|
---|
882 | buf.WriteString(", Eol")
|
---|
883 | }
|
---|
884 | if 0 != (anchors & AnchorEnd) {
|
---|
885 | buf.WriteString(", End")
|
---|
886 | }
|
---|
887 | if 0 != (anchors & AnchorEndZ) {
|
---|
888 | buf.WriteString(", EndZ")
|
---|
889 | }
|
---|
890 |
|
---|
891 | // trim off comma
|
---|
892 | if buf.Len() >= 2 {
|
---|
893 | return buf.String()[2:]
|
---|
894 | }
|
---|
895 | return "None"
|
---|
896 | }
|
---|