1 | package syntax
|
---|
2 |
|
---|
3 | import (
|
---|
4 | "bytes"
|
---|
5 | "encoding/binary"
|
---|
6 | "fmt"
|
---|
7 | "sort"
|
---|
8 | "unicode"
|
---|
9 | "unicode/utf8"
|
---|
10 | )
|
---|
11 |
|
---|
12 | // CharSet combines start-end rune ranges and unicode categories representing a set of characters
|
---|
13 | type CharSet struct {
|
---|
14 | ranges []singleRange
|
---|
15 | categories []category
|
---|
16 | sub *CharSet //optional subtractor
|
---|
17 | negate bool
|
---|
18 | anything bool
|
---|
19 | }
|
---|
20 |
|
---|
21 | type category struct {
|
---|
22 | negate bool
|
---|
23 | cat string
|
---|
24 | }
|
---|
25 |
|
---|
26 | type singleRange struct {
|
---|
27 | first rune
|
---|
28 | last rune
|
---|
29 | }
|
---|
30 |
|
---|
31 | const (
|
---|
32 | spaceCategoryText = " "
|
---|
33 | wordCategoryText = "W"
|
---|
34 | )
|
---|
35 |
|
---|
36 | var (
|
---|
37 | ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
|
---|
38 | ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
|
---|
39 | ecmaDigit = []rune{0x0030, 0x003a}
|
---|
40 | )
|
---|
41 |
|
---|
42 | var (
|
---|
43 | AnyClass = getCharSetFromOldString([]rune{0}, false)
|
---|
44 | ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
|
---|
45 | NoneClass = getCharSetFromOldString(nil, false)
|
---|
46 | ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
|
---|
47 | NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
|
---|
48 | ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
|
---|
49 | NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
|
---|
50 | ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
|
---|
51 | NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
|
---|
52 |
|
---|
53 | WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
|
---|
54 | NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
|
---|
55 | SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
|
---|
56 | NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
|
---|
57 | DigitClass = getCharSetFromCategoryString(false, false, "Nd")
|
---|
58 | NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
|
---|
59 | )
|
---|
60 |
|
---|
61 | var unicodeCategories = func() map[string]*unicode.RangeTable {
|
---|
62 | retVal := make(map[string]*unicode.RangeTable)
|
---|
63 | for k, v := range unicode.Scripts {
|
---|
64 | retVal[k] = v
|
---|
65 | }
|
---|
66 | for k, v := range unicode.Categories {
|
---|
67 | retVal[k] = v
|
---|
68 | }
|
---|
69 | for k, v := range unicode.Properties {
|
---|
70 | retVal[k] = v
|
---|
71 | }
|
---|
72 | return retVal
|
---|
73 | }()
|
---|
74 |
|
---|
75 | func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
|
---|
76 | if negateCat && negateSet {
|
---|
77 | panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
|
---|
78 | }
|
---|
79 |
|
---|
80 | c := CharSet{negate: negateSet}
|
---|
81 |
|
---|
82 | c.categories = make([]category, len(cats))
|
---|
83 | for i, cat := range cats {
|
---|
84 | c.categories[i] = category{cat: cat, negate: negateCat}
|
---|
85 | }
|
---|
86 | return func() *CharSet {
|
---|
87 | //make a copy each time
|
---|
88 | local := c
|
---|
89 | //return that address
|
---|
90 | return &local
|
---|
91 | }
|
---|
92 | }
|
---|
93 |
|
---|
94 | func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
|
---|
95 | c := CharSet{}
|
---|
96 | if len(setText) > 0 {
|
---|
97 | fillFirst := false
|
---|
98 | l := len(setText)
|
---|
99 | if negate {
|
---|
100 | if setText[0] == 0 {
|
---|
101 | setText = setText[1:]
|
---|
102 | } else {
|
---|
103 | l++
|
---|
104 | fillFirst = true
|
---|
105 | }
|
---|
106 | }
|
---|
107 |
|
---|
108 | if l%2 == 0 {
|
---|
109 | c.ranges = make([]singleRange, l/2)
|
---|
110 | } else {
|
---|
111 | c.ranges = make([]singleRange, l/2+1)
|
---|
112 | }
|
---|
113 |
|
---|
114 | first := true
|
---|
115 | if fillFirst {
|
---|
116 | c.ranges[0] = singleRange{first: 0}
|
---|
117 | first = false
|
---|
118 | }
|
---|
119 |
|
---|
120 | i := 0
|
---|
121 | for _, r := range setText {
|
---|
122 | if first {
|
---|
123 | // lower bound in a new range
|
---|
124 | c.ranges[i] = singleRange{first: r}
|
---|
125 | first = false
|
---|
126 | } else {
|
---|
127 | c.ranges[i].last = r - 1
|
---|
128 | i++
|
---|
129 | first = true
|
---|
130 | }
|
---|
131 | }
|
---|
132 | if !first {
|
---|
133 | c.ranges[i].last = utf8.MaxRune
|
---|
134 | }
|
---|
135 | }
|
---|
136 |
|
---|
137 | return func() *CharSet {
|
---|
138 | local := c
|
---|
139 | return &local
|
---|
140 | }
|
---|
141 | }
|
---|
142 |
|
---|
143 | // Copy makes a deep copy to prevent accidental mutation of a set
|
---|
144 | func (c CharSet) Copy() CharSet {
|
---|
145 | ret := CharSet{
|
---|
146 | anything: c.anything,
|
---|
147 | negate: c.negate,
|
---|
148 | }
|
---|
149 |
|
---|
150 | ret.ranges = append(ret.ranges, c.ranges...)
|
---|
151 | ret.categories = append(ret.categories, c.categories...)
|
---|
152 |
|
---|
153 | if c.sub != nil {
|
---|
154 | sub := c.sub.Copy()
|
---|
155 | ret.sub = &sub
|
---|
156 | }
|
---|
157 |
|
---|
158 | return ret
|
---|
159 | }
|
---|
160 |
|
---|
161 | // gets a human-readable description for a set string
|
---|
162 | func (c CharSet) String() string {
|
---|
163 | buf := &bytes.Buffer{}
|
---|
164 | buf.WriteRune('[')
|
---|
165 |
|
---|
166 | if c.IsNegated() {
|
---|
167 | buf.WriteRune('^')
|
---|
168 | }
|
---|
169 |
|
---|
170 | for _, r := range c.ranges {
|
---|
171 |
|
---|
172 | buf.WriteString(CharDescription(r.first))
|
---|
173 | if r.first != r.last {
|
---|
174 | if r.last-r.first != 1 {
|
---|
175 | //groups that are 1 char apart skip the dash
|
---|
176 | buf.WriteRune('-')
|
---|
177 | }
|
---|
178 | buf.WriteString(CharDescription(r.last))
|
---|
179 | }
|
---|
180 | }
|
---|
181 |
|
---|
182 | for _, c := range c.categories {
|
---|
183 | buf.WriteString(c.String())
|
---|
184 | }
|
---|
185 |
|
---|
186 | if c.sub != nil {
|
---|
187 | buf.WriteRune('-')
|
---|
188 | buf.WriteString(c.sub.String())
|
---|
189 | }
|
---|
190 |
|
---|
191 | buf.WriteRune(']')
|
---|
192 |
|
---|
193 | return buf.String()
|
---|
194 | }
|
---|
195 |
|
---|
196 | // mapHashFill converts a charset into a buffer for use in maps
|
---|
197 | func (c CharSet) mapHashFill(buf *bytes.Buffer) {
|
---|
198 | if c.negate {
|
---|
199 | buf.WriteByte(0)
|
---|
200 | } else {
|
---|
201 | buf.WriteByte(1)
|
---|
202 | }
|
---|
203 |
|
---|
204 | binary.Write(buf, binary.LittleEndian, len(c.ranges))
|
---|
205 | binary.Write(buf, binary.LittleEndian, len(c.categories))
|
---|
206 | for _, r := range c.ranges {
|
---|
207 | buf.WriteRune(r.first)
|
---|
208 | buf.WriteRune(r.last)
|
---|
209 | }
|
---|
210 | for _, ct := range c.categories {
|
---|
211 | buf.WriteString(ct.cat)
|
---|
212 | if ct.negate {
|
---|
213 | buf.WriteByte(1)
|
---|
214 | } else {
|
---|
215 | buf.WriteByte(0)
|
---|
216 | }
|
---|
217 | }
|
---|
218 |
|
---|
219 | if c.sub != nil {
|
---|
220 | c.sub.mapHashFill(buf)
|
---|
221 | }
|
---|
222 | }
|
---|
223 |
|
---|
224 | // CharIn returns true if the rune is in our character set (either ranges or categories).
|
---|
225 | // It handles negations and subtracted sub-charsets.
|
---|
226 | func (c CharSet) CharIn(ch rune) bool {
|
---|
227 | val := false
|
---|
228 | // in s && !s.subtracted
|
---|
229 |
|
---|
230 | //check ranges
|
---|
231 | for _, r := range c.ranges {
|
---|
232 | if ch < r.first {
|
---|
233 | continue
|
---|
234 | }
|
---|
235 | if ch <= r.last {
|
---|
236 | val = true
|
---|
237 | break
|
---|
238 | }
|
---|
239 | }
|
---|
240 |
|
---|
241 | //check categories if we haven't already found a range
|
---|
242 | if !val && len(c.categories) > 0 {
|
---|
243 | for _, ct := range c.categories {
|
---|
244 | // special categories...then unicode
|
---|
245 | if ct.cat == spaceCategoryText {
|
---|
246 | if unicode.IsSpace(ch) {
|
---|
247 | // we found a space so we're done
|
---|
248 | // negate means this is a "bad" thing
|
---|
249 | val = !ct.negate
|
---|
250 | break
|
---|
251 | } else if ct.negate {
|
---|
252 | val = true
|
---|
253 | break
|
---|
254 | }
|
---|
255 | } else if ct.cat == wordCategoryText {
|
---|
256 | if IsWordChar(ch) {
|
---|
257 | val = !ct.negate
|
---|
258 | break
|
---|
259 | } else if ct.negate {
|
---|
260 | val = true
|
---|
261 | break
|
---|
262 | }
|
---|
263 | } else if unicode.Is(unicodeCategories[ct.cat], ch) {
|
---|
264 | // if we're in this unicode category then we're done
|
---|
265 | // if negate=true on this category then we "failed" our test
|
---|
266 | // otherwise we're good that we found it
|
---|
267 | val = !ct.negate
|
---|
268 | break
|
---|
269 | } else if ct.negate {
|
---|
270 | val = true
|
---|
271 | break
|
---|
272 | }
|
---|
273 | }
|
---|
274 | }
|
---|
275 |
|
---|
276 | // negate the whole char set
|
---|
277 | if c.negate {
|
---|
278 | val = !val
|
---|
279 | }
|
---|
280 |
|
---|
281 | // get subtracted recurse
|
---|
282 | if val && c.sub != nil {
|
---|
283 | val = !c.sub.CharIn(ch)
|
---|
284 | }
|
---|
285 |
|
---|
286 | //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
|
---|
287 | return val
|
---|
288 | }
|
---|
289 |
|
---|
290 | func (c category) String() string {
|
---|
291 | switch c.cat {
|
---|
292 | case spaceCategoryText:
|
---|
293 | if c.negate {
|
---|
294 | return "\\S"
|
---|
295 | }
|
---|
296 | return "\\s"
|
---|
297 | case wordCategoryText:
|
---|
298 | if c.negate {
|
---|
299 | return "\\W"
|
---|
300 | }
|
---|
301 | return "\\w"
|
---|
302 | }
|
---|
303 | if _, ok := unicodeCategories[c.cat]; ok {
|
---|
304 |
|
---|
305 | if c.negate {
|
---|
306 | return "\\P{" + c.cat + "}"
|
---|
307 | }
|
---|
308 | return "\\p{" + c.cat + "}"
|
---|
309 | }
|
---|
310 | return "Unknown category: " + c.cat
|
---|
311 | }
|
---|
312 |
|
---|
313 | // CharDescription Produces a human-readable description for a single character.
|
---|
314 | func CharDescription(ch rune) string {
|
---|
315 | /*if ch == '\\' {
|
---|
316 | return "\\\\"
|
---|
317 | }
|
---|
318 |
|
---|
319 | if ch > ' ' && ch <= '~' {
|
---|
320 | return string(ch)
|
---|
321 | } else if ch == '\n' {
|
---|
322 | return "\\n"
|
---|
323 | } else if ch == ' ' {
|
---|
324 | return "\\ "
|
---|
325 | }*/
|
---|
326 |
|
---|
327 | b := &bytes.Buffer{}
|
---|
328 | escape(b, ch, false) //fmt.Sprintf("%U", ch)
|
---|
329 | return b.String()
|
---|
330 | }
|
---|
331 |
|
---|
332 | // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
|
---|
333 | // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
|
---|
334 | // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
|
---|
335 | // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
|
---|
336 | func IsWordChar(r rune) bool {
|
---|
337 | //"L", "Mn", "Nd", "Pc"
|
---|
338 | return unicode.In(r,
|
---|
339 | unicode.Categories["L"], unicode.Categories["Mn"],
|
---|
340 | unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
|
---|
341 | //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
---|
342 | }
|
---|
343 |
|
---|
344 | func IsECMAWordChar(r rune) bool {
|
---|
345 | return unicode.In(r,
|
---|
346 | unicode.Categories["L"], unicode.Categories["Mn"],
|
---|
347 | unicode.Categories["Nd"], unicode.Categories["Pc"])
|
---|
348 |
|
---|
349 | //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
---|
350 | }
|
---|
351 |
|
---|
352 | // SingletonChar will return the char from the first range without validation.
|
---|
353 | // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
|
---|
354 | func (c CharSet) SingletonChar() rune {
|
---|
355 | return c.ranges[0].first
|
---|
356 | }
|
---|
357 |
|
---|
358 | func (c CharSet) IsSingleton() bool {
|
---|
359 | return !c.negate && //negated is multiple chars
|
---|
360 | len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
---|
361 | c.sub == nil && // subtraction means we've got multiple chars
|
---|
362 | c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
---|
363 | }
|
---|
364 |
|
---|
365 | func (c CharSet) IsSingletonInverse() bool {
|
---|
366 | return c.negate && //same as above, but requires negated
|
---|
367 | len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
---|
368 | c.sub == nil && // subtraction means we've got multiple chars
|
---|
369 | c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
---|
370 | }
|
---|
371 |
|
---|
372 | func (c CharSet) IsMergeable() bool {
|
---|
373 | return !c.IsNegated() && !c.HasSubtraction()
|
---|
374 | }
|
---|
375 |
|
---|
376 | func (c CharSet) IsNegated() bool {
|
---|
377 | return c.negate
|
---|
378 | }
|
---|
379 |
|
---|
380 | func (c CharSet) HasSubtraction() bool {
|
---|
381 | return c.sub != nil
|
---|
382 | }
|
---|
383 |
|
---|
384 | func (c CharSet) IsEmpty() bool {
|
---|
385 | return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
|
---|
386 | }
|
---|
387 |
|
---|
388 | func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
|
---|
389 | if ecma {
|
---|
390 | if negate {
|
---|
391 | c.addRanges(NotECMADigitClass().ranges)
|
---|
392 | } else {
|
---|
393 | c.addRanges(ECMADigitClass().ranges)
|
---|
394 | }
|
---|
395 | } else {
|
---|
396 | c.addCategories(category{cat: "Nd", negate: negate})
|
---|
397 | }
|
---|
398 | }
|
---|
399 |
|
---|
400 | func (c *CharSet) addChar(ch rune) {
|
---|
401 | c.addRange(ch, ch)
|
---|
402 | }
|
---|
403 |
|
---|
404 | func (c *CharSet) addSpace(ecma, negate bool) {
|
---|
405 | if ecma {
|
---|
406 | if negate {
|
---|
407 | c.addRanges(NotECMASpaceClass().ranges)
|
---|
408 | } else {
|
---|
409 | c.addRanges(ECMASpaceClass().ranges)
|
---|
410 | }
|
---|
411 | } else {
|
---|
412 | c.addCategories(category{cat: spaceCategoryText, negate: negate})
|
---|
413 | }
|
---|
414 | }
|
---|
415 |
|
---|
416 | func (c *CharSet) addWord(ecma, negate bool) {
|
---|
417 | if ecma {
|
---|
418 | if negate {
|
---|
419 | c.addRanges(NotECMAWordClass().ranges)
|
---|
420 | } else {
|
---|
421 | c.addRanges(ECMAWordClass().ranges)
|
---|
422 | }
|
---|
423 | } else {
|
---|
424 | c.addCategories(category{cat: wordCategoryText, negate: negate})
|
---|
425 | }
|
---|
426 | }
|
---|
427 |
|
---|
428 | // Add set ranges and categories into ours -- no deduping or anything
|
---|
429 | func (c *CharSet) addSet(set CharSet) {
|
---|
430 | if c.anything {
|
---|
431 | return
|
---|
432 | }
|
---|
433 | if set.anything {
|
---|
434 | c.makeAnything()
|
---|
435 | return
|
---|
436 | }
|
---|
437 | // just append here to prevent double-canon
|
---|
438 | c.ranges = append(c.ranges, set.ranges...)
|
---|
439 | c.addCategories(set.categories...)
|
---|
440 | c.canonicalize()
|
---|
441 | }
|
---|
442 |
|
---|
443 | func (c *CharSet) makeAnything() {
|
---|
444 | c.anything = true
|
---|
445 | c.categories = []category{}
|
---|
446 | c.ranges = AnyClass().ranges
|
---|
447 | }
|
---|
448 |
|
---|
449 | func (c *CharSet) addCategories(cats ...category) {
|
---|
450 | // don't add dupes and remove positive+negative
|
---|
451 | if c.anything {
|
---|
452 | // if we've had a previous positive+negative group then
|
---|
453 | // just return, we're as broad as we can get
|
---|
454 | return
|
---|
455 | }
|
---|
456 |
|
---|
457 | for _, ct := range cats {
|
---|
458 | found := false
|
---|
459 | for _, ct2 := range c.categories {
|
---|
460 | if ct.cat == ct2.cat {
|
---|
461 | if ct.negate != ct2.negate {
|
---|
462 | // oposite negations...this mean we just
|
---|
463 | // take us as anything and move on
|
---|
464 | c.makeAnything()
|
---|
465 | return
|
---|
466 | }
|
---|
467 | found = true
|
---|
468 | break
|
---|
469 | }
|
---|
470 | }
|
---|
471 |
|
---|
472 | if !found {
|
---|
473 | c.categories = append(c.categories, ct)
|
---|
474 | }
|
---|
475 | }
|
---|
476 | }
|
---|
477 |
|
---|
478 | // Merges new ranges to our own
|
---|
479 | func (c *CharSet) addRanges(ranges []singleRange) {
|
---|
480 | if c.anything {
|
---|
481 | return
|
---|
482 | }
|
---|
483 | c.ranges = append(c.ranges, ranges...)
|
---|
484 | c.canonicalize()
|
---|
485 | }
|
---|
486 |
|
---|
487 | // Merges everything but the new ranges into our own
|
---|
488 | func (c *CharSet) addNegativeRanges(ranges []singleRange) {
|
---|
489 | if c.anything {
|
---|
490 | return
|
---|
491 | }
|
---|
492 |
|
---|
493 | var hi rune
|
---|
494 |
|
---|
495 | // convert incoming ranges into opposites, assume they are in order
|
---|
496 | for _, r := range ranges {
|
---|
497 | if hi < r.first {
|
---|
498 | c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
|
---|
499 | }
|
---|
500 | hi = r.last + 1
|
---|
501 | }
|
---|
502 |
|
---|
503 | if hi < utf8.MaxRune {
|
---|
504 | c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
|
---|
505 | }
|
---|
506 |
|
---|
507 | c.canonicalize()
|
---|
508 | }
|
---|
509 |
|
---|
510 | func isValidUnicodeCat(catName string) bool {
|
---|
511 | _, ok := unicodeCategories[catName]
|
---|
512 | return ok
|
---|
513 | }
|
---|
514 |
|
---|
515 | func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
|
---|
516 | if !isValidUnicodeCat(categoryName) {
|
---|
517 | // unknown unicode category, script, or property "blah"
|
---|
518 | panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
|
---|
519 |
|
---|
520 | }
|
---|
521 |
|
---|
522 | if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
|
---|
523 | // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
|
---|
524 | c.addCategories(
|
---|
525 | category{cat: "Ll", negate: negate},
|
---|
526 | category{cat: "Lu", negate: negate},
|
---|
527 | category{cat: "Lt", negate: negate})
|
---|
528 | }
|
---|
529 | c.addCategories(category{cat: categoryName, negate: negate})
|
---|
530 | }
|
---|
531 |
|
---|
532 | func (c *CharSet) addSubtraction(sub *CharSet) {
|
---|
533 | c.sub = sub
|
---|
534 | }
|
---|
535 |
|
---|
536 | func (c *CharSet) addRange(chMin, chMax rune) {
|
---|
537 | c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
|
---|
538 | c.canonicalize()
|
---|
539 | }
|
---|
540 |
|
---|
541 | func (c *CharSet) addNamedASCII(name string, negate bool) bool {
|
---|
542 | var rs []singleRange
|
---|
543 |
|
---|
544 | switch name {
|
---|
545 | case "alnum":
|
---|
546 | rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
---|
547 | case "alpha":
|
---|
548 | rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
---|
549 | case "ascii":
|
---|
550 | rs = []singleRange{singleRange{0, 0x7f}}
|
---|
551 | case "blank":
|
---|
552 | rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
|
---|
553 | case "cntrl":
|
---|
554 | rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
|
---|
555 | case "digit":
|
---|
556 | c.addDigit(false, negate, "")
|
---|
557 | case "graph":
|
---|
558 | rs = []singleRange{singleRange{'!', '~'}}
|
---|
559 | case "lower":
|
---|
560 | rs = []singleRange{singleRange{'a', 'z'}}
|
---|
561 | case "print":
|
---|
562 | rs = []singleRange{singleRange{' ', '~'}}
|
---|
563 | case "punct": //[!-/:-@[-`{-~]
|
---|
564 | rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
|
---|
565 | case "space":
|
---|
566 | c.addSpace(true, negate)
|
---|
567 | case "upper":
|
---|
568 | rs = []singleRange{singleRange{'A', 'Z'}}
|
---|
569 | case "word":
|
---|
570 | c.addWord(true, negate)
|
---|
571 | case "xdigit":
|
---|
572 | rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
|
---|
573 | default:
|
---|
574 | return false
|
---|
575 | }
|
---|
576 |
|
---|
577 | if len(rs) > 0 {
|
---|
578 | if negate {
|
---|
579 | c.addNegativeRanges(rs)
|
---|
580 | } else {
|
---|
581 | c.addRanges(rs)
|
---|
582 | }
|
---|
583 | }
|
---|
584 |
|
---|
585 | return true
|
---|
586 | }
|
---|
587 |
|
---|
588 | type singleRangeSorter []singleRange
|
---|
589 |
|
---|
590 | func (p singleRangeSorter) Len() int { return len(p) }
|
---|
591 | func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
|
---|
592 | func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
---|
593 |
|
---|
594 | // Logic to reduce a character class to a unique, sorted form.
|
---|
595 | func (c *CharSet) canonicalize() {
|
---|
596 | var i, j int
|
---|
597 | var last rune
|
---|
598 |
|
---|
599 | //
|
---|
600 | // Find and eliminate overlapping or abutting ranges
|
---|
601 | //
|
---|
602 |
|
---|
603 | if len(c.ranges) > 1 {
|
---|
604 | sort.Sort(singleRangeSorter(c.ranges))
|
---|
605 |
|
---|
606 | done := false
|
---|
607 |
|
---|
608 | for i, j = 1, 0; ; i++ {
|
---|
609 | for last = c.ranges[j].last; ; i++ {
|
---|
610 | if i == len(c.ranges) || last == utf8.MaxRune {
|
---|
611 | done = true
|
---|
612 | break
|
---|
613 | }
|
---|
614 |
|
---|
615 | CurrentRange := c.ranges[i]
|
---|
616 | if CurrentRange.first > last+1 {
|
---|
617 | break
|
---|
618 | }
|
---|
619 |
|
---|
620 | if last < CurrentRange.last {
|
---|
621 | last = CurrentRange.last
|
---|
622 | }
|
---|
623 | }
|
---|
624 |
|
---|
625 | c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
|
---|
626 |
|
---|
627 | j++
|
---|
628 |
|
---|
629 | if done {
|
---|
630 | break
|
---|
631 | }
|
---|
632 |
|
---|
633 | if j < i {
|
---|
634 | c.ranges[j] = c.ranges[i]
|
---|
635 | }
|
---|
636 | }
|
---|
637 |
|
---|
638 | c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
|
---|
639 | }
|
---|
640 | }
|
---|
641 |
|
---|
642 | // Adds to the class any lowercase versions of characters already
|
---|
643 | // in the class. Used for case-insensitivity.
|
---|
644 | func (c *CharSet) addLowercase() {
|
---|
645 | if c.anything {
|
---|
646 | return
|
---|
647 | }
|
---|
648 | toAdd := []singleRange{}
|
---|
649 | for i := 0; i < len(c.ranges); i++ {
|
---|
650 | r := c.ranges[i]
|
---|
651 | if r.first == r.last {
|
---|
652 | lower := unicode.ToLower(r.first)
|
---|
653 | c.ranges[i] = singleRange{first: lower, last: lower}
|
---|
654 | } else {
|
---|
655 | toAdd = append(toAdd, r)
|
---|
656 | }
|
---|
657 | }
|
---|
658 |
|
---|
659 | for _, r := range toAdd {
|
---|
660 | c.addLowercaseRange(r.first, r.last)
|
---|
661 | }
|
---|
662 | c.canonicalize()
|
---|
663 | }
|
---|
664 |
|
---|
665 | /**************************************************************************
|
---|
666 | Let U be the set of Unicode character values and let L be the lowercase
|
---|
667 | function, mapping from U to U. To perform case insensitive matching of
|
---|
668 | character sets, we need to be able to map an interval I in U, say
|
---|
669 |
|
---|
670 | I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
|
---|
671 |
|
---|
672 | to a set A such that A contains L(I) and A is contained in the union of
|
---|
673 | I and L(I).
|
---|
674 |
|
---|
675 | The table below partitions U into intervals on which L is non-decreasing.
|
---|
676 | Thus, for any interval J = [a, b] contained in one of these intervals,
|
---|
677 | L(J) is contained in [L(a), L(b)].
|
---|
678 |
|
---|
679 | It is also true that for any such J, [L(a), L(b)] is contained in the
|
---|
680 | union of J and L(J). This does not follow from L being non-decreasing on
|
---|
681 | these intervals. It follows from the nature of the L on each interval.
|
---|
682 | On each interval, L has one of the following forms:
|
---|
683 |
|
---|
684 | (1) L(ch) = constant (LowercaseSet)
|
---|
685 | (2) L(ch) = ch + offset (LowercaseAdd)
|
---|
686 | (3) L(ch) = ch | 1 (LowercaseBor)
|
---|
687 | (4) L(ch) = ch + (ch & 1) (LowercaseBad)
|
---|
688 |
|
---|
689 | It is easy to verify that for any of these forms [L(a), L(b)] is
|
---|
690 | contained in the union of [a, b] and L([a, b]).
|
---|
691 | ***************************************************************************/
|
---|
692 |
|
---|
693 | const (
|
---|
694 | LowercaseSet = 0 // Set to arg.
|
---|
695 | LowercaseAdd = 1 // Add arg.
|
---|
696 | LowercaseBor = 2 // Bitwise or with 1.
|
---|
697 | LowercaseBad = 3 // Bitwise and with 1 and add original.
|
---|
698 | )
|
---|
699 |
|
---|
700 | type lcMap struct {
|
---|
701 | chMin, chMax rune
|
---|
702 | op, data int32
|
---|
703 | }
|
---|
704 |
|
---|
705 | var lcTable = []lcMap{
|
---|
706 | lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
|
---|
707 | lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
|
---|
708 | lcMap{'\u0100', '\u012E', LowercaseBor, 0},
|
---|
709 | lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
|
---|
710 | lcMap{'\u0132', '\u0136', LowercaseBor, 0},
|
---|
711 | lcMap{'\u0139', '\u0147', LowercaseBad, 0},
|
---|
712 | lcMap{'\u014A', '\u0176', LowercaseBor, 0},
|
---|
713 | lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
|
---|
714 | lcMap{'\u0179', '\u017D', LowercaseBad, 0},
|
---|
715 | lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
|
---|
716 | lcMap{'\u0182', '\u0184', LowercaseBor, 0},
|
---|
717 | lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
|
---|
718 | lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
|
---|
719 | lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
|
---|
720 | lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
|
---|
721 | lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
|
---|
722 | lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
|
---|
723 | lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
|
---|
724 | lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
|
---|
725 | lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
|
---|
726 | lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
|
---|
727 | lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
|
---|
728 | lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
|
---|
729 | lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
|
---|
730 | lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
|
---|
731 | lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
|
---|
732 | lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
|
---|
733 | lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
|
---|
734 | lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
|
---|
735 | lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
|
---|
736 | lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
|
---|
737 | lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
|
---|
738 | lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
|
---|
739 | lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
|
---|
740 | lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
|
---|
741 | lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
|
---|
742 | lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
|
---|
743 | lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
|
---|
744 | lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
|
---|
745 | lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
|
---|
746 | lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
|
---|
747 | lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
|
---|
748 | lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
|
---|
749 | lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
|
---|
750 | lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
|
---|
751 | lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
|
---|
752 | lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
|
---|
753 | lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
|
---|
754 | lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
|
---|
755 | lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
|
---|
756 | lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
|
---|
757 | lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
|
---|
758 | lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
|
---|
759 | lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
|
---|
760 | lcMap{'\u0460', '\u0480', LowercaseBor, 0},
|
---|
761 | lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
|
---|
762 | lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
|
---|
763 | lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
|
---|
764 | lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
|
---|
765 | lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
|
---|
766 | lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
|
---|
767 | lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
|
---|
768 | lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
|
---|
769 | lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
|
---|
770 | lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
|
---|
771 | lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
|
---|
772 | lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
|
---|
773 | lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
|
---|
774 | lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
|
---|
775 | lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
|
---|
776 | lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
|
---|
777 | lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
|
---|
778 | lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
|
---|
779 | lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
|
---|
780 | lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
|
---|
781 | lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
|
---|
782 | lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
|
---|
783 | lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
|
---|
784 | lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
|
---|
785 | lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
|
---|
786 | lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
|
---|
787 | lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
|
---|
788 | lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
|
---|
789 | lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
|
---|
790 | lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
|
---|
791 | lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
|
---|
792 | lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
|
---|
793 | lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
|
---|
794 | lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
|
---|
795 | lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
|
---|
796 | lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
|
---|
797 | lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
|
---|
798 | lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
|
---|
799 | lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
|
---|
800 | }
|
---|
801 |
|
---|
802 | func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
|
---|
803 | var i, iMax, iMid int
|
---|
804 | var chMinT, chMaxT rune
|
---|
805 | var lc lcMap
|
---|
806 |
|
---|
807 | for i, iMax = 0, len(lcTable); i < iMax; {
|
---|
808 | iMid = (i + iMax) / 2
|
---|
809 | if lcTable[iMid].chMax < chMin {
|
---|
810 | i = iMid + 1
|
---|
811 | } else {
|
---|
812 | iMax = iMid
|
---|
813 | }
|
---|
814 | }
|
---|
815 |
|
---|
816 | for ; i < len(lcTable); i++ {
|
---|
817 | lc = lcTable[i]
|
---|
818 | if lc.chMin > chMax {
|
---|
819 | return
|
---|
820 | }
|
---|
821 | chMinT = lc.chMin
|
---|
822 | if chMinT < chMin {
|
---|
823 | chMinT = chMin
|
---|
824 | }
|
---|
825 |
|
---|
826 | chMaxT = lc.chMax
|
---|
827 | if chMaxT > chMax {
|
---|
828 | chMaxT = chMax
|
---|
829 | }
|
---|
830 |
|
---|
831 | switch lc.op {
|
---|
832 | case LowercaseSet:
|
---|
833 | chMinT = rune(lc.data)
|
---|
834 | chMaxT = rune(lc.data)
|
---|
835 | break
|
---|
836 | case LowercaseAdd:
|
---|
837 | chMinT += lc.data
|
---|
838 | chMaxT += lc.data
|
---|
839 | break
|
---|
840 | case LowercaseBor:
|
---|
841 | chMinT |= 1
|
---|
842 | chMaxT |= 1
|
---|
843 | break
|
---|
844 | case LowercaseBad:
|
---|
845 | chMinT += (chMinT & 1)
|
---|
846 | chMaxT += (chMaxT & 1)
|
---|
847 | break
|
---|
848 | }
|
---|
849 |
|
---|
850 | if chMinT < chMin || chMaxT > chMax {
|
---|
851 | c.addRange(chMinT, chMaxT)
|
---|
852 | }
|
---|
853 | }
|
---|
854 | }
|
---|