Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: code/trunk/vendor/github.com/dlclark/regexp2/syntax/charclass.go@ 67

Last change on this file since 67 was 67, checked in by Izuru Yakumo, 23 months ago

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@…>

File size: 22.3 KB

Line
1	package syntax
2
3	import (
4	"bytes"
5	"encoding/binary"
6	"fmt"
7	"sort"
8	"unicode"
9	"unicode/utf8"
10	)
11
12	// CharSet combines start-end rune ranges and unicode categories representing a set of characters
13	type CharSet struct {
14	ranges []singleRange
15	categories []category
16	sub *CharSet //optional subtractor
17	negate bool
18	anything bool
19	}
20
21	type category struct {
22	negate bool
23	cat string
24	}
25
26	type singleRange struct {
27	first rune
28	last rune
29	}
30
31	const (
32	spaceCategoryText = " "
33	wordCategoryText = "W"
34	)
35
36	var (
37	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
38	ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
39	ecmaDigit = []rune{0x0030, 0x003a}
40	)
41
42	var (
43	AnyClass = getCharSetFromOldString([]rune{0}, false)
44	ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
45	NoneClass = getCharSetFromOldString(nil, false)
46	ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
47	NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
48	ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
49	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
50	ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
51	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
52
53	WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
54	NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
55	SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
56	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
57	DigitClass = getCharSetFromCategoryString(false, false, "Nd")
58	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
59	)
60
61	var unicodeCategories = func() map[string]*unicode.RangeTable {
62	retVal := make(map[string]*unicode.RangeTable)
63	for k, v := range unicode.Scripts {
64	retVal[k] = v
65	}
66	for k, v := range unicode.Categories {
67	retVal[k] = v
68	}
69	for k, v := range unicode.Properties {
70	retVal[k] = v
71	}
72	return retVal
73	}()
74
75	func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
76	if negateCat && negateSet {
77	panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
78	}
79
80	c := CharSet{negate: negateSet}
81
82	c.categories = make([]category, len(cats))
83	for i, cat := range cats {
84	c.categories[i] = category{cat: cat, negate: negateCat}
85	}
86	return func() *CharSet {
87	//make a copy each time
88	local := c
89	//return that address
90	return &local
91	}
92	}
93
94	func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
95	c := CharSet{}
96	if len(setText) > 0 {
97	fillFirst := false
98	l := len(setText)
99	if negate {
100	if setText[0] == 0 {
101	setText = setText[1:]
102	} else {
103	l++
104	fillFirst = true
105	}
106	}
107
108	if l%2 == 0 {
109	c.ranges = make([]singleRange, l/2)
110	} else {
111	c.ranges = make([]singleRange, l/2+1)
112	}
113
114	first := true
115	if fillFirst {
116	c.ranges[0] = singleRange{first: 0}
117	first = false
118	}
119
120	i := 0
121	for _, r := range setText {
122	if first {
123	// lower bound in a new range
124	c.ranges[i] = singleRange{first: r}
125	first = false
126	} else {
127	c.ranges[i].last = r - 1
128	i++
129	first = true
130	}
131	}
132	if !first {
133	c.ranges[i].last = utf8.MaxRune
134	}
135	}
136
137	return func() *CharSet {
138	local := c
139	return &local
140	}
141	}
142
143	// Copy makes a deep copy to prevent accidental mutation of a set
144	func (c CharSet) Copy() CharSet {
145	ret := CharSet{
146	anything: c.anything,
147	negate: c.negate,
148	}
149
150	ret.ranges = append(ret.ranges, c.ranges...)
151	ret.categories = append(ret.categories, c.categories...)
152
153	if c.sub != nil {
154	sub := c.sub.Copy()
155	ret.sub = &sub
156	}
157
158	return ret
159	}
160
161	// gets a human-readable description for a set string
162	func (c CharSet) String() string {
163	buf := &bytes.Buffer{}
164	buf.WriteRune('[')
165
166	if c.IsNegated() {
167	buf.WriteRune('^')
168	}
169
170	for _, r := range c.ranges {
171
172	buf.WriteString(CharDescription(r.first))
173	if r.first != r.last {
174	if r.last-r.first != 1 {
175	//groups that are 1 char apart skip the dash
176	buf.WriteRune('-')
177	}
178	buf.WriteString(CharDescription(r.last))
179	}
180	}
181
182	for _, c := range c.categories {
183	buf.WriteString(c.String())
184	}
185
186	if c.sub != nil {
187	buf.WriteRune('-')
188	buf.WriteString(c.sub.String())
189	}
190
191	buf.WriteRune(']')
192
193	return buf.String()
194	}
195
196	// mapHashFill converts a charset into a buffer for use in maps
197	func (c CharSet) mapHashFill(buf *bytes.Buffer) {
198	if c.negate {
199	buf.WriteByte(0)
200	} else {
201	buf.WriteByte(1)
202	}
203
204	binary.Write(buf, binary.LittleEndian, len(c.ranges))
205	binary.Write(buf, binary.LittleEndian, len(c.categories))
206	for _, r := range c.ranges {
207	buf.WriteRune(r.first)
208	buf.WriteRune(r.last)
209	}
210	for _, ct := range c.categories {
211	buf.WriteString(ct.cat)
212	if ct.negate {
213	buf.WriteByte(1)
214	} else {
215	buf.WriteByte(0)
216	}
217	}
218
219	if c.sub != nil {
220	c.sub.mapHashFill(buf)
221	}
222	}
223
224	// CharIn returns true if the rune is in our character set (either ranges or categories).
225	// It handles negations and subtracted sub-charsets.
226	func (c CharSet) CharIn(ch rune) bool {
227	val := false
228	// in s && !s.subtracted
229
230	//check ranges
231	for _, r := range c.ranges {
232	if ch < r.first {
233	continue
234	}
235	if ch <= r.last {
236	val = true
237	break
238	}
239	}
240
241	//check categories if we haven't already found a range
242	if !val && len(c.categories) > 0 {
243	for _, ct := range c.categories {
244	// special categories...then unicode
245	if ct.cat == spaceCategoryText {
246	if unicode.IsSpace(ch) {
247	// we found a space so we're done
248	// negate means this is a "bad" thing
249	val = !ct.negate
250	break
251	} else if ct.negate {
252	val = true
253	break
254	}
255	} else if ct.cat == wordCategoryText {
256	if IsWordChar(ch) {
257	val = !ct.negate
258	break
259	} else if ct.negate {
260	val = true
261	break
262	}
263	} else if unicode.Is(unicodeCategories[ct.cat], ch) {
264	// if we're in this unicode category then we're done
265	// if negate=true on this category then we "failed" our test
266	// otherwise we're good that we found it
267	val = !ct.negate
268	break
269	} else if ct.negate {
270	val = true
271	break
272	}
273	}
274	}
275
276	// negate the whole char set
277	if c.negate {
278	val = !val
279	}
280
281	// get subtracted recurse
282	if val && c.sub != nil {
283	val = !c.sub.CharIn(ch)
284	}
285
286	//log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
287	return val
288	}
289
290	func (c category) String() string {
291	switch c.cat {
292	case spaceCategoryText:
293	if c.negate {
294	return "\\S"
295	}
296	return "\\s"
297	case wordCategoryText:
298	if c.negate {
299	return "\\W"
300	}
301	return "\\w"
302	}
303	if _, ok := unicodeCategories[c.cat]; ok {
304
305	if c.negate {
306	return "\\P{" + c.cat + "}"
307	}
308	return "\\p{" + c.cat + "}"
309	}
310	return "Unknown category: " + c.cat
311	}
312
313	// CharDescription Produces a human-readable description for a single character.
314	func CharDescription(ch rune) string {
315	/*if ch == '\\' {
316	return "\\\\"
317	}
318
319	if ch > ' ' && ch <= '~' {
320	return string(ch)
321	} else if ch == '\n' {
322	return "\\n"
323	} else if ch == ' ' {
324	return "\\ "
325	}*/
326
327	b := &bytes.Buffer{}
328	escape(b, ch, false) //fmt.Sprintf("%U", ch)
329	return b.String()
330	}
331
332	// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
333	// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
334	// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
335	// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
336	func IsWordChar(r rune) bool {
337	//"L", "Mn", "Nd", "Pc"
338	return unicode.In(r,
339	unicode.Categories["L"], unicode.Categories["Mn"],
340	unicode.Categories["Nd"], unicode.Categories["Pc"]) \|\| r == '\u200D' \|\| r == '\u200C'
341	//return 'A' <= r && r <= 'Z' \|\| 'a' <= r && r <= 'z' \|\| '0' <= r && r <= '9' \|\| r == '_'
342	}
343
344	func IsECMAWordChar(r rune) bool {
345	return unicode.In(r,
346	unicode.Categories["L"], unicode.Categories["Mn"],
347	unicode.Categories["Nd"], unicode.Categories["Pc"])
348
349	//return 'A' <= r && r <= 'Z' \|\| 'a' <= r && r <= 'z' \|\| '0' <= r && r <= '9' \|\| r == '_'
350	}
351
352	// SingletonChar will return the char from the first range without validation.
353	// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
354	func (c CharSet) SingletonChar() rune {
355	return c.ranges[0].first
356	}
357
358	func (c CharSet) IsSingleton() bool {
359	return !c.negate && //negated is multiple chars
360	len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
361	c.sub == nil && // subtraction means we've got multiple chars
362	c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
363	}
364
365	func (c CharSet) IsSingletonInverse() bool {
366	return c.negate && //same as above, but requires negated
367	len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
368	c.sub == nil && // subtraction means we've got multiple chars
369	c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
370	}
371
372	func (c CharSet) IsMergeable() bool {
373	return !c.IsNegated() && !c.HasSubtraction()
374	}
375
376	func (c CharSet) IsNegated() bool {
377	return c.negate
378	}
379
380	func (c CharSet) HasSubtraction() bool {
381	return c.sub != nil
382	}
383
384	func (c CharSet) IsEmpty() bool {
385	return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
386	}
387
388	func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
389	if ecma {
390	if negate {
391	c.addRanges(NotECMADigitClass().ranges)
392	} else {
393	c.addRanges(ECMADigitClass().ranges)
394	}
395	} else {
396	c.addCategories(category{cat: "Nd", negate: negate})
397	}
398	}
399
400	func (c *CharSet) addChar(ch rune) {
401	c.addRange(ch, ch)
402	}
403
404	func (c *CharSet) addSpace(ecma, negate bool) {
405	if ecma {
406	if negate {
407	c.addRanges(NotECMASpaceClass().ranges)
408	} else {
409	c.addRanges(ECMASpaceClass().ranges)
410	}
411	} else {
412	c.addCategories(category{cat: spaceCategoryText, negate: negate})
413	}
414	}
415
416	func (c *CharSet) addWord(ecma, negate bool) {
417	if ecma {
418	if negate {
419	c.addRanges(NotECMAWordClass().ranges)
420	} else {
421	c.addRanges(ECMAWordClass().ranges)
422	}
423	} else {
424	c.addCategories(category{cat: wordCategoryText, negate: negate})
425	}
426	}
427
428	// Add set ranges and categories into ours -- no deduping or anything
429	func (c *CharSet) addSet(set CharSet) {
430	if c.anything {
431	return
432	}
433	if set.anything {
434	c.makeAnything()
435	return
436	}
437	// just append here to prevent double-canon
438	c.ranges = append(c.ranges, set.ranges...)
439	c.addCategories(set.categories...)
440	c.canonicalize()
441	}
442
443	func (c *CharSet) makeAnything() {
444	c.anything = true
445	c.categories = []category{}
446	c.ranges = AnyClass().ranges
447	}
448
449	func (c *CharSet) addCategories(cats ...category) {
450	// don't add dupes and remove positive+negative
451	if c.anything {
452	// if we've had a previous positive+negative group then
453	// just return, we're as broad as we can get
454	return
455	}
456
457	for _, ct := range cats {
458	found := false
459	for _, ct2 := range c.categories {
460	if ct.cat == ct2.cat {
461	if ct.negate != ct2.negate {
462	// oposite negations...this mean we just
463	// take us as anything and move on
464	c.makeAnything()
465	return
466	}
467	found = true
468	break
469	}
470	}
471
472	if !found {
473	c.categories = append(c.categories, ct)
474	}
475	}
476	}
477
478	// Merges new ranges to our own
479	func (c *CharSet) addRanges(ranges []singleRange) {
480	if c.anything {
481	return
482	}
483	c.ranges = append(c.ranges, ranges...)
484	c.canonicalize()
485	}
486
487	// Merges everything but the new ranges into our own
488	func (c *CharSet) addNegativeRanges(ranges []singleRange) {
489	if c.anything {
490	return
491	}
492
493	var hi rune
494
495	// convert incoming ranges into opposites, assume they are in order
496	for _, r := range ranges {
497	if hi < r.first {
498	c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
499	}
500	hi = r.last + 1
501	}
502
503	if hi < utf8.MaxRune {
504	c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
505	}
506
507	c.canonicalize()
508	}
509
510	func isValidUnicodeCat(catName string) bool {
511	_, ok := unicodeCategories[catName]
512	return ok
513	}
514
515	func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
516	if !isValidUnicodeCat(categoryName) {
517	// unknown unicode category, script, or property "blah"
518	panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
519
520	}
521
522	if caseInsensitive && (categoryName == "Ll" \|\| categoryName == "Lu" \|\| categoryName == "Lt") {
523	// when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
524	c.addCategories(
525	category{cat: "Ll", negate: negate},
526	category{cat: "Lu", negate: negate},
527	category{cat: "Lt", negate: negate})
528	}
529	c.addCategories(category{cat: categoryName, negate: negate})
530	}
531
532	func (c CharSet) addSubtraction(sub CharSet) {
533	c.sub = sub
534	}
535
536	func (c *CharSet) addRange(chMin, chMax rune) {
537	c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
538	c.canonicalize()
539	}
540
541	func (c *CharSet) addNamedASCII(name string, negate bool) bool {
542	var rs []singleRange
543
544	switch name {
545	case "alnum":
546	rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
547	case "alpha":
548	rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
549	case "ascii":
550	rs = []singleRange{singleRange{0, 0x7f}}
551	case "blank":
552	rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
553	case "cntrl":
554	rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
555	case "digit":
556	c.addDigit(false, negate, "")
557	case "graph":
558	rs = []singleRange{singleRange{'!', '~'}}
559	case "lower":
560	rs = []singleRange{singleRange{'a', 'z'}}
561	case "print":
562	rs = []singleRange{singleRange{' ', '~'}}
563	case "punct": //[!-/:-@[-`{-~]
564	rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
565	case "space":
566	c.addSpace(true, negate)
567	case "upper":
568	rs = []singleRange{singleRange{'A', 'Z'}}
569	case "word":
570	c.addWord(true, negate)
571	case "xdigit":
572	rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
573	default:
574	return false
575	}
576
577	if len(rs) > 0 {
578	if negate {
579	c.addNegativeRanges(rs)
580	} else {
581	c.addRanges(rs)
582	}
583	}
584
585	return true
586	}
587
588	type singleRangeSorter []singleRange
589
590	func (p singleRangeSorter) Len() int { return len(p) }
591	func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
592	func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
593
594	// Logic to reduce a character class to a unique, sorted form.
595	func (c *CharSet) canonicalize() {
596	var i, j int
597	var last rune
598
599	//
600	// Find and eliminate overlapping or abutting ranges
601	//
602
603	if len(c.ranges) > 1 {
604	sort.Sort(singleRangeSorter(c.ranges))
605
606	done := false
607
608	for i, j = 1, 0; ; i++ {
609	for last = c.ranges[j].last; ; i++ {
610	if i == len(c.ranges) \|\| last == utf8.MaxRune {
611	done = true
612	break
613	}
614
615	CurrentRange := c.ranges[i]
616	if CurrentRange.first > last+1 {
617	break
618	}
619
620	if last < CurrentRange.last {
621	last = CurrentRange.last
622	}
623	}
624
625	c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
626
627	j++
628
629	if done {
630	break
631	}
632
633	if j < i {
634	c.ranges[j] = c.ranges[i]
635	}
636	}
637
638	c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
639	}
640	}
641
642	// Adds to the class any lowercase versions of characters already
643	// in the class. Used for case-insensitivity.
644	func (c *CharSet) addLowercase() {
645	if c.anything {
646	return
647	}
648	toAdd := []singleRange{}
649	for i := 0; i < len(c.ranges); i++ {
650	r := c.ranges[i]
651	if r.first == r.last {
652	lower := unicode.ToLower(r.first)
653	c.ranges[i] = singleRange{first: lower, last: lower}
654	} else {
655	toAdd = append(toAdd, r)
656	}
657	}
658
659	for _, r := range toAdd {
660	c.addLowercaseRange(r.first, r.last)
661	}
662	c.canonicalize()
663	}
664
665	/**************************************************************************
666	Let U be the set of Unicode character values and let L be the lowercase
667	function, mapping from U to U. To perform case insensitive matching of
668	character sets, we need to be able to map an interval I in U, say
669
670	I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
671
672	to a set A such that A contains L(I) and A is contained in the union of
673	I and L(I).
674
675	The table below partitions U into intervals on which L is non-decreasing.
676	Thus, for any interval J = [a, b] contained in one of these intervals,
677	L(J) is contained in [L(a), L(b)].
678
679	It is also true that for any such J, [L(a), L(b)] is contained in the
680	union of J and L(J). This does not follow from L being non-decreasing on
681	these intervals. It follows from the nature of the L on each interval.
682	On each interval, L has one of the following forms:
683
684	(1) L(ch) = constant (LowercaseSet)
685	(2) L(ch) = ch + offset (LowercaseAdd)
686	(3) L(ch) = ch \| 1 (LowercaseBor)
687	(4) L(ch) = ch + (ch & 1) (LowercaseBad)
688
689	It is easy to verify that for any of these forms [L(a), L(b)] is
690	contained in the union of [a, b] and L([a, b]).
691	***************************************************************************/
692
693	const (
694	LowercaseSet = 0 // Set to arg.
695	LowercaseAdd = 1 // Add arg.
696	LowercaseBor = 2 // Bitwise or with 1.
697	LowercaseBad = 3 // Bitwise and with 1 and add original.
698	)
699
700	type lcMap struct {
701	chMin, chMax rune
702	op, data int32
703	}
704
705	var lcTable = []lcMap{
706	lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
707	lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
708	lcMap{'\u0100', '\u012E', LowercaseBor, 0},
709	lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
710	lcMap{'\u0132', '\u0136', LowercaseBor, 0},
711	lcMap{'\u0139', '\u0147', LowercaseBad, 0},
712	lcMap{'\u014A', '\u0176', LowercaseBor, 0},
713	lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
714	lcMap{'\u0179', '\u017D', LowercaseBad, 0},
715	lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
716	lcMap{'\u0182', '\u0184', LowercaseBor, 0},
717	lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
718	lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
719	lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
720	lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
721	lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
722	lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
723	lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
724	lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
725	lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
726	lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
727	lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
728	lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
729	lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
730	lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
731	lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
732	lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
733	lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
734	lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
735	lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
736	lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
737	lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
738	lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
739	lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
740	lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
741	lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
742	lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
743	lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
744	lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
745	lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
746	lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
747	lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
748	lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
749	lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
750	lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
751	lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
752	lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
753	lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
754	lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
755	lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
756	lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
757	lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
758	lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
759	lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
760	lcMap{'\u0460', '\u0480', LowercaseBor, 0},
761	lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
762	lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
763	lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
764	lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
765	lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
766	lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
767	lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
768	lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
769	lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
770	lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
771	lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
772	lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
773	lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
774	lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
775	lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
776	lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
777	lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
778	lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
779	lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
780	lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
781	lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
782	lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
783	lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
784	lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
785	lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
786	lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
787	lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
788	lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
789	lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
790	lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
791	lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
792	lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
793	lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
794	lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
795	lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
796	lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
797	lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
798	lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
799	lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
800	}
801
802	func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
803	var i, iMax, iMid int
804	var chMinT, chMaxT rune
805	var lc lcMap
806
807	for i, iMax = 0, len(lcTable); i < iMax; {
808	iMid = (i + iMax) / 2
809	if lcTable[iMid].chMax < chMin {
810	i = iMid + 1
811	} else {
812	iMax = iMid
813	}
814	}
815
816	for ; i < len(lcTable); i++ {
817	lc = lcTable[i]
818	if lc.chMin > chMax {
819	return
820	}
821	chMinT = lc.chMin
822	if chMinT < chMin {
823	chMinT = chMin
824	}
825
826	chMaxT = lc.chMax
827	if chMaxT > chMax {
828	chMaxT = chMax
829	}
830
831	switch lc.op {
832	case LowercaseSet:
833	chMinT = rune(lc.data)
834	chMaxT = rune(lc.data)
835	break
836	case LowercaseAdd:
837	chMinT += lc.data
838	chMaxT += lc.data
839	break
840	case LowercaseBor:
841	chMinT \|= 1
842	chMaxT \|= 1
843	break
844	case LowercaseBad:
845	chMinT += (chMinT & 1)
846	chMaxT += (chMaxT & 1)
847	break
848	}
849
850	if chMinT < chMin \|\| chMaxT > chMax {
851	c.addRange(chMinT, chMaxT)
852	}
853	}
854	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: