/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp package but uses a
more full-featured regex engine behind the scenes.

It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
need to write very complex patterns or require compatibility with .NET.
*/
|
---|
9 | package regexp2
|
---|
10 |
|
---|
11 | import (
|
---|
12 | "errors"
|
---|
13 | "math"
|
---|
14 | "strconv"
|
---|
15 | "sync"
|
---|
16 | "time"
|
---|
17 |
|
---|
18 | "github.com/dlclark/regexp2/syntax"
|
---|
19 | )
|
---|
20 |
|
---|
// DefaultMatchTimeout is the timeout Compile assigns to the MatchTimeout
// field of every new Regexp. It is the largest representable time.Duration,
// which in practice means "forever".
var DefaultMatchTimeout time.Duration = math.MaxInt64
|
---|
23 |
|
---|
24 | // Regexp is the representation of a compiled regular expression.
|
---|
25 | // A Regexp is safe for concurrent use by multiple goroutines.
|
---|
26 | type Regexp struct {
|
---|
27 | //timeout when trying to find matches
|
---|
28 | MatchTimeout time.Duration
|
---|
29 |
|
---|
30 | // read-only after Compile
|
---|
31 | pattern string // as passed to Compile
|
---|
32 | options RegexOptions // options
|
---|
33 |
|
---|
34 | caps map[int]int // capnum->index
|
---|
35 | capnames map[string]int //capture group name -> index
|
---|
36 | capslist []string //sorted list of capture group names
|
---|
37 | capsize int // size of the capture array
|
---|
38 |
|
---|
39 | code *syntax.Code // compiled program
|
---|
40 |
|
---|
41 | // cache of machines for running regexp
|
---|
42 | muRun sync.Mutex
|
---|
43 | runner []*runner
|
---|
44 | }
|
---|
45 |
|
---|
46 | // Compile parses a regular expression and returns, if successful,
|
---|
47 | // a Regexp object that can be used to match against text.
|
---|
48 | func Compile(expr string, opt RegexOptions) (*Regexp, error) {
|
---|
49 | // parse it
|
---|
50 | tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
|
---|
51 | if err != nil {
|
---|
52 | return nil, err
|
---|
53 | }
|
---|
54 |
|
---|
55 | // translate it to code
|
---|
56 | code, err := syntax.Write(tree)
|
---|
57 | if err != nil {
|
---|
58 | return nil, err
|
---|
59 | }
|
---|
60 |
|
---|
61 | // return it
|
---|
62 | return &Regexp{
|
---|
63 | pattern: expr,
|
---|
64 | options: opt,
|
---|
65 | caps: code.Caps,
|
---|
66 | capnames: tree.Capnames,
|
---|
67 | capslist: tree.Caplist,
|
---|
68 | capsize: code.Capsize,
|
---|
69 | code: code,
|
---|
70 | MatchTimeout: DefaultMatchTimeout,
|
---|
71 | }, nil
|
---|
72 | }
|
---|
73 |
|
---|
74 | // MustCompile is like Compile but panics if the expression cannot be parsed.
|
---|
75 | // It simplifies safe initialization of global variables holding compiled regular
|
---|
76 | // expressions.
|
---|
77 | func MustCompile(str string, opt RegexOptions) *Regexp {
|
---|
78 | regexp, error := Compile(str, opt)
|
---|
79 | if error != nil {
|
---|
80 | panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
|
---|
81 | }
|
---|
82 | return regexp
|
---|
83 | }
|
---|
84 |
|
---|
85 | // Escape adds backslashes to any special characters in the input string
|
---|
86 | func Escape(input string) string {
|
---|
87 | return syntax.Escape(input)
|
---|
88 | }
|
---|
89 |
|
---|
90 | // Unescape removes any backslashes from previously-escaped special characters in the input string
|
---|
91 | func Unescape(input string) (string, error) {
|
---|
92 | return syntax.Unescape(input)
|
---|
93 | }
|
---|
94 |
|
---|
95 | // String returns the source text used to compile the regular expression.
|
---|
96 | func (re *Regexp) String() string {
|
---|
97 | return re.pattern
|
---|
98 | }
|
---|
99 |
|
---|
// quote renders s as a Go string literal; MustCompile uses it to show the
// offending pattern in its panic message. A raw (backquoted) literal is used
// whenever strconv.CanBackquote allows it, because that leaves regex
// backslashes readable; otherwise s is escaped with strconv.Quote.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
|
---|
106 |
|
---|
// RegexOptions impact the runtime and parsing behavior
// for each specific regex. They are settable in code as well
// as in the regex pattern itself.
type RegexOptions int32

// Option flags; combine them with bitwise OR. The quoted letter next to a
// flag is its inline spelling inside a pattern.
//
// Only None is declared with the RegexOptions type. The other flags are
// untyped integer constants, so they convert implicitly to RegexOptions
// (or to any other integer type) wherever they are used.
const (
	None                    RegexOptions = 0
	IgnoreCase                           = 1 << 0 // "i"
	Multiline                            = 1 << 1 // "m"
	ExplicitCapture                      = 1 << 2 // "n"
	Compiled                             = 1 << 3 // "c"
	Singleline                           = 1 << 4 // "s"
	IgnorePatternWhitespace              = 1 << 5 // "x"
	RightToLeft                          = 1 << 6 // "r"
	Debug                                = 1 << 7 // "d"
	ECMAScript                           = 1 << 8 // "e"
	RE2                                  = 1 << 9 // RE2 (regexp package) compatibility mode
)
|
---|
125 |
|
---|
126 | func (re *Regexp) RightToLeft() bool {
|
---|
127 | return re.options&RightToLeft != 0
|
---|
128 | }
|
---|
129 |
|
---|
130 | func (re *Regexp) Debug() bool {
|
---|
131 | return re.options&Debug != 0
|
---|
132 | }
|
---|
133 |
|
---|
134 | // Replace searches the input string and replaces each match found with the replacement text.
|
---|
135 | // Count will limit the number of matches attempted and startAt will allow
|
---|
136 | // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
---|
137 | // Set startAt and count to -1 to go through the whole string
|
---|
138 | func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
|
---|
139 | data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
|
---|
140 | if err != nil {
|
---|
141 | return "", err
|
---|
142 | }
|
---|
143 | //TODO: cache ReplacerData
|
---|
144 |
|
---|
145 | return replace(re, data, nil, input, startAt, count)
|
---|
146 | }
|
---|
147 |
|
---|
148 | // ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
|
---|
149 | // Count will limit the number of matches attempted and startAt will allow
|
---|
150 | // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
---|
151 | // Set startAt and count to -1 to go through the whole string.
|
---|
152 | func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
|
---|
153 | return replace(re, nil, evaluator, input, startAt, count)
|
---|
154 | }
|
---|
155 |
|
---|
156 | // FindStringMatch searches the input string for a Regexp match
|
---|
157 | func (re *Regexp) FindStringMatch(s string) (*Match, error) {
|
---|
158 | // convert string to runes
|
---|
159 | return re.run(false, -1, getRunes(s))
|
---|
160 | }
|
---|
161 |
|
---|
162 | // FindRunesMatch searches the input rune slice for a Regexp match
|
---|
163 | func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
|
---|
164 | return re.run(false, -1, r)
|
---|
165 | }
|
---|
166 |
|
---|
167 | // FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
|
---|
168 | func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
|
---|
169 | if startAt > len(s) {
|
---|
170 | return nil, errors.New("startAt must be less than the length of the input string")
|
---|
171 | }
|
---|
172 | r, startAt := re.getRunesAndStart(s, startAt)
|
---|
173 | if startAt == -1 {
|
---|
174 | // we didn't find our start index in the string -- that's a problem
|
---|
175 | return nil, errors.New("startAt must align to the start of a valid rune in the input string")
|
---|
176 | }
|
---|
177 |
|
---|
178 | return re.run(false, startAt, r)
|
---|
179 | }
|
---|
180 |
|
---|
181 | // FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
|
---|
182 | func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
|
---|
183 | return re.run(false, startAt, r)
|
---|
184 | }
|
---|
185 |
|
---|
186 | // FindNextMatch returns the next match in the same input string as the match parameter.
|
---|
187 | // Will return nil if there is no next match or if given a nil match.
|
---|
188 | func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
|
---|
189 | if m == nil {
|
---|
190 | return nil, nil
|
---|
191 | }
|
---|
192 |
|
---|
193 | // If previous match was empty, advance by one before matching to prevent
|
---|
194 | // infinite loop
|
---|
195 | startAt := m.textpos
|
---|
196 | if m.Length == 0 {
|
---|
197 | if m.textpos == len(m.text) {
|
---|
198 | return nil, nil
|
---|
199 | }
|
---|
200 |
|
---|
201 | if re.RightToLeft() {
|
---|
202 | startAt--
|
---|
203 | } else {
|
---|
204 | startAt++
|
---|
205 | }
|
---|
206 | }
|
---|
207 | return re.run(false, startAt, m.text)
|
---|
208 | }
|
---|
209 |
|
---|
210 | // MatchString return true if the string matches the regex
|
---|
211 | // error will be set if a timeout occurs
|
---|
212 | func (re *Regexp) MatchString(s string) (bool, error) {
|
---|
213 | m, err := re.run(true, -1, getRunes(s))
|
---|
214 | if err != nil {
|
---|
215 | return false, err
|
---|
216 | }
|
---|
217 | return m != nil, nil
|
---|
218 | }
|
---|
219 |
|
---|
220 | func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
|
---|
221 | if startAt < 0 {
|
---|
222 | if re.RightToLeft() {
|
---|
223 | r := getRunes(s)
|
---|
224 | return r, len(r)
|
---|
225 | }
|
---|
226 | return getRunes(s), 0
|
---|
227 | }
|
---|
228 | ret := make([]rune, len(s))
|
---|
229 | i := 0
|
---|
230 | runeIdx := -1
|
---|
231 | for strIdx, r := range s {
|
---|
232 | if strIdx == startAt {
|
---|
233 | runeIdx = i
|
---|
234 | }
|
---|
235 | ret[i] = r
|
---|
236 | i++
|
---|
237 | }
|
---|
238 | if startAt == len(s) {
|
---|
239 | runeIdx = i
|
---|
240 | }
|
---|
241 | return ret[:i], runeIdx
|
---|
242 | }
|
---|
243 |
|
---|
// getRunes converts s into a slice of its Unicode code points.
func getRunes(s string) []rune {
	runes := []rune(s)
	return runes
}
|
---|
247 |
|
---|
248 | // MatchRunes return true if the runes matches the regex
|
---|
249 | // error will be set if a timeout occurs
|
---|
250 | func (re *Regexp) MatchRunes(r []rune) (bool, error) {
|
---|
251 | m, err := re.run(true, -1, r)
|
---|
252 | if err != nil {
|
---|
253 | return false, err
|
---|
254 | }
|
---|
255 | return m != nil, nil
|
---|
256 | }
|
---|
257 |
|
---|
258 | // GetGroupNames Returns the set of strings used to name capturing groups in the expression.
|
---|
259 | func (re *Regexp) GetGroupNames() []string {
|
---|
260 | var result []string
|
---|
261 |
|
---|
262 | if re.capslist == nil {
|
---|
263 | result = make([]string, re.capsize)
|
---|
264 |
|
---|
265 | for i := 0; i < len(result); i++ {
|
---|
266 | result[i] = strconv.Itoa(i)
|
---|
267 | }
|
---|
268 | } else {
|
---|
269 | result = make([]string, len(re.capslist))
|
---|
270 | copy(result, re.capslist)
|
---|
271 | }
|
---|
272 |
|
---|
273 | return result
|
---|
274 | }
|
---|
275 |
|
---|
276 | // GetGroupNumbers returns the integer group numbers corresponding to a group name.
|
---|
277 | func (re *Regexp) GetGroupNumbers() []int {
|
---|
278 | var result []int
|
---|
279 |
|
---|
280 | if re.caps == nil {
|
---|
281 | result = make([]int, re.capsize)
|
---|
282 |
|
---|
283 | for i := 0; i < len(result); i++ {
|
---|
284 | result[i] = i
|
---|
285 | }
|
---|
286 | } else {
|
---|
287 | result = make([]int, len(re.caps))
|
---|
288 |
|
---|
289 | for k, v := range re.caps {
|
---|
290 | result[v] = k
|
---|
291 | }
|
---|
292 | }
|
---|
293 |
|
---|
294 | return result
|
---|
295 | }
|
---|
296 |
|
---|
297 | // GroupNameFromNumber retrieves a group name that corresponds to a group number.
|
---|
298 | // It will return "" for and unknown group number. Unnamed groups automatically
|
---|
299 | // receive a name that is the decimal string equivalent of its number.
|
---|
300 | func (re *Regexp) GroupNameFromNumber(i int) string {
|
---|
301 | if re.capslist == nil {
|
---|
302 | if i >= 0 && i < re.capsize {
|
---|
303 | return strconv.Itoa(i)
|
---|
304 | }
|
---|
305 |
|
---|
306 | return ""
|
---|
307 | }
|
---|
308 |
|
---|
309 | if re.caps != nil {
|
---|
310 | var ok bool
|
---|
311 | if i, ok = re.caps[i]; !ok {
|
---|
312 | return ""
|
---|
313 | }
|
---|
314 | }
|
---|
315 |
|
---|
316 | if i >= 0 && i < len(re.capslist) {
|
---|
317 | return re.capslist[i]
|
---|
318 | }
|
---|
319 |
|
---|
320 | return ""
|
---|
321 | }
|
---|
322 |
|
---|
323 | // GroupNumberFromName returns a group number that corresponds to a group name.
|
---|
324 | // Returns -1 if the name is not a recognized group name. Numbered groups
|
---|
325 | // automatically get a group name that is the decimal string equivalent of its number.
|
---|
326 | func (re *Regexp) GroupNumberFromName(name string) int {
|
---|
327 | // look up name if we have a hashtable of names
|
---|
328 | if re.capnames != nil {
|
---|
329 | if k, ok := re.capnames[name]; ok {
|
---|
330 | return k
|
---|
331 | }
|
---|
332 |
|
---|
333 | return -1
|
---|
334 | }
|
---|
335 |
|
---|
336 | // convert to an int if it looks like a number
|
---|
337 | result := 0
|
---|
338 | for i := 0; i < len(name); i++ {
|
---|
339 | ch := name[i]
|
---|
340 |
|
---|
341 | if ch > '9' || ch < '0' {
|
---|
342 | return -1
|
---|
343 | }
|
---|
344 |
|
---|
345 | result *= 10
|
---|
346 | result += int(ch - '0')
|
---|
347 | }
|
---|
348 |
|
---|
349 | // return int if it's in range
|
---|
350 | if result >= 0 && result < re.capsize {
|
---|
351 | return result
|
---|
352 | }
|
---|
353 |
|
---|
354 | return -1
|
---|
355 | }
|
---|