[67] | 1 | /*
|
---|
| 2 | Package regexp2 is a regexp package that has an interface similar to Go's framework regexp engine but uses a
|
---|
| 3 | more full-featured regex engine behind the scenes.
|
---|
| 4 |
|
---|
| 5 | It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
|
---|
| 6 | You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
|
---|
| 7 | need to write very complex patterns or require compatibility with .NET.
|
---|
| 8 | */
|
---|
| 9 | package regexp2
|
---|
| 10 |
|
---|
| 11 | import (
|
---|
| 12 | "errors"
|
---|
| 13 | "math"
|
---|
| 14 | "strconv"
|
---|
| 15 | "sync"
|
---|
| 16 | "time"
|
---|
| 17 |
|
---|
| 18 | "github.com/dlclark/regexp2/syntax"
|
---|
| 19 | )
|
---|
| 20 |
|
---|
// DefaultMatchTimeout is the timeout used when running regexp matches. It is
// the largest representable time.Duration, i.e. effectively "forever", and is
// the value Compile stores in the MatchTimeout field of each new Regexp.
var DefaultMatchTimeout = time.Duration(math.MaxInt64)
| 23 |
|
---|
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// MatchTimeout is the timeout applied when trying to find matches.
	// Compile initializes it to DefaultMatchTimeout.
	MatchTimeout time.Duration

	// read-only after Compile
	pattern string       // as passed to Compile
	options RegexOptions // options

	caps     map[int]int    // capnum->index
	capnames map[string]int // capture group name -> index
	capslist []string       // sorted list of capture group names
	capsize  int            // size of the capture array

	code *syntax.Code // compiled program

	// cache of machines for running regexp
	// NOTE(review): muRun presumably guards runner -- confirm against the
	// code that takes and returns runners.
	muRun  sync.Mutex
	runner []*runner
}
| 45 |
|
---|
| 46 | // Compile parses a regular expression and returns, if successful,
|
---|
| 47 | // a Regexp object that can be used to match against text.
|
---|
| 48 | func Compile(expr string, opt RegexOptions) (*Regexp, error) {
|
---|
| 49 | // parse it
|
---|
| 50 | tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
|
---|
| 51 | if err != nil {
|
---|
| 52 | return nil, err
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 | // translate it to code
|
---|
| 56 | code, err := syntax.Write(tree)
|
---|
| 57 | if err != nil {
|
---|
| 58 | return nil, err
|
---|
| 59 | }
|
---|
| 60 |
|
---|
| 61 | // return it
|
---|
| 62 | return &Regexp{
|
---|
| 63 | pattern: expr,
|
---|
| 64 | options: opt,
|
---|
| 65 | caps: code.Caps,
|
---|
| 66 | capnames: tree.Capnames,
|
---|
| 67 | capslist: tree.Caplist,
|
---|
| 68 | capsize: code.Capsize,
|
---|
| 69 | code: code,
|
---|
| 70 | MatchTimeout: DefaultMatchTimeout,
|
---|
| 71 | }, nil
|
---|
| 72 | }
|
---|
| 73 |
|
---|
| 74 | // MustCompile is like Compile but panics if the expression cannot be parsed.
|
---|
| 75 | // It simplifies safe initialization of global variables holding compiled regular
|
---|
| 76 | // expressions.
|
---|
| 77 | func MustCompile(str string, opt RegexOptions) *Regexp {
|
---|
| 78 | regexp, error := Compile(str, opt)
|
---|
| 79 | if error != nil {
|
---|
| 80 | panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
|
---|
| 81 | }
|
---|
| 82 | return regexp
|
---|
| 83 | }
|
---|
| 84 |
|
---|
| 85 | // Escape adds backslashes to any special characters in the input string
|
---|
| 86 | func Escape(input string) string {
|
---|
| 87 | return syntax.Escape(input)
|
---|
| 88 | }
|
---|
| 89 |
|
---|
| 90 | // Unescape removes any backslashes from previously-escaped special characters in the input string
|
---|
| 91 | func Unescape(input string) (string, error) {
|
---|
| 92 | return syntax.Unescape(input)
|
---|
| 93 | }
|
---|
| 94 |
|
---|
| 95 | // String returns the source text used to compile the regular expression.
|
---|
| 96 | func (re *Regexp) String() string {
|
---|
| 97 | return re.pattern
|
---|
| 98 | }
|
---|
| 99 |
|
---|
// quote renders s as a Go string literal: a raw back-quoted literal when
// strconv.CanBackquote allows it, otherwise the escaped double-quoted form
// produced by strconv.Quote.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
| 106 |
|
---|
// RegexOptions is a set of bit flags that impact the runtime and parsing
// behavior of one specific regex. Options are settable in code as well as in
// the regex pattern itself.
type RegexOptions int32

// Option flags. Each flag occupies its own bit, so flags are combined with
// bitwise OR. Only None is declared with the RegexOptions type; the remaining
// flags are untyped integer constants and take on RegexOptions wherever one
// is expected.
const (
	None RegexOptions = 0

	IgnoreCase              = 1 << 0 // "i"
	Multiline               = 1 << 1 // "m"
	ExplicitCapture         = 1 << 2 // "n"
	Compiled                = 1 << 3 // "c"
	Singleline              = 1 << 4 // "s"
	IgnorePatternWhitespace = 1 << 5 // "x"
	RightToLeft             = 1 << 6 // "r"
	Debug                   = 1 << 7 // "d"
	ECMAScript              = 1 << 8 // "e"
	RE2                     = 1 << 9 // RE2 (regexp package) compatibility mode
)
| 125 |
|
---|
| 126 | func (re *Regexp) RightToLeft() bool {
|
---|
| 127 | return re.options&RightToLeft != 0
|
---|
| 128 | }
|
---|
| 129 |
|
---|
| 130 | func (re *Regexp) Debug() bool {
|
---|
| 131 | return re.options&Debug != 0
|
---|
| 132 | }
|
---|
| 133 |
|
---|
| 134 | // Replace searches the input string and replaces each match found with the replacement text.
|
---|
| 135 | // Count will limit the number of matches attempted and startAt will allow
|
---|
| 136 | // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
---|
| 137 | // Set startAt and count to -1 to go through the whole string
|
---|
| 138 | func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
|
---|
| 139 | data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
|
---|
| 140 | if err != nil {
|
---|
| 141 | return "", err
|
---|
| 142 | }
|
---|
| 143 | //TODO: cache ReplacerData
|
---|
| 144 |
|
---|
| 145 | return replace(re, data, nil, input, startAt, count)
|
---|
| 146 | }
|
---|
| 147 |
|
---|
| 148 | // ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
|
---|
| 149 | // Count will limit the number of matches attempted and startAt will allow
|
---|
| 150 | // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
---|
| 151 | // Set startAt and count to -1 to go through the whole string.
|
---|
| 152 | func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
|
---|
| 153 | return replace(re, nil, evaluator, input, startAt, count)
|
---|
| 154 | }
|
---|
| 155 |
|
---|
| 156 | // FindStringMatch searches the input string for a Regexp match
|
---|
| 157 | func (re *Regexp) FindStringMatch(s string) (*Match, error) {
|
---|
| 158 | // convert string to runes
|
---|
| 159 | return re.run(false, -1, getRunes(s))
|
---|
| 160 | }
|
---|
| 161 |
|
---|
| 162 | // FindRunesMatch searches the input rune slice for a Regexp match
|
---|
| 163 | func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
|
---|
| 164 | return re.run(false, -1, r)
|
---|
| 165 | }
|
---|
| 166 |
|
---|
| 167 | // FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
|
---|
| 168 | func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
|
---|
| 169 | if startAt > len(s) {
|
---|
| 170 | return nil, errors.New("startAt must be less than the length of the input string")
|
---|
| 171 | }
|
---|
| 172 | r, startAt := re.getRunesAndStart(s, startAt)
|
---|
| 173 | if startAt == -1 {
|
---|
| 174 | // we didn't find our start index in the string -- that's a problem
|
---|
| 175 | return nil, errors.New("startAt must align to the start of a valid rune in the input string")
|
---|
| 176 | }
|
---|
| 177 |
|
---|
| 178 | return re.run(false, startAt, r)
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 | // FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
|
---|
| 182 | func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
|
---|
| 183 | return re.run(false, startAt, r)
|
---|
| 184 | }
|
---|
| 185 |
|
---|
| 186 | // FindNextMatch returns the next match in the same input string as the match parameter.
|
---|
| 187 | // Will return nil if there is no next match or if given a nil match.
|
---|
| 188 | func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
|
---|
| 189 | if m == nil {
|
---|
| 190 | return nil, nil
|
---|
| 191 | }
|
---|
| 192 |
|
---|
| 193 | // If previous match was empty, advance by one before matching to prevent
|
---|
| 194 | // infinite loop
|
---|
| 195 | startAt := m.textpos
|
---|
| 196 | if m.Length == 0 {
|
---|
| 197 | if m.textpos == len(m.text) {
|
---|
| 198 | return nil, nil
|
---|
| 199 | }
|
---|
| 200 |
|
---|
| 201 | if re.RightToLeft() {
|
---|
| 202 | startAt--
|
---|
| 203 | } else {
|
---|
| 204 | startAt++
|
---|
| 205 | }
|
---|
| 206 | }
|
---|
| 207 | return re.run(false, startAt, m.text)
|
---|
| 208 | }
|
---|
| 209 |
|
---|
| 210 | // MatchString return true if the string matches the regex
|
---|
| 211 | // error will be set if a timeout occurs
|
---|
| 212 | func (re *Regexp) MatchString(s string) (bool, error) {
|
---|
| 213 | m, err := re.run(true, -1, getRunes(s))
|
---|
| 214 | if err != nil {
|
---|
| 215 | return false, err
|
---|
| 216 | }
|
---|
| 217 | return m != nil, nil
|
---|
| 218 | }
|
---|
| 219 |
|
---|
| 220 | func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
|
---|
| 221 | if startAt < 0 {
|
---|
| 222 | if re.RightToLeft() {
|
---|
| 223 | r := getRunes(s)
|
---|
| 224 | return r, len(r)
|
---|
| 225 | }
|
---|
| 226 | return getRunes(s), 0
|
---|
| 227 | }
|
---|
| 228 | ret := make([]rune, len(s))
|
---|
| 229 | i := 0
|
---|
| 230 | runeIdx := -1
|
---|
| 231 | for strIdx, r := range s {
|
---|
| 232 | if strIdx == startAt {
|
---|
| 233 | runeIdx = i
|
---|
| 234 | }
|
---|
| 235 | ret[i] = r
|
---|
| 236 | i++
|
---|
| 237 | }
|
---|
| 238 | if startAt == len(s) {
|
---|
| 239 | runeIdx = i
|
---|
| 240 | }
|
---|
| 241 | return ret[:i], runeIdx
|
---|
| 242 | }
|
---|
| 243 |
|
---|
// getRunes converts s into a slice of its runes.
func getRunes(s string) []rune {
	runes := []rune(s)
	return runes
}
| 247 |
|
---|
| 248 | // MatchRunes return true if the runes matches the regex
|
---|
| 249 | // error will be set if a timeout occurs
|
---|
| 250 | func (re *Regexp) MatchRunes(r []rune) (bool, error) {
|
---|
| 251 | m, err := re.run(true, -1, r)
|
---|
| 252 | if err != nil {
|
---|
| 253 | return false, err
|
---|
| 254 | }
|
---|
| 255 | return m != nil, nil
|
---|
| 256 | }
|
---|
| 257 |
|
---|
| 258 | // GetGroupNames Returns the set of strings used to name capturing groups in the expression.
|
---|
| 259 | func (re *Regexp) GetGroupNames() []string {
|
---|
| 260 | var result []string
|
---|
| 261 |
|
---|
| 262 | if re.capslist == nil {
|
---|
| 263 | result = make([]string, re.capsize)
|
---|
| 264 |
|
---|
| 265 | for i := 0; i < len(result); i++ {
|
---|
| 266 | result[i] = strconv.Itoa(i)
|
---|
| 267 | }
|
---|
| 268 | } else {
|
---|
| 269 | result = make([]string, len(re.capslist))
|
---|
| 270 | copy(result, re.capslist)
|
---|
| 271 | }
|
---|
| 272 |
|
---|
| 273 | return result
|
---|
| 274 | }
|
---|
| 275 |
|
---|
| 276 | // GetGroupNumbers returns the integer group numbers corresponding to a group name.
|
---|
| 277 | func (re *Regexp) GetGroupNumbers() []int {
|
---|
| 278 | var result []int
|
---|
| 279 |
|
---|
| 280 | if re.caps == nil {
|
---|
| 281 | result = make([]int, re.capsize)
|
---|
| 282 |
|
---|
| 283 | for i := 0; i < len(result); i++ {
|
---|
| 284 | result[i] = i
|
---|
| 285 | }
|
---|
| 286 | } else {
|
---|
| 287 | result = make([]int, len(re.caps))
|
---|
| 288 |
|
---|
| 289 | for k, v := range re.caps {
|
---|
| 290 | result[v] = k
|
---|
| 291 | }
|
---|
| 292 | }
|
---|
| 293 |
|
---|
| 294 | return result
|
---|
| 295 | }
|
---|
| 296 |
|
---|
| 297 | // GroupNameFromNumber retrieves a group name that corresponds to a group number.
|
---|
| 298 | // It will return "" for and unknown group number. Unnamed groups automatically
|
---|
| 299 | // receive a name that is the decimal string equivalent of its number.
|
---|
| 300 | func (re *Regexp) GroupNameFromNumber(i int) string {
|
---|
| 301 | if re.capslist == nil {
|
---|
| 302 | if i >= 0 && i < re.capsize {
|
---|
| 303 | return strconv.Itoa(i)
|
---|
| 304 | }
|
---|
| 305 |
|
---|
| 306 | return ""
|
---|
| 307 | }
|
---|
| 308 |
|
---|
| 309 | if re.caps != nil {
|
---|
| 310 | var ok bool
|
---|
| 311 | if i, ok = re.caps[i]; !ok {
|
---|
| 312 | return ""
|
---|
| 313 | }
|
---|
| 314 | }
|
---|
| 315 |
|
---|
| 316 | if i >= 0 && i < len(re.capslist) {
|
---|
| 317 | return re.capslist[i]
|
---|
| 318 | }
|
---|
| 319 |
|
---|
| 320 | return ""
|
---|
| 321 | }
|
---|
| 322 |
|
---|
| 323 | // GroupNumberFromName returns a group number that corresponds to a group name.
|
---|
| 324 | // Returns -1 if the name is not a recognized group name. Numbered groups
|
---|
| 325 | // automatically get a group name that is the decimal string equivalent of its number.
|
---|
| 326 | func (re *Regexp) GroupNumberFromName(name string) int {
|
---|
| 327 | // look up name if we have a hashtable of names
|
---|
| 328 | if re.capnames != nil {
|
---|
| 329 | if k, ok := re.capnames[name]; ok {
|
---|
| 330 | return k
|
---|
| 331 | }
|
---|
| 332 |
|
---|
| 333 | return -1
|
---|
| 334 | }
|
---|
| 335 |
|
---|
| 336 | // convert to an int if it looks like a number
|
---|
| 337 | result := 0
|
---|
| 338 | for i := 0; i < len(name); i++ {
|
---|
| 339 | ch := name[i]
|
---|
| 340 |
|
---|
| 341 | if ch > '9' || ch < '0' {
|
---|
| 342 | return -1
|
---|
| 343 | }
|
---|
| 344 |
|
---|
| 345 | result *= 10
|
---|
| 346 | result += int(ch - '0')
|
---|
| 347 | }
|
---|
| 348 |
|
---|
| 349 | // return int if it's in range
|
---|
| 350 | if result >= 0 && result < re.capsize {
|
---|
| 351 | return result
|
---|
| 352 | }
|
---|
| 353 |
|
---|
| 354 | return -1
|
---|
| 355 | }
|
---|