[822] | 1 | /*
|
---|
| 2 | Copyright 2012 Google Inc. All Rights Reserved.
|
---|
| 3 |
|
---|
| 4 | Licensed under the Apache License, Version 2.0 (the "License");
|
---|
| 5 | you may not use this file except in compliance with the License.
|
---|
| 6 | You may obtain a copy of the License at
|
---|
| 7 |
|
---|
| 8 | http://www.apache.org/licenses/LICENSE-2.0
|
---|
| 9 |
|
---|
| 10 | Unless required by applicable law or agreed to in writing, software
|
---|
| 11 | distributed under the License is distributed on an "AS IS" BASIS,
|
---|
| 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
| 13 | See the License for the specific language governing permissions and
|
---|
| 14 | limitations under the License.
|
---|
| 15 | */
|
---|
| 16 |
|
---|
| 17 | /*
|
---|
| 18 | Package shlex implements a simple lexer which splits input in to tokens using
|
---|
| 19 | shell-style rules for quoting and commenting.
|
---|
| 20 |
|
---|
| 21 | The basic use case uses the default ASCII lexer to split a string into sub-strings:
|
---|
| 22 |
|
---|
| 23 | shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
|
---|
| 24 |
|
---|
| 25 | To process a stream of strings:
|
---|
| 26 |
|
---|
	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break
		}
		// process token
	}
| 31 |
|
---|
| 32 | To access the raw token stream (which includes tokens for comments):
|
---|
| 33 |
|
---|
	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break
		}
		// process token
	}
| 38 |
|
---|
| 39 | */
|
---|
| 40 | package shlex
|
---|
| 41 |
|
---|
| 42 | import (
|
---|
| 43 | "bufio"
|
---|
| 44 | "fmt"
|
---|
| 45 | "io"
|
---|
| 46 | "strings"
|
---|
| 47 | )
|
---|
| 48 |
|
---|
// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass classifies a single UTF-8 rune: a quote, space, escape, etc.
type runeTokenClass int

// lexerState is the internal state used by the lexer state machine.
type lexerState int

// Token is a (type, value) pair representing a single lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a, and b, are equal.
// Two tokens are equal if both their types and values are equal. A nil token can
// never be equal to another token.
func (a *Token) Equal(b *Token) bool {
	// A nil token (even compared against another nil token) is never equal.
	return a != nil && b != nil && a.tokenType == b.tokenType && a.value == b.value
}
| 76 |
|
---|
| 77 | // Named classes of UTF-8 runes
|
---|
| 78 | const (
|
---|
| 79 | spaceRunes = " \t\r\n"
|
---|
| 80 | escapingQuoteRunes = `"`
|
---|
| 81 | nonEscapingQuoteRunes = "'"
|
---|
| 82 | escapeRunes = `\`
|
---|
| 83 | commentRunes = "#"
|
---|
| 84 | )
|
---|
| 85 |
|
---|
| 86 | // Classes of rune token
|
---|
| 87 | const (
|
---|
| 88 | unknownRuneClass runeTokenClass = iota
|
---|
| 89 | spaceRuneClass
|
---|
| 90 | escapingQuoteRuneClass
|
---|
| 91 | nonEscapingQuoteRuneClass
|
---|
| 92 | escapeRuneClass
|
---|
| 93 | commentRuneClass
|
---|
| 94 | eofRuneClass
|
---|
| 95 | )
|
---|
| 96 |
|
---|
| 97 | // Classes of lexographic token
|
---|
| 98 | const (
|
---|
| 99 | UnknownToken TokenType = iota
|
---|
| 100 | WordToken
|
---|
| 101 | SpaceToken
|
---|
| 102 | CommentToken
|
---|
| 103 | )
|
---|
| 104 |
|
---|
| 105 | // Lexer state machine states
|
---|
| 106 | const (
|
---|
| 107 | startState lexerState = iota // no runes have been seen
|
---|
| 108 | inWordState // processing regular runes in a word
|
---|
| 109 | escapingState // we have just consumed an escape rune; the next rune is literal
|
---|
| 110 | escapingQuotedState // we have just consumed an escape rune within a quoted string
|
---|
| 111 | quotingEscapingState // we are within a quoted string that supports escaping ("...")
|
---|
| 112 | quotingState // we are within a string that does not support escaping ('...')
|
---|
| 113 | commentState // we are within a comment (everything following an unquoted or unescaped #
|
---|
| 114 | )
|
---|
| 115 |
|
---|
| 116 | // tokenClassifier is used for classifying rune characters.
|
---|
| 117 | type tokenClassifier map[rune]runeTokenClass
|
---|
| 118 |
|
---|
| 119 | func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
|
---|
| 120 | for _, runeChar := range runes {
|
---|
| 121 | typeMap[runeChar] = tokenType
|
---|
| 122 | }
|
---|
| 123 | }
|
---|
| 124 |
|
---|
| 125 | // newDefaultClassifier creates a new classifier for ASCII characters.
|
---|
| 126 | func newDefaultClassifier() tokenClassifier {
|
---|
| 127 | t := tokenClassifier{}
|
---|
| 128 | t.addRuneClass(spaceRunes, spaceRuneClass)
|
---|
| 129 | t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
|
---|
| 130 | t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
|
---|
| 131 | t.addRuneClass(escapeRunes, escapeRuneClass)
|
---|
| 132 | t.addRuneClass(commentRunes, commentRuneClass)
|
---|
| 133 | return t
|
---|
| 134 | }
|
---|
| 135 |
|
---|
| 136 | // ClassifyRune classifiees a rune
|
---|
| 137 | func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
|
---|
| 138 | return t[runeVal]
|
---|
| 139 | }
|
---|
| 140 |
|
---|
| 141 | // Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
|
---|
| 142 | type Lexer Tokenizer
|
---|
| 143 |
|
---|
| 144 | // NewLexer creates a new lexer from an input stream.
|
---|
| 145 | func NewLexer(r io.Reader) *Lexer {
|
---|
| 146 |
|
---|
| 147 | return (*Lexer)(NewTokenizer(r))
|
---|
| 148 | }
|
---|
| 149 |
|
---|
| 150 | // Next returns the next word, or an error. If there are no more words,
|
---|
| 151 | // the error will be io.EOF.
|
---|
| 152 | func (l *Lexer) Next() (string, error) {
|
---|
| 153 | for {
|
---|
| 154 | token, err := (*Tokenizer)(l).Next()
|
---|
| 155 | if err != nil {
|
---|
| 156 | return "", err
|
---|
| 157 | }
|
---|
| 158 | switch token.tokenType {
|
---|
| 159 | case WordToken:
|
---|
| 160 | return token.value, nil
|
---|
| 161 | case CommentToken:
|
---|
| 162 | // skip comments
|
---|
| 163 | default:
|
---|
| 164 | return "", fmt.Errorf("Unknown token type: %v", token.tokenType)
|
---|
| 165 | }
|
---|
| 166 | }
|
---|
| 167 | }
|
---|
| 168 |
|
---|
| 169 | // Tokenizer turns an input stream into a sequence of typed tokens
|
---|
| 170 | type Tokenizer struct {
|
---|
| 171 | input bufio.Reader
|
---|
| 172 | classifier tokenClassifier
|
---|
| 173 | }
|
---|
| 174 |
|
---|
| 175 | // NewTokenizer creates a new tokenizer from an input stream.
|
---|
| 176 | func NewTokenizer(r io.Reader) *Tokenizer {
|
---|
| 177 | input := bufio.NewReader(r)
|
---|
| 178 | classifier := newDefaultClassifier()
|
---|
| 179 | return &Tokenizer{
|
---|
| 180 | input: *input,
|
---|
| 181 | classifier: classifier}
|
---|
| 182 | }
|
---|
| 183 |
|
---|
// scanStream scans the stream for the next token using the internal state machine.
// It accumulates runes into value one at a time; io.EOF is folded into the machine
// as a synthetic rune class (eofRuneClass) so that each state can decide whether a
// partially-built token is returned or turned into an error. An unrecognized state
// yields an error rather than a panic.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error

	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		// Translate EOF into a rune class and clear err so states that have a
		// complete token in hand can return it with a nil error.
		if err == io.EOF {
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// Nothing accumulated: clean end of input.
						return nil, io.EOF
					}
				case spaceRuneClass:
					{
						// Skip leading whitespace.
					}
				case escapingQuoteRuneClass:
					{
						tokenType = WordToken
						state = quotingEscapingState
					}
				case nonEscapingQuoteRuneClass:
					{
						tokenType = WordToken
						state = quotingState
					}
				case escapeRuneClass:
					{
						tokenType = WordToken
						state = escapingState
					}
				case commentRuneClass:
					{
						tokenType = CommentToken
						state = commentState
					}
				default:
					{
						// Any other rune starts a plain word.
						tokenType = WordToken
						value = append(value, nextRune)
						state = inWordState
					}
				}
			}
		case inWordState: // in a regular word
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// EOF ends the word; err is nil here.
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case spaceRuneClass:
					{
						// Whitespace terminates the word (the space itself is consumed).
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case escapingQuoteRuneClass:
					{
						// Opening " inside a word: the quote is not part of the value.
						state = quotingEscapingState
					}
				case nonEscapingQuoteRuneClass:
					{
						state = quotingState
					}
				case escapeRuneClass:
					{
						state = escapingState
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		case escapingState: // the rune after an escape character
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// A trailing backslash is an error, but the partial token is
						// still returned alongside it.
						err = fmt.Errorf("EOF found after escape character")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				default:
					{
						// The escaped rune is taken literally, whatever its class.
						state = inWordState
						value = append(value, nextRune)
					}
				}
			}
		case escapingQuotedState: // the next rune after an escape character, in double quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						err = fmt.Errorf("EOF found after escape character")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				default:
					{
						// Escaped rune taken literally; resume the quoted string.
						state = quotingEscapingState
						value = append(value, nextRune)
					}
				}
			}
		case quotingEscapingState: // in escaping double quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// Unterminated "..." string: error plus the partial token.
						err = fmt.Errorf("EOF found when expecting closing quote")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case escapingQuoteRuneClass:
					{
						// Closing " returns to word context.
						state = inWordState
					}
				case escapeRuneClass:
					{
						state = escapingQuotedState
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		case quotingState: // in non-escaping single quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// Unterminated '...' string: error plus the partial token.
						err = fmt.Errorf("EOF found when expecting closing quote")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case nonEscapingQuoteRuneClass:
					{
						// Closing ' returns to word context.
						state = inWordState
					}
				default:
					{
						// Everything inside single quotes is literal, including \.
						value = append(value, nextRune)
					}
				}
			}
		case commentState: // in a comment
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case spaceRuneClass:
					{
						// Only a newline terminates a comment; other whitespace
						// belongs to the comment text.
						if nextRune == '\n' {
							state = startState
							token := &Token{
								tokenType: tokenType,
								value:     string(value)}
							return token, err
						} else {
							value = append(value, nextRune)
						}
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		default:
			{
				return nil, fmt.Errorf("Unexpected state: %v", state)
			}
		}
	}
}
| 396 |
|
---|
// Next returns the next token in the stream. Word and comment tokens are both
// returned (whitespace between tokens is skipped by the scanner); at end of
// input the error is io.EOF.
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}
| 401 |
|
---|
| 402 | // Split partitions a string into a slice of strings.
|
---|
| 403 | func Split(s string) ([]string, error) {
|
---|
| 404 | l := NewLexer(strings.NewReader(s))
|
---|
| 405 | subStrings := make([]string, 0)
|
---|
| 406 | for {
|
---|
| 407 | word, err := l.Next()
|
---|
| 408 | if err != nil {
|
---|
| 409 | if err == io.EOF {
|
---|
| 410 | return subStrings, nil
|
---|
| 411 | }
|
---|
| 412 | return subStrings, err
|
---|
| 413 | }
|
---|
| 414 | subStrings = append(subStrings, word)
|
---|
| 415 | }
|
---|
| 416 | }
|
---|