1 | /*
|
---|
2 | Copyright 2012 Google Inc. All Rights Reserved.
|
---|
3 |
|
---|
4 | Licensed under the Apache License, Version 2.0 (the "License");
|
---|
5 | you may not use this file except in compliance with the License.
|
---|
6 | You may obtain a copy of the License at
|
---|
7 |
|
---|
8 | http://www.apache.org/licenses/LICENSE-2.0
|
---|
9 |
|
---|
10 | Unless required by applicable law or agreed to in writing, software
|
---|
11 | distributed under the License is distributed on an "AS IS" BASIS,
|
---|
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
13 | See the License for the specific language governing permissions and
|
---|
14 | limitations under the License.
|
---|
15 | */
|
---|
16 |
|
---|
17 | /*
|
---|
18 | Package shlex implements a simple lexer which splits input into tokens using
|
---|
19 | shell-style rules for quoting and commenting.
|
---|
20 |
|
---|
21 | The basic use case uses the default ASCII lexer to split a string into sub-strings:
|
---|
22 |
|
---|
23 | shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
|
---|
24 |
|
---|
25 | To process a stream of strings:
|
---|
26 |
|
---|
27 | l := NewLexer(os.Stdin)
|
---|
28 | for token, err := l.Next(); err == nil; token, err = l.Next() {
|
---|
29 | // process token
|
---|
30 | }
|
---|
31 |
|
---|
32 | To access the raw token stream (which includes tokens for comments):
|
---|
33 |
|
---|
34 | t := NewTokenizer(os.Stdin)
|
---|
35 | for token, err := t.Next(); err == nil; token, err = t.Next() {
|
---|
36 | // process token
|
---|
37 | }
|
---|
38 |
|
---|
39 | */
|
---|
40 | package shlex
|
---|
41 |
|
---|
42 | import (
|
---|
43 | "bufio"
|
---|
44 | "fmt"
|
---|
45 | "io"
|
---|
46 | "strings"
|
---|
47 | )
|
---|
48 |
|
---|
// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass is the class of a UTF-8 rune as seen by the lexer: a quote,
// space, escape, comment marker, or ordinary (unknown) rune.
type runeTokenClass int

// lexerState is the internal state used by the lexer state machine.
type lexerState int
|
---|
57 |
|
---|
// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType // classification of the token (word, space, comment, unknown)
	value     string    // the token's text, with quotes and escapes already processed
}
|
---|
63 |
|
---|
64 | // Equal reports whether tokens a, and b, are equal.
|
---|
65 | // Two tokens are equal if both their types and values are equal. A nil token can
|
---|
66 | // never be equal to another token.
|
---|
67 | func (a *Token) Equal(b *Token) bool {
|
---|
68 | if a == nil || b == nil {
|
---|
69 | return false
|
---|
70 | }
|
---|
71 | if a.tokenType != b.tokenType {
|
---|
72 | return false
|
---|
73 | }
|
---|
74 | return a.value == b.value
|
---|
75 | }
|
---|
76 |
|
---|
// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n" // runes that separate tokens
	escapingQuoteRunes    = `"`       // quotes whose contents may contain escapes
	nonEscapingQuoteRunes = "'"       // quotes whose contents are taken literally
	escapeRunes           = `\`       // the escape character
	commentRunes          = "#"       // starts a comment running to end of line
)
|
---|
85 |
|
---|
// Classes of rune token
const (
	unknownRuneClass runeTokenClass = iota // any unregistered rune; treated as an ordinary word rune
	spaceRuneClass                         // whitespace
	escapingQuoteRuneClass                 // double quote
	nonEscapingQuoteRuneClass              // single quote
	escapeRuneClass                        // backslash
	commentRuneClass                       // '#'
	eofRuneClass                           // synthetic class used when the input is exhausted
)
|
---|
96 |
|
---|
// Classes of lexical token
const (
	UnknownToken TokenType = iota // unclassified token (never produced by the default classifier)
	WordToken                     // a word, possibly quoted or containing escapes
	SpaceToken                    // whitespace (not currently emitted by the tokenizer)
	CommentToken                  // a comment, from '#' to end of line
)
|
---|
104 |
|
---|
// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted or unescaped #)
)
|
---|
115 |
|
---|
116 | // tokenClassifier is used for classifying rune characters.
|
---|
117 | type tokenClassifier map[rune]runeTokenClass
|
---|
118 |
|
---|
119 | func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
|
---|
120 | for _, runeChar := range runes {
|
---|
121 | typeMap[runeChar] = tokenType
|
---|
122 | }
|
---|
123 | }
|
---|
124 |
|
---|
125 | // newDefaultClassifier creates a new classifier for ASCII characters.
|
---|
126 | func newDefaultClassifier() tokenClassifier {
|
---|
127 | t := tokenClassifier{}
|
---|
128 | t.addRuneClass(spaceRunes, spaceRuneClass)
|
---|
129 | t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
|
---|
130 | t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
|
---|
131 | t.addRuneClass(escapeRunes, escapeRuneClass)
|
---|
132 | t.addRuneClass(commentRunes, commentRuneClass)
|
---|
133 | return t
|
---|
134 | }
|
---|
135 |
|
---|
136 | // ClassifyRune classifiees a rune
|
---|
137 | func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
|
---|
138 | return t[runeVal]
|
---|
139 | }
|
---|
140 |
|
---|
// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
// It shares its representation with Tokenizer and is converted to one to read tokens.
type Lexer Tokenizer
|
---|
143 |
|
---|
144 | // NewLexer creates a new lexer from an input stream.
|
---|
145 | func NewLexer(r io.Reader) *Lexer {
|
---|
146 |
|
---|
147 | return (*Lexer)(NewTokenizer(r))
|
---|
148 | }
|
---|
149 |
|
---|
150 | // Next returns the next word, or an error. If there are no more words,
|
---|
151 | // the error will be io.EOF.
|
---|
152 | func (l *Lexer) Next() (string, error) {
|
---|
153 | for {
|
---|
154 | token, err := (*Tokenizer)(l).Next()
|
---|
155 | if err != nil {
|
---|
156 | return "", err
|
---|
157 | }
|
---|
158 | switch token.tokenType {
|
---|
159 | case WordToken:
|
---|
160 | return token.value, nil
|
---|
161 | case CommentToken:
|
---|
162 | // skip comments
|
---|
163 | default:
|
---|
164 | return "", fmt.Errorf("Unknown token type: %v", token.tokenType)
|
---|
165 | }
|
---|
166 | }
|
---|
167 | }
|
---|
168 |
|
---|
// Tokenizer turns an input stream into a sequence of typed tokens
type Tokenizer struct {
	// input buffers the underlying reader.
	// NOTE(review): stored by value, so a Tokenizer must not be copied after
	// first use or buffered state would diverge — confirm callers always hold *Tokenizer.
	input bufio.Reader
	// classifier maps runes to their classes (space, quote, escape, comment).
	classifier tokenClassifier
}
|
---|
174 |
|
---|
175 | // NewTokenizer creates a new tokenizer from an input stream.
|
---|
176 | func NewTokenizer(r io.Reader) *Tokenizer {
|
---|
177 | input := bufio.NewReader(r)
|
---|
178 | classifier := newDefaultClassifier()
|
---|
179 | return &Tokenizer{
|
---|
180 | input: *input,
|
---|
181 | classifier: classifier}
|
---|
182 | }
|
---|
183 |
|
---|
// scanStream scans the stream for the next token using the internal state machine.
// On success it returns the token and a nil error; once the input is exhausted it
// returns (nil, io.EOF). If the input ends inside a quoted string or immediately
// after an escape rune, the partially-scanned token is returned together with a
// descriptive error.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune // runes accumulated for the token under construction
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error

	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		if err == io.EOF {
			// Model end-of-input as a synthetic rune class so each state below
			// can decide whether EOF ends a token cleanly or is an error.
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// No token started: clean end of stream.
						return nil, io.EOF
					}
				case spaceRuneClass:
					{
						// Skip whitespace between tokens.
					}
				case escapingQuoteRuneClass:
					{
						// Opening double quote: start a word whose contents may contain escapes.
						tokenType = WordToken
						state = quotingEscapingState
					}
				case nonEscapingQuoteRuneClass:
					{
						// Opening single quote: start a word whose contents are literal.
						tokenType = WordToken
						state = quotingState
					}
				case escapeRuneClass:
					{
						// Backslash: the next rune is taken literally.
						tokenType = WordToken
						state = escapingState
					}
				case commentRuneClass:
					{
						// '#' begins a comment running to end of line.
						tokenType = CommentToken
						state = commentState
					}
				default:
					{
						// Any other rune starts a plain word.
						tokenType = WordToken
						value = append(value, nextRune)
						state = inWordState
					}
				}
			}
		case inWordState: // in a regular word
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// End of input terminates the word cleanly.
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case spaceRuneClass:
					{
						// Whitespace terminates the word; the space itself is consumed.
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case escapingQuoteRuneClass:
					{
						// Embedded "..." section; the quote itself is not kept.
						state = quotingEscapingState
					}
				case nonEscapingQuoteRuneClass:
					{
						// Embedded '...' section; the quote itself is not kept.
						state = quotingState
					}
				case escapeRuneClass:
					{
						state = escapingState
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		case escapingState: // the rune after an escape character
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// A trailing backslash is an error; return the partial token with it.
						err = fmt.Errorf("EOF found after escape character")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				default:
					{
						// Whatever follows the backslash is taken literally.
						state = inWordState
						value = append(value, nextRune)
					}
				}
			}
		case escapingQuotedState: // the next rune after an escape character, in double quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						err = fmt.Errorf("EOF found after escape character")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				default:
					{
						// Literal rune inside "..."; resume the quoted state.
						state = quotingEscapingState
						value = append(value, nextRune)
					}
				}
			}
		case quotingEscapingState: // in escaping double quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// Unterminated "..." is an error; return the partial token with it.
						err = fmt.Errorf("EOF found when expecting closing quote")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case escapingQuoteRuneClass:
					{
						// Closing double quote; continue the word.
						state = inWordState
					}
				case escapeRuneClass:
					{
						state = escapingQuotedState
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		case quotingState: // in non-escaping single quotes
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						// Unterminated '...' is an error; return the partial token with it.
						err = fmt.Errorf("EOF found when expecting closing quote")
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case nonEscapingQuoteRuneClass:
					{
						// Closing single quote; continue the word.
						state = inWordState
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		case commentState: // in a comment
			{
				switch nextRuneType {
				case eofRuneClass:
					{
						token := &Token{
							tokenType: tokenType,
							value:     string(value)}
						return token, err
					}
				case spaceRuneClass:
					{
						// Only a newline ends a comment; other whitespace is part of it.
						if nextRune == '\n' {
							state = startState
							token := &Token{
								tokenType: tokenType,
								value:     string(value)}
							return token, err
						} else {
							value = append(value, nextRune)
						}
					}
				default:
					{
						value = append(value, nextRune)
					}
				}
			}
		default:
			{
				// Unreachable unless a new state is added without a case above.
				return nil, fmt.Errorf("Unexpected state: %v", state)
			}
		}
	}
}
|
---|
396 |
|
---|
// Next returns the next token in the stream, or an error. When the input is
// exhausted the error is io.EOF.
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}
|
---|
401 |
|
---|
402 | // Split partitions a string into a slice of strings.
|
---|
403 | func Split(s string) ([]string, error) {
|
---|
404 | l := NewLexer(strings.NewReader(s))
|
---|
405 | subStrings := make([]string, 0)
|
---|
406 | for {
|
---|
407 | word, err := l.Next()
|
---|
408 | if err != nil {
|
---|
409 | if err == io.EOF {
|
---|
410 | return subStrings, nil
|
---|
411 | }
|
---|
412 | return subStrings, err
|
---|
413 | }
|
---|
414 | subStrings = append(subStrings, word)
|
---|
415 | }
|
---|
416 | }
|
---|