418 lines
9.7 KiB
Go
418 lines
9.7 KiB
Go
/*
|
|
Copyright 2012 Google Inc. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
/*
|
|
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.
|
|
|
|
The basic use case uses the default ASCII lexer to split a string into sub-strings:
|
|
|
|
shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
|
|
|
|
To process a stream of strings:
|
|
|
|
	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break // io.EOF once there are no more words
		}
		// process token
	}
|
|
|
|
To access the raw token stream (which includes tokens for comments):
|
|
|
|
	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break // io.EOF once there are no more tokens
		}
		// process token
	}
|
|
|
|
*/
|
|
package shlex
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
)
|
|
|
|
type (
	// TokenType is the top-level classification of a token: word, space,
	// comment, or unknown.
	TokenType int

	// runeTokenClass is the classification of a single rune, for example
	// quote, space, or escape.
	runeTokenClass int

	// lexerState identifies one state of the lexer's internal state machine.
	lexerState int
)
|
|
|
|
// Token is a (type, value) pair representing a lexographical token.
|
|
type Token struct {
|
|
tokenType TokenType
|
|
value string
|
|
}
|
|
|
|
// Equal reports whether tokens a, and b, are equal.
|
|
// Two tokens are equal if both their types and values are equal. A nil token can
|
|
// never be equal to another token.
|
|
func (a *Token) Equal(b *Token) bool {
|
|
if a == nil || b == nil {
|
|
return false
|
|
}
|
|
if a.tokenType != b.tokenType {
|
|
return false
|
|
}
|
|
return a.value == b.value
|
|
}
|
|
|
|
// Named classes of UTF-8 runes. Each constant lists every rune that belongs
// to the class.
const (
	// spaceRunes are the whitespace runes: space, tab, carriage return, newline.
	spaceRunes = " \t\r\n"
	// escapingQuoteRunes is the double quote.
	escapingQuoteRunes = "\""
	// nonEscapingQuoteRunes is the single quote.
	nonEscapingQuoteRunes = `'`
	// escapeRunes is the backslash.
	escapeRunes = "\\"
	// commentRunes is the hash sign.
	commentRunes = `#`
)
|
|
|
|
// Classes of rune token
|
|
const (
|
|
unknownRuneClass runeTokenClass = iota
|
|
spaceRuneClass
|
|
escapingQuoteRuneClass
|
|
nonEscapingQuoteRuneClass
|
|
escapeRuneClass
|
|
commentRuneClass
|
|
eofRuneClass
|
|
)
|
|
|
|
// Classes of lexographic token
|
|
const (
|
|
UnknownToken TokenType = iota
|
|
WordToken
|
|
SpaceToken
|
|
CommentToken
|
|
)
|
|
|
|
// Lexer state machine states
|
|
const (
|
|
startState lexerState = iota // no runes have been seen
|
|
inWordState // processing regular runes in a word
|
|
escapingState // we have just consumed an escape rune; the next rune is literal
|
|
escapingQuotedState // we have just consumed an escape rune within a quoted string
|
|
quotingEscapingState // we are within a quoted string that supports escaping ("...")
|
|
quotingState // we are within a string that does not support escaping ('...')
|
|
commentState // we are within a comment (everything following an unquoted or unescaped #
|
|
)
|
|
|
|
// tokenClassifier is used for classifying rune characters.
|
|
type tokenClassifier map[rune]runeTokenClass
|
|
|
|
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
|
|
for _, runeChar := range runes {
|
|
typeMap[runeChar] = tokenType
|
|
}
|
|
}
|
|
|
|
// newDefaultClassifier creates a new classifier for ASCII characters.
|
|
func newDefaultClassifier() tokenClassifier {
|
|
t := tokenClassifier{}
|
|
t.addRuneClass(spaceRunes, spaceRuneClass)
|
|
t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
|
|
t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
|
|
t.addRuneClass(escapeRunes, escapeRuneClass)
|
|
t.addRuneClass(commentRunes, commentRuneClass)
|
|
return t
|
|
}
|
|
|
|
// ClassifyRune classifiees a rune
|
|
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
|
|
return t[runeVal]
|
|
}
|
|
|
|
// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
|
|
type Lexer Tokenizer
|
|
|
|
// NewLexer creates a new lexer from an input stream.
|
|
func NewLexer(r io.Reader) *Lexer {
|
|
|
|
return (*Lexer)(NewTokenizer(r))
|
|
}
|
|
|
|
// Next returns the next word, or an error. If there are no more words,
|
|
// the error will be io.EOF.
|
|
func (l *Lexer) Next() (string, error) {
|
|
for {
|
|
token, err := (*Tokenizer)(l).Next()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
switch token.tokenType {
|
|
case WordToken:
|
|
return token.value, nil
|
|
case CommentToken:
|
|
// skip comments
|
|
default:
|
|
return "", fmt.Errorf("Unknown token type: %v", token.tokenType)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Tokenizer turns an input stream into a sequence of typed tokens
|
|
type Tokenizer struct {
|
|
input bufio.Reader
|
|
classifier tokenClassifier
|
|
}
|
|
|
|
// NewTokenizer creates a new tokenizer from an input stream.
|
|
func NewTokenizer(r io.Reader) *Tokenizer {
|
|
input := bufio.NewReader(r)
|
|
classifier := newDefaultClassifier()
|
|
return &Tokenizer{
|
|
input: *input,
|
|
classifier: classifier}
|
|
}
|
|
|
|
// scanStream scans the stream for the next token using the internal state machine.
|
|
// It will panic if it encounters a rune which it does not know how to handle.
|
|
func (t *Tokenizer) scanStream() (*Token, error) {
|
|
state := startState
|
|
var tokenType TokenType
|
|
var value []rune
|
|
var nextRune rune
|
|
var nextRuneType runeTokenClass
|
|
var err error
|
|
|
|
for {
|
|
nextRune, _, err = t.input.ReadRune()
|
|
nextRuneType = t.classifier.ClassifyRune(nextRune)
|
|
|
|
if err == io.EOF {
|
|
nextRuneType = eofRuneClass
|
|
err = nil
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
switch state {
|
|
case startState: // no runes read yet
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
return nil, io.EOF
|
|
}
|
|
case spaceRuneClass:
|
|
{
|
|
}
|
|
case escapingQuoteRuneClass:
|
|
{
|
|
tokenType = WordToken
|
|
state = quotingEscapingState
|
|
}
|
|
case nonEscapingQuoteRuneClass:
|
|
{
|
|
tokenType = WordToken
|
|
state = quotingState
|
|
}
|
|
case escapeRuneClass:
|
|
{
|
|
tokenType = WordToken
|
|
state = escapingState
|
|
}
|
|
case commentRuneClass:
|
|
{
|
|
tokenType = CommentToken
|
|
state = commentState
|
|
}
|
|
default:
|
|
{
|
|
tokenType = WordToken
|
|
value = append(value, nextRune)
|
|
state = inWordState
|
|
}
|
|
}
|
|
}
|
|
case inWordState: // in a regular word
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
case spaceRuneClass:
|
|
{
|
|
t.input.UnreadRune()
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
case escapingQuoteRuneClass:
|
|
{
|
|
state = quotingEscapingState
|
|
}
|
|
case nonEscapingQuoteRuneClass:
|
|
{
|
|
state = quotingState
|
|
}
|
|
case escapeRuneClass:
|
|
{
|
|
state = escapingState
|
|
}
|
|
default:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
case escapingState: // the rune after an escape character
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
err = fmt.Errorf("EOF found after escape character")
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
default:
|
|
{
|
|
state = inWordState
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
case escapingQuotedState: // the next rune after an escape character, in double quotes
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
err = fmt.Errorf("EOF found after escape character")
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
default:
|
|
{
|
|
state = quotingEscapingState
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
case quotingEscapingState: // in escaping double quotes
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
err = fmt.Errorf("EOF found when expecting closing quote")
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
case escapingQuoteRuneClass:
|
|
{
|
|
state = inWordState
|
|
}
|
|
case escapeRuneClass:
|
|
{
|
|
state = escapingQuotedState
|
|
}
|
|
default:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
case quotingState: // in non-escaping single quotes
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
err = fmt.Errorf("EOF found when expecting closing quote")
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
case nonEscapingQuoteRuneClass:
|
|
{
|
|
state = inWordState
|
|
}
|
|
default:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
case commentState: // in a comment
|
|
{
|
|
switch nextRuneType {
|
|
case eofRuneClass:
|
|
{
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
}
|
|
case spaceRuneClass:
|
|
{
|
|
if nextRune == '\n' {
|
|
state = startState
|
|
token := &Token{
|
|
tokenType: tokenType,
|
|
value: string(value)}
|
|
return token, err
|
|
} else {
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
default:
|
|
{
|
|
value = append(value, nextRune)
|
|
}
|
|
}
|
|
}
|
|
default:
|
|
{
|
|
return nil, fmt.Errorf("Unexpected state: %v", state)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Next returns the next token in the stream.
|
|
func (t *Tokenizer) Next() (*Token, error) {
|
|
return t.scanStream()
|
|
}
|
|
|
|
// Split partitions a string into a slice of strings.
|
|
func Split(s string) ([]string, error) {
|
|
l := NewLexer(strings.NewReader(s))
|
|
subStrings := make([]string, 0)
|
|
for {
|
|
word, err := l.Next()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
return subStrings, nil
|
|
}
|
|
return subStrings, err
|
|
}
|
|
subStrings = append(subStrings, word)
|
|
}
|
|
}
|