go-toml/lexer/states.go

207 lines
5.5 KiB
Go

package lexer
// stateFn is a single state of the lexer's state machine. A state does
// its work on the Lexer it is handed and returns the state to run next.
// stateEndOfFile returns nil once the EOF item has been emitted.
type stateFn func(l *Lexer) stateFn
// Character classes used by the state functions. Every constant is a string
// listing the characters that belong to that class.
const (
	// Blanks and line endings.
	whitespace     = " \t"
	carriageReturn = "\r"
	newline        = "\n"

	// Punctuation with a structural meaning.
	hash        = "#"
	equal       = "="
	dot         = "."
	singleQuote = "'"
	doubleQuote = "\""
	backslash   = "\\"

	// Building blocks for bare keys.
	lower      = "abcdefghijklmnopqrstuvwxyz"
	upper      = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	digits     = "0123456789"
	underscore = "_"
	dash       = "-"

	// Composite classes.
	someQuote  = singleQuote + doubleQuote
	bareKey    = lower + upper + digits + underscore + dash
	startOfKey = bareKey + someQuote
)
// stateKeyValuePair is the state entered wherever a key/value pair may
// begin. It first calls l.skip with the whitespace and line-ending
// characters, then dispatches on the upcoming input:
//
//   - a '#' goes to stateComment,
//   - a character from startOfKey goes to stateKey,
//   - anything else goes to stateEndOfFile.
func stateKeyValuePair(l *Lexer) stateFn {
	l.skip(whitespace + carriageReturn + newline)
	switch {
	case l.upcoming(hash):
		return stateComment
	case l.upcoming(startOfKey):
		return stateKey
	default:
		return stateEndOfFile
	}
}
// stateComment handles a comment: a '#' hash symbol marks the rest of the
// line as a comment.
//
// Characters are collected in the lexer's string builder until a newline is
// accepted or the end of the file is reached; the collected text is then
// emitted as an ItemComment and lexing continues with stateKeyValuePair.
// Carriage returns are accepted and passed to l.ignore() instead of being
// added to the comment text.
func stateComment(l *Lexer) stateFn {
	l.resetStringBuilder()
	for {
		if l.atEndOfFile() || l.accept(newline) {
			l.emit(ItemComment, l.getString())
			return stateKeyValuePair
		}
		if l.accept(carriageReturn) {
			l.ignore()
			continue
		}
		l.addToString(l.next())
	}
}
// stateKey is entered at the start of a key. In TOML a key may be either
// bare, quoted or dotted; this state only recognizes the start of a bare
// key and reports any other input through unexpectedTokenError.
func stateKey(l *Lexer) stateFn {
	if !l.upcoming(bareKey) {
		return l.unexpectedTokenError("a valid key name")
	}
	return stateBareKey
}
// stateBareKey lexes a bare key and emits it as an ItemKey.
//
// Bare keys may only contain ASCII letters, ASCII digits, underscores and
// dashes (A-Za-z0-9_-). A bare key may consist of only ASCII digits, e.g.
// 1234, but it is always interpreted as a string.
func stateBareKey(l *Lexer) stateFn {
	l.acceptWhile(bareKey)
	name := l.getAcceptedString()
	l.emit(ItemKey, name)
	return stateEndOfKeyOrKeyDot
}
// stateEndOfKeyOrKeyDot runs after a key part has been emitted and decides
// whether the key continues. Dotted keys are a sequence of key parts joined
// with a dot, which allows similar properties to be grouped together.
//
// When a dot follows, an ItemKeyDot is emitted and lexing continues with the
// next key part; otherwise lexing moves on to the assignment.
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
	// Whitespace around dot-separated parts is ignored; best practice,
	// however, is not to use any extraneous whitespace.
	l.skip(whitespace)
	if !l.accept(dot) {
		return stateKeyAssignment
	}
	l.emit(ItemKeyDot, ".")
	l.skip(whitespace)
	return stateKey
}
// stateKeyAssignment expects the '=' that separates a key from its value.
//
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals sign
// and value must be on the same line (though some values can be broken over
// multiple lines). Anything other than '=' is reported through
// unexpectedTokenError.
func stateKeyAssignment(l *Lexer) stateFn {
	l.skip(whitespace)
	if !l.accept(equal) {
		return l.unexpectedTokenError("an '=' value assignment")
	}
	l.skip(whitespace)
	return stateValue
}
// stateValue is entered at the start of a value. After skipping whitespace
// it continues with stateStringValue when a single or double quote is
// upcoming; any other input is reported through unexpectedTokenError.
func stateValue(l *Lexer) stateFn {
	l.skip(whitespace)
	if !l.upcoming(someQuote) {
		return l.unexpectedTokenError("a value")
	}
	return stateStringValue
}
// stateStringValue is entered at the opening quote of a string value.
//
// TOML has four ways to express strings: basic, multi-line basic, literal
// and multi-line literal. All strings must contain only valid UTF-8
// characters. Only an opening double quote is accepted here; any other
// input is reported through unexpectedTokenError.
func stateStringValue(l *Lexer) stateFn {
	if !l.accept(doubleQuote) {
		return l.unexpectedTokenError("a string value")
	}
	return stateBasicStringValue
}
// stateBasicStringValue runs right after the opening double quote of a
// string has been accepted and works out which kind of string follows:
//
//   - no further quote: a regular basic string (stateBasicString),
//   - one further quote: the empty string "", emitted right away,
//   - two further quotes: a """ multi-line basic string.
//
// In every case l.ignore() is called on the quotes accepted so far before
// moving on.
func stateBasicStringValue(l *Lexer) stateFn {
	// No second quote: the string body starts here.
	if !l.accept(doubleQuote) {
		l.ignore()
		return stateBasicString
	}
	// A third quote makes this the start of a """ multi-line string.
	if l.accept(doubleQuote) {
		l.ignore()
		return stateMultiLineBasicString
	}
	// Exactly two quotes: an "" empty string.
	l.ignore()
	l.emit(ItemString, "")
	return stateKeyValuePair
}
// basicEscapes maps the character that follows a backslash in a basic
// string to the character that the escape sequence stands for.
//
// Basic strings are surrounded by quotation marks. Any Unicode character
// may be used except those that must be escaped: quotation mark, backslash,
// and the control characters (U+0000 to U+001F, U+007F).
//
// For convenience, some popular characters have a compact escape sequence:
//
//	\b         - backspace       (U+0008)
//	\t         - tab             (U+0009)
//	\n         - linefeed        (U+000A)
//	\f         - form feed       (U+000C)
//	\r         - carriage return (U+000D)
//	\"         - quote           (U+0022)
//	\\         - backslash       (U+005C)
//	\uXXXX     - unicode         (U+XXXX)
//	\UXXXXXXXX - unicode         (U+XXXXXXXX)
//
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values. This table only
// holds the single-character escapes; it has no entries for the \u and \U
// forms.
//
// All other escape sequences not listed above are reserved and, if used,
// TOML should produce an error.
var basicEscapes = map[rune]rune{
	'b':  '\b',
	't':  '\t',
	'n':  '\n',
	'f':  '\f',
	'r':  '\r',
	'"':  '"',
	'\\': '\\',
}
// stateParseBasicString consumes the body of a basic string, collecting its
// characters in the lexer's string builder.
//
// The loop ends in one of three ways:
//
//   - the closing double quote is accepted: the state returned by
//     l.popState() takes over (stateBasicString pushes a state that emits
//     the collected string),
//   - the input ends before the closing quote: an unexpected end of file
//     is reported,
//   - a backslash is followed by a character that is not in basicEscapes:
//     an invalid escape sequence is reported.
//
// NOTE(review): the \uXXXX and \UXXXXXXXX escapes are not in basicEscapes
// and are therefore rejected here, and unescaped control characters
// (including newlines) are added to the string as-is.
func stateParseBasicString(l *Lexer) stateFn {
	for {
		switch {
		case l.atEndOfFile():
			return l.unexpectedEndOfFile("basic string token")
		case l.accept(doubleQuote):
			return l.popState()
		case l.accept(backslash):
			// A backslash as the very last character of the input is an
			// unterminated string, not an invalid escape sequence: report
			// it as such instead of reading past the end of the input.
			if l.atEndOfFile() {
				return l.unexpectedEndOfFile("basic string token")
			}
			r := l.next()
			escaped, ok := basicEscapes[r]
			if !ok {
				return l.errorf("Invalid escape sequence \\%c in string value", r)
			}
			l.addToString(escaped)
		default:
			l.addToString(l.next())
		}
	}
}
// stateBasicString sets up lexing of a basic string body. It resets the
// string builder, pushes a follow-up state that emits the collected text as
// an ItemString and continues with stateKeyValuePair, and then hands over
// to stateParseBasicString to read the body.
func stateBasicString(l *Lexer) stateFn {
	l.resetStringBuilder()
	emitString := func(l *Lexer) stateFn {
		l.emit(ItemString, l.getString())
		return stateKeyValuePair
	}
	l.pushState(emitString)
	return stateParseBasicString
}
// stateMultiLineBasicString is the state for the body of a """ multi-line
// basic string. Multi-line strings are not supported yet: the state always
// returns the result of l.errorf with a "Not yet implemented" message.
func stateMultiLineBasicString(l *Lexer) stateFn {
return l.errorf("Not yet implemented")
}
// stateEndOfFile expects the input to be exhausted. When the next character
// is endOfFile it emits an ItemEOF and returns nil as the next state; any
// other input is reported through unexpectedTokenError.
func stateEndOfFile(l *Lexer) stateFn {
	if l.peek() != endOfFile {
		return l.unexpectedTokenError("end of file")
	}
	l.emit(ItemEOF, "EOF")
	return nil
}