Splitting off a more generic parser (it's fun getting to know a language, but you keep refactoring with all new stuff that you learn :-)
This commit is contained in:
parent
aeb48edc44
commit
f86ef2b918
|
@ -1,48 +1,35 @@
|
||||||
package lexer
|
package lexer
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
// itemType represents the type of lexer items.
|
"github.com/mmakaay/toml/parser"
|
||||||
type itemType int
|
)
|
||||||
|
|
||||||
// Definition of all the lexer item types for the TOML lexer.
|
// Definition of all the lexer item types for the TOML lexer.
|
||||||
const (
|
const (
|
||||||
ItemError itemType = iota // An error occurred
|
ItemComment parser.ItemType = iota // Comment string, starts with # till end of line
|
||||||
ItemEOF // End of input reached
|
|
||||||
ItemComment // Comment string, starts with # till end of line
|
|
||||||
ItemKey // Key of a key/value pair
|
ItemKey // Key of a key/value pair
|
||||||
ItemKeyDot // Dot for a dotted key
|
ItemKeyDot // Dot for a dotted key
|
||||||
ItemAssignment // Value assignment coming up (=)
|
ItemAssignment // Value assignment coming up (=)
|
||||||
ItemString // A value of type string
|
ItemString // A value of type string
|
||||||
)
|
)
|
||||||
|
|
||||||
// Item represents a lexer item returned from the scanner.
|
// ParserItemToString returns a string representation of the
|
||||||
type Item struct {
|
// parser.Item. This is used for unit testing purposes.
|
||||||
Type itemType //Type, e.g. ItemComment, ItemString
|
func ParserItemToString(i parser.Item) string {
|
||||||
Value string // Value, e.g. "10.42", "["
|
|
||||||
}
|
|
||||||
|
|
||||||
// String returns a string representation of the lexer item.
|
|
||||||
func (i Item) String() string {
|
|
||||||
switch i.Type {
|
switch i.Type {
|
||||||
|
case ItemComment:
|
||||||
|
return fmt.Sprintf("#(%s)", i.Value)
|
||||||
case ItemKey:
|
case ItemKey:
|
||||||
return fmt.Sprintf("[%s]", i.Value)
|
return fmt.Sprintf("[%s]", i.Value)
|
||||||
|
case ItemString:
|
||||||
|
return fmt.Sprintf("STR(%s)", i.Value)
|
||||||
case ItemKeyDot:
|
case ItemKeyDot:
|
||||||
return "."
|
return "."
|
||||||
case ItemAssignment:
|
case ItemAssignment:
|
||||||
return "="
|
return "="
|
||||||
}
|
|
||||||
return fmt.Sprintf("%s(%s)", i.Type, i.Value)
|
|
||||||
}
|
|
||||||
|
|
||||||
// String returns a string representation of the lexer item type.
|
|
||||||
func (i itemType) String() string {
|
|
||||||
switch i {
|
|
||||||
case ItemComment:
|
|
||||||
return "#"
|
|
||||||
case ItemString:
|
|
||||||
return "STR"
|
|
||||||
default:
|
default:
|
||||||
panic(fmt.Sprintf("No translation available for type id %d", i))
|
panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,21 +4,23 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
|
"github.com/mmakaay/toml/parser"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Lexer holds the state of the lexer.
|
// Lexer holds the state of the lexer.
|
||||||
type Lexer struct {
|
type Lexer struct {
|
||||||
input string // the scanned input string
|
input string // the scanned input
|
||||||
state stateFn // a function that handles the current state
|
state parser.StateFn // a function that handles the current state
|
||||||
stack []stateFn // state function stack, for nested parsing
|
stack []parser.StateFn // state function stack, for nested parsing
|
||||||
|
len int // the total length of the input in bytes
|
||||||
pos int // current byte scanning position in the input
|
pos int // current byte scanning position in the input
|
||||||
newline bool // keep track of when we have scanned a newline
|
newline bool // keep track of when we have scanned a newline
|
||||||
cursorRow int // current row number in the input
|
cursorRow int // current row number in the input
|
||||||
cursorColumn int // current column position in the input
|
cursorColumn int // current column position in the input
|
||||||
width int // width of the last rune read, for supporting backup()
|
|
||||||
buffer StringBuffer // an efficient buffer, used to build string values
|
buffer StringBuffer // an efficient buffer, used to build string values
|
||||||
items chan Item // channel of resulting lexer items
|
items chan parser.Item // channel of resulting lexer items
|
||||||
item Item // the current item as reached by Next() and retrieved by Get()
|
item parser.Item // the current item as reached by Next() and retrieved by Get()
|
||||||
err *Error // an error when lexing failed, retrieved by Error()
|
err *Error // an error when lexing failed, retrieved by Error()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -35,46 +37,45 @@ func (err *Error) Error() string {
|
||||||
return err.Message
|
return err.Message
|
||||||
}
|
}
|
||||||
|
|
||||||
// Lex takes an input string and initializes the TOML lexer for it.
|
// New takes an input string and initializes the lexer for it.
|
||||||
func Lex(input string) *Lexer {
|
func New(input string) *Lexer {
|
||||||
return &Lexer{
|
return &Lexer{
|
||||||
input: input,
|
input: input,
|
||||||
|
len: len(input),
|
||||||
state: stateKeyValuePair,
|
state: stateKeyValuePair,
|
||||||
items: make(chan Item, 2),
|
items: make(chan parser.Item, 2),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Next advances to the next lexer item in the input string.
|
// Next advances to the next lexer item in the input string.
|
||||||
// When a valid item was found, then the boolean return parameter is returned.
|
// When a valid item was found, then the boolean return parameter will be true.
|
||||||
// On error or when reaching the end of the input, false is returned.
|
// On error or when reaching the end of the input, false is returned.
|
||||||
// When an error occurred, it will be set in the error return value.
|
// When an error occurred, it will be set in the error return value, nil otherwise.
|
||||||
func (l *Lexer) Next() (Item, *Error, bool) {
|
func (l *Lexer) Next() (parser.Item, *Error, bool) {
|
||||||
if l.state == nil {
|
|
||||||
panic("This should not happen: nil state reached, but entering Next()")
|
|
||||||
}
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case i := <-l.items:
|
case i := <-l.items:
|
||||||
if i.Type == ItemEOF {
|
switch {
|
||||||
|
case i.Type == ItemEOF:
|
||||||
return i, nil, false
|
return i, nil, false
|
||||||
}
|
case i.Type == ItemError:
|
||||||
if i.Type == ItemError {
|
|
||||||
l.err = &Error{i.Value, l.cursorRow, l.cursorColumn}
|
l.err = &Error{i.Value, l.cursorRow, l.cursorColumn}
|
||||||
return i, l.err, false
|
return i, l.err, false
|
||||||
}
|
default:
|
||||||
l.item = i
|
l.item = i
|
||||||
return i, nil, true
|
return i, nil, true
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
l.state = l.state(l)
|
l.state = l.state(l)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToArray returns lexer items as an array.
|
// ToArray returns lexer items as an array (mainly intended for testing purposes)
|
||||||
// When an error occurs during scanning, a partial result will be
|
// When an error occurs during scanning, a partial result will be
|
||||||
// returned, accompanied by the error that occurred.
|
// returned, accompanied by the error that occurred.
|
||||||
func (l *Lexer) ToArray() ([]Item, *Error) {
|
func (l *Lexer) ToArray() ([]parser.Item, *Error) {
|
||||||
var items []Item
|
var items []parser.Item
|
||||||
for {
|
for {
|
||||||
item, err, more := l.Next()
|
item, err, more := l.Next()
|
||||||
if !more {
|
if !more {
|
||||||
|
@ -100,25 +101,25 @@ func (l *Lexer) popState() stateFn {
|
||||||
|
|
||||||
// atEndOfFile returns true when there is no more data available in the input.
|
// atEndOfFile returns true when there is no more data available in the input.
|
||||||
func (l *Lexer) atEndOfFile() bool {
|
func (l *Lexer) atEndOfFile() bool {
|
||||||
return l.pos >= len(l.input)
|
return l.pos >= l.len
|
||||||
}
|
}
|
||||||
|
|
||||||
// emit passes a lexer item back to the client, including the provided string.
|
// emit passes a lexer item back to the client, including the provided string.
|
||||||
func (l *Lexer) emit(t itemType, s string) {
|
func (l *Lexer) emit(t parser.ItemType, s string) {
|
||||||
l.items <- Item{t, s}
|
l.items <- parser.Item{Type: t, Value: s}
|
||||||
l.buffer.Reset()
|
l.buffer.Reset()
|
||||||
}
|
}
|
||||||
|
|
||||||
// emitLiteral passes a lexer item back to the client, including the accumulated
|
// emitLiteral passes a lexer item back to the client, including the accumulated
|
||||||
// string buffer data as a literal string.
|
// string buffer data as a literal string.
|
||||||
func (l *Lexer) emitLiteral(t itemType) {
|
func (l *Lexer) emitLiteral(t parser.ItemType) {
|
||||||
l.emit(t, l.buffer.AsLiteralString())
|
l.emit(t, l.buffer.AsLiteralString())
|
||||||
}
|
}
|
||||||
|
|
||||||
// emitTrimmedLiteral passes a lexer item back to the client, including the
|
// emitTrimmedLiteral passes a lexer item back to the client, including the
|
||||||
// accumulated string buffer data as a literal string with whitespace
|
// accumulated string buffer data as a literal string with whitespace
|
||||||
// trimmed from it.
|
// trimmed from it.
|
||||||
func (l *Lexer) emitTrimmedLiteral(t itemType) {
|
func (l *Lexer) emitTrimmedLiteral(t parser.ItemType) {
|
||||||
l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
|
l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -127,7 +128,7 @@ func (l *Lexer) emitTrimmedLiteral(t itemType) {
|
||||||
// codes like \n, \t, \uXXXX, etc.)
|
// codes like \n, \t, \uXXXX, etc.)
|
||||||
// This method might return an error, in case there is data in the
|
// This method might return an error, in case there is data in the
|
||||||
// string buffer that is not valid for string interpretation.
|
// string buffer that is not valid for string interpretation.
|
||||||
func (l *Lexer) emitInterpreted(t itemType) error {
|
func (l *Lexer) emitInterpreted(t parser.ItemType) error {
|
||||||
s, err := l.buffer.AsInterpretedString()
|
s, err := l.buffer.AsInterpretedString()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -137,15 +138,10 @@ func (l *Lexer) emitInterpreted(t itemType) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// emitError emits a lexer error item back to the client.
|
// emitError emits a lexer error item back to the client.
|
||||||
func (l *Lexer) emitError(message string) {
|
func (l *Lexer) emitError(format string, args ...interface{}) stateFn {
|
||||||
|
message := fmt.Sprintf(format, args...)
|
||||||
l.emit(ItemError, message)
|
l.emit(ItemError, message)
|
||||||
}
|
return nil
|
||||||
|
|
||||||
// backup steps back one rune
|
|
||||||
// Can be called only once per call of next.
|
|
||||||
func (l *Lexer) backup() {
|
|
||||||
l.pos -= l.width
|
|
||||||
l.cursorColumn--
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// peek returns but does not advance to the next rune(s) in the input.
|
// peek returns but does not advance to the next rune(s) in the input.
|
||||||
|
@ -176,17 +172,40 @@ func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) {
|
||||||
return peeked, width, true
|
return peeked, width, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// acceptNext adds the specified amount of runes from the input to the string buffer.
|
// acceptAny adds the next rune from the input to the string buffer.
|
||||||
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
|
// If no rune could be read (end of file or invalid UTF8 data), then
|
||||||
func (l *Lexer) acceptNext(count int) bool {
|
// false is returned.
|
||||||
for i := 0; i < count; i++ {
|
func (l *Lexer) acceptAny() bool {
|
||||||
if r, ok := l.next(); ok {
|
if r, ok := l.next(); ok {
|
||||||
l.buffer.WriteRune(r)
|
l.buffer.WriteRune(r)
|
||||||
} else {
|
return true
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// accept adds the next rune to the string buffer and returns true if it's
|
||||||
|
// from the valid set of runes. Otherwise false is returned.
|
||||||
|
func (l *Lexer) accept(matches ...string) bool {
|
||||||
|
return l.acceptPattern(matches...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// acceptPattern adds the next runes to the string buffer, but only
|
||||||
|
// if the upcoming runes satisfy the provided pattern.
|
||||||
|
// When runes were added then true is returned, false otherwise.
|
||||||
|
func (l *Lexer) acceptPattern(pattern ...string) bool {
|
||||||
|
return l.progress(func(r rune) { l.buffer.WriteRune(r) }, pattern...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *Lexer) progress(callback func(rune), matches ...string) bool {
|
||||||
|
if runes, w, ok := l.match(matches...); ok {
|
||||||
|
l.pos += w
|
||||||
|
for _, r := range runes {
|
||||||
|
callback(r)
|
||||||
|
l.advanceCursor(r)
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// acceptConsecutive adds consecutive runes from the input to the string
|
// acceptConsecutive adds consecutive runes from the input to the string
|
||||||
|
@ -200,27 +219,9 @@ func (l *Lexer) acceptConsecutive(match string) bool {
|
||||||
return accepted
|
return accepted
|
||||||
}
|
}
|
||||||
|
|
||||||
// next returns the next rune from the input and a boolean indicating if
|
// advanceCursor advances the rune cursor one position in the
|
||||||
// reading the input was successful.
|
// input data. While doing so, it keeps track of newlines,
|
||||||
// When the end of input is reached, or an invalid UTF8 character is
|
// so we can report on row + column positions on error.
|
||||||
// read, then false is returned.
|
|
||||||
func (l *Lexer) next() (rune, bool) {
|
|
||||||
r, w, ok := l.peek()
|
|
||||||
if ok {
|
|
||||||
l.width = w
|
|
||||||
l.pos += w
|
|
||||||
l.advanceCursor(r)
|
|
||||||
return r, true
|
|
||||||
}
|
|
||||||
l.width = 0
|
|
||||||
if r == utf8.RuneError && w == 0 {
|
|
||||||
l.emitError("unexpected end of file")
|
|
||||||
} else {
|
|
||||||
l.emitError("invalid UTF8 character")
|
|
||||||
}
|
|
||||||
return r, false
|
|
||||||
}
|
|
||||||
|
|
||||||
func (l *Lexer) advanceCursor(r rune) {
|
func (l *Lexer) advanceCursor(r rune) {
|
||||||
if l.newline {
|
if l.newline {
|
||||||
l.cursorColumn = 0
|
l.cursorColumn = 0
|
||||||
|
@ -233,40 +234,20 @@ func (l *Lexer) advanceCursor(r rune) {
|
||||||
|
|
||||||
// skipMatching skips runes, but only when all provided matches are satisfied.
|
// skipMatching skips runes, but only when all provided matches are satisfied.
|
||||||
// Returns true when one or more runes were skipped.
|
// Returns true when one or more runes were skipped.
|
||||||
func (l *Lexer) skipMatching(matches ...string) bool {
|
func (l *Lexer) skipMatching(pattern ...string) bool {
|
||||||
if runes, w, ok := l.match(matches...); ok {
|
return l.progress(func(r rune) {}, pattern...)
|
||||||
l.pos += w
|
|
||||||
for _, r := range runes {
|
|
||||||
l.advanceCursor(r)
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// skipConsecutive skips consecutive runes from the provided match.
|
// skipConsecutive skips consecutive runes from the provided match.
|
||||||
// Returns true when one or more runes were skipped.
|
// Returns true when one or more runes were skipped.
|
||||||
func (l *Lexer) skipConsecutive(match string) bool {
|
func (l *Lexer) skipConsecutive(pattern string) bool {
|
||||||
didSkip := false
|
didSkip := false
|
||||||
for l.skipMatching(match) {
|
for l.skipMatching(pattern) {
|
||||||
didSkip = true
|
didSkip = true
|
||||||
}
|
}
|
||||||
return didSkip
|
return didSkip
|
||||||
}
|
}
|
||||||
|
|
||||||
// accept adds the next rune to the string buffer and returns true if it's
|
|
||||||
// from the valid set of runes. Otherwise false is returned.
|
|
||||||
func (l *Lexer) accept(match string) bool {
|
|
||||||
if r, ok := l.next(); ok {
|
|
||||||
if strings.IndexRune(match, r) >= 0 {
|
|
||||||
l.buffer.WriteRune(r)
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
l.backup()
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// upcoming checks if the upcoming runes satisfy the provided rune matches.
|
// upcoming checks if the upcoming runes satisfy the provided rune matches.
|
||||||
// This is a lot like the match method, with the difference that
|
// This is a lot like the match method, with the difference that
|
||||||
// this one only returns the boolean value.
|
// this one only returns the boolean value.
|
||||||
|
@ -275,6 +256,25 @@ func (l *Lexer) upcoming(matches ...string) bool {
|
||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// next returns the next rune from the input and a boolean indicating if
|
||||||
|
// reading the input was successful.
|
||||||
|
// When the end of input is reached, or an invalid UTF8 character is
|
||||||
|
// read, then false is returned.
|
||||||
|
func (l *Lexer) next() (rune, bool) {
|
||||||
|
r, w, ok := l.peek()
|
||||||
|
if ok {
|
||||||
|
l.pos += w
|
||||||
|
l.advanceCursor(r)
|
||||||
|
return r, true
|
||||||
|
}
|
||||||
|
if r == utf8.RuneError && w == 0 {
|
||||||
|
l.emitError("unexpected end of file")
|
||||||
|
} else {
|
||||||
|
l.emitError("invalid UTF8 character")
|
||||||
|
}
|
||||||
|
return r, false
|
||||||
|
}
|
||||||
|
|
||||||
// match checks if the upcoming runes satisfy the provided rune matches.
|
// match checks if the upcoming runes satisfy the provided rune matches.
|
||||||
// It returns a slice of runes that were found, their total byte width
|
// It returns a slice of runes that were found, their total byte width
|
||||||
// and a boolean indicating whether or not all provided matches matched
|
// and a boolean indicating whether or not all provided matches matched
|
||||||
|
@ -292,24 +292,14 @@ func (l *Lexer) match(matches ...string) ([]rune, int, bool) {
|
||||||
return peeked, width, false
|
return peeked, width, false
|
||||||
}
|
}
|
||||||
|
|
||||||
// error returns an error token and terminates the scan
|
|
||||||
// by returning nil to l.run.
|
|
||||||
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
|
|
||||||
l.items <- Item{
|
|
||||||
ItemError,
|
|
||||||
fmt.Sprintf(format, args...),
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (l *Lexer) unexpectedInputError(expected string) stateFn {
|
func (l *Lexer) unexpectedInputError(expected string) stateFn {
|
||||||
// next() takes care of error messages for ok == false.
|
// next() takes care of emitting errors for ok == false.
|
||||||
if r, ok := l.next(); ok {
|
if r, ok := l.next(); ok {
|
||||||
l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
|
return l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
|
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
|
||||||
return l.errorf("Unexpected end of file (expected %s)", expected)
|
return l.emitError("Unexpected end of file (expected %s)", expected)
|
||||||
}
|
}
|
148
lexer/states.go
148
lexer/states.go
|
@ -1,8 +1,6 @@
|
||||||
package lexer
|
package lexer
|
||||||
|
|
||||||
// stateFn represents the state of the lexer as a function
|
import "github.com/mmakaay/toml/parser"
|
||||||
// that returns the next state.
|
|
||||||
type stateFn func(*Lexer) stateFn
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
whitespace string = " \t"
|
whitespace string = " \t"
|
||||||
|
@ -28,59 +26,65 @@ const (
|
||||||
longUtf8Escape string = "U"
|
longUtf8Escape string = "U"
|
||||||
)
|
)
|
||||||
|
|
||||||
func stateKeyValuePair(l *Lexer) stateFn {
|
// NewParser creates a new parser, using the provided input string
|
||||||
l.skipConsecutive(whitespace + carriageReturn + newline)
|
// as the data to parse.
|
||||||
if l.skipMatching(hash) {
|
func NewParser(input string) *parser.Parser {
|
||||||
|
return parser.New(input, stateKeyValuePair)
|
||||||
|
}
|
||||||
|
|
||||||
|
func stateKeyValuePair(l *parser.Parser) parser.StateFn {
|
||||||
|
l.SkipConsecutive(whitespace + carriageReturn + newline)
|
||||||
|
if l.SkipMatching(hash) {
|
||||||
return stateComment
|
return stateComment
|
||||||
}
|
}
|
||||||
if l.upcoming(startOfKey) {
|
if l.Upcoming(startOfKey) {
|
||||||
return stateKey
|
return stateKey
|
||||||
}
|
}
|
||||||
return stateEndOfFile
|
return stateEndOfFile
|
||||||
}
|
}
|
||||||
|
|
||||||
// A '#' hash symbol marks the rest of the line as a comment.
|
// A '#' hash symbol marks the rest of the line as a comment.
|
||||||
func stateComment(l *Lexer) stateFn {
|
func stateComment(l *parser.Parser) parser.StateFn {
|
||||||
for {
|
for {
|
||||||
switch {
|
switch {
|
||||||
case l.atEndOfFile() || l.skipMatching(newline):
|
case l.AtEndOfFile() || l.SkipMatching(newline):
|
||||||
l.emitTrimmedLiteral(ItemComment)
|
l.EmitLiteralTrim(ItemComment)
|
||||||
return stateKeyValuePair
|
return stateKeyValuePair
|
||||||
default:
|
default:
|
||||||
if !l.acceptNext(1) {
|
if !l.AcceptAny() {
|
||||||
return l.unexpectedInputError("comment")
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// A key may be either bare, quoted or dotted.
|
// A key may be either bare, quoted or dotted.
|
||||||
func stateKey(l *Lexer) stateFn {
|
func stateKey(l *parser.Parser) parser.StateFn {
|
||||||
if l.accept(bareKeyChars) {
|
if l.AcceptMatching(bareKeyChars) {
|
||||||
return statebareKeyChars
|
return statebareKeyChars
|
||||||
}
|
}
|
||||||
return l.unexpectedInputError("a valid key name")
|
return l.UnexpectedInputError("a valid key name")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bare keys may only contain ASCII letters, ASCII digits,
|
// Bare keys may only contain ASCII letters, ASCII digits,
|
||||||
// underscores, and dashes (A-Za-z0-9_-). Note that bare
|
// underscores, and dashes (A-Za-z0-9_-). Note that bare
|
||||||
// keys are allowed to be composed of only ASCII digits,
|
// keys are allowed to be composed of only ASCII digits,
|
||||||
// e.g. 1234, but are always interpreted as strings.
|
// e.g. 1234, but are always interpreted as strings.
|
||||||
func statebareKeyChars(l *Lexer) stateFn {
|
func statebareKeyChars(l *parser.Parser) parser.StateFn {
|
||||||
l.acceptConsecutive(bareKeyChars)
|
l.AcceptConsecutive(bareKeyChars)
|
||||||
l.emitLiteral(ItemKey)
|
l.EmitLiteral(ItemKey)
|
||||||
return stateEndOfKeyOrKeyDot
|
return stateEndOfKeyOrKeyDot
|
||||||
}
|
}
|
||||||
|
|
||||||
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
|
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
|
||||||
// This allows for grouping similar properties together:
|
// This allows for grouping similar properties together:
|
||||||
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
|
func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
|
||||||
// Whitespace around dot-separated parts is ignored, however,
|
// Whitespace around dot-separated parts is ignored, however,
|
||||||
// best practice is to not use any extraneous whitespace.
|
// best practice is to not use any extraneous whitespace.
|
||||||
l.skipConsecutive(whitespace)
|
l.SkipConsecutive(whitespace)
|
||||||
if l.skipMatching(dot) {
|
if l.SkipMatching(dot) {
|
||||||
l.emit(ItemKeyDot, "")
|
l.Emit(ItemKeyDot, "")
|
||||||
l.skipConsecutive(whitespace)
|
l.SkipConsecutive(whitespace)
|
||||||
return stateKey
|
return stateKey
|
||||||
}
|
}
|
||||||
return stateKeyAssignment
|
return stateKeyAssignment
|
||||||
|
@ -90,62 +94,69 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
|
||||||
// Whitespace is ignored around key names and values. The key, equals
|
// Whitespace is ignored around key names and values. The key, equals
|
||||||
// sign, and value must be on the same line (though some values can
|
// sign, and value must be on the same line (though some values can
|
||||||
// be broken over multiple lines).
|
// be broken over multiple lines).
|
||||||
func stateKeyAssignment(l *Lexer) stateFn {
|
func stateKeyAssignment(l *parser.Parser) parser.StateFn {
|
||||||
l.skipConsecutive(whitespace)
|
l.SkipConsecutive(whitespace)
|
||||||
if l.skipMatching(equal) {
|
if l.SkipMatching(equal) {
|
||||||
l.emit(ItemAssignment, "")
|
l.Emit(ItemAssignment, "")
|
||||||
l.skipConsecutive(whitespace)
|
l.SkipConsecutive(whitespace)
|
||||||
return stateValue
|
return stateValue
|
||||||
}
|
}
|
||||||
return l.unexpectedInputError("a value assignment")
|
return l.UnexpectedInputError("a value assignment")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Values must be of the following types: String, Integer, Float, Boolean,
|
// Values must be of the following types: String, Integer, Float, Boolean,
|
||||||
// Datetime, Array, or Inline Table. Unspecified values are invalid.
|
// Datetime, Array, or Inline Table. Unspecified values are invalid.
|
||||||
func stateValue(l *Lexer) stateFn {
|
func stateValue(l *parser.Parser) parser.StateFn {
|
||||||
l.skipConsecutive(whitespace)
|
l.SkipConsecutive(whitespace)
|
||||||
if l.upcoming(quoteChars) {
|
if l.Upcoming(quoteChars) {
|
||||||
return stateStringValue
|
return stateStringValue
|
||||||
}
|
}
|
||||||
return l.unexpectedInputError("a value")
|
return l.UnexpectedInputError("a value")
|
||||||
}
|
}
|
||||||
|
|
||||||
// There are four ways to express strings: basic, multi-line basic, literal,
|
// There are four ways to express strings: basic, multi-line basic, literal,
|
||||||
// and multi-line literal. All strings must contain only valid UTF-8 characters.
|
// and multi-line literal. All strings must contain only valid UTF-8 characters.
|
||||||
func stateStringValue(l *Lexer) stateFn {
|
func stateStringValue(l *parser.Parser) parser.StateFn {
|
||||||
switch {
|
switch {
|
||||||
case l.skipMatching(doubleQuote, doubleQuote, doubleQuote):
|
case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
|
||||||
// Multi-line basic strings are surrounded by three quotation marks on each side.
|
// Multi-line basic strings are surrounded by three quotation marks on each side.
|
||||||
return stateMultiLineBasicString
|
return stateMultiLineBasicString
|
||||||
case l.skipMatching(doubleQuote):
|
case l.SkipMatching(doubleQuote):
|
||||||
// Basic strings are surrounded by quotation marks.
|
// Basic strings are surrounded by quotation marks.
|
||||||
return stateBasicStringValue
|
return stateSingleLineBasicString
|
||||||
}
|
}
|
||||||
return l.unexpectedInputError("a string value")
|
return l.UnexpectedInputError("a string value")
|
||||||
}
|
}
|
||||||
|
|
||||||
func stateBasicStringValue(l *Lexer) stateFn {
|
func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
|
||||||
if l.upcoming(doubleQuote, doubleQuote) {
|
if l.Upcoming(doubleQuote, doubleQuote) {
|
||||||
return stateMultiLineBasicString
|
return stateMultiLineBasicString
|
||||||
}
|
}
|
||||||
return stateBasicString
|
return stateBasicString
|
||||||
}
|
}
|
||||||
|
|
||||||
const invalidBasicStringCharacters string = "" +
|
func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
|
||||||
|
l.EmitError("Not yet implemented")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Any Unicode character may be used except those that must be escaped:
|
||||||
|
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
||||||
|
const invalidBasicStringCharacters string = "\"\\" +
|
||||||
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
|
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
|
||||||
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
|
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
|
||||||
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
|
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
|
||||||
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
|
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
|
||||||
"\u007F"
|
"\u007F"
|
||||||
|
|
||||||
func stateParseBasicString(l *Lexer) stateFn {
|
func stateParseBasicString(l *parser.Parser) parser.StateFn {
|
||||||
for {
|
for {
|
||||||
switch {
|
switch {
|
||||||
case l.atEndOfFile():
|
case l.AtEndOfFile():
|
||||||
return l.unexpectedEndOfFile("basic string token")
|
return l.UnexpectedEndOfFile("basic string token")
|
||||||
case l.skipMatching(doubleQuote):
|
case l.SkipMatching(doubleQuote):
|
||||||
return l.popState()
|
return l.PopState()
|
||||||
case l.upcoming(backslash, escapeChars):
|
case l.AcceptMatching(backslash, escapeChars):
|
||||||
// For convenience, some popular characters have a compact escape sequence.
|
// For convenience, some popular characters have a compact escape sequence.
|
||||||
// \b - backspace (U+0008)
|
// \b - backspace (U+0008)
|
||||||
// \t - tab (U+0009)
|
// \t - tab (U+0009)
|
||||||
|
@ -154,50 +165,45 @@ func stateParseBasicString(l *Lexer) stateFn {
|
||||||
// \r - carriage return (U+000D)
|
// \r - carriage return (U+000D)
|
||||||
// \" - quote (U+0022)
|
// \" - quote (U+0022)
|
||||||
// \\ - backslash (U+005C)
|
// \\ - backslash (U+005C)
|
||||||
l.acceptNext(2)
|
case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
|
||||||
case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
|
|
||||||
// \uXXXX - unicode (U+XXXX)
|
// \uXXXX - unicode (U+XXXX)
|
||||||
l.acceptNext(6)
|
case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
|
||||||
case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
|
|
||||||
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
||||||
l.acceptNext(10)
|
case l.Upcoming(backslash):
|
||||||
case l.upcoming(backslash):
|
|
||||||
// All other escape sequences not listed above are reserved and,
|
// All other escape sequences not listed above are reserved and,
|
||||||
// if used, TOML should produce an error.
|
// if used, TOML should produce an error.
|
||||||
return l.errorf("Invalid escape sequence in basic string")
|
return l.EmitError("Invalid escape sequence in basic string")
|
||||||
case l.upcoming(invalidBasicStringCharacters):
|
case l.Upcoming(invalidBasicStringCharacters):
|
||||||
// Any Unicode character may be used except those that must be escaped:
|
// Any Unicode character may be used except those that must be escaped:
|
||||||
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
||||||
r, _ := l.next()
|
r, _, _ := l.Match(invalidBasicStringCharacters)
|
||||||
return l.errorf("Invalid character in basic string: %q", r)
|
l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
|
||||||
|
return nil
|
||||||
default:
|
default:
|
||||||
if !l.acceptNext(1) {
|
if !l.AcceptAny() {
|
||||||
return l.unexpectedInputError("string value")
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func stateBasicString(l *Lexer) stateFn {
|
func stateBasicString(l *parser.Parser) parser.StateFn {
|
||||||
l.pushState(func(l *Lexer) stateFn {
|
l.PushState(func(l *parser.Parser) parser.StateFn {
|
||||||
err := l.emitInterpreted(ItemString)
|
err := l.EmitInterpreted(ItemString)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return l.errorf("Invalid data in string: %s", err)
|
l.EmitError("Invalid data in string: %s", err)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
return stateKeyValuePair
|
return stateKeyValuePair
|
||||||
})
|
})
|
||||||
return stateParseBasicString
|
return stateParseBasicString
|
||||||
}
|
}
|
||||||
|
|
||||||
func stateMultiLineBasicString(l *Lexer) stateFn {
|
func stateEndOfFile(l *parser.Parser) parser.StateFn {
|
||||||
return l.errorf("Not yet implemented")
|
if l.AtEndOfFile() {
|
||||||
}
|
l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser?
|
||||||
|
|
||||||
func stateEndOfFile(l *Lexer) stateFn {
|
|
||||||
if l.atEndOfFile() {
|
|
||||||
l.emit(ItemEOF, "EOF")
|
|
||||||
} else {
|
} else {
|
||||||
l.unexpectedInputError("end of file")
|
l.UnexpectedInputError("end of file")
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
|
func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
|
||||||
_, err := lexer.Lex("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
|
_, err := lexer.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
|
||||||
t.Logf("Got error: %s", err.Error())
|
t.Logf("Got error: %s", err.Error())
|
||||||
if err.Row != 4 {
|
if err.Row != 4 {
|
||||||
t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4)
|
t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4)
|
||||||
|
@ -19,21 +19,20 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestEmptyInput(t *testing.T) {
|
||||||
|
runStatesT(t, statesT{"empty string", "", "", ""})
|
||||||
|
}
|
||||||
|
|
||||||
func TestInvalidUtf8Data(t *testing.T) {
|
func TestInvalidUtf8Data(t *testing.T) {
|
||||||
runStatesTs(t, []statesT{
|
runStatesTs(t, []statesT{
|
||||||
{"inside comment", "# \xbc", "", "invalid UTF8 character"},
|
{"inside comment", "# \xbc", "", "invalid UTF8 character"},
|
||||||
{"bare key 1", "\xbc", "", "invalid UTF8 character"},
|
{"bare key 1", "\xbc", "", "invalid UTF8 character"},
|
||||||
{"bare key 2", "key\xbc", "", "invalid UTF8 character"},
|
{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character"},
|
||||||
{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
|
{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
|
||||||
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
|
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
|
||||||
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
|
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestEmptyInput(t *testing.T) {
|
|
||||||
runStatesT(t, statesT{"empty string", "", "", ""})
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestWhiteSpaceAndNewlines(t *testing.T) {
|
func TestWhiteSpaceAndNewlines(t *testing.T) {
|
||||||
runStatesTs(t, []statesT{
|
runStatesTs(t, []statesT{
|
||||||
{"space", " ", "", ""},
|
{"space", " ", "", ""},
|
||||||
|
@ -61,13 +60,13 @@ func TestKeyWithoutAssignment(t *testing.T) {
|
||||||
err := "unexpected end of file"
|
err := "unexpected end of file"
|
||||||
runStatesTs(t, []statesT{
|
runStatesTs(t, []statesT{
|
||||||
{"bare with whitespace", " a ", "[a]", err},
|
{"bare with whitespace", " a ", "[a]", err},
|
||||||
{"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err},
|
{"bare lower", "abcdefghijklmnopqrstuvwxyz", "[abcdefghijklmnopqrstuvwxyz]", err},
|
||||||
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", err},
|
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err},
|
||||||
{"bare numbers", "0123456789", "", err},
|
{"bare numbers", "0123456789", "[0123456789]", err},
|
||||||
{"bare underscore", "_", "", err},
|
{"bare underscore", "_", "[_]", err},
|
||||||
{"bare dash", "-", "", err},
|
{"bare dash", "-", "[-]", err},
|
||||||
{"bare big mix", "-hey_good_Lookin123-", "", err},
|
{"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err},
|
||||||
{"bare dotted", "a._.c", "[a].[_].", err},
|
{"bare dotted", "a._.c", "[a].[_].[c]", err},
|
||||||
{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
|
{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -90,9 +89,9 @@ func TestUnterminatedBasicString(t *testing.T) {
|
||||||
|
|
||||||
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
|
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
|
||||||
runStatesTs(t, []statesT{
|
runStatesTs(t, []statesT{
|
||||||
{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`},
|
{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`},
|
||||||
{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`},
|
{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`},
|
||||||
{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`},
|
{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`},
|
||||||
})
|
})
|
||||||
|
|
||||||
// No need to write all test cases for disallowed characters by hand.
|
// No need to write all test cases for disallowed characters by hand.
|
||||||
|
@ -100,7 +99,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
|
||||||
name := fmt.Sprintf("control character %x", rune(i))
|
name := fmt.Sprintf("control character %x", rune(i))
|
||||||
runStatesT(
|
runStatesT(
|
||||||
t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
|
t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
|
||||||
fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))})
|
fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -163,7 +162,7 @@ func runStatesTs(t *testing.T, tests []statesT) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func runStatesT(t *testing.T, c statesT) {
|
func runStatesT(t *testing.T, c statesT) {
|
||||||
l, err := lexer.Lex(c.in).ToArray()
|
l, err := lexer.NewParser(c.in).ToArray()
|
||||||
if err == nil && c.err != "" {
|
if err == nil && c.err != "" {
|
||||||
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
|
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
|
||||||
}
|
}
|
||||||
|
@ -179,14 +178,15 @@ func runStatesT(t *testing.T, c statesT) {
|
||||||
t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
|
t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
|
||||||
}
|
}
|
||||||
for i, e := range expected {
|
for i, e := range expected {
|
||||||
if l[i].String() != e {
|
v := lexer.ParserItemToString(l[i])
|
||||||
t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i])
|
if v != e {
|
||||||
|
t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
case string:
|
case string:
|
||||||
a := make([]string, len(l))
|
a := make([]string, len(l))
|
||||||
for _, v := range l {
|
for _, v := range l {
|
||||||
a = append(a, v.String())
|
a = append(a, lexer.ParserItemToString(v))
|
||||||
}
|
}
|
||||||
actual := strings.Join(a, "")
|
actual := strings.Join(a, "")
|
||||||
if actual != expected {
|
if actual != expected {
|
||||||
|
|
|
@ -0,0 +1,261 @@
|
||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"unicode/utf8"
|
||||||
|
)
|
||||||
|
|
||||||
|
// New takes an input string and a start state,
|
||||||
|
// and initializes the parser for it.
|
||||||
|
func New(input string, startState StateFn) *Parser {
|
||||||
|
return &Parser{
|
||||||
|
input: input,
|
||||||
|
len: len(input),
|
||||||
|
state: startState,
|
||||||
|
items: make(chan Item, 2),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// PushState adds the state function to the state stack.
|
||||||
|
// This is used for implementing nested parsing.
|
||||||
|
func (l *Parser) PushState(state StateFn) {
|
||||||
|
l.stack = append(l.stack, state)
|
||||||
|
}
|
||||||
|
|
||||||
|
// PopState pops the last pushed state from the state stack.
|
||||||
|
func (l *Parser) PopState() StateFn {
|
||||||
|
last := len(l.stack) - 1
|
||||||
|
head, tail := l.stack[:last], l.stack[last]
|
||||||
|
l.stack = head
|
||||||
|
return tail
|
||||||
|
}
|
||||||
|
|
||||||
|
// AtEndOfFile returns true when there is no more data available in the input.
|
||||||
|
func (l *Parser) AtEndOfFile() bool {
|
||||||
|
return l.pos >= l.len
|
||||||
|
}
|
||||||
|
|
||||||
|
// Emit passes a Parser item to the client, including the provided string.
|
||||||
|
func (l *Parser) Emit(t ItemType, s string) {
|
||||||
|
l.items <- Item{t, s}
|
||||||
|
l.buffer.Reset()
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmitLiteral passes a Parser item to the client, including the accumulated
|
||||||
|
// string buffer data as a literal string.
|
||||||
|
func (l *Parser) EmitLiteral(t ItemType) {
|
||||||
|
l.Emit(t, l.buffer.AsLiteralString())
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmitLiteralTrim passes a Parser item to the client, including the
|
||||||
|
// accumulated string buffer data as a literal string with whitespace
|
||||||
|
// trimmed from it.
|
||||||
|
func (l *Parser) EmitLiteralTrim(t ItemType) {
|
||||||
|
l.Emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmitInterpreted passes a Parser item to the client, including the
|
||||||
|
// accumulated string buffer data as a Go double-quoted interpreted string
|
||||||
|
// (handling escape codes like \n, \t, \uXXXX, etc.)
|
||||||
|
// This method might return an error, in case there is data in the
|
||||||
|
// string buffer that is not valid for string interpretation.
|
||||||
|
func (l *Parser) EmitInterpreted(t ItemType) error {
|
||||||
|
s, err := l.buffer.AsInterpretedString()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
l.Emit(t, s)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// EmitError emits a Parser error item to the client.
|
||||||
|
func (l *Parser) EmitError(format string, args ...interface{}) StateFn {
|
||||||
|
message := fmt.Sprintf(format, args...)
|
||||||
|
l.Emit(ItemError, message)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Match checks if the upcoming runes satisfy all provided patterns.
|
||||||
|
// It returns a slice of runes that were found, their total byte width
|
||||||
|
// and a boolean indicating whether or not all provided patterns were
|
||||||
|
// satisfied by the input data.
|
||||||
|
func (l *Parser) Match(patterns ...string) ([]rune, int, bool) {
|
||||||
|
peeked, width, ok := l.peekMulti(len(patterns))
|
||||||
|
if ok {
|
||||||
|
for i, r := range patterns {
|
||||||
|
if strings.IndexRune(r, peeked[i]) < 0 {
|
||||||
|
return peeked, width, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return peeked, width, true
|
||||||
|
}
|
||||||
|
return peeked, width, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Upcoming checks if the upcoming runes satisfy all provided patterns.
|
||||||
|
// Returns true if all provided patterns are satisfied.
|
||||||
|
func (l *Parser) Upcoming(patterns ...string) bool {
|
||||||
|
_, _, ok := l.Match(patterns...)
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
// AcceptAny adds the next rune from the input to the string buffer.
|
||||||
|
// If no rune could be read (end of file or invalid UTF8 data),
|
||||||
|
// then false is returned.
|
||||||
|
func (l *Parser) AcceptAny() bool {
|
||||||
|
if r, ok := l.next(); ok {
|
||||||
|
l.buffer.WriteRune(r)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// AcceptMatching adds the next runes to the string buffer, but only
|
||||||
|
// if the upcoming runes satisfy the provided patterns.
|
||||||
|
// When runes were added then true is returned, false otherwise.
|
||||||
|
func (l *Parser) AcceptMatching(patterns ...string) bool {
|
||||||
|
return l.progress(func(r rune) { l.buffer.WriteRune(r) }, patterns...)
|
||||||
|
}
|
||||||
|
|
||||||
|
// AcceptConsecutive adds consecutive runes from the input to the string
|
||||||
|
// buffer, as long as they exist in the pattern.
|
||||||
|
// If any runes were added then true is returned, false otherwise.
|
||||||
|
func (l *Parser) AcceptConsecutive(pattern string) bool {
|
||||||
|
accepted := false
|
||||||
|
for l.AcceptMatching(pattern) {
|
||||||
|
accepted = true
|
||||||
|
}
|
||||||
|
return accepted
|
||||||
|
}
|
||||||
|
|
||||||
|
// SkipMatching skips runes, but only when all provided patterns are satisfied.
|
||||||
|
// Returns true when one or more runes were skipped.
|
||||||
|
func (l *Parser) SkipMatching(patterns ...string) bool {
|
||||||
|
if runes, w, ok := l.Match(patterns...); ok {
|
||||||
|
l.pos += w
|
||||||
|
for _, r := range runes {
|
||||||
|
l.advanceCursor(r)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// SkipConsecutive skips consecutive runes from the provided pattern.
|
||||||
|
// Returns true when one or more runes were skipped.
|
||||||
|
func (l *Parser) SkipConsecutive(pattern string) bool {
|
||||||
|
didSkip := false
|
||||||
|
for l.SkipMatching(pattern) {
|
||||||
|
didSkip = true
|
||||||
|
}
|
||||||
|
return didSkip
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// EMIT DATA AND ERRORS
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// UnexpectedInputError is used by a parser implementation to emit an
|
||||||
|
// error item that tells the client that an unexpected rune was
|
||||||
|
// encountered in the input.
|
||||||
|
// The parameter 'expected' is used to provide some context to the error.
|
||||||
|
func (l *Parser) UnexpectedInputError(expected string) StateFn {
|
||||||
|
// next() takes care of error messages for ok == false.
|
||||||
|
if r, ok := l.next(); ok {
|
||||||
|
return l.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnexpectedEndOfFile is used by a parser implementation to emit an
|
||||||
|
// error item that tells the client that more data was expected from
|
||||||
|
// the input.
|
||||||
|
// The parameter 'expected' is used to provide some context to the error.
|
||||||
|
func (l *Parser) UnexpectedEndOfFile(expected string) StateFn {
|
||||||
|
return l.EmitError("Unexpected end of file (expected %s)", expected)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// LEXER : our lexer is quite low level, it only returns UTF8 runes
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
// peek returns but does not advance to the next rune(s) in the input.
|
||||||
|
// Returns the rune, its width and a boolean. The boolean will be false in case
|
||||||
|
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
|
||||||
|
func (l *Parser) peek() (rune, int, bool) {
|
||||||
|
peeked, width := utf8.DecodeRuneInString(l.input[l.pos:])
|
||||||
|
return peeked, width, peeked != utf8.RuneError
|
||||||
|
}
|
||||||
|
|
||||||
|
// peekMulti takes a peek at multiple upcoming runes in the input.
|
||||||
|
// Returns a slice of runes, their total width in bytes and a boolean.
|
||||||
|
// The boolean will be false in case less runes can be peeked than
|
||||||
|
// the requested amount (end of data or invalid UTF8 character).
|
||||||
|
func (l *Parser) peekMulti(amount int) ([]rune, int, bool) {
|
||||||
|
width := 0
|
||||||
|
var peeked []rune
|
||||||
|
for i := 0; i < amount; i++ {
|
||||||
|
r, w := utf8.DecodeRuneInString(l.input[l.pos+width:])
|
||||||
|
switch {
|
||||||
|
case r == utf8.RuneError:
|
||||||
|
return peeked, width, false
|
||||||
|
default:
|
||||||
|
width += w
|
||||||
|
peeked = append(peeked, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return peeked, width, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// progress moves the cursor forward in the input, returning one rune
|
||||||
|
// for every specified pattern. The cursor is only moved forward when
|
||||||
|
// all patterns are satisfied.
|
||||||
|
// Returns true when all patterns were satisfied and the cursor was
|
||||||
|
// moved forward, false otherwise.
|
||||||
|
// A callback function can be provided to specify what to do with
|
||||||
|
// the runes that are encountered in the input.
|
||||||
|
func (l *Parser) progress(callback func(rune), patterns ...string) bool {
|
||||||
|
if runes, w, ok := l.Match(patterns...); ok {
|
||||||
|
l.pos += w
|
||||||
|
for _, r := range runes {
|
||||||
|
callback(r)
|
||||||
|
l.advanceCursor(r)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// next returns the next rune from the input and a boolean indicating if
|
||||||
|
// reading the input was successful.
|
||||||
|
// When the end of input is reached, or an invalid UTF8 character is
|
||||||
|
// read, then false is returned. Both are considered error cases,
|
||||||
|
// and for that reason these automatically emit an error to the client.
|
||||||
|
func (l *Parser) next() (rune, bool) {
|
||||||
|
r, w, ok := l.peek()
|
||||||
|
if ok {
|
||||||
|
l.pos += w
|
||||||
|
l.advanceCursor(r)
|
||||||
|
return r, true
|
||||||
|
}
|
||||||
|
if r == utf8.RuneError && w == 0 {
|
||||||
|
l.EmitError("unexpected end of file")
|
||||||
|
} else {
|
||||||
|
l.EmitError("invalid UTF8 character")
|
||||||
|
}
|
||||||
|
return r, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// advanceCursor advances the rune cursor one position in the
|
||||||
|
// input data. While doing so, it keeps track of newlines,
|
||||||
|
// so we can report on row + column positions on error.
|
||||||
|
func (l *Parser) advanceCursor(r rune) {
|
||||||
|
if l.newline {
|
||||||
|
l.cursorColumn = 0
|
||||||
|
l.cursorRow++
|
||||||
|
} else {
|
||||||
|
l.cursorColumn++
|
||||||
|
}
|
||||||
|
l.newline = r == '\n'
|
||||||
|
}
|
|
@ -0,0 +1,62 @@
|
||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// StringBuffer is a string buffer implementation, which is used by the parser
|
||||||
|
// to efficiently accumulate runes from the input and eventually turn these
|
||||||
|
// into a string, either literal or interpreted.
|
||||||
|
type StringBuffer struct {
|
||||||
|
buffer bytes.Buffer
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset resets the string buffer, in order to build a new string.
|
||||||
|
func (b *StringBuffer) Reset() *StringBuffer {
|
||||||
|
b.buffer.Reset()
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteString adds the runes of the input string to the string buffer.
|
||||||
|
func (b *StringBuffer) WriteString(s string) *StringBuffer {
|
||||||
|
for _, r := range s {
|
||||||
|
b.WriteRune(r)
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// WriteRune adds a single rune to the string buffer.
|
||||||
|
func (b *StringBuffer) WriteRune(r rune) *StringBuffer {
|
||||||
|
b.buffer.WriteRune(r)
|
||||||
|
return b
|
||||||
|
}
|
||||||
|
|
||||||
|
// AsLiteralString returns the string buffer as a literal string.
|
||||||
|
// Literal means that no escape sequences are processed.
|
||||||
|
func (b *StringBuffer) AsLiteralString() string {
|
||||||
|
return b.buffer.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// AsInterpretedString returns the string in its interpreted form.
|
||||||
|
// Interpreted means that escape sequences are handled in the way that Go would
|
||||||
|
// have, had it been inside double quotes. It translates for example escape
|
||||||
|
// sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their string
|
||||||
|
// representations.
|
||||||
|
// Since the input might contain invalid escape sequences, this method
|
||||||
|
// also returns an error. When an error is returned, the returned string will
|
||||||
|
// contain the string as far as it could be interpreted.
|
||||||
|
func (b *StringBuffer) AsInterpretedString() (string, error) {
|
||||||
|
var sb strings.Builder
|
||||||
|
tail := b.buffer.String()
|
||||||
|
for len(tail) > 0 {
|
||||||
|
r, _, newtail, err := strconv.UnquoteChar(tail, '"')
|
||||||
|
if err != nil {
|
||||||
|
return sb.String(), err
|
||||||
|
}
|
||||||
|
tail = newtail
|
||||||
|
sb.WriteRune(r)
|
||||||
|
}
|
||||||
|
return sb.String(), nil
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
package parser_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/mmakaay/toml/parser"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
|
||||||
|
var b parser.StringBuffer
|
||||||
|
s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString()
|
||||||
|
s2 := b.AsLiteralString()
|
||||||
|
if s1 != "hi\nthere" {
|
||||||
|
t.Fatalf("Did not get expected string\"X\" for try 1, but %q", s1)
|
||||||
|
}
|
||||||
|
if s2 != "hi\\nthere" {
|
||||||
|
t.Fatalf("Did not get expected string\"X\" for try 2, but %q", s2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestResetResetsBuffer(t *testing.T) {
|
||||||
|
var b parser.StringBuffer
|
||||||
|
s := b.WriteRune('X').Reset().AsLiteralString()
|
||||||
|
if s != "" {
|
||||||
|
t.Fatalf("Did not get expected empty string, but %q", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAsLiteralString(t *testing.T) {
|
||||||
|
b := parser.StringBuffer{}
|
||||||
|
for _, c := range []stringbufT{
|
||||||
|
{"empty string", ``, ``, OK},
|
||||||
|
{"simple string", `Simple string!`, `Simple string!`, OK},
|
||||||
|
{"single quote", `'`, `'`, OK},
|
||||||
|
{"double quote", `"`, `"`, OK},
|
||||||
|
{"escaped single quote", `\'`, `\'`, OK},
|
||||||
|
{"escaped double quote", `\"`, `\"`, OK},
|
||||||
|
{"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
|
||||||
|
{"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
|
||||||
|
{"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
|
||||||
|
} {
|
||||||
|
s := b.Reset().WriteString(c.in).AsLiteralString()
|
||||||
|
if s != c.out {
|
||||||
|
t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAsInterpretedString(t *testing.T) {
|
||||||
|
b := parser.StringBuffer{}
|
||||||
|
for _, c := range []stringbufT{
|
||||||
|
{"empty string", "", "", OK},
|
||||||
|
{"one character", "Simple string!", "Simple string!", OK},
|
||||||
|
{"escaped single quote", `\'`, "", FAIL},
|
||||||
|
{"escaped double quote", `\"`, `"`, OK},
|
||||||
|
{"bare single quote", `'`, "'", OK},
|
||||||
|
{"string in single quotes", `'Hello'`, `'Hello'`, OK},
|
||||||
|
{"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
|
||||||
|
{"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
|
||||||
|
{"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
|
||||||
|
{"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
|
||||||
|
{"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
|
||||||
|
{"example from spec",
|
||||||
|
`I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
|
||||||
|
"I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
|
||||||
|
} {
|
||||||
|
s, err := b.Reset().WriteString(c.in).AsInterpretedString()
|
||||||
|
if c.isSuccessCase && err != nil {
|
||||||
|
t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err)
|
||||||
|
}
|
||||||
|
if !c.isSuccessCase && err == nil {
|
||||||
|
t.Fatalf("[%s] expected a failure, but no failure occurred", c.name)
|
||||||
|
}
|
||||||
|
if s != c.out && c.isSuccessCase {
|
||||||
|
t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type stringbufT struct {
|
||||||
|
name string
|
||||||
|
in string
|
||||||
|
out string
|
||||||
|
isSuccessCase bool
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
OK bool = true
|
||||||
|
FAIL bool = false
|
||||||
|
)
|
|
@ -0,0 +1,51 @@
|
||||||
|
package parser
|
||||||
|
|
||||||
|
// Parser holds the internal state of the Parser.
|
||||||
|
type Parser struct {
|
||||||
|
state StateFn // a function that handles the current state
|
||||||
|
stack []StateFn // state function stack, for nested parsing
|
||||||
|
input string // the scanned input
|
||||||
|
len int // the total length of the input in bytes
|
||||||
|
pos int // current byte scanning position in the input
|
||||||
|
newline bool // keep track of when we have scanned a newline
|
||||||
|
cursorRow int // current row number in the input
|
||||||
|
cursorColumn int // current column position in the input
|
||||||
|
buffer StringBuffer // an efficient buffer, used to build string values
|
||||||
|
items chan Item // channel of resulting Parser items
|
||||||
|
item Item // the current item as reached by Next() and retrieved by Get()
|
||||||
|
err *Error // an error when lexing failed, retrieved by Error()
|
||||||
|
}
|
||||||
|
|
||||||
|
// StateFn represents the state of the parser as a function
|
||||||
|
// that returns the next state.
|
||||||
|
type StateFn func(*Parser) StateFn
|
||||||
|
|
||||||
|
// ItemType represents the type of a parser Item.
|
||||||
|
type ItemType int
|
||||||
|
|
||||||
|
// ItemEOF is a built-in parser item type that is used for flagging that the
|
||||||
|
// end of the input was reached.
|
||||||
|
const ItemEOF ItemType = -1
|
||||||
|
|
||||||
|
// ItemError is a built-in parser item type that is used for flagging that
|
||||||
|
// an error has occurred during parsing.
|
||||||
|
const ItemError ItemType = -2
|
||||||
|
|
||||||
|
// Item represents an item returned from the parser.
|
||||||
|
type Item struct {
|
||||||
|
Type ItemType
|
||||||
|
Value string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error is used as the error type when parsing errors occur.
|
||||||
|
// The error includes some extra meta information to allow for useful
|
||||||
|
// error messages to the user.
|
||||||
|
type Error struct {
|
||||||
|
Message string
|
||||||
|
Row int
|
||||||
|
Column int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (err *Error) Error() string {
|
||||||
|
return err.Message
|
||||||
|
}
|
|
@ -0,0 +1,39 @@
|
||||||
|
package parser
|
||||||
|
|
||||||
|
// Next retrieves the next parsed item.
|
||||||
|
// When a valid item was found, then the boolean return parameter will be true.
|
||||||
|
// On error or when successfully reaching the end of the input, false is returned.
|
||||||
|
// When an error occurred, it will be set in the error return value, nil otherwise.
|
||||||
|
func (l *Parser) Next() (Item, *Error, bool) {
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case i := <-l.items:
|
||||||
|
switch {
|
||||||
|
case i.Type == ItemEOF:
|
||||||
|
return i, nil, false
|
||||||
|
case i.Type == ItemError:
|
||||||
|
l.err = &Error{i.Value, l.cursorRow, l.cursorColumn}
|
||||||
|
return i, l.err, false
|
||||||
|
default:
|
||||||
|
l.item = i
|
||||||
|
return i, nil, true
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
l.state = l.state(l)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToArray returns Parser items as an array (mainly intended for testing purposes)
|
||||||
|
// When an error occurs during scanning, a partial result will be
|
||||||
|
// returned, accompanied by the error that occurred.
|
||||||
|
func (l *Parser) ToArray() ([]Item, *Error) {
|
||||||
|
var items []Item
|
||||||
|
for {
|
||||||
|
item, err, more := l.Next()
|
||||||
|
if !more {
|
||||||
|
return items, err
|
||||||
|
}
|
||||||
|
items = append(items, item)
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue