diff --git a/lexer/items.go b/lexer/items.go index e64515f..026aadc 100644 --- a/lexer/items.go +++ b/lexer/items.go @@ -1,48 +1,35 @@ package lexer -import "fmt" +import ( + "fmt" -// itemType represents the type of lexer items. -type itemType int + "github.com/mmakaay/toml/parser" +) // Definition of all the lexer item types for the TOML lexer. const ( - ItemError itemType = iota // An error occurred - ItemEOF // End of input reached - ItemComment // Comment string, starts with # till en of line - ItemKey // Key of a key/value pair - ItemKeyDot // Dot for a dotted key - ItemAssignment // Value assignment coming up (=) - ItemString // A value of type string + ItemComment parser.ItemType = iota // Comment string, starts with # till end of line + ItemKey // Key of a key/value pair + ItemKeyDot // Dot for a dotted key + ItemAssignment // Value assignment coming up (=) + ItemString // A value of type string ) -// Item represents a lexer item returned from the scanner. -type Item struct { - Type itemType //Type, e.g. ItemComment, ItemString - Value string // Value, e.g. "10.42", "[" -} - -// String returns a string representation of the lexer item. -func (i Item) String() string { +// ParserItemToString returns a string representation of the +// parser.Item. This is used for unit testing purposes. +func ParserItemToString(i parser.Item) string { switch i.Type { + case ItemComment: + return fmt.Sprintf("#(%s)", i.Value) case ItemKey: return fmt.Sprintf("[%s]", i.Value) + case ItemString: + return fmt.Sprintf("STR(%s)", i.Value) case ItemKeyDot: return "." case ItemAssignment: return "=" - } - return fmt.Sprintf("%s(%s)", i.Type, i.Value) -} - -// String returns a string representation of the lexer item type. 
-func (i itemType) String() string { - switch i { - case ItemComment: - return "#" - case ItemString: - return "STR" default: - panic(fmt.Sprintf("No translation available for type id %d", i)) + panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type)) } } diff --git a/lexer/lexer.go b/lexer/lexer.no similarity index 66% rename from lexer/lexer.go rename to lexer/lexer.no index 1ca4b06..7a2d92a 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.no @@ -4,22 +4,24 @@ import ( "fmt" "strings" "unicode/utf8" + + "github.com/mmakaay/toml/parser" ) // Lexer holds the state of the lexer. type Lexer struct { - input string // the scanned input string - state stateFn // a function that handles the current state - stack []stateFn // state function stack, for nested parsing - pos int // current byte scanning position in the input - newline bool // keep track of when we have scanned a newline - cursorRow int // current row number in the input - cursorColumn int // current column position in the input - width int // width of the last rune read, for supporting backup() - buffer StringBuffer // an efficient buffer, used to build string values - items chan Item // channel of resulting lexer items - item Item // the current item as reached by Next() and retrieved by Get() - err *Error // an error when lexing failed, retrieved by Error() + input string // the scanned input + state parser.StateFn // a function that handles the current state + stack []parser.StateFn // state function stack, for nested parsing + len int // the total length of the input in bytes + pos int // current byte scanning position in the input + newline bool // keep track of when we have scanned a newline + cursorRow int // current row number in the input + cursorColumn int // current column position in the input + buffer StringBuffer // an efficient buffer, used to build string values + items chan parser.Item // channel of resulting lexer items + item parser.Item // the current item as 
reached by Next() and retrieved by Get() + err *Error // an error when lexing failed, retrieved by Error() } // Error is used as the error type when lexing errors occur. @@ -35,46 +37,45 @@ func (err *Error) Error() string { return err.Message } -// Lex takes an input string and initializes the TOML lexer for it. -func Lex(input string) *Lexer { +// New takes an input string and initializes the lexer for it. +func New(input string) *Lexer { return &Lexer{ input: input, + len: len(input), state: stateKeyValuePair, - items: make(chan Item, 2), + items: make(chan parser.Item, 2), } } // Next advances to the next lexer item in the input string. -// When a valid item was found, then the boolean return parameter is returned. +// When a valid item was found, then the boolean return parameter will be true. // On error or when reaching the end of the input, false is returned. -// When an error occurred, it will be set in the error return value. -func (l *Lexer) Next() (Item, *Error, bool) { - if l.state == nil { - panic("This should not happen: nil state reached, but entering Next()") - } +// When an error occurred, it will be set in the error return value, nil otherwise. +func (l *Lexer) Next() (parser.Item, *Error, bool) { for { select { case i := <-l.items: - if i.Type == ItemEOF { + switch { + case i.Type == ItemEOF: return i, nil, false - } - if i.Type == ItemError { + case i.Type == ItemError: l.err = &Error{i.Value, l.cursorRow, l.cursorColumn} return i, l.err, false + default: + l.item = i + return i, nil, true } - l.item = i - return i, nil, true default: l.state = l.state(l) } } } -// ToArray returns lexer items as an array. +// ToArray returns lexer items as an array (mainly intended for testing purposes) // When an error occurs during scanning, a partial result will be // returned, accompanied by the error that occurred. 
-func (l *Lexer) ToArray() ([]Item, *Error) { - var items []Item +func (l *Lexer) ToArray() ([]parser.Item, *Error) { + var items []parser.Item for { item, err, more := l.Next() if !more { @@ -100,25 +101,25 @@ func (l *Lexer) popState() stateFn { // atEndOfFile returns true when there is no more data available in the input. func (l *Lexer) atEndOfFile() bool { - return l.pos >= len(l.input) + return l.pos >= l.len } // emit passes a lexer item back to the client, including the provided string. -func (l *Lexer) emit(t itemType, s string) { - l.items <- Item{t, s} +func (l *Lexer) emit(t parser.ItemType, s string) { + l.items <- parser.Item{Type: t, Value: s} l.buffer.Reset() } // emitLiteral passes a lexer item back to the client, including the accumulated // string buffer data as a literal string. -func (l *Lexer) emitLiteral(t itemType) { +func (l *Lexer) emitLiteral(t parser.ItemType) { l.emit(t, l.buffer.AsLiteralString()) } // emitTrimmedLiteral passes a lexer item back to the client, including the // accumulated string buffer data as a literal string with whitespace // trimmed from it. -func (l *Lexer) emitTrimmedLiteral(t itemType) { +func (l *Lexer) emitTrimmedLiteral(t parser.ItemType) { l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString())) } @@ -127,7 +128,7 @@ func (l *Lexer) emitTrimmedLiteral(t itemType) { // codes like \n, \t, \uXXXX, etc.) // This method might return an error, in case there is data in the // string buffer that is not valid for string interpretation. -func (l *Lexer) emitInterpreted(t itemType) error { +func (l *Lexer) emitInterpreted(t parser.ItemType) error { s, err := l.buffer.AsInterpretedString() if err != nil { return err @@ -137,15 +138,10 @@ func (l *Lexer) emitInterpreted(t itemType) error { } // emitError emits a lexer error item back to the client. -func (l *Lexer) emitError(message string) { +func (l *Lexer) emitError(format string, args ...interface{}) stateFn { + message := fmt.Sprintf(format, args...) 
l.emit(ItemError, message) -} - -// backup steps back one rune -// Can be called only once per call of next. -func (l *Lexer) backup() { - l.pos -= l.width - l.cursorColumn-- + return nil } // peek returns but does not advance to the next rune(s) in the input. @@ -176,17 +172,40 @@ func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) { return peeked, width, true } -// acceptNext adds the specified amount of runes from the input to the string buffer. -// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned. -func (l *Lexer) acceptNext(count int) bool { - for i := 0; i < count; i++ { - if r, ok := l.next(); ok { - l.buffer.WriteRune(r) - } else { - return false - } +// acceptAny adds the next rune from the input to the string buffer. +// If no rune could be read (end of file or invalid UTF8 data), then +// false is returned. +func (l *Lexer) acceptAny() bool { + if r, ok := l.next(); ok { + l.buffer.WriteRune(r) + return true } - return true + return false +} + +// accept adds the next rune to the string buffer and returns true if it's +// from the valid set of runes. Otherwise false is returned. +func (l *Lexer) accept(matches ...string) bool { + return l.acceptPattern(matches...) +} + +// AcceptMatching adds the next runes to the string buffer, but only +// if the upcoming runes satisfy the provided pattern. +// When runes were added then true is returned, false otherwise. +func (l *Lexer) acceptPattern(pattern ...string) bool { + return l.progress(func(r rune) { l.buffer.WriteRune(r) }, pattern...) 
+} + +func (l *Lexer) progress(callback func(rune), matches ...string) bool { + if runes, w, ok := l.match(matches...); ok { + l.pos += w + for _, r := range runes { + callback(r) + l.advanceCursor(r) + } + return true + } + return false } // acceptConsecutive adds consecutive runes from the input to the string @@ -200,27 +219,9 @@ func (l *Lexer) acceptConsecutive(match string) bool { return accepted } -// next returns the next rune from the input and a boolean indicating if -// reading the input was successful. -// When the end of input is reached, or an invalid UTF8 character is -// read, then false is returned. -func (l *Lexer) next() (rune, bool) { - r, w, ok := l.peek() - if ok { - l.width = w - l.pos += w - l.advanceCursor(r) - return r, true - } - l.width = 0 - if r == utf8.RuneError && w == 0 { - l.emitError("unexpected end of file") - } else { - l.emitError("invalid UTF8 character") - } - return r, false -} - +// advanceCursor advances the rune cursor one position in the +// input data. While doing so, it keeps tracks of newlines, +// so we can report on row + column positions on error. func (l *Lexer) advanceCursor(r rune) { if l.newline { l.cursorColumn = 0 @@ -233,40 +234,20 @@ func (l *Lexer) advanceCursor(r rune) { // skip skips runes, but only when all provided matches are satisfied. // Returns true when one or more runes were skipped. -func (l *Lexer) skipMatching(matches ...string) bool { - if runes, w, ok := l.match(matches...); ok { - l.pos += w - for _, r := range runes { - l.advanceCursor(r) - } - return true - } - return false +func (l *Lexer) skipMatching(pattern ...string) bool { + return l.progress(func(r rune) {}, pattern...) } // skipConsecutive skips consecutive runes from the provided match. // Returns true when one or more runes were skipped. 
-func (l *Lexer) skipConsecutive(match string) bool { +func (l *Lexer) skipConsecutive(pattern string) bool { didSkip := false - for l.skipMatching(match) { + for l.skipMatching(pattern) { didSkip = true } return didSkip } -// accept adds the next rune to the string buffer and returns true if it's -// from the valid set of runes. Otherwise false is returned. -func (l *Lexer) accept(match string) bool { - if r, ok := l.next(); ok { - if strings.IndexRune(match, r) >= 0 { - l.buffer.WriteRune(r) - return true - } - } - l.backup() - return false -} - // upcoming checks if the upcoming runes satisfy the provided rune matches. // This is a lot like the match method, with the difference that // this one only returns the boolean value. @@ -275,6 +256,25 @@ func (l *Lexer) upcoming(matches ...string) bool { return ok } +// next returns the next rune from the input and a boolean indicating if +// reading the input was successful. +// When the end of input is reached, or an invalid UTF8 character is +// read, then false is returned. +func (l *Lexer) next() (rune, bool) { + r, w, ok := l.peek() + if ok { + l.pos += w + l.advanceCursor(r) + return r, true + } + if r == utf8.RuneError && w == 0 { + l.emitError("unexpected end of file") + } else { + l.emitError("invalid UTF8 character") + } + return r, false +} + // match checks if the upcoming runes satisfy the provided rune matches. // It returns a slice of runes that were found, their total byte width // and a boolean indicating whether or not all provided matches matched @@ -292,24 +292,14 @@ func (l *Lexer) match(matches ...string) ([]rune, int, bool) { return peeked, width, false } -// error returns an error token and terminates the scan -// by returning nil to l.run. 
-func (l *Lexer) errorf(format string, args ...interface{}) stateFn { - l.items <- Item{ - ItemError, - fmt.Sprintf(format, args...), - } - return nil -} - func (l *Lexer) unexpectedInputError(expected string) stateFn { - // next() takes care of error messages for ok == false. + // next() takes care of emitting errors for ok == false. if r, ok := l.next(); ok { - l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) + return l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) } return nil } func (l *Lexer) unexpectedEndOfFile(expected string) stateFn { - return l.errorf("Unexpected end of file (expected %s)", expected) + return l.emitError("Unexpected end of file (expected %s)", expected) } diff --git a/lexer/states.go b/lexer/states.go index eaee205..4378982 100644 --- a/lexer/states.go +++ b/lexer/states.go @@ -1,8 +1,6 @@ package lexer -// stateFn represents the state of the lexer as a function -// that returns the next state. -type stateFn func(*Lexer) stateFn +import "github.com/mmakaay/toml/parser" const ( whitespace string = " \t" @@ -28,59 +26,65 @@ const ( longUtf8Escape string = "U" ) -func stateKeyValuePair(l *Lexer) stateFn { - l.skipConsecutive(whitespace + carriageReturn + newline) - if l.skipMatching(hash) { +// NewParser creates a new parser, using the provided input string +// as the data to parse. +func NewParser(input string) *parser.Parser { + return parser.New(input, stateKeyValuePair) +} + +func stateKeyValuePair(l *parser.Parser) parser.StateFn { + l.SkipConsecutive(whitespace + carriageReturn + newline) + if l.SkipMatching(hash) { return stateComment } - if l.upcoming(startOfKey) { + if l.Upcoming(startOfKey) { return stateKey } return stateEndOfFile } // A '#' hash symbol marks the rest of the line as a comment. 
-func stateComment(l *Lexer) stateFn { +func stateComment(l *parser.Parser) parser.StateFn { for { switch { - case l.atEndOfFile() || l.skipMatching(newline): - l.emitTrimmedLiteral(ItemComment) + case l.AtEndOfFile() || l.SkipMatching(newline): + l.EmitLiteralTrim(ItemComment) return stateKeyValuePair default: - if !l.acceptNext(1) { - return l.unexpectedInputError("comment") + if !l.AcceptAny() { + return nil } } } } // A key may be either bare, quoted or dotted. -func stateKey(l *Lexer) stateFn { - if l.accept(bareKeyChars) { +func stateKey(l *parser.Parser) parser.StateFn { + if l.AcceptMatching(bareKeyChars) { return statebareKeyChars } - return l.unexpectedInputError("a valid key name") + return l.UnexpectedInputError("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. -func statebareKeyChars(l *Lexer) stateFn { - l.acceptConsecutive(bareKeyChars) - l.emitLiteral(ItemKey) +func statebareKeyChars(l *parser.Parser) parser.StateFn { + l.AcceptConsecutive(bareKeyChars) + l.EmitLiteral(ItemKey) return stateEndOfKeyOrKeyDot } // Dotted keys are a sequence of bare or quoted keys joined with a dot. // This allows for grouping similar properties together: -func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { +func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. - l.skipConsecutive(whitespace) - if l.skipMatching(dot) { - l.emit(ItemKeyDot, "") - l.skipConsecutive(whitespace) + l.SkipConsecutive(whitespace) + if l.SkipMatching(dot) { + l.Emit(ItemKeyDot, "") + l.SkipConsecutive(whitespace) return stateKey } return stateKeyAssignment @@ -90,62 +94,69 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { // Whitespace is ignored around key names and values. 
The key, equals // sign, and value must be on the same line (though some values can // be broken over multiple lines). -func stateKeyAssignment(l *Lexer) stateFn { - l.skipConsecutive(whitespace) - if l.skipMatching(equal) { - l.emit(ItemAssignment, "") - l.skipConsecutive(whitespace) +func stateKeyAssignment(l *parser.Parser) parser.StateFn { + l.SkipConsecutive(whitespace) + if l.SkipMatching(equal) { + l.Emit(ItemAssignment, "") + l.SkipConsecutive(whitespace) return stateValue } - return l.unexpectedInputError("a value assignment") + return l.UnexpectedInputError("a value assignment") } // Values must be of the following types: String, Integer, Float, Boolean, // Datetime, Array, or Inline Table. Unspecified values are invalid. -func stateValue(l *Lexer) stateFn { - l.skipConsecutive(whitespace) - if l.upcoming(quoteChars) { +func stateValue(l *parser.Parser) parser.StateFn { + l.SkipConsecutive(whitespace) + if l.Upcoming(quoteChars) { return stateStringValue } - return l.unexpectedInputError("a value") + return l.UnexpectedInputError("a value") } // There are four ways to express strings: basic, multi-line basic, literal, // and multi-line literal. All strings must contain only valid UTF-8 characters. -func stateStringValue(l *Lexer) stateFn { +func stateStringValue(l *parser.Parser) parser.StateFn { switch { - case l.skipMatching(doubleQuote, doubleQuote, doubleQuote): + case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote): // Multi-line basic strings are surrounded by three quotation marks on each side. return stateMultiLineBasicString - case l.skipMatching(doubleQuote): + case l.SkipMatching(doubleQuote): // Basic strings are surrounded by quotation marks. 
- return stateBasicStringValue + return stateSingleLineBasicString } - return l.unexpectedInputError("a string value") + return l.UnexpectedInputError("a string value") } -func stateBasicStringValue(l *Lexer) stateFn { - if l.upcoming(doubleQuote, doubleQuote) { +func stateSingleLineBasicString(l *parser.Parser) parser.StateFn { + if l.Upcoming(doubleQuote, doubleQuote) { return stateMultiLineBasicString } return stateBasicString } -const invalidBasicStringCharacters string = "" + +func stateMultiLineBasicString(l *parser.Parser) parser.StateFn { + l.EmitError("Not yet implemented") + return nil +} + +// Any Unicode character may be used except those that must be escaped: +// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). +const invalidBasicStringCharacters string = "\"\\" + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + "\u007F" -func stateParseBasicString(l *Lexer) stateFn { +func stateParseBasicString(l *parser.Parser) parser.StateFn { for { switch { - case l.atEndOfFile(): - return l.unexpectedEndOfFile("basic string token") - case l.skipMatching(doubleQuote): - return l.popState() - case l.upcoming(backslash, escapeChars): + case l.AtEndOfFile(): + return l.UnexpectedEndOfFile("basic string token") + case l.SkipMatching(doubleQuote): + return l.PopState() + case l.AcceptMatching(backslash, escapeChars): // For convenience, some popular characters have a compact escape sequence. 
// \b - backspace (U+0008) // \t - tab (U+0009) @@ -154,50 +165,45 @@ func stateParseBasicString(l *Lexer) stateFn { // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) - l.acceptNext(2) - case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex): + case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex): // \uXXXX - unicode (U+XXXX) - l.acceptNext(6) - case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): + case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): // \UXXXXXXXX - unicode (U+XXXXXXXX) - l.acceptNext(10) - case l.upcoming(backslash): + case l.Upcoming(backslash): // All other escape sequences not listed above are reserved and, // if used, TOML should produce an error. - return l.errorf("Invalid escape sequence in basic string") - case l.upcoming(invalidBasicStringCharacters): + return l.EmitError("Invalid escape sequence in basic string") + case l.Upcoming(invalidBasicStringCharacters): // Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). 
- r, _ := l.next() - return l.errorf("Invalid character in basic string: %q", r) + r, _, _ := l.Match(invalidBasicStringCharacters) + l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) + return nil default: - if !l.acceptNext(1) { - return l.unexpectedInputError("string value") + if !l.AcceptAny() { + return nil } } } } -func stateBasicString(l *Lexer) stateFn { - l.pushState(func(l *Lexer) stateFn { - err := l.emitInterpreted(ItemString) +func stateBasicString(l *parser.Parser) parser.StateFn { + l.PushState(func(l *parser.Parser) parser.StateFn { + err := l.EmitInterpreted(ItemString) if err != nil { - return l.errorf("Invalid data in string: %s", err) + l.EmitError("Invalid data in string: %s", err) + return nil } return stateKeyValuePair }) return stateParseBasicString } -func stateMultiLineBasicString(l *Lexer) stateFn { - return l.errorf("Not yet implemented") -} - -func stateEndOfFile(l *Lexer) stateFn { - if l.atEndOfFile() { - l.emit(ItemEOF, "EOF") +func stateEndOfFile(l *parser.Parser) parser.StateFn { + if l.AtEndOfFile() { + l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser? 
} else { - l.unexpectedInputError("end of file") + l.UnexpectedInputError("end of file") } return nil } diff --git a/lexer/states_test.go b/lexer/states_test.go index b72b9a8..680b6bd 100644 --- a/lexer/states_test.go +++ b/lexer/states_test.go @@ -9,7 +9,7 @@ import ( ) func TestErrorsIncludeLineAndRowPosition(t *testing.T) { - _, err := lexer.Lex("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() + _, err := lexer.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() t.Logf("Got error: %s", err.Error()) if err.Row != 4 { t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4) @@ -19,21 +19,20 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) { } } +func TestEmptyInput(t *testing.T) { + runStatesT(t, statesT{"empty string", "", "", ""}) +} + func TestInvalidUtf8Data(t *testing.T) { runStatesTs(t, []statesT{ {"inside comment", "# \xbc", "", "invalid UTF8 character"}, {"bare key 1", "\xbc", "", "invalid UTF8 character"}, - {"bare key 2", "key\xbc", "", "invalid UTF8 character"}, + {"bare key 2", "key\xbc", "[key]", "invalid UTF8 character"}, {"assignment", "key \xbc", "[key]", "invalid UTF8 character"}, {"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"}, {"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"}, }) } - -func TestEmptyInput(t *testing.T) { - runStatesT(t, statesT{"empty string", "", "", ""}) -} - func TestWhiteSpaceAndNewlines(t *testing.T) { runStatesTs(t, []statesT{ {"space", " ", "", ""}, @@ -61,13 +60,13 @@ func TestKeyWithoutAssignment(t *testing.T) { err := "unexpected end of file" runStatesTs(t, []statesT{ {"bare with whitespace", " a ", "[a]", err}, - {"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err}, - {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", err}, - {"bare numbers", "0123456789", "", err}, - {"bare underscore", "_", "", err}, - {"bare dash", "-", "", err}, - {"bare big mix", "-hey_good_Lookin123-", "", err}, - {"bare dotted", "a._.c", "[a].[_].", err}, + {"bare 
lower", "abcdefghijklmnopqrstuvwxyz", "[abcdefghijklmnopqrstuvwxyz]", err}, + {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err}, + {"bare numbers", "0123456789", "[0123456789]", err}, + {"bare underscore", "_", "[_]", err}, + {"bare dash", "-", "[-]", err}, + {"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err}, + {"bare dotted", "a._.c", "[a].[_].[c]", err}, {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err}, }) } @@ -90,9 +89,9 @@ func TestUnterminatedBasicString(t *testing.T) { func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { runStatesTs(t, []statesT{ - {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`}, - {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`}, - {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`}, + {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`}, + {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`}, + {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`}, }) // No need to write all test cases for disallowed characters by hand. 
@@ -100,7 +99,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { name := fmt.Sprintf("control character %x", rune(i)) runStatesT( t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=", - fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))}) + fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))}) } } @@ -163,7 +162,7 @@ func runStatesTs(t *testing.T, tests []statesT) { } func runStatesT(t *testing.T, c statesT) { - l, err := lexer.Lex(c.in).ToArray() + l, err := lexer.NewParser(c.in).ToArray() if err == nil && c.err != "" { t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err) } @@ -179,14 +178,15 @@ func runStatesT(t *testing.T, c statesT) { t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l)) } for i, e := range expected { - if l[i].String() != e { - t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i]) + v := lexer.ParserItemToString(l[i]) + if v != e { + t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v) } } case string: a := make([]string, len(l)) for _, v := range l { - a = append(a, v.String()) + a = append(a, lexer.ParserItemToString(v)) } actual := strings.Join(a, "") if actual != expected { diff --git a/lexer/stringbuf.go b/lexer/stringbuf.no similarity index 100% rename from lexer/stringbuf.go rename to lexer/stringbuf.no diff --git a/lexer/stringbuf_test.go b/lexer/stringbuf_test.no similarity index 100% rename from lexer/stringbuf_test.go rename to lexer/stringbuf_test.no diff --git a/parser/parser.go b/parser/parser.go new file mode 100644 index 0000000..160ae0b --- /dev/null +++ b/parser/parser.go @@ -0,0 +1,261 @@ +package parser + +import ( + "fmt" + "strings" + "unicode/utf8" +) + +// New takes an input string and a start state, +// and initializes the parser for it. 
+func New(input string, startState StateFn) *Parser { + return &Parser{ + input: input, + len: len(input), + state: startState, + items: make(chan Item, 2), + } +} + +// PushState adds the state function to the state stack. +// This is used for implementing nested parsing. +func (l *Parser) PushState(state StateFn) { + l.stack = append(l.stack, state) +} + +// PopState pops the last pushed state from the state stack. +func (l *Parser) PopState() StateFn { + last := len(l.stack) - 1 + head, tail := l.stack[:last], l.stack[last] + l.stack = head + return tail +} + +// AtEndOfFile returns true when there is no more data available in the input. +func (l *Parser) AtEndOfFile() bool { + return l.pos >= l.len +} + +// Emit passes a Parser item to the client, including the provided string. +func (l *Parser) Emit(t ItemType, s string) { + l.items <- Item{t, s} + l.buffer.Reset() +} + +// EmitLiteral passes a Parser item to the client, including the accumulated +// string buffer data as a literal string. +func (l *Parser) EmitLiteral(t ItemType) { + l.Emit(t, l.buffer.AsLiteralString()) +} + +// EmitLiteralTrim passes a Parser item to the client, including the +// accumulated string buffer data as a literal string with whitespace +// trimmed from it. +func (l *Parser) EmitLiteralTrim(t ItemType) { + l.Emit(t, strings.TrimSpace(l.buffer.AsLiteralString())) +} + +// EmitInterpreted passes a Parser item to the client, including the +// accumulated string buffer data as a Go double-quoted interpreted string +// (handling escape codes like \n, \t, \uXXXX, etc.) +// This method might return an error, in case there is data in the +// string buffer that is not valid for string interpretation. +func (l *Parser) EmitInterpreted(t ItemType) error { + s, err := l.buffer.AsInterpretedString() + if err != nil { + return err + } + l.Emit(t, s) + return nil +} + +// EmitError emits a Parser error item to the client. 
+func (l *Parser) EmitError(format string, args ...interface{}) StateFn { + message := fmt.Sprintf(format, args...) + l.Emit(ItemError, message) + return nil +} + +// Match checks if the upcoming runes satisfy all provided patterns. +// It returns a slice of runes that were found, their total byte width +// and a boolean indicating whether or not all provided patterns were +// satisfied by the input data. +func (l *Parser) Match(patterns ...string) ([]rune, int, bool) { + peeked, width, ok := l.peekMulti(len(patterns)) + if ok { + for i, r := range patterns { + if strings.IndexRune(r, peeked[i]) < 0 { + return peeked, width, false + } + } + return peeked, width, true + } + return peeked, width, false +} + +// Upcoming checks if the upcoming runes satisfy all provided patterns. +// Returns true if all provided patterns are satisfied. +func (l *Parser) Upcoming(patterns ...string) bool { + _, _, ok := l.Match(patterns...) + return ok +} + +// AcceptAny adds the next rune from the input to the string buffer. +// If no rune could be read (end of file or invalid UTF8 data), +// then false is returned. +func (l *Parser) AcceptAny() bool { + if r, ok := l.next(); ok { + l.buffer.WriteRune(r) + return true + } + return false +} + +// AcceptMatching adds the next runes to the string buffer, but only +// if the upcoming runes satisfy the provided patterns. +// When runes were added then true is returned, false otherwise. +func (l *Parser) AcceptMatching(patterns ...string) bool { + return l.progress(func(r rune) { l.buffer.WriteRune(r) }, patterns...) +} + +// AcceptConsecutive adds consecutive runes from the input to the string +// buffer, as long as they exist in the pattern. +// If any runes were added then true is returned, false otherwise. 
+func (l *Parser) AcceptConsecutive(pattern string) bool { + accepted := false + for l.AcceptMatching(pattern) { + accepted = true + } + return accepted +} + +// SkipMatching skips runes, but only when all provided patterns are satisfied. +// Returns true when one or more runes were skipped. +func (l *Parser) SkipMatching(patterns ...string) bool { + if runes, w, ok := l.Match(patterns...); ok { + l.pos += w + for _, r := range runes { + l.advanceCursor(r) + } + return true + } + return false +} + +// SkipConsecutive skips consecutive runes from the provided pattern. +// Returns true when one or more runes were skipped. +func (l *Parser) SkipConsecutive(pattern string) bool { + didSkip := false + for l.SkipMatching(pattern) { + didSkip = true + } + return didSkip +} + +// ============================================================================ +// EMIT DATA AND ERRORS +// ============================================================================ + +// UnexpectedInputError is used by a parser implementation to emit an +// error item that tells the client that an unexpected rune was +// encountered in the input. +// The parameter 'expected' is used to provide some context to the error. +func (l *Parser) UnexpectedInputError(expected string) StateFn { + // next() takes care of error messages for ok == false. + if r, ok := l.next(); ok { + return l.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) + } + return nil +} + +// UnexpectedEndOfFile is used by a parser implementation to emit an +// error item that tells the client that more data was expected from +// the input. +// The parameter 'expected' is used to provide some context to the error. 
+func (l *Parser) UnexpectedEndOfFile(expected string) StateFn { + return l.EmitError("Unexpected end of file (expected %s)", expected) +} + +// ============================================================================ +// LEXER : our lexer is quite low level, it only returns UTF8 runes +// ============================================================================ + +// peek returns but does not advance to the next rune(s) in the input. +// Returns the rune, its width and a boolean. The boolean will be false in case +// no upcoming rune can be peeked (end of data or invalid UTF8 character). +func (l *Parser) peek() (rune, int, bool) { + peeked, width := utf8.DecodeRuneInString(l.input[l.pos:]) + return peeked, width, peeked != utf8.RuneError +} + +// peekMulti takes a peek at multiple upcoming runes in the input. +// Returns a slice of runes, their total width in bytes and a boolean. +// The boolean will be false in case less runes can be peeked than +// the requested amount (end of data or invalid UTF8 character). +func (l *Parser) peekMulti(amount int) ([]rune, int, bool) { + width := 0 + var peeked []rune + for i := 0; i < amount; i++ { + r, w := utf8.DecodeRuneInString(l.input[l.pos+width:]) + switch { + case r == utf8.RuneError: + return peeked, width, false + default: + width += w + peeked = append(peeked, r) + } + } + return peeked, width, true +} + +// progress moves the cursor forward in the input, returning one rune +// for every specified pattern. The cursor is only moved forward when +// all patterns are satisfied. +// Returns true when all patterns were satisfied and the cursor was +// moved forward, false otherwise. +// A callback function can be provided to specify what to do with +// the runes that are encountered in the input. 
+func (l *Parser) progress(callback func(rune), patterns ...string) bool { + if runes, w, ok := l.Match(patterns...); ok { + l.pos += w + for _, r := range runes { + callback(r) + l.advanceCursor(r) + } + return true + } + return false +} + +// next returns the next rune from the input and a boolean indicating if +// reading the input was successful. +// When the end of input is reached, or an invalid UTF8 character is +// read, then false is returned. Both are considered error cases, +// and for that reason these automatically emit an error to the client. +func (l *Parser) next() (rune, bool) { + r, w, ok := l.peek() + if ok { + l.pos += w + l.advanceCursor(r) + return r, true + } + if r == utf8.RuneError && w == 0 { + l.EmitError("unexpected end of file") + } else { + l.EmitError("invalid UTF8 character") + } + return r, false +} + +// advanceCursor advances the rune cursor one position in the +// input data. While doing so, it keeps tracks of newlines, +// so we can report on row + column positions on error. +func (l *Parser) advanceCursor(r rune) { + if l.newline { + l.cursorColumn = 0 + l.cursorRow++ + } else { + l.cursorColumn++ + } + l.newline = r == '\n' +} diff --git a/parser/stringbuf.go b/parser/stringbuf.go new file mode 100644 index 0000000..1b9b570 --- /dev/null +++ b/parser/stringbuf.go @@ -0,0 +1,62 @@ +package parser + +import ( + "bytes" + "strconv" + "strings" +) + +// StringBuffer is a string buffer implementation, which is used by the parser +// to efficiently accumulate runes from the input and eventually turn these +// into a string, either literal or interpreted. +type StringBuffer struct { + buffer bytes.Buffer +} + +// Reset resets the string buffer, in order to build a new string. +func (b *StringBuffer) Reset() *StringBuffer { + b.buffer.Reset() + return b +} + +// WriteString adds the runes of the input string to the string buffer. 
+func (b *StringBuffer) WriteString(s string) *StringBuffer { + for _, r := range s { + b.WriteRune(r) + } + return b +} + +// WriteRune adds a single rune to the string buffer. +func (b *StringBuffer) WriteRune(r rune) *StringBuffer { + b.buffer.WriteRune(r) + return b +} + +// AsLiteralString returns the string buffer as a literal string. +// Literal means that no escape sequences are processed. +func (b *StringBuffer) AsLiteralString() string { + return b.buffer.String() +} + +// AsInterpretedString returns the string in its interpreted form. +// Interpreted means that escape sequences are handled in the way that Go would +// have, had it been inside double quotes. It translates for example escape +// sequences like "\n", "\t", \uXXXX" and "\UXXXXXXXX" into their string +// representations. +// Since the input might contain invalid escape sequences, this method +// also returns an error. When an error is returned, the returned string will +// contain the string as far as it could be interpreted. 
+func (b *StringBuffer) AsInterpretedString() (string, error) { + var sb strings.Builder + tail := b.buffer.String() + for len(tail) > 0 { + r, _, newtail, err := strconv.UnquoteChar(tail, '"') + if err != nil { + return sb.String(), err + } + tail = newtail + sb.WriteRune(r) + } + return sb.String(), nil +} diff --git a/parser/stringbuf_test.go b/parser/stringbuf_test.go new file mode 100644 index 0000000..fda5e98 --- /dev/null +++ b/parser/stringbuf_test.go @@ -0,0 +1,90 @@ +package parser_test + +import ( + "testing" + + "github.com/mmakaay/toml/parser" +) + +func TestGeneratingStringDoesNotResetBuffer(t *testing.T) { + var b parser.StringBuffer + s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString() + s2 := b.AsLiteralString() + if s1 != "hi\nthere" { + t.Fatalf("Did not get expected string\"X\" for try 1, but %q", s1) + } + if s2 != "hi\\nthere" { + t.Fatalf("Did not get expected string\"X\" for try 2, but %q", s2) + } +} + +func TestResetResetsBuffer(t *testing.T) { + var b parser.StringBuffer + s := b.WriteRune('X').Reset().AsLiteralString() + if s != "" { + t.Fatalf("Did not get expected empty string, but %q", s) + } +} + +func TestAsLiteralString(t *testing.T) { + b := parser.StringBuffer{} + for _, c := range []stringbufT{ + {"empty string", ``, ``, OK}, + {"simple string", `Simple string!`, `Simple string!`, OK}, + {"single quote", `'`, `'`, OK}, + {"double quote", `"`, `"`, OK}, + {"escaped single quote", `\'`, `\'`, OK}, + {"escaped double quote", `\"`, `\"`, OK}, + {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK}, + {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK}, + {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK}, + } { + s := b.Reset().WriteString(c.in).AsLiteralString() + if s != c.out { + t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) + } + } +} + +func TestAsInterpretedString(t *testing.T) { + b := parser.StringBuffer{} + for _, c := range []stringbufT{ + {"empty 
string", "", "", OK}, + {"one character", "Simple string!", "Simple string!", OK}, + {"escaped single quote", `\'`, "", FAIL}, + {"escaped double quote", `\"`, `"`, OK}, + {"bare single quote", `'`, "'", OK}, + {"string in single quotes", `'Hello'`, `'Hello'`, OK}, + {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK}, + {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK}, + {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK}, + {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK}, + {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK}, + {"example from spec", + `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`, + "I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK}, + } { + s, err := b.Reset().WriteString(c.in).AsInterpretedString() + if c.isSuccessCase && err != nil { + t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err) + } + if !c.isSuccessCase && err == nil { + t.Fatalf("[%s] expected a failure, but no failure occurred", c.name) + } + if s != c.out && c.isSuccessCase { + t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) + } + } +} + +type stringbufT struct { + name string + in string + out string + isSuccessCase bool +} + +const ( + OK bool = true + FAIL bool = false +) diff --git a/parser/types.go b/parser/types.go new file mode 100644 index 0000000..4818f95 --- /dev/null +++ b/parser/types.go @@ -0,0 +1,51 @@ +package parser + +// Parser holds the internal state of the Parser. 
+type Parser struct { + state StateFn // a function that handles the current state + stack []StateFn // state function stack, for nested parsing + input string // the scanned input + len int // the total length of the input in bytes + pos int // current byte scanning position in the input + newline bool // keep track of when we have scanned a newline + cursorRow int // current row number in the input + cursorColumn int // current column position in the input + buffer StringBuffer // an efficient buffer, used to build string values + items chan Item // channel of resulting Parser items + item Item // the current item as reached by Next() and retrieved by Get() + err *Error // an error when lexing failed, retrieved by Error() +} + +// StateFn represents the state of the parser as a function +// that returns the next state. +type StateFn func(*Parser) StateFn + +// ItemType represents the type of a parser Item. +type ItemType int + +// ItemEOF is a built-in parser item type that is used for flagging that the +// end of the input was reached. +const ItemEOF ItemType = -1 + +// ItemError is a built-in parser item type that is used for flagging that +// an error has occurred during parsing. +const ItemError ItemType = -2 + +// Item represents an item returned from the parser. +type Item struct { + Type ItemType + Value string +} + +// Error is used as the error type when parsing errors occur. +// The error includes some extra meta information to allow for useful +// error messages to the user. +type Error struct { + Message string + Row int + Column int +} + +func (err *Error) Error() string { + return err.Message +} diff --git a/parser/user_api.go b/parser/user_api.go new file mode 100644 index 0000000..288b53d --- /dev/null +++ b/parser/user_api.go @@ -0,0 +1,39 @@ +package parser + +// Next retrieves the next parsed item. +// When a valid item was found, then the boolean return parameter will be true. 
+// On error or when successfully reaching the end of the input, false is returned. +// When an error occurred, it will be set in the error return value, nil otherwise. +func (l *Parser) Next() (Item, *Error, bool) { + for { + select { + case i := <-l.items: + switch { + case i.Type == ItemEOF: + return i, nil, false + case i.Type == ItemError: + l.err = &Error{i.Value, l.cursorRow, l.cursorColumn} + return i, l.err, false + default: + l.item = i + return i, nil, true + } + default: + l.state = l.state(l) + } + } +} + +// ToArray returns Parser items as an array (mainly intended for testing purposes) +// When an error occurs during scanning, a partial result will be +// returned, accompanied by the error that occurred. +func (l *Parser) ToArray() ([]Item, *Error) { + var items []Item + for { + item, err, more := l.Next() + if !more { + return items, err + } + items = append(items, item) + } +}