From 666cff3af381142633780e4e9eefaee6458b76cb Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Fri, 17 May 2019 22:03:10 +0000 Subject: [PATCH] Ahhhh found a name that clicked for the more general layer of the parser code: parsekit. That is short and tells me what it is. It's not a parser, but something to build parsers with. Now I could also name the actual parsing code as I would like to, namely 'toml/parser'. So it feels like the structure is settling down. --- Makefile | 4 +- lexer/syn_eof.go | 12 - lexer/syn_value.go | 13 - parsekit/emitting.go | 67 +++++ parsekit/internals.go | 88 ++++++ parsekit/matching.go | 120 ++++++++ parser/user_api.go => parsekit/parsekit.go | 29 +- .../statestack.go => parsekit/staterouting.go | 12 +- {parser => parsekit}/stringbuf.go | 2 +- {parser => parsekit}/stringbuf_test.go | 2 +- {parser => parsekit}/types.go | 8 +- {lexer => parser}/definitions.go | 20 +- {lexer => parser}/helpers_test.go | 12 +- {lexer => parser}/lexer_test.go | 6 +- parser/parser.go | 274 ------------------ {lexer => parser}/syn_comments.go | 10 +- {lexer => parser}/syn_comments_test.go | 2 +- parser/syn_eof.go | 12 + {lexer => parser}/syn_key.go | 48 +-- {lexer => parser}/syn_key_test.go | 2 +- {lexer => parser}/syn_strings.go | 44 +-- {lexer => parser}/syn_strings_test.go | 2 +- parser/syn_value.go | 13 + 23 files changed, 408 insertions(+), 394 deletions(-) delete mode 100644 lexer/syn_eof.go delete mode 100644 lexer/syn_value.go create mode 100644 parsekit/emitting.go create mode 100644 parsekit/internals.go create mode 100644 parsekit/matching.go rename parser/user_api.go => parsekit/parsekit.go (59%) rename parser/statestack.go => parsekit/staterouting.go (67%) rename {parser => parsekit}/stringbuf.go (99%) rename {parser => parsekit}/stringbuf_test.go (99%) rename {parser => parsekit}/types.go (93%) rename {lexer => parser}/definitions.go (68%) rename {lexer => parser}/helpers_test.go (90%) rename {lexer => parser}/lexer_test.go (89%) delete mode 100644 parser/parser.go rename {lexer => parser}/syn_comments.go (66%) rename {lexer => parser}/syn_comments_test.go (97%) create mode 100644 parser/syn_eof.go rename {lexer => parser}/syn_key.go (53%) rename {lexer => parser}/syn_key_test.go (98%) rename {lexer => parser}/syn_strings.go (69%) rename {lexer => parser}/syn_strings_test.go (99%) create mode 100644 parser/syn_value.go diff --git a/Makefile b/Makefile index 8d3a8f6..e850288 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,3 @@ test: - cd parser && go test - cd lexer && go test + @cd parsekit && go test + @cd parser && go test diff --git a/lexer/syn_eof.go b/lexer/syn_eof.go deleted file mode 100644 index 06ab965..0000000 --- a/lexer/syn_eof.go +++ /dev/null @@ -1,12 +0,0 @@ -package lexer - -import "github.com/mmakaay/toml/parser" - -func stateEndOfFile(l *parser.Parser) parser.StateFn { - if l.AtEndOfFile() { - l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser? - } else { - l.UnexpectedInputError("end of file") - } - return nil -} diff --git a/lexer/syn_value.go b/lexer/syn_value.go deleted file mode 100644 index 15501a2..0000000 --- a/lexer/syn_value.go +++ /dev/null @@ -1,13 +0,0 @@ -package lexer - -import "github.com/mmakaay/toml/parser" - -// Values must be of the following types: String, Integer, Float, Boolean, -// Datetime, Array, or Inline Table. Unspecified values are invalid. -func stateValue(l *parser.Parser) parser.StateFn { - l.SkipConsecutive(whitespace) - if l.Upcoming(quoteChars) { - return stateStringValue - } - return l.UnexpectedInputError("a value") -} diff --git a/parsekit/emitting.go b/parsekit/emitting.go new file mode 100644 index 0000000..6f25f8a --- /dev/null +++ b/parsekit/emitting.go @@ -0,0 +1,67 @@ +package parsekit + +import ( + "fmt" + "strings" +) + +// Emit passes a Parser item to the client, including the provided string. +func (p *P) Emit(t ItemType, s string) { + p.items <- Item{t, s} + p.buffer.reset() +} + +// EmitLiteral passes a Parser item to the client, including the accumulated +// string buffer data as a literal string. +func (p *P) EmitLiteral(t ItemType) { + p.Emit(t, p.buffer.asLiteralString()) +} + +// EmitLiteralTrim passes a Parser item to the client, including the +// accumulated string buffer data as a literal string with whitespace +// trimmed from it. +func (p *P) EmitLiteralTrim(t ItemType) { + p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) +} + +// EmitInterpreted passes a Parser item to the client, including the +// accumulated string buffer data a Go doubled quoted interpreted string +// (handling escape codes like \n, \t, \uXXXX, etc.) +// This method might return an error, in case there is data in the +// string buffer that is not valid for string interpretation. +func (p *P) EmitInterpreted(t ItemType) error { + s, err := p.buffer.asInterpretedString() + if err != nil { + return err + } + p.Emit(t, s) + return nil +} + +// EmitError emits a Parser error item to the client. +func (p *P) EmitError(format string, args ...interface{}) StateFn { + message := fmt.Sprintf(format, args...) + p.Emit(ItemError, message) + return nil +} + +// UnexpectedInput is used by a parser implementation to emit an +// error item that tells the client that an unexpected rune was +// encountered in the input. +// The parameter 'expected' is used to provide some context to the error. +func (p *P) UnexpectedInput(expected string) StateFn { + // next() takes care of error messages in cases where ok == false. + // Therefore, we only provide an error message for the ok case here. + if r, ok := p.next(); ok { + return p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) + } + return nil +} + +// UnexpectedEndOfFile is used by a parser implementation to emit an +// error item that tells the client that more data was expected from +// the input. +// The parameter 'expected' is used to provide some context to the error. +func (p *P) UnexpectedEndOfFile(expected string) StateFn { + return p.EmitError("Unexpected end of file (expected %s)", expected) +} diff --git a/parsekit/internals.go b/parsekit/internals.go new file mode 100644 index 0000000..d0521c2 --- /dev/null +++ b/parsekit/internals.go @@ -0,0 +1,88 @@ +package parsekit + +import ( + "unicode/utf8" +) + +// next returns the next rune from the input and a boolean indicating if +// reading the input was successful. +// When the end of input is reached, or an invalid UTF8 character is +// read, then false is returned. Both are considered error cases, +// and for that reason these automatically emit an error to the client. +func (p *P) next() (rune, bool) { + r, w, ok := p.peek() + if ok { + p.advanceCursor(r, w) + return r, true + } + if r == utf8.RuneError && w == 0 { + p.EmitError("unexpected end of file") + } else { + p.EmitError("invalid UTF8 character") + } + return r, false +} + +// peek returns but does not advance the cursor to the next rune(s) in the input. +// Returns the rune, its width in bytes and a boolean. +// The boolean will be false in case no upcoming rune can be peeked +// (end of data or invalid UTF8 character). +func (p *P) peek() (rune, int, bool) { + peeked, width := utf8.DecodeRuneInString(p.input[p.pos:]) + return peeked, width, peeked != utf8.RuneError +} + +// peekMulti takes a peek at multiple upcoming runes in the input. +// Returns a slice of runes, a slice containing their respective +// widths in bytes and a boolean. +// The boolean will be false in case less runes can be peeked than +// the requested amount (end of data or invalid UTF8 character). +func (p *P) peekMulti(amount int) ([]rune, []int, bool) { + var runes []rune + var widths []int + offset := 0 + for i := 0; i < amount; i++ { + r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:]) + switch { + case r == utf8.RuneError: + return runes, widths, false + default: + offset += w + runes = append(runes, r) + widths = append(widths, w) + } + } + return runes, widths, true +} + +// progress moves the cursor forward in the input, returning one rune +// for every specified pattern. The cursor will only be moved forward when +// all requested patterns can be satisfied. +// Returns true when all patterns were satisfied and the cursor was +// moved forward, false otherwise. +// A callback function can be provided to specify what to do with +// the runes that are encountered in the input. +func (p *P) progress(callback func(rune), patterns ...string) bool { + if runes, widths, ok := p.Match(patterns...); ok { + for i, r := range runes { + callback(r) + p.advanceCursor(r, widths[i]) + } + return true + } + return false +} + +// advanceCursor advances the rune cursor one position in the +// input data. While doing so, it keeps tracks of newlines, +// so we can report on row + column positions on error. +func (p *P) advanceCursor(r rune, w int) { + p.pos += w + if p.newline { + p.cursorColumn = 0 + p.cursorRow++ + } else { + p.cursorColumn++ + } + p.newline = r == '\n' +} diff --git a/parsekit/matching.go b/parsekit/matching.go new file mode 100644 index 0000000..6fc7784 --- /dev/null +++ b/parsekit/matching.go @@ -0,0 +1,120 @@ +package parsekit + +import ( + "strings" +) + +// AtEndOfFile returns true when there is no more data available in the input. +func (p *P) AtEndOfFile() bool { + return p.pos >= p.len +} + +// AtEndOfLine returns true when the cursor is either at the end of the line +// or at the end of the file. The cursor is not moved to a new position +// by this method. +func (p *P) AtEndOfLine() bool { + return p.AtEndOfFile() || + p.Upcoming("\r", "\n") || + p.Upcoming("\n") +} + +// SkipEndOfLine returns true when the cursor is either at the end of the line +// or at the end of the file. Additionally, when not at the end of the file, +// the cursor is moved forward to beyond the newline. +func (p *P) SkipEndOfLine() bool { + return p.AtEndOfFile() || + p.SkipMatching("\r", "\n") || + p.SkipMatching("\n") +} + +// AcceptEndOfLine returns true when the cursor is either at the end of the line +// or at the end of the file. When not at the end of the file, a normalized +// newline (only a '\n' character, even with '\r\n' on the input) +// is added to the string buffer. +func (p *P) AcceptEndOfLine() bool { + if p.AtEndOfFile() { + return true + } + if p.SkipEndOfLine() { + p.buffer.writeRune('\n') + return true + } + return false +} + +// Match checks if the upcoming runes satisfy all provided patterns. +// It returns a slice of runes that were found, a slice containing +// their respective byte widths, and a boolean indicating whether +// or not all provided patterns were satisfied by the input data. +func (p *P) Match(patterns ...string) ([]rune, []int, bool) { + peeked, widths, ok := p.peekMulti(len(patterns)) + if ok { + for i, r := range patterns { + if strings.IndexRune(r, peeked[i]) < 0 { + return peeked, widths, false + } + } + return peeked, widths, true + } + return peeked, widths, false +} + +// Upcoming checks if the upcoming runes satisfy all provided patterns. +// Returns true if all provided patterns are satisfied. +// This is basically the same as the Match method, but with only +// the boolean return parameter for programmer convenciency. +func (p *P) Upcoming(patterns ...string) bool { + _, _, ok := p.Match(patterns...) + return ok +} + +// AcceptAny adds the next rune from the input to the string buffer. +// If no rune could be read (end of file or invalid UTF8 data), +// then false is returned. +func (p *P) AcceptAny() bool { + if r, ok := p.next(); ok { + p.buffer.writeRune(r) + return true + } + return false +} + +// AcceptMatching adds the next runes to the string buffer, but only +// if the upcoming runes satisfy the provided patterns. +// When runes were added then true is returned, false otherwise. +func (p *P) AcceptMatching(patterns ...string) bool { + return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...) +} + +// AcceptConsecutive adds consecutive runes from the input to the string +// buffer, as long as they exist in the pattern. +// If any runes were added then true is returned, false otherwise. +func (p *P) AcceptConsecutive(pattern string) bool { + accepted := false + for p.AcceptMatching(pattern) { + accepted = true + } + return accepted +} + +// SkipMatching skips runes, but only when all provided patterns are satisfied. +// Returns true when one or more runes were skipped. +func (p *P) SkipMatching(patterns ...string) bool { + if runes, widths, ok := p.Match(patterns...); ok { + for i, r := range runes { + p.advanceCursor(r, widths[i]) + } + return true + } + return false +} + +// SkipConsecutive skips consecutive runes from the provided pattern. +// Returns true when one or more runes were skipped. +func (p *P) SkipConsecutive(pattern string) bool { + didSkip := false + for p.SkipMatching(pattern) { + didSkip = true + } + return didSkip +} diff --git a/parser/user_api.go b/parsekit/parsekit.go similarity index 59% rename from parser/user_api.go rename to parsekit/parsekit.go index 288b53d..634cbf3 100644 --- a/parser/user_api.go +++ b/parsekit/parsekit.go @@ -1,25 +1,36 @@ -package parser +package parsekit + +// New takes an input string and a start state, +// and initializes the parser for it. +func New(input string, startState StateFn) *P { + return &P{ + input: input, + len: len(input), + state: startState, + items: make(chan Item, 2), + } +} // Next retrieves the next parsed item. // When a valid item was found, then the boolean return parameter will be true. // On error or when successfully reaching the end of the input, false is returned. // When an error occurred, it will be set in the error return value, nil otherwise. -func (l *Parser) Next() (Item, *Error, bool) { +func (p *P) Next() (Item, *Error, bool) { for { select { - case i := <-l.items: + case i := <-p.items: switch { case i.Type == ItemEOF: return i, nil, false case i.Type == ItemError: - l.err = &Error{i.Value, l.cursorRow, l.cursorColumn} - return i, l.err, false + p.err = &Error{i.Value, p.cursorRow, p.cursorColumn} + return i, p.err, false default: - l.item = i + p.item = i return i, nil, true } default: - l.state = l.state(l) + p.state = p.state(p) } } } @@ -27,10 +38,10 @@ func (l *Parser) Next() (Item, *Error, bool) { // ToArray returns Parser items as an array (mainly intended for testing purposes) // When an error occurs during scanning, a partial result will be // returned, accompanied by the error that occurred. -func (l *Parser) ToArray() ([]Item, *Error) { +func (p *P) ToArray() ([]Item, *Error) { var items []Item for { - item, err, more := l.Next() + item, err, more := p.Next() if !more { return items, err } diff --git a/parser/statestack.go b/parsekit/staterouting.go similarity index 67% rename from parser/statestack.go rename to parsekit/staterouting.go index 1252fc6..decbbc0 100644 --- a/parser/statestack.go +++ b/parsekit/staterouting.go @@ -1,6 +1,6 @@ -package parser +package parsekit -func (p *Parser) QueueStates(states ...StateFn) StateFn { +func (p *P) QueueStates(states ...StateFn) StateFn { first, followup := states[0], states[1:] for reverse := range followup { p.PushState(followup[len(followup)-reverse-1]) @@ -8,24 +8,24 @@ func (p *Parser) QueueStates(states ...StateFn) StateFn { return first } -func (p *Parser) ToChildState(state StateFn) StateFn { +func (p *P) ToChildState(state StateFn) StateFn { p.PushState(p.state) return state } -func (p *Parser) ToParentState() StateFn { +func (p *P) ToParentState() StateFn { state := p.PopState() return state } // PushState adds the state function to the state stack. // This is used for implementing nested parsing. -func (p *Parser) PushState(state StateFn) { +func (p *P) PushState(state StateFn) { p.stack = append(p.stack, state) } // PopState pops the last pushed state from the state stack. -func (p *Parser) PopState() StateFn { +func (p *P) PopState() StateFn { last := len(p.stack) - 1 head, tail := p.stack[:last], p.stack[last] p.stack = head diff --git a/parser/stringbuf.go b/parsekit/stringbuf.go similarity index 99% rename from parser/stringbuf.go rename to parsekit/stringbuf.go index 46df171..8df4659 100644 --- a/parser/stringbuf.go +++ b/parsekit/stringbuf.go @@ -1,4 +1,4 @@ -package parser +package parsekit import ( "bytes" diff --git a/parser/stringbuf_test.go b/parsekit/stringbuf_test.go similarity index 99% rename from parser/stringbuf_test.go rename to parsekit/stringbuf_test.go index a71a6e8..0140688 100644 --- a/parser/stringbuf_test.go +++ b/parsekit/stringbuf_test.go @@ -1,4 +1,4 @@ -package parser +package parsekit import ( "testing" diff --git a/parser/types.go b/parsekit/types.go similarity index 93% rename from parser/types.go rename to parsekit/types.go index 951a785..24cc338 100644 --- a/parser/types.go +++ b/parsekit/types.go @@ -1,7 +1,7 @@ -package parser +package parsekit -// Parser holds the internal state of the Parser. -type Parser struct { +// P holds the internal state of the parser. +type P struct { state StateFn // a function that handles the current state stack []StateFn // state function stack, for nested parsing input string // the scanned input @@ -18,7 +18,7 @@ type Parser struct { // StateFn represents the state of the parser as a function // that returns the next state. -type StateFn func(*Parser) StateFn +type StateFn func(*P) StateFn // ItemType represents the type of a parser Item. type ItemType int diff --git a/lexer/definitions.go b/parser/definitions.go similarity index 68% rename from lexer/definitions.go rename to parser/definitions.go index ee32ae0..f9469be 100644 --- a/lexer/definitions.go +++ b/parser/definitions.go @@ -1,14 +1,14 @@ -package lexer +package parser -import "github.com/mmakaay/toml/parser" +import "github.com/mmakaay/toml/parsekit" -// Item types that are emitted by this parser. +// Item types that are produced by this parser. const ( - ItemComment parser.ItemType = iota // An error occurred - ItemKey // Key of a key/value pair - ItemKeyDot // Dot for a dotted key - ItemAssignment // Value assignment coming up (=) - ItemString // A value of type string + ItemComment parsekit.ItemType = iota // Comment string + ItemKey // Key of a key/value pair + ItemKeyDot // Dot for a dotted key + ItemAssignment // Value assignment coming up (=) + ItemString // A value of type string ) const ( @@ -43,6 +43,6 @@ var ( // NewParser creates a new parser, using the provided input string // as the data to parse. -func NewParser(input string) *parser.Parser { - return parser.New(input, stateKeyValuePair) +func NewParser(input string) *parsekit.P { + return parsekit.New(input, stateKeyValuePair) } diff --git a/lexer/helpers_test.go b/parser/helpers_test.go similarity index 90% rename from lexer/helpers_test.go rename to parser/helpers_test.go index 69d686e..2f23a1b 100644 --- a/lexer/helpers_test.go +++ b/parser/helpers_test.go @@ -1,12 +1,12 @@ -package lexer_test +package parser_test import ( "fmt" "strings" "testing" - "github.com/mmakaay/toml/lexer" - "github.com/mmakaay/toml/parser" + "github.com/mmakaay/toml/parsekit" + lexer "github.com/mmakaay/toml/parser" ) type statesT struct { @@ -56,8 +56,8 @@ func runStatesT(t *testing.T, c statesT) { } } -// ParserItemToString returns a string representation of the parser.Item. -func ParserItemToString(i parser.Item) string { +// ParserItemToString returns a string representation of the parsekit.Item. +func ParserItemToString(i parsekit.Item) string { switch i.Type { case lexer.ItemComment: return fmt.Sprintf("#(%s)", i.Value) @@ -70,6 +70,6 @@ func ParserItemToString(i parser.Item) string { case lexer.ItemAssignment: return "=" default: - panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type)) + panic(fmt.Sprintf("No string representation available for parsekit.Item id %d", i.Type)) } } diff --git a/lexer/lexer_test.go b/parser/lexer_test.go similarity index 89% rename from lexer/lexer_test.go rename to parser/lexer_test.go index 97542de..c20471d 100644 --- a/lexer/lexer_test.go +++ b/parser/lexer_test.go @@ -1,13 +1,13 @@ -package lexer_test +package parser_test import ( "testing" - "github.com/mmakaay/toml/lexer" + "github.com/mmakaay/toml/parser" ) func TestErrorsIncludeLineAndRowPosition(t *testing.T) { - _, err := lexer.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() + _, err := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() t.Logf("Got error: %s", err.Error()) if err.Row != 4 { t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4) diff --git a/parser/parser.go b/parser/parser.go deleted file mode 100644 index 5fcd8ec..0000000 --- a/parser/parser.go +++ /dev/null @@ -1,274 +0,0 @@ -package parser - -import ( - "fmt" - "strings" - "unicode/utf8" -) - -// New takes an input string and a start state, -// and initializes the parser for it. -func New(input string, startState StateFn) *Parser { - return &Parser{ - input: input, - len: len(input), - state: startState, - items: make(chan Item, 2), - } -} - -// AtEndOfFile returns true when there is no more data available in the input. -func (p *Parser) AtEndOfFile() bool { - return p.pos >= p.len -} - -func (p *Parser) AtEndOfLine() bool { - return p.AtEndOfFile() || - p.Upcoming("\r", "\n") || - p.Upcoming("\n") -} - -func (p *Parser) SkipEndOfLine() bool { - return p.AtEndOfFile() || - p.SkipMatching("\r", "\n") || - p.SkipMatching("\n") -} - -func (p *Parser) AcceptEndOfLine() bool { - // No newline, but we're defintely at the end of the line here. - if p.AtEndOfFile() { - return true - } - // If we see some kind of end of line, then we accept a - // normalized newline, which is just a '\n'. This will normalize - // '\r\n' into '\n'. - if p.SkipEndOfLine() { - p.buffer.writeRune('\n') - return true - } - return false -} - -// Emit passes a Parser item to the client, including the provided string. -func (p *Parser) Emit(t ItemType, s string) { - p.items <- Item{t, s} - p.buffer.reset() -} - -// EmitLiteral passes a Parser item to the client, including the accumulated -// string buffer data as a literal string. -func (p *Parser) EmitLiteral(t ItemType) { - p.Emit(t, p.buffer.asLiteralString()) -} - -// EmitLiteralTrim passes a Parser item to the client, including the -// accumulated string buffer data as a literal string with whitespace -// trimmed from it. -func (p *Parser) EmitLiteralTrim(t ItemType) { - p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) -} - -// EmitInterpreted passes a Parser item to the client, including the -// accumulated string buffer data a Go doubled quoted interpreted string -// (handling escape codes like \n, \t, \uXXXX, etc.) -// This method might return an error, in case there is data in the -// string buffer that is not valid for string interpretation. -func (p *Parser) EmitInterpreted(t ItemType) error { - s, err := p.buffer.asInterpretedString() - if err != nil { - return err - } - p.Emit(t, s) - return nil -} - -// EmitError emits a Parser error item to the client. -func (p *Parser) EmitError(format string, args ...interface{}) StateFn { - message := fmt.Sprintf(format, args...) - p.Emit(ItemError, message) - return nil -} - -// Match checks if the upcoming runes satisfy all provided patterns. -// It returns a slice of runes that were found, their total byte width -// and a boolean indicating whether or not all provided patterns were -// satisfied by the input data. -func (p *Parser) Match(patterns ...string) ([]rune, int, bool) { - peeked, width, ok := p.peekMulti(len(patterns)) - if ok { - for i, r := range patterns { - if strings.IndexRune(r, peeked[i]) < 0 { - return peeked, width, false - } - } - return peeked, width, true - } - return peeked, width, false -} - -// Upcoming checks if the upcoming runes satisfy all provided patterns. -// Returns true if all provided patterns are satisfied. -func (p *Parser) Upcoming(patterns ...string) bool { - _, _, ok := p.Match(patterns...) - return ok -} - -// AcceptAny adds the next rune from the input to the string buffer. -// If no rune could be read (end of file or invalid UTF8 data), -// then false is returned. -func (p *Parser) AcceptAny() bool { - if r, ok := p.next(); ok { - p.buffer.writeRune(r) - return true - } - return false -} - -// AcceptMatching adds the next runes to the string buffer, but only -// if the upcoming runes satisfy the provided patterns. -// When runes were added then true is returned, false otherwise. -func (p *Parser) AcceptMatching(patterns ...string) bool { - return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...) -} - -// AcceptConsecutive adds consecutive runes from the input to the string -// buffer, as long as they exist in the pattern. -// If any runes were added then true is returned, false otherwise. -func (p *Parser) AcceptConsecutive(pattern string) bool { - accepted := false - for p.AcceptMatching(pattern) { - accepted = true - } - return accepted -} - -// SkipMatching skips runes, but only when all provided patterns are satisfied. -// Returns true when one or more runes were skipped. -func (p *Parser) SkipMatching(patterns ...string) bool { - if runes, w, ok := p.Match(patterns...); ok { - p.pos += w - for _, r := range runes { - p.advanceCursor(r) - } - return true - } - return false -} - -// SkipConsecutive skips consecutive runes from the provided pattern. -// Returns true when one or more runes were skipped. -func (p *Parser) SkipConsecutive(pattern string) bool { - didSkip := false - for p.SkipMatching(pattern) { - didSkip = true - } - return didSkip -} - -// ============================================================================ -// EMIT DATA AND ERRORS -// ============================================================================ - -// UnexpectedInputError is used by a parser implementation to emit an -// error item that tells the client that an unexpected rune was -// encountered in the input. -// The parameter 'expected' is used to provide some context to the error. -func (p *Parser) UnexpectedInputError(expected string) StateFn { - // next() takes care of error messages for ok == false. - if r, ok := p.next(); ok { - return p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) - } - return nil -} - -// UnexpectedEndOfFile is used by a parser implementation to emit an -// error item that tells the client that more data was expected from -// the input. -// The parameter 'expected' is used to provide some context to the error. -func (p *Parser) UnexpectedEndOfFile(expected string) StateFn { - return p.EmitError("Unexpected end of file (expected %s)", expected) -} - -// ============================================================================ -// LEXER : our lexer is quite low level, it only returns UTF8 runes -// ============================================================================ - -// peek returns but does not advance to the next rune(s) in the input. -// Returns the rune, its width and a boolean. The boolean will be false in case -// no upcoming rune can be peeked (end of data or invalid UTF8 character). -func (p *Parser) peek() (rune, int, bool) { - peeked, width := utf8.DecodeRuneInString(p.input[p.pos:]) - return peeked, width, peeked != utf8.RuneError -} - -// peekMulti takes a peek at multiple upcoming runes in the input. -// Returns a slice of runes, their total width in bytes and a boolean. -// The boolean will be false in case less runes can be peeked than -// the requested amount (end of data or invalid UTF8 character). -func (p *Parser) peekMulti(amount int) ([]rune, int, bool) { - width := 0 - var peeked []rune - for i := 0; i < amount; i++ { - r, w := utf8.DecodeRuneInString(p.input[p.pos+width:]) - switch { - case r == utf8.RuneError: - return peeked, width, false - default: - width += w - peeked = append(peeked, r) - } - } - return peeked, width, true -} - -// progress moves the cursor forward in the input, returning one rune -// for every specified pattern. The cursor is only moved forward when -// all patterns are satisfied. -// Returns true when all patterns were satisfied and the cursor was -// moved forward, false otherwise. -// A callback function can be provided to specify what to do with -// the runes that are encountered in the input. -func (p *Parser) progress(callback func(rune), patterns ...string) bool { - if runes, w, ok := p.Match(patterns...); ok { - p.pos += w - for _, r := range runes { - callback(r) - p.advanceCursor(r) - } - return true - } - return false -} - -// next returns the next rune from the input and a boolean indicating if -// reading the input was successful. -// When the end of input is reached, or an invalid UTF8 character is -// read, then false is returned. Both are considered error cases, -// and for that reason these automatically emit an error to the client. -func (p *Parser) next() (rune, bool) { - r, w, ok := p.peek() - if ok { - p.pos += w - p.advanceCursor(r) - return r, true - } - if r == utf8.RuneError && w == 0 { - p.EmitError("unexpected end of file") - } else { - p.EmitError("invalid UTF8 character") - } - return r, false -} - -// advanceCursor advances the rune cursor one position in the -// input data. While doing so, it keeps tracks of newlines, -// so we can report on row + column positions on error. -func (p *Parser) advanceCursor(r rune) { - if p.newline { - p.cursorColumn = 0 - p.cursorRow++ - } else { - p.cursorColumn++ - } - p.newline = r == '\n' -} diff --git a/lexer/syn_comments.go b/parser/syn_comments.go similarity index 66% rename from lexer/syn_comments.go rename to parser/syn_comments.go index cb7082d..bd5b196 100644 --- a/lexer/syn_comments.go +++ b/parser/syn_comments.go @@ -1,15 +1,17 @@ -package lexer +package parser -import "github.com/mmakaay/toml/parser" +import ( + "github.com/mmakaay/toml/parsekit" +) // A '#' hash symbol marks the rest of the line as a comment. -func stateCommentStart(p *parser.Parser) parser.StateFn { +func stateCommentStart(p *parsekit.P) parsekit.StateFn { p.SkipConsecutive(hash) return stateCommentContent } // All characters up to the end of the line are included in the comment. -func stateCommentContent(p *parser.Parser) parser.StateFn { +func stateCommentContent(p *parsekit.P) parsekit.StateFn { switch { case p.AtEndOfLine(): p.EmitLiteralTrim(ItemComment) diff --git a/lexer/syn_comments_test.go b/parser/syn_comments_test.go similarity index 97% rename from lexer/syn_comments_test.go rename to parser/syn_comments_test.go index 0927350..8b70544 100644 --- a/lexer/syn_comments_test.go +++ b/parser/syn_comments_test.go @@ -1,4 +1,4 @@ -package lexer_test +package parser_test import ( "testing" diff --git a/parser/syn_eof.go b/parser/syn_eof.go new file mode 100644 index 0000000..73c0b8a --- /dev/null +++ b/parser/syn_eof.go @@ -0,0 +1,12 @@ +package parser + +import "github.com/mmakaay/toml/parsekit" + +func stateEndOfFile(p *parsekit.P) parsekit.StateFn { + if p.AtEndOfFile() { + p.Emit(parsekit.ItemEOF, "EOF") // todo Automate within parser? + } else { + p.UnexpectedInput("end of file") + } + return nil +} diff --git a/lexer/syn_key.go b/parser/syn_key.go similarity index 53% rename from lexer/syn_key.go rename to parser/syn_key.go index 31d85e7..0949b46 100644 --- a/lexer/syn_key.go +++ b/parser/syn_key.go @@ -1,15 +1,15 @@ -package lexer +package parser -import "github.com/mmakaay/toml/parser" +import "github.com/mmakaay/toml/parsekit" // The primary building block of a TOML document is the key/value pair. -func stateKeyValuePair(l *parser.Parser) parser.StateFn { +func stateKeyValuePair(p *parsekit.P) parsekit.StateFn { switch { - case l.SkipConsecutive(whitespace + carriageReturn + newline): + case p.SkipConsecutive(whitespace + carriageReturn + newline): return stateKeyValuePair - case l.Upcoming(hash): - return l.ToChildState(stateCommentStart) - case l.Upcoming(startOfKey): + case p.Upcoming(hash): + return p.ToChildState(stateCommentStart) + case p.Upcoming(startOfKey): return stateKey default: return stateEndOfFile @@ -17,32 +17,32 @@ func stateKeyValuePair(l *parser.Parser) parser.StateFn { } // A key may be either bare, quoted or dotted. -func stateKey(l *parser.Parser) parser.StateFn { - if l.AcceptMatching(bareKeyChars) { +func stateKey(p *parsekit.P) parsekit.StateFn { + if p.AcceptMatching(bareKeyChars) { return statebareKeyChars } - return l.UnexpectedInputError("a valid key name") + return p.UnexpectedInput("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. -func statebareKeyChars(l *parser.Parser) parser.StateFn { - l.AcceptConsecutive(bareKeyChars) - l.EmitLiteral(ItemKey) +func statebareKeyChars(p *parsekit.P) parsekit.StateFn { + p.AcceptConsecutive(bareKeyChars) + p.EmitLiteral(ItemKey) return stateEndOfKeyOrKeyDot } // Dotted keys are a sequence of bare or quoted keys joined with a dot. // This allows for grouping similar properties together: -func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn { +func stateEndOfKeyOrKeyDot(p *parsekit.P) parsekit.StateFn { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. - l.SkipConsecutive(whitespace) - if l.SkipMatching(dot) { - l.Emit(ItemKeyDot, "") - l.SkipConsecutive(whitespace) + p.SkipConsecutive(whitespace) + if p.SkipMatching(dot) { + p.Emit(ItemKeyDot, "") + p.SkipConsecutive(whitespace) return stateKey } return stateKeyAssignment @@ -52,12 +52,12 @@ func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn { // Whitespace is ignored around key names and values. The key, equals // sign, and value must be on the same line (though some values can // be broken over multiple lines). -func stateKeyAssignment(l *parser.Parser) parser.StateFn { - l.SkipConsecutive(whitespace) - if l.SkipMatching(equal) { - l.Emit(ItemAssignment, "") - l.SkipConsecutive(whitespace) +func stateKeyAssignment(p *parsekit.P) parsekit.StateFn { + p.SkipConsecutive(whitespace) + if p.SkipMatching(equal) { + p.Emit(ItemAssignment, "") + p.SkipConsecutive(whitespace) return stateValue } - return l.UnexpectedInputError("a value assignment") + return p.UnexpectedInput("a value assignment") } diff --git a/lexer/syn_key_test.go b/parser/syn_key_test.go similarity index 98% rename from lexer/syn_key_test.go rename to parser/syn_key_test.go index fe3b234..e3a7702 100644 --- a/lexer/syn_key_test.go +++ b/parser/syn_key_test.go @@ -1,4 +1,4 @@ -package lexer_test +package parser_test import ( "testing" diff --git a/lexer/syn_strings.go b/parser/syn_strings.go similarity index 69% rename from lexer/syn_strings.go rename to parser/syn_strings.go index 18fd0fb..68226c5 100644 --- a/lexer/syn_strings.go +++ b/parser/syn_strings.go @@ -1,19 +1,19 @@ -package lexer +package parser -import "github.com/mmakaay/toml/parser" +import "github.com/mmakaay/toml/parsekit" // There are four ways to express strings: basic, multi-line basic, literal, // and multi-line literal. All strings must contain only valid UTF-8 characters. // * Multi-line basic strings are surrounded by three quotation marks on each side. // * Basic strings are surrounded by quotation marks. -func stateStringValue(l *parser.Parser) parser.StateFn { +func stateStringValue(p *parsekit.P) parsekit.StateFn { switch { - case l.SkipMatching(doubleQuote3...): + case p.SkipMatching(doubleQuote3...): return stateMultiLineBasicString - case l.SkipMatching(doubleQuote): - return l.QueueStates(stateParseString, stateBasicStringSpecific) + case p.SkipMatching(doubleQuote): + return p.QueueStates(stateParseString, stateBasicStringSpecific) } - return l.UnexpectedInputError("a string value") + return p.UnexpectedInput("a string value") } // Specific handling of input for basic strings. @@ -22,7 +22,7 @@ func stateStringValue(l *parser.Parser) parser.StateFn { // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" -func stateBasicStringSpecific(p *parser.Parser) parser.StateFn { +func stateBasicStringSpecific(p *parsekit.P) parsekit.StateFn { switch { case p.SkipMatching(doubleQuote): if err := p.EmitInterpreted(ItemString); err != nil { @@ -36,8 +36,8 @@ func stateBasicStringSpecific(p *parser.Parser) parser.StateFn { } } -func stateMultiLineBasicString(l *parser.Parser) parser.StateFn { - l.EmitError("Not yet implemented") +func stateMultiLineBasicString(p *parsekit.P) parsekit.StateFn { + p.EmitError("Not yet implemented") return nil } @@ -50,11 +50,11 @@ const invalidBasicStringCharacters string = "\"\\" + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + "\u007F" -func stateParseString(l *parser.Parser) parser.StateFn { +func stateParseString(p *parsekit.P) parsekit.StateFn { switch { - case l.AtEndOfFile(): - return l.UnexpectedEndOfFile("basic string token") - case l.AcceptMatching(backslash, escapeChars): + case p.AtEndOfFile(): + return p.UnexpectedEndOfFile("basic string token") + case p.AcceptMatching(backslash, escapeChars): // For convenience, some popular characters have a compact escape sequence. // \b - backspace (U+0008) // \t - tab (U+0009) @@ -63,22 +63,22 @@ func stateParseString(l *parser.Parser) parser.StateFn { // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) - case l.AcceptMatching(shortUtf8Match...): + case p.AcceptMatching(shortUtf8Match...): // \uXXXX - unicode (U+XXXX) - case l.AcceptMatching(longUtf8Match...): + case p.AcceptMatching(longUtf8Match...): // \UXXXXXXXX - unicode (U+XXXXXXXX) - case l.Upcoming(backslash) || l.Upcoming(doubleQuote): + case p.Upcoming(backslash) || p.Upcoming(doubleQuote): // Returning to the parent state to have special cases handled, // because there are differences between single and multi line strings. - return l.ToParentState() - case l.Upcoming(invalidBasicStringCharacters): + return p.ToParentState() + case p.Upcoming(invalidBasicStringCharacters): // Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). - r, _, _ := l.Match(invalidBasicStringCharacters) - l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) + r, _, _ := p.Match(invalidBasicStringCharacters) + p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) return nil default: - l.AcceptAny() + p.AcceptAny() } return stateParseString } diff --git a/lexer/syn_strings_test.go b/parser/syn_strings_test.go similarity index 99% rename from lexer/syn_strings_test.go rename to parser/syn_strings_test.go index bbb0cfe..0598550 100644 --- a/lexer/syn_strings_test.go +++ b/parser/syn_strings_test.go @@ -1,4 +1,4 @@ -package lexer_test +package parser_test import ( "fmt" diff --git a/parser/syn_value.go b/parser/syn_value.go new file mode 100644 index 0000000..2d6604f --- /dev/null +++ b/parser/syn_value.go @@ -0,0 +1,13 @@ +package parser + +import "github.com/mmakaay/toml/parsekit" + +// Values must be of the following types: String, Integer, Float, Boolean, +// Datetime, Array, or Inline Table. Unspecified values are invalid. +func stateValue(p *parsekit.P) parsekit.StateFn { + p.SkipConsecutive(whitespace) + if p.Upcoming(quoteChars) { + return stateStringValue + } + return p.UnexpectedInput("a value") +}