diff --git a/parsekit/emitting.go b/parsekit/emitting.go index 6f25f8a..293b7ec 100644 --- a/parsekit/emitting.go +++ b/parsekit/emitting.go @@ -39,29 +39,27 @@ func (p *P) EmitInterpreted(t ItemType) error { } // EmitError emits a Parser error item to the client. -func (p *P) EmitError(format string, args ...interface{}) StateFn { +func (p *P) EmitError(format string, args ...interface{}) { message := fmt.Sprintf(format, args...) p.Emit(ItemError, message) - return nil } // UnexpectedInput is used by a parser implementation to emit an // error item that tells the client that an unexpected rune was // encountered in the input. // The parameter 'expected' is used to provide some context to the error. -func (p *P) UnexpectedInput(expected string) StateFn { +func (p *P) UnexpectedInput(expected string) { // next() takes care of error messages in cases where ok == false. // Therefore, we only provide an error message for the ok case here. if r, ok := p.next(); ok { - return p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) + p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) } - return nil } // UnexpectedEndOfFile is used by a parser implementation to emit an // error item that tells the client that more data was expected from // the input. // The parameter 'expected' is used to provide some context to the error. -func (p *P) UnexpectedEndOfFile(expected string) StateFn { - return p.EmitError("Unexpected end of file (expected %s)", expected) +func (p *P) UnexpectedEndOfFile(expected string) { + p.EmitError("Unexpected end of file (expected %s)", expected) } diff --git a/parsekit/internals.go b/parsekit/internals.go index d0521c2..776b19e 100644 --- a/parsekit/internals.go +++ b/parsekit/internals.go @@ -10,7 +10,7 @@ import ( // read, then false is returned. Both are considered error cases, // and for that reason these automatically emit an error to the client. func (p *P) next() (rune, bool) { - r, w, ok := p.peek() + r, w, ok := p.peek(0) if ok { p.advanceCursor(r, w) return r, true @@ -27,8 +27,8 @@ func (p *P) next() (rune, bool) { // Returns the rune, its width in bytes and a boolean. // The boolean will be false in case no upcoming rune can be peeked // (end of data or invalid UTF8 character). -func (p *P) peek() (rune, int, bool) { - peeked, width := utf8.DecodeRuneInString(p.input[p.pos:]) +func (p *P) peek(offsetInBytes int) (rune, int, bool) { + peeked, width := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) return peeked, width, peeked != utf8.RuneError } @@ -62,7 +62,7 @@ func (p *P) peekMulti(amount int) ([]rune, []int, bool) { // moved forward, false otherwise. // A callback function can be provided to specify what to do with // the runes that are encountered in the input. -func (p *P) progress(callback func(rune), patterns ...string) bool { +func (p *P) progress(callback func(rune), patterns ...interface{}) bool { if runes, widths, ok := p.Match(patterns...); ok { for i, r := range runes { callback(r) diff --git a/parsekit/matching.go b/parsekit/matching.go index 6fc7784..e1853d5 100644 --- a/parsekit/matching.go +++ b/parsekit/matching.go @@ -1,7 +1,9 @@ package parsekit import ( + "fmt" "strings" + "unicode/utf8" ) // AtEndOfFile returns true when there is no more data available in the input. @@ -42,28 +44,56 @@ func (p *P) AcceptEndOfLine() bool { return false } -// Match checks if the upcoming runes satisfy all provided patterns. -// It returns a slice of runes that were found, a slice containing -// their respective byte widths, and a boolean indicating whether -// or not all provided patterns were satisfied by the input data. -func (p *P) Match(patterns ...string) ([]rune, []int, bool) { - peeked, widths, ok := p.peekMulti(len(patterns)) - if ok { - for i, r := range patterns { - if strings.IndexRune(r, peeked[i]) < 0 { - return peeked, widths, false - } - } - return peeked, widths, true +func (p *P) Match(patterns ...interface{}) ([]rune, []int, bool) { + return p.match(0, patterns...) +} + +func (p *P) match(offset int, patterns ...interface{}) ([]rune, []int, bool) { + var runes []rune + var widths []int + + addRune := func(r rune, w int) { + offset += w + runes = append(runes, r) + widths = append(widths, w) } - return peeked, widths, false + + for _, pattern := range patterns { + r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:]) + if r == utf8.RuneError { + return runes, widths, false + } + switch pattern := pattern.(type) { + case []interface{}: + rs, ws, matched := p.match(offset, pattern...) + for i, r := range rs { + addRune(r, ws[i]) + } + if !matched { + return runes, widths, false + } + case string: + if strings.IndexRune(pattern, r) < 0 { + return runes, widths, false + } + addRune(r, w) + case rune: + if pattern != r { + return runes, widths, false + } + addRune(r, w) + default: + panic(fmt.Sprintf("Not rune matching implemented for pattern of type %T", pattern)) + } + } + return runes, widths, true } // Upcoming checks if the upcoming runes satisfy all provided patterns. // Returns true if all provided patterns are satisfied. // This is basically the same as the Match method, but with only // the boolean return parameter for programmer convenciency. -func (p *P) Upcoming(patterns ...string) bool { +func (p *P) Upcoming(patterns ...interface{}) bool { _, _, ok := p.Match(patterns...) return ok } @@ -79,10 +109,50 @@ func (p *P) AcceptAny() bool { return false } +type afterFollowup struct { + p *P + runes []rune + widths []int + ok bool +} + +func (a *afterFollowup) Store() bool { + if a.ok { + for i, r := range a.runes { + a.p.buffer.writeRune(r) + a.p.advanceCursor(r, a.widths[i]) + } + } + return a.ok +} + +func (a *afterFollowup) Ignore() bool { + if a.ok { + for i, r := range a.runes { + a.p.advanceCursor(r, a.widths[i]) + } + } + return a.ok +} + +func (a *afterFollowup) Backup() bool { + return a.ok +} + +func (p *P) After(patterns ...interface{}) *afterFollowup { + runes, widths, ok := p.Match(patterns...) + return &afterFollowup{ + p: p, + runes: runes, + widths: widths, + ok: ok, + } +} + // AcceptMatching adds the next runes to the string buffer, but only // if the upcoming runes satisfy the provided patterns. // When runes were added then true is returned, false otherwise. -func (p *P) AcceptMatching(patterns ...string) bool { +func (p *P) AcceptMatching(patterns ...interface{}) bool { return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...) } @@ -99,7 +169,7 @@ func (p *P) AcceptConsecutive(pattern string) bool { // SkipMatching skips runes, but only when all provided patterns are satisfied. // Returns true when one or more runes were skipped. -func (p *P) SkipMatching(patterns ...string) bool { +func (p *P) SkipMatching(patterns ...interface{}) bool { if runes, widths, ok := p.Match(patterns...); ok { for i, r := range runes { p.advanceCursor(r, widths[i]) diff --git a/parsekit/parsekit.go b/parsekit/parsekit.go index 634cbf3..b669890 100644 --- a/parsekit/parsekit.go +++ b/parsekit/parsekit.go @@ -4,10 +4,10 @@ package parsekit // and initializes the parser for it. func New(input string, startState StateFn) *P { return &P{ - input: input, - len: len(input), - state: startState, - items: make(chan Item, 2), + input: input, + len: len(input), + nextState: startState, + items: make(chan Item, 2), } } @@ -30,7 +30,11 @@ func (p *P) Next() (Item, *Error, bool) { return i, nil, true } default: - p.state = p.state(p) + if p.nextState == nil { + panic("No next state was scheduled for the parser") + } + p.state = p.nextState + p.state(p) } } } diff --git a/parsekit/staterouting.go b/parsekit/staterouting.go index decbbc0..b6e0831 100644 --- a/parsekit/staterouting.go +++ b/parsekit/staterouting.go @@ -1,11 +1,30 @@ package parsekit -func (p *P) QueueStates(states ...StateFn) StateFn { - first, followup := states[0], states[1:] - for reverse := range followup { - p.PushState(followup[len(followup)-reverse-1]) - } - return first +func (p *P) RouteRepeat() { + p.nextState = p.state + return +} + +type RouteFollowup struct { + p *P +} + +func (p *P) RouteTo(state StateFn) *RouteFollowup { + p.nextState = state + return &RouteFollowup{p} +} + +func (r *RouteFollowup) ThenTo(state StateFn) *RouteFollowup { + r.p.PushState(state) + return r +} + +func (r *RouteFollowup) ThenReturnHere() { + r.p.PushState(r.p.state) +} + +func (p *P) RouteReturn() { + p.nextState = p.PopState() } func (p *P) ToChildState(state StateFn) StateFn { diff --git a/parsekit/types.go b/parsekit/types.go index 24cc338..e07eb5d 100644 --- a/parsekit/types.go +++ b/parsekit/types.go @@ -2,7 +2,8 @@ package parsekit // P holds the internal state of the parser. type P struct { - state StateFn // a function that handles the current state + state StateFn // the function that handles the current state + nextState StateFn // the function that will handle the next state stack []StateFn // state function stack, for nested parsing input string // the scanned input len int // the total length of the input in bytes @@ -18,7 +19,7 @@ type P struct { // StateFn represents the state of the parser as a function // that returns the next state. -type StateFn func(*P) StateFn +type StateFn func(*P) // ItemType represents the type of a parser Item. type ItemType int diff --git a/parser/definitions.go b/parser/definitions.go deleted file mode 100644 index f9469be..0000000 --- a/parser/definitions.go +++ /dev/null @@ -1,48 +0,0 @@ -package parser - -import "github.com/mmakaay/toml/parsekit" - -// Item types that are produced by this parser. -const ( - ItemComment parsekit.ItemType = iota // Comment string - ItemKey // Key of a key/value pair - ItemKeyDot // Dot for a dotted key - ItemAssignment // Value assignment coming up (=) - ItemString // A value of type string -) - -const ( - whitespace string = " \t" - carriageReturn string = "\r" - newline string = "\n" - hash string = "#" - equal string = "=" - lower string = "abcdefghijklmnopqrstuvwxyz" - upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - digits string = "0123456789" - hex string = digits + "abcdefABCDEF" - dot string = "." - underscore string = "_" - dash string = "-" - singleQuote string = "'" - doubleQuote string = "\"" - backslash string = "\\" - quoteChars string = singleQuote + doubleQuote - bareKeyChars string = lower + upper + digits + underscore + dash - startOfKey string = bareKeyChars + quoteChars - escapeChars string = `btnfr"\` - shortUtf8Escape string = "u" - longUtf8Escape string = "U" -) - -var ( - doubleQuote3 = []string{doubleQuote, doubleQuote, doubleQuote} - shortUtf8Match = []string{backslash, "u", hex, hex, hex, hex} - longUtf8Match = []string{backslash, "U", hex, hex, hex, hex, hex, hex, hex, hex} -) - -// NewParser creates a new parser, using the provided input string -// as the data to parse. -func NewParser(input string) *parsekit.P { - return parsekit.New(input, stateKeyValuePair) -} diff --git a/parser/parser.go b/parser/parser.go new file mode 100644 index 0000000..65c7ed7 --- /dev/null +++ b/parser/parser.go @@ -0,0 +1,53 @@ +package parser + +import "github.com/mmakaay/toml/parsekit" + +// Item types that are produced by this parser. +const ( + ItemComment parsekit.ItemType = iota // Comment string + ItemKey // Key of a key/value pair + ItemKeyDot // Dot for a dotted key + ItemAssignment // Value assignment coming up (=) + ItemString // A value of type string +) + +const ( + whitespace string = " \t" + carriageReturn string = "\r" + newline string = "\n" + hash string = "#" + equal string = "=" + lower string = "abcdefghijklmnopqrstuvwxyz" + upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + digits string = "0123456789" + hex string = digits + "abcdefABCDEF" + dot string = "." + underscore string = "_" + dash string = "-" + singleQuote string = "'" + doubleQuote string = "\"" + backslash string = "\\" + quoteChars string = singleQuote + doubleQuote + bareKeyChars string = lower + upper + digits + underscore + dash + startOfKey string = bareKeyChars + quoteChars + validEscapeChars string = `btnfr"\` + mustBeEscaped string = "" + + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + + "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + + "\u007F" +) + +var ( + doubleQuote3 = []interface{}{doubleQuote, doubleQuote, doubleQuote} + hex4 = []interface{}{hex, hex, hex, hex} + shortUtf8Match = []interface{}{backslash, 'u', hex4} + longUtf8Match = []interface{}{backslash, 'U', hex4, hex4} +) + +// NewParser creates a new parser, using the provided input string +// as the data to parse. +func NewParser(input string) *parsekit.P { + return parsekit.New(input, stateKeyValuePair) +} diff --git a/parser/lexer_test.go b/parser/parser_test.go similarity index 100% rename from parser/lexer_test.go rename to parser/parser_test.go diff --git a/parser/syn_comments.go b/parser/syn_comments.go index bd5b196..c5d9927 100644 --- a/parser/syn_comments.go +++ b/parser/syn_comments.go @@ -5,19 +5,19 @@ import ( ) // A '#' hash symbol marks the rest of the line as a comment. -func stateCommentStart(p *parsekit.P) parsekit.StateFn { +func stateCommentStart(p *parsekit.P) { p.SkipConsecutive(hash) - return stateCommentContent + p.RouteTo(stateCommentContent) } // All characters up to the end of the line are included in the comment. -func stateCommentContent(p *parsekit.P) parsekit.StateFn { +func stateCommentContent(p *parsekit.P) { switch { case p.AtEndOfLine(): p.EmitLiteralTrim(ItemComment) - return p.ToParentState() + p.RouteReturn() default: p.AcceptAny() - return stateCommentContent + p.RouteRepeat() } } diff --git a/parser/syn_eof.go b/parser/syn_eof.go index 73c0b8a..2d61f97 100644 --- a/parser/syn_eof.go +++ b/parser/syn_eof.go @@ -2,11 +2,10 @@ package parser import "github.com/mmakaay/toml/parsekit" -func stateEndOfFile(p *parsekit.P) parsekit.StateFn { +func stateEndOfFile(p *parsekit.P) { if p.AtEndOfFile() { p.Emit(parsekit.ItemEOF, "EOF") // todo Automate within parser? } else { p.UnexpectedInput("end of file") } - return nil } diff --git a/parser/syn_key.go b/parser/syn_key.go index 0949b46..bef5ee4 100644 --- a/parser/syn_key.go +++ b/parser/syn_key.go @@ -3,61 +3,64 @@ package parser import "github.com/mmakaay/toml/parsekit" // The primary building block of a TOML document is the key/value pair. -func stateKeyValuePair(p *parsekit.P) parsekit.StateFn { +func stateKeyValuePair(p *parsekit.P) { switch { - case p.SkipConsecutive(whitespace + carriageReturn + newline): - return stateKeyValuePair - case p.Upcoming(hash): - return p.ToChildState(stateCommentStart) - case p.Upcoming(startOfKey): - return stateKey + case p.After(whitespace + carriageReturn + newline).Ignore(): + p.RouteRepeat() + case p.After(hash).Backup(): + p.RouteTo(stateCommentStart).ThenReturnHere() + case p.After(startOfKey).Backup(): + p.RouteTo(stateKey) default: - return stateEndOfFile + p.RouteTo(stateEndOfFile) } } // A key may be either bare, quoted or dotted. -func stateKey(p *parsekit.P) parsekit.StateFn { - if p.AcceptMatching(bareKeyChars) { - return statebareKeyChars +func stateKey(p *parsekit.P) { + if p.After(bareKeyChars).Backup() { + p.RouteTo(statebareKey) + } else { + p.UnexpectedInput("a valid key name") } - return p.UnexpectedInput("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. -func statebareKeyChars(p *parsekit.P) parsekit.StateFn { - p.AcceptConsecutive(bareKeyChars) +func statebareKey(p *parsekit.P) { + p.AcceptConsecutive(bareKeyChars) // TODO make a plan for adding this to After() p.EmitLiteral(ItemKey) - return stateEndOfKeyOrKeyDot + p.RouteTo(stateEndOfKeyOrKeyDot) } // Dotted keys are a sequence of bare or quoted keys joined with a dot. // This allows for grouping similar properties together: -func stateEndOfKeyOrKeyDot(p *parsekit.P) parsekit.StateFn { +func stateEndOfKeyOrKeyDot(p *parsekit.P) { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. p.SkipConsecutive(whitespace) - if p.SkipMatching(dot) { - p.Emit(ItemKeyDot, "") + if p.After(dot).Store() { p.SkipConsecutive(whitespace) - return stateKey + p.EmitLiteral(ItemKeyDot) + p.RouteTo(stateKey) + } else { + p.RouteTo(stateKeyAssignment) } - return stateKeyAssignment } // Keys are on the left of the equals sign and values are on the right. // Whitespace is ignored around key names and values. The key, equals // sign, and value must be on the same line (though some values can // be broken over multiple lines). -func stateKeyAssignment(p *parsekit.P) parsekit.StateFn { +func stateKeyAssignment(p *parsekit.P) { p.SkipConsecutive(whitespace) - if p.SkipMatching(equal) { - p.Emit(ItemAssignment, "") + if p.After(equal).Store() { + p.EmitLiteral(ItemAssignment) p.SkipConsecutive(whitespace) - return stateValue + p.RouteTo(stateValue) + } else { + p.UnexpectedInput("a value assignment") } - return p.UnexpectedInput("a value assignment") } diff --git a/parser/syn_strings.go b/parser/syn_strings.go index 68226c5..de149fe 100644 --- a/parser/syn_strings.go +++ b/parser/syn_strings.go @@ -6,14 +6,52 @@ import "github.com/mmakaay/toml/parsekit" // and multi-line literal. All strings must contain only valid UTF-8 characters. // * Multi-line basic strings are surrounded by three quotation marks on each side. // * Basic strings are surrounded by quotation marks. -func stateStringValue(p *parsekit.P) parsekit.StateFn { +func stateStringValue(p *parsekit.P) { switch { - case p.SkipMatching(doubleQuote3...): - return stateMultiLineBasicString + case p.SkipMatching(doubleQuote3): + p.RouteTo(stateMultiLineBasicString) case p.SkipMatching(doubleQuote): - return p.QueueStates(stateParseString, stateBasicStringSpecific) + p.RouteTo(parseString).ThenTo(basicStringSpecifics) + default: + p.UnexpectedInput("a string value") + } +} + +func stateMultiLineBasicString(p *parsekit.P) { + p.EmitError("Not yet implemented") +} + +// For convenience, some popular characters have a compact escape sequence. +// +// \b - backspace (U+0008) +// \t - tab (U+0009) +// \n - linefeed (U+000A) +// \f - form feed (U+000C) +// \r - carriage return (U+000D) +// \" - quote (U+0022) +// \\ - backslash (U+005C) +// \uXXXX - unicode (U+XXXX) +// \UXXXXXXXX - unicode (U+XXXXXXXX) +// +// Any Unicode character may be used except those that must be escaped: +// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). +func parseString(p *parsekit.P) { + switch { + case p.AtEndOfFile(): + p.UnexpectedEndOfFile("basic string token") + case p.After(backslash, validEscapeChars).Store() || + p.After(shortUtf8Match).Store() || + p.After(longUtf8Match).Store(): + p.RouteRepeat() + case p.After(mustBeEscaped).Backup(): + r, _, _ := p.Match(mustBeEscaped) + p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) + case p.After(backslash).Backup() || p.After(doubleQuote).Backup(): + p.RouteReturn() + default: + p.AcceptAny() + p.RouteRepeat() } - return p.UnexpectedInput("a string value") } // Specific handling of input for basic strings. @@ -21,64 +59,17 @@ func stateStringValue(p *parsekit.P) parsekit.StateFn { // * No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" - -func stateBasicStringSpecific(p *parsekit.P) parsekit.StateFn { +func basicStringSpecifics(p *parsekit.P) { switch { - case p.SkipMatching(doubleQuote): - if err := p.EmitInterpreted(ItemString); err != nil { - return p.EmitError("Invalid data in string: %s", err) + case p.After(doubleQuote).Ignore(): + if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase? + p.EmitError("Invalid data in string: %s", err) + } else { + p.RouteTo(stateKeyValuePair) } - return stateKeyValuePair - case p.Upcoming(backslash): - return p.EmitError("Invalid escape sequence") + case p.After(backslash).Backup(): + p.EmitError("Invalid escape sequence") default: - return p.QueueStates(stateParseString, stateBasicStringSpecific) + p.RouteTo(parseString).ThenTo(basicStringSpecifics) } } - -func stateMultiLineBasicString(p *parsekit.P) parsekit.StateFn { - p.EmitError("Not yet implemented") - return nil -} - -// Any Unicode character may be used except those that must be escaped: -// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). -const invalidBasicStringCharacters string = "\"\\" + - "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + - "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + - "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + - "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + - "\u007F" - -func stateParseString(p *parsekit.P) parsekit.StateFn { - switch { - case p.AtEndOfFile(): - return p.UnexpectedEndOfFile("basic string token") - case p.AcceptMatching(backslash, escapeChars): - // For convenience, some popular characters have a compact escape sequence. - // \b - backspace (U+0008) - // \t - tab (U+0009) - // \n - linefeed (U+000A) - // \f - form feed (U+000C) - // \r - carriage return (U+000D) - // \" - quote (U+0022) - // \\ - backslash (U+005C) - case p.AcceptMatching(shortUtf8Match...): - // \uXXXX - unicode (U+XXXX) - case p.AcceptMatching(longUtf8Match...): - // \UXXXXXXXX - unicode (U+XXXXXXXX) - case p.Upcoming(backslash) || p.Upcoming(doubleQuote): - // Returning to the parent state to have special cases handled, - // because there are differences between single and multi line strings. - return p.ToParentState() - case p.Upcoming(invalidBasicStringCharacters): - // Any Unicode character may be used except those that must be escaped: - // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). - r, _, _ := p.Match(invalidBasicStringCharacters) - p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) - return nil - default: - p.AcceptAny() - } - return stateParseString -} diff --git a/parser/syn_value.go b/parser/syn_value.go index 2d6604f..e55fc64 100644 --- a/parser/syn_value.go +++ b/parser/syn_value.go @@ -4,10 +4,11 @@ import "github.com/mmakaay/toml/parsekit" // Values must be of the following types: String, Integer, Float, Boolean, // Datetime, Array, or Inline Table. Unspecified values are invalid. -func stateValue(p *parsekit.P) parsekit.StateFn { +func stateValue(p *parsekit.P) { p.SkipConsecutive(whitespace) if p.Upcoming(quoteChars) { - return stateStringValue + p.RouteTo(stateStringValue) + } else { + p.UnexpectedInput("a value") } - return p.UnexpectedInput("a value") }