Made the pattern matching code more straightforward to use. I can now write things like p.After(upper, upper, hex4).Store() to store the matched runes in the string buffer once the match is complete. For now, the alternatives to Store() are Backup() (which turns the match into more of a peek) and Ignore() (which skips over the scanned text). I think this approach gives the coder who uses the library a nice mental model. It's close to how we think about parsing (or at least how I do).

This commit is contained in:
Maurice Makaay 2019-05-18 01:19:25 +00:00
parent 666cff3af3
commit 4556520582
14 changed files with 276 additions and 185 deletions

View File

@ -39,29 +39,27 @@ func (p *P) EmitInterpreted(t ItemType) error {
}
// EmitError emits a Parser error item to the client.
func (p *P) EmitError(format string, args ...interface{}) StateFn {
func (p *P) EmitError(format string, args ...interface{}) {
message := fmt.Sprintf(format, args...)
p.Emit(ItemError, message)
return nil
}
// UnexpectedInput is used by a parser implementation to emit an
// error item that tells the client that an unexpected rune was
// encountered in the input.
// The parameter 'expected' is used to provide some context to the error.
func (p *P) UnexpectedInput(expected string) StateFn {
func (p *P) UnexpectedInput(expected string) {
// next() takes care of error messages in cases where ok == false.
// Therefore, we only provide an error message for the ok case here.
if r, ok := p.next(); ok {
return p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
}
return nil
}
// UnexpectedEndOfFile is used by a parser implementation to emit an
// error item that tells the client that more data was expected from
// the input.
// The parameter 'expected' is used to provide some context to the error.
func (p *P) UnexpectedEndOfFile(expected string) StateFn {
return p.EmitError("Unexpected end of file (expected %s)", expected)
func (p *P) UnexpectedEndOfFile(expected string) {
p.EmitError("Unexpected end of file (expected %s)", expected)
}

View File

@ -10,7 +10,7 @@ import (
// read, then false is returned. Both are considered error cases,
// and for that reason these automatically emit an error to the client.
func (p *P) next() (rune, bool) {
r, w, ok := p.peek()
r, w, ok := p.peek(0)
if ok {
p.advanceCursor(r, w)
return r, true
@ -27,8 +27,8 @@ func (p *P) next() (rune, bool) {
// Returns the rune, its width in bytes and a boolean.
// The boolean will be false in case no upcoming rune can be peeked
// (end of data or invalid UTF8 character).
func (p *P) peek() (rune, int, bool) {
peeked, width := utf8.DecodeRuneInString(p.input[p.pos:])
func (p *P) peek(offsetInBytes int) (rune, int, bool) {
peeked, width := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:])
return peeked, width, peeked != utf8.RuneError
}
@ -62,7 +62,7 @@ func (p *P) peekMulti(amount int) ([]rune, []int, bool) {
// moved forward, false otherwise.
// A callback function can be provided to specify what to do with
// the runes that are encountered in the input.
func (p *P) progress(callback func(rune), patterns ...string) bool {
func (p *P) progress(callback func(rune), patterns ...interface{}) bool {
if runes, widths, ok := p.Match(patterns...); ok {
for i, r := range runes {
callback(r)

View File

@ -1,7 +1,9 @@
package parsekit
import (
"fmt"
"strings"
"unicode/utf8"
)
// AtEndOfFile returns true when there is no more data available in the input.
@ -42,28 +44,56 @@ func (p *P) AcceptEndOfLine() bool {
return false
}
// Match checks if the upcoming runes satisfy all provided patterns.
// It returns a slice of runes that were found, a slice containing
// their respective byte widths, and a boolean indicating whether
// or not all provided patterns were satisfied by the input data.
func (p *P) Match(patterns ...string) ([]rune, []int, bool) {
peeked, widths, ok := p.peekMulti(len(patterns))
if ok {
for i, r := range patterns {
if strings.IndexRune(r, peeked[i]) < 0 {
return peeked, widths, false
func (p *P) Match(patterns ...interface{}) ([]rune, []int, bool) {
return p.match(0, patterns...)
}
func (p *P) match(offset int, patterns ...interface{}) ([]rune, []int, bool) {
var runes []rune
var widths []int
addRune := func(r rune, w int) {
offset += w
runes = append(runes, r)
widths = append(widths, w)
}
for _, pattern := range patterns {
r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:])
if r == utf8.RuneError {
return runes, widths, false
}
switch pattern := pattern.(type) {
case []interface{}:
rs, ws, matched := p.match(offset, pattern...)
for i, r := range rs {
addRune(r, ws[i])
}
if !matched {
return runes, widths, false
}
case string:
if strings.IndexRune(pattern, r) < 0 {
return runes, widths, false
}
addRune(r, w)
case rune:
if pattern != r {
return runes, widths, false
}
addRune(r, w)
default:
panic(fmt.Sprintf("Not rune matching implemented for pattern of type %T", pattern))
}
}
return peeked, widths, true
}
return peeked, widths, false
return runes, widths, true
}
// Upcoming checks if the upcoming runes satisfy all provided patterns.
// Returns true if all provided patterns are satisfied.
// This is basically the same as the Match method, but with only
// the boolean return parameter for programmer convenience.
func (p *P) Upcoming(patterns ...string) bool {
func (p *P) Upcoming(patterns ...interface{}) bool {
_, _, ok := p.Match(patterns...)
return ok
}
@ -79,10 +109,50 @@ func (p *P) AcceptAny() bool {
return false
}
// afterFollowup holds the outcome of a pattern match that was performed
// by P.After. The caller decides what to do with the matched runes by
// calling one of its methods: Store, Ignore or Backup.
type afterFollowup struct {
p *P // the parser on which the match was performed
runes []rune // the runes that were returned by the match
widths []int // the byte width of each rune in runes (same indexes)
ok bool // whether or not all provided patterns were satisfied
}
// Store completes a P.After match by writing every matched rune to the
// string buffer of the parser and advancing the cursor past that rune.
// When the match was not successful, nothing is written or advanced.
// It returns true when the match was successful, false otherwise.
func (a *afterFollowup) Store() bool {
	if !a.ok {
		return false
	}
	for idx := range a.runes {
		matched := a.runes[idx]
		a.p.buffer.writeRune(matched)
		a.p.advanceCursor(matched, a.widths[idx])
	}
	return true
}
// Ignore completes a P.After match by advancing the cursor past every
// matched rune, without writing anything to the string buffer.
// When the match was not successful, the cursor is left untouched.
// It returns true when the match was successful, false otherwise.
func (a *afterFollowup) Ignore() bool {
	if !a.ok {
		return false
	}
	for idx, matched := range a.runes {
		width := a.widths[idx]
		a.p.advanceCursor(matched, width)
	}
	return true
}
// Backup completes a P.After match without doing anything with the
// matched runes: this method stores no runes and does not advance the
// cursor, which makes the match work like a peek.
// It returns true when the match was successful, false otherwise.
func (a *afterFollowup) Backup() bool {
return a.ok
}
// After checks if the upcoming runes satisfy all provided patterns (using
// Match) and wraps the outcome in an afterFollowup. The caller must call
// Store, Ignore or Backup on the returned value to decide what has to be
// done with the matched runes and to learn whether the match succeeded.
func (p *P) After(patterns ...interface{}) *afterFollowup {
	followup := &afterFollowup{p: p}
	followup.runes, followup.widths, followup.ok = p.Match(patterns...)
	return followup
}
// AcceptMatching adds the next runes to the string buffer, but only
// if the upcoming runes satisfy the provided patterns.
// When runes were added then true is returned, false otherwise.
func (p *P) AcceptMatching(patterns ...string) bool {
func (p *P) AcceptMatching(patterns ...interface{}) bool {
return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...)
}
@ -99,7 +169,7 @@ func (p *P) AcceptConsecutive(pattern string) bool {
// SkipMatching skips runes, but only when all provided patterns are satisfied.
// Returns true when one or more runes were skipped.
func (p *P) SkipMatching(patterns ...string) bool {
func (p *P) SkipMatching(patterns ...interface{}) bool {
if runes, widths, ok := p.Match(patterns...); ok {
for i, r := range runes {
p.advanceCursor(r, widths[i])

View File

@ -6,7 +6,7 @@ func New(input string, startState StateFn) *P {
return &P{
input: input,
len: len(input),
state: startState,
nextState: startState,
items: make(chan Item, 2),
}
}
@ -30,7 +30,11 @@ func (p *P) Next() (Item, *Error, bool) {
return i, nil, true
}
default:
p.state = p.state(p)
if p.nextState == nil {
panic("No next state was scheduled for the parser")
}
p.state = p.nextState
p.state(p)
}
}
}

View File

@ -1,11 +1,30 @@
package parsekit
func (p *P) QueueStates(states ...StateFn) StateFn {
first, followup := states[0], states[1:]
for reverse := range followup {
p.PushState(followup[len(followup)-reverse-1])
func (p *P) RouteRepeat() {
p.nextState = p.state
return
}
return first
// RouteFollowup is returned by P.RouteTo. Its methods push additional
// states onto the state stack of the parser by calling P.PushState.
type RouteFollowup struct {
p *P // the parser on which RouteTo was called
}
// RouteTo schedules the provided state as the next state of the parser.
// It returns a RouteFollowup for the parser, which can be used to push
// further states onto the state stack (see ThenTo and ThenReturnHere).
func (p *P) RouteTo(state StateFn) *RouteFollowup {
	followup := &RouteFollowup{p: p}
	p.nextState = state
	return followup
}
// ThenTo pushes the provided state onto the state stack of the parser
// (using PushState) and returns the RouteFollowup, so calls can be chained.
// NOTE(review): presumably the pushed state is picked up by a later
// RouteReturn call, which pops the stack — confirm against PushState/PopState.
func (r *RouteFollowup) ThenTo(state StateFn) *RouteFollowup {
r.p.PushState(state)
return r
}
// ThenReturnHere pushes the state that the parser is currently handling
// (P.state) onto the state stack of the parser (using PushState).
// NOTE(review): presumably this makes a later RouteReturn call route back
// to the current state — confirm against PushState/PopState.
func (r *RouteFollowup) ThenReturnHere() {
r.p.PushState(r.p.state)
}
// RouteReturn schedules the state that is returned by PopState as the
// next state of the parser.
func (p *P) RouteReturn() {
p.nextState = p.PopState()
}
func (p *P) ToChildState(state StateFn) StateFn {

View File

@ -2,7 +2,8 @@ package parsekit
// P holds the internal state of the parser.
type P struct {
state StateFn // a function that handles the current state
state StateFn // the function that handles the current state
nextState StateFn // the function that will handle the next state
stack []StateFn // state function stack, for nested parsing
input string // the scanned input
len int // the total length of the input in bytes
@ -18,7 +19,7 @@ type P struct {
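// StateFn represents the state of the parser as a function.
// A state function does not return the next state. Instead, it schedules
// the next state on the parser (see the nextState field of P).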
type StateFn func(*P) StateFn
type StateFn func(*P)
// ItemType represents the type of a parser Item.
type ItemType int

View File

@ -1,48 +0,0 @@
package parser
import "github.com/mmakaay/toml/parsekit"
// Item types that are produced by this parser.
const (
ItemComment parsekit.ItemType = iota // Comment string
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
var (
doubleQuote3 = []string{doubleQuote, doubleQuote, doubleQuote}
shortUtf8Match = []string{backslash, "u", hex, hex, hex, hex}
longUtf8Match = []string{backslash, "U", hex, hex, hex, hex, hex, hex, hex, hex}
)
// NewParser creates a new parser, using the provided input string
// as the data to parse.
func NewParser(input string) *parsekit.P {
return parsekit.New(input, stateKeyValuePair)
}

53
parser/parser.go Normal file
View File

@ -0,0 +1,53 @@
package parser
import "github.com/mmakaay/toml/parsekit"
// Item types that are produced by this parser.
const (
ItemComment parsekit.ItemType = iota // Comment string
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
// Sets of characters that are used as patterns by the parser states.
// Each string lists the runes that are accepted by that pattern.
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
// The characters that can follow a backslash in a compact escape
// sequence: \b, \t, \n, \f, \r, \" and \\.
validEscapeChars string = `btnfr"\`
// The control characters (U+0000 to U+001F, U+007F). The string parsing
// code reports these as invalid when they occur unescaped in a basic string.
mustBeEscaped string = "" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
)
// Reusable multi-rune patterns for the parsekit matching methods.
// A []interface{} value is matched as a nested sequence of patterns,
// which is what allows hex4 to be embedded in the UTF8 escape patterns.
var (
doubleQuote3 = []interface{}{doubleQuote, doubleQuote, doubleQuote} // three double quotes in a row
hex4 = []interface{}{hex, hex, hex, hex} // four hexadecimal digits
shortUtf8Match = []interface{}{backslash, 'u', hex4} // \uXXXX
longUtf8Match = []interface{}{backslash, 'U', hex4, hex4} // \UXXXXXXXX
)
// NewParser creates a new parser, using the provided input string
// as the data to parse. The parser is created through parsekit.New,
// with stateKeyValuePair as its start state.
func NewParser(input string) *parsekit.P {
return parsekit.New(input, stateKeyValuePair)
}

View File

@ -5,19 +5,19 @@ import (
)
// A '#' hash symbol marks the rest of the line as a comment.
func stateCommentStart(p *parsekit.P) parsekit.StateFn {
func stateCommentStart(p *parsekit.P) {
p.SkipConsecutive(hash)
return stateCommentContent
p.RouteTo(stateCommentContent)
}
// All characters up to the end of the line are included in the comment.
func stateCommentContent(p *parsekit.P) parsekit.StateFn {
func stateCommentContent(p *parsekit.P) {
switch {
case p.AtEndOfLine():
p.EmitLiteralTrim(ItemComment)
return p.ToParentState()
p.RouteReturn()
default:
p.AcceptAny()
return stateCommentContent
p.RouteRepeat()
}
}

View File

@ -2,11 +2,10 @@ package parser
import "github.com/mmakaay/toml/parsekit"
func stateEndOfFile(p *parsekit.P) parsekit.StateFn {
func stateEndOfFile(p *parsekit.P) {
if p.AtEndOfFile() {
p.Emit(parsekit.ItemEOF, "EOF") // todo Automate within parser?
} else {
p.UnexpectedInput("end of file")
}
return nil
}

View File

@ -3,61 +3,64 @@ package parser
import "github.com/mmakaay/toml/parsekit"
// The primary building block of a TOML document is the key/value pair.
func stateKeyValuePair(p *parsekit.P) parsekit.StateFn {
func stateKeyValuePair(p *parsekit.P) {
switch {
case p.SkipConsecutive(whitespace + carriageReturn + newline):
return stateKeyValuePair
case p.Upcoming(hash):
return p.ToChildState(stateCommentStart)
case p.Upcoming(startOfKey):
return stateKey
case p.After(whitespace + carriageReturn + newline).Ignore():
p.RouteRepeat()
case p.After(hash).Backup():
p.RouteTo(stateCommentStart).ThenReturnHere()
case p.After(startOfKey).Backup():
p.RouteTo(stateKey)
default:
return stateEndOfFile
p.RouteTo(stateEndOfFile)
}
}
// A key may be either bare, quoted or dotted.
func stateKey(p *parsekit.P) parsekit.StateFn {
if p.AcceptMatching(bareKeyChars) {
return statebareKeyChars
func stateKey(p *parsekit.P) {
if p.After(bareKeyChars).Backup() {
p.RouteTo(statebareKey)
} else {
p.UnexpectedInput("a valid key name")
}
return p.UnexpectedInput("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(p *parsekit.P) parsekit.StateFn {
p.AcceptConsecutive(bareKeyChars)
func statebareKey(p *parsekit.P) {
p.AcceptConsecutive(bareKeyChars) // TODO make a plan for adding this to After()
p.EmitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
p.RouteTo(stateEndOfKeyOrKeyDot)
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func stateEndOfKeyOrKeyDot(p *parsekit.P) parsekit.StateFn {
func stateEndOfKeyOrKeyDot(p *parsekit.P) {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
p.SkipConsecutive(whitespace)
if p.SkipMatching(dot) {
p.Emit(ItemKeyDot, "")
if p.After(dot).Store() {
p.SkipConsecutive(whitespace)
return stateKey
p.EmitLiteral(ItemKeyDot)
p.RouteTo(stateKey)
} else {
p.RouteTo(stateKeyAssignment)
}
return stateKeyAssignment
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(p *parsekit.P) parsekit.StateFn {
func stateKeyAssignment(p *parsekit.P) {
p.SkipConsecutive(whitespace)
if p.SkipMatching(equal) {
p.Emit(ItemAssignment, "")
if p.After(equal).Store() {
p.EmitLiteral(ItemAssignment)
p.SkipConsecutive(whitespace)
return stateValue
p.RouteTo(stateValue)
} else {
p.UnexpectedInput("a value assignment")
}
return p.UnexpectedInput("a value assignment")
}

View File

@ -6,56 +6,23 @@ import "github.com/mmakaay/toml/parsekit"
// and multi-line literal. All strings must contain only valid UTF-8 characters.
// * Multi-line basic strings are surrounded by three quotation marks on each side.
// * Basic strings are surrounded by quotation marks.
func stateStringValue(p *parsekit.P) parsekit.StateFn {
func stateStringValue(p *parsekit.P) {
switch {
case p.SkipMatching(doubleQuote3...):
return stateMultiLineBasicString
case p.SkipMatching(doubleQuote3):
p.RouteTo(stateMultiLineBasicString)
case p.SkipMatching(doubleQuote):
return p.QueueStates(stateParseString, stateBasicStringSpecific)
}
return p.UnexpectedInput("a string value")
}
// Specific handling of input for basic strings.
// * A double quote ends the string
// * No additional \escape sequences are allowed. What the spec says about this:
// "All other escape sequences [..] are reserved and, if used, TOML should
// produce an error."
func stateBasicStringSpecific(p *parsekit.P) parsekit.StateFn {
switch {
case p.SkipMatching(doubleQuote):
if err := p.EmitInterpreted(ItemString); err != nil {
return p.EmitError("Invalid data in string: %s", err)
}
return stateKeyValuePair
case p.Upcoming(backslash):
return p.EmitError("Invalid escape sequence")
p.RouteTo(parseString).ThenTo(basicStringSpecifics)
default:
return p.QueueStates(stateParseString, stateBasicStringSpecific)
p.UnexpectedInput("a string value")
}
}
func stateMultiLineBasicString(p *parsekit.P) parsekit.StateFn {
func stateMultiLineBasicString(p *parsekit.P) {
p.EmitError("Not yet implemented")
return nil
}
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
const invalidBasicStringCharacters string = "\"\\" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseString(p *parsekit.P) parsekit.StateFn {
switch {
case p.AtEndOfFile():
return p.UnexpectedEndOfFile("basic string token")
case p.AcceptMatching(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
//
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
@ -63,22 +30,46 @@ func stateParseString(p *parsekit.P) parsekit.StateFn {
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
case p.AcceptMatching(shortUtf8Match...):
// \uXXXX - unicode (U+XXXX)
case p.AcceptMatching(longUtf8Match...):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
case p.Upcoming(backslash) || p.Upcoming(doubleQuote):
// Returning to the parent state to have special cases handled,
// because there are differences between single and multi line strings.
return p.ToParentState()
case p.Upcoming(invalidBasicStringCharacters):
//
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
r, _, _ := p.Match(invalidBasicStringCharacters)
func parseString(p *parsekit.P) {
switch {
case p.AtEndOfFile():
p.UnexpectedEndOfFile("basic string token")
case p.After(backslash, validEscapeChars).Store() ||
p.After(shortUtf8Match).Store() ||
p.After(longUtf8Match).Store():
p.RouteRepeat()
case p.After(mustBeEscaped).Backup():
r, _, _ := p.Match(mustBeEscaped)
p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
return nil
case p.After(backslash).Backup() || p.After(doubleQuote).Backup():
p.RouteReturn()
default:
p.AcceptAny()
p.RouteRepeat()
}
}
// Specific handling of input for basic strings.
// * A double quote ends the string
// * No additional \escape sequences are allowed. What the spec says about this:
// "All other escape sequences [..] are reserved and, if used, TOML should
// produce an error."
func basicStringSpecifics(p *parsekit.P) {
switch {
case p.After(doubleQuote).Ignore():
if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase?
p.EmitError("Invalid data in string: %s", err)
} else {
p.RouteTo(stateKeyValuePair)
}
case p.After(backslash).Backup():
p.EmitError("Invalid escape sequence")
default:
p.RouteTo(parseString).ThenTo(basicStringSpecifics)
}
return stateParseString
}

View File

@ -4,10 +4,11 @@ import "github.com/mmakaay/toml/parsekit"
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(p *parsekit.P) parsekit.StateFn {
func stateValue(p *parsekit.P) {
p.SkipConsecutive(whitespace)
if p.Upcoming(quoteChars) {
return stateStringValue
p.RouteTo(stateStringValue)
} else {
p.UnexpectedInput("a value")
}
return p.UnexpectedInput("a value")
}