Phew, that was quite the update. I've now got a working implementation of a parser/combinator-like matching API, which saves us from having to specify everything in state functions. That is way too low-level for a lot of things. I'd rather have parser/combinator-style definitions for chunks of the input and keep the state functions for higher-level document structure parsing.

This commit is contained in:
Maurice Makaay 2019-05-19 23:35:03 +00:00
parent 55e23874f7
commit e3e408dfdb
16 changed files with 721 additions and 234 deletions

View File

@ -3,6 +3,7 @@ package parsekit
import (
"fmt"
"strings"
"unicode/utf8"
)
// Emit passes a Parser item to the client, including the provided string.
@ -51,8 +52,16 @@ func (p *P) EmitError(format string, args ...interface{}) {
func (p *P) UnexpectedInput(expected string) {
// next() takes care of error messages in cases where ok == false.
// Therefore, we only provide an error message for the ok case here.
if r, ok := p.next(); ok {
p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
r, _, ok := p.peek(0)
switch {
case ok:
p.EmitError("unexpected character %q (expected %s)", r, expected)
case r == EOF:
p.EmitError("unexpected end of file (expected %s)", expected)
case r == utf8.RuneError:
p.EmitError("invalid UTF8 character in input (expected %s)", expected)
default:
panic("Unhandled output from peek()")
}
}

View File

@ -4,32 +4,13 @@ import (
"unicode/utf8"
)
// next returns the next rune from the input and a boolean indicating if
// reading the input was successful.
// When the end of input is reached, or an invalid UTF8 character is
// read, then false is returned. Both are considered error cases,
// and for that reason these automatically emit an error to the client.
func (p *P) next() (rune, bool) {
r, w, ok := p.peek(0)
if ok {
p.advanceCursor(r, w)
return r, true
}
if r == utf8.RuneError && w == 0 {
p.EmitError("unexpected end of file")
} else {
p.EmitError("invalid UTF8 character")
}
return r, false
}
// peek returns but does not advance the cursor to the next rune(s) in the input.
// Returns the rune, its width in bytes and a boolean.
// The boolean will be false in case no upcoming rune can be peeked
// (end of data or invalid UTF8 character).
func (p *P) peek(offsetInBytes int) (rune, int, bool) {
peeked, width := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:])
return peeked, width, peeked != utf8.RuneError
r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:])
return handleRuneError(r, w)
}
// peekMulti takes a peek at multiple upcoming runes in the input.
@ -43,13 +24,12 @@ func (p *P) peekMulti(amount int) ([]rune, []int, bool) {
offset := 0
for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:])
switch {
case r == utf8.RuneError:
return runes, widths, false
default:
offset += w
r, w, ok := handleRuneError(r, w)
runes = append(runes, r)
widths = append(widths, w)
offset += w
if !ok {
return runes, widths, false
}
}
return runes, widths, true
@ -86,3 +66,21 @@ func (p *P) advanceCursor(r rune, w int) {
}
p.newline = r == '\n'
}
// handleRuneError is used to normalize the rune value in case of errors.
// When decoding fails, the utf8 package puts utf8.RuneError in the rune.
// This can indicate one of two situations, which differ only in width:
// * w == 0: end of file is reached
// * w == 1: invalid UTF8 character on input
// This function lets these two cases return respectively the
// package's own EOF or INVALID runes, to make it easy for client
// code to distinguish between them. The returned boolean is false
// for both.
//
// A utf8.RuneError with any other width is not an error at all: it is
// the replacement character U+FFFD, correctly encoded in the input.
// That rune is passed through untouched, like every other valid rune,
// accompanied by true.
func handleRuneError(r rune, w int) (rune, int, bool) {
	if r != utf8.RuneError {
		return r, w, true
	}
	switch w {
	case 0:
		return EOF, 0, false
	case 1:
		return INVALID, w, false
	default:
		// A literal U+FFFD in the input decodes with its real width.
		return r, w, true
	}
}

218
parsekit/matchers.go Normal file
View File

@ -0,0 +1,218 @@
package parsekit
import "unicode/utf8"
// Not in need of it myself, but nice to have I guess:
// - NotFollowedBy
// - Discard
// - Separated
// MatchDialog is the value that is handed to a Matcher's Match method.
// It reads runes from the parser's input and collects the runes (and
// their widths) that were read through it.
type MatchDialog struct {
// p is the parser that holds the input and the cursor position.
p *P
// runes holds the runes collected so far (appended by NextRune and Join).
runes []rune
// widths holds the width in bytes for every entry in runes.
widths []int
// offset is the number of bytes read so far; it is added to the
// parser's cursor position to find the next rune to read.
offset int
// curRune is the rune that was returned by the last NextRune call.
curRune rune
// curWidth is the width in bytes of curRune.
curWidth int
// forked is set by Fork and is required by Join.
forked bool
}
// Fork creates a child MatchDialog that reads from the same parser and
// starts at the current offset of m. The child starts without any
// collected runes and is flagged as forked, so it can be handed to Join
// later on.
func (m *MatchDialog) Fork() *MatchDialog {
	return &MatchDialog{p: m.p, offset: m.offset, forked: true}
}
// Join merges the results of a forked MatchDialog back into m: the runes
// and widths collected by the fork are appended to those of m, and m
// takes over the fork's offset. The fork's collected runes and widths
// are emptied afterwards, so the fork can be reused for a next attempt.
// Join panics when fork was not created through Fork. It always
// returns true.
func (m *MatchDialog) Join(fork *MatchDialog) bool {
	if !fork.forked {
		panic("Cannot join a non-forked MatchDialog")
	}
	m.runes, fork.runes = append(m.runes, fork.runes...), []rune{}
	m.widths, fork.widths = append(m.widths, fork.widths...), []int{}
	m.offset = fork.offset
	return true
}
// NextRune reads the rune at the dialog's current offset from the
// parser's input, advances the offset by the rune's width and records
// the rune and its width in the dialog.
//
// The decoded rune is normalized through handleRuneError, just like
// peek() does. The utf8 decoder never returns EOF by itself, so without
// this step the end of the input would be indistinguishable from
// invalid UTF8 data.
//
// The returned boolean is false when no valid rune could be read. In
// that case the returned rune is EOF (end of input) or INVALID (invalid
// UTF8 data). After a false return, NextRune must not be called again
// on the same dialog; doing so results in a panic.
func (m *MatchDialog) NextRune() (rune, bool) {
	// Invalid UTF8 data always decodes with a width of 1, so a wider
	// INVALID rune is a correctly encoded U+FFFD and not a failed read.
	if m.curRune == EOF || (m.curRune == INVALID && m.curWidth == 1) {
		panic("Matcher must not call NextRune() after it returned false")
	}
	r, w, ok := handleRuneError(utf8.DecodeRuneInString(m.p.input[m.p.pos+m.offset:]))
	m.offset += w
	m.curRune = r
	m.curWidth = w
	m.runes = append(m.runes, r)
	m.widths = append(m.widths, w)
	return r, ok
}
// Matcher is the interface that can be implemented to provide
// a matching strategy for the match() function.
// A MatchDialog is provided as input. This implements a
// specific set of methods that a Matcher needs to retrieve data
// from the parser and to report back results.
// Match returns true when the input satisfies the Matcher.
type Matcher interface {
Match(*MatchDialog) bool
}
// MatcherConstructors groups the constructor functions for the matchers
// that are defined in this package. Every field holds a function that
// returns one specific Matcher implementation.
type MatcherConstructors struct {
Any func() MatchAny
Rune func(rune rune) MatchRune
RuneRange func(start rune, end rune) MatchRuneRange
Runes func(runes ...rune) MatchAnyOf
AnyOf func(matchers ...Matcher) MatchAnyOf
Repeat func(count int, matcher Matcher) MatchRepeat
Sequence func(matchers ...Matcher) MatchSequence
ZeroOrMore func(matcher Matcher) MatchZeroOrMore
OneOrMore func(matcher Matcher) MatchOneOrMore
Optional func(matcher Matcher) MatchOptional
}
// C holds the constructors for the matchers of this package, so a
// matcher definition can be written compactly, for example:
// C.Sequence(C.Rune('a'), C.OneOrMore(C.RuneRange('0', '9')))
var C = MatcherConstructors{
	Any: func() MatchAny { return MatchAny{} },
	Rune: func(r rune) MatchRune {
		return MatchRune{match: r}
	},
	RuneRange: func(start, end rune) MatchRuneRange {
		return MatchRuneRange{start: start, end: end}
	},
	// Runes is a shorthand for AnyOf with one Rune matcher per rune.
	Runes: func(runes ...rune) MatchAnyOf {
		alternatives := make([]Matcher, 0, len(runes))
		for _, r := range runes {
			alternatives = append(alternatives, MatchRune{match: r})
		}
		return MatchAnyOf{matcher: alternatives}
	},
	AnyOf: func(matchers ...Matcher) MatchAnyOf {
		return MatchAnyOf{matcher: matchers}
	},
	Repeat: func(count int, matcher Matcher) MatchRepeat {
		return MatchRepeat{count: count, matcher: matcher}
	},
	Sequence: func(matchers ...Matcher) MatchSequence {
		return MatchSequence{matchers: matchers}
	},
	ZeroOrMore: func(matcher Matcher) MatchZeroOrMore {
		return MatchZeroOrMore{matcher: matcher}
	},
	OneOrMore: func(matcher Matcher) MatchOneOrMore {
		return MatchOneOrMore{matcher: matcher}
	},
	Optional: func(matcher Matcher) MatchOptional {
		return MatchOptional{matcher: matcher}
	},
}
// MatchAny is a Matcher that accepts any single rune that can be read
// from the input.
type MatchAny struct{}

// Match reads one rune and reports whether reading it was successful.
func (c MatchAny) Match(m *MatchDialog) bool {
	if _, ok := m.NextRune(); !ok {
		return false
	}
	return true
}
// MatchRune is a Matcher that accepts exactly one specific rune.
type MatchRune struct {
	match rune // the rune that must be read from the input
}

// Match reads one rune and reports whether it could be read and is
// equal to the rune that this matcher looks for.
func (c MatchRune) Match(m *MatchDialog) bool {
	r, ok := m.NextRune()
	if !ok {
		return false
	}
	return r == c.match
}
// MatchRuneRange is a Matcher that accepts one rune that lies within a
// range of runes. Both ends of the range are inclusive.
type MatchRuneRange struct {
	start rune // the lowest rune that is accepted
	end   rune // the highest rune that is accepted
}

// Match reads one rune and reports whether it could be read and lies
// between start and end (inclusive).
func (c MatchRuneRange) Match(m *MatchDialog) bool {
	r, ok := m.NextRune()
	if !ok {
		return false
	}
	return c.start <= r && r <= c.end
}
// MatchAnyOf is a Matcher that accepts the input when at least one of
// its matchers accepts it. The matchers are tried in order and the
// first one that matches wins.
type MatchAnyOf struct {
	matcher []Matcher // the alternatives, in the order in which they are tried
}

// Match tries every alternative on a fresh fork of the dialog. The
// fork of the first alternative that matches is joined into m.
// When none of the alternatives match, false is returned.
func (c MatchAnyOf) Match(m *MatchDialog) bool {
	for i := range c.matcher {
		attempt := m.Fork()
		if !c.matcher[i].Match(attempt) {
			continue
		}
		return m.Join(attempt)
	}
	return false
}
// MatchRepeat is a Matcher that accepts the input when its matcher
// matches a fixed number of times in a row.
type MatchRepeat struct {
	count   int     // the number of times that matcher must match
	matcher Matcher // the matcher to apply repeatedly
}

// Match applies the matcher count times to one fork of the dialog.
// As soon as one of the attempts fails, false is returned and m is left
// untouched. When all attempts succeed, the fork is joined into m.
func (c MatchRepeat) Match(m *MatchDialog) bool {
	attempt := m.Fork()
	for remaining := c.count; remaining > 0; remaining-- {
		if ok := c.matcher.Match(attempt); !ok {
			return false
		}
	}
	m.Join(attempt)
	return true
}
// MatchSequence is a Matcher that accepts the input when all of its
// matchers match, one after the other.
type MatchSequence struct {
	matchers []Matcher // the matchers that must match, in order
}

// Match applies the matchers in order to one fork of the dialog.
// As soon as one of them fails, false is returned and m is left
// untouched. When all of them match, the fork is joined into m.
func (c MatchSequence) Match(m *MatchDialog) bool {
	attempt := m.Fork()
	for i := 0; i < len(c.matchers); i++ {
		if ok := c.matchers[i].Match(attempt); !ok {
			return false
		}
	}
	m.Join(attempt)
	return true
}
// MatchOneOrMore is a Matcher that accepts the input when its matcher
// matches at least once. It consumes as many consecutive matches as
// possible.
type MatchOneOrMore struct {
	matcher Matcher // the matcher to apply repeatedly
}

// Match applies the matcher over and over again to a fork of the
// dialog, joining the fork into m after every successful match.
// It returns true when the matcher matched at least once.
//
// The result is based on the matches that were made by this call only.
// The dialog m can already hold runes from earlier matchers (for
// example when used inside a sequence), so the number of runes in m
// says nothing about whether or not this matcher did match.
func (c MatchOneOrMore) Match(m *MatchDialog) bool {
	matched := false
	mc := m.Fork()
	for c.matcher.Match(mc) {
		matched = true
		// A match that consumed no input would match forever.
		progressed := mc.offset > m.offset
		m.Join(mc)
		if !progressed {
			break
		}
	}
	return matched
}
// MatchZeroOrMore is a Matcher that consumes as many consecutive
// matches of its matcher as possible. It always accepts the input,
// also when its matcher does not match at all.
type MatchZeroOrMore struct {
	matcher Matcher // the matcher to apply repeatedly
}

// Match applies the matcher over and over again to a fork of the
// dialog, joining the fork into m after every successful match.
// It always returns true.
func (c MatchZeroOrMore) Match(m *MatchDialog) bool {
	mc := m.Fork()
	for c.matcher.Match(mc) {
		// A match that consumed no input would match forever.
		progressed := mc.offset > m.offset
		m.Join(mc)
		if !progressed {
			break
		}
	}
	return true
}
// MatchOptional is a Matcher that consumes the input that its matcher
// accepts, but that also accepts the input when its matcher does not
// match.
type MatchOptional struct {
	matcher Matcher // the matcher that may or may not match
}

// Match applies the matcher to a fork of the dialog. The fork is only
// joined into m when the matcher matches. It always returns true.
func (c MatchOptional) Match(m *MatchDialog) bool {
	if attempt := m.Fork(); c.matcher.Match(attempt) {
		m.Join(attempt)
	}
	return true
}

260
parsekit/matchers_test.go Normal file
View File

@ -0,0 +1,260 @@
package parsekit_test
import (
"testing"
p "github.com/mmakaay/toml/parsekit"
)
var c = p.C
const TestItem p.ItemType = 1
// newParser creates a parser for the provided input with a single state
// function. That function emits a TestItem literal every time the
// matcher accepts the upcoming input and then repeats itself. When the
// matcher does not accept the input, an unexpected input error is
// reported that names "MATCH" as the expected input.
func newParser(input string, matcher p.Matcher) *p.P {
	return p.New(input, func(parser *p.P) {
		if !parser.On(matcher).Accept() {
			parser.UnexpectedInput("MATCH")
			return
		}
		parser.EmitLiteral(TestItem)
		parser.Repeat()
	})
}
// TestMatchAny checks that the Any matcher accepts a single rune and
// that the rune is emitted as a TestItem.
func TestMatchAny(t *testing.T) {
	p := newParser("o", c.Any())
	r, err, ok := p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Type != TestItem {
		t.Error("Parser item type is not the expected TestItem")
	}
	if r.Value != "o" {
		t.Errorf("Parser item value is %q instead of expected \"o\"", r.Value)
	}
}
// TestMatchAny_AtEndOfFile checks that the Any matcher does not match
// on empty input and that an end of file error is reported.
func TestMatchAny_AtEndOfFile(t *testing.T) {
	p := newParser("", c.Any())
	_, err, ok := p.Next()
	if ok {
		t.Fatalf("Parsing unexpectedly succeeded")
	}
	expected := "unexpected end of file (expected MATCH)"
	if err.Error() != expected {
		t.Fatalf("Unexpected error from parser:\nexpected: %s\nactual: %s\n", expected, err.Error())
	}
}
// TestMatchAny_AtInvalidUtf8Rune checks that the Any matcher does not
// match on invalid UTF8 data and that an invalid UTF8 error is reported.
func TestMatchAny_AtInvalidUtf8Rune(t *testing.T) {
	p := newParser("\xcd", c.Any())
	_, err, ok := p.Next()
	if ok {
		t.Fatalf("Parsing unexpectedly succeeded")
	}
	expected := "invalid UTF8 character in input (expected MATCH)"
	if err.Error() != expected {
		t.Fatalf("Unexpected error from parser:\nexpected: %s\nactual: %s\n", expected, err.Error())
	}
}
// TestMatchRune checks that the Rune matcher accepts exactly one
// occurrence of its rune per match.
func TestMatchRune(t *testing.T) {
	p := newParser("xxx", c.Rune('x'))
	r, err, ok := p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Type != TestItem {
		t.Error("Parser item type is not the expected TestItem")
	}
	if r.Value != "x" {
		t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value)
	}
}
// TestMatchRune_OnMismatch checks that the Rune matcher does not match
// another rune and that the error reports the rune that was found.
func TestMatchRune_OnMismatch(t *testing.T) {
	p := newParser("x ", c.Rune(' '))
	_, err, ok := p.Next()
	if ok {
		t.Fatalf("Parsing unexpectedly succeeded")
	}
	expected := "unexpected character 'x' (expected MATCH)"
	if err.Error() != expected {
		t.Fatalf("Unexpected error from parser:\nexpected: %s\nactual: %s\n", expected, err.Error())
	}
}
// TestMatchRuneRange checks that the RuneRange matcher accepts every
// rune from 'b' up to and including 'y', and that it rejects the runes
// just outside of that range.
func TestMatchRuneRange(t *testing.T) {
	inRange := c.RuneRange('b', 'y')
	input := "mnopqrstuvwxybcdefghijkl"
	parser := newParser(input, inRange)
	for cycle := 0; cycle < len(input); cycle++ {
		item, err, ok := parser.Next()
		if !ok {
			t.Fatalf("Parsing failed: %s", err)
		}
		if want, got := input[cycle], item.Value[0]; want != got {
			t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", cycle+1, want, got)
		}
	}
	// The runes directly before and after the range must not match.
	if _, _, ok := newParser("a", inRange).Next(); ok {
		t.Fatalf("Unexpected parse success for input 'a'")
	}
	if _, _, ok := newParser("z", inRange).Next(); ok {
		t.Fatalf("Unexpected parse success for input 'z'")
	}
}
// TestMatchRunes checks that the Runes matcher accepts each of the
// runes that it was created with, one rune per match, and that it
// rejects other runes.
func TestMatchRunes(t *testing.T) {
	operators := c.Runes('+', '-', '*', '/')
	input := "-+/*+++"
	parser := newParser(input, operators)
	for cycle := 0; cycle < len(input); cycle++ {
		item, err, ok := parser.Next()
		if !ok {
			t.Fatalf("Parsing failed: %s", err)
		}
		if want, got := input[cycle], item.Value[0]; want != got {
			t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", cycle+1, want, got)
		}
	}
	if _, _, ok := newParser("^", operators).Next(); ok {
		t.Fatalf("Unexpected parse success for input '^'")
	}
	if _, _, ok := newParser("x", operators).Next(); ok {
		t.Fatalf("Unexpected parse success for input 'x'")
	}
}
// TestMatchAnyOf checks that the AnyOf matcher accepts the input when
// one of its matchers accepts it: first the 'a' and then the 'b' from
// the input must be emitted as separate items.
func TestMatchAnyOf(t *testing.T) {
	p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b')))
	r, err, ok := p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Type != TestItem {
		t.Error("Parser item type is not the expected TestItem")
	}
	if r.Value != "a" {
		t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value)
	}
	r, err, ok = p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Type != TestItem {
		t.Error("Parser item type is not the expected TestItem")
	}
	if r.Value != "b" {
		// This check is for the second item, so "b" is the expected value.
		t.Errorf("Parser item value is %q instead of expected \"b\"", r.Value)
	}
}
// TestMatchRepeat checks that the Repeat matcher consumes exactly the
// requested number of matches.
func TestMatchRepeat(t *testing.T) {
	parser := newParser("xxxxyyyy", c.Repeat(4, c.Rune('x')))
	item, err, ok := parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if got := item.Value; got != "xxxx" {
		t.Errorf("Parser item value is %q instead of expected \"xxxx\"", got)
	}
}
// TestMatchRepeat_Mismatch checks that the Repeat matcher does not
// match when the input holds fewer matches than requested. The
// expected error names the first rune of the input, so no input is
// consumed by the failed match.
func TestMatchRepeat_Mismatch(t *testing.T) {
	p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x')))
	_, err, ok := p.Next()
	if ok {
		t.Fatalf("Parsing unexpectedly succeeded")
	}
	expected := "unexpected character 'x' (expected MATCH)"
	if err.Error() != expected {
		t.Fatalf("Unexpected error from parser:\nexpected: %s\nactual: %s\n", expected, err.Error())
	}
}
// TestMatchOneOrMore checks that the OneOrMore matcher consumes all
// consecutive matches and stops at the first rune that does not match.
func TestMatchOneOrMore(t *testing.T) {
	parser := newParser("xxxxxxxxyyyy", c.OneOrMore(c.Rune('x')))
	item, err, ok := parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if got := item.Value; got != "xxxxxxxx" {
		t.Errorf("Parser item value is %q instead of expected \"xxxxxxxx\"", got)
	}
}
func TestMatchSequence(t *testing.T) {
p := newParser("10101", c.Sequence(c.Rune('1'), c.Rune('0')))
r, err, ok := p.Next()
if !ok {
t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
}
if r.Value != "10" {
t.Errorf("Parser item value is %q instead of expected \"10\"", r.Value)
}
}
func TestMatchSequence_CombinedWithOneOrMore(t *testing.T) {
p := newParser("101010987", c.OneOrMore(c.Sequence(c.Rune('1'), c.Rune('0'))))
r, err, ok := p.Next()
if !ok {
t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
}
if r.Value != "101010" {
t.Errorf("Parser item value is %q instead of expected \"101010\"", r.Value)
}
}
// TestSequence_WithRepeatedRunes checks a sequence of optional
// whitespace, an equal sign and optional whitespace. Only the first
// equal sign from the input must end up in the match.
func TestSequence_WithRepeatedRunes(t *testing.T) {
	spaces := c.Optional(c.OneOrMore(c.Rune(' ')))
	equalSign := c.Rune('=')
	parser := newParser(" == 10", c.Sequence(spaces, equalSign, spaces))
	item, err, ok := parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if got := item.Value; got != " =" {
		t.Errorf("Parser item value is %q instead of expected \" =\"", got)
	}
}
// TestMatchOptional checks that the Optional matcher consumes the input
// when its matcher matches, and that it still succeeds (with an empty
// value) when its matcher does not match.
func TestMatchOptional(t *testing.T) {
	// The optional rune is available on the input.
	parser := newParser("xyz", c.Optional(c.Rune('x')))
	item, err, ok := parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if got := item.Value; got != "x" {
		t.Errorf("Parser item value is %q instead of expected \"x\"", got)
	}

	// The optional rune is not available on the input.
	parser = newParser("xyz", c.Optional(c.Rune('y')))
	item, err, ok = parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if got := item.Value; got != "" {
		t.Errorf("Parser item value is %q instead of expected \"\"", got)
	}
}
// TestMixAndMatch checks a combination of matchers: four consecutive
// hexadecimal byte escapes (a backslash, an 'x' and two hex digits).
func TestMixAndMatch(t *testing.T) {
	hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F'))
	backslash := c.Rune('\\')
	x := c.Rune('x')
	hexbyte := c.Sequence(backslash, x, c.Repeat(2, hex))
	p := newParser(`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.Repeat(4, hexbyte))
	r, err, ok := p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	expected := `\x9a\x01\xF0\xfC`
	if r.Value != expected {
		// %q already adds quotes, so no extra quotes around the verb.
		t.Errorf("Parser item value is %q instead of expected %q", r.Value, expected)
	}
}

View File

@ -64,6 +64,13 @@ func (p *P) match(offset int, patterns ...interface{}) ([]rune, []int, bool) {
return runes, widths, false
}
switch pattern := pattern.(type) {
case Matcher:
m := &MatchDialog{p: p}
if pattern.Match(m) {
return m.runes, m.widths, true
} else {
return m.runes, m.widths, false
}
case []interface{}:
rs, ws, matched := p.match(offset, pattern...)
for i, r := range rs {
@ -98,17 +105,6 @@ func (p *P) Upcoming(patterns ...interface{}) bool {
return ok
}
// AcceptAny adds the next rune from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data),
// then false is returned.
func (p *P) AcceptAny() bool {
if r, ok := p.next(); ok {
p.buffer.writeRune(r)
return true
}
return false
}
type action struct {
p *P
runes []rune
@ -129,6 +125,10 @@ func (a *action) Accept() bool {
func (a *action) Skip() bool {
if a.ok {
for i, r := range a.runes {
type C struct {
Rune MatchRune
}
a.p.advanceCursor(r, a.widths[i])
}
}
@ -159,20 +159,10 @@ func (p *P) On(patterns ...interface{}) *action {
// AcceptMatching adds the next runes to the string buffer, but only
// if the upcoming runes satisfy the provided patterns.
// When runes were added then true is returned, false otherwise.
func (p *P) AcceptMatching(patterns ...interface{}) bool {
return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...)
}
// AcceptConsecutive adds consecutive runes from the input to the string
// buffer, as long as they exist in the pattern.
// If any runes were added then true is returned, false otherwise.
func (p *P) AcceptConsecutive(pattern string) bool {
accepted := false
for p.AcceptMatching(pattern) {
accepted = true
}
return accepted
}
// TODO not needed anymore
// func (p *P) AcceptMatching(patterns ...interface{}) bool {
// return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...)
// }
// SkipMatching skips runes, but only when all provided patterns are satisfied.
// Returns true when one or more runes were skipped.
@ -185,13 +175,3 @@ func (p *P) SkipMatching(patterns ...interface{}) bool {
}
return false
}
// SkipConsecutive skips consecutive runes from the provided pattern.
// Returns true when one or more runes were skipped.
func (p *P) SkipConsecutive(pattern string) bool {
didSkip := false
for p.SkipMatching(pattern) {
didSkip = true
}
return didSkip
}

View File

@ -1,5 +1,9 @@
package parsekit
import (
"unicode/utf8"
)
// P holds the internal state of the parser.
type P struct {
state StateFn // the function that handles the current state
@ -50,3 +54,14 @@ type Error struct {
func (err *Error) Error() string {
return err.Message
}
// EOF is a special rune, which is used to indicate an end of file when
// reading a character from the input.
// It is meant to be usable as a rune when writing parsing rules, so that
// 'I now expect the end of the file' can be expressed with something like:
// if (p.On(c.Rune(EOF)).Skip()) { ... }
// NOTE(review): MatchRune.Match only succeeds when NextRune reports
// success, which it does not at the end of the input. Verify that the
// example above actually matches.
const EOF rune = -1
// INVALID is a special rune, which is used to indicate an invalid UTF8
// rune on the input. It has the same value as utf8.RuneError.
const INVALID rune = utf8.RuneError

View File

@ -6,7 +6,7 @@ import (
"testing"
"github.com/mmakaay/toml/parsekit"
lexer "github.com/mmakaay/toml/parser"
"github.com/mmakaay/toml/parser"
)
type statesT struct {
@ -23,7 +23,7 @@ func runStatesTs(t *testing.T, tests []statesT) {
}
func runStatesT(t *testing.T, c statesT) {
l, err := lexer.NewParser(c.in).ToArray()
l, err := parser.NewParser(c.in).ToArray()
if err == nil && c.err != "" {
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
}
@ -36,12 +36,12 @@ func runStatesT(t *testing.T, c statesT) {
switch expected := c.out.(type) {
case []string:
if len(expected) != len(l) {
t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
t.Errorf("[%s] Unexpected number of parser items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
}
for i, e := range expected {
v := ParserItemToString(l[i])
if v != e {
t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
t.Errorf("[%s] Unexpected parser item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
}
}
case string:
@ -51,7 +51,7 @@ func runStatesT(t *testing.T, c statesT) {
}
actual := strings.Join(a, "")
if actual != expected {
t.Errorf("[%s] Unexpected lexer output:\nexpected: %s\nactual: %s\n", c.name, expected, actual)
t.Errorf("[%s] Unexpected parser output:\nexpected: %s\nactual: %s\n", c.name, expected, actual)
}
}
}
@ -59,15 +59,15 @@ func runStatesT(t *testing.T, c statesT) {
// ParserItemToString returns a string representation of the parsekit.Item.
func ParserItemToString(i parsekit.Item) string {
switch i.Type {
case lexer.ItemComment:
case parser.ItemComment:
return fmt.Sprintf("#(%s)", i.Value)
case lexer.ItemKey:
case parser.ItemKey:
return fmt.Sprintf("[%s]", i.Value)
case lexer.ItemString:
case parser.ItemString:
return fmt.Sprintf("STR(%s)", i.Value)
case lexer.ItemKeyDot:
case parser.ItemKeyDot:
return "."
case lexer.ItemAssignment:
case parser.ItemAssignment:
return "="
default:
panic(fmt.Sprintf("No string representation available for parsekit.Item id %d", i.Type))

View File

@ -11,40 +11,28 @@ const (
ItemString // A value of type string
)
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
validEscapeChars string = `btnfr"\`
mustBeEscaped string = "" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
)
var (
keySeparatorDot = []interface{}{whitespace, dot, whitespace}
doubleQuote3 = []interface{}{doubleQuote, doubleQuote, doubleQuote}
hex4 = []interface{}{hex, hex, hex, hex}
shortUtf8Match = []interface{}{backslash, 'u', hex4}
longUtf8Match = []interface{}{backslash, 'U', hex4, hex4}
c = parsekit.C
space = c.Rune(' ')
tab = c.Rune('\t')
carriageReturn = c.Rune('\r')
lineFeed = c.Rune('\n')
hash = c.Rune('#')
underscore = c.Rune('_')
dash = c.Rune('-')
equal = c.Rune('=')
dot = c.Rune('.')
singleQuote = c.Rune('\'')
doubleQuote = c.Rune('"')
any = c.Any()
anyQuote = c.AnyOf(singleQuote, doubleQuote)
backslash = c.Rune('\\')
lower = c.RuneRange('a', 'z')
upper = c.RuneRange('A', 'Z')
digit = c.RuneRange('0', '9')
whitespace = c.OneOrMore(c.AnyOf(space, tab))
whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed))
optionalWhitespace = c.Optional(whitespace)
)
// NewParser creates a new parser, using the provided input string

View File

@ -6,6 +6,10 @@ import (
"github.com/mmakaay/toml/parser"
)
func TestEmptyInput(t *testing.T) {
runStatesT(t, statesT{"empty string", "", "", ""})
}
func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
_, err := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
t.Logf("Got error: %s", err.Error())
@ -17,18 +21,13 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
}
}
func TestEmptyInput(t *testing.T) {
runStatesT(t, statesT{"empty string", "", "", ""})
}
func TestInvalidUtf8Data(t *testing.T) {
runStatesTs(t, []statesT{
{"inside comment", "# \xbc", "", "invalid UTF8 character"},
{"bare key 1", "\xbc", "", "invalid UTF8 character"},
{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character"},
{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
{"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected comment contents)"},
{"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"},
{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"},
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"},
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character in input (expected string contents)"},
})
}

View File

@ -6,7 +6,7 @@ import (
// A '#' hash symbol marks the rest of the line as a comment.
func startComment(p *parsekit.P) {
p.SkipConsecutive(hash)
p.On(c.OneOrMore(hash)).Skip()
p.RouteTo(commentContents)
}
@ -16,8 +16,9 @@ func commentContents(p *parsekit.P) {
case p.AtEndOfLine():
p.EmitLiteralTrim(ItemComment)
p.RouteReturn()
default:
p.AcceptAny()
case p.On(any).Accept():
p.Repeat()
default:
p.UnexpectedInput("comment contents")
}
}

View File

@ -1,65 +0,0 @@
package parser
import "github.com/mmakaay/toml/parsekit"
// The primary building block of a TOML document is the key/value pair.
func startKeyValuePair(p *parsekit.P) {
switch {
case p.On(whitespace + carriageReturn + newline).Skip():
p.Repeat()
case p.On(hash).Stay():
p.RouteTo(startComment).ThenReturnHere()
case p.On(startOfKey).RouteTo(startKey):
default:
p.RouteTo(endOfFile)
}
}
// A key may be either bare, quoted or dotted.
func startKey(p *parsekit.P) {
switch {
case p.On(bareKeyChars).RouteTo(startBareKey):
default:
p.UnexpectedInput("a valid key name")
}
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func startBareKey(p *parsekit.P) {
p.AcceptConsecutive(bareKeyChars) // TODO make a plan for adding this to After()
p.EmitLiteral(ItemKey)
p.RouteTo(endOfKeyOrDot)
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func endOfKeyOrDot(p *parsekit.P) {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
p.SkipConsecutive(whitespace)
if p.On(dot).Accept() {
p.SkipConsecutive(whitespace)
p.EmitLiteral(ItemKeyDot)
p.RouteTo(startKey)
} else {
p.RouteTo(startKeyAssignment)
}
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func startKeyAssignment(p *parsekit.P) {
p.SkipConsecutive(whitespace)
if p.On(equal).Accept() {
p.EmitLiteral(ItemAssignment)
p.SkipConsecutive(whitespace)
p.RouteTo(startValue)
} else {
p.UnexpectedInput("a value assignment")
}
}

View File

@ -0,0 +1,88 @@
package parser
import "github.com/mmakaay/toml/parsekit"
// The primary building block of a TOML document is the key/value pair.
// The matchers below describe the parts of such a pair.
var (
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can be
// broken over multiple lines).
keyAssignment = c.Sequence(optionalWhitespace, equal, optionalWhitespace)
// A key may be either bare, quoted or dotted.
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
bareKeyRune = c.AnyOf(lower, upper, digit, underscore, dash)
bareKey = c.OneOrMore(bareKeyRune)
// Quoted keys follow the exact same rules as either basic
// strings or literal strings and allow you to use a much broader
// set of key names. Best practice is to use bare keys except
// when absolutely necessary.
// A bare key must be non-empty, but an empty quoted key is
// allowed (though discouraged).
// A key therefore starts with either a bare key rune or a quote.
startOfKey = c.AnyOf(bareKeyRune, anyQuote)
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together.
// Whitespace around dot-separated parts is ignored, however, best
// practice is to not use any extraneous whitespace.
// NOTE(review): the name looks like a typo for "keySeparatorDot";
// renaming it requires updating its users as well.
keySeparatordDot = c.Sequence(optionalWhitespace, dot, optionalWhitespace)
)
// startKeyValuePair is the state in which a new key/value pair is
// expected. Leading whitespace and newlines are skipped. After that,
// a hash routes to comment parsing (returning to this state afterwards)
// and the start of a key routes to key parsing. In all other cases,
// the end of the file is expected.
func startKeyValuePair(p *parsekit.P) {
	p.On(whitespaceOrNewlines).Skip()
	if p.On(hash).Stay() {
		p.RouteTo(startComment).ThenReturnHere()
		return
	}
	if p.On(startOfKey).RouteTo(startKey) {
		return
	}
	p.RouteTo(endOfFile) // TODO Make end of file a Matcher, so this can be simpler.
}
// startKey is the state in which a key is expected. Only bare keys are
// handled here: when a bare key rune is upcoming, parsing is routed to
// startBareKey. Anything else is reported as unexpected input.
func startKey(p *parsekit.P) {
	if !p.On(bareKeyRune).RouteTo(startBareKey) {
		p.UnexpectedInput("a valid key name")
	}
}
// startBareKey accepts all upcoming bare key runes, emits them as an
// ItemKey and continues with what can follow a key: a dot or an
// assignment.
func startBareKey(p *parsekit.P) {
	// The result is deliberately ignored: the key is emitted regardless.
	_ = p.On(bareKey).Accept()
	p.EmitLiteral(ItemKey)
	p.RouteTo(endOfKeyOrDot)
}
// endOfKeyOrDot handles the input that follows a key. When a dot is
// upcoming (optionally surrounded by whitespace), it is skipped, an
// ItemKeyDot is emitted and the next part of the dotted key is parsed.
// Otherwise, parsing continues with the key assignment.
func endOfKeyOrDot(p *parsekit.P) {
	if !p.On(keySeparatordDot).Skip() {
		p.RouteTo(startKeyAssignment)
		return
	}
	p.Emit(ItemKeyDot, ".")
	p.RouteTo(startKey)
}
// startKeyAssignment expects an equal sign, optionally surrounded by
// whitespace. When found, it is skipped, an ItemAssignment is emitted
// and parsing continues with the value. Anything else is reported as
// unexpected input.
func startKeyAssignment(p *parsekit.P) {
	if !p.On(keyAssignment).Skip() {
		p.UnexpectedInput("a value assignment")
		return
	}
	p.Emit(ItemAssignment, "=")
	p.RouteTo(startValue)
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
// This state only recognizes values that start with a quote, which are
// routed to string parsing. Anything else is reported as unexpected
// input.
func startValue(p *parsekit.P) {
	if !p.On(anyQuote).RouteTo(startString) {
		p.UnexpectedInput("a value")
	}
}

View File

@ -5,9 +5,9 @@ import (
)
func TestKeyWithoutAssignment(t *testing.T) {
err := "unexpected end of file"
err := "unexpected end of file (expected a value assignment)"
runStatesTs(t, []statesT{
{"bare with whitespace", " a ", "[a]", err},
{"bare with whitespace", " a ", "[a]", "unexpected character ' ' (expected a value assignment)"},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", "[abcdefghijklmnopqrstuvwxyz]", err},
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err},
{"bare numbers", "0123456789", "[0123456789]", err},
@ -18,15 +18,14 @@ func TestKeyWithoutAssignment(t *testing.T) {
}
func TestDottedKey(t *testing.T) {
err := "unexpected end of file"
runStatesTs(t, []statesT{
{"bare dotted", "a._.c", "[a].[_].[c]", err},
{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
{"bare dotted", "a._.c", "[a].[_].[c]", "unexpected end of file (expected a value assignment)"},
{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", `unexpected character '\t' (expected a value assignment)`},
})
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "unexpected end of file"
err := "unexpected end of file (expected a value)"
runStatesTs(t, []statesT{
{"bare", "a=", "[a]=", err},
{"double equal sign", "a==", "[a]=", "unexpected character '=' (expected a value)"},

View File

@ -2,18 +2,16 @@ package parser
import "github.com/mmakaay/toml/parsekit"
var (
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
// * Multi-line basic strings are surrounded by three quotation marks on each side.
// * Basic strings are surrounded by quotation marks.
func startString(p *parsekit.P) {
switch {
case p.On(doubleQuote3).RouteTo(startMultiLineBasicString):
case p.On(doubleQuote).RouteTo(startBasicString):
default:
p.UnexpectedInput("a string value")
}
}
doubleQuote3 = c.Repeat(3, doubleQuote)
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
charThatMustBeEscaped = c.AnyOf(c.RuneRange('\u0000', '\u001F'), c.Rune('\u007F'))
// For convenience, some popular characters have a compact escape sequence.
//
@ -26,25 +24,38 @@ func startString(p *parsekit.P) {
// \\ - backslash (U+005C)
// \uXXXX - unicode (U+XXXX)
// \UXXXXXXXX - unicode (U+XXXXXXXX)
//
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
validEscapeChar = c.AnyOf(c.Runes('b', 't', 'n', 'f', 'r'), doubleQuote, backslash)
shortEscape = c.Sequence(backslash, validEscapeChar)
hex = c.AnyOf(digit, c.RuneRange('a', 'f'), c.RuneRange('A', 'F'))
shortUtf8Escape = c.Sequence(backslash, c.Rune('u'), c.Repeat(4, hex))
longUtf8Escape = c.Sequence(backslash, c.Rune('U'), c.Repeat(8, hex))
validEscape = c.AnyOf(shortEscape, shortUtf8Escape, longUtf8Escape)
)
// startString decides which kind of string is upcoming. Three double
// quotes in a row route to multi-line basic string parsing and a single
// double quote routes to basic string parsing. The three-quote check
// comes first, since a single quote would match that input as well.
// Anything else is reported as unexpected input.
func startString(p *parsekit.P) {
	if p.On(doubleQuote3).RouteTo(startMultiLineBasicString) {
		return
	}
	if p.On(doubleQuote).RouteTo(startBasicString) {
		return
	}
	p.UnexpectedInput("a string value")
}
func parseBasicString(p *parsekit.P) {
switch {
case p.AtEndOfFile():
case p.On(parsekit.EOF).Stay():
p.UnexpectedEndOfFile("basic string token")
case p.On(backslash, validEscapeChars).Accept() ||
p.On(shortUtf8Match).Accept() ||
p.On(longUtf8Match).Accept():
case p.On(validEscape).Accept():
p.Repeat()
case p.On(mustBeEscaped).Stay():
r, _, _ := p.Match(mustBeEscaped)
case p.On(charThatMustBeEscaped).Stay():
r, _, _ := p.Match(charThatMustBeEscaped)
p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
case p.On(backslash).Stay() || p.On(doubleQuote).Stay():
p.RouteReturn()
default:
p.AcceptAny()
case p.On(any).Accept():
p.Repeat()
default:
p.UnexpectedInput("string contents")
}
}
@ -69,7 +80,7 @@ func basicStringSpecifics(p *parsekit.P) {
case p.On(backslash).Stay():
p.EmitError("Invalid escape sequence")
default:
p.RouteTo(startBasicString)
panic("String parsing should not have ended up here")
}
}

View File

@ -33,8 +33,8 @@ func TestEmptyBasicString(t *testing.T) {
{"with comment", `a="" #cool`, "[a]=STR()#(cool)", ""},
{"with whitespaces", ` a = "" `, "[a]=STR()", ""},
{"dotted", ` a.b = "" `, "[a].[b]=STR()", ""},
{"multiple same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""},
{"multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""},
{"multiple on same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""},
{"multiple on multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""},
})
}

View File

@ -1,14 +0,0 @@
package parser
import "github.com/mmakaay/toml/parsekit"
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func startValue(p *parsekit.P) {
p.SkipConsecutive(whitespace)
if p.Upcoming(quoteChars) {
p.RouteTo(startString)
} else {
p.UnexpectedInput("a value")
}
}