Backup work on code cleanup now the parser/combinator code is stable.

This commit is contained in:
Maurice Makaay 2019-05-20 12:24:36 +00:00
parent 84ae34fb5f
commit 3677ab18cb
14 changed files with 354 additions and 337 deletions

View File

@ -3,7 +3,6 @@ package parsekit
import ( import (
"fmt" "fmt"
"strings" "strings"
"unicode/utf8"
) )
// Emit passes a Parser item to the client, including the provided string. // Emit passes a Parser item to the client, including the provided string.
@ -48,27 +47,23 @@ func (p *P) EmitError(format string, args ...interface{}) {
// UnexpectedInput is used by a parser implementation to emit an // UnexpectedInput is used by a parser implementation to emit an
// error item that tells the client that an unexpected rune was // error item that tells the client that an unexpected rune was
// encountered in the input. // encountered in the input.
// The parameter 'expected' is used to provide some context to the error. func (p *P) UnexpectedInput() {
func (p *P) UnexpectedInput(expected string) {
// next() takes care of error messages in cases where ok == false.
// Therefore, we only provide an error message for the ok case here.
r, _, ok := p.peek(0) r, _, ok := p.peek(0)
switch { switch {
case ok: case ok:
p.EmitError("unexpected character %q (expected %s)", r, expected) p.EmitError("unexpected character %q%s", r, p.fmtExpects())
case r == EOF: case r == EOF:
p.EmitError("unexpected end of file (expected %s)", expected) p.EmitError("unexpected end of file%s", p.fmtExpects())
case r == utf8.RuneError: case r == INVALID:
p.EmitError("invalid UTF8 character in input (expected %s)", expected) p.EmitError("invalid UTF8 character in input%s", p.fmtExpects())
default: default:
panic("Unhandled output from peek()") panic("Unhandled output from peek()")
} }
} }
// UnexpectedEndOfFile is used by a parser implementation to emit an func (p *P) fmtExpects() string {
// error item that tells the client that more data was expected from if p.expecting == "" {
// the input. return ""
// The parameter 'expected' is used to provide some context to the error. }
func (p *P) UnexpectedEndOfFile(expected string) { return fmt.Sprintf(" (expected %s)", p.expecting)
p.EmitError("Unexpected end of file (expected %s)", expected)
} }

View File

@ -4,6 +4,24 @@ import (
"unicode/utf8" "unicode/utf8"
) )
// P holds the internal state of the parser.
type P struct {
state StateFn // the function that handles the current state
nextState StateFn // the function that will handle the next state
stack []StateFn // state function stack, for nested parsing
input string // the scanned input
len int // the total length of the input in bytes
pos int // current byte scanning position in the input
newline bool // keep track of when we have scanned a newline
cursorRow int // current row number in the input
cursorColumn int // current column position in the input
expecting string // a description of what the current state expects to find
buffer stringBuffer // an efficient buffer, used to build string values
items chan Item // channel of resulting Parser items
item Item // the current item as reached by Next() and retrieved by Get()
err *Error // an error when lexing failed, retrieved by Error()
}
// peek returns but does not advance the cursor to the next rune(s) in the input. // peek returns but does not advance the cursor to the next rune(s) in the input.
// Returns the rune, its width in bytes and a boolean. // Returns the rune, its width in bytes and a boolean.
// The boolean will be false in case no upcoming rune can be peeked // The boolean will be false in case no upcoming rune can be peeked
@ -13,60 +31,6 @@ func (p *P) peek(offsetInBytes int) (rune, int, bool) {
return handleRuneError(r, w) return handleRuneError(r, w)
} }
// peekMulti takes a peek at multiple upcoming runes in the input.
// Returns a slice of runes, a slice containing their respective
// widths in bytes and a boolean.
// The boolean will be false in case less runes can be peeked than
// the requested amount (end of data or invalid UTF8 character).
func (p *P) peekMulti(amount int) ([]rune, []int, bool) {
var runes []rune
var widths []int
offset := 0
for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:])
r, w, ok := handleRuneError(r, w)
runes = append(runes, r)
widths = append(widths, w)
offset += w
if !ok {
return runes, widths, false
}
}
return runes, widths, true
}
// progress moves the cursor forward in the input, returning one rune
// for every specified pattern. The cursor will only be moved forward when
// all requested patterns can be satisfied.
// Returns true when all patterns were satisfied and the cursor was
// moved forward, false otherwise.
// A callback function can be provided to specify what to do with
// the runes that are encountered in the input.
func (p *P) progress(callback func(rune), patterns ...interface{}) bool {
if runes, widths, ok := p.Match(patterns...); ok {
for i, r := range runes {
callback(r)
p.advanceCursor(r, widths[i])
}
return true
}
return false
}
// advanceCursor advances the rune cursor one position in the
// input data. While doing so, it keeps tracks of newlines,
// so we can report on row + column positions on error.
func (p *P) advanceCursor(r rune, w int) {
p.pos += w
if p.newline {
p.cursorColumn = 0
p.cursorRow++
} else {
p.cursorColumn++
}
p.newline = r == '\n'
}
// handleRuneError is used to normale rune value in case of errors. // handleRuneError is used to normale rune value in case of errors.
// When an error occurs, then utf8.RuneError will be in the rune. // When an error occurs, then utf8.RuneError will be in the rune.
// This can however indicate one of two situations: // This can however indicate one of two situations:
@ -84,3 +48,48 @@ func handleRuneError(r rune, w int) (rune, int, bool) {
} }
return r, w, true return r, w, true
} }
// EOF is a special rune, which is used to indicate an end of file when
// reading a character from the input.
// It can be treated as a rune when writing parsing rules, so a valid way to
// say 'I now expect the end of the file' is using something like:
// if (p.On(c.Rune(EOF)).Skip()) { ... }
const EOF rune = -1
// INVALID is a special rune, which is used to indicate an invalid UTF8
// rune on the input.
const INVALID rune = utf8.RuneError
// StateFn defines the type of function that can be used to
// handle a parser state.
type StateFn func(*P)
// ItemType represents the type of a parser Item.
type ItemType int
// ItemEOF is a built-in parser item type that is used for flagging that the
// end of the input was reached.
const ItemEOF ItemType = -1
// ItemError is a built-in parser item type that is used for flagging that
// an error has occurred during parsing.
const ItemError ItemType = -2
// Item represents an item returned from the parser.
type Item struct {
Type ItemType
Value string
}
// Error is used as the error type when parsing errors occur.
// The error includes some extra meta information to allow for useful
// error messages to the user.
type Error struct {
Message string
Row int
Column int
}
func (err *Error) Error() string {
return err.Message
}

View File

@ -1,12 +1,17 @@
package parsekit package parsekit
import "unicode/utf8" import (
"unicode"
"unicode/utf8"
)
// Not in need of it myself, but nice to have I guess: // Not in need of it myself, but nice to have I guess:
// - NotFollowedBy // - NotFollowedBy
// - Discard
// - Separated // - Separated
// MatchDialog is used by Matcher implementations as a means
// to retrieve data to match against and to report back
// successful matches.
type MatchDialog struct { type MatchDialog struct {
p *P p *P
runes []rune runes []rune
@ -14,44 +19,70 @@ type MatchDialog struct {
offset int offset int
curRune rune curRune rune
curWidth int curWidth int
forked bool parent *MatchDialog
} }
// Fork splits off a child MatchDialog, containing the same
// offset as the parent MatchDialog, but with all other data
// in a new state.
// By forking, a Matcher implementation can freely work with
// a MatchDialog, without affecting the parent MatchDialog.
// When the Matcher decides that a match was found, it can
// use the Merge() method on the child to merge the child's
// matching data into the parent MatchDialog.
func (m *MatchDialog) Fork() *MatchDialog { func (m *MatchDialog) Fork() *MatchDialog {
fork := &MatchDialog{ child := &MatchDialog{
p: m.p, p: m.p,
offset: m.offset, offset: m.offset,
forked: true, parent: m,
} }
return fork return child
} }
func (m *MatchDialog) Join(fork *MatchDialog) bool { // Merge merges the data for a a forked child MatchDialog back
if !fork.forked { // into its parent:
panic("Cannot join a non-forked MatchDialog") // * the runes that are accumulated in the child are added
// to the parent's runes
// * the parent's offset is set to the child's offset
// After a Merge, the child MatchDialog is reset so it can
// immediately be reused for performing another match.
func (m *MatchDialog) Merge() bool {
if m.parent == nil {
panic("Cannot call Merge a a non-forked MatchDialog")
} }
m.runes = append(m.runes, fork.runes...) m.parent.runes = append(m.parent.runes, m.runes...)
m.widths = append(m.widths, fork.widths...) m.parent.widths = append(m.parent.widths, m.widths...)
m.offset = fork.offset m.parent.offset = m.offset
fork.runes = []rune{} m.Clear()
fork.widths = []int{}
return true return true
} }
// NextRune can be called by a Matcher on a MatchDialog in order
// to receive the next rune from the input.
// The rune is automatically added to the MatchDialog's runes.
// Returns the rune and a boolean. The boolean will be false in
// case an invalid UTF8 rune of the end of the file was encountered.
func (m *MatchDialog) NextRune() (rune, bool) { func (m *MatchDialog) NextRune() (rune, bool) {
if m.curRune == utf8.RuneError { if m.curRune == utf8.RuneError {
panic("Matcher must not call NextRune() after it returned false") panic("Matcher must not call NextRune() after it returned false")
} }
r, w := utf8.DecodeRuneInString(m.p.input[m.p.pos+m.offset:]) r, w, ok := m.p.peek(m.offset)
m.offset += w m.offset += w
m.curRune = r m.curRune = r
m.curWidth = w m.curWidth = w
m.runes = append(m.runes, r) m.runes = append(m.runes, r)
m.widths = append(m.widths, w) m.widths = append(m.widths, w)
return r, r != EOF && r != INVALID return r, ok
} }
// Matcher is the interface that can be implemented to provide // Clear empties out the accumulated runes that are stored
// in the MatchDialog.
func (m *MatchDialog) Clear() {
m.runes = []rune{}
m.widths = []int{}
}
// Matcher is the interface that must be implemented to provide
// a matching stategy for the match() function. // a matching stategy for the match() function.
// A MatchDialog is provided as input. This implements a // A MatchDialog is provided as input. This implements a
// specific set of methods that a Matcher needs to retrieve data // specific set of methods that a Matcher needs to retrieve data
@ -60,20 +91,28 @@ type Matcher interface {
Match(*MatchDialog) bool Match(*MatchDialog) bool
} }
type MatcherConstructors struct { type matcherConstructors struct {
Any func() MatchAny Any func() MatchAny
Rune func(rune rune) MatchRune Rune func(rune) MatchRune
RuneRange func(start rune, end rune) MatchRuneRange RuneRange func(rune, rune) MatchRuneRange
Runes func(runes ...rune) MatchAnyOf Runes func(...rune) MatchAnyOf
AnyOf func(matchers ...Matcher) MatchAnyOf String func(string) MatchSequence
Repeat func(count int, matcher Matcher) MatchRepeat StringNoCase func(string) MatchSequence
Sequence func(matchers ...Matcher) MatchSequence AnyOf func(...Matcher) MatchAnyOf
ZeroOrMore func(matcher Matcher) MatchZeroOrMore Repeat func(int, Matcher) MatchRepeat
OneOrMore func(matcher Matcher) MatchOneOrMore Sequence func(...Matcher) MatchSequence
Optional func(matcher Matcher) MatchOptional ZeroOrMore func(Matcher) MatchZeroOrMore
OneOrMore func(Matcher) MatchOneOrMore
Optional func(Matcher) MatchOptional
Drop func(Matcher) MatchDrop
} }
var C = MatcherConstructors{ // C provides access to a wide range of parser/combinator
// constructors that can be used to build matching expressions.
// When using C in your own parser, then it is advised to create
// an alias in your own package for easy reference:
// var c = parsekit.C
var C = matcherConstructors{
Any: func() MatchAny { Any: func() MatchAny {
return MatchAny{} return MatchAny{}
}, },
@ -90,6 +129,22 @@ var C = MatcherConstructors{
} }
return MatchAnyOf{m} return MatchAnyOf{m}
}, },
String: func(s string) MatchSequence {
m := make([]Matcher, len(s))
for i, r := range s {
m[i] = MatchRune{r}
}
return MatchSequence{m}
},
StringNoCase: func(s string) MatchSequence {
m := make([]Matcher, len(s))
for i, r := range s {
u := MatchRune{unicode.ToUpper(r)}
l := MatchRune{unicode.ToLower(r)}
m[i] = MatchAnyOf{[]Matcher{u, l}}
}
return MatchSequence{m}
},
AnyOf: func(matchers ...Matcher) MatchAnyOf { AnyOf: func(matchers ...Matcher) MatchAnyOf {
return MatchAnyOf{matchers} return MatchAnyOf{matchers}
}, },
@ -108,6 +163,9 @@ var C = MatcherConstructors{
Optional: func(matcher Matcher) MatchOptional { Optional: func(matcher Matcher) MatchOptional {
return MatchOptional{matcher} return MatchOptional{matcher}
}, },
Drop: func(matcher Matcher) MatchDrop {
return MatchDrop{matcher}
},
} }
type MatchAny struct{} type MatchAny struct{}
@ -142,9 +200,9 @@ type MatchAnyOf struct {
func (c MatchAnyOf) Match(m *MatchDialog) bool { func (c MatchAnyOf) Match(m *MatchDialog) bool {
for _, matcher := range c.matcher { for _, matcher := range c.matcher {
mc := m.Fork() child := m.Fork()
if matcher.Match(mc) { if matcher.Match(child) {
return m.Join(mc) return child.Merge()
} }
} }
return false return false
@ -156,13 +214,13 @@ type MatchRepeat struct {
} }
func (c MatchRepeat) Match(m *MatchDialog) bool { func (c MatchRepeat) Match(m *MatchDialog) bool {
mc := m.Fork() child := m.Fork()
for i := 0; i < c.count; i++ { for i := 0; i < c.count; i++ {
if !c.matcher.Match(mc) { if !c.matcher.Match(child) {
return false return false
} }
} }
m.Join(mc) child.Merge()
return true return true
} }
@ -171,13 +229,13 @@ type MatchSequence struct {
} }
func (c MatchSequence) Match(m *MatchDialog) bool { func (c MatchSequence) Match(m *MatchDialog) bool {
mPart := m.Fork() child := m.Fork()
for _, matcher := range c.matchers { for _, matcher := range c.matchers {
if !matcher.Match(mPart) { if !matcher.Match(child) {
return false return false
} }
} }
m.Join(mPart) child.Merge()
return true return true
} }
@ -186,9 +244,9 @@ type MatchOneOrMore struct {
} }
func (c MatchOneOrMore) Match(m *MatchDialog) bool { func (c MatchOneOrMore) Match(m *MatchDialog) bool {
mc := m.Fork() child := m.Fork()
for c.matcher.Match(mc) { for c.matcher.Match(child) {
m.Join(mc) child.Merge()
} }
return len(m.runes) > 0 return len(m.runes) > 0
} }
@ -198,9 +256,9 @@ type MatchZeroOrMore struct {
} }
func (c MatchZeroOrMore) Match(m *MatchDialog) bool { func (c MatchZeroOrMore) Match(m *MatchDialog) bool {
mc := m.Fork() child := m.Fork()
for c.matcher.Match(mc) { for c.matcher.Match(child) {
m.Join(mc) child.Merge()
} }
return true return true
} }
@ -210,9 +268,23 @@ type MatchOptional struct {
} }
func (c MatchOptional) Match(m *MatchDialog) bool { func (c MatchOptional) Match(m *MatchDialog) bool {
mc := m.Fork() child := m.Fork()
if c.matcher.Match(mc) { if c.matcher.Match(child) {
m.Join(mc) child.Merge()
} }
return true return true
} }
type MatchDrop struct {
matcher Matcher
}
func (c MatchDrop) Match(m *MatchDialog) bool {
child := m.Fork()
if c.matcher.Match(child) {
child.Clear()
child.Merge()
return true
}
return false
}

View File

@ -12,11 +12,10 @@ const TestItem p.ItemType = 1
func newParser(input string, matcher p.Matcher) *p.P { func newParser(input string, matcher p.Matcher) *p.P {
stateFn := func(p *p.P) { stateFn := func(p *p.P) {
p.Expects("MATCH")
if p.On(matcher).Accept() { if p.On(matcher).Accept() {
p.EmitLiteral(TestItem) p.EmitLiteral(TestItem)
p.Repeat() p.Repeat()
} else {
p.UnexpectedInput("MATCH")
} }
} }
return p.New(input, stateFn) return p.New(input, stateFn)
@ -107,6 +106,35 @@ func TestMatchRuneRange(t *testing.T) {
} }
} }
func TestMatchString(t *testing.T) {
p := newParser("Hello, world!", c.String("Hello"))
r, err, ok := p.Next()
if !ok {
t.Fatalf("Parsing failed: %s", err)
}
if r.Type != TestItem {
t.Error("Parser item type not expected TestTitem")
}
if r.Value != "Hello" {
t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value)
}
}
// TODO
// func TestMatchStringNoCase(t *testing.T) {
// p := newParser("HellÖ, world!", c.StringNoCase("hellö"))
// r, err, ok := p.Next()
// if !ok {
// t.Fatalf("Parsing failed: %s", err)
// }
// if r.Type != TestItem {
// t.Error("Parser item type not expected TestTitem")
// }
// if r.Value != "Hello" {
// t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value)
// }
// }
func TestMatchRunes(t *testing.T) { func TestMatchRunes(t *testing.T) {
m := c.Runes('+', '-', '*', '/') m := c.Runes('+', '-', '*', '/')
s := "-+/*+++" s := "-+/*+++"
@ -243,6 +271,17 @@ func TestMatchOptional(t *testing.T) {
} }
} }
func TestMatchDrop(t *testing.T) {
dashes := c.OneOrMore(c.Rune('-'))
p := newParser("---X---", c.Sequence(c.Drop(dashes), c.Any(), c.Drop(dashes)))
r, err, ok := p.Next()
if !ok {
t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
}
if r.Value != "X" {
t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value)
}
}
func TestMixAndMatch(t *testing.T) { func TestMixAndMatch(t *testing.T) {
hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F'))
backslash := c.Rune('\\') backslash := c.Rune('\\')

View File

@ -1,10 +1,18 @@
package parsekit package parsekit
import ( // Expects is used to let a state function describe what input it is expecting.
"fmt" // This expectation is used in error messages to make them more descriptive.
"strings" //
"unicode/utf8" // Also, when defining an expectation inside a StateFn, you do not need
) // to handle unexpected input yourself. When the end of the function is
// reached without setting the next state, an automatic error will be
// emitted. This error differentiates between issues:
// * there is valid data on input, but it was not accepted by the function
// * there is an invalid UTF8 character on input
// * the end of the file was reached.
func (p *P) Expects(description string) {
p.expecting = description
}
// AtEndOfFile returns true when there is no more data available in the input. // AtEndOfFile returns true when there is no more data available in the input.
func (p *P) AtEndOfFile() bool { func (p *P) AtEndOfFile() bool {
@ -16,8 +24,8 @@ func (p *P) AtEndOfFile() bool {
// by this method. // by this method.
func (p *P) AtEndOfLine() bool { func (p *P) AtEndOfLine() bool {
return p.AtEndOfFile() || return p.AtEndOfFile() ||
p.Upcoming("\r", "\n") || p.On(C.String("\r\n")).Stay() ||
p.Upcoming("\n") p.On(C.Rune('\n')).Stay()
} }
// SkipEndOfLine returns true when the cursor is either at the end of the line // SkipEndOfLine returns true when the cursor is either at the end of the line
@ -25,8 +33,8 @@ func (p *P) AtEndOfLine() bool {
// the cursor is moved forward to beyond the newline. // the cursor is moved forward to beyond the newline.
func (p *P) SkipEndOfLine() bool { func (p *P) SkipEndOfLine() bool {
return p.AtEndOfFile() || return p.AtEndOfFile() ||
p.SkipMatching("\r", "\n") || p.On(C.String("\r\n")).Skip() ||
p.SkipMatching("\n") p.On(C.Rune('\n')).Skip()
} }
// AcceptEndOfLine returns true when the cursor is either at the end of the line // AcceptEndOfLine returns true when the cursor is either at the end of the line
@ -44,65 +52,24 @@ func (p *P) AcceptEndOfLine() bool {
return false return false
} }
func (p *P) Match(patterns ...interface{}) ([]rune, []int, bool) { func (p *P) On(m Matcher) *action {
return p.match(0, patterns...) runes, widths, ok := p.Match(m)
return &action{
p: p,
runes: runes,
widths: widths,
ok: ok,
}
} }
func (p *P) match(offset int, patterns ...interface{}) ([]rune, []int, bool) { func (p *P) Match(matcher Matcher) ([]rune, []int, bool) {
var runes []rune return p.match(0, matcher)
var widths []int
addRune := func(r rune, w int) {
offset += w
runes = append(runes, r)
widths = append(widths, w)
}
for _, pattern := range patterns {
r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:])
if r == utf8.RuneError {
return runes, widths, false
}
switch pattern := pattern.(type) {
case Matcher:
m := &MatchDialog{p: p}
if pattern.Match(m) {
return m.runes, m.widths, true
} else {
return m.runes, m.widths, false
}
case []interface{}:
rs, ws, matched := p.match(offset, pattern...)
for i, r := range rs {
addRune(r, ws[i])
}
if !matched {
return runes, widths, false
}
case string:
if strings.IndexRune(pattern, r) < 0 {
return runes, widths, false
}
addRune(r, w)
case rune:
if pattern != r {
return runes, widths, false
}
addRune(r, w)
default:
panic(fmt.Sprintf("Not rune matching implemented for pattern of type %T", pattern))
}
}
return runes, widths, true
} }
// Upcoming checks if the upcoming runes satisfy all provided patterns. func (p *P) match(offset int, matcher Matcher) ([]rune, []int, bool) {
// Returns true if all provided patterns are satisfied. m := &MatchDialog{p: p}
// This is basically the same as the Match method, but with only ok := matcher.Match(m)
// the boolean return parameter for programmer convenciency. return m.runes, m.widths, ok
func (p *P) Upcoming(patterns ...interface{}) bool {
_, _, ok := p.Match(patterns...)
return ok
} }
type action struct { type action struct {
@ -135,6 +102,24 @@ func (a *action) Skip() bool {
return a.ok return a.ok
} }
func (a *action) Stay() bool {
return a.ok
}
// advanceCursor advances the rune cursor one position in the
// input data. While doing so, it keeps tracks of newlines,
// so we can report on row + column positions on error.
func (p *P) advanceCursor(r rune, w int) {
p.pos += w
if p.newline {
p.cursorColumn = 0
p.cursorRow++
} else {
p.cursorColumn++
}
p.newline = r == '\n'
}
func (a *action) RouteTo(state StateFn) bool { func (a *action) RouteTo(state StateFn) bool {
if a.ok { if a.ok {
a.p.RouteTo(state) a.p.RouteTo(state)
@ -142,36 +127,9 @@ func (a *action) RouteTo(state StateFn) bool {
return a.ok return a.ok
} }
func (a *action) Stay() bool { func (a *action) RouteReturn() bool {
if a.ok {
a.p.RouteReturn()
}
return a.ok return a.ok
} }
func (p *P) On(patterns ...interface{}) *action {
runes, widths, ok := p.Match(patterns...)
return &action{
p: p,
runes: runes,
widths: widths,
ok: ok,
}
}
// AcceptMatching adds the next runes to the string buffer, but only
// if the upcoming runes satisfy the provided patterns.
// When runes were added then true is returned, false otherwise.
// TODO not needed anymore
// func (p *P) AcceptMatching(patterns ...interface{}) bool {
// return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...)
// }
// SkipMatching skips runes, but only when all provided patterns are satisfied.
// Returns true when one or more runes were skipped.
func (p *P) SkipMatching(patterns ...interface{}) bool {
if runes, widths, ok := p.Match(patterns...); ok {
for i, r := range runes {
p.advanceCursor(r, widths[i])
}
return true
}
return false
}

View File

@ -1,5 +1,11 @@
package parsekit package parsekit
import (
"fmt"
"reflect"
"runtime"
)
// New takes an input string and a start state, // New takes an input string and a start state,
// and initializes the parser for it. // and initializes the parser for it.
func New(input string, startState StateFn) *P { func New(input string, startState StateFn) *P {
@ -30,13 +36,25 @@ func (p *P) Next() (Item, *Error, bool) {
return i, nil, true return i, nil, true
} }
default: default:
// When implementing a parser, it is mandatory to provide // When implementing a parser, a state function must provide
// a conscious state routing decision for every cycle. // a routing decision in every state function execution.
// This helps preventing bugs during implementation. // When no route is specified, then it is considered a but
// in the parser implementation.
// An exception is when a function specified its expectation
// using the Expects() method. In that case, an unexpected
// input error is emitted.
if p.nextState == nil { if p.nextState == nil {
panic("No next state was scheduled for the parser") if p.expecting != "" {
p.UnexpectedInput()
continue
} else {
name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name()
panic(fmt.Sprintf("StateFn implementation bug: %s did not set next state or input expectation", name))
}
} }
p.state, p.nextState = p.nextState, nil p.state = p.nextState
p.nextState = nil
p.expecting = ""
p.state(p) p.state(p)
} }
} }

View File

@ -1,67 +0,0 @@
package parsekit
import (
"unicode/utf8"
)
// P holds the internal state of the parser.
type P struct {
state StateFn // the function that handles the current state
nextState StateFn // the function that will handle the next state
stack []StateFn // state function stack, for nested parsing
input string // the scanned input
len int // the total length of the input in bytes
pos int // current byte scanning position in the input
newline bool // keep track of when we have scanned a newline
cursorRow int // current row number in the input
cursorColumn int // current column position in the input
buffer stringBuffer // an efficient buffer, used to build string values
items chan Item // channel of resulting Parser items
item Item // the current item as reached by Next() and retrieved by Get()
err *Error // an error when lexing failed, retrieved by Error()
}
// StateFn defines the type of function that can be used to
// handle a parser state.
type StateFn func(*P)
// ItemType represents the type of a parser Item.
type ItemType int
// ItemEOF is a built-in parser item type that is used for flagging that the
// end of the input was reached.
const ItemEOF ItemType = -1
// ItemError is a built-in parser item type that is used for flagging that
// an error has occurred during parsing.
const ItemError ItemType = -2
// Item represents an item returned from the parser.
type Item struct {
Type ItemType
Value string
}
// Error is used as the error type when parsing errors occur.
// The error includes some extra meta information to allow for useful
// error messages to the user.
type Error struct {
Message string
Row int
Column int
}
func (err *Error) Error() string {
return err.Message
}
// EOF is a special rune, which is used to indicate an end of file when
// reading a character from the input.
// It can be treated as a rune when writing parsing rules, so a valid way to
// say 'I now expect the end of the file' is using something like:
// if (p.On(c.Rune(EOF)).Skip()) { ... }
const EOF rune = -1
// INVALID is a special rune, which is used to indicate an invalid UTF8
// rune on the input.
const INVALID rune = utf8.RuneError

View File

@ -27,12 +27,13 @@ var (
any = c.Any() any = c.Any()
anyQuote = c.AnyOf(singleQuote, doubleQuote) anyQuote = c.AnyOf(singleQuote, doubleQuote)
backslash = c.Rune('\\') backslash = c.Rune('\\')
lower = c.RuneRange('a', 'z') asciiLower = c.RuneRange('a', 'z')
upper = c.RuneRange('A', 'Z') asciiUpper = c.RuneRange('A', 'Z')
digit = c.RuneRange('0', '9') digit = c.RuneRange('0', '9')
whitespace = c.OneOrMore(c.AnyOf(space, tab)) whitespace = c.OneOrMore(c.AnyOf(space, tab))
whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed)) whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed))
optionalWhitespace = c.Optional(whitespace) optionalWhitespace = c.Optional(whitespace)
endOfLine = c.AnyOf(lineFeed, c.Rune(parsekit.EOF))
) )
// NewParser creates a new parser, using the provided input string // NewParser creates a new parser, using the provided input string

View File

@ -12,13 +12,12 @@ func startComment(p *parsekit.P) {
// All characters up to the end of the line are included in the comment. // All characters up to the end of the line are included in the comment.
func commentContents(p *parsekit.P) { func commentContents(p *parsekit.P) {
p.Expects("comment contents")
switch { switch {
case p.AtEndOfLine(): case p.AtEndOfLine() || p.On(endOfLine).Skip(): // TODO drop AtEndOfLine support
p.EmitLiteralTrim(ItemComment) p.EmitLiteralTrim(ItemComment)
p.RouteReturn() p.RouteReturn()
case p.On(any).Accept(): case p.On(any).Accept():
p.Repeat() p.Repeat()
default:
p.UnexpectedInput("comment contents")
} }
} }

View File

@ -7,14 +7,14 @@ import (
func TestComments(t *testing.T) { func TestComments(t *testing.T) {
runStatesTs(t, []statesT{ runStatesTs(t, []statesT{
{"empty comment", "#", "#()", ""}, {"empty comment", "#", "#()", ""},
{"empty comment with spaces", "# \t \r\n", `#()`, ""}, // {"empty comment with spaces", "# \t \r\n", `#()`, ""},
{"basic comment", "#chicken", "#(chicken)", ""}, // {"basic comment", "#chicken", "#(chicken)", ""},
{"basic comment starting after whitespace", "# \tchicken", "#(chicken)", ""}, // {"basic comment starting after whitespace", "# \tchicken", "#(chicken)", ""},
{"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""}, // {"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""},
{"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""}, // {"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""},
{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""}, // {"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""},
{"comment with multiple hashes", `#### Just Jack!`, `#(Just Jack!)`, ""}, // {"comment with multiple hashes", `#### Just Jack!`, `#(Just Jack!)`, ""},
{"comment with hashes inside", `# Follow #me2`, `#(Follow #me2)`, ""}, // {"comment with hashes inside", `# Follow #me2`, `#(Follow #me2)`, ""},
{"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""}, // {"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""},
}) })
} }

View File

@ -3,9 +3,8 @@ package parser
import "github.com/mmakaay/toml/parsekit" import "github.com/mmakaay/toml/parsekit"
func endOfFile(p *parsekit.P) { func endOfFile(p *parsekit.P) {
p.Expects("end of file")
if p.AtEndOfFile() { if p.AtEndOfFile() {
p.Emit(parsekit.ItemEOF, "EOF") // todo Automate within parser? p.Emit(parsekit.ItemEOF, "EOF")
} else {
p.UnexpectedInput("end of file")
} }
} }

View File

@ -15,7 +15,7 @@ var (
// contain ASCII letters, ASCII digits, underscores, and dashes // contain ASCII letters, ASCII digits, underscores, and dashes
// (A-Za-z0-9_-). Note that bare keys are allowed to be composed of only // (A-Za-z0-9_-). Note that bare keys are allowed to be composed of only
// ASCII digits, e.g. 1234, but are always interpreted as strings. // ASCII digits, e.g. 1234, but are always interpreted as strings.
bareKeyRune = c.AnyOf(lower, upper, digit, underscore, dash) bareKeyRune = c.AnyOf(asciiLower, asciiUpper, digit, underscore, dash)
bareKey = c.OneOrMore(bareKeyRune) bareKey = c.OneOrMore(bareKeyRune)
// Quoted keys follow the exact same rules as either basic strings or // Quoted keys follow the exact same rules as either basic strings or
@ -44,17 +44,16 @@ func startKeyValuePair(p *parsekit.P) {
} }
func startKey(p *parsekit.P) { func startKey(p *parsekit.P) {
switch { p.Expects("a key name")
case p.On(bareKeyRune).RouteTo(startBareKey): p.On(bareKeyRune).RouteTo(startBareKey)
default:
p.UnexpectedInput("a valid key name")
}
} }
func startBareKey(p *parsekit.P) { func startBareKey(p *parsekit.P) {
p.On(bareKey).Accept() p.Expects("a bare key name")
p.EmitLiteral(ItemKey) if p.On(bareKey).Accept() {
p.RouteTo(endOfKeyOrDot) p.EmitLiteral(ItemKey)
p.RouteTo(endOfKeyOrDot)
}
} }
func endOfKeyOrDot(p *parsekit.P) { func endOfKeyOrDot(p *parsekit.P) {
@ -62,25 +61,21 @@ func endOfKeyOrDot(p *parsekit.P) {
p.Emit(ItemKeyDot, ".") p.Emit(ItemKeyDot, ".")
p.RouteTo(startKey) p.RouteTo(startKey)
} else { } else {
p.RouteTo(startKeyAssignment) p.RouteTo(startAssignment)
} }
} }
func startKeyAssignment(p *parsekit.P) { func startAssignment(p *parsekit.P) {
p.Expects("a value assignment")
if p.On(keyAssignment).Skip() { if p.On(keyAssignment).Skip() {
p.Emit(ItemAssignment, "=") p.Emit(ItemAssignment, "=")
p.RouteTo(startValue) p.RouteTo(startValue)
} else {
p.UnexpectedInput("a value assignment")
} }
} }
// Values must be of the following types: String, Integer, Float, Boolean, // Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid. // Datetime, Array, or Inline Table. Unspecified values are invalid.
func startValue(p *parsekit.P) { func startValue(p *parsekit.P) {
switch { p.Expects("a value")
case p.On(anyQuote).RouteTo(startString): p.On(anyQuote).RouteTo(startString)
default:
p.UnexpectedInput("a value")
}
} }

View File

@ -8,13 +8,12 @@ var (
// UTF-8 characters. * Multi-line basic strings are surrounded by three // UTF-8 characters. * Multi-line basic strings are surrounded by three
// quotation marks on each side. * Basic strings are surrounded by // quotation marks on each side. * Basic strings are surrounded by
// quotation marks. // quotation marks.
doubleQuote3 = c.Repeat(3, doubleQuote) doubleQuote3 = c.String(`"""`)
// Any Unicode character may be used except those that must be escaped: // Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to // quotation mark, backslash, and the control characters (U+0000 to
// U+001F, U+007F). // U+001F, U+007F).
charThatMustBeEscaped = c.AnyOf(c.RuneRange('\u0000', '\u001F'), charThatMustBeEscaped = c.AnyOf(c.RuneRange('\u0000', '\u001F'), c.Rune('\u007F'))
c.Rune('\u007F'))
// For convenience, some popular characters have a compact escape sequence. // For convenience, some popular characters have a compact escape sequence.
// //
@ -36,35 +35,33 @@ var (
) )
func startString(p *parsekit.P) { func startString(p *parsekit.P) {
p.Expects("a string value")
switch { switch {
case p.On(doubleQuote3).RouteTo(startMultiLineBasicString): case p.On(doubleQuote3).RouteTo(startMultiLineBasicString):
case p.On(doubleQuote).RouteTo(startBasicString): case p.On(doubleQuote).RouteTo(startBasicString):
default:
p.UnexpectedInput("a string value")
} }
} }
func parseBasicString(p *parsekit.P) { func parseBasicString(p *parsekit.P) {
p.Expects("string contents")
switch { switch {
case p.On(parsekit.EOF).Stay(): case p.On(charThatMustBeEscaped).Stay():
p.UnexpectedEndOfFile("basic string token")
case p.On(validEscape).Accept():
p.Repeat()
case p.On(charThatMustBeEscaped).Stay():
r, _, _ := p.Match(charThatMustBeEscaped) r, _, _ := p.Match(charThatMustBeEscaped)
p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
case p.On(validEscape).Accept():
p.Repeat()
case p.On(backslash).Stay() || p.On(doubleQuote).Stay(): case p.On(backslash).Stay() || p.On(doubleQuote).Stay():
p.RouteReturn() p.RouteReturn()
case p.On(any).Accept(): case p.On(any).Accept():
p.Repeat() p.Repeat()
default:
p.UnexpectedInput("string contents")
} }
} }
func startBasicString(p *parsekit.P) { func startBasicString(p *parsekit.P) {
p.On(doubleQuote).Skip() p.Expects("a basic string")
p.RouteTo(parseBasicString).ThenTo(basicStringSpecifics) if p.On(doubleQuote).Skip() {
p.RouteTo(parseBasicString).ThenTo(basicStringSpecifics)
}
} }
// Specific handling of input for basic strings. // Specific handling of input for basic strings.
@ -88,6 +85,8 @@ func basicStringSpecifics(p *parsekit.P) {
} }
func startMultiLineBasicString(p *parsekit.P) { func startMultiLineBasicString(p *parsekit.P) {
p.On(doubleQuote3).Skip() p.Expects("a multi-line basic string")
p.EmitError("Not yet implemented") if p.On(doubleQuote3).Skip() {
p.EmitError("Not yet implemented")
}
} }

View File

@ -8,7 +8,7 @@ import (
func TestUnterminatedBasicString(t *testing.T) { func TestUnterminatedBasicString(t *testing.T) {
runStatesT(t, statesT{ runStatesT(t, statesT{
"missing closing quote", `a="value`, "[a]=", "missing closing quote", `a="value`, "[a]=",
"Unexpected end of file (expected basic string token)"}) "unexpected end of file (expected string contents)"})
} }
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {