Added a load of parser/combinator implementation, the system seems feasible!

This commit is contained in:
Maurice Makaay 2019-05-20 22:40:59 +00:00
parent 3677ab18cb
commit d9d837fe6e
18 changed files with 502 additions and 331 deletions

View File

@ -5,26 +5,45 @@ import (
"strings" "strings"
) )
// ItemType identifies the kind of an Item that the parser produces.
type ItemType int

// Built-in parser item types.
const (
	// TODO private?
	// ItemEOF is the built-in item type that flags that the end of the
	// input was reached.
	ItemEOF ItemType = -1

	// TODO private?
	// ItemError is the built-in item type that flags that an error has
	// occurred during parsing.
	ItemError ItemType = -2
)

// Item is a single typed value that can be emitted from the parser.
type Item struct {
	Type  ItemType // the kind of item
	Value string   // the string value carried by the item
}
// Emit passes a Parser item to the client, including the provided string. // Emit passes a Parser item to the client, including the provided string.
func (p *P) Emit(t ItemType, s string) { func (p *P) Emit(t ItemType, s string) {
p.items <- Item{t, s} p.items <- Item{t, s}
p.buffer.reset() p.buffer.reset()
} }
// EmitLiteral passes a Parser item to the client, including the accumulated // EmitLiteral passes a Parser item to the client, including accumulated
// string buffer data as a literal string. // string buffer data as a literal string.
func (p *P) EmitLiteral(t ItemType) { func (p *P) EmitLiteral(t ItemType) {
p.Emit(t, p.buffer.asLiteralString()) p.Emit(t, p.buffer.asLiteralString())
} }
// EmitLiteralTrim passes a Parser item to the client, including the // EmitLiteralTrim passes a Parser item to the client, including
// accumulated string buffer data as a literal string with whitespace // accumulated string buffer data as a literal string with whitespace
// trimmed from it. // trimmed from it.
func (p *P) EmitLiteralTrim(t ItemType) { func (p *P) EmitLiteralTrim(t ItemType) {
p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString()))
} }
// EmitInterpreted passes a Parser item to the client, including the // EmitInterpreted passes a Parser item to the client, including
// accumulated string buffer data as a Go double quoted interpreted string // accumulated string buffer data as a Go double quoted interpreted string
// (handling escape codes like \n, \t, \uXXXX, etc.) // (handling escape codes like \n, \t, \uXXXX, etc.)
// This method might return an error, in case there is data in the // This method might return an error, in case there is data in the
@ -38,6 +57,19 @@ func (p *P) EmitInterpreted(t ItemType) error {
return nil return nil
} }
// Error is the error type that is used for reporting parsing errors.
// Besides the message, it carries the row and column that go with the
// error, so that useful error messages can be presented to the user.
type Error struct {
	Message string // the error message
	Row     int    // row number that goes with the error
	Column  int    // column position that goes with the error
}

// Error implements the error interface. Only the message is returned;
// the Row and Column fields are not included in the string.
func (e *Error) Error() string {
	return e.Message
}
// EmitError emits a Parser error item to the client. // EmitError emits a Parser error item to the client.
func (p *P) EmitError(format string, args ...interface{}) { func (p *P) EmitError(format string, args ...interface{}) {
message := fmt.Sprintf(format, args...) message := fmt.Sprintf(format, args...)
@ -51,17 +83,17 @@ func (p *P) UnexpectedInput() {
r, _, ok := p.peek(0) r, _, ok := p.peek(0)
switch { switch {
case ok: case ok:
p.EmitError("unexpected character %q%s", r, p.fmtExpects()) p.EmitError("unexpected character %q%s", r, fmtExpects(p))
case r == EOF: case r == EOF:
p.EmitError("unexpected end of file%s", p.fmtExpects()) p.EmitError("unexpected end of file%s", fmtExpects(p))
case r == INVALID: case r == INVALID:
p.EmitError("invalid UTF8 character in input%s", p.fmtExpects()) p.EmitError("invalid UTF8 character in input%s", fmtExpects(p))
default: default:
panic("Unhandled output from peek()") panic("Unhandled output from peek()")
} }
} }
func (p *P) fmtExpects() string { func fmtExpects(p *P) string {
if p.expecting == "" { if p.expecting == "" {
return "" return ""
} }

View File

@ -1,95 +0,0 @@
package parsekit
import (
"unicode/utf8"
)
// P holds the internal state of the parser: the state functions that
// drive it, the input and the cursor position in that input, the buffer
// in which string values are built, and the items and error that result
// from parsing.
type P struct {
state StateFn // the function that handles the current state
nextState StateFn // the function that will handle the next state (set by RouteTo)
stack []StateFn // state function stack, for nested parsing (filled by ThenTo / ThenReturnHere)
input string // the scanned input
len int // the total length of the input in bytes
pos int // current byte scanning position in the input
newline bool // true when the last rune that the cursor moved past was '\n'
cursorRow int // current row number in the input
cursorColumn int // current column position in the input
expecting string // a description of what the current state expects to find (set by Expects)
buffer stringBuffer // an efficient buffer, used to build string values
items chan Item // channel of resulting Parser items
item Item // the current item as reached by Next() and retrieved by Get()
err *Error // an error when parsing failed, retrieved by Error()
}
// peek decodes the rune that is found at the provided byte offset from
// the current cursor position, without moving the cursor.
// It returns the rune, its width in bytes and a boolean. The boolean is
// false when no rune could be peeked (end of data or invalid UTF8
// character); see handleRuneError for the rune that is returned then.
func (p *P) peek(offsetInBytes int) (rune, int, bool) {
	upcoming := p.input[p.pos+offsetInBytes:]
	return handleRuneError(utf8.DecodeRuneInString(upcoming))
}
// handleRuneError is used to normalize the rune value in case of errors.
// When decoding fails, utf8.RuneError will be in the rune. Together with
// the width, this indicates one of two error situations:
//   * w == 0: end of file is reached
//   * w == 1: invalid UTF8 character on input
// This function lets these two cases return respectively the package's
// own EOF or INVALID runes, to make it easy for client code to
// distinguish between these two cases. The returned boolean is false
// for both of them.
//
// Any other width means that the input contains a correctly encoded
// U+FFFD replacement character. That is valid input, so it is returned
// as-is with a true boolean.
func handleRuneError(r rune, w int) (rune, int, bool) {
	if r != utf8.RuneError {
		return r, w, true
	}
	switch w {
	case 0:
		return EOF, 0, false
	case 1:
		return INVALID, w, false
	default:
		// A multi-byte utf8.RuneError is a literal U+FFFD in the input,
		// not a decoding failure.
		return r, w, true
	}
}

// EOF is a special rune, which is used to indicate an end of file when
// reading a character from the input.
// It can be treated as a rune when writing parsing rules, so a valid way to
// say 'I now expect the end of the file' is using something like:
// if (p.On(c.Rune(EOF)).Skip()) { ... }
const EOF rune = -1

// INVALID is a special rune, which is used to indicate an invalid UTF8
// rune on the input. Its value equals that of a valid U+FFFD character;
// the boolean that accompanies the rune tells the two apart.
const INVALID rune = utf8.RuneError
// StateFn defines the type of function that can be used to
// handle a parser state. The function is passed the parser (*P)
// for which it handles the state.
type StateFn func(*P)
// ItemType identifies the kind of an Item that the parser produces.
type ItemType int

// Built-in parser item types.
const (
	// ItemEOF is the built-in item type that flags that the end of the
	// input was reached.
	ItemEOF ItemType = -1

	// ItemError is the built-in item type that flags that an error has
	// occurred during parsing.
	ItemError ItemType = -2
)

// Item is a single typed value that is returned from the parser.
type Item struct {
	Type  ItemType // the kind of item
	Value string   // the string value carried by the item
}
// Error is the error type that is used for reporting parsing errors.
// Besides the message, it carries the row and column that go with the
// error, so that useful error messages can be presented to the user.
type Error struct {
	Message string // the error message
	Row     int    // row number that goes with the error
	Column  int    // column position that goes with the error
}

// Error implements the error interface. Only the message is returned;
// the Row and Column fields are not included in the string.
func (e *Error) Error() string {
	return e.Message
}

View File

@ -6,8 +6,7 @@ import (
) )
// Not in need of it myself, but nice to have I guess: // Not in need of it myself, but nice to have I guess:
// - NotFollowedBy // - LookAhead
// - Separated
// MatchDialog is used by Matcher implementations as a means // MatchDialog is used by Matcher implementations as a means
// to retrieve data to match against and to report back // to retrieve data to match against and to report back
@ -92,6 +91,7 @@ type Matcher interface {
} }
type matcherConstructors struct { type matcherConstructors struct {
EndOfFile func() MatchEndOfFile
Any func() MatchAny Any func() MatchAny
Rune func(rune) MatchRune Rune func(rune) MatchRune
RuneRange func(rune, rune) MatchRuneRange RuneRange func(rune, rune) MatchRuneRange
@ -99,20 +99,28 @@ type matcherConstructors struct {
String func(string) MatchSequence String func(string) MatchSequence
StringNoCase func(string) MatchSequence StringNoCase func(string) MatchSequence
AnyOf func(...Matcher) MatchAnyOf AnyOf func(...Matcher) MatchAnyOf
Repeat func(int, Matcher) MatchRepeat Not func(Matcher) MatchNot
Sequence func(...Matcher) MatchSequence
ZeroOrMore func(Matcher) MatchZeroOrMore
OneOrMore func(Matcher) MatchOneOrMore
Optional func(Matcher) MatchOptional Optional func(Matcher) MatchOptional
Sequence func(...Matcher) MatchSequence
Repeat func(int, Matcher) MatchRepeat
Min func(int, Matcher) MatchRepeat
Max func(int, Matcher) MatchRepeat
Bounded func(int, int, Matcher) MatchRepeat
ZeroOrMore func(Matcher) MatchRepeat
OneOrMore func(Matcher) MatchRepeat
Separated func(Matcher, Matcher) MatchSeparated
Drop func(Matcher) MatchDrop Drop func(Matcher) MatchDrop
} }
// C provides access to a wide range of parser/combinator // C provides access to a wide range of parser/combinator
// constructors that can be used to build matching expressions. // constructors that can be used to build matching expressions.
// When using C in your own parser, then it is advised to create // When using C in your own parser, then it is advised to create
// an alias in your own package for easy reference: // an alias in your own package for easy reference:
// var c = parsekit.C // var c = parsekit.C
var C = matcherConstructors{ var C = matcherConstructors{
EndOfFile: func() MatchEndOfFile {
return MatchEndOfFile{}
},
Any: func() MatchAny { Any: func() MatchAny {
return MatchAny{} return MatchAny{}
}, },
@ -130,44 +138,73 @@ var C = matcherConstructors{
return MatchAnyOf{m} return MatchAnyOf{m}
}, },
String: func(s string) MatchSequence { String: func(s string) MatchSequence {
m := make([]Matcher, len(s)) var m = []Matcher{}
for i, r := range s { for _, r := range s {
m[i] = MatchRune{r} m = append(m, MatchRune{r})
} }
return MatchSequence{m} return MatchSequence{m}
}, },
StringNoCase: func(s string) MatchSequence { StringNoCase: func(s string) MatchSequence {
m := make([]Matcher, len(s)) var m = []Matcher{}
for i, r := range s { for _, r := range s {
u := MatchRune{unicode.ToUpper(r)} u := MatchRune{unicode.ToUpper(r)}
l := MatchRune{unicode.ToLower(r)} l := MatchRune{unicode.ToLower(r)}
m[i] = MatchAnyOf{[]Matcher{u, l}} m = append(m, MatchAnyOf{[]Matcher{u, l}})
} }
return MatchSequence{m} return MatchSequence{m}
}, },
AnyOf: func(matchers ...Matcher) MatchAnyOf {
return MatchAnyOf{matchers}
},
Repeat: func(count int, matcher Matcher) MatchRepeat {
return MatchRepeat{count, matcher}
},
Sequence: func(matchers ...Matcher) MatchSequence {
return MatchSequence{matchers}
},
OneOrMore: func(matcher Matcher) MatchOneOrMore {
return MatchOneOrMore{matcher}
},
ZeroOrMore: func(matcher Matcher) MatchZeroOrMore {
return MatchZeroOrMore{matcher}
},
Optional: func(matcher Matcher) MatchOptional { Optional: func(matcher Matcher) MatchOptional {
return MatchOptional{matcher} return MatchOptional{matcher}
}, },
Not: func(matcher Matcher) MatchNot {
return MatchNot{matcher}
},
AnyOf: func(matchers ...Matcher) MatchAnyOf {
return MatchAnyOf{matchers}
},
Sequence: func(matchers ...Matcher) MatchSequence {
return MatchSequence{matchers}
},
Repeat: func(count int, matcher Matcher) MatchRepeat {
return MatchRepeat{count, count, matcher}
},
Min: func(min int, matcher Matcher) MatchRepeat {
return MatchRepeat{min, -1, matcher}
},
Max: func(max int, matcher Matcher) MatchRepeat {
return MatchRepeat{-1, max, matcher}
},
Bounded: func(min int, max int, matcher Matcher) MatchRepeat {
return MatchRepeat{min, max, matcher}
},
OneOrMore: func(matcher Matcher) MatchRepeat {
return MatchRepeat{1, -1, matcher}
},
ZeroOrMore: func(matcher Matcher) MatchRepeat {
return MatchRepeat{0, -1, matcher}
},
Separated: func(separator Matcher, matcher Matcher) MatchSeparated {
return MatchSeparated{separator, matcher}
},
Drop: func(matcher Matcher) MatchDrop { Drop: func(matcher Matcher) MatchDrop {
return MatchDrop{matcher} return MatchDrop{matcher}
}, },
} }
// MatchEndOfFile is a Matcher that matches when no next rune can be
// read from the MatchDialog and the rune that is reported is EOF.
type MatchEndOfFile struct{}

// Match reads the next rune and returns true only when reading failed
// with the EOF rune.
func (c MatchEndOfFile) Match(m *MatchDialog) bool {
	if r, ok := m.NextRune(); !ok {
		return r == EOF
	}
	return false
}
// MatchInvalidRune is a Matcher that matches when no next rune can be
// read from the MatchDialog and the rune that is reported is INVALID.
type MatchInvalidRune struct{}

// Match reads the next rune and returns true only when reading failed
// with the INVALID rune.
func (c MatchInvalidRune) Match(m *MatchDialog) bool {
	if r, ok := m.NextRune(); !ok {
		return r == INVALID
	}
	return false
}
type MatchAny struct{} type MatchAny struct{}
func (c MatchAny) Match(m *MatchDialog) bool { func (c MatchAny) Match(m *MatchDialog) bool {
@ -175,6 +212,31 @@ func (c MatchAny) Match(m *MatchDialog) bool {
return ok return ok
} }
// MatchNot is a Matcher that inverts the result of the Matcher that
// it wraps.
type MatchNot struct {
	matcher Matcher
}

// Match runs the wrapped Matcher against a fork of the MatchDialog.
// When the wrapped Matcher matches, false is returned and the fork is
// left unmerged. Otherwise, the fork is merged and true is returned.
func (c MatchNot) Match(m *MatchDialog) bool {
	probe := m.Fork()
	if c.matcher.Match(probe) {
		return false
	}
	probe.Merge()
	return true
}
// MatchOptional is a Matcher that always reports a match, whether or
// not the Matcher that it wraps matches.
type MatchOptional struct {
	matcher Matcher
}

// Match runs the wrapped Matcher against a fork of the MatchDialog and
// merges the fork only when the wrapped Matcher matched.
// The return value is always true.
func (c MatchOptional) Match(m *MatchDialog) bool {
	if attempt := m.Fork(); c.matcher.Match(attempt) {
		attempt.Merge()
	}
	return true
}
type MatchRune struct { type MatchRune struct {
match rune match rune
} }
@ -209,18 +271,41 @@ func (c MatchAnyOf) Match(m *MatchDialog) bool {
} }
type MatchRepeat struct { type MatchRepeat struct {
count int min int
max int
matcher Matcher matcher Matcher
} }
func (c MatchRepeat) Match(m *MatchDialog) bool { func (c MatchRepeat) Match(m *MatchDialog) bool {
child := m.Fork() child := m.Fork()
for i := 0; i < c.count; i++ { if c.min >= 0 && c.max >= 0 && c.min > c.max {
panic("MatchRepeat definition error: max must not be < min")
}
total := 0
// Specified min: check for the minimal required amount of matches.
for total < c.min {
total++
if !c.matcher.Match(child) { if !c.matcher.Match(child) {
return false return false
} }
} }
// No specified max: include the rest of the available matches.
if c.max < 0 {
child.Merge()
for c.matcher.Match(child) {
child.Merge()
}
return true
}
// Specified max: include the rest of the available matches, up to the max.
child.Merge() child.Merge()
for total < c.max {
total++
if !c.matcher.Match(child) {
break
}
child.Merge()
}
return true return true
} }
@ -239,40 +324,14 @@ func (c MatchSequence) Match(m *MatchDialog) bool {
return true return true
} }
type MatchOneOrMore struct { type MatchSeparated struct {
matcher Matcher separator Matcher
matcher Matcher
} }
func (c MatchOneOrMore) Match(m *MatchDialog) bool { func (c MatchSeparated) Match(m *MatchDialog) bool {
child := m.Fork() seq := C.Sequence(c.matcher, C.ZeroOrMore(C.Sequence(c.separator, c.matcher)))
for c.matcher.Match(child) { return seq.Match(m)
child.Merge()
}
return len(m.runes) > 0
}
// MatchZeroOrMore is a Matcher that applies the Matcher that it wraps
// repeatedly, for as long as it keeps matching.
type MatchZeroOrMore struct {
	matcher Matcher
}

// Match runs the wrapped Matcher against a fork of the MatchDialog,
// merging the fork after every successful match. It stops at the first
// mismatch. The return value is always true.
func (c MatchZeroOrMore) Match(m *MatchDialog) bool {
	fork := m.Fork()
	for {
		if !c.matcher.Match(fork) {
			return true
		}
		fork.Merge()
	}
}
type MatchOptional struct {
matcher Matcher
}
func (c MatchOptional) Match(m *MatchDialog) bool {
child := m.Fork()
if c.matcher.Match(child) {
child.Merge()
}
return true
} }
type MatchDrop struct { type MatchDrop struct {

View File

@ -15,7 +15,7 @@ func newParser(input string, matcher p.Matcher) *p.P {
p.Expects("MATCH") p.Expects("MATCH")
if p.On(matcher).Accept() { if p.On(matcher).Accept() {
p.EmitLiteral(TestItem) p.EmitLiteral(TestItem)
p.Repeat() p.RouteRepeat()
} }
} }
return p.New(input, stateFn) return p.New(input, stateFn)
@ -120,20 +120,19 @@ func TestMatchString(t *testing.T) {
} }
} }
// TODO func TestMatchStringNoCase(t *testing.T) {
// func TestMatchStringNoCase(t *testing.T) { p := newParser("HellÖ, world!", c.StringNoCase("hellö"))
// p := newParser("HellÖ, world!", c.StringNoCase("hellö")) r, err, ok := p.Next()
// r, err, ok := p.Next() if !ok {
// if !ok { t.Fatalf("Parsing failed: %s", err)
// t.Fatalf("Parsing failed: %s", err) }
// } if r.Type != TestItem {
// if r.Type != TestItem { t.Error("Parser item type not expected TestTitem")
// t.Error("Parser item type not expected TestTitem") }
// } if r.Value != "HellÖ" {
// if r.Value != "Hello" { t.Errorf("Parser item value is %q instead of expected \"HellÖ\"", r.Value)
// t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value) }
// } }
// }
func TestMatchRunes(t *testing.T) { func TestMatchRunes(t *testing.T) {
m := c.Runes('+', '-', '*', '/') m := c.Runes('+', '-', '*', '/')
@ -156,6 +155,29 @@ func TestMatchRunes(t *testing.T) {
} }
} }
// TestMatchNot checks that Not(Rune('b')) matches on input that starts
// with an 'a', and that the emitted item holds that "a".
func TestMatchNot(t *testing.T) {
	parser := newParser("aabc", c.Not(c.Rune('b')))
	item, err, ok := parser.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if got := item.Value; got != "a" {
		t.Errorf("Parser item value is %q instead of expected \"a\"", got)
	}
}
// TestMatchNot_Mismatch checks that Not(Rune('a')) fails on input that
// starts with an 'a', and that the parser reports the unexpected
// character in its error.
func TestMatchNot_Mismatch(t *testing.T) {
	parser := newParser("aabc", c.Not(c.Rune('a')))
	_, err, ok := parser.Next()
	if ok {
		t.Fatalf("Parsing unexpectedly succeeded")
	}
	const expected = "unexpected character 'a' (expected MATCH)"
	if actual := err.Error(); actual != expected {
		t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, actual)
	}
}
func TestMatchAnyOf(t *testing.T) { func TestMatchAnyOf(t *testing.T) {
p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b'))) p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b')))
r, err, ok := p.Next() r, err, ok := p.Next()
@ -192,6 +214,30 @@ func TestMatchRepeat(t *testing.T) {
} }
} }
// TestMatchRepeat_Min checks that Min(4, ...) takes all six available
// matches from the input, since no maximum is specified.
func TestMatchRepeat_Min(t *testing.T) {
	p := newParser("1111112345", c.Min(4, c.Rune('1')))
	r, err, ok := p.Next()
	// Fail on a parse error right away, instead of comparing the value
	// of an error item against the expected match.
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Value != "111111" {
		t.Errorf("Parser item value is %q instead of expected \"111111\"", r.Value)
	}
}
// TestMatchRepeat_Max checks that Max(4, ...) takes no more than four
// matches from the input, even though six are available.
func TestMatchRepeat_Max(t *testing.T) {
	p := newParser("1111112345", c.Max(4, c.Rune('1')))
	r, err, ok := p.Next()
	// Fail on a parse error right away, instead of comparing the value
	// of an error item against the expected match.
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Value != "1111" {
		t.Errorf("Parser item value is %q instead of expected \"1111\"", r.Value)
	}
}
// TestMatchRepeat_Bounded checks that Bounded(3, 5, ...) takes five
// matches from the input: more than the minimum, capped at the maximum.
func TestMatchRepeat_Bounded(t *testing.T) {
	p := newParser("1111112345", c.Bounded(3, 5, c.Rune('1')))
	r, err, ok := p.Next()
	// Fail on a parse error right away, instead of comparing the value
	// of an error item against the expected match.
	if !ok {
		t.Fatalf("Parsing failed: %s", err)
	}
	if r.Value != "11111" {
		t.Errorf("Parser item value is %q instead of expected \"11111\"", r.Value)
	}
}
func TestMatchRepeat_Mismatch(t *testing.T) { func TestMatchRepeat_Mismatch(t *testing.T) {
p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x'))) p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x')))
_, err, ok := p.Next() _, err, ok := p.Next()
@ -282,6 +328,21 @@ func TestMatchDrop(t *testing.T) {
t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value)
} }
} }
// TestMatchSeparated checks that Separated matches a list of numbers
// (one to three digits each) that are separated by one of the runes
// '|', ';' or ',', and that matching stops before the trailing ",abc".
func TestMatchSeparated(t *testing.T) {
	number := c.Bounded(1, 3, c.RuneRange('0', '9'))
	separators := c.Runes('|', ';', ',')
	separatedNumbers := c.Separated(separators, number)

	p := newParser("1,2;3|44,55|66;777,abc", separatedNumbers)
	r, err, ok := p.Next()
	if !ok {
		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
	}
	if r.Value != "1,2;3|44,55|66;777" {
		t.Errorf("Parser item value is %q instead of expected \"1,2;3|44,55|66;777\"", r.Value)
	}
}
func TestMixAndMatch(t *testing.T) { func TestMixAndMatch(t *testing.T) {
hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F'))
backslash := c.Rune('\\') backslash := c.Rune('\\')

View File

@ -3,7 +3,7 @@ package parsekit
// Expects is used to let a state function describe what input it is expecting. // Expects is used to let a state function describe what input it is expecting.
// This expectation is used in error messages to make them more descriptive. // This expectation is used in error messages to make them more descriptive.
// //
// Also, when defining an expectation inside a StateFn, you do not need // Also, when defining an expectation inside a StateHandler, you do not need
// to handle unexpected input yourself. When the end of the function is // to handle unexpected input yourself. When the end of the function is
// reached without setting the next state, an automatic error will be // reached without setting the next state, an automatic error will be
// emitted. This error differentiates between issues: // emitted. This error differentiates between issues:
@ -14,47 +14,18 @@ func (p *P) Expects(description string) {
p.expecting = description p.expecting = description
} }
// AtEndOfFile returns true when there is no more data available in the input. // On checks if the current input matches the provided Matcher.
func (p *P) AtEndOfFile() bool { // It returns a MatchAction struct, which provides methods that
return p.pos >= p.len // can be used to tell the parser what to do with a match.
} //
// The intended way to use this, is by chaining some methods,
// AtEndOfLine returns true when the cursor is either at the end of the line // for example: p.On(...).Accept()
// or at the end of the file. The cursor is not moved to a new position // The chained methods will as a whole return a boolean value,
// by this method. // indicating whether or not a match was found and processed.
func (p *P) AtEndOfLine() bool { func (p *P) On(m Matcher) *MatchAction {
return p.AtEndOfFile() || runes, widths, ok := p.match(m)
p.On(C.String("\r\n")).Stay() || p.LastMatch = string(runes)
p.On(C.Rune('\n')).Stay() return &MatchAction{
}
// SkipEndOfLine returns true when the cursor is either at the end of the line
// or at the end of the file. Additionally, when not at the end of the file,
// the cursor is moved forward to beyond the newline.
func (p *P) SkipEndOfLine() bool {
return p.AtEndOfFile() ||
p.On(C.String("\r\n")).Skip() ||
p.On(C.Rune('\n')).Skip()
}
// AcceptEndOfLine returns true when the cursor is either at the end of the line
// or at the end of the file. When not at the end of the file, a normalized
// newline (only a '\n' character, even with '\r\n' on the input)
// is added to the string buffer.
func (p *P) AcceptEndOfLine() bool {
if p.AtEndOfFile() {
return true
}
if p.SkipEndOfLine() {
p.buffer.writeRune('\n')
return true
}
return false
}
func (p *P) On(m Matcher) *action {
runes, widths, ok := p.Match(m)
return &action{
p: p, p: p,
runes: runes, runes: runes,
widths: widths, widths: widths,
@ -62,24 +33,29 @@ func (p *P) On(m Matcher) *action {
} }
} }
func (p *P) Match(matcher Matcher) ([]rune, []int, bool) { // Match checks if the provided Matcher matches the current input.
return p.match(0, matcher) // Returns a slice of matching runes, a slice of their respective
} // byte widths and a boolean.
// The boolean will be false and the slices will be empty in case
func (p *P) match(offset int, matcher Matcher) ([]rune, []int, bool) { // the input did not match.
func (p *P) match(matcher Matcher) ([]rune, []int, bool) {
m := &MatchDialog{p: p} m := &MatchDialog{p: p}
ok := matcher.Match(m) ok := matcher.Match(m)
return m.runes, m.widths, ok return m.runes, m.widths, ok
} }
type action struct { type MatchAction struct {
p *P p *P
runes []rune runes []rune
widths []int widths []int
ok bool ok bool
} }
func (a *action) Accept() bool { // Accept tells the parser to move the cursor past a match that was found,
// and to store the input that matched in the string buffer.
// Returns true in case a match was found.
// When no match was found, then no action is taken and false is returned.
func (a *MatchAction) Accept() bool {
if a.ok { if a.ok {
for i, r := range a.runes { for i, r := range a.runes {
a.p.buffer.writeRune(r) a.p.buffer.writeRune(r)
@ -89,7 +65,11 @@ func (a *action) Accept() bool {
return a.ok return a.ok
} }
func (a *action) Skip() bool { // Skip tells the parser to move the cursor past a match that was found,
// without storing the actual match in the string buffer.
// Returns true in case a match was found.
// When no match was found, then no action is taken and false is returned.
func (a *MatchAction) Skip() bool {
if a.ok { if a.ok {
for i, r := range a.runes { for i, r := range a.runes {
type C struct { type C struct {
@ -102,13 +82,31 @@ func (a *action) Skip() bool {
return a.ok return a.ok
} }
func (a *action) Stay() bool { // Stay tells the parser to not move the cursor after finding a match.
// Returns true in case a match was found, false otherwise.
func (a *MatchAction) Stay() bool {
return a.ok return a.ok
} }
// advanceCursor advances the rune cursor one position in the // RouteTo is a shortcut for p.On(...).Stay() + p.RouteTo(...).
// input data. While doing so, it keeps tracks of newlines, func (a *MatchAction) RouteTo(state StateHandler) bool {
// so we can report on row + column positions on error. if a.ok {
a.p.RouteTo(state)
}
return a.ok
}
// RouteReturn is a shortcut for p.On(...).Stay() + p.RouteReturn().
// The parser's RouteReturn is only called when a match was found.
// Returns true in case a match was found, false otherwise.
func (a *MatchAction) RouteReturn() bool {
	if !a.ok {
		return false
	}
	a.p.RouteReturn()
	return a.ok
}
// advanceCursor advances the rune cursor one position in the input data.
// While doing so, it keeps track of newlines, so we can report on
// row + column positions on error.
func (p *P) advanceCursor(r rune, w int) { func (p *P) advanceCursor(r rune, w int) {
p.pos += w p.pos += w
if p.newline { if p.newline {
@ -119,17 +117,3 @@ func (p *P) advanceCursor(r rune, w int) {
} }
p.newline = r == '\n' p.newline = r == '\n'
} }
// RouteTo routes the parser to the provided state, but only when a
// match was found. Returns true in case a match was found, false
// otherwise.
func (a *action) RouteTo(state StateFn) bool {
	if !a.ok {
		return false
	}
	a.p.RouteTo(state)
	return a.ok
}
// RouteReturn calls the parser's RouteReturn, but only when a match
// was found. Returns true in case a match was found, false otherwise.
func (a *action) RouteReturn() bool {
	if !a.ok {
		return false
	}
	a.p.RouteReturn()
	return a.ok
}

View File

@ -6,13 +6,36 @@ import (
"runtime" "runtime"
) )
// P holds the internal state of the parser: the StateHandler functions
// that drive it, the input and the cursor position in that input, the
// buffer in which string values are built, and the items and error that
// result from parsing.
type P struct {
state StateHandler // the function that handles the current state
nextState StateHandler // the function that will handle the next state (set by RouteTo, cleared before a handler is invoked)
stack []StateHandler // state function stack, for nested parsing (filled by ThenTo / ThenReturnHere)
input string // the scanned input
len int // the total length of the input in bytes
pos int // current byte scanning position in the input
newline bool // true when the last rune that the cursor moved past was '\n'
cursorRow int // current row number in the input
cursorColumn int // current column position in the input
expecting string // a description of what the current state expects to find (set by Expects)
buffer stringBuffer // an efficient buffer, used to build string values
LastMatch string // a string representation of the last matched input data (set by On)
items chan Item // channel of resulting Parser items
item Item // the current item as reached by Next() and retrieved by Get()
err *Error // an error when parsing failed, retrieved by Error()
}
// StateHandler defines the type of function that can be used to
// handle a parser state. The function is passed the parser (*P)
// for which it handles the state.
type StateHandler func(*P)
// New takes an input string and a start state, // New takes an input string and a start state,
// and initializes the parser for it. // and initializes the parser for it.
func New(input string, startState StateFn) *P { func New(input string, start StateHandler) *P {
return &P{ return &P{
input: input, input: input,
len: len(input), len: len(input),
nextState: startState, nextState: start,
items: make(chan Item, 2), items: make(chan Item, 2),
} }
} }
@ -25,51 +48,80 @@ func (p *P) Next() (Item, *Error, bool) {
for { for {
select { select {
case i := <-p.items: case i := <-p.items:
switch { return p.makeReturnValues(i)
case i.Type == ItemEOF:
return i, nil, false
case i.Type == ItemError:
p.err = &Error{i.Value, p.cursorRow, p.cursorColumn}
return i, p.err, false
default:
p.item = i
return i, nil, true
}
default: default:
// When implementing a parser, a state function must provide p.runStatusHandler()
// a routing decision in every state function execution.
// When no route is specified, then it is considered a bug
// in the parser implementation.
// An exception is when a function specified its expectation
// using the Expects() method. In that case, an unexpected
// input error is emitted.
if p.nextState == nil {
if p.expecting != "" {
p.UnexpectedInput()
continue
} else {
name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name()
panic(fmt.Sprintf("StateFn implementation bug: %s did not set next state or input expectation", name))
}
}
p.state = p.nextState
p.nextState = nil
p.expecting = ""
p.state(p)
} }
} }
} }
// ToArray returns Parser items as an array (mainly intended for testing purposes) // runStatusHandler moves the parser, which is basically a state machine,
// When an error occurs during scanning, a partial result will be // to its next status. It does so by invoking a function of the
// returned, accompanied by the error that occurred. // type StateHandler. This function represents the current status.
func (p *P) ToArray() ([]Item, *Error) { func (p *P) runStatusHandler() {
var items []Item if state, ok := p.getNextStateHandler(); ok {
for { p.invokeNextStatusHandler(state)
item, err, more := p.Next() }
if !more { }
return items, err
} // getNextStateHandler determines the next StatusHandler to invoke in order
items = append(items, item) // to move the parsing state machine one step further.
//
// When implementing a parser, the StateHandler functions must provide
// a routing decision in every invocation. A routing decision is one
// of the following:
//
// * A route is specified explicitly, which means that the next StatusHandler
// function to invoke is registered during the StateHandler function
// invocation. For example: p.RouteTo(nextStatus)
//
// * A route is specified implicitly, which means that a previous StateHandler
// invocation has registered the followup route for the current state.
// For example: p.RouteTo(nextStatus).ThenTo(otherStatus)
// In this example, the nextStatus StateHandler will not have to specify
// a route explicitly, but otherStatus will be used implicitly after
// the nextStatus function has returned.
//
// * An expectation is registered by the StatusHandler.
// For example: p.Expects("a cool thing")
// When the StatusHandler returns without having specified a route, this
// expectation is used to generate an "unexpected input" error message.
//
// When no routing decision is provided by a StateHandler, then this is
// considered a bug in the state handler, and the parser will panic.
func (p *P) getNextStateHandler() (StateHandler, bool) {
	// An explicitly specified route takes precedence.
	if p.nextState != nil {
		return p.nextState, true
	}
	// Otherwise, fall back to a route that was scheduled earlier.
	if len(p.stack) > 0 {
		return p.popState(), true
	}
	// No route at all, but an expectation was registered: report the
	// input as unexpected. There is no handler to invoke in this case.
	if p.expecting != "" {
		p.UnexpectedInput()
		return nil, false
	}
	// No routing decision whatsoever: a bug in the current handler.
	handlerName := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name()
	panic(fmt.Sprintf("StateHandler %s did not provide a routing decision", handlerName))
}
// invokeNextStatusHandler makes the provided StateHandler the current
// state of the parser, clears the next state and the input expectation
// that are registered in the parser, and then invokes the StateHandler.
func (p *P) invokeNextStatusHandler(state StateHandler) {
	p.state, p.nextState, p.expecting = state, nil, ""
	state(p)
}
func (p *P) makeReturnValues(i Item) (Item, *Error, bool) {
switch {
case i.Type == ItemEOF:
return i, nil, false
case i.Type == ItemError:
p.err = &Error{i.Value, p.cursorRow, p.cursorColumn}
return i, p.err, false
default:
p.item = i
return i, nil, true
} }
} }

43
parsekit/peek.go Normal file
View File

@ -0,0 +1,43 @@
package parsekit
import (
"unicode/utf8"
)
// peek decodes the rune that is found at the provided byte offset from
// the current cursor position, without moving the cursor.
// It returns the rune, its width in bytes and a boolean. The boolean is
// false when no rune could be peeked (end of data or invalid UTF8
// character); see handleRuneError for the rune that is returned then.
func (p *P) peek(offsetInBytes int) (rune, int, bool) {
	upcoming := p.input[p.pos+offsetInBytes:]
	return handleRuneError(utf8.DecodeRuneInString(upcoming))
}
// handleRuneError is used to normalize the rune value in case of errors.
// When decoding fails, utf8.RuneError will be in the rune. Together with
// the width, this indicates one of two error situations:
//   * w == 0: end of file is reached
//   * w == 1: invalid UTF8 character on input
// This function lets these two cases return respectively the package's
// own EOF or INVALID runes, to make it easy for client code to
// distinguish between these two cases. The returned boolean is false
// for both of them.
//
// Any other width means that the input contains a correctly encoded
// U+FFFD replacement character. That is valid input, so it is returned
// as-is with a true boolean.
func handleRuneError(r rune, w int) (rune, int, bool) {
	if r != utf8.RuneError {
		return r, w, true
	}
	switch w {
	case 0:
		return EOF, 0, false
	case 1:
		return INVALID, w, false
	default:
		// A multi-byte utf8.RuneError is a literal U+FFFD in the input,
		// not a decoding failure.
		return r, w, true
	}
}

// EOF is a special rune, which is used to indicate an end of file when
// reading a character from the input.
// It can be treated as a rune when writing parsing rules, so a valid way to
// say 'I now expect the end of the file' is using something like:
// if (p.On(c.Rune(EOF)).Skip()) { ... }
const EOF rune = -1

// INVALID is a special rune, which is used to indicate an invalid UTF8
// rune on the input. Its value equals that of a valid U+FFFD character;
// the boolean that accompanies the rune tells the two apart.
const INVALID rune = utf8.RuneError

View File

@ -1,40 +1,58 @@
package parsekit package parsekit
func (p *P) Repeat() { // RouteRepeat indicates that on the next parsing cycle,
p.nextState = p.state // the current StateHandler must be invoked again.
return func (p *P) RouteRepeat() {
p.RouteTo(p.state)
} }
func (p *P) RouteTo(state StateFn) *routeFollowup { // RouteTo tells the parser what StateHandler function to invoke
// in the next parsing cycle.
func (p *P) RouteTo(state StateHandler) *RouteFollowup {
p.nextState = state p.nextState = state
return &routeFollowup{p} return &RouteFollowup{p}
} }
type routeFollowup struct { // RouteFollowup chains parsing routes.
// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB).
type RouteFollowup struct {
p *P p *P
} }
func (r *routeFollowup) ThenTo(state StateFn) *routeFollowup { // ThenTo schedules a StateHandler that must be invoked
// after the RouteTo StateHandler has been completed.
// For example: p.RouteTo(handlerA).ThenTo(handlerB)
func (r *RouteFollowup) ThenTo(state StateHandler) {
r.p.pushState(state) r.p.pushState(state)
return r
} }
func (r *routeFollowup) ThenReturnHere() { // ThenReturnHere schedules the current StateHandler to be
// invoked after the RouteTo StateHandler has been completed.
// For example: p.RouteTo(handlerA).ThenReturnHere()
func (r *RouteFollowup) ThenReturnHere() {
r.p.pushState(r.p.state) r.p.pushState(r.p.state)
} }
// RouteReturn tells the parser that on the next cycle the
// next scheduled route must be invoked.
// Using this method is optional. When implementing a
// StateHandler that is used as a sort of subroutine (using
// constructions like p.RouteTo(subroutine).ThenReturnHere()),
// then you can refrain from providing a routing decision
// from that handler. The parser will automatically assume
// a RouteReturn in that case.
func (p *P) RouteReturn() { func (p *P) RouteReturn() {
p.nextState = p.popState() p.nextState = p.popState()
} }
// PushState adds the state function to the state stack. // PushState adds the state function to the state stack.
// This is used for implementing nested parsing. // This is used for implementing nested parsing.
func (p *P) pushState(state StateFn) { func (p *P) pushState(state StateHandler) {
p.stack = append(p.stack, state) p.stack = append(p.stack, state)
} }
// PopState pops the last pushed state from the state stack. // PopState pops the last pushed state from the state stack.
func (p *P) popState() StateFn { func (p *P) popState() StateHandler {
last := len(p.stack) - 1 last := len(p.stack) - 1
head, tail := p.stack[:last], p.stack[last] head, tail := p.stack[:last], p.stack[last]
p.stack = head p.stack = head

View File

@ -6,18 +6,20 @@ import (
// A '#' hash symbol marks the rest of the line as a comment. // A '#' hash symbol marks the rest of the line as a comment.
func startComment(p *parsekit.P) { func startComment(p *parsekit.P) {
p.On(c.OneOrMore(hash)).Skip() p.Expects("start of comment")
p.RouteTo(commentContents) if p.On(c.OneOrMore(hash)).Skip() {
p.RouteTo(commentContents)
}
} }
// All characters up to the end of the line are included in the comment. // All characters up to the end of the line are included in the comment.
func commentContents(p *parsekit.P) { func commentContents(p *parsekit.P) {
p.Expects("comment contents") p.Expects("comment contents")
switch { switch {
case p.AtEndOfLine() || p.On(endOfLine).Skip(): // TODO drop AtEndOfLine support case p.On(endOfLine).Skip():
p.EmitLiteralTrim(ItemComment) p.EmitLiteralTrim(ItemComment)
p.RouteReturn() p.RouteReturn()
case p.On(any).Accept(): case p.On(any).Accept():
p.Repeat() p.RouteRepeat()
} }
} }

View File

@ -2,9 +2,10 @@ package parser
import "github.com/mmakaay/toml/parsekit" import "github.com/mmakaay/toml/parsekit"
// TODO move into parsekit
func endOfFile(p *parsekit.P) { func endOfFile(p *parsekit.P) {
p.Expects("end of file") p.Expects("end of file")
if p.AtEndOfFile() { if p.On(c.EndOfFile()).Stay() {
p.Emit(parsekit.ItemEOF, "EOF") p.Emit(parsekit.ItemEOF, "EOF")
} }
} }

View File

@ -22,8 +22,23 @@ func runStatesTs(t *testing.T, tests []statesT) {
} }
} }
// parseItemsToArray collects the items produced by the parser in a slice.
// It keeps calling p.Next() until that call reports that there are no
// more items. The error from that final call is returned as well, so
// when an error occurs during parsing, a partial result is returned,
// accompanied by the error that occurred.
func parseItemsToArray(p *parsekit.P) ([]parsekit.Item, *parsekit.Error) {
	var collected []parsekit.Item
	for {
		item, err, ok := p.Next()
		if ok {
			collected = append(collected, item)
			continue
		}
		return collected, err
	}
}
func runStatesT(t *testing.T, c statesT) { func runStatesT(t *testing.T, c statesT) {
l, err := parser.NewParser(c.in).ToArray() p := parser.NewParser(c.in)
l, err := parseItemsToArray(p)
if err == nil && c.err != "" { if err == nil && c.err != "" {
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err) t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
} }

View File

@ -33,7 +33,7 @@ var (
whitespace = c.OneOrMore(c.AnyOf(space, tab)) whitespace = c.OneOrMore(c.AnyOf(space, tab))
whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed)) whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed))
optionalWhitespace = c.Optional(whitespace) optionalWhitespace = c.Optional(whitespace)
endOfLine = c.AnyOf(lineFeed, c.Rune(parsekit.EOF)) endOfLine = c.AnyOf(lineFeed, c.EndOfFile())
) )
// NewParser creates a new parser, using the provided input string // NewParser creates a new parser, using the provided input string

View File

@ -11,7 +11,8 @@ func TestEmptyInput(t *testing.T) {
} }
func TestErrorsIncludeLineAndRowPosition(t *testing.T) { func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
_, err := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() p := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc")
_, err := parseItemsToArray(p)
t.Logf("Got error: %s", err.Error()) t.Logf("Got error: %s", err.Error())
if err.Row != 4 { if err.Row != 4 {
t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4) t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4)
@ -23,7 +24,7 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
func TestInvalidUtf8Data(t *testing.T) { func TestInvalidUtf8Data(t *testing.T) {
runStatesTs(t, []statesT{ runStatesTs(t, []statesT{
{"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected comment contents)"}, {"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected end of file)"},
{"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"}, {"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"},
{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"}, {"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"},
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"}, {"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"},

View File

@ -42,21 +42,6 @@ func startString(p *parsekit.P) {
} }
} }
// parseBasicString handles one step of reading the contents of a basic
// string. The cases are checked in order, so the first matching one wins.
func parseBasicString(p *parsekit.P) {
p.Expects("string contents")
switch {
// A character that must be escaped is reported as an error. The match
// is retrieved once more to include its first element in the message.
case p.On(charThatMustBeEscaped).Stay():
r, _, _ := p.Match(charThatMustBeEscaped)
p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
// A valid escape sequence is accepted and this handler is repeated.
case p.On(validEscape).Accept():
p.Repeat()
// A backslash (not part of a valid escape, since that case is checked
// first) or a double quote is left in place and routing returns to
// the state that was scheduled by the caller.
case p.On(backslash).Stay() || p.On(doubleQuote).Stay():
p.RouteReturn()
// Anything else is accepted and this handler is repeated.
case p.On(any).Accept():
p.Repeat()
}
}
func startBasicString(p *parsekit.P) { func startBasicString(p *parsekit.P) {
p.Expects("a basic string") p.Expects("a basic string")
if p.On(doubleQuote).Skip() { if p.On(doubleQuote).Skip() {
@ -64,12 +49,27 @@ func startBasicString(p *parsekit.P) {
} }
} }
// parseBasicString handles one step of reading the contents of a basic
// string. The checks are performed in a fixed order and the first one
// that matches decides what happens:
//   - a character that must be escaped results in an error;
//   - a valid escape sequence is accepted, after which this handler
//     is repeated;
//   - a backslash (not part of a valid escape, since that is checked
//     first) or a double quote triggers RouteReturn;
//   - any other input is accepted, after which this handler is repeated.
//
// When none of the checks match, no routing decision is made here.
func parseBasicString(p *parsekit.P) {
	p.Expects("string contents")

	if p.On(charThatMustBeEscaped).Stay() {
		p.EmitError("Invalid character in basic string: %q (must be escaped)", p.LastMatch)
		return
	}
	if p.On(validEscape).Accept() {
		p.RouteRepeat()
		return
	}
	// The backslash check must run before the double quote check; the
	// short-circuit || keeps that order.
	if p.On(backslash).RouteReturn() || p.On(doubleQuote).RouteReturn() {
		return
	}
	if p.On(any).Accept() {
		p.RouteRepeat()
	}
}
// Specific handling of input for basic strings. // Specific handling of input for basic strings.
// * A double quote ends the string // * A double quote ends the string
// * No additional \escape sequences are allowed. What the spec say about this: // * No additional \escape sequences are allowed. What the spec say about this:
// "All other escape sequences [..] are reserved and, if used, TOML should // "All other escape sequences [..] are reserved and, if used, TOML should
// produce an error."" // produce an error.""
func basicStringSpecifics(p *parsekit.P) { func basicStringSpecifics(p *parsekit.P) {
p.Expects("string contents")
switch { switch {
case p.On(doubleQuote).Skip(): case p.On(doubleQuote).Skip():
if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase? if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase?
@ -79,8 +79,6 @@ func basicStringSpecifics(p *parsekit.P) {
} }
case p.On(backslash).Stay(): case p.On(backslash).Stay():
p.EmitError("Invalid escape sequence") p.EmitError("Invalid escape sequence")
default:
panic("String parsing should not have ended up here")
} }
} }

View File

@ -13,9 +13,9 @@ func TestUnterminatedBasicString(t *testing.T) {
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
runStatesTs(t, []statesT{ runStatesTs(t, []statesT{
{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`}, {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: "\x00" (must be escaped)`},
{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`}, {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: "\n" (must be escaped)`},
{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`}, {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: "\u007f" (must be escaped)`},
}) })
// No need to write all test cases for disallowed characters by hand. // No need to write all test cases for disallowed characters by hand.
@ -23,7 +23,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
name := fmt.Sprintf("control character %x", rune(i)) name := fmt.Sprintf("control character %x", rune(i))
runStatesT( runStatesT(
t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=", t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))}) fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, string(rune(i)))})
} }
} }