Added a load of parser/combinator implementation, the system seems feasible!

2019-05-20 22:40:59 +00:00 · 2019-05-20 22:40:59 +00:00 · d9d837fe6e
parent 3677ab18cb
commit d9d837fe6e
18 changed files with 502 additions and 331 deletions
--- a/parsekit/emitting.go
+++ b/parsekit/emitting.go
@ -5,26 +5,45 @@ import (
 	"strings"
 )

+// ItemType represents the type of a parser Item.
+type ItemType int
+
+// TODO private?
+// ItemEOF is a built-in parser item type that is used for flagging that the
+// end of the input was reached.
+const ItemEOF ItemType = -1
+
+// TODO private?
+// ItemError is a built-in parser item type that is used for flagging that
+// an error has occurred during parsing.
+const ItemError ItemType = -2
+
+// Item represents an item that can be emitted from the parser.
+type Item struct {
+	Type  ItemType
+	Value string
+}
+
 // Emit passes a Parser item to the client, including the provided string.
 func (p *P) Emit(t ItemType, s string) {
 	p.items <- Item{t, s}
 	p.buffer.reset()
 }

-// EmitLiteral passes a Parser item to the client, including the accumulated
+// EmitLiteral passes a Parser item to the client, including accumulated
 // string buffer data as a literal string.
 func (p *P) EmitLiteral(t ItemType) {
 	p.Emit(t, p.buffer.asLiteralString())
 }

-// EmitLiteralTrim passes a Parser item to the client, including the
+// EmitLiteralTrim passes a Parser item to the client, including
 // accumulated string buffer data as a literal string with whitespace
 // trimmed from it.
 func (p *P) EmitLiteralTrim(t ItemType) {
 	p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString()))
 }

-// EmitInterpreted passes a Parser item to the client, including the
+// EmitInterpreted passes a Parser item to the client, including
 // accumulated string buffer data a Go doubled quoted interpreted string
 // (handling escape codes like \n, \t, \uXXXX, etc.)
 // This method might return an error, in case there is data in the
@ -38,6 +57,19 @@ func (p *P) EmitInterpreted(t ItemType) error {
 	return nil
 }

+// Error is used as the error type when parsing errors occur.
+// The error includes some extra meta information to allow for useful
+// error messages to the user.
+type Error struct {
+	Message string
+	Row     int
+	Column  int
+}
+
+func (err *Error) Error() string {
+	return err.Message
+}
+
 // EmitError emits a Parser error item to the client.
 func (p *P) EmitError(format string, args ...interface{}) {
 	message := fmt.Sprintf(format, args...)
@ -51,17 +83,17 @@ func (p *P) UnexpectedInput() {
 	r, _, ok := p.peek(0)
 	switch {
 	case ok:
-		p.EmitError("unexpected character %q%s", r, p.fmtExpects())
+		p.EmitError("unexpected character %q%s", r, fmtExpects(p))
 	case r == EOF:
-		p.EmitError("unexpected end of file%s", p.fmtExpects())
+		p.EmitError("unexpected end of file%s", fmtExpects(p))
 	case r == INVALID:
-		p.EmitError("invalid UTF8 character in input%s", p.fmtExpects())
+		p.EmitError("invalid UTF8 character in input%s", fmtExpects(p))
 	default:
 		panic("Unhandled output from peek()")
 	}
 }

-func (p *P) fmtExpects() string {
+func fmtExpects(p *P) string {
 	if p.expecting == "" {
 		return ""
 	}
--- a/parsekit/internals.go
+++ b/parsekit/internals.go
@ -1,95 +0,0 @@
-package parsekit
-
-import (
-	"unicode/utf8"
-)
-
-// P holds the internal state of the parser.
-type P struct {
-	state        StateFn      // the function that handles the current state
-	nextState    StateFn      // the function that will handle the next state
-	stack        []StateFn    // state function stack, for nested parsing
-	input        string       // the scanned input
-	len          int          // the total length of the input in bytes
-	pos          int          // current byte scanning position in the input
-	newline      bool         // keep track of when we have scanned a newline
-	cursorRow    int          // current row number in the input
-	cursorColumn int          // current column position in the input
-	expecting    string       // a description of what the current state expects to find
-	buffer       stringBuffer // an efficient buffer, used to build string values
-	items        chan Item    // channel of resulting Parser items
-	item         Item         // the current item as reached by Next() and retrieved by Get()
-	err          *Error       // an error when lexing failed, retrieved by Error()
-}
-
-// peek returns but does not advance the cursor to the next rune(s) in the input.
-// Returns the rune, its width in bytes and a boolean.
-// The boolean will be false in case no upcoming rune can be peeked
-// (end of data or invalid UTF8 character).
-func (p *P) peek(offsetInBytes int) (rune, int, bool) {
-	r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:])
-	return handleRuneError(r, w)
-}
-
-// handleRuneError is used to normale rune value in case of errors.
-// When an error occurs, then utf8.RuneError will be in the rune.
-// This can however indicate one of two situations:
-// * w == 0: end of file is reached
-// * w == 1: invalid UTF character on input
-// This function lets these two cases return respectively the
-// package's own EOF or INVALID runes, to make it easy for client
-// code to distinct between these two cases.
-func handleRuneError(r rune, w int) (rune, int, bool) {
-	if r == utf8.RuneError {
-		if w == 0 {
-			return EOF, 0, false
-		}
-		return INVALID, w, false
-	}
-	return r, w, true
-}
-
-// EOF is a special rune, which is used to indicate an end of file when
-// reading a character from the input.
-// It can be treated as a rune when writing parsing rules, so a valid way to
-// say 'I now expect the end of the file' is using something like:
-// if (p.On(c.Rune(EOF)).Skip()) { ... }
-const EOF rune = -1
-
-// INVALID is a special rune, which is used to indicate an invalid UTF8
-// rune on the input.
-const INVALID rune = utf8.RuneError
-
-// StateFn defines the type of function that can be used to
-// handle a parser state.
-type StateFn func(*P)
-
-// ItemType represents the type of a parser Item.
-type ItemType int
-
-// ItemEOF is a built-in parser item type that is used for flagging that the
-// end of the input was reached.
-const ItemEOF ItemType = -1
-
-// ItemError is a built-in parser item type that is used for flagging that
-// an error has occurred during parsing.
-const ItemError ItemType = -2
-
-// Item represents an item returned from the parser.
-type Item struct {
-	Type  ItemType
-	Value string
-}
-
-// Error is used as the error type when parsing errors occur.
-// The error includes some extra meta information to allow for useful
-// error messages to the user.
-type Error struct {
-	Message string
-	Row     int
-	Column  int
-}
-
-func (err *Error) Error() string {
-	return err.Message
-}
--- a/parsekit/matchers.go
+++ b/parsekit/matchers.go
@ -6,8 +6,7 @@ import (
 )

 // Not in need of it myself, but nice to have I guess:
-// - NotFollowedBy
-// - Separated
+// - LookAhead

 // MatchDialog is used by Matcher implementations as a means
 // to retrieve data to match against and to report back
@ -92,6 +91,7 @@ type Matcher interface {
 }

 type matcherConstructors struct {
+	EndOfFile    func() MatchEndOfFile
 	Any          func() MatchAny
 	Rune         func(rune) MatchRune
 	RuneRange    func(rune, rune) MatchRuneRange
@ -99,20 +99,28 @@ type matcherConstructors struct {
 	String       func(string) MatchSequence
 	StringNoCase func(string) MatchSequence
 	AnyOf        func(...Matcher) MatchAnyOf
-	Repeat       func(int, Matcher) MatchRepeat
-	Sequence     func(...Matcher) MatchSequence
-	ZeroOrMore   func(Matcher) MatchZeroOrMore
-	OneOrMore    func(Matcher) MatchOneOrMore
+	Not          func(Matcher) MatchNot
 	Optional     func(Matcher) MatchOptional
+	Sequence     func(...Matcher) MatchSequence
+	Repeat       func(int, Matcher) MatchRepeat
+	Min          func(int, Matcher) MatchRepeat
+	Max          func(int, Matcher) MatchRepeat
+	Bounded      func(int, int, Matcher) MatchRepeat
+	ZeroOrMore   func(Matcher) MatchRepeat
+	OneOrMore    func(Matcher) MatchRepeat
+	Separated    func(Matcher, Matcher) MatchSeparated
 	Drop         func(Matcher) MatchDrop
 }

 // C provides access to a wide range of parser/combinator
-// constructors that can be used to build matching expressions.
+// constructors that can be used to build matching expressions.
 // When using C in your own parser, then it is advised to create
 // an alias in your own package for easy reference:
 // var c = parsekit.C
 var C = matcherConstructors{
+	EndOfFile: func() MatchEndOfFile {
+		return MatchEndOfFile{}
+	},
 	Any: func() MatchAny {
 		return MatchAny{}
 	},
@ -130,44 +138,73 @@ var C = matcherConstructors{
 		return MatchAnyOf{m}
 	},
 	String: func(s string) MatchSequence {
-		m := make([]Matcher, len(s))
-		for i, r := range s {
-			m[i] = MatchRune{r}
+		var m = []Matcher{}
+		for _, r := range s {
+			m = append(m, MatchRune{r})
 		}
 		return MatchSequence{m}
 	},
 	StringNoCase: func(s string) MatchSequence {
-		m := make([]Matcher, len(s))
-		for i, r := range s {
+		var m = []Matcher{}
+		for _, r := range s {
 			u := MatchRune{unicode.ToUpper(r)}
 			l := MatchRune{unicode.ToLower(r)}
-			m[i] = MatchAnyOf{[]Matcher{u, l}}
+			m = append(m, MatchAnyOf{[]Matcher{u, l}})
 		}
 		return MatchSequence{m}
 	},
-	AnyOf: func(matchers ...Matcher) MatchAnyOf {
-		return MatchAnyOf{matchers}
-	},
-	Repeat: func(count int, matcher Matcher) MatchRepeat {
-		return MatchRepeat{count, matcher}
-	},
-	Sequence: func(matchers ...Matcher) MatchSequence {
-		return MatchSequence{matchers}
-	},
-	OneOrMore: func(matcher Matcher) MatchOneOrMore {
-		return MatchOneOrMore{matcher}
-	},
-	ZeroOrMore: func(matcher Matcher) MatchZeroOrMore {
-		return MatchZeroOrMore{matcher}
-	},
 	Optional: func(matcher Matcher) MatchOptional {
 		return MatchOptional{matcher}
 	},
+	Not: func(matcher Matcher) MatchNot {
+		return MatchNot{matcher}
+	},
+	AnyOf: func(matchers ...Matcher) MatchAnyOf {
+		return MatchAnyOf{matchers}
+	},
+	Sequence: func(matchers ...Matcher) MatchSequence {
+		return MatchSequence{matchers}
+	},
+	Repeat: func(count int, matcher Matcher) MatchRepeat {
+		return MatchRepeat{count, count, matcher}
+	},
+	Min: func(min int, matcher Matcher) MatchRepeat {
+		return MatchRepeat{min, -1, matcher}
+	},
+	Max: func(max int, matcher Matcher) MatchRepeat {
+		return MatchRepeat{-1, max, matcher}
+	},
+	Bounded: func(min int, max int, matcher Matcher) MatchRepeat {
+		return MatchRepeat{min, max, matcher}
+	},
+	OneOrMore: func(matcher Matcher) MatchRepeat {
+		return MatchRepeat{1, -1, matcher}
+	},
+	ZeroOrMore: func(matcher Matcher) MatchRepeat {
+		return MatchRepeat{0, -1, matcher}
+	},
+	Separated: func(separator Matcher, matcher Matcher) MatchSeparated {
+		return MatchSeparated{separator, matcher}
+	},
 	Drop: func(matcher Matcher) MatchDrop {
 		return MatchDrop{matcher}
 	},
 }

+type MatchEndOfFile struct{}
+
+func (c MatchEndOfFile) Match(m *MatchDialog) bool {
+	r, ok := m.NextRune()
+	return !ok && r == EOF
+}
+
+type MatchInvalidRune struct{}
+
+func (c MatchInvalidRune) Match(m *MatchDialog) bool {
+	r, ok := m.NextRune()
+	return !ok && r == INVALID
+}
+
 type MatchAny struct{}

 func (c MatchAny) Match(m *MatchDialog) bool {
@ -175,6 +212,31 @@ func (c MatchAny) Match(m *MatchDialog) bool {
 	return ok
 }

+type MatchNot struct {
+	matcher Matcher
+}
+
+func (c MatchNot) Match(m *MatchDialog) bool {
+	child := m.Fork()
+	if !c.matcher.Match(child) {
+		child.Merge()
+		return true
+	}
+	return false
+}
+
+type MatchOptional struct {
+	matcher Matcher
+}
+
+func (c MatchOptional) Match(m *MatchDialog) bool {
+	child := m.Fork()
+	if c.matcher.Match(child) {
+		child.Merge()
+	}
+	return true
+}
+
 type MatchRune struct {
 	match rune
 }
@ -209,18 +271,41 @@ func (c MatchAnyOf) Match(m *MatchDialog) bool {
 }

 type MatchRepeat struct {
-	count   int
+	min     int
+	max     int
 	matcher Matcher
 }

 func (c MatchRepeat) Match(m *MatchDialog) bool {
 	child := m.Fork()
-	for i := 0; i < c.count; i++ {
+	if c.min >= 0 && c.max >= 0 && c.min > c.max {
+		panic("MatchRepeat definition error: max must not be < min")
+	}
+	total := 0
+	// Specified min: check for the minimal required amount of matches.
+	for total < c.min {
+		total++
 		if !c.matcher.Match(child) {
 			return false
 		}
 	}
+	// No specified max: include the rest of the available matches.
+	if c.max < 0 {
 		child.Merge()
+		for c.matcher.Match(child) {
+			child.Merge()
+		}
+		return true
+	}
+	// Specified max: include the rest of the available matches, up to the max.
+	child.Merge()
+	for total < c.max {
+		total++
+		if !c.matcher.Match(child) {
+			break
+		}
+		child.Merge()
+	}
 	return true
 }

@ -239,40 +324,14 @@ func (c MatchSequence) Match(m *MatchDialog) bool {
 	return true
 }

-type MatchOneOrMore struct {
+type MatchSeparated struct {
+	separator Matcher
 	matcher   Matcher
 }

-func (c MatchOneOrMore) Match(m *MatchDialog) bool {
-	child := m.Fork()
-	for c.matcher.Match(child) {
-		child.Merge()
-	}
-	return len(m.runes) > 0
-}
-
-type MatchZeroOrMore struct {
-	matcher Matcher
-}
-
-func (c MatchZeroOrMore) Match(m *MatchDialog) bool {
-	child := m.Fork()
-	for c.matcher.Match(child) {
-		child.Merge()
-	}
-	return true
-}
-
-type MatchOptional struct {
-	matcher Matcher
-}
-
-func (c MatchOptional) Match(m *MatchDialog) bool {
-	child := m.Fork()
-	if c.matcher.Match(child) {
-		child.Merge()
-	}
-	return true
+func (c MatchSeparated) Match(m *MatchDialog) bool {
+	seq := C.Sequence(c.matcher, C.ZeroOrMore(C.Sequence(c.separator, c.matcher)))
+	return seq.Match(m)
 }

 type MatchDrop struct {
--- a/parsekit/matchers_test.go
+++ b/parsekit/matchers_test.go
@ -15,7 +15,7 @@ func newParser(input string, matcher p.Matcher) *p.P {
 		p.Expects("MATCH")
 		if p.On(matcher).Accept() {
 			p.EmitLiteral(TestItem)
-			p.Repeat()
+			p.RouteRepeat()
 		}
 	}
 	return p.New(input, stateFn)
@ -120,20 +120,19 @@ func TestMatchString(t *testing.T) {
 	}
 }

-// TODO
-// func TestMatchStringNoCase(t *testing.T) {
-// 	p := newParser("HellÖ, world!", c.StringNoCase("hellö"))
-// 	r, err, ok := p.Next()
-// 	if !ok {
-// 		t.Fatalf("Parsing failed: %s", err)
-// 	}
-// 	if r.Type != TestItem {
-// 		t.Error("Parser item type not expected TestTitem")
-// 	}
-// 	if r.Value != "Hello" {
-// 		t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value)
-// 	}
-// }
+func TestMatchStringNoCase(t *testing.T) {
+	p := newParser("HellÖ, world!", c.StringNoCase("hellö"))
+	r, err, ok := p.Next()
+	if !ok {
+		t.Fatalf("Parsing failed: %s", err)
+	}
+	if r.Type != TestItem {
+		t.Error("Parser item type not expected TestTitem")
+	}
+	if r.Value != "HellÖ" {
+		t.Errorf("Parser item value is %q instead of expected \"HellÖ\"", r.Value)
+	}
+}

 func TestMatchRunes(t *testing.T) {
 	m := c.Runes('+', '-', '*', '/')
@ -156,6 +155,29 @@ func TestMatchRunes(t *testing.T) {
 	}
 }

+func TestMatchNot(t *testing.T) {
+	p := newParser("aabc", c.Not(c.Rune('b')))
+	r, err, ok := p.Next()
+	if !ok {
+		t.Fatalf("Parsing failed: %s", err)
+	}
+	if r.Value != "a" {
+		t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value)
+	}
+}
+
+func TestMatchNot_Mismatch(t *testing.T) {
+	p := newParser("aabc", c.Not(c.Rune('a')))
+	_, err, ok := p.Next()
+	if ok {
+		t.Fatalf("Parsing unexpectedly succeeded")
+	}
+	expected := "unexpected character 'a' (expected MATCH)"
+	if err.Error() != expected {
+		t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error())
+	}
+}
+
 func TestMatchAnyOf(t *testing.T) {
 	p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b')))
 	r, err, ok := p.Next()
@ -192,6 +214,30 @@ func TestMatchRepeat(t *testing.T) {
 	}
 }

+func TestMatchRepeat_Min(t *testing.T) {
+	p := newParser("1111112345", c.Min(4, c.Rune('1')))
+	r, _, _ := p.Next()
+	if r.Value != "111111" {
+		t.Errorf("Parser item value is %q instead of expected \"111111\"", r.Value)
+	}
+}
+
+func TestMatchRepeat_Max(t *testing.T) {
+	p := newParser("1111112345", c.Max(4, c.Rune('1')))
+	r, _, _ := p.Next()
+	if r.Value != "1111" {
+		t.Errorf("Parser item value is %q instead of expected \"1111\"", r.Value)
+	}
+}
+
+func TestMatchRepeat_Bounded(t *testing.T) {
+	p := newParser("1111112345", c.Bounded(3, 5, c.Rune('1')))
+	r, _, _ := p.Next()
+	if r.Value != "11111" {
+		t.Errorf("Parser item value is %q instead of expected \"11111\"", r.Value)
+	}
+}
+
 func TestMatchRepeat_Mismatch(t *testing.T) {
 	p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x')))
 	_, err, ok := p.Next()
@ -282,6 +328,21 @@ func TestMatchDrop(t *testing.T) {
 		t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value)
 	}
 }
+
+func TestMatchSeparated(t *testing.T) {
+	number := c.Bounded(1, 3, c.RuneRange('0', '9'))
+	separators := c.Runes('|', ';', ',')
+	separated_numbers := c.Separated(separators, number)
+	p := newParser("1,2;3|44,55|66;777,abc", separated_numbers)
+	r, err, ok := p.Next()
+	if !ok {
+		t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column)
+	}
+	if r.Value != "1,2;3|44,55|66;777" {
+		t.Errorf("Parser item value is %q instead of expected \"1,2;3|44,55|66;777\"", r.Value)
+	}
+}
+
 func TestMixAndMatch(t *testing.T) {
 	hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F'))
 	backslash := c.Rune('\\')
--- a/parsekit/matching.go
+++ b/parsekit/matching.go
@ -3,7 +3,7 @@ package parsekit
 // Expects is used to let a state function describe what input it is expecting.
 // This expectation is used in error messages to make them more descriptive.
 //
-// Also, when defining an expectation inside a StateFn, you do not need
+// Also, when defining an expectation inside a StateHandler, you do not need
 // to handle unexpected input yourself. When the end of the function is
 // reached without setting the next state, an automatic error will be
 // emitted. This error differentiates between issues:
@ -14,47 +14,18 @@ func (p *P) Expects(description string) {
 	p.expecting = description
 }

-// AtEndOfFile returns true when there is no more data available in the input.
-func (p *P) AtEndOfFile() bool {
-	return p.pos >= p.len
-}
-
-// AtEndOfLine returns true when the cursor is either at the end of the line
-// or at the end of the file. The cursor is not moved to a new position
-// by this method.
-func (p *P) AtEndOfLine() bool {
-	return p.AtEndOfFile() ||
-		p.On(C.String("\r\n")).Stay() ||
-		p.On(C.Rune('\n')).Stay()
-}
-
-// SkipEndOfLine returns true when the cursor is either at the end of the line
-// or at the end of the file. Additionally, when not at the end of the file,
-// the cursor is moved forward to beyond the newline.
-func (p *P) SkipEndOfLine() bool {
-	return p.AtEndOfFile() ||
-		p.On(C.String("\r\n")).Skip() ||
-		p.On(C.Rune('\n')).Skip()
-}
-
-// AcceptEndOfLine returns true when the cursor is either at the end of the line
-// or at the end of the file. When not at the end of the file, a normalized
-// newline (only a '\n' character, even with '\r\n' on the input)
-// is added to the string buffer.
-func (p *P) AcceptEndOfLine() bool {
-	if p.AtEndOfFile() {
-		return true
-	}
-	if p.SkipEndOfLine() {
-		p.buffer.writeRune('\n')
-		return true
-	}
-	return false
-}
-
-func (p *P) On(m Matcher) *action {
-	runes, widths, ok := p.Match(m)
-	return &action{
+// On checks if the current input matches the provided Matcher.
+// It returns a MatchAction struct, which provides methods that
+// can be used to tell the parser what to do with a match.
+//
+// The intended way to use this, is by chaining some methods,
+// for example: p.On(...).Accept()
+// The chained methods will as a whole return a boolean value,
+// indicating whether or not a match was found and processed.
+func (p *P) On(m Matcher) *MatchAction {
+	runes, widths, ok := p.match(m)
+	p.LastMatch = string(runes)
+	return &MatchAction{
 		p:      p,
 		runes:  runes,
 		widths: widths,
@ -62,24 +33,29 @@ func (p *P) On(m Matcher) *action {
 	}
 }

-func (p *P) Match(matcher Matcher) ([]rune, []int, bool) {
-	return p.match(0, matcher)
-}
-
-func (p *P) match(offset int, matcher Matcher) ([]rune, []int, bool) {
+// match checks if the provided Matcher matches the current input.
+// Returns a slice of matching runes, a slice of their respective
+// byte widths and a boolean.
+// The boolean will be false and the slices will be empty in case
+// the input did not match.
+func (p *P) match(matcher Matcher) ([]rune, []int, bool) {
 	m := &MatchDialog{p: p}
 	ok := matcher.Match(m)
 	return m.runes, m.widths, ok
 }

-type action struct {
+type MatchAction struct {
 	p      *P
 	runes  []rune
 	widths []int
 	ok     bool
 }

-func (a *action) Accept() bool {
+// Accept tells the parser to move the cursor past a match that was found,
+// and to store the input that matched in the string buffer.
+// Returns true in case a match was found.
+// When no match was found, then no action is taken and false is returned.
+func (a *MatchAction) Accept() bool {
 	if a.ok {
 		for i, r := range a.runes {
 			a.p.buffer.writeRune(r)
@ -89,7 +65,11 @@ func (a *action) Accept() bool {
 	return a.ok
 }

-func (a *action) Skip() bool {
+// Skip tells the parser to move the cursor past a match that was found,
+// without storing the actual match in the string buffer.
+// Returns true in case a match was found.
+// When no match was found, then no action is taken and false is returned.
+func (a *MatchAction) Skip() bool {
 	if a.ok {
 		for i, r := range a.runes {
 			type C struct {
@ -102,13 +82,31 @@ func (a *action) Skip() bool {
 	return a.ok
 }

-func (a *action) Stay() bool {
+// Stay tells the parser to not move the cursor after finding a match.
+// Returns true in case a match was found, false otherwise.
+func (a *MatchAction) Stay() bool {
 	return a.ok
 }

-// advanceCursor advances the rune cursor one position in the
-// input data. While doing so, it keeps tracks of newlines,
-// so we can report on row + column positions on error.
+// RouteTo is a shortcut for p.On(...).Stay() + p.RouteTo(...).
+func (a *MatchAction) RouteTo(state StateHandler) bool {
+	if a.ok {
+		a.p.RouteTo(state)
+	}
+	return a.ok
+}
+
+// RouteReturn is a shortcut for p.On(...).Stay() + p.RouteReturn().
+func (a *MatchAction) RouteReturn() bool {
+	if a.ok {
+		a.p.RouteReturn()
+	}
+	return a.ok
+}
+
+// advanceCursor advances the rune cursor one position in the input data.
+// While doing so, it keeps track of newlines, so we can report on
+// row + column positions on error.
 func (p *P) advanceCursor(r rune, w int) {
 	p.pos += w
 	if p.newline {
@ -119,17 +117,3 @@ func (p *P) advanceCursor(r rune, w int) {
 	}
 	p.newline = r == '\n'
 }
-
-func (a *action) RouteTo(state StateFn) bool {
-	if a.ok {
-		a.p.RouteTo(state)
-	}
-	return a.ok
-}
-
-func (a *action) RouteReturn() bool {
-	if a.ok {
-		a.p.RouteReturn()
-	}
-	return a.ok
-}
--- a/parsekit/parsekit.go
+++ b/parsekit/parsekit.go
@ -6,13 +6,36 @@ import (
 	"runtime"
 )

+// P holds the internal state of the parser.
+type P struct {
+	state        StateHandler   // the function that handles the current state
+	nextState    StateHandler   // the function that will handle the next state
+	stack        []StateHandler // state function stack, for nested parsing
+	input        string         // the scanned input
+	len          int            // the total length of the input in bytes
+	pos          int            // current byte scanning position in the input
+	newline      bool           // keep track of when we have scanned a newline
+	cursorRow    int            // current row number in the input
+	cursorColumn int            // current column position in the input
+	expecting    string         // a description of what the current state expects to find
+	buffer       stringBuffer   // an efficient buffer, used to build string values
+	LastMatch    string         // a string representation of the last matched input data
+	items        chan Item      // channel of resulting Parser items
+	item         Item           // the current item as reached by Next() and retrieved by Get()
+	err          *Error         // an error when lexing failed, retrieved by Error()
+}
+
+// StateHandler defines the type of function that can be used to
+// handle a parser state.
+type StateHandler func(*P)
+
 // New takes an input string and a start state,
 // and initializes the parser for it.
-func New(input string, startState StateFn) *P {
+func New(input string, start StateHandler) *P {
 	return &P{
 		input:     input,
 		len:       len(input),
-		nextState: startState,
+		nextState: start,
 		items:     make(chan Item, 2),
 	}
 }
@ -25,6 +48,72 @@ func (p *P) Next() (Item, *Error, bool) {
 	for {
 		select {
 		case i := <-p.items:
+			return p.makeReturnValues(i)
+		default:
+			p.runStatusHandler()
+		}
+	}
+}
+
+// runStatusHandler moves the parser, which is basically a state machine,
+// to its next status. It does so by invoking a function of the
+// type StateHandler. This function represents the current status.
+func (p *P) runStatusHandler() {
+	if state, ok := p.getNextStateHandler(); ok {
+		p.invokeNextStatusHandler(state)
+	}
+}
+
+// getNextStateHandler determines the next StateHandler to invoke in order
+// to move the parsing state machine one step further.
+//
+// When implementing a parser, the StateHandler functions must provide
+// a routing decision in every invocation. A routing decision is one
+// of the following:
+//
+// * A route is specified explicitly, which means that the next StateHandler
+//   function to invoke is registered during the StateHandler function
+//   invocation. For example: p.RouteTo(nextStatus)
+//
+// * A route is specified implicitly, which means that a previous StateHandler
+//   invocation has registered the followup route for the current state.
+//   For example: p.RouteTo(nextStatus).ThenTo(otherStatus)
+//   In this example, the nextStatus StateHandler will not have to specify
+//   a route explicitly, but otherStatus will be used implicitly after
+//   the nextStatus function has returned.
+//
+// * An expectation is registered by the StateHandler.
+//   For example: p.Expects("a cool thing")
+//   When the StateHandler returns without having specified a route, this
+//   expectation is used to generate an "unexpected input" error message.
+//
+// When no routing decision is provided by a StateHandler, then this is
+// considered a bug in the state handler, and the parser will panic.
+func (p *P) getNextStateHandler() (StateHandler, bool) {
+	switch {
+	case p.nextState != nil:
+		return p.nextState, true
+	case len(p.stack) > 0:
+		return p.popState(), true
+	case p.expecting != "":
+		p.UnexpectedInput()
+		return nil, false
+	default:
+		name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name()
+		panic(fmt.Sprintf("StateHandler %s did not provide a routing decision", name))
+	}
+}
+
+// invokeNextStatusHandler moves the parser state to the provided state
+// and invokes the StateHandler function.
+func (p *P) invokeNextStatusHandler(state StateHandler) {
+	p.state = state
+	p.nextState = nil
+	p.expecting = ""
+	p.state(p)
+}
+
+func (p *P) makeReturnValues(i Item) (Item, *Error, bool) {
 	switch {
 	case i.Type == ItemEOF:
 		return i, nil, false
@ -35,41 +124,4 @@ func (p *P) Next() (Item, *Error, bool) {
 		p.item = i
 		return i, nil, true
 	}
-		default:
-			// When implementing a parser, a state function must provide
-			// a routing decision in every state function execution.
-			// When no route is specified, then it is considered a but
-			// in the parser implementation.
-			// An exception is when a function specified its expectation
-			// using the Expects() method. In that case, an unexpected
-			// input error is emitted.
-			if p.nextState == nil {
-				if p.expecting != "" {
-					p.UnexpectedInput()
-					continue
-				} else {
-					name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name()
-					panic(fmt.Sprintf("StateFn implementation bug: %s did not set next state or input expectation", name))
-				}
-			}
-			p.state = p.nextState
-			p.nextState = nil
-			p.expecting = ""
-			p.state(p)
-		}
-	}
-}
-
-// ToArray returns Parser items as an array (mainly intended for testing purposes)
-// When an error occurs during scanning, a partial result will be
-// returned, accompanied by the error that occurred.
-func (p *P) ToArray() ([]Item, *Error) {
-	var items []Item
-	for {
-		item, err, more := p.Next()
-		if !more {
-			return items, err
-		}
-		items = append(items, item)
-	}
 }
--- a/parsekit/peek.go
+++ b/parsekit/peek.go
@ -0,0 +1,43 @@
+package parsekit
+
+import (
+	"unicode/utf8"
+)
+
+// peek returns but does not advance the cursor to the next rune(s) in the input.
+// Returns the rune, its width in bytes and a boolean.
+// The boolean will be false in case no upcoming rune can be peeked
+// (end of data or invalid UTF8 character).
+func (p *P) peek(offsetInBytes int) (rune, int, bool) {
+	r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:])
+	return handleRuneError(r, w)
+}
+
+// handleRuneError is used to normalize the rune value in case of errors.
+// When an error occurs, then utf8.RuneError will be in the rune.
+// This can however indicate one of two situations:
+// * w == 0: end of file is reached
+// * w == 1: invalid UTF8 character on input
+// This function lets these two cases return respectively the
+// package's own EOF or INVALID runes, to make it easy for client
+// code to distinguish between these two cases.
+func handleRuneError(r rune, w int) (rune, int, bool) {
+	if r == utf8.RuneError {
+		if w == 0 {
+			return EOF, 0, false
+		}
+		return INVALID, w, false
+	}
+	return r, w, true
+}
+
+// EOF is a special rune, which is used to indicate an end of file when
+// reading a character from the input.
+// It can be treated as a rune when writing parsing rules, so a valid way to
+// say 'I now expect the end of the file' is using something like:
+// if (p.On(c.Rune(EOF)).Skip()) { ... }
+const EOF rune = -1
+
+// INVALID is a special rune, which is used to indicate an invalid UTF8
+// rune on the input.
+const INVALID rune = utf8.RuneError
--- a/parsekit/staterouting.go
+++ b/parsekit/staterouting.go
@ -1,40 +1,58 @@
 package parsekit

-func (p *P) Repeat() {
-	p.nextState = p.state
-	return
+// RouteRepeat indicates that on the next parsing cycle,
+// the current StateHandler must be invoked again.
+func (p *P) RouteRepeat() {
+	p.RouteTo(p.state)
 }

-func (p *P) RouteTo(state StateFn) *routeFollowup {
+// RouteTo tells the parser what StateHandler function to invoke
+// in the next parsing cycle.
+func (p *P) RouteTo(state StateHandler) *RouteFollowup {
 	p.nextState = state
-	return &routeFollowup{p}
+	return &RouteFollowup{p}
 }

-type routeFollowup struct {
+// RouteFollowup chains parsing routes.
+// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB).
+type RouteFollowup struct {
 	p *P
 }

-func (r *routeFollowup) ThenTo(state StateFn) *routeFollowup {
+// ThenTo schedules a StateHandler that must be invoked
+// after the RouteTo StateHandler has been completed.
+// For example: p.RouteTo(handlerA).ThenTo(handlerB)
+func (r *RouteFollowup) ThenTo(state StateHandler) {
 	r.p.pushState(state)
-	return r
 }

-func (r *routeFollowup) ThenReturnHere() {
+// ThenReturnHere schedules the current StateHandler to be
+// invoked after the RouteTo StateHandler has been completed.
+// For example: p.RouteTo(handlerA).ThenReturnHere()
+func (r *RouteFollowup) ThenReturnHere() {
 	r.p.pushState(r.p.state)
 }

+// RouteReturn tells the parser that on the next cycle the
+// next scheduled route must be invoked.
+// Using this method is optional. When implementing a
+// StateHandler that is used as a sort of subroutine (using
+// constructions like p.RouteTo(subroutine).ThenReturnHere()),
+// then you can refrain from providing a routing decision
+// from that handler. The parser will automatically assume
+// a RouteReturn in that case.
 func (p *P) RouteReturn() {
 	p.nextState = p.popState()
 }

 // PushState adds the state function to the state stack.
 // This is used for implementing nested parsing.
-func (p *P) pushState(state StateFn) {
+func (p *P) pushState(state StateHandler) {
 	p.stack = append(p.stack, state)
 }

 // PopState pops the last pushed state from the state stack.
-func (p *P) popState() StateFn {
+func (p *P) popState() StateHandler {
 	last := len(p.stack) - 1
 	head, tail := p.stack[:last], p.stack[last]
 	p.stack = head
--- a/parser/syn_comments.go
+++ b/parser/syn_comments.go
@ -6,18 +6,20 @@ import (

 // A '#' hash symbol marks the rest of the line as a comment.
 func startComment(p *parsekit.P) {
-	p.On(c.OneOrMore(hash)).Skip()
+	p.Expects("start of comment")
+	if p.On(c.OneOrMore(hash)).Skip() {
 		p.RouteTo(commentContents)
+	}
 }

 // All characters up to the end of the line are included in the comment.
 func commentContents(p *parsekit.P) {
 	p.Expects("comment contents")
 	switch {
-	case p.AtEndOfLine() || p.On(endOfLine).Skip(): // TODO drop AtEndOfLine support
+	case p.On(endOfLine).Skip():
 		p.EmitLiteralTrim(ItemComment)
 		p.RouteReturn()
 	case p.On(any).Accept():
-		p.Repeat()
+		p.RouteRepeat()
 	}
 }
--- a/parser/syn_comments_test.go
+++ b/parser/syn_comments_test.go
--- a/parser/syn_eof.go
+++ b/parser/syn_eof.go
@ -2,9 +2,10 @@ package parser

 import "github.com/mmakaay/toml/parsekit"

+// TODO move into parsekit
 func endOfFile(p *parsekit.P) {
 	p.Expects("end of file")
-	if p.AtEndOfFile() {
+	if p.On(c.EndOfFile()).Stay() {
 		p.Emit(parsekit.ItemEOF, "EOF")
 	}
 }
--- a/parser/helpers_test.go
+++ b/parser/helpers_test.go
@ -22,8 +22,23 @@ func runStatesTs(t *testing.T, tests []statesT) {
 	}
 }

+// parseItemsToArray returns Parser items as an array.
+// When an error occurs during scanning, a partial result will be
+// returned, accompanied by the error that occurred.
+func parseItemsToArray(p *parsekit.P) ([]parsekit.Item, *parsekit.Error) {
+	var items []parsekit.Item
+	for {
+		item, err, more := p.Next()
+		if !more {
+			return items, err
+		}
+		items = append(items, item)
+	}
+}
+
 func runStatesT(t *testing.T, c statesT) {
-	l, err := parser.NewParser(c.in).ToArray()
+	p := parser.NewParser(c.in)
+	l, err := parseItemsToArray(p)
 	if err == nil && c.err != "" {
 		t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
 	}
--- a/parser/syn_keyvaluepair.go
+++ b/parser/syn_keyvaluepair.go
--- a/parser/syn_keyvaluepair_test.go
+++ b/parser/syn_keyvaluepair_test.go
--- a/parser/parser.go
+++ b/parser/parser.go
@ -33,7 +33,7 @@ var (
 	whitespace           = c.OneOrMore(c.AnyOf(space, tab))
 	whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed))
 	optionalWhitespace   = c.Optional(whitespace)
-	endOfLine            = c.AnyOf(lineFeed, c.Rune(parsekit.EOF))
+	endOfLine            = c.AnyOf(lineFeed, c.EndOfFile())
 )

 // NewParser creates a new parser, using the provided input string
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@ -11,7 +11,8 @@ func TestEmptyInput(t *testing.T) {
 }

 func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
-	_, err := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
+	p := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc")
+	_, err := parseItemsToArray(p)
 	t.Logf("Got error: %s", err.Error())
 	if err.Row != 4 {
 		t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4)
@ -23,7 +24,7 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) {

 func TestInvalidUtf8Data(t *testing.T) {
 	runStatesTs(t, []statesT{
-		{"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected comment contents)"},
+		{"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected end of file)"},
 		{"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"},
 		{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"},
 		{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"},
--- a/parser/value_string.go
+++ b/parser/value_string.go
@ -42,21 +42,6 @@ func startString(p *parsekit.P) {
 	}
 }

-func parseBasicString(p *parsekit.P) {
-	p.Expects("string contents")
-	switch {
-	case p.On(charThatMustBeEscaped).Stay():		
-		r, _, _ := p.Match(charThatMustBeEscaped)
-		p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
-	case p.On(validEscape).Accept():
-		p.Repeat()
-	case p.On(backslash).Stay() || p.On(doubleQuote).Stay():
-		p.RouteReturn()
-	case p.On(any).Accept():
-		p.Repeat()
-	}
-}
-
 func startBasicString(p *parsekit.P) {
 	p.Expects("a basic string")
 	if p.On(doubleQuote).Skip() {
@ -64,12 +49,27 @@ func startBasicString(p *parsekit.P) {
 	}
 }

+func parseBasicString(p *parsekit.P) {
+	p.Expects("string contents")
+	switch {
+	case p.On(charThatMustBeEscaped).Stay():
+		p.EmitError("Invalid character in basic string: %q (must be escaped)", p.LastMatch)
+	case p.On(validEscape).Accept():
+		p.RouteRepeat()
+	case p.On(backslash).RouteReturn():
+	case p.On(doubleQuote).RouteReturn():
+	case p.On(any).Accept():
+		p.RouteRepeat()
+	}
+}
+
 // Specific handling of input for basic strings.
 // * A double quote ends the string
 // * No additional \escape sequences are allowed. What the spec says about this:
 //   "All other escape sequences [..] are reserved and, if used, TOML should
 //    produce an error."
 func basicStringSpecifics(p *parsekit.P) {
+	p.Expects("string contents")
 	switch {
 	case p.On(doubleQuote).Skip():
 		if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase?
@ -79,8 +79,6 @@ func basicStringSpecifics(p *parsekit.P) {
 		}
 	case p.On(backslash).Stay():
 		p.EmitError("Invalid escape sequence")
-	default:
-		panic("String parsing should not have ended up here")
 	}
 }

--- a/parser/value_tring_test.go
+++ b/parser/value_tring_test.go
@ -13,9 +13,9 @@ func TestUnterminatedBasicString(t *testing.T) {

 func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
 	runStatesTs(t, []statesT{
-		{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`},
-		{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`},
-		{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`},
+		{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: "\x00" (must be escaped)`},
+		{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: "\n" (must be escaped)`},
+		{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: "\u007f" (must be escaped)`},
 	})

 	// No need to write all test cases for disallowed characters by hand.
@ -23,7 +23,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
 		name := fmt.Sprintf("control character %x", rune(i))
 		runStatesT(
 			t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
-				fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))})
+				fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, string(rune(i)))})
 	}
 }