Splitting off a more generic parser (it's fun getting to know a language, but you keep refactoring with all new stuff that you learn :-)

2019-05-17 12:44:24 +00:00 · 2019-05-17 12:44:24 +00:00 · f86ef2b918
parent aeb48edc44
commit f86ef2b918
11 changed files with 720 additions and 234 deletions
--- a/lexer/items.go
+++ b/lexer/items.go
@ -1,48 +1,35 @@
 package lexer

-import "fmt"
+import (
+	"fmt"

-// itemType represents the type of lexer items.
-type itemType int
+	"github.com/mmakaay/toml/parser"
+)

 // Definition of all the lexer item types for the TOML lexer.
 const (
-	ItemError      itemType = iota // An error occurred
-	ItemEOF                        // End of input reached
-	ItemComment                    // Comment string, starts with # till en of line
+	ItemComment    parser.ItemType = iota // Comment string, starts with # till end of line
 	ItemKey                               // Key of a key/value pair
 	ItemKeyDot                            // Dot for a dotted key
 	ItemAssignment                        // Value assignment coming up (=)
 	ItemString                            // A value of type string
 )

-// Item represents a lexer item returned from the scanner.
-type Item struct {
-	Type  itemType //Type, e.g. ItemComment, ItemString
-	Value string   // Value, e.g. "10.42", "["
-}
-
-// String returns a string representation of the lexer item.
-func (i Item) String() string {
+// ParserItemToString returns a string representation of the
+// parser.Item. This is used for unit testing purposes.
+func ParserItemToString(i parser.Item) string {
 	switch i.Type {
+	case ItemComment:
+		return fmt.Sprintf("#(%s)", i.Value)
 	case ItemKey:
 		return fmt.Sprintf("[%s]", i.Value)
+	case ItemString:
+		return fmt.Sprintf("STR(%s)", i.Value)
 	case ItemKeyDot:
 		return "."
 	case ItemAssignment:
 		return "="
-	}
-	return fmt.Sprintf("%s(%s)", i.Type, i.Value)
-}
-
-// String returns a string representation of the lexer item type.
-func (i itemType) String() string {
-	switch i {
-	case ItemComment:
-		return "#"
-	case ItemString:
-		return "STR"
 	default:
-		panic(fmt.Sprintf("No translation available for type id %d", i))
+		panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
 	}
 }
--- a/lexer/lexer.no
+++ b/lexer/lexer.no
@ -4,21 +4,23 @@ import (
 	"fmt"
 	"strings"
 	"unicode/utf8"
+
+	"github.com/mmakaay/toml/parser"
 )

 // Lexer holds the state of the lexer.
 type Lexer struct {
-	input        string       // the scanned input string
-	state        stateFn      // a function that handles the current state
-	stack        []stateFn    // state function stack, for nested parsing
+	input        string           // the scanned input
+	state        parser.StateFn   // a function that handles the current state
+	stack        []parser.StateFn // state function stack, for nested parsing
+	len          int              // the total length of the input in bytes
 	pos          int              // current byte scanning position in the input
 	newline      bool             // keep track of when we have scanned a newline
 	cursorRow    int              // current row number in the input
 	cursorColumn int              // current column position in the input
-	width        int          // width of the last rune read, for supporting backup()
 	buffer       StringBuffer     // an efficient buffer, used to build string values
-	items        chan Item    // channel of resulting lexer items
-	item         Item         // the current item as reached by Next() and retrieved by Get()
+	items        chan parser.Item // channel of resulting lexer items
+	item         parser.Item      // the current item as reached by Next() and retrieved by Get()
 	err          *Error           // an error when lexing failed, retrieved by Error()
 }

@ -35,46 +37,45 @@ func (err *Error) Error() string {
 	return err.Message
 }

-// Lex takes an input string and initializes the TOML lexer for it.
-func Lex(input string) *Lexer {
+// New takes an input string and initializes the lexer for it.
+func New(input string) *Lexer {
 	return &Lexer{
 		input: input,
+		len:   len(input),
 		state: stateKeyValuePair,
-		items: make(chan Item, 2),
+		items: make(chan parser.Item, 2),
 	}
 }

 // Next advances to the next lexer item in the input string.
-// When a valid item was found, then the boolean return parameter is returned.
+// When a valid item was found, then the boolean return parameter will be true.
 // On error or when reaching the end of the input, false is returned.
-// When an error occurred, it will be set in the error return value.
-func (l *Lexer) Next() (Item, *Error, bool) {
-	if l.state == nil {
-		panic("This should not happen: nil state reached, but entering Next()")
-	}
+// When an error occurred, it will be set in the error return value, nil otherwise.
+func (l *Lexer) Next() (parser.Item, *Error, bool) {
 	for {
 		select {
 		case i := <-l.items:
-			if i.Type == ItemEOF {
+			switch {
+			case i.Type == ItemEOF:
 				return i, nil, false
-			}
-			if i.Type == ItemError {
+			case i.Type == ItemError:
 				l.err = &Error{i.Value, l.cursorRow, l.cursorColumn}
 				return i, l.err, false
-			}
+			default:
 				l.item = i
 				return i, nil, true
+			}
 		default:
 			l.state = l.state(l)
 		}
 	}
 }

-// ToArray returns lexer items as an array.
+// ToArray returns lexer items as an array (mainly intended for testing purposes)
 // When an error occurs during scanning, a partial result will be
 // returned, accompanied by the error that occurred.
-func (l *Lexer) ToArray() ([]Item, *Error) {
-	var items []Item
+func (l *Lexer) ToArray() ([]parser.Item, *Error) {
+	var items []parser.Item
 	for {
 		item, err, more := l.Next()
 		if !more {
@ -100,25 +101,25 @@ func (l *Lexer) popState() stateFn {

 // atEndOfFile returns true when there is no more data available in the input.
 func (l *Lexer) atEndOfFile() bool {
-	return l.pos >= len(l.input)
+	return l.pos >= l.len
 }

 // emit passes a lexer item back to the client, including the provided string.
-func (l *Lexer) emit(t itemType, s string) {
-	l.items <- Item{t, s}
+func (l *Lexer) emit(t parser.ItemType, s string) {
+	l.items <- parser.Item{Type: t, Value: s}
 	l.buffer.Reset()
 }

 // emitLiteral passes a lexer item back to the client, including the accumulated
 // string buffer data as a literal string.
-func (l *Lexer) emitLiteral(t itemType) {
+func (l *Lexer) emitLiteral(t parser.ItemType) {
 	l.emit(t, l.buffer.AsLiteralString())
 }

 // emitTrimmedLiteral passes a lexer item back to the client, including the
 // accumulated string buffer data as a literal string with whitespace
 // trimmed from it.
-func (l *Lexer) emitTrimmedLiteral(t itemType) {
+func (l *Lexer) emitTrimmedLiteral(t parser.ItemType) {
 	l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
 }

@ -127,7 +128,7 @@ func (l *Lexer) emitTrimmedLiteral(t itemType) {
 // codes like \n, \t, \uXXXX, etc.)
 // This method might return an error, in case there is data in the
 // string buffer that is not valid for string interpretation.
-func (l *Lexer) emitInterpreted(t itemType) error {
+func (l *Lexer) emitInterpreted(t parser.ItemType) error {
 	s, err := l.buffer.AsInterpretedString()
 	if err != nil {
 		return err
@ -137,15 +138,10 @@ func (l *Lexer) emitInterpreted(t itemType) error {
 }

 // emitError emits a lexer error item back to the client.
-func (l *Lexer) emitError(message string) {
+func (l *Lexer) emitError(format string, args ...interface{}) stateFn {
+	message := fmt.Sprintf(format, args...)
 	l.emit(ItemError, message)
-}
-
-// backup steps back one rune
-// Can be called only once per call of next.
-func (l *Lexer) backup() {
-	l.pos -= l.width
-	l.cursorColumn--
+	return nil
 }

 // peek returns but does not advance to the next rune(s) in the input.
@ -176,18 +172,41 @@ func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) {
 	return peeked, width, true
 }

-// acceptNext adds the specified amount of runes from the input to the string buffer.
-// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
-func (l *Lexer) acceptNext(count int) bool {
-	for i := 0; i < count; i++ {
+// acceptAny adds the next rune from the input to the string buffer.
+// If no rune could be read (end of file or invalid UTF8 data), then
+// false is returned.
+func (l *Lexer) acceptAny() bool {
 	if r, ok := l.next(); ok {
 		l.buffer.WriteRune(r)
-		} else {
+		return true
+	}
 	return false
 }
+
+// accept adds the next rune to the string buffer and returns true if it's
+// from the valid set of runes. Otherwise false is returned.
+func (l *Lexer) accept(matches ...string) bool {
+	return l.acceptPattern(matches...)
+}
+
+// acceptPattern adds the next runes to the string buffer, but only
+// if the upcoming runes satisfy the provided pattern.
+// When runes were added then true is returned, false otherwise.
+func (l *Lexer) acceptPattern(pattern ...string) bool {
+	return l.progress(func(r rune) { l.buffer.WriteRune(r) }, pattern...)
+}
+
+func (l *Lexer) progress(callback func(rune), matches ...string) bool {
+	if runes, w, ok := l.match(matches...); ok {
+		l.pos += w
+		for _, r := range runes {
+			callback(r)
+			l.advanceCursor(r)
 		}
 		return true
 	}
+	return false
+}

 // acceptConsecutive adds consecutive runes from the input to the string
 // buffer when they match the rune match.
@ -200,27 +219,9 @@ func (l *Lexer) acceptConsecutive(match string) bool {
 	return accepted
 }

-// next returns the next rune from the input and a boolean indicating if
-// reading the input was successful.
-// When the end of input is reached, or an invalid UTF8 character is
-// read, then false is returned.
-func (l *Lexer) next() (rune, bool) {
-	r, w, ok := l.peek()
-	if ok {
-		l.width = w
-		l.pos += w
-		l.advanceCursor(r)
-		return r, true
-	}
-	l.width = 0
-	if r == utf8.RuneError && w == 0 {
-		l.emitError("unexpected end of file")
-	} else {
-		l.emitError("invalid UTF8 character")
-	}
-	return r, false
-}
-
+// advanceCursor advances the rune cursor one position in the
+// input data. While doing so, it keeps track of newlines,
+// so we can report on row + column positions on error.
 func (l *Lexer) advanceCursor(r rune) {
 	if l.newline {
 		l.cursorColumn = 0
@ -233,40 +234,20 @@ func (l *Lexer) advanceCursor(r rune) {

 // skipMatching skips runes, but only when all provided matches are satisfied.
 // Returns true when one or more runes were skipped.
-func (l *Lexer) skipMatching(matches ...string) bool {
-	if runes, w, ok := l.match(matches...); ok {
-		l.pos += w
-		for _, r := range runes {
-			l.advanceCursor(r)
-		}
-		return true
-	}
-	return false
+func (l *Lexer) skipMatching(pattern ...string) bool {
+	return l.progress(func(r rune) {}, pattern...)
 }

 // skipConsecutive skips consecutive runes from the provided match.
 // Returns true when one or more runes were skipped.
-func (l *Lexer) skipConsecutive(match string) bool {
+func (l *Lexer) skipConsecutive(pattern string) bool {
 	didSkip := false
-	for l.skipMatching(match) {
+	for l.skipMatching(pattern) {
 		didSkip = true
 	}
 	return didSkip
 }

-// accept adds the next rune to the string buffer and returns true if it's
-// from the valid set of runes. Otherwise false is returned.
-func (l *Lexer) accept(match string) bool {
-	if r, ok := l.next(); ok {
-		if strings.IndexRune(match, r) >= 0 {
-			l.buffer.WriteRune(r)
-			return true
-		}
-	}
-	l.backup()
-	return false
-}
-
 // upcoming checks if the upcoming runes satisfy the provided rune matches.
 // This is a lot like the match method, with the difference that
 // this one only returns the boolean value.
@ -275,6 +256,25 @@ func (l *Lexer) upcoming(matches ...string) bool {
 	return ok
 }

+// next returns the next rune from the input and a boolean indicating if
+// reading the input was successful.
+// When the end of input is reached, or an invalid UTF8 character is
+// read, then false is returned.
+func (l *Lexer) next() (rune, bool) {
+	r, w, ok := l.peek()
+	if ok {
+		l.pos += w
+		l.advanceCursor(r)
+		return r, true
+	}
+	if r == utf8.RuneError && w == 0 {
+		l.emitError("unexpected end of file")
+	} else {
+		l.emitError("invalid UTF8 character")
+	}
+	return r, false
+}
+
 // match checks if the upcoming runes satisfy the provided rune matches.
 // It returns a slice of runes that were found, their total byte width
 // and a boolean indicating whether or not all provided matches matched
@ -292,24 +292,14 @@ func (l *Lexer) match(matches ...string) ([]rune, int, bool) {
 	return peeked, width, false
 }

-// error returns an error token and terminates the scan
-// by returning nil to l.run.
-func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
-	l.items <- Item{
-		ItemError,
-		fmt.Sprintf(format, args...),
-	}
-	return nil
-}
-
 func (l *Lexer) unexpectedInputError(expected string) stateFn {
-	// next() takes care of error messages for ok == false.
+	// next() takes care of emitting errors for ok == false.
 	if r, ok := l.next(); ok {
-		l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
+		return l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
 	}
 	return nil
 }

 func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
-	return l.errorf("Unexpected end of file (expected %s)", expected)
+	return l.emitError("Unexpected end of file (expected %s)", expected)
 }
--- a/lexer/states.go
+++ b/lexer/states.go
@ -1,8 +1,6 @@
 package lexer

-// stateFn represents the state of the lexer as a function
-// that returns the next state.
-type stateFn func(*Lexer) stateFn
+import "github.com/mmakaay/toml/parser"

 const (
 	whitespace      string = " \t"
@ -28,59 +26,65 @@ const (
 	longUtf8Escape  string = "U"
 )

-func stateKeyValuePair(l *Lexer) stateFn {
-	l.skipConsecutive(whitespace + carriageReturn + newline)
-	if l.skipMatching(hash) {
+// NewParser creates a new parser, using the provided input string
+// as the data to parse.
+func NewParser(input string) *parser.Parser {
+	return parser.New(input, stateKeyValuePair)
+}
+
+func stateKeyValuePair(l *parser.Parser) parser.StateFn {
+	l.SkipConsecutive(whitespace + carriageReturn + newline)
+	if l.SkipMatching(hash) {
 		return stateComment
 	}
-	if l.upcoming(startOfKey) {
+	if l.Upcoming(startOfKey) {
 		return stateKey
 	}
 	return stateEndOfFile
 }

 // A '#' hash symbol marks the rest of the line as a comment.
-func stateComment(l *Lexer) stateFn {
+func stateComment(l *parser.Parser) parser.StateFn {
 	for {
 		switch {
-		case l.atEndOfFile() || l.skipMatching(newline):
-			l.emitTrimmedLiteral(ItemComment)
+		case l.AtEndOfFile() || l.SkipMatching(newline):
+			l.EmitLiteralTrim(ItemComment)
 			return stateKeyValuePair
 		default:
-			if !l.acceptNext(1) {
-				return l.unexpectedInputError("comment")
+			if !l.AcceptAny() {
+				return nil
 			}
 		}
 	}
 }

 // A key may be either bare, quoted or dotted.
-func stateKey(l *Lexer) stateFn {
-	if l.accept(bareKeyChars) {
+func stateKey(l *parser.Parser) parser.StateFn {
+	if l.AcceptMatching(bareKeyChars) {
 		return statebareKeyChars
 	}
-	return l.unexpectedInputError("a valid key name")
+	return l.UnexpectedInputError("a valid key name")
 }

 // Bare keys may only contain ASCII letters, ASCII digits,
 // underscores, and dashes (A-Za-z0-9_-). Note that bare
 // keys are allowed to be composed of only ASCII digits,
 // e.g. 1234, but are always interpreted as strings.
-func statebareKeyChars(l *Lexer) stateFn {
-	l.acceptConsecutive(bareKeyChars)
-	l.emitLiteral(ItemKey)
+func statebareKeyChars(l *parser.Parser) parser.StateFn {
+	l.AcceptConsecutive(bareKeyChars)
+	l.EmitLiteral(ItemKey)
 	return stateEndOfKeyOrKeyDot
 }

 // Dotted keys are a sequence of bare or quoted keys joined with a dot.
 // This allows for grouping similar properties together:
-func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
+func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
 	// Whitespace around dot-separated parts is ignored, however,
 	// best practice is to not use any extraneous whitespace.
-	l.skipConsecutive(whitespace)
-	if l.skipMatching(dot) {
-		l.emit(ItemKeyDot, "")
-		l.skipConsecutive(whitespace)
+	l.SkipConsecutive(whitespace)
+	if l.SkipMatching(dot) {
+		l.Emit(ItemKeyDot, "")
+		l.SkipConsecutive(whitespace)
 		return stateKey
 	}
 	return stateKeyAssignment
@ -90,62 +94,69 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
 // Whitespace is ignored around key names and values. The key, equals
 // sign, and value must be on the same line (though some values can
 // be broken over multiple lines).
-func stateKeyAssignment(l *Lexer) stateFn {
-	l.skipConsecutive(whitespace)
-	if l.skipMatching(equal) {
-		l.emit(ItemAssignment, "")
-		l.skipConsecutive(whitespace)
+func stateKeyAssignment(l *parser.Parser) parser.StateFn {
+	l.SkipConsecutive(whitespace)
+	if l.SkipMatching(equal) {
+		l.Emit(ItemAssignment, "")
+		l.SkipConsecutive(whitespace)
 		return stateValue
 	}
-	return l.unexpectedInputError("a value assignment")
+	return l.UnexpectedInputError("a value assignment")
 }

 // Values must be of the following types: String, Integer, Float, Boolean,
 // Datetime, Array, or Inline Table. Unspecified values are invalid.
-func stateValue(l *Lexer) stateFn {
-	l.skipConsecutive(whitespace)
-	if l.upcoming(quoteChars) {
+func stateValue(l *parser.Parser) parser.StateFn {
+	l.SkipConsecutive(whitespace)
+	if l.Upcoming(quoteChars) {
 		return stateStringValue
 	}
-	return l.unexpectedInputError("a value")
+	return l.UnexpectedInputError("a value")
 }

 // There are four ways to express strings: basic, multi-line basic, literal,
 // and multi-line literal. All strings must contain only valid UTF-8 characters.
-func stateStringValue(l *Lexer) stateFn {
+func stateStringValue(l *parser.Parser) parser.StateFn {
 	switch {
-	case l.skipMatching(doubleQuote, doubleQuote, doubleQuote):
+	case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
 		// Multi-line basic strings are surrounded by three quotation marks on each side.
 		return stateMultiLineBasicString
-	case l.skipMatching(doubleQuote):
+	case l.SkipMatching(doubleQuote):
 		// Basic strings are surrounded by quotation marks.
-		return stateBasicStringValue
+		return stateSingleLineBasicString
 	}
-	return l.unexpectedInputError("a string value")
+	return l.UnexpectedInputError("a string value")
 }

-func stateBasicStringValue(l *Lexer) stateFn {
-	if l.upcoming(doubleQuote, doubleQuote) {
+func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
+	if l.Upcoming(doubleQuote, doubleQuote) {
 		return stateMultiLineBasicString
 	}
 	return stateBasicString
 }

-const invalidBasicStringCharacters string = "" +
+func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
+	l.EmitError("Not yet implemented")
+	return nil
+}
+
+// Any Unicode character may be used except those that must be escaped:
+// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
+const invalidBasicStringCharacters string = "\"\\" +
 	"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
 	"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
 	"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
 	"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
 	"\u007F"

-func stateParseBasicString(l *Lexer) stateFn {
+func stateParseBasicString(l *parser.Parser) parser.StateFn {
 	for {
 		switch {
-		case l.atEndOfFile():
-			return l.unexpectedEndOfFile("basic string token")
-		case l.skipMatching(doubleQuote):
-			return l.popState()
-		case l.upcoming(backslash, escapeChars):
+		case l.AtEndOfFile():
+			return l.UnexpectedEndOfFile("basic string token")
+		case l.SkipMatching(doubleQuote):
+			return l.PopState()
+		case l.AcceptMatching(backslash, escapeChars):
 			// For convenience, some popular characters have a compact escape sequence.
 			// \b         - backspace       (U+0008)
 			// \t         - tab             (U+0009)
@ -154,50 +165,45 @@ func stateParseBasicString(l *Lexer) stateFn {
 			// \r         - carriage return (U+000D)
 			// \"         - quote           (U+0022)
 			// \\         - backslash       (U+005C)
-			l.acceptNext(2)
-		case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
+		case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
 			// \uXXXX     - unicode         (U+XXXX)
-			l.acceptNext(6)
-		case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
+		case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
 			// \UXXXXXXXX - unicode         (U+XXXXXXXX)
-			l.acceptNext(10)
-		case l.upcoming(backslash):
+		case l.Upcoming(backslash):
 			// All other escape sequences not listed above are reserved and,
 			// if used, TOML should produce an error.
-			return l.errorf("Invalid escape sequence in basic string")
-		case l.upcoming(invalidBasicStringCharacters):
+			return l.EmitError("Invalid escape sequence in basic string")
+		case l.Upcoming(invalidBasicStringCharacters):
 			// Any Unicode character may be used except those that must be escaped:
 			// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
-			r, _ := l.next()
-			return l.errorf("Invalid character in basic string: %q", r)
+			r, _, _ := l.Match(invalidBasicStringCharacters)
+			l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
+			return nil
 		default:
-			if !l.acceptNext(1) {
-				return l.unexpectedInputError("string value")
+			if !l.AcceptAny() {
+				return nil
 			}
 		}
 	}
 }

-func stateBasicString(l *Lexer) stateFn {
-	l.pushState(func(l *Lexer) stateFn {
-		err := l.emitInterpreted(ItemString)
+func stateBasicString(l *parser.Parser) parser.StateFn {
+	l.PushState(func(l *parser.Parser) parser.StateFn {
+		err := l.EmitInterpreted(ItemString)
 		if err != nil {
-			return l.errorf("Invalid data in string: %s", err)
+			l.EmitError("Invalid data in string: %s", err)
+			return nil
 		}
 		return stateKeyValuePair
 	})
 	return stateParseBasicString
 }

-func stateMultiLineBasicString(l *Lexer) stateFn {
-	return l.errorf("Not yet implemented")
-}
-
-func stateEndOfFile(l *Lexer) stateFn {
-	if l.atEndOfFile() {
-		l.emit(ItemEOF, "EOF")
+func stateEndOfFile(l *parser.Parser) parser.StateFn {
+	if l.AtEndOfFile() {
+		l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser?
 	} else {
-		l.unexpectedInputError("end of file")
+		l.UnexpectedInputError("end of file")
 	}
 	return nil
 }
--- a/lexer/states_test.go
+++ b/lexer/states_test.go
@ -9,7 +9,7 @@ import (
 )

 func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
-	_, err := lexer.Lex("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
+	_, err := lexer.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray()
 	t.Logf("Got error: %s", err.Error())
 	if err.Row != 4 {
 		t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4)
@ -19,21 +19,20 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
 	}
 }

+func TestEmptyInput(t *testing.T) {
+	runStatesT(t, statesT{"empty string", "", "", ""})
+}
+
 func TestInvalidUtf8Data(t *testing.T) {
 	runStatesTs(t, []statesT{
 		{"inside comment", "# \xbc", "", "invalid UTF8 character"},
 		{"bare key 1", "\xbc", "", "invalid UTF8 character"},
-		{"bare key 2", "key\xbc", "", "invalid UTF8 character"},
+		{"bare key 2", "key\xbc", "[key]", "invalid UTF8 character"},
 		{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
 		{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
 		{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
 	})
 }
-
-func TestEmptyInput(t *testing.T) {
-	runStatesT(t, statesT{"empty string", "", "", ""})
-}
-
 func TestWhiteSpaceAndNewlines(t *testing.T) {
 	runStatesTs(t, []statesT{
 		{"space", " ", "", ""},
@ -61,13 +60,13 @@ func TestKeyWithoutAssignment(t *testing.T) {
 	err := "unexpected end of file"
 	runStatesTs(t, []statesT{
 		{"bare with whitespace", " a ", "[a]", err},
-		{"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err},
-		{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", err},
-		{"bare numbers", "0123456789", "", err},
-		{"bare underscore", "_", "", err},
-		{"bare dash", "-", "", err},
-		{"bare big mix", "-hey_good_Lookin123-", "", err},
-		{"bare dotted", "a._.c", "[a].[_].", err},
+		{"bare lower", "abcdefghijklmnopqrstuvwxyz", "[abcdefghijklmnopqrstuvwxyz]", err},
+		{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err},
+		{"bare numbers", "0123456789", "[0123456789]", err},
+		{"bare underscore", "_", "[_]", err},
+		{"bare dash", "-", "[-]", err},
+		{"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err},
+		{"bare dotted", "a._.c", "[a].[_].[c]", err},
 		{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
 	})
 }
@ -90,9 +89,9 @@ func TestUnterminatedBasicString(t *testing.T) {

 func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
 	runStatesTs(t, []statesT{
-		{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`},
-		{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`},
-		{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`},
+		{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`},
+		{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`},
+		{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`},
 	})

 	// No need to write all test cases for disallowed characters by hand.
@ -100,7 +99,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
 		name := fmt.Sprintf("control character %x", rune(i))
 		runStatesT(
 			t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
-				fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))})
+				fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))})
 	}
 }

@ -163,7 +162,7 @@ func runStatesTs(t *testing.T, tests []statesT) {
 }

 func runStatesT(t *testing.T, c statesT) {
-	l, err := lexer.Lex(c.in).ToArray()
+	l, err := lexer.NewParser(c.in).ToArray()
 	if err == nil && c.err != "" {
 		t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
 	}
@ -179,14 +178,15 @@ func runStatesT(t *testing.T, c statesT) {
 			t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
 		}
 		for i, e := range expected {
-			if l[i].String() != e {
-				t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i])
+			v := lexer.ParserItemToString(l[i])
+			if v != e {
+				t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
 			}
 		}
 	case string:
 		a := make([]string, len(l))
 		for _, v := range l {
-			a = append(a, v.String())
+			a = append(a, lexer.ParserItemToString(v))
 		}
 		actual := strings.Join(a, "")
 		if actual != expected {
--- a/lexer/stringbuf.no
+++ b/lexer/stringbuf.no
--- a/lexer/stringbuf_test.no
+++ b/lexer/stringbuf_test.no
--- a/parser/parser.go
+++ b/parser/parser.go
@ -0,0 +1,261 @@
+package parser
+
+import (
+	"fmt"
+	"strings"
+	"unicode/utf8"
+)
+
+// New takes an input string and a start state,
+// and initializes the parser for it.
+func New(input string, startState StateFn) *Parser {
+	return &Parser{
+		input: input,
+		len:   len(input),
+		state: startState,
+		items: make(chan Item, 2),
+	}
+}
+
+// PushState adds the state function to the state stack.
+// This is used for implementing nested parsing.
+func (l *Parser) PushState(state StateFn) {
+	l.stack = append(l.stack, state)
+}
+
+// PopState pops the last pushed state from the state stack.
+func (l *Parser) PopState() StateFn {
+	last := len(l.stack) - 1
+	head, tail := l.stack[:last], l.stack[last]
+	l.stack = head
+	return tail
+}
+
+// AtEndOfFile returns true when there is no more data available in the input.
+func (l *Parser) AtEndOfFile() bool {
+	return l.pos >= l.len
+}
+
+// Emit passes a Parser item to the client, including the provided string.
+func (l *Parser) Emit(t ItemType, s string) {
+	l.items <- Item{t, s}
+	l.buffer.Reset()
+}
+
+// EmitLiteral passes a Parser item to the client, including the accumulated
+// string buffer data as a literal string.
+func (l *Parser) EmitLiteral(t ItemType) {
+	l.Emit(t, l.buffer.AsLiteralString())
+}
+
+// EmitLiteralTrim passes a Parser item to the client, including the
+// accumulated string buffer data as a literal string with whitespace
+// trimmed from it.
+func (l *Parser) EmitLiteralTrim(t ItemType) {
+	l.Emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
+}
+
+// EmitInterpreted passes a Parser item to the client, including the
+// accumulated string buffer data as a Go double-quoted interpreted string
+// (handling escape codes like \n, \t, \uXXXX, etc.)
+// This method might return an error, in case there is data in the
+// string buffer that is not valid for string interpretation.
+func (l *Parser) EmitInterpreted(t ItemType) error {
+	s, err := l.buffer.AsInterpretedString()
+	if err != nil {
+		return err
+	}
+	l.Emit(t, s)
+	return nil
+}
+
+// EmitError emits a Parser error item to the client.
+func (l *Parser) EmitError(format string, args ...interface{}) StateFn {
+	message := fmt.Sprintf(format, args...)
+	l.Emit(ItemError, message)
+	return nil
+}
+
+// Match checks if the upcoming runes satisfy all provided patterns.
+// It returns a slice of runes that were found, their total byte width
+// and a boolean indicating whether or not all provided patterns were
+// satisfied by the input data.
+func (l *Parser) Match(patterns ...string) ([]rune, int, bool) {
+	peeked, width, ok := l.peekMulti(len(patterns))
+	if ok {
+		for i, r := range patterns {
+			if strings.IndexRune(r, peeked[i]) < 0 {
+				return peeked, width, false
+			}
+		}
+		return peeked, width, true
+	}
+	return peeked, width, false
+}
+
+// Upcoming checks if the upcoming runes satisfy all provided patterns.
+// Returns true if all provided patterns are satisfied.
+func (l *Parser) Upcoming(patterns ...string) bool {
+	_, _, ok := l.Match(patterns...)
+	return ok
+}
+
+// AcceptAny adds the next rune from the input to the string buffer.
+// If no rune could be read (end of file or invalid UTF8 data),
+// then false is returned.
+func (l *Parser) AcceptAny() bool {
+	if r, ok := l.next(); ok {
+		l.buffer.WriteRune(r)
+		return true
+	}
+	return false
+}
+
+// AcceptMatching adds the next runes to the string buffer, but only
+// if the upcoming runes satisfy the provided patterns.
+// When runes were added then true is returned, false otherwise.
+func (l *Parser) AcceptMatching(patterns ...string) bool {
+	return l.progress(func(r rune) { l.buffer.WriteRune(r) }, patterns...)
+}
+
+// AcceptConsecutive adds consecutive runes from the input to the string
+// buffer, as long as they exist in the pattern.
+// If any runes were added then true is returned, false otherwise.
+func (l *Parser) AcceptConsecutive(pattern string) bool {
+	accepted := false
+	for l.AcceptMatching(pattern) {
+		accepted = true
+	}
+	return accepted
+}
+
+// SkipMatching skips runes, but only when all provided patterns are satisfied.
+// Returns true when one or more runes were skipped.
+func (l *Parser) SkipMatching(patterns ...string) bool {
+	if runes, w, ok := l.Match(patterns...); ok {
+		l.pos += w
+		for _, r := range runes {
+			l.advanceCursor(r)
+		}
+		return true
+	}
+	return false
+}
+
+// SkipConsecutive keeps skipping runes from the input for as long as
+// the upcoming rune exists in the provided pattern.
+// Returns true when one or more runes were skipped.
+func (l *Parser) SkipConsecutive(pattern string) bool {
+	if !l.SkipMatching(pattern) {
+		return false
+	}
+	for l.SkipMatching(pattern) {
+		// Keep going until the upcoming rune no longer matches.
+	}
+	return true
+}
+
+// ============================================================================
+// EMIT DATA AND ERRORS
+// ============================================================================
+
+// UnexpectedInputError is used by a parser implementation to emit an
+// error item that tells the client that an unexpected rune was
+// encountered in the input.
+// The parameter 'expected' is used to provide some context to the error.
+// The offending rune is consumed from the input. When no rune can be
+// read at all, next() has already emitted a fitting error and no
+// additional error is emitted here.
+// Returns the StateFn produced by EmitError, or nil when no rune
+// could be read.
+func (l *Parser) UnexpectedInputError(expected string) StateFn {
+	// next() takes care of error messages for ok == false.
+	r, ok := l.next()
+	if !ok {
+		return nil
+	}
+	// EmitError interprets its first argument as a format string. The
+	// message is therefore passed as an argument to "%s", so a '%' in
+	// the offending rune or in 'expected' shows up literally instead of
+	// being interpreted as a formatting verb.
+	return l.EmitError("%s", fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
+}
+
+// UnexpectedEndOfFile is used by a parser implementation to emit an
+// error item that tells the client that the input ended while more
+// data was expected.
+// The parameter 'expected' is used to provide some context to the error.
+// Returns the StateFn produced by EmitError.
+func (l *Parser) UnexpectedEndOfFile(expected string) StateFn {
+	const format = "Unexpected end of file (expected %s)"
+	return l.EmitError(format, expected)
+}
+
+// ============================================================================
+// LEXER : our lexer is quite low level, it only returns UTF8 runes
+// ============================================================================
+
+// peek returns but does not advance to the next rune in the input.
+// Returns the rune, its width in bytes and a boolean. The boolean will be
+// false in case no upcoming rune can be peeked: either the end of the data
+// was reached (width 0) or the upcoming bytes are not valid UTF8 (width 1).
+func (l *Parser) peek() (rune, int, bool) {
+	peeked, width := utf8.DecodeRuneInString(l.input[l.pos:])
+	// DecodeRuneInString reports failure as (RuneError, 0) for empty input
+	// and (RuneError, 1) for an invalid encoding. A correctly encoded
+	// U+FFFD in the input also decodes to RuneError, but with width 3,
+	// and must be accepted as a regular rune.
+	failed := peeked == utf8.RuneError && width <= 1
+	return peeked, width, !failed
+}
+
+// peekMulti takes a peek at multiple upcoming runes in the input.
+// Returns a slice of runes, their total width in bytes and a boolean.
+// The boolean will be false in case less runes can be peeked than
+// the requested amount (end of data or invalid UTF8 character). In that
+// case, the runes that could be peeked and their width are still returned.
+func (l *Parser) peekMulti(amount int) ([]rune, int, bool) {
+	width := 0
+	var peeked []rune
+	for i := 0; i < amount; i++ {
+		r, w := utf8.DecodeRuneInString(l.input[l.pos+width:])
+		// Only (RuneError, 0) and (RuneError, 1) signal a decoding
+		// failure. A correctly encoded U+FFFD has width 3 and is a
+		// valid rune that must not end the peek.
+		if r == utf8.RuneError && w <= 1 {
+			return peeked, width, false
+		}
+		width += w
+		peeked = append(peeked, r)
+	}
+	return peeked, width, true
+}
+
+// progress moves the cursor forward in the input, consuming the runes
+// that Match returns for the specified patterns. The cursor is only
+// moved forward when all patterns are satisfied.
+// The callback is invoked once for every consumed rune, in input
+// order, right before the cursor bookkeeping for that rune is updated.
+// Returns true when all patterns were satisfied and the cursor was
+// moved forward, false otherwise.
+func (l *Parser) progress(callback func(rune), patterns ...string) bool {
+	matched, width, ok := l.Match(patterns...)
+	if !ok {
+		return false
+	}
+	l.pos += width
+	for _, r := range matched {
+		callback(r)
+		l.advanceCursor(r)
+	}
+	return true
+}
+
+// next reads the next rune from the input and moves the cursor past it.
+// Returns the rune and a boolean indicating if reading was successful.
+// When the end of input is reached, or an invalid UTF8 character is
+// read, then false is returned. Both are considered error cases,
+// and for that reason an error is emitted to the client for them.
+func (l *Parser) next() (rune, bool) {
+	r, w, ok := l.peek()
+	switch {
+	case ok:
+		l.pos += w
+		l.advanceCursor(r)
+		return r, true
+	case r == utf8.RuneError && w == 0:
+		// A zero width means that there was nothing left to decode.
+		l.EmitError("unexpected end of file")
+	default:
+		l.EmitError("invalid UTF8 character")
+	}
+	return r, false
+}
+
+// advanceCursor advances the rune cursor one position in the
+// input data. While doing so, it keeps track of newlines,
+// so we can report on row + column positions on error.
+// The row is only incremented (and the column reset to 0) for the
+// rune that follows a newline, not for the newline rune itself.
+func (l *Parser) advanceCursor(r rune) {
+	switch {
+	case l.newline:
+		l.cursorRow++
+		l.cursorColumn = 0
+	default:
+		l.cursorColumn++
+	}
+	// Remember whether this rune ends the line, for the next call.
+	l.newline = (r == '\n')
+}
--- a/parser/stringbuf.go
+++ b/parser/stringbuf.go
@ -0,0 +1,62 @@
+package parser
+
+import (
+	"bytes"
+	"strconv"
+	"strings"
+)
+
+// StringBuffer is a string buffer implementation, which is used by the parser
+// to efficiently accumulate runes from the input and eventually turn these
+// into a string, either literal or interpreted.
+// The zero value is an empty buffer that is ready for use. The write and
+// reset methods return the buffer itself, so calls can be chained.
+type StringBuffer struct {
+	buffer bytes.Buffer
+}
+
+// Reset resets the string buffer, in order to build a new string.
+// Returns the buffer itself to allow for call chaining.
+func (b *StringBuffer) Reset() *StringBuffer {
+	b.buffer.Reset()
+	return b
+}
+
+// WriteString adds the runes of the input string to the string buffer.
+// Returns the buffer itself to allow for call chaining.
+func (b *StringBuffer) WriteString(s string) *StringBuffer {
+	for _, r := range s {
+		b.WriteRune(r)
+	}
+	return b
+}
+
+// WriteRune adds a single rune to the string buffer.
+// Returns the buffer itself to allow for call chaining.
+func (b *StringBuffer) WriteRune(r rune) *StringBuffer {
+	b.buffer.WriteRune(r)
+	return b
+}
+
+// AsLiteralString returns the string buffer as a literal string.
+// Literal means that no escape sequences are processed.
+// The buffer contents are not modified by this call.
+func (b *StringBuffer) AsLiteralString() string {
+	return b.buffer.String()
+}
+
+// AsInterpretedString returns the string in its interpreted form.
+// Interpreted means that escape sequences are handled in the way that Go would
+// have, had it been inside double quotes. It translates for example escape
+// sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their string
+// representations.
+// Since the input might contain invalid escape sequences, this method
+// also returns an error. When an error is returned, the returned string will
+// contain the string as far as it could be interpreted.
+// The buffer contents are not modified by this call.
+func (b *StringBuffer) AsInterpretedString() (string, error) {
+	var sb strings.Builder
+	tail := b.buffer.String()
+	for len(tail) > 0 {
+		r, multibyte, newtail, err := strconv.UnquoteChar(tail, '"')
+		if err != nil {
+			return sb.String(), err
+		}
+		tail = newtail
+		if multibyte {
+			sb.WriteRune(r)
+		} else {
+			// Not multibyte means a plain ASCII character or a byte escape
+			// (hex "\xNN" or octal "\NNN"). Go stores such a value as one
+			// raw byte; writing it as a rune would UTF8-encode values
+			// >= 0x80 into two bytes.
+			sb.WriteByte(byte(r))
+		}
+	}
+	return sb.String(), nil
+}
--- a/parser/stringbuf_test.go
+++ b/parser/stringbuf_test.go
@ -0,0 +1,90 @@
+package parser_test
+
+import (
+	"testing"
+
+	"github.com/mmakaay/toml/parser"
+)
+
+// TestGeneratingStringDoesNotResetBuffer checks that producing a string
+// from the buffer leaves the buffer contents in place: after requesting
+// the interpreted string, the literal string must still be available.
+func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
+	var b parser.StringBuffer
+	s1, err := b.WriteString(`hi\nthere`).AsInterpretedString()
+	if err != nil {
+		t.Fatalf("Unexpected error for try 1: %s", err)
+	}
+	s2 := b.AsLiteralString()
+	if want := "hi\nthere"; s1 != want {
+		t.Fatalf("Did not get expected string %q for try 1, but %q", want, s1)
+	}
+	if want := "hi\\nthere"; s2 != want {
+		t.Fatalf("Did not get expected string %q for try 2, but %q", want, s2)
+	}
+}
+
+// TestResetResetsBuffer checks that Reset discards everything that was
+// written to the buffer before.
+func TestResetResetsBuffer(t *testing.T) {
+	var b parser.StringBuffer
+	b.WriteRune('X')
+	b.Reset()
+	if s := b.AsLiteralString(); s != "" {
+		t.Fatalf("Did not get expected empty string, but %q", s)
+	}
+}
+
+// TestAsLiteralString checks that AsLiteralString returns exactly what
+// was written to the buffer, without interpreting escape sequences.
+func TestAsLiteralString(t *testing.T) {
+	cases := []stringbufT{
+		{"empty string", ``, ``, OK},
+		{"simple string", `Simple string!`, `Simple string!`, OK},
+		{"single quote", `'`, `'`, OK},
+		{"double quote", `"`, `"`, OK},
+		{"escaped single quote", `\'`, `\'`, OK},
+		{"escaped double quote", `\"`, `\"`, OK},
+		{"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
+		{"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
+		{"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
+	}
+	var b parser.StringBuffer
+	for _, tc := range cases {
+		b.Reset()
+		b.WriteString(tc.in)
+		if got := b.AsLiteralString(); got != tc.out {
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", tc.name, tc.in, tc.out, got)
+		}
+	}
+}
+
+// TestAsInterpretedString checks that AsInterpretedString translates
+// escape sequences into their string representations, and that it
+// returns an error for input that contains an invalid escape sequence.
+func TestAsInterpretedString(t *testing.T) {
+	cases := []stringbufT{
+		{"empty string", "", "", OK},
+		{"one character", "Simple string!", "Simple string!", OK},
+		{"escaped single quote", `\'`, "", FAIL},
+		{"escaped double quote", `\"`, `"`, OK},
+		{"bare single quote", `'`, "'", OK},
+		{"string in single quotes", `'Hello'`, `'Hello'`, OK},
+		{"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
+		{"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
+		{"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
+		{"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
+		{"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
+		{"example from spec",
+			`I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
+			"I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
+	}
+	var b parser.StringBuffer
+	for _, tc := range cases {
+		b.Reset()
+		got, err := b.WriteString(tc.in).AsInterpretedString()
+		switch {
+		case tc.isSuccessCase && err != nil:
+			t.Fatalf("[%s] unexpected error for input %q: %s", tc.name, tc.in, err)
+		case !tc.isSuccessCase && err == nil:
+			t.Fatalf("[%s] expected a failure, but no failure occurred", tc.name)
+		case tc.isSuccessCase && got != tc.out:
+			// The output is only compared for cases that must succeed.
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", tc.name, tc.in, tc.out, got)
+		}
+	}
+}
+
+// stringbufT describes a single table-driven test case for the
+// StringBuffer tests in this file.
+type stringbufT struct {
+	name          string // label for the case, included in failure messages
+	in            string // the input that is written to the string buffer
+	out           string // the string that the buffer is expected to produce
+	isSuccessCase bool   // whether producing the string must succeed (OK) or fail (FAIL)
+}
+
+// OK and FAIL are readable names for the isSuccessCase field of the
+// stringbufT test cases.
+const (
+	OK   bool = true
+	FAIL bool = false
+)
--- a/parser/types.go
+++ b/parser/types.go
@ -0,0 +1,51 @@
+package parser
+
+// Parser holds the internal state of the Parser: the state function
+// that is currently active, the input with the scanning position
+// in it, the cursor bookkeeping used for error reporting, the string
+// buffer used to build values and the channel through which resulting
+// items are handed over to the client.
+type Parser struct {
+	state        StateFn      // a function that handles the current state
+	stack        []StateFn    // state function stack, for nested parsing
+	input        string       // the scanned input
+	len          int          // the total length of the input in bytes
+	pos          int          // current byte scanning position in the input
+	newline      bool         // keep track of when we have scanned a newline
+	cursorRow    int          // current row number in the input
+	cursorColumn int          // current column position in the input
+	buffer       StringBuffer // an efficient buffer, used to build string values
+	items        chan Item    // channel of resulting Parser items
+	item         Item         // the current item as reached by Next() and retrieved by Get()
+	err          *Error       // an error when lexing failed, retrieved by Error()
+}
+
+// StateFn represents the state of the parser as a function
+// that returns the next state.
+// A state function receives the Parser that it operates on and
+// returns the state function that must handle the input from there.
+type StateFn func(*Parser) StateFn
+
+// ItemType represents the type of a parser Item.
+// Parser implementations define their own item types on top of the
+// built-in ones that are defined below.
+type ItemType int
+
+// ItemEOF is a built-in parser item type that is used for flagging that the
+// end of the input was reached.
+// NOTE(review): the built-in types are negative, presumably to stay clear
+// of implementation-defined item types that start counting at 0 (iota).
+const ItemEOF ItemType = -1
+
+// ItemError is a built-in parser item type that is used for flagging that
+// an error has occurred during parsing.
+const ItemError ItemType = -2
+
+// Item represents an item returned from the parser.
+type Item struct {
+	Type  ItemType // the type of the item, e.g. ItemEOF or ItemError
+	Value string   // the value of the item; for ItemError this is the error message
+}
+
+// Error is used as the error type when parsing errors occur.
+// Besides the error message, it carries the row and column of the
+// input cursor, to allow for useful error messages to the user.
+type Error struct {
+	Message string
+	Row     int
+	Column  int
+}
+
+// Error returns the error message, without the row and column
+// information. It makes *Error satisfy the built-in error interface.
+func (e *Error) Error() string {
+	return e.Message
+}
--- a/parser/user_api.go
+++ b/parser/user_api.go
@ -0,0 +1,39 @@
+package parser
+
+// Next retrieves the next parsed item.
+// As long as no item is waiting in the items channel, the current state
+// function is invoked to make the parser progress through the input.
+// When a valid item was found, then the boolean return parameter will be true.
+// On error or when successfully reaching the end of the input, false is returned.
+// When an error occurred, it will be set in the error return value (holding
+// the cursor row and column at that time), nil otherwise.
+func (l *Parser) Next() (Item, *Error, bool) {
+	for {
+		select {
+		case i := <-l.items:
+			switch i.Type {
+			case ItemEOF:
+				return i, nil, false
+			case ItemError:
+				l.err = &Error{Message: i.Value, Row: l.cursorRow, Column: l.cursorColumn}
+				return i, l.err, false
+			}
+			l.item = i
+			return i, nil, true
+		default:
+			// No item available yet: run the state function to produce one.
+			l.state = l.state(l)
+		}
+	}
+}
+
+// ToArray collects all Parser items into a slice (mainly intended for
+// testing purposes), by calling Next until it reports that there are
+// no more items.
+// When an error occurs during scanning, a partial result will be
+// returned, accompanied by the error that occurred.
+func (l *Parser) ToArray() ([]Item, *Error) {
+	var items []Item
+	item, err, more := l.Next()
+	for more {
+		items = append(items, item)
+		item, err, more = l.Next()
+	}
+	return items, err
+}