Code cleanup and refactoring run, both functional code and the tests.

This commit is contained in:
Maurice Makaay 2019-05-16 14:17:06 +00:00
parent 6636a7a672
commit cbc4f04179
7 changed files with 435 additions and 334 deletions

View File

@ -12,6 +12,7 @@ const (
ItemComment // Comment string, starts with # till end of line
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
@ -26,26 +27,26 @@ func (i Item) String() string {
switch i.Type {
case ItemEOF:
return "EOF"
case ItemError:
return "Error: " + i.Value
case ItemKey:
return fmt.Sprintf("[%s]", i.Value)
case ItemKeyDot:
return "."
case ItemAssignment:
return "="
}
return fmt.Sprintf("%s(%q)", i.Type, i.Value)
return fmt.Sprintf("%s(%s)", i.Type, i.Value)
}
// String returns a string representation of the lexer item type.
func (i itemType) String() string {
switch i {
case ItemError:
return "Error"
return "ERR"
case ItemComment:
return "Comment"
case ItemKey:
return "Key"
case ItemKeyDot:
return "KeyDot"
return "#"
case ItemString:
return "String"
return "STR"
default:
return fmt.Sprintf("<type id %d>", i)
panic(fmt.Sprintf("No translation available for type id %d", i))
}
}

View File

@ -12,7 +12,6 @@ type Lexer struct {
input string // the scanned input string
state stateFn // a function that handles the current state
stack []stateFn // state function stack, for nested parsing
start int // start position of the currently scanned item
pos int // current scanning position in the input
width int // width of the last rune read, for supporting backup()
buffer StringBuffer // an efficient buffer, used to build string values
@ -99,29 +98,44 @@ func (l *Lexer) popState() stateFn {
return tail
}
// TODO no longer needed?
// getAcceptedString returns the string as accepted by the
// accept* methods so far.
func (l *Lexer) getAcceptedString() string {
return l.input[l.start:l.pos]
}
// emit passes a scanned item back to the client.
func (l *Lexer) emit(t itemType, v string) {
l.items <- Item{t, v}
l.start = l.pos
}
// TODO no longer needed now that we have the string builder?
// ignore skips over the pending input before the current position.
func (l *Lexer) ignore() {
l.start = l.pos
}
// atEndOfFile returns true when there is no more data available in the input.
func (l *Lexer) atEndOfFile() bool {
return l.pos >= len(l.input)
}
// emit passes a lexer item back to the client, including the provided string.
func (l *Lexer) emit(t itemType, s string) {
l.items <- Item{t, s}
l.buffer.Reset()
}
// emitLiteral passes a lexer item back to the client, including the accumulated
// string buffer data as a literal string.
func (l *Lexer) emitLiteral(t itemType) {
l.emit(t, l.buffer.AsLiteralString())
}
// emitTrimmedLiteral passes a lexer item back to the client, including the
// accumulated string buffer data as a literal string with whitespace
// trimmed from it.
func (l *Lexer) emitTrimmedLiteral(t itemType) {
l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
}
// emitInterpreted passes a lexer item back to the client, including the
// accumulated string buffer data as an interpreted string (handling escape
// codes like \n, \t, \uXXXX, etc.)
// This method might return an error, in case there is data in the
// string buffer that is not valid for string interpretation.
func (l *Lexer) emitInterpreted(t itemType) error {
s, err := l.buffer.AsInterpretedString()
if err != nil {
return err
}
l.emit(t, s)
return nil
}
// backup steps back one rune
// Can be called only once per call of next.
func (l *Lexer) backup() {
@ -129,16 +143,119 @@ func (l *Lexer) backup() {
}
// peek returns but does not advance to the next rune(s) in the input.
func (l *Lexer) peek() rune {
r := l.next()
l.backup()
return r
// Returns the rune, its width and a boolean. The boolean will be false in case
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
func (l *Lexer) peek() (rune, int, bool) {
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch {
case r == utf8.RuneError:
return utf8.RuneError, w, false
default:
return r, w, true
}
}
// TODO still needed now that we have the string buffer?
// accept consumes the next rune if it's from the valid set of runes.
// peekMulti takes a peek at multiple upcoming runes in the input.
// Returns a slice of runes and a boolean. The boolean will be false in case
// fewer upcoming runes can be peeked than the requested amount
// (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
offset := 0
var peeked []rune
for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:])
switch {
case r == utf8.RuneError:
return peeked, false
default:
offset += w
peeked = append(peeked, r)
}
}
return peeked, true
}
// acceptNext adds the next rune from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data),
// then false is returned.
func (l *Lexer) acceptNext() bool {
r := l.next()
if r == endOfFile || r == utf8.RuneError {
return false
}
l.buffer.WriteRune(r)
return true
}
// acceptFrom adds the next rune from the input to the string buffer
// when it matches in the provided runes. If the next rune does
// not match, false is returned.
func (l *Lexer) acceptFrom(runes string) bool {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
l.buffer.WriteRune(r)
return true
}
l.backup()
return false
}
// acceptRun adds consecutive runes from the input to the string
// buffer when they match the provided runes. If no runes were added
// at all, false is returned.
func (l *Lexer) acceptRun(runes string) bool {
accepted := false
for l.acceptFrom(runes) {
accepted = true
}
return accepted
}
// TODO meh... ugly rune.
var endOfFile rune = -1
// next returns the next rune from the input.
func (l *Lexer) next() rune {
l.width = 0
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch {
case r == utf8.RuneError && w == 0:
return endOfFile
case r == utf8.RuneError:
return utf8.RuneError
default:
l.width = w
l.pos += w
return r
}
}
// skip skips a rune from the set of accepted runes.
// Returns true when a rune was skipped.
func (l *Lexer) skip(runes string) bool {
r, w, _ := l.peek()
if strings.IndexRune(runes, r) >= 0 {
l.pos += w
return true
}
return false
}
// skipRun skips a run of runes from the set of accepted runes.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipRun(runes string) bool {
didSkip := false
for l.skip(runes) {
didSkip = true
}
return didSkip
}
// accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(runes string) bool {
if strings.IndexRune(runes, l.next()) >= 0 {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
return true
}
l.backup()
@ -187,34 +304,10 @@ func (l *Lexer) acceptWhile(runes string) bool {
return accepted
}
// skip skips a run of runes from the set of accepted runes.
func (l *Lexer) skip(runes string) {
if l.acceptWhile(runes) {
l.ignore()
}
}
// skipUntil skips a run of runes, until a rune from the set of
// runes or EOF is reached.
func (l *Lexer) skipUntil(runes string) {
if l.acceptUntil(runes) {
l.ignore()
}
}
// TODO meh... ugly rune.
var endOfFile rune = -1
// next returns the next rune in the input.
func (l *Lexer) next() rune {
if l.atEndOfFile() {
l.width = 0
return endOfFile // TODO phase out this bizarro rune?
}
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
l.width = w
l.pos += w
return r
l.acceptUntil(runes)
}
// error returns an error token and terminates the scan
@ -227,15 +320,16 @@ func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
return nil
}
func (l *Lexer) unexpectedTokenError(expected string) stateFn {
func (l *Lexer) unexpectedInputError(expected string) stateFn {
var actual string
switch {
case l.peek() == endOfFile:
case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring?
actual = "end of file"
case !utf8.ValidString(l.input[l.start:]):
case !utf8.ValidString(l.input[l.pos:]):
actual = "non-UTF8 data"
default:
actual = fmt.Sprintf("token '%c'", l.peek())
r, _, _ := l.peek()
actual = fmt.Sprintf("token '%c'", r)
}
return l.errorf("Unexpected %s (expected %s)", actual, expected)
}

View File

@ -1,175 +0,0 @@
package lexer_test
import (
"fmt"
"testing"
"github.com/mmakaay/toml/lexer"
)
func TestInvalidUtf8Data(t *testing.T) {
assertFailureAndCheck(t, "\xbc", []string{}, "Unexpected non-UTF8 data (expected end of file)")
}
func TestEmptyInput(t *testing.T) {
assertSuccessAndCheck(t, "", []string{})
}
func TestWhiteSpace(t *testing.T) {
assertSuccessAndCheck(t, " ", []string{})
assertSuccessAndCheck(t, "\t", []string{})
assertSuccessAndCheck(t, " \t \t ", []string{})
}
func TestWhiteSpaceAndNewlines(t *testing.T) {
assertSuccessAndCheck(t, "\n", []string{})
assertSuccessAndCheck(t, "\n \t\r\n", []string{})
}
func TestComments(t *testing.T) {
assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`})
assertSuccessAndCheck(t,
"# two lines\n# of comments\n",
[]string{`Comment("# two lines")`, `Comment("# of comments")`})
assertSuccessAndCheck(t,
`# \tcomment\nwith escape-y chars`,
[]string{`Comment("# \\tcomment\\nwith escape-y chars")`})
}
func TestBareKeyWithoutValue(t *testing.T) {
err := "Unexpected end of file (expected an '=' value assignment)"
assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
assertFailureAndCheck(t, "_", []string{`Key("_")`}, err)
assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
assertFailureAndCheck(t, "Ab", []string{`Key("Ab")`}, err)
assertFailureAndCheck(t, "Ab1", []string{`Key("Ab1")`}, err)
assertFailureAndCheck(t, "_Ab1", []string{`Key("_Ab1")`}, err)
assertFailureAndCheck(t, "_-Ab1", []string{`Key("_-Ab1")`}, err)
assertFailureAndCheck(t, "_-Ab1_this-is_GOOD987", []string{`Key("_-Ab1_this-is_GOOD987")`}, err)
}
func TestDottedKey(t *testing.T) {
err := "Unexpected end of file (expected an '=' value assignment)"
assertFailureAndCheck(t, "a.b", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
assertFailureAndCheck(t, " a .\t\t b\t ", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)"
assertFailureAndCheck(t, " some_cool_key = ", []string{`Key("some_cool_key")`}, err)
}
func TestUnterminatedBasicString(t *testing.T) {
assertFailure(t, `key="value`, "Unexpected end of file (expected basic string token)")
}
func TestBasicStringWithNewline(t *testing.T) {
assertFailure(t, "key=\"value\nwith\nnewlines\"", "ohoh")
}
func TestEmptyBasicString(t *testing.T) {
assertSuccessAndCheck(t, `a=""`, []string{`Key("a")`, `String("")`})
assertSuccessAndCheck(t, `a=""#hi`, []string{`Key("a")`, `String("")`, `Comment("#hi")`})
assertSuccessAndCheck(t, `a = ""`, []string{`Key("a")`, `String("")`})
assertSuccessAndCheck(t, `a.b = ""`, []string{`Key("a")`, `KeyDot(".")`, `Key("b")`, `String("")`})
assertSuccessAndCheck(t, `a=""b=""`, []string{`Key("a")`, `String("")`, `Key("b")`, `String("")`})
}
func TestBasicString(t *testing.T) {
assertSuccessAndCheck(t, `_ = "b"`,
[]string{
`Key("_")`,
`String("b")`})
assertSuccessAndCheck(t, `thing = "A cool ʎǝʞ" # huh, it's up-side down!!`,
[]string{
`Key("thing")`,
`String("A cool ʎǝʞ")`,
`Comment("# huh, it's up-side down!!")`})
}
func TestInvalidEscapeSequence(t *testing.T) {
assertFailure(t, `a="\x"`, `Invalid escape sequence \x in string value`)
}
func TestBasicStringEscapes(t *testing.T) {
for in, out := range map[string]string{
`\b`: "\b",
`\t`: "\t",
`\n`: "\n",
`\f`: "\f",
`\r`: "\r",
`\"`: "\"",
`\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"",
`\u2318`: "⌘",
`\U0001014D`: "𐅍",
} {
l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
if out != l[1].Value {
t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value)
}
}
}
// func TestBasicStringUnicodeEscapes(t *testing.T) {
// for in, out := range map[string]string{
// `\u`: "\b",
// } {
// l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
// s := l[2]
// if out != s.Value {
// t.Fatalf("Unexpected result when parsing '%s'", in)
// }
// }
// }
func TestTwoKeyValuePairs(t *testing.T) {
assertSuccessAndCheck(t, "a=\"Hello\" #comment1\nb=\"World!\"#comment2\r\n",
[]string{
`Key("a")`,
`String("Hello")`,
`Comment("#comment1")`,
`Key("b")`,
`String("World!")`,
`Comment("#comment2")`})
}
func assertSuccessAndCheck(t *testing.T, input string, expected []string) {
l := assertSuccess(t, input)
assertItems(t, l, expected)
}
func assertFailureAndCheck(t *testing.T, input string, expected []string, expectedErr string) {
l := assertFailure(t, input, expectedErr)
assertItems(t, l, expected)
}
func assertFailure(t *testing.T, input string, expectedErr string) []lexer.Item {
l, err := lexer.Lex(input).ToArray()
if err == nil {
t.Fatalf("Expected lexer error '%s', but no error occurred", expectedErr)
}
if err.Error() != expectedErr {
t.Fatalf("Mismatch between expected and actual error:\nExpected: %s\nActual: %s\n", expectedErr, err)
}
return l
}
func assertSuccess(t *testing.T, input string) []lexer.Item {
l, err := lexer.Lex(input).ToArray()
if err != nil {
t.Fatalf("Unexpected lexer error: %s", err)
}
return l
}
func assertItems(t *testing.T, l []lexer.Item, expected []string) {
if len(expected) != len(l) {
t.Fatalf("Unexpected number of lexer items: %d (expected: %d)", len(l), len(expected))
}
for i, e := range expected {
if l[i].String() != e {
t.Fatalf("Unexpected lexer item at index %d: %s (expected: %s)", i, l[i], e)
}
}
}

View File

@ -1,6 +1,6 @@
package lexer
// stateFn represents the state of the scanner as a function
// stateFn represents the state of the lexer as a function
// that returns the next state.
type stateFn func(*Lexer) stateFn
@ -19,15 +19,17 @@ const (
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
someQuote string = singleQuote + doubleQuote
bareKey string = lower + upper + digits + underscore + dash
startOfKey string = bareKey + someQuote
quotable string = `btnfr\"`
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
func stateKeyValuePair(l *Lexer) stateFn {
l.skip(whitespace + carriageReturn + newline)
if l.upcoming(hash) {
l.skipRun(whitespace + carriageReturn + newline)
if l.skip(hash) {
return stateComment
}
if l.upcoming(startOfKey) {
@ -38,36 +40,34 @@ func stateKeyValuePair(l *Lexer) stateFn {
// A '#' hash symbol marks the rest of the line as a comment.
func stateComment(l *Lexer) stateFn {
l.buffer.Reset()
for {
switch {
case l.atEndOfFile() || l.accept(newline):
s := l.buffer.AsLiteralString()
l.emit(ItemComment, s)
case l.atEndOfFile() || l.skip(newline):
l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair
case l.accept(carriageReturn):
l.ignore()
default:
l.buffer.WriteRune(l.next())
if !l.acceptNext() {
return nil
}
}
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *Lexer) stateFn {
if l.upcoming(bareKey) {
return stateBareKey
if l.acceptFrom(bareKeyChars) {
return statebareKeyChars
}
return l.unexpectedTokenError("a valid key name")
return l.unexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func stateBareKey(l *Lexer) stateFn {
l.acceptWhile(bareKey)
l.emit(ItemKey, l.getAcceptedString())
func statebareKeyChars(l *Lexer) stateFn {
l.acceptRun(bareKeyChars)
l.emitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
@ -76,10 +76,10 @@ func stateBareKey(l *Lexer) stateFn {
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.skip(whitespace)
if l.accept(dot) {
l.emit(ItemKeyDot, ".")
l.skip(whitespace)
l.skipRun(whitespace)
if l.skip(dot) {
l.emit(ItemKeyDot, "")
l.skipRun(whitespace)
return stateKey
}
return stateKeyAssignment
@ -90,70 +90,57 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *Lexer) stateFn {
l.skip(whitespace)
if l.accept(equal) {
l.skip(whitespace)
l.skipRun(whitespace)
if l.skip(equal) {
l.emit(ItemAssignment, "")
l.skipRun(whitespace)
return stateValue
}
return l.unexpectedTokenError("an '=' value assignment")
return l.unexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *Lexer) stateFn {
l.skip(whitespace)
if l.upcoming(someQuote) {
l.skipRun(whitespace)
if l.upcoming(quoteChars) {
return stateStringValue
}
return l.unexpectedTokenError("a value")
return l.unexpectedInputError("a value")
}
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *Lexer) stateFn {
if l.accept(doubleQuote) {
// Basic strings are surrounded by quotation marks.
if l.skip(doubleQuote) {
return stateBasicStringValue
}
return l.unexpectedTokenError("a string value")
return l.unexpectedInputError("a string value")
}
// Basic strings are surrounded by quotation marks. Any Unicode character
// may be used except those that must be escaped: quotation mark, backslash,
// and the control characters (U+0000 to U+001F, U+007F).
//
// For convenience, some popular characters have a compact escape sequence.
//
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
// \uXXXX - unicode (U+XXXX)
// \UXXXXXXXX - unicode (U+XXXXXXXX)
//
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
//
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
func stateBasicStringValue(l *Lexer) stateFn {
// Possibly a """ multi-line string start,
// possibly the end of an "" empty string.
if l.accept(doubleQuote) {
if l.skip(doubleQuote) {
// It's a """ multi-line string.
if l.accept(doubleQuote) {
l.ignore()
if l.skip(doubleQuote) {
return stateMultiLineBasicString
}
// It's an "" empty string.
l.ignore()
l.emit(ItemString, "")
return stateKeyValuePair
}
l.ignore()
return stateBasicString
}
const invalidBasicStringCharacters string = "" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *Lexer) stateFn {
for {
switch {
@ -162,26 +149,47 @@ func stateParseBasicString(l *Lexer) stateFn {
case l.accept(doubleQuote):
return l.popState()
case l.accept(backslash):
if l.upcoming(quotable) {
// For convenience, some popular characters have a compact escape sequence.
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
switch {
case l.upcoming(escapeChars):
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
l.buffer.WriteRune('\\')
l.buffer.WriteRune(l.next())
} else {
case l.upcoming(shortUtf8Escape):
// \uXXXX - unicode (U+XXXX)
return l.errorf("Not yet implemented: short utf8")
case l.upcoming(longUtf8Escape):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
return l.errorf("Not yet implemented: long utf8")
default:
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
}
case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next())
default:
l.buffer.WriteRune(l.next())
l.acceptNext()
}
}
}
func stateBasicString(l *Lexer) stateFn {
l.buffer.Reset()
l.pushState(func(l *Lexer) stateFn {
s, err := l.buffer.AsInterpretedString()
err := l.emitInterpreted(ItemString)
if err != nil {
return l.errorf("Invalid data in string: %s", err)
}
l.emit(ItemString, s)
return stateKeyValuePair
})
return stateParseBasicString
@ -192,10 +200,9 @@ func stateMultiLineBasicString(l *Lexer) stateFn {
}
func stateEndOfFile(l *Lexer) stateFn {
i := l.peek()
if i == endOfFile {
if l.atEndOfFile() {
l.emit(ItemEOF, "EOF")
return nil
}
return l.unexpectedTokenError("end of file")
return l.unexpectedInputError("end of file")
}

174
lexer/states_test.go Normal file
View File

@ -0,0 +1,174 @@
package lexer_test
import (
"fmt"
"strings"
"testing"
"github.com/mmakaay/toml/lexer"
)
func TestInvalidUtf8Data(t *testing.T) {
runStatesT(t, statesT{
"invalid UTF8 data", "\xbc", "",
"Unexpected non-UTF8 data (expected end of file)"})
}
func TestEmptyInput(t *testing.T) {
runStatesT(t, statesT{"empty string", "", "", ""})
}
func TestWhiteSpaceAndNewlines(t *testing.T) {
runStatesTs(t, []statesT{
{"space", " ", "", ""},
{"tab", "\t", "", ""},
{"newline", "\n", "", ""},
{"carriage return", "\r", "", ""},
{"all whitespace and newlines", " \t \t \r\r\n\n \n \t", "", ""},
})
}
func TestComments(t *testing.T) {
runStatesTs(t, []statesT{
{"empty comment", "#", "#()", ""},
{"empty comment with spaces", "# \t \r\n", `#()`, ""},
{"basic comment", "#chicken", "#(chicken)", ""},
{"basic comment starting after whitespace", "# \tchicken", "#(chicken)", ""},
{"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""},
{"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""},
{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""},
{"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""},
})
}
func TestKeyWithoutAssignment(t *testing.T) {
err := "Unexpected end of file (expected a value assignment)"
runStatesTs(t, []statesT{
{"bare with whitespace", " a ", []string{"[a]"}, err},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", []string{"[abcdefghijklmnopqrstuvwxyz]"}, err},
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", []string{"[ABCDEFGHIJKLMNOPQRSTUVWXYZ]"}, err},
{"bare numbers", "0123456789", []string{"[0123456789]"}, err},
{"bare underscore", "_", []string{"[_]"}, err},
{"bare dash", "-", []string{"[-]"}, err},
{"bare big mix", "-hey_good_Lookin123-", []string{"[-hey_good_Lookin123-]"}, err},
{"bare dotted", "a._.c", []string{"[a]", ".", "[_]", ".", "[c]"}, err},
{"bare dotted with whitespace", " a .\t\t b\t ", []string{"[a]", ".", "[b]"}, err},
})
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)"
runStatesTs(t, []statesT{
{"bare", "a=", "[a]=", err},
{"double equal sign", "a==", "[a]=", "Unexpected token '=' (expected a value)"},
{"bare dotted", "a.b=", "[a].[b]=", err},
{"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err},
})
}
func TestUnterminatedBasicString(t *testing.T) {
runStatesT(t, statesT{
"missing closing quote", `a="value`, "[a]=",
"Unexpected end of file (expected basic string token)"})
}
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
runStatesTs(t, []statesT{
{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`},
{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`},
{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`},
})
// No need to write all test cases for disallowed characters by hand.
for i := 0x00; i <= 0x1F; i++ {
name := fmt.Sprintf("control character %x", rune(i))
runStatesT(
t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))})
}
}
func TestEmptyBasicString(t *testing.T) {
runStatesTs(t, []statesT{
{"empty", `a=""`, "[a]=STR()", ""},
{"with comment", `a="" #cool`, "[a]=STR()#(cool)", ""},
{"with whitespaces", ` a = "" `, "[a]=STR()", ""},
{"dotted", ` a.b = "" `, "[a].[b]=STR()", ""},
{"multiple same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""},
{"multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""},
})
}
func TestBasicString(t *testing.T) {
runStatesTs(t, []statesT{
{"ascii value", `_ = "Nothing fancy!"`, "[_]=STR(Nothing fancy!)", ""},
{"UTF8 value", `_ = "A cool ƃuıɹʇs" # what!?`, "[_]=STR(A cool ƃuıɹʇs)#(what!?)", ""},
})
}
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
runStatesT(t, statesT{
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
})
}
func TestBasicStringEscapes(t *testing.T) {
runStatesTs(t, []statesT{
{"bell escape", `_="\b"`, "[_]=STR(\b)", ""},
{"tab escape", `_="\t"`, "[_]=STR(\t)", ""},
{"newline escape", `_="\n"`, "[_]=STR(\n)", ""},
{"form feed escape", `_="\f"`, "[_]=STR(\f)", ""},
{"carriage return escape", `_="\r"`, "[_]=STR(\r)", ""},
{"double quote escape", `_="\""`, `[_]=STR(")`, ""},
{"backslash escape", `_="\\"`, `[_]=STR(\)`, ""},
{"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""},
{"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""},
{"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""},
})
}
type statesT struct {
name string
in string
out interface{}
err string
}
func runStatesTs(t *testing.T, tests []statesT) {
for _, c := range tests {
runStatesT(t, c)
}
}
func runStatesT(t *testing.T, c statesT) {
l, err := lexer.Lex(c.in).ToArray()
if err == nil && c.err != "" {
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
}
if err != nil && c.err == "" {
t.Errorf("[%s] Expected no error, but got error '%s'", c.name, err)
}
if err != nil && c.err != "" && err.Error() != c.err {
t.Errorf("[%s] Got an unexpected error:\nexpected: %s\nactual: %s\n", c.name, c.err, err)
}
switch expected := c.out.(type) {
case []string:
if len(expected) != len(l) {
t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
}
for i, e := range expected {
if l[i].String() != e {
t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i])
}
}
case string:
a := make([]string, len(l))
for _, v := range l {
a = append(a, v.String())
}
actual := strings.Join(a, "")
if actual != expected {
t.Errorf("[%s] Unexpected lexer output:\nexpected; %s\nactual: %s\n", c.name, expected, actual)
}
}
}

View File

@ -19,7 +19,7 @@ func (b *StringBuffer) Reset() *StringBuffer {
return b
}
// AddString adds the runes of the input string to the string buffer.
// WriteString adds the runes of the input string to the string buffer.
func (b *StringBuffer) WriteString(s string) *StringBuffer {
for _, r := range s {
b.WriteRune(r)

View File

@ -23,7 +23,7 @@ func TestResetResetsBuffer(t *testing.T) {
}
}
type testCase struct {
type stringbufT struct {
name string
in string
out string
@ -37,7 +37,7 @@ const (
func TestAsLiteralString(t *testing.T) {
b := lexer.StringBuffer{}
for _, c := range []testCase{
for _, c := range []stringbufT{
{"empty string", ``, ``, OK},
{"simple string", `Simple string!`, `Simple string!`, OK},
{"single quote", `'`, `'`, OK},
@ -57,7 +57,7 @@ func TestAsLiteralString(t *testing.T) {
func TestAsInterpretedString(t *testing.T) {
b := lexer.StringBuffer{}
for _, c := range []testCase{
for _, c := range []stringbufT{
{"empty string", "", "", OK},
{"one character", "Simple string!", "Simple string!", OK},
{"escaped single quote", `\'`, "", FAIL},