Implemented a separate lexer.StringBuffer, so that string building code no longer pollutes the lexer code. The string buffer can provide the built string either as a literal, as-is string (in TOML: between single quotes) or as an interpreted string (in TOML: between double quotes).

This commit is contained in:
Ubuntu 2019-05-15 22:47:06 +00:00
parent 866a928f57
commit 6636a7a672
6 changed files with 220 additions and 82 deletions
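Below is a minimal sketch of how the two output modes are meant to be used, based on the StringBuffer API added in lexer/stringbuf.go in this commit (the import path is taken from the test file; the example itself is illustrative and not part of the commit):

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    var b lexer.StringBuffer
    b.WriteString(`line one\nline two`)

    // Literal form: the escape sequence stays as-is (TOML single-quoted strings).
    fmt.Println(b.AsLiteralString()) // line one\nline two

    // Interpreted form: the escape sequence becomes a real newline
    // (TOML double-quoted strings).
    s, err := b.AsInterpretedString()
    if err != nil {
        fmt.Println("invalid escape sequence:", err)
        return
    }
    fmt.Println(s)
}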

View File

@@ -17,7 +17,7 @@ const (
 // Item represents a lexer item returned from the scanner.
 type Item struct {
-    Type  itemType // Type, e.g. itemNumber, itemSquareBracket
+    Type  itemType // Type, e.g. ItemComment, ItemString
     Value string   // Value, e.g. "10.42", "["
 }

View File

@@ -7,18 +7,18 @@ import (
     "unicode/utf8"
 )

-// Lexer holds the state of the scanner.
+// Lexer holds the state of the lexer.
 type Lexer struct {
     input    string // the scanned input string
-    state    stateFn // the current state
-    stack    []stateFn // state stack, for nested parsing
+    state    stateFn // a function that handles the current state
+    stack    []stateFn // state function stack, for nested parsing
     start    int // start position of the currently scanned item
     pos      int // current scanning position in the input
-    width    int // width of the last rune read
-    strValue strings.Builder // used to build string values
-    items    chan Item // channel of scanned items
+    width    int // width of the last rune read, for supporting backup()
+    buffer   StringBuffer // an efficient buffer, used to build string values
+    items    chan Item // channel of resulting lexer items
     nextItem Item // the current item as reached by Next() and retrieved by Get()
     err      error // an error message when lexing failed, retrieved by Error()
 }

 // Lex takes an input string and initializes the TOML lexer for it.
@@ -69,6 +69,11 @@ func (l *Lexer) Error() error {
     return l.err
 }

+// Get returns the next lexer item, as reached by Next()
+func (l *Lexer) Get() Item {
+    return l.nextItem
+}
+
 // ToArray returns lexer items as an array.
 // When an error occurs during scanning, a partial result will be
 // returned, accompanied by the error that occurred.
@@ -80,11 +85,6 @@ func (l *Lexer) ToArray() ([]Item, error) {
     return items, l.Error()
 }

-// Get returns the next lexer item, as reached by Next()
-func (l *Lexer) Get() Item {
-    return l.nextItem
-}
-
 // pushState adds the state function to its stack.
 // This is used for implementing nested parsing.
 func (l *Lexer) pushState(state stateFn) {
@@ -99,6 +99,7 @@ func (l *Lexer) popState() stateFn {
     return tail
 }

+// TODO no longer needed?
 // getAcceptedString returns the string as accepted by the
 // accept* methods so far.
 func (l *Lexer) getAcceptedString() string {
@@ -111,6 +112,7 @@ func (l *Lexer) emit(t itemType, v string) {
     l.start = l.pos
 }

+// TODO no longer needed with the string builder?
 // ignore skips over the pending input before the current position.
 func (l *Lexer) ignore() {
     l.start = l.pos
@@ -133,6 +135,7 @@ func (l *Lexer) peek() rune {
     return r
 }

+// TODO still needed with the StringBuffer?
 // accept consumes the next rune if it's from the valid set of runes.
 func (l *Lexer) accept(runes string) bool {
     if strings.IndexRune(runes, l.next()) >= 0 {
@@ -150,6 +153,7 @@ func (l *Lexer) upcoming(runes string) bool {
     return false
 }

+// TODO still needed with the StringBuffer?
 // acceptNot consumes the next rune if it's not from the set of runes.
 func (l *Lexer) acceptNot(runes string) bool {
     r := l.next()
@@ -198,24 +202,7 @@ func (l *Lexer) skipUntil(runes string) {
     }
 }

-// resetStringBuild initializes a new string builder, used for building
-// string by interpreting input data, e.g. for translating
-// double quoted strings with escape codes into an actual
-// Go string value.
-func (l *Lexer) resetStringBuilder() {
-    l.strValue.Reset()
-}
-
-// addToString adds a rune to the string builder.
-func (l *Lexer) addToString(r rune) {
-    l.strValue.WriteRune(r)
-}
-
-// getString returns the runes in the string builder as a string value.
-func (l *Lexer) getString() string {
-    return l.strValue.String()
-}
-
+// TODO meh... ugly rune.
 var endOfFile rune = -1

 // next returns the next rune in the input.
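For context, a rough sketch of how the lexer is driven from the outside, assuming Lex() returns a *Lexer as the comments above suggest (this example is illustrative and not part of the commit):

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    // ToArray collects all items; on a lexing error it returns the partial
    // result together with that error.
    items, err := lexer.Lex(`answer = "forty two"`).ToArray()
    if err != nil {
        fmt.Println("lexer error:", err)
    }
    for _, item := range items {
        fmt.Printf("%v: %q\n", item.Type, item.Value)
    }
}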

View File

@@ -23,19 +23,24 @@ func TestWhiteSpaceAndNewlines(t *testing.T) {
     assertSuccessAndCheck(t, "\n", []string{})
     assertSuccessAndCheck(t, "\n \t\r\n", []string{})
 }

-func TestWhitespacePlusComment(t *testing.T) {
+func TestComments(t *testing.T) {
     assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
     assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
     assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
     assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
     assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`})
-    assertSuccessAndCheck(t, "# two lines\n# of comments\n",
+    assertSuccessAndCheck(t,
+        "# two lines\n# of comments\n",
         []string{`Comment("# two lines")`, `Comment("# of comments")`})
+    assertSuccessAndCheck(t,
+        `# \tcomment\nwith escape-y chars`,
+        []string{`Comment("# \\tcomment\\nwith escape-y chars")`})
 }

 func TestBareKeyWithoutValue(t *testing.T) {
     err := "Unexpected end of file (expected an '=' value assignment)"
     assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
+    assertFailureAndCheck(t, "_", []string{`Key("_")`}, err)
     assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
     assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
     assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
@@ -89,17 +94,19 @@ func TestInvalidEscapeSequence(t *testing.T) {
 }

 func TestBasicStringEscapes(t *testing.T) {
     for in, out := range map[string]string{
         `\b`: "\b",
         `\t`: "\t",
         `\n`: "\n",
         `\f`: "\f",
         `\r`: "\r",
         `\"`: "\"",
-        `\b\t\n\f\r\"`: "\b\t\n\f\r\"",
+        `\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"",
+        `\u2318`:          "⌘",
+        `\U0001014D`:      "𐅍",
     } {
         l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
         if out != l[1].Value {
-            t.Fatalf("Unexpected result when parsing '%s'", in)
+            t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value)
         }
     }
 }

View File

@@ -22,6 +22,7 @@ const (
     someQuote  string = singleQuote + doubleQuote
     bareKey    string = lower + upper + digits + underscore + dash
     startOfKey string = bareKey + someQuote
+    quotable   string = `btnfr\"`
 )

 func stateKeyValuePair(l *Lexer) stateFn {
@@ -37,16 +38,17 @@ func stateKeyValuePair(l *Lexer) stateFn {
 // A '#' hash symbol marks the rest of the line as a comment.
 func stateComment(l *Lexer) stateFn {
-    l.resetStringBuilder()
+    l.buffer.Reset()
     for {
         switch {
         case l.atEndOfFile() || l.accept(newline):
-            l.emit(ItemComment, l.getString())
+            s := l.buffer.AsLiteralString()
+            l.emit(ItemComment, s)
             return stateKeyValuePair
         case l.accept(carriageReturn):
             l.ignore()
         default:
-            l.addToString(l.next())
+            l.buffer.WriteRune(l.next())
         }
     }
 }
@@ -113,24 +115,6 @@ func stateStringValue(l *Lexer) stateFn {
     return l.unexpectedTokenError("a string value")
 }

-func stateBasicStringValue(l *Lexer) stateFn {
-    // Possibly a """ multi-line string start,
-    // possibly the end of an "" empty string.
-    if l.accept(doubleQuote) {
-        // A """ multi-line string.
-        if l.accept(doubleQuote) {
-            l.ignore()
-            return stateMultiLineBasicString
-        }
-        // An "" empty string.
-        l.ignore()
-        l.emit(ItemString, "")
-        return stateKeyValuePair
-    }
-    l.ignore()
-    return stateBasicString
-}
-
 // Basic strings are surrounded by quotation marks. Any Unicode character
 // may be used except those that must be escaped: quotation mark, backslash,
 // and the control characters (U+0000 to U+001F, U+007F).
@@ -152,15 +136,22 @@ func stateBasicStringValue(l *Lexer) stateFn {
 //
 // All other escape sequences not listed above are reserved and,
 // if used, TOML should produce an error.
-var basicEscapes = map[rune]rune{
-    'b':  rune(8),
-    't':  rune(9),
-    'n':  rune(10),
-    'f':  rune(12),
-    'r':  rune(13),
-    '"':  rune(34),
-    '\\': rune(92),
+func stateBasicStringValue(l *Lexer) stateFn {
+    // Possibly a """ multi-line string start,
+    // possibly the end of an "" empty string.
+    if l.accept(doubleQuote) {
+        // It's a """ multi-line string.
+        if l.accept(doubleQuote) {
+            l.ignore()
+            return stateMultiLineBasicString
+        }
+        // It's an "" empty string.
+        l.ignore()
+        l.emit(ItemString, "")
+        return stateKeyValuePair
+    }
+    l.ignore()
+    return stateBasicString
 }

 func stateParseBasicString(l *Lexer) stateFn {
@@ -171,22 +162,26 @@ func stateParseBasicString(l *Lexer) stateFn {
         case l.accept(doubleQuote):
             return l.popState()
         case l.accept(backslash):
-            r := l.next()
-            if escaped, ok := basicEscapes[r]; ok {
-                l.addToString(escaped)
+            if l.upcoming(quotable) {
+                l.buffer.WriteRune('\\')
+                l.buffer.WriteRune(l.next())
             } else {
-                return l.errorf("Invalid escape sequence \\%c in string value", r)
+                return l.errorf("Invalid escape sequence \\%c in string value", l.next())
             }
         default:
-            l.addToString(l.next())
+            l.buffer.WriteRune(l.next())
         }
     }
 }

 func stateBasicString(l *Lexer) stateFn {
-    l.resetStringBuilder()
+    l.buffer.Reset()
     l.pushState(func(l *Lexer) stateFn {
-        l.emit(ItemString, l.getString())
+        s, err := l.buffer.AsInterpretedString()
+        if err != nil {
+            return l.errorf("Invalid data in string: %s", err)
+        }
+        l.emit(ItemString, s)
         return stateKeyValuePair
     })
     return stateParseBasicString
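A hedged illustration of the two-phase escape handling above: stateParseBasicString only accepts a backslash when the next rune is in the quotable set and copies the raw sequence into the buffer; the translation into an actual character happens later, in AsInterpretedString(), when the string item is emitted. The sketch assumes Lex() returns a *Lexer and that a key/value line yields a Key item followed by a String item, as the lexer tests suggest:

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    // 't' is in the quotable set, so the raw `\t` is buffered as-is and
    // expanded to a real tab when the ItemString is emitted.
    items, err := lexer.Lex(`key = "a\tb"`).ToArray()
    if err == nil && len(items) > 1 {
        fmt.Printf("%q\n", items[1].Value) // expected: "a\tb"
    }

    // 'x' is not in the quotable set, so lexing stops with
    // "Invalid escape sequence \x in string value" and a partial result.
    _, err = lexer.Lex(`key = "a\xb"`).ToArray()
    fmt.Println(err)
}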

lexer/stringbuf.go (new file, 62 lines added)
View File

@@ -0,0 +1,62 @@
package lexer

import (
    "bytes"
    "strconv"
    "strings"
)

// StringBuffer is a string buffer implementation, which is used by the lexer
// to efficiently accumulate runes from the input and eventually turn these
// into a string, either literal or interpreted.
type StringBuffer struct {
    buffer bytes.Buffer
}

// Reset resets the string buffer, in order to build a new string.
func (b *StringBuffer) Reset() *StringBuffer {
    b.buffer.Reset()
    return b
}

// WriteString adds the runes of the input string to the string buffer.
func (b *StringBuffer) WriteString(s string) *StringBuffer {
    for _, r := range s {
        b.WriteRune(r)
    }
    return b
}

// WriteRune adds a single rune to the string buffer.
func (b *StringBuffer) WriteRune(r rune) *StringBuffer {
    b.buffer.WriteRune(r)
    return b
}

// AsLiteralString returns the string buffer as a literal string.
// Literal means that no escape sequences are processed.
func (b *StringBuffer) AsLiteralString() string {
    return b.buffer.String()
}

// AsInterpretedString returns the string in its interpreted form.
// Interpreted means that escape sequences are handled the way Go would
// have handled them inside double quotes. It translates, for example,
// escape sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their
// string representations.
// Since the input might contain invalid escape sequences, this method
// also returns an error. When an error is returned, the returned string
// contains the part of the input that could be interpreted.
func (b *StringBuffer) AsInterpretedString() (string, error) {
    var sb strings.Builder
    tail := b.buffer.String()
    for len(tail) > 0 {
        r, _, newtail, err := strconv.UnquoteChar(tail, '"')
        if err != nil {
            return sb.String(), err
        }
        tail = newtail
        sb.WriteRune(r)
    }
    return sb.String(), nil
}
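A note on the strconv.UnquoteChar call above: passing '"' as the quote argument means the buffer is interpreted under Go's double-quote rules, so \u and \U escapes are expanded, while a sequence like \' (only valid inside single-quoted Go literals) is rejected with an error. A small standalone illustration (not part of the commit):

package main

import (
    "fmt"
    "strconv"
)

func main() {
    // A \u escape is consumed as a single rune under double-quote rules.
    r, _, tail, err := strconv.UnquoteChar(`\u2318rest`, '"')
    fmt.Println(string(r), tail, err) // ⌘ rest <nil>

    // \' is only valid inside single quotes, so it yields an error here.
    _, _, _, err = strconv.UnquoteChar(`\'`, '"')
    fmt.Println(err) // invalid syntax
}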

lexer/stringbuf_test.go (new file, 87 lines added)
View File

@@ -0,0 +1,87 @@
package lexer_test

import "testing"

import "github.com/mmakaay/toml/lexer"

func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
    var b lexer.StringBuffer
    s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString()
    s2 := b.AsLiteralString()
    if s1 != "hi\nthere" {
        t.Fatalf("Did not get the expected interpreted string for try 1, but %q", s1)
    }
    if s2 != "hi\\nthere" {
        t.Fatalf("Did not get the expected literal string for try 2, but %q", s2)
    }
}

func TestResetResetsBuffer(t *testing.T) {
    var b lexer.StringBuffer
    s := b.WriteRune('X').Reset().AsLiteralString()
    if s != "" {
        t.Fatalf("Did not get the expected empty string, but %q", s)
    }
}

type testCase struct {
    name          string
    in            string
    out           string
    isSuccessCase bool
}

const (
    OK   bool = true
    FAIL bool = false
)

func TestAsLiteralString(t *testing.T) {
    b := lexer.StringBuffer{}
    for _, c := range []testCase{
        {"empty string", ``, ``, OK},
        {"simple string", `Simple string!`, `Simple string!`, OK},
        {"single quote", `'`, `'`, OK},
        {"double quote", `"`, `"`, OK},
        {"escaped single quote", `\'`, `\'`, OK},
        {"escaped double quote", `\"`, `\"`, OK},
        {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
        {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
        {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
    } {
        s := b.Reset().WriteString(c.in).AsLiteralString()
        if s != c.out {
            t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
        }
    }
}

func TestAsInterpretedString(t *testing.T) {
    b := lexer.StringBuffer{}
    for _, c := range []testCase{
        {"empty string", "", "", OK},
        {"simple string", "Simple string!", "Simple string!", OK},
        {"escaped single quote", `\'`, "", FAIL},
        {"escaped double quote", `\"`, `"`, OK},
        {"bare single quote", `'`, "'", OK},
        {"string in single quotes", `'Hello'`, `'Hello'`, OK},
        {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
        {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
        {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
        {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
        {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
        {"example from spec",
            `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
            "I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
    } {
        s, err := b.Reset().WriteString(c.in).AsInterpretedString()
        if c.isSuccessCase && err != nil {
            t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err)
        }
        if !c.isSuccessCase && err == nil {
            t.Fatalf("[%s] expected a failure, but no failure occurred", c.name)
        }
        if c.isSuccessCase && s != c.out {
            t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
        }
    }
}