Implemented a separate lexer.StringBuffer, so that string building code no longer pollutes the lexer code. The string buffer can provide the built string either as a literal, as-is string (in TOML: between single quotes) or as an interpreted string (in TOML: between double quotes).

This commit is contained in:
Ubuntu 2019-05-15 22:47:06 +00:00
parent 866a928f57
commit 6636a7a672
6 changed files with 220 additions and 82 deletions
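Below is a minimal sketch of how the two output modes are meant to be used, based on the StringBuffer API added in lexer/stringbuf.go in this commit (the import path is taken from the test file; the example itself is illustrative and not part of the commit):

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    var b lexer.StringBuffer
    b.WriteString(`line one\nline two`)

    // Literal form: the escape sequence stays as-is (TOML single-quoted strings).
    fmt.Println(b.AsLiteralString()) // line one\nline two

    // Interpreted form: the escape sequence becomes a real newline
    // (TOML double-quoted strings).
    s, err := b.AsInterpretedString()
    if err != nil {
        fmt.Println("invalid escape sequence:", err)
        return
    }
    fmt.Println(s)
}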

View File

@@ -17,7 +17,7 @@ const (
 // Item represents a lexer item returned from the scanner.
 type Item struct {
-    Type  itemType // Type, e.g. itemNumber, itemSquareBracket
+    Type  itemType // Type, e.g. ItemComment, ItemString
     Value string   // Value, e.g. "10.42", "["
 }

View File

@@ -7,18 +7,18 @@ import (
     "unicode/utf8"
 )

-// Lexer holds the state of the scanner.
+// Lexer holds the state of the lexer.
 type Lexer struct {
     input    string // the scanned input string
-    state    stateFn // the current state
-    stack    []stateFn // state stack, for nested parsing
+    state    stateFn // a function that handles the current state
+    stack    []stateFn // state function stack, for nested parsing
     start    int // start position of the currently scanned item
     pos      int // current scanning position in the input
-    width    int // width of the last rune read
-    strValue strings.Builder // used to build string values
-    items    chan Item // channel of scanned items
+    width    int // width of the last rune read, for supporting backup()
+    buffer   StringBuffer // an efficient buffer, used to build string values
+    items    chan Item // channel of resulting lexer items
     nextItem Item // the current item as reached by Next() and retrieved by Get()
     err      error // an error message when lexing failed, retrieved by Error()
 }

 // Lex takes an input string and initializes the TOML lexer for it.
@@ -69,6 +69,11 @@ func (l *Lexer) Error() error {
     return l.err
 }

+// Get returns the next lexer item, as reached by Next()
+func (l *Lexer) Get() Item {
+    return l.nextItem
+}
+
 // ToArray returns lexer items as an array.
 // When an error occurs during scanning, a partial result will be
 // returned, accompanied by the error that occurred.
@@ -80,11 +85,6 @@ func (l *Lexer) ToArray() ([]Item, error) {
     return items, l.Error()
 }

-// Get returns the next lexer item, as reached by Next()
-func (l *Lexer) Get() Item {
-    return l.nextItem
-}
-
 // pushState adds the state function to its stack.
 // This is used for implementing nested parsing.
 func (l *Lexer) pushState(state stateFn) {
@@ -99,6 +99,7 @@ func (l *Lexer) popState() stateFn {
     return tail
 }

+// TODO no longer needed?
 // getAcceptedString returns the string as accepted by the
 // accept* methods so far.
 func (l *Lexer) getAcceptedString() string {
@@ -111,6 +112,7 @@ func (l *Lexer) emit(t itemType, v string) {
     l.start = l.pos
 }

+// TODO no longer needed with the string builder?
 // ignore skips over the pending input before the current position.
 func (l *Lexer) ignore() {
     l.start = l.pos
@@ -133,6 +135,7 @@ func (l *Lexer) peek() rune {
     return r
 }

+// TODO still needed with the StringBuffer?
 // accept consumes the next rune if it's from the valid set of runes.
 func (l *Lexer) accept(runes string) bool {
     if strings.IndexRune(runes, l.next()) >= 0 {
@@ -150,6 +153,7 @@ func (l *Lexer) upcoming(runes string) bool {
     return false
 }

+// TODO still needed with the StringBuffer?
 // acceptNot consumes the next rune if it's not from the set of runes.
 func (l *Lexer) acceptNot(runes string) bool {
     r := l.next()
@@ -198,24 +202,7 @@ func (l *Lexer) skipUntil(runes string) {
     }
 }

-// resetStringBuild initializes a new string builder, used for building
-// string by interpreting input data, e.g. for translating
-// double quoted strings with escape codes into an actual
-// Go string value.
-func (l *Lexer) resetStringBuilder() {
-    l.strValue.Reset()
-}
-
-// addToString adds a rune to the string builder.
-func (l *Lexer) addToString(r rune) {
-    l.strValue.WriteRune(r)
-}
-
-// getString returns the runes in the string builder as a string value.
-func (l *Lexer) getString() string {
-    return l.strValue.String()
-}
-
+// TODO meh... ugly rune.
 var endOfFile rune = -1

 // next returns the next rune in the input.
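For context, a rough sketch of how the lexer is driven from the outside, assuming Lex() returns a *Lexer as the comments above suggest (this example is illustrative and not part of the commit):

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    // ToArray collects all items; on a lexing error it returns the partial
    // result together with that error.
    items, err := lexer.Lex(`answer = "forty two"`).ToArray()
    if err != nil {
        fmt.Println("lexer error:", err)
    }
    for _, item := range items {
        fmt.Printf("%v: %q\n", item.Type, item.Value)
    }
}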

View File

@@ -23,19 +23,24 @@ func TestWhiteSpaceAndNewlines(t *testing.T) {
     assertSuccessAndCheck(t, "\n", []string{})
     assertSuccessAndCheck(t, "\n \t\r\n", []string{})
 }

-func TestWhitespacePlusComment(t *testing.T) {
+func TestComments(t *testing.T) {
     assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
     assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
     assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
     assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
     assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`})
-    assertSuccessAndCheck(t, "# two lines\n# of comments\n",
+    assertSuccessAndCheck(t,
+        "# two lines\n# of comments\n",
         []string{`Comment("# two lines")`, `Comment("# of comments")`})
+    assertSuccessAndCheck(t,
+        `# \tcomment\nwith escape-y chars`,
+        []string{`Comment("# \\tcomment\\nwith escape-y chars")`})
 }

 func TestBareKeyWithoutValue(t *testing.T) {
     err := "Unexpected end of file (expected an '=' value assignment)"
     assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
+    assertFailureAndCheck(t, "_", []string{`Key("_")`}, err)
     assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
     assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
     assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
@@ -89,17 +94,19 @@ func TestInvalidEscapeSequence(t *testing.T) {
 }

 func TestBasicStringEscapes(t *testing.T) {
     for in, out := range map[string]string{
         `\b`: "\b",
         `\t`: "\t",
         `\n`: "\n",
         `\f`: "\f",
         `\r`: "\r",
         `\"`: "\"",
-        `\b\t\n\f\r\"`: "\b\t\n\f\r\"",
+        `\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"",
+        `\u2318`:          "⌘",
+        `\U0001014D`:      "𐅍",
     } {
         l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
         if out != l[1].Value {
-            t.Fatalf("Unexpected result when parsing '%s'", in)
+            t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value)
         }
     }
 }

View File

@@ -22,6 +22,7 @@ const (
     someQuote  string = singleQuote + doubleQuote
     bareKey    string = lower + upper + digits + underscore + dash
     startOfKey string = bareKey + someQuote
+    quotable   string = `btnfr\"`
 )

 func stateKeyValuePair(l *Lexer) stateFn {
@@ -37,16 +38,17 @@ func stateKeyValuePair(l *Lexer) stateFn {
 // A '#' hash symbol marks the rest of the line as a comment.
 func stateComment(l *Lexer) stateFn {
-    l.resetStringBuilder()
+    l.buffer.Reset()
     for {
         switch {
         case l.atEndOfFile() || l.accept(newline):
-            l.emit(ItemComment, l.getString())
+            s := l.buffer.AsLiteralString()
+            l.emit(ItemComment, s)
             return stateKeyValuePair
         case l.accept(carriageReturn):
             l.ignore()
         default:
-            l.addToString(l.next())
+            l.buffer.WriteRune(l.next())
         }
     }
 }
@@ -113,24 +115,6 @@ func stateStringValue(l *Lexer) stateFn {
     return l.unexpectedTokenError("a string value")
 }

-func stateBasicStringValue(l *Lexer) stateFn {
-    // Possibly a """ multi-line string start,
-    // possibly the end of an "" empty string.
-    if l.accept(doubleQuote) {
-        // A """ multi-line string.
-        if l.accept(doubleQuote) {
-            l.ignore()
-            return stateMultiLineBasicString
-        }
-        // An "" empty string.
-        l.ignore()
-        l.emit(ItemString, "")
-        return stateKeyValuePair
-    }
-    l.ignore()
-    return stateBasicString
-}
-
 // Basic strings are surrounded by quotation marks. Any Unicode character
 // may be used except those that must be escaped: quotation mark, backslash,
 // and the control characters (U+0000 to U+001F, U+007F).
@@ -152,15 +136,22 @@ func stateBasicStringValue(l *Lexer) stateFn {
 //
 // All other escape sequences not listed above are reserved and,
 // if used, TOML should produce an error.
-var basicEscapes = map[rune]rune{
-    'b':  rune(8),
-    't':  rune(9),
-    'n':  rune(10),
-    'f':  rune(12),
-    'r':  rune(13),
-    '"':  rune(34),
-    '\\': rune(92),
+func stateBasicStringValue(l *Lexer) stateFn {
+    // Possibly a """ multi-line string start,
+    // possibly the end of an "" empty string.
+    if l.accept(doubleQuote) {
+        // It's a """ multi-line string.
+        if l.accept(doubleQuote) {
+            l.ignore()
+            return stateMultiLineBasicString
+        }
+        // It's an "" empty string.
+        l.ignore()
+        l.emit(ItemString, "")
+        return stateKeyValuePair
+    }
+    l.ignore()
+    return stateBasicString
 }

 func stateParseBasicString(l *Lexer) stateFn {
@@ -171,22 +162,26 @@ func stateParseBasicString(l *Lexer) stateFn {
         case l.accept(doubleQuote):
             return l.popState()
         case l.accept(backslash):
-            r := l.next()
-            if escaped, ok := basicEscapes[r]; ok {
-                l.addToString(escaped)
+            if l.upcoming(quotable) {
+                l.buffer.WriteRune('\\')
+                l.buffer.WriteRune(l.next())
             } else {
-                return l.errorf("Invalid escape sequence \\%c in string value", r)
+                return l.errorf("Invalid escape sequence \\%c in string value", l.next())
             }
         default:
-            l.addToString(l.next())
+            l.buffer.WriteRune(l.next())
         }
     }
 }

 func stateBasicString(l *Lexer) stateFn {
-    l.resetStringBuilder()
+    l.buffer.Reset()
     l.pushState(func(l *Lexer) stateFn {
-        l.emit(ItemString, l.getString())
+        s, err := l.buffer.AsInterpretedString()
+        if err != nil {
+            return l.errorf("Invalid data in string: %s", err)
+        }
+        l.emit(ItemString, s)
         return stateKeyValuePair
     })
     return stateParseBasicString
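A hedged illustration of the two-phase escape handling above: stateParseBasicString only accepts a backslash when the next rune is in the quotable set and copies the raw sequence into the buffer; the translation into an actual character happens later, in AsInterpretedString(), when the string item is emitted. The sketch assumes Lex() returns a *Lexer and that a key/value line yields a Key item followed by a String item, as the lexer tests suggest:

package main

import (
    "fmt"

    "github.com/mmakaay/toml/lexer"
)

func main() {
    // 't' is in the quotable set, so the raw `\t` is buffered as-is and
    // expanded to a real tab when the ItemString is emitted.
    items, err := lexer.Lex(`key = "a\tb"`).ToArray()
    if err == nil && len(items) > 1 {
        fmt.Printf("%q\n", items[1].Value) // expected: "a\tb"
    }

    // 'x' is not in the quotable set, so lexing stops with
    // "Invalid escape sequence \x in string value" and a partial result.
    _, err = lexer.Lex(`key = "a\xb"`).ToArray()
    fmt.Println(err)
}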

lexer/stringbuf.go (new file, 62 lines added)
View File

@@ -0,0 +1,62 @@
package lexer

import (
    "bytes"
    "strconv"
    "strings"
)

// StringBuffer is a string buffer implementation, which is used by the lexer
// to efficiently accumulate runes from the input and eventually turn these
// into a string, either literal or interpreted.
type StringBuffer struct {
    buffer bytes.Buffer
}

// Reset resets the string buffer, in order to build a new string.
func (b *StringBuffer) Reset() *StringBuffer {
    b.buffer.Reset()
    return b
}

// WriteString adds the runes of the input string to the string buffer.
func (b *StringBuffer) WriteString(s string) *StringBuffer {
    for _, r := range s {
        b.WriteRune(r)
    }
    return b
}

// WriteRune adds a single rune to the string buffer.
func (b *StringBuffer) WriteRune(r rune) *StringBuffer {
    b.buffer.WriteRune(r)
    return b
}

// AsLiteralString returns the string buffer as a literal string.
// Literal means that no escape sequences are processed.
func (b *StringBuffer) AsLiteralString() string {
    return b.buffer.String()
}

// AsInterpretedString returns the string in its interpreted form.
// Interpreted means that escape sequences are handled the way Go would
// have handled them inside double quotes. It translates, for example,
// escape sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their
// string representations.
// Since the input might contain invalid escape sequences, this method
// also returns an error. When an error is returned, the returned string
// contains the part of the input that could be interpreted.
func (b *StringBuffer) AsInterpretedString() (string, error) {
    var sb strings.Builder
    tail := b.buffer.String()
    for len(tail) > 0 {
        r, _, newtail, err := strconv.UnquoteChar(tail, '"')
        if err != nil {
            return sb.String(), err
        }
        tail = newtail
        sb.WriteRune(r)
    }
    return sb.String(), nil
}
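A note on the strconv.UnquoteChar call above: passing '"' as the quote argument means the buffer is interpreted under Go's double-quote rules, so \u and \U escapes are expanded, while a sequence like \' (only valid inside single-quoted Go literals) is rejected with an error. A small standalone illustration (not part of the commit):

package main

import (
    "fmt"
    "strconv"
)

func main() {
    // A \u escape is consumed as a single rune under double-quote rules.
    r, _, tail, err := strconv.UnquoteChar(`\u2318rest`, '"')
    fmt.Println(string(r), tail, err) // ⌘ rest <nil>

    // \' is only valid inside single quotes, so it yields an error here.
    _, _, _, err = strconv.UnquoteChar(`\'`, '"')
    fmt.Println(err) // invalid syntax
}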

lexer/stringbuf_test.go (new file, 87 lines added)
View File

@@ -0,0 +1,87 @@
package lexer_test

import "testing"

import "github.com/mmakaay/toml/lexer"

func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
    var b lexer.StringBuffer
    s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString()
    s2 := b.AsLiteralString()
    if s1 != "hi\nthere" {
        t.Fatalf("Did not get the expected interpreted string for try 1, but %q", s1)
    }
    if s2 != "hi\\nthere" {
        t.Fatalf("Did not get the expected literal string for try 2, but %q", s2)
    }
}

func TestResetResetsBuffer(t *testing.T) {
    var b lexer.StringBuffer
    s := b.WriteRune('X').Reset().AsLiteralString()
    if s != "" {
        t.Fatalf("Did not get the expected empty string, but %q", s)
    }
}

type testCase struct {
    name          string
    in            string
    out           string
    isSuccessCase bool
}

const (
    OK   bool = true
    FAIL bool = false
)

func TestAsLiteralString(t *testing.T) {
    b := lexer.StringBuffer{}
    for _, c := range []testCase{
        {"empty string", ``, ``, OK},
        {"simple string", `Simple string!`, `Simple string!`, OK},
        {"single quote", `'`, `'`, OK},
        {"double quote", `"`, `"`, OK},
        {"escaped single quote", `\'`, `\'`, OK},
        {"escaped double quote", `\"`, `\"`, OK},
        {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
        {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
        {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
    } {
        s := b.Reset().WriteString(c.in).AsLiteralString()
        if s != c.out {
            t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
        }
    }
}

func TestAsInterpretedString(t *testing.T) {
    b := lexer.StringBuffer{}
    for _, c := range []testCase{
        {"empty string", "", "", OK},
        {"simple string", "Simple string!", "Simple string!", OK},
        {"escaped single quote", `\'`, "", FAIL},
        {"escaped double quote", `\"`, `"`, OK},
        {"bare single quote", `'`, "'", OK},
        {"string in single quotes", `'Hello'`, `'Hello'`, OK},
        {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
        {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
        {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
        {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
        {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
        {"example from spec",
            `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
            "I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
    } {
        s, err := b.Reset().WriteString(c.in).AsInterpretedString()
        if c.isSuccessCase && err != nil {
            t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err)
        }
        if !c.isSuccessCase && err == nil {
            t.Fatalf("[%s] expected a failure, but no failure occurred", c.name)
        }
        if c.isSuccessCase && s != c.out {
            t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
        }
    }
}