Implemented a separate lexer.StringBuffer, so the lexer code is not polluted with string-building code. The string buffer can provide the built string either as a literal, as-is string (in TOML: between single quotes) or as an interpreted string (in TOML: between double quotes).
parent 866a928f57
commit 6636a7a672
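For illustration, a minimal usage sketch of the new buffer (a hypothetical example, not part of this commit; it only uses the StringBuffer API added by the diff):

    package main

    import (
    	"fmt"

    	"github.com/mmakaay/toml/lexer"
    )

    func main() {
    	var b lexer.StringBuffer
    	b.WriteString(`Name\tJos\u00E9`)

    	// Literal: the runes exactly as they were scanned (TOML single-quoted strings).
    	fmt.Println(b.AsLiteralString()) // Name\tJos\u00E9

    	// Interpreted: escape sequences are decoded (TOML double-quoted strings).
    	s, err := b.AsInterpretedString()
    	fmt.Println(s, err) // Name	José <nil>
    }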
@@ -17,7 +17,7 @@ const (
 
 // Item represents a lexer item returned from the scanner.
 type Item struct {
-	Type itemType //Type, e.g. itemNumber, itemSquareBracket
+	Type itemType //Type, e.g. ItemComment, ItemString
 	Value string // Value, e.g. "10.42", "["
 }
 
@@ -7,16 +7,16 @@ import (
 	"unicode/utf8"
 )
 
-// Lexer holds the state of the scanner.
+// Lexer holds the state of the lexer.
 type Lexer struct {
 	input    string    // the scanned input string
-	state    stateFn   // the current state
-	stack    []stateFn // state stack, for nested parsing
+	state    stateFn   // a function that handles the current state
+	stack    []stateFn // state function stack, for nested parsing
 	start    int       // start position of the currently scanned item
 	pos      int       // current scanning position in the input
-	width    int       // width of the last rune read
-	strValue strings.Builder // used to build string values
-	items    chan Item // channel of scanned items
+	width    int       // width of the last rune read, for supporting backup()
+	buffer   StringBuffer // an efficient buffer, used to build string values
+	items    chan Item // channel of resulting lexer items
 	nextItem Item      // the current item as reached by Next() and retrieved by Get()
 	err      error     // an error message when lexing failed, retrieved by Error()
 }
@@ -69,6 +69,11 @@ func (l *Lexer) Error() error {
 	return l.err
 }
 
+// Get returns the next lexer item, as reached by Next()
+func (l *Lexer) Get() Item {
+	return l.nextItem
+}
+
 // ToArray returns lexer items as an array.
 // When an error occurs during scanning, a partial result will be
 // returned, accompanied by the error that occurred.
@@ -80,11 +85,6 @@ func (l *Lexer) ToArray() ([]Item, error) {
 	return items, l.Error()
 }
 
-// Get returns the next lexer item, as reached by Next()
-func (l *Lexer) Get() Item {
-	return l.nextItem
-}
-
 // pushState adds the state function to its stack.
 // This is used for implementing nested parsing.
 func (l *Lexer) pushState(state stateFn) {
@@ -99,6 +99,7 @@ func (l *Lexer) popState() stateFn {
 	return tail
 }
 
+// TODO no longer needed?
 // getAcceptedString returns the string as accepted by the
 // accept* methods so far.
 func (l *Lexer) getAcceptedString() string {
@@ -111,6 +112,7 @@ func (l *Lexer) emit(t itemType, v string) {
 	l.start = l.pos
 }
 
+// TODO no longer needed with the string buffer?
 // ignore skips over the pending input before the current position.
 func (l *Lexer) ignore() {
 	l.start = l.pos
@@ -133,6 +135,7 @@ func (l *Lexer) peek() rune {
 	return r
 }
 
+// TODO still needed with the string buffer?
 // accept consumes the next rune if it's from the valid set of runes.
 func (l *Lexer) accept(runes string) bool {
 	if strings.IndexRune(runes, l.next()) >= 0 {
@@ -150,6 +153,7 @@ func (l *Lexer) upcoming(runes string) bool {
 	return false
 }
 
+// TODO still needed with the string buffer?
 // acceptNot consumes the next rune if it's not from the set of runes.
 func (l *Lexer) acceptNot(runes string) bool {
 	r := l.next()
@@ -198,24 +202,7 @@ func (l *Lexer) skipUntil(runes string) {
 	}
 }
 
-// resetStringBuild initializes a new string builder, used for building
-// string by interpreting input data, e.g. for translating
-// double quoted strings with escape codes into an actual
-// Go string value.
-func (l *Lexer) resetStringBuilder() {
-	l.strValue.Reset()
-}
-
-// addToString adds a rune to the string builder.
-func (l *Lexer) addToString(r rune) {
-	l.strValue.WriteRune(r)
-}
-
-// getString returns the runes in the string builder as a string value.
-func (l *Lexer) getString() string {
-	return l.strValue.String()
-}
-
 // TODO meh... ugly rune.
 var endOfFile rune = -1
 
 // next returns the next rune in the input.

@@ -23,19 +23,24 @@ func TestWhiteSpaceAndNewlines(t *testing.T) {
 	assertSuccessAndCheck(t, "\n", []string{})
 	assertSuccessAndCheck(t, "\n \t\r\n", []string{})
 }
 
-func TestWhitespacePlusComment(t *testing.T) {
+func TestComments(t *testing.T) {
 	assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
 	assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
 	assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
 	assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
 	assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`})
-	assertSuccessAndCheck(t, "# two lines\n# of comments\n",
+	assertSuccessAndCheck(t,
+		"# two lines\n# of comments\n",
 		[]string{`Comment("# two lines")`, `Comment("# of comments")`})
+	assertSuccessAndCheck(t,
+		`# \tcomment\nwith escape-y chars`,
+		[]string{`Comment("# \\tcomment\\nwith escape-y chars")`})
 }
 
 func TestBareKeyWithoutValue(t *testing.T) {
 	err := "Unexpected end of file (expected an '=' value assignment)"
 	assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
 	assertFailureAndCheck(t, "_", []string{`Key("_")`}, err)
 	assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
 	assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
 	assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
@@ -95,11 +100,13 @@ func TestBasicStringEscapes(t *testing.T) {
 		`\f`: "\f",
 		`\r`: "\r",
 		`\"`: "\"",
-		`\b\t\n\f\r\"`: "\b\t\n\f\r\"",
+		`\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"",
+		`\u2318`: "⌘",
+		`\U0001014D`: "𐅍",
 	} {
 		l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
 		if out != l[1].Value {
-			t.Fatalf("Unexpected result when parsing '%s'", in)
+			t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value)
 		}
 	}
 }

@@ -22,6 +22,7 @@ const (
 	someQuote  string = singleQuote + doubleQuote
 	bareKey    string = lower + upper + digits + underscore + dash
 	startOfKey string = bareKey + someQuote
+	quotable   string = `btnfr\"`
 )
 
 func stateKeyValuePair(l *Lexer) stateFn {
@@ -37,16 +38,17 @@ func stateKeyValuePair(l *Lexer) stateFn {
 
 // A '#' hash symbol marks the rest of the line as a comment.
 func stateComment(l *Lexer) stateFn {
-	l.resetStringBuilder()
+	l.buffer.Reset()
 	for {
 		switch {
 		case l.atEndOfFile() || l.accept(newline):
-			l.emit(ItemComment, l.getString())
+			s := l.buffer.AsLiteralString()
+			l.emit(ItemComment, s)
 			return stateKeyValuePair
 		case l.accept(carriageReturn):
 			l.ignore()
 		default:
-			l.addToString(l.next())
+			l.buffer.WriteRune(l.next())
 		}
 	}
 }
@@ -113,24 +115,6 @@ func stateStringValue(l *Lexer) stateFn {
 	return l.unexpectedTokenError("a string value")
 }
 
-func stateBasicStringValue(l *Lexer) stateFn {
-	// Possibly a """ multi-line string start,
-	// possibly the end of an "" empty string.
-	if l.accept(doubleQuote) {
-		// A """ multi-line string.
-		if l.accept(doubleQuote) {
-			l.ignore()
-			return stateMultiLineBasicString
-		}
-		// An "" empty string.
-		l.ignore()
-		l.emit(ItemString, "")
-		return stateKeyValuePair
-	}
-	l.ignore()
-	return stateBasicString
-}
-
 // Basic strings are surrounded by quotation marks. Any Unicode character
 // may be used except those that must be escaped: quotation mark, backslash,
 // and the control characters (U+0000 to U+001F, U+007F).
@@ -152,15 +136,22 @@ func stateBasicStringValue(l *Lexer) stateFn {
 //
 // All other escape sequences not listed above are reserved and,
 // if used, TOML should produce an error.
-
-var basicEscapes = map[rune]rune{
-	'b':  rune(8),
-	't':  rune(9),
-	'n':  rune(10),
-	'f':  rune(12),
-	'r':  rune(13),
-	'"':  rune(34),
-	'\\': rune(92),
+func stateBasicStringValue(l *Lexer) stateFn {
+	// Possibly a """ multi-line string start,
+	// possibly the end of an "" empty string.
+	if l.accept(doubleQuote) {
+		// It's a """ multi-line string.
+		if l.accept(doubleQuote) {
+			l.ignore()
+			return stateMultiLineBasicString
+		}
+		// It's an "" empty string.
+		l.ignore()
+		l.emit(ItemString, "")
+		return stateKeyValuePair
+	}
+	l.ignore()
+	return stateBasicString
 }
 
 func stateParseBasicString(l *Lexer) stateFn {
@@ -171,22 +162,26 @@ func stateParseBasicString(l *Lexer) stateFn {
 		case l.accept(doubleQuote):
 			return l.popState()
 		case l.accept(backslash):
-			r := l.next()
-			if escaped, ok := basicEscapes[r]; ok {
-				l.addToString(escaped)
+			if l.upcoming(quotable) {
+				l.buffer.WriteRune('\\')
+				l.buffer.WriteRune(l.next())
 			} else {
-				return l.errorf("Invalid escape sequence \\%c in string value", r)
+				return l.errorf("Invalid escape sequence \\%c in string value", l.next())
 			}
 		default:
-			l.addToString(l.next())
+			l.buffer.WriteRune(l.next())
 		}
 	}
 }
 
 func stateBasicString(l *Lexer) stateFn {
-	l.resetStringBuilder()
+	l.buffer.Reset()
 	l.pushState(func(l *Lexer) stateFn {
-		l.emit(ItemString, l.getString())
+		s, err := l.buffer.AsInterpretedString()
+		if err != nil {
+			return l.errorf("Invalid data in string: %s", err)
+		}
+		l.emit(ItemString, s)
 		return stateKeyValuePair
 	})
 	return stateParseBasicString

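As the diff above shows, escape handling in basic strings now happens in two steps: stateParseBasicString only checks the escaped character against the quotable set and stores the sequence verbatim, while decoding is deferred until stateBasicString asks for the interpreted value. A rough sketch of that effect (a hypothetical example, not part of this commit):

    package main

    import (
    	"fmt"

    	"github.com/mmakaay/toml/lexer"
    )

    func main() {
    	var b lexer.StringBuffer

    	// The lexer stores a scanned \t escape as two runes: '\\' and 't'.
    	b.WriteRune('\\').WriteRune('t').WriteString("tab!")

    	fmt.Printf("%q\n", b.AsLiteralString()) // "\\ttab!" (still escaped)

    	s, err := b.AsInterpretedString() // decoding happens here
    	fmt.Printf("%q %v\n", s, err)     // "\ttab!" <nil>
    }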
@@ -0,0 +1,62 @@
+package lexer
+
+import (
+	"bytes"
+	"strconv"
+	"strings"
+)
+
+// StringBuffer is a string buffer implementation, which is used by the lexer
+// to efficiently accumulate runes from the input and eventually turn these
+// into a string, either literal or interpreted.
+type StringBuffer struct {
+	buffer bytes.Buffer
+}
+
+// Reset resets the string buffer, in order to build a new string.
+func (b *StringBuffer) Reset() *StringBuffer {
+	b.buffer.Reset()
+	return b
+}
+
+// WriteString adds the runes of the input string to the string buffer.
+func (b *StringBuffer) WriteString(s string) *StringBuffer {
+	for _, r := range s {
+		b.WriteRune(r)
+	}
+	return b
+}
+
+// WriteRune adds a single rune to the string buffer.
+func (b *StringBuffer) WriteRune(r rune) *StringBuffer {
+	b.buffer.WriteRune(r)
+	return b
+}
+
+// AsLiteralString returns the string buffer as a literal string.
+// Literal means that no escape sequences are processed.
+func (b *StringBuffer) AsLiteralString() string {
+	return b.buffer.String()
+}
+
+// AsInterpretedString returns the string in its interpreted form.
+// Interpreted means that escape sequences are handled in the way that Go would
+// have, had it been inside double quotes. It translates, for example, escape
+// sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their string
+// representations.
+// Since the input might contain invalid escape sequences, this method
+// also returns an error. When an error is returned, the returned string will
+// contain the string as far as it could be interpreted.
+func (b *StringBuffer) AsInterpretedString() (string, error) {
+	var sb strings.Builder
+	tail := b.buffer.String()
+	for len(tail) > 0 {
+		r, _, newtail, err := strconv.UnquoteChar(tail, '"')
+		if err != nil {
+			return sb.String(), err
+		}
+		tail = newtail
+		sb.WriteRune(r)
+	}
+	return sb.String(), nil
+}

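The interpretation above leans entirely on strconv.UnquoteChar from the standard library, which decodes one (possibly escaped) character per call and returns the remaining tail. A standalone illustration of that loop (not part of the commit):

    package main

    import (
    	"fmt"
    	"strconv"
    )

    func main() {
    	tail := `Jos\u00E9\n`
    	for len(tail) > 0 {
    		// '"' tells UnquoteChar to treat the input as the contents
    		// of a double-quoted Go string.
    		r, _, rest, err := strconv.UnquoteChar(tail, '"')
    		if err != nil {
    			fmt.Println("invalid escape:", err)
    			return
    		}
    		fmt.Printf("%q ", r)
    		tail = rest
    	}
    	// Output: 'J' 'o' 's' 'é' '\n'
    }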
@@ -0,0 +1,87 @@
+package lexer_test
+
+import "testing"
+import "github.com/mmakaay/toml/lexer"
+
+func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
+	var b lexer.StringBuffer
+	s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString()
+	s2 := b.AsLiteralString()
+	if s1 != "hi\nthere" {
+		t.Fatalf("Did not get the expected interpreted string for try 1, but %q", s1)
+	}
+	if s2 != "hi\\nthere" {
+		t.Fatalf("Did not get the expected literal string for try 2, but %q", s2)
+	}
+}
+
+func TestResetResetsBuffer(t *testing.T) {
+	var b lexer.StringBuffer
+	s := b.WriteRune('X').Reset().AsLiteralString()
+	if s != "" {
+		t.Fatalf("Did not get expected empty string, but %q", s)
+	}
+}
+
+type testCase struct {
+	name          string
+	in            string
+	out           string
+	isSuccessCase bool
+}
+
+const (
+	OK   bool = true
+	FAIL bool = false
+)
+
+func TestAsLiteralString(t *testing.T) {
+	b := lexer.StringBuffer{}
+	for _, c := range []testCase{
+		{"empty string", ``, ``, OK},
+		{"simple string", `Simple string!`, `Simple string!`, OK},
+		{"single quote", `'`, `'`, OK},
+		{"double quote", `"`, `"`, OK},
+		{"escaped single quote", `\'`, `\'`, OK},
+		{"escaped double quote", `\"`, `\"`, OK},
+		{"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
+		{"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
+		{"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
+	} {
+		s := b.Reset().WriteString(c.in).AsLiteralString()
+		if s != c.out {
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
+		}
+	}
+}
+
+func TestAsInterpretedString(t *testing.T) {
+	b := lexer.StringBuffer{}
+	for _, c := range []testCase{
+		{"empty string", "", "", OK},
+		{"simple string", "Simple string!", "Simple string!", OK},
+		{"escaped single quote", `\'`, "", FAIL},
+		{"escaped double quote", `\"`, `"`, OK},
+		{"bare single quote", `'`, "'", OK},
+		{"string in single quotes", `'Hello'`, `'Hello'`, OK},
+		{"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
+		{"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
+		{"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
+		{"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
+		{"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
+		{"example from spec",
+			`I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
+			"I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
+	} {
+		s, err := b.Reset().WriteString(c.in).AsInterpretedString()
+		if c.isSuccessCase && err != nil {
+			t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err)
+		}
+		if !c.isSuccessCase && err == nil {
+			t.Fatalf("[%s] expected a failure, but no failure occurred", c.name)
+		}
+		if s != c.out && c.isSuccessCase {
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
+		}
+	}
+}