diff --git a/lexer/items.go b/lexer/items.go index fa68b8f..9e8ae7f 100644 --- a/lexer/items.go +++ b/lexer/items.go @@ -17,7 +17,7 @@ const ( // Item represents a lexer item returned from the scanner. type Item struct { - Type itemType //Type, e.g. itemNumber, itemSquareBracket + Type itemType //Type, e.g. ItemComment, ItemString Value string // Value, e.g. "10.42", "[" } diff --git a/lexer/lexer.go b/lexer/lexer.go index bf3fdff..c4d1ce2 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -7,18 +7,18 @@ import ( "unicode/utf8" ) -// Lexer holds the state of the scanner. +// Lexer holds the state of the lexer. type Lexer struct { - input string // the scanned input string - state stateFn // the current state - stack []stateFn // state stack, for nested parsing - start int // start position of the currently scanned item - pos int // current scanning position in the input - width int // width of the last rune read - strValue strings.Builder // used to build string values - items chan Item // channel of scanned items - nextItem Item // the current item as reached by Next() and retrieved by Get() - err error // an error message when lexing failed, retrieved by Error() + input string // the scanned input string + state stateFn // a function that handles the current state + stack []stateFn // state function stack, for nested parsing + start int // start position of the currently scanned item + pos int // current scanning position in the input + width int // width of the last rune read, for supporting backup() + buffer StringBuffer // an efficient buffer, used to build string values + items chan Item // channel of resulting lexer items + nextItem Item // the current item as reached by Next() and retrieved by Get() + err error // an error message when lexing failed, retrieved by Error() } // Lex takes an input string and initializes the TOML lexer for it. 
@@ -69,6 +69,11 @@ func (l *Lexer) Error() error { return l.err } +// Get returns the next lexer item, as reached by Next() +func (l *Lexer) Get() Item { + return l.nextItem +} + // ToArray returns lexer items as an array. // When an error occurs during scanning, a partial result will be // returned, accompanied by the error that occurred. @@ -80,11 +85,6 @@ func (l *Lexer) ToArray() ([]Item, error) { return items, l.Error() } -// Get returns the next lexer item, as reached by Next() -func (l *Lexer) Get() Item { - return l.nextItem -} - // pushState adds the state function to its stack. // This is used for implementing nested parsing. func (l *Lexer) pushState(state stateFn) { @@ -99,6 +99,7 @@ func (l *Lexer) popState() stateFn { return tail } +// TODO no longer needed? // getAcceptedString returns the string as accepted by the // accept* methods so far. func (l *Lexer) getAcceptedString() string { @@ -111,6 +112,7 @@ func (l *Lexer) emit(t itemType, v string) { l.start = l.pos } +// TODO no longer needed with the string builder? // ignore skips over the pending input before the current position. func (l *Lexer) ignore() { l.start = l.pos @@ -133,6 +135,7 @@ func (l *Lexer) peek() rune { return r } +// TODO still needed with the string buffer? // accept consumes the next rune if it's from the valid set of runes. func (l *Lexer) accept(runes string) bool { if strings.IndexRune(runes, l.next()) >= 0 { @@ -150,6 +153,7 @@ func (l *Lexer) upcoming(runes string) bool { return false } +// TODO still needed with the string buffer? // acceptNot consumes the next rune if it's not from the set of runes. func (l *Lexer) acceptNot(runes string) bool { r := l.next() @@ -198,24 +202,7 @@ func (l *Lexer) skipUntil(runes string) { } } -// resetStringBuild initializes a new string builder, used for building -// string by interpreting input data, e.g. for translating -// double quoted strings with escape codes into an actual -// Go string value. 
-func (l *Lexer) resetStringBuilder() { - l.strValue.Reset() -} - -// addToString adds a rune to the string builder. -func (l *Lexer) addToString(r rune) { - l.strValue.WriteRune(r) -} - -// getString returns the runes in the string builder as a string value. -func (l *Lexer) getString() string { - return l.strValue.String() -} - +// TODO meh... ugly rune. var endOfFile rune = -1 // next returns the next rune in the input. diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index 1bacd2e..658060a 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -23,19 +23,24 @@ func TestWhiteSpaceAndNewlines(t *testing.T) { assertSuccessAndCheck(t, "\n", []string{}) assertSuccessAndCheck(t, "\n \t\r\n", []string{}) } -func TestWhitespacePlusComment(t *testing.T) { +func TestComments(t *testing.T) { assertSuccessAndCheck(t, "#", []string{`Comment("#")`}) assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`}) assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`}) assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`}) assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`}) - assertSuccessAndCheck(t, "# two lines\n# of comments\n", + assertSuccessAndCheck(t, + "# two lines\n# of comments\n", []string{`Comment("# two lines")`, `Comment("# of comments")`}) + assertSuccessAndCheck(t, + `# \tcomment\nwith escape-y chars`, + []string{`Comment("# \\tcomment\\nwith escape-y chars")`}) } func TestBareKeyWithoutValue(t *testing.T) { err := "Unexpected end of file (expected an '=' value assignment)" assertFailureAndCheck(t, "a", []string{`Key("a")`}, err) + assertFailureAndCheck(t, "_", []string{`Key("_")`}, err) assertFailureAndCheck(t, " a", []string{`Key("a")`}, err) assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err) assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err) @@ -89,17 +94,19 @@ func TestInvalidEscapeSequence(t *testing.T) { } func TestBasicStringEscapes(t 
*testing.T) { for in, out := range map[string]string{ - `\b`: "\b", - `\t`: "\t", - `\n`: "\n", - `\f`: "\f", - `\r`: "\r", - `\"`: "\"", - `\b\t\n\f\r\"`: "\b\t\n\f\r\"", + `\b`: "\b", + `\t`: "\t", + `\n`: "\n", + `\f`: "\f", + `\r`: "\r", + `\"`: "\"", + `\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"", + `\u2318`: "⌘", + `\U0001014D`: "𐅍", } { l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in)) if out != l[1].Value { - t.Fatalf("Unexpected result when parsing '%s'", in) + t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value) } } } diff --git a/lexer/states.go b/lexer/states.go index d86b348..4eb335a 100644 --- a/lexer/states.go +++ b/lexer/states.go @@ -22,6 +22,7 @@ const ( someQuote string = singleQuote + doubleQuote bareKey string = lower + upper + digits + underscore + dash startOfKey string = bareKey + someQuote + quotable string = `btnfr\"` ) func stateKeyValuePair(l *Lexer) stateFn { @@ -37,16 +38,17 @@ func stateKeyValuePair(l *Lexer) stateFn { // A '#' hash symbol marks the rest of the line as a comment. func stateComment(l *Lexer) stateFn { - l.resetStringBuilder() + l.buffer.Reset() for { switch { case l.atEndOfFile() || l.accept(newline): - l.emit(ItemComment, l.getString()) + s := l.buffer.AsLiteralString() + l.emit(ItemComment, s) return stateKeyValuePair case l.accept(carriageReturn): l.ignore() default: - l.addToString(l.next()) + l.buffer.WriteRune(l.next()) } } } @@ -113,24 +115,6 @@ func stateStringValue(l *Lexer) stateFn { return l.unexpectedTokenError("a string value") } -func stateBasicStringValue(l *Lexer) stateFn { - // Possibly a """ multi-line string start, - // possibly the end of an "" empty string. - if l.accept(doubleQuote) { - // A """ multi-line string. - if l.accept(doubleQuote) { - l.ignore() - return stateMultiLineBasicString - } - // An "" empty string. 
- l.ignore() - l.emit(ItemString, "") - return stateKeyValuePair - } - l.ignore() - return stateBasicString -} - // Basic strings are surrounded by quotation marks. Any Unicode character // may be used except those that must be escaped: quotation mark, backslash, // and the control characters (U+0000 to U+001F, U+007F). @@ -152,15 +136,22 @@ func stateBasicStringValue(l *Lexer) stateFn { // // All other escape sequences not listed above are reserved and, // if used, TOML should produce an error. - -var basicEscapes = map[rune]rune{ - 'b': rune(8), - 't': rune(9), - 'n': rune(10), - 'f': rune(12), - 'r': rune(13), - '"': rune(34), - '\\': rune(92), +func stateBasicStringValue(l *Lexer) stateFn { + // Possibly a """ multi-line string start, + // possibly the end of an "" empty string. + if l.accept(doubleQuote) { + // It's a """ multi-line string. + if l.accept(doubleQuote) { + l.ignore() + return stateMultiLineBasicString + } + // It's an "" empty string. + l.ignore() + l.emit(ItemString, "") + return stateKeyValuePair + } + l.ignore() + return stateBasicString } func stateParseBasicString(l *Lexer) stateFn { @@ -171,22 +162,26 @@ func stateParseBasicString(l *Lexer) stateFn { case l.accept(doubleQuote): return l.popState() case l.accept(backslash): - r := l.next() - if escaped, ok := basicEscapes[r]; ok { - l.addToString(escaped) + if l.upcoming(quotable) { + l.buffer.WriteRune('\\') + l.buffer.WriteRune(l.next()) } else { - return l.errorf("Invalid escape sequence \\%c in string value", r) + return l.errorf("Invalid escape sequence \\%c in string value", l.next()) } default: - l.addToString(l.next()) + l.buffer.WriteRune(l.next()) } } } func stateBasicString(l *Lexer) stateFn { - l.resetStringBuilder() + l.buffer.Reset() l.pushState(func(l *Lexer) stateFn { - l.emit(ItemString, l.getString()) + s, err := l.buffer.AsInterpretedString() + if err != nil { + return l.errorf("Invalid data in string: %s", err) + } + l.emit(ItemString, s) return stateKeyValuePair }) 
return stateParseBasicString diff --git a/lexer/stringbuf.go b/lexer/stringbuf.go new file mode 100644 index 0000000..7c810a1 --- /dev/null +++ b/lexer/stringbuf.go @@ -0,0 +1,62 @@ +package lexer + +import ( + "bytes" + "strconv" + "strings" +) + +// StringBuffer is a string buffer implementation, which is used by the lexer +// to efficiently accumulate runes from the input and eventually turn these +// into a string, either literal or interpreted. +type StringBuffer struct { + buffer bytes.Buffer +} + +// Reset resets the string buffer, in order to build a new string. +func (b *StringBuffer) Reset() *StringBuffer { + b.buffer.Reset() + return b +} + +// WriteString adds the runes of the input string to the string buffer. +func (b *StringBuffer) WriteString(s string) *StringBuffer { + for _, r := range s { + b.WriteRune(r) + } + return b +} + +// WriteRune adds a single rune to the string buffer. +func (b *StringBuffer) WriteRune(r rune) *StringBuffer { + b.buffer.WriteRune(r) + return b +} + +// AsLiteralString returns the string buffer as a literal string. +// Literal means that no escape sequences are processed. +func (b *StringBuffer) AsLiteralString() string { + return b.buffer.String() +} + +// AsInterpretedString returns the string in its interpreted form. +// Interpreted means that escape sequences are handled in the way that Go would +// have, had it been inside double quotes. It translates for example escape +// sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their string +// representations. +// Since the input might contain invalid escape sequences, this method +// also returns an error. When an error is returned, the returned string will +// contain the string as far as it could be interpreted. 
+func (b *StringBuffer) AsInterpretedString() (string, error) { + var sb strings.Builder + tail := b.buffer.String() + for len(tail) > 0 { + r, _, newtail, err := strconv.UnquoteChar(tail, '"') + if err != nil { + return sb.String(), err + } + tail = newtail + sb.WriteRune(r) + } + return sb.String(), nil +} diff --git a/lexer/stringbuf_test.go b/lexer/stringbuf_test.go new file mode 100644 index 0000000..f3caa04 --- /dev/null +++ b/lexer/stringbuf_test.go @@ -0,0 +1,87 @@ +package lexer_test + +import "testing" +import "github.com/mmakaay/toml/lexer" + +func TestGeneratingStringDoesNotResetBuffer(t *testing.T) { + var b lexer.StringBuffer + s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString() + s2 := b.AsLiteralString() + if s1 != "hi\nthere" { + t.Fatalf("Did not get expected string\"X\" for try 1, but %q", s1) + } + if s2 != "hi\\nthere" { + t.Fatalf("Did not get expected string\"X\" for try 2, but %q", s2) + } +} + +func TestResetResetsBuffer(t *testing.T) { + var b lexer.StringBuffer + s := b.WriteRune('X').Reset().AsLiteralString() + if s != "" { + t.Fatalf("Did not get expected empty string, but %q", s) + } +} + +type testCase struct { + name string + in string + out string + isSuccessCase bool +} + +const ( + OK bool = true + FAIL bool = false +) + +func TestAsLiteralString(t *testing.T) { + b := lexer.StringBuffer{} + for _, c := range []testCase{ + {"empty string", ``, ``, OK}, + {"simple string", `Simple string!`, `Simple string!`, OK}, + {"single quote", `'`, `'`, OK}, + {"double quote", `"`, `"`, OK}, + {"escaped single quote", `\'`, `\'`, OK}, + {"escaped double quote", `\"`, `\"`, OK}, + {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK}, + {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK}, + {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK}, + } { + s := b.Reset().WriteString(c.in).AsLiteralString() + if s != c.out { + t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) + } + } 
+} + +func TestAsInterpretedString(t *testing.T) { + b := lexer.StringBuffer{} + for _, c := range []testCase{ + {"empty string", "", "", OK}, + {"one character", "Simple string!", "Simple string!", OK}, + {"escaped single quote", `\'`, "", FAIL}, + {"escaped double quote", `\"`, `"`, OK}, + {"bare single quote", `'`, "'", OK}, + {"string in single quotes", `'Hello'`, `'Hello'`, OK}, + {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK}, + {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK}, + {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK}, + {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK}, + {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK}, + {"example from spec", + `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`, + "I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK}, + } { + s, err := b.Reset().WriteString(c.in).AsInterpretedString() + if c.isSuccessCase && err != nil { + t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err) + } + if !c.isSuccessCase && err == nil { + t.Fatalf("[%s] expected a failure, but no failure occurred", c.name) + } + if s != c.out && c.isSuccessCase { + t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) + } + } +}