From dc47ac3b716a0b53a451ac72d553b52485d44777 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Thu, 16 May 2019 16:17:23 +0000 Subject: [PATCH] Make short and long UTF8 escape sequences work in strings. --- lexer/lexer.go | 27 +++++++++++++--------- lexer/states.go | 51 +++++++++++++++++++---------------------- lexer/states_test.go | 8 +++++-- lexer/stringbuf_test.go | 24 +++++++++---------- 4 files changed, 57 insertions(+), 53 deletions(-) diff --git a/lexer/lexer.go b/lexer/lexer.go index b5c2256..5c351c1 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) { return peeked, true } -// acceptNext adds the next rune from the input to the string buffer. -// If no rune could be read (end of file or invalid UTF8 data), -// then false is returned. -func (l *Lexer) acceptNext() bool { - r := l.next() - if r == endOfFile || r == utf8.RuneError { - return false +// acceptNext adds the specified amount of runes from the input to the string buffer. +// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned. +func (l *Lexer) acceptNext(count int) bool { + for i := 0; i < count; i++ { + r := l.next() + if r == endOfFile || r == utf8.RuneError { + return false + } + l.buffer.WriteRune(r) } - l.buffer.WriteRune(r) return true } @@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool { return false } -func (l *Lexer) upcoming(runes string) bool { - if l.accept(runes) { - l.backup() +func (l *Lexer) upcoming(runes ...string) bool { + if peeked, ok := l.peekMulti(len(runes)); ok { + for i, r := range runes { + if strings.IndexRune(r, peeked[i]) < 0 { + return false + } + } return true } return false diff --git a/lexer/states.go b/lexer/states.go index ea686bd..de4e039 100644 --- a/lexer/states.go +++ b/lexer/states.go @@ -13,6 +13,7 @@ const ( lower string = "abcdefghijklmnopqrstuvwxyz" upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" digits string = "0123456789" + hex string = digits + "abcdefABCDEF" dot string = "." underscore string = "_" dash string = "-" @@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn { l.emitTrimmedLiteral(ItemComment) return stateKeyValuePair default: - if !l.acceptNext() { + if !l.acceptNext(1) { return nil } } @@ -146,40 +147,34 @@ func stateParseBasicString(l *Lexer) stateFn { switch { case l.atEndOfFile(): return l.unexpectedEndOfFile("basic string token") - case l.accept(doubleQuote): + case l.skip(doubleQuote): return l.popState() - case l.accept(backslash): + case l.upcoming(backslash, escapeChars): // For convenience, some popular characters have a compact escape sequence. - // Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms. - // The escape codes must be valid Unicode scalar values. - switch { - case l.upcoming(escapeChars): - // \b - backspace (U+0008) - // \t - tab (U+0009) - // \n - linefeed (U+000A) - // \f - form feed (U+000C) - // \r - carriage return (U+000D) - // \" - quote (U+0022) - // \\ - backslash (U+005C) - l.buffer.WriteRune('\\') - l.buffer.WriteRune(l.next()) - case l.upcoming(shortUtf8Escape): - // \uXXXX - unicode (U+XXXX) - return l.errorf("Not yet implemented: short utf8") - case l.upcoming(longUtf8Escape): - // \UXXXXXXXX - unicode (U+XXXXXXXX) - return l.errorf("Not yet implemented: long utf8") - default: - // All other escape sequences not listed above are reserved and, - // if used, TOML should produce an error. - return l.errorf("Invalid escape sequence \\%c in string value", l.next()) - } + // \b - backspace (U+0008) + // \t - tab (U+0009) + // \n - linefeed (U+000A) + // \f - form feed (U+000C) + // \r - carriage return (U+000D) + // \" - quote (U+0022) + // \\ - backslash (U+005C) + l.acceptNext(2) + case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex): + // \uXXXX - unicode (U+XXXX) + l.acceptNext(6) + case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): + // \UXXXXXXXX - unicode (U+XXXXXXXX) + l.acceptNext(10) + case l.upcoming(backslash): + // All other escape sequences not listed above are reserved and, + // if used, TOML should produce an error. + return l.errorf("Invalid escape sequence in basic string") case l.upcoming(invalidBasicStringCharacters): // Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). return l.errorf("Invalid character in basic string: %q", l.next()) default: - l.acceptNext() + l.acceptNext(1) } } } diff --git a/lexer/states_test.go b/lexer/states_test.go index 320207c..9db14df 100644 --- a/lexer/states_test.go +++ b/lexer/states_test.go @@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) { } func TestBasicStringWithInvalidEscapeSequence(t *testing.T) { - runStatesT(t, statesT{ - "invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`, + runStatesTs(t, []statesT{ + {"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"}, + {"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"}, + {"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"}, + {"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"}, + {"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"}, }) } diff --git a/lexer/stringbuf_test.go b/lexer/stringbuf_test.go index 41e59d1..c751581 100644 --- a/lexer/stringbuf_test.go +++ b/lexer/stringbuf_test.go @@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) { } } -type stringbufT struct { - name string - in string - out string - isSuccessCase bool -} - -const ( - OK bool = true - FAIL bool = false -) - func TestAsLiteralString(t *testing.T) { b := lexer.StringBuffer{} for _, c := range []stringbufT{ @@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) { } } } + +type stringbufT struct { + name string + in string + out string + isSuccessCase bool +} + +const ( + OK bool = true + FAIL bool = false +)