Make short (\uXXXX) and long (\UXXXXXXXX) Unicode escape sequences work in basic strings.

2019-05-16 16:17:23 +00:00 · 2019-05-16 16:17:23 +00:00 · dc47ac3b71
parent cbc4f04179
commit dc47ac3b71
4 changed files with 57 additions and 53 deletions
--- a/lexer/lexer.go
+++ b/lexer/lexer.go
@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
 	return peeked, true
 }

-// acceptNext adds the next rune from the input to the string buffer.
-// If no rune could be read (end of file or invalid UTF8 data),
-// then false is returned.
-func (l *Lexer) acceptNext() bool {
+// acceptNext adds the specified number of runes from the input to the string buffer.
+// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned; runes read before the failure stay in the buffer.
+func (l *Lexer) acceptNext(count int) bool {
+	for i := 0; i < count; i++ {
 		r := l.next()
 		if r == endOfFile || r == utf8.RuneError {
 			return false
 		}
 		l.buffer.WriteRune(r)
+	}
 	return true
 }

@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
 	return false
 }

-func (l *Lexer) upcoming(runes string) bool {
-	if l.accept(runes) {
-		l.backup()
+func (l *Lexer) upcoming(runes ...string) bool {
+	if peeked, ok := l.peekMulti(len(runes)); ok {
+		for i, r := range runes {
+			if strings.IndexRune(r, peeked[i]) < 0 {
+				return false
+			}
+		}
 		return true
 	}
 	return false
--- a/lexer/states.go
+++ b/lexer/states.go
@ -13,6 +13,7 @@ const (
 	lower           string = "abcdefghijklmnopqrstuvwxyz"
 	upper           string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 	digits          string = "0123456789"
+	hex             string = digits + "abcdefABCDEF"
 	dot             string = "."
 	underscore      string = "_"
 	dash            string = "-"
@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
 			l.emitTrimmedLiteral(ItemComment)
 			return stateKeyValuePair
 		default:
-			if !l.acceptNext() {
+			if !l.acceptNext(1) {
 				return nil
 			}
 		}
@ -146,14 +147,10 @@ func stateParseBasicString(l *Lexer) stateFn {
 		switch {
 		case l.atEndOfFile():
 			return l.unexpectedEndOfFile("basic string token")
-		case l.accept(doubleQuote):
+		case l.skip(doubleQuote):
 			return l.popState()
-		case l.accept(backslash):
+		case l.upcoming(backslash, escapeChars):
 			// For convenience, some popular characters have a compact escape sequence.
-			// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
-			// The escape codes must be valid Unicode scalar values.
-			switch {
-			case l.upcoming(escapeChars):
 			// \b         - backspace       (U+0008)
 			// \t         - tab             (U+0009)
 			// \n         - linefeed        (U+000A)
@ -161,25 +158,23 @@ func stateParseBasicString(l *Lexer) stateFn {
 			// \r         - carriage return (U+000D)
 			// \"         - quote           (U+0022)
 			// \\         - backslash       (U+005C)
-				l.buffer.WriteRune('\\')
-				l.buffer.WriteRune(l.next())
-			case l.upcoming(shortUtf8Escape):
+			l.acceptNext(2)
+		case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
 			// \uXXXX     - unicode         (U+XXXX)
-				return l.errorf("Not yet implemented: short utf8")
-			case l.upcoming(longUtf8Escape):
+			l.acceptNext(6)
+		case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
 			// \UXXXXXXXX - unicode         (U+XXXXXXXX)
-				return l.errorf("Not yet implemented: long utf8")
-			default:
+			l.acceptNext(10)
+		case l.upcoming(backslash):
 			// All other escape sequences not listed above are reserved and,
 			// if used, TOML should produce an error.
-				return l.errorf("Invalid escape sequence \\%c in string value", l.next())
-			}
+			return l.errorf("Invalid escape sequence in basic string")
 		case l.upcoming(invalidBasicStringCharacters):
 			// Any Unicode character may be used except those that must be escaped:
 			// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
 			return l.errorf("Invalid character in basic string: %q", l.next())
 		default:
-			l.acceptNext()
+			l.acceptNext(1)
 		}
 	}
 }
--- a/lexer/states_test.go
+++ b/lexer/states_test.go
@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
 }

 func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
-	runStatesT(t, statesT{
-		"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
+	runStatesTs(t, []statesT{
+		{"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
 	})
 }

--- a/lexer/stringbuf_test.go
+++ b/lexer/stringbuf_test.go
@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
 	}
 }

-type stringbufT struct {
-	name          string
-	in            string
-	out           string
-	isSuccessCase bool
-}
-
-const (
-	OK   bool = true
-	FAIL bool = false
-)
-
 func TestAsLiteralString(t *testing.T) {
 	b := lexer.StringBuffer{}
 	for _, c := range []stringbufT{
@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
 		}
 	}
 }
+
+type stringbufT struct {
+	name          string
+	in            string
+	out           string
+	isSuccessCase bool
+}
+
+const (
+	OK   bool = true
+	FAIL bool = false
+)