From dc47ac3b716a0b53a451ac72d553b52485d44777 Mon Sep 17 00:00:00 2001
From: Maurice Makaay <maurice@makaay.nl>
Date: Thu, 16 May 2019 16:17:23 +0000
Subject: [PATCH] Make short and long UTF8 escape sequences work in strings.

---
 lexer/lexer.go          | 27 +++++++++++++---------
 lexer/states.go         | 51 +++++++++++++++++++----------------------
 lexer/states_test.go    |  8 +++++--
 lexer/stringbuf_test.go | 24 +++++++++----------
 4 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/lexer/lexer.go b/lexer/lexer.go
index b5c2256..5c351c1 100644
--- a/lexer/lexer.go
+++ b/lexer/lexer.go
@@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
 	return peeked, true
 }
 
-// acceptNext adds the next rune from the input to the string buffer.
-// If no rune could be read (end of file or invalid UTF8 data),
-// then false is returned.
-func (l *Lexer) acceptNext() bool {
-	r := l.next()
-	if r == endOfFile || r == utf8.RuneError {
-		return false
+// acceptNext adds the specified number of runes from the input to the string buffer.
+// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned and the runes read so far stay in the buffer.
+func (l *Lexer) acceptNext(count int) bool {
+	for i := 0; i < count; i++ {
+		r := l.next()
+		if r == endOfFile || r == utf8.RuneError {
+			return false
+		}
+		l.buffer.WriteRune(r)
 	}
-	l.buffer.WriteRune(r)
 	return true
 }
 
@@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
 	return false
 }
 
-func (l *Lexer) upcoming(runes string) bool {
-	if l.accept(runes) {
-		l.backup()
+func (l *Lexer) upcoming(runes ...string) bool {
+	if peeked, ok := l.peekMulti(len(runes)); ok {
+		for i, r := range runes {
+			if strings.IndexRune(r, peeked[i]) < 0 {
+				return false
+			}
+		}
 		return true
 	}
 	return false
diff --git a/lexer/states.go b/lexer/states.go
index ea686bd..de4e039 100644
--- a/lexer/states.go
+++ b/lexer/states.go
@@ -13,6 +13,7 @@ const (
 	lower           string = "abcdefghijklmnopqrstuvwxyz"
 	upper           string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 	digits          string = "0123456789"
+	hex             string = digits + "abcdefABCDEF"
 	dot             string = "."
 	underscore      string = "_"
 	dash            string = "-"
@@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
 			l.emitTrimmedLiteral(ItemComment)
 			return stateKeyValuePair
 		default:
-			if !l.acceptNext() {
+			if !l.acceptNext(1) {
 				return nil
 			}
 		}
@@ -146,40 +147,34 @@ func stateParseBasicString(l *Lexer) stateFn {
 		switch {
 		case l.atEndOfFile():
 			return l.unexpectedEndOfFile("basic string token")
-		case l.accept(doubleQuote):
+		case l.skip(doubleQuote):
 			return l.popState()
-		case l.accept(backslash):
+		case l.upcoming(backslash, escapeChars):
 			// For convenience, some popular characters have a compact escape sequence.
-			// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
-			// The escape codes must be valid Unicode scalar values.
-			switch {
-			case l.upcoming(escapeChars):
-				// \b         - backspace       (U+0008)
-				// \t         - tab             (U+0009)
-				// \n         - linefeed        (U+000A)
-				// \f         - form feed       (U+000C)
-				// \r         - carriage return (U+000D)
-				// \"         - quote           (U+0022)
-				// \\         - backslash       (U+005C)
-				l.buffer.WriteRune('\\')
-				l.buffer.WriteRune(l.next())
-			case l.upcoming(shortUtf8Escape):
-				// \uXXXX     - unicode         (U+XXXX)
-				return l.errorf("Not yet implemented: short utf8")
-			case l.upcoming(longUtf8Escape):
-				// \UXXXXXXXX - unicode         (U+XXXXXXXX)
-				return l.errorf("Not yet implemented: long utf8")
-			default:
-				// All other escape sequences not listed above are reserved and,
-				// if used, TOML should produce an error.
-				return l.errorf("Invalid escape sequence \\%c in string value", l.next())
-			}
+			// \b         - backspace       (U+0008)
+			// \t         - tab             (U+0009)
+			// \n         - linefeed        (U+000A)
+			// \f         - form feed       (U+000C)
+			// \r         - carriage return (U+000D)
+			// \"         - quote           (U+0022)
+			// \\         - backslash       (U+005C)
+			l.acceptNext(2)
+		case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
+			// \uXXXX     - unicode         (U+XXXX)
+			l.acceptNext(6)
+		case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
+			// \UXXXXXXXX - unicode         (U+XXXXXXXX)
+			l.acceptNext(10)
+		case l.upcoming(backslash):
+			// All other escape sequences not listed above are reserved and,
+			// if used, TOML should produce an error.
+			return l.errorf("Invalid escape sequence in basic string")
 		case l.upcoming(invalidBasicStringCharacters):
 			// Any Unicode character may be used except those that must be escaped:
 			// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
 			return l.errorf("Invalid character in basic string: %q", l.next())
 		default:
-			l.acceptNext()
+			l.acceptNext(1)
 		}
 	}
 }
diff --git a/lexer/states_test.go b/lexer/states_test.go
index 320207c..9db14df 100644
--- a/lexer/states_test.go
+++ b/lexer/states_test.go
@@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
 }
 
 func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
-	runStatesT(t, statesT{
-		"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
+	runStatesTs(t, []statesT{
+		{"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
+		{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
 	})
 }
 
diff --git a/lexer/stringbuf_test.go b/lexer/stringbuf_test.go
index 41e59d1..c751581 100644
--- a/lexer/stringbuf_test.go
+++ b/lexer/stringbuf_test.go
@@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
 	}
 }
 
-type stringbufT struct {
-	name          string
-	in            string
-	out           string
-	isSuccessCase bool
-}
-
-const (
-	OK   bool = true
-	FAIL bool = false
-)
-
 func TestAsLiteralString(t *testing.T) {
 	b := lexer.StringBuffer{}
 	for _, c := range []stringbufT{
@@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
 		}
 	}
 }
+
+type stringbufT struct {
+	name          string
+	in            string
+	out           string
+	isSuccessCase bool
+}
+
+const (
+	OK   bool = true
+	FAIL bool = false
+)