Make short and long UTF8 escape sequences work in strings.

This commit is contained in:
Maurice Makaay 2019-05-16 16:17:23 +00:00
parent cbc4f04179
commit dc47ac3b71
4 changed files with 57 additions and 53 deletions

View File

@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
return peeked, true
}
// acceptNext adds the next rune from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data),
// then false is returned.
func (l *Lexer) acceptNext() bool {
r := l.next()
if r == endOfFile || r == utf8.RuneError {
return false
// acceptNext reads count runes from the input, appending each one to the
// string buffer as it is read. It returns false as soon as a read yields
// endOfFile or utf8.RuneError; runes appended before that point are not
// removed by this function. It returns true once all count runes have been
// appended (also when count is zero or negative, as the loop never runs).
func (l *Lexer) acceptNext(count int) bool {
for i := 0; i < count; i++ {
r := l.next()
if r == endOfFile || r == utf8.RuneError {
return false
}
l.buffer.WriteRune(r)
}
l.buffer.WriteRune(r)
return true
}
@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
return false
}
func (l *Lexer) upcoming(runes string) bool {
if l.accept(runes) {
l.backup()
func (l *Lexer) upcoming(runes ...string) bool {
if peeked, ok := l.peekMulti(len(runes)); ok {
for i, r := range runes {
if strings.IndexRune(r, peeked[i]) < 0 {
return false
}
}
return true
}
return false

View File

@ -13,6 +13,7 @@ const (
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair
default:
if !l.acceptNext() {
if !l.acceptNext(1) {
return nil
}
}
@ -146,40 +147,34 @@ func stateParseBasicString(l *Lexer) stateFn {
switch {
case l.atEndOfFile():
return l.unexpectedEndOfFile("basic string token")
case l.accept(doubleQuote):
case l.skip(doubleQuote):
return l.popState()
case l.accept(backslash):
case l.upcoming(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
switch {
case l.upcoming(escapeChars):
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
l.buffer.WriteRune('\\')
l.buffer.WriteRune(l.next())
case l.upcoming(shortUtf8Escape):
// \uXXXX - unicode (U+XXXX)
return l.errorf("Not yet implemented: short utf8")
case l.upcoming(longUtf8Escape):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
return l.errorf("Not yet implemented: long utf8")
default:
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
}
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
l.acceptNext(2)
case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \uXXXX - unicode (U+XXXX)
l.acceptNext(6)
case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
l.acceptNext(10)
case l.upcoming(backslash):
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.errorf("Invalid escape sequence in basic string")
case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next())
default:
l.acceptNext()
l.acceptNext(1)
}
}
}

View File

@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
}
// TestBasicStringWithInvalidEscapeSequence runs basic strings that contain a
// malformed escape through the lexer state test runner: an unknown \x escape,
// and \u / \U escapes that are too short or end in a non-hex character.
// Every row passed to runStatesTs carries the error text
// "Invalid escape sequence in basic string".
// NOTE(review): statesT is declared outside this view; its columns are
// presumably name, input, expected items and expected error - confirm.
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
runStatesT(t, statesT{
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
runStatesTs(t, []statesT{
{"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
})
}

View File

@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
}
}
// stringbufT is the row type of the test table that TestAsLiteralString
// ranges over.
// NOTE(review): field meanings are inferred from their names - confirm
// against the test bodies, which are outside this view.
type stringbufT struct {
name string // presumably a label identifying the test case
in string // presumably the text fed to the StringBuffer
out string // presumably the expected resulting string
isSuccessCase bool // presumably whether the conversion is expected to succeed
}
// OK and FAIL are named boolean constants.
// NOTE(review): presumably used as readable values for
// stringbufT.isSuccessCase in the test tables - confirm at the use sites.
const (
OK bool = true
FAIL bool = false
)
func TestAsLiteralString(t *testing.T) {
b := lexer.StringBuffer{}
for _, c := range []stringbufT{
@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
}
}
}
// stringbufT is the row type of the test table that TestAsLiteralString
// ranges over.
// NOTE(review): field meanings are inferred from their names - confirm
// against the test bodies, which are outside this view.
type stringbufT struct {
name string // presumably a label identifying the test case
in string // presumably the text fed to the StringBuffer
out string // presumably the expected resulting string
isSuccessCase bool // presumably whether the conversion is expected to succeed
}
// OK and FAIL are named boolean constants.
// NOTE(review): presumably used as readable values for
// stringbufT.isSuccessCase in the test tables - confirm at the use sites.
const (
OK bool = true
FAIL bool = false
)