Make short and long UTF8 escape sequences work in strings.

This commit is contained in:
Maurice Makaay 2019-05-16 16:17:23 +00:00
parent cbc4f04179
commit dc47ac3b71
4 changed files with 57 additions and 53 deletions

View File

@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
return peeked, true return peeked, true
} }
// acceptNext adds the next rune from the input to the string buffer. // acceptNext adds the specified amount of runes from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data), // If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
// then false is returned. func (l *Lexer) acceptNext(count int) bool {
func (l *Lexer) acceptNext() bool { for i := 0; i < count; i++ {
r := l.next() r := l.next()
if r == endOfFile || r == utf8.RuneError { if r == endOfFile || r == utf8.RuneError {
return false return false
}
l.buffer.WriteRune(r)
} }
l.buffer.WriteRune(r)
return true return true
} }
@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
return false return false
} }
func (l *Lexer) upcoming(runes string) bool { func (l *Lexer) upcoming(runes ...string) bool {
if l.accept(runes) { if peeked, ok := l.peekMulti(len(runes)); ok {
l.backup() for i, r := range runes {
if strings.IndexRune(r, peeked[i]) < 0 {
return false
}
}
return true return true
} }
return false return false

View File

@ -13,6 +13,7 @@ const (
lower string = "abcdefghijklmnopqrstuvwxyz" lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789" digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "." dot string = "."
underscore string = "_" underscore string = "_"
dash string = "-" dash string = "-"
@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
l.emitTrimmedLiteral(ItemComment) l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair return stateKeyValuePair
default: default:
if !l.acceptNext() { if !l.acceptNext(1) {
return nil return nil
} }
} }
@ -146,40 +147,34 @@ func stateParseBasicString(l *Lexer) stateFn {
switch { switch {
case l.atEndOfFile(): case l.atEndOfFile():
return l.unexpectedEndOfFile("basic string token") return l.unexpectedEndOfFile("basic string token")
case l.accept(doubleQuote): case l.skip(doubleQuote):
return l.popState() return l.popState()
case l.accept(backslash): case l.upcoming(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence. // For convenience, some popular characters have a compact escape sequence.
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms. // \b - backspace (U+0008)
// The escape codes must be valid Unicode scalar values. // \t - tab (U+0009)
switch { // \n - linefeed (U+000A)
case l.upcoming(escapeChars): // \f - form feed (U+000C)
// \b - backspace (U+0008) // \r - carriage return (U+000D)
// \t - tab (U+0009) // \" - quote (U+0022)
// \n - linefeed (U+000A) // \\ - backslash (U+005C)
// \f - form feed (U+000C) l.acceptNext(2)
// \r - carriage return (U+000D) case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \" - quote (U+0022) // \uXXXX - unicode (U+XXXX)
// \\ - backslash (U+005C) l.acceptNext(6)
l.buffer.WriteRune('\\') case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
l.buffer.WriteRune(l.next()) // \UXXXXXXXX - unicode (U+XXXXXXXX)
case l.upcoming(shortUtf8Escape): l.acceptNext(10)
// \uXXXX - unicode (U+XXXX) case l.upcoming(backslash):
return l.errorf("Not yet implemented: short utf8") // All other escape sequences not listed above are reserved and,
case l.upcoming(longUtf8Escape): // if used, TOML should produce an error.
// \UXXXXXXXX - unicode (U+XXXXXXXX) return l.errorf("Invalid escape sequence in basic string")
return l.errorf("Not yet implemented: long utf8")
default:
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
}
case l.upcoming(invalidBasicStringCharacters): case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped: // Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next()) return l.errorf("Invalid character in basic string: %q", l.next())
default: default:
l.acceptNext() l.acceptNext(1)
} }
} }
} }

View File

@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
} }
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) { func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
runStatesT(t, statesT{ runStatesTs(t, []statesT{
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`, {"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
}) })
} }

View File

@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
} }
} }
// stringbufT is the element type of the test tables ranged over in the
// StringBuffer tests of this file (e.g. TestAsLiteralString).
type stringbufT struct {
	// Presumably: a label for the case, the input fed to the buffer and
	// the expected result - confirm against the test bodies.
	name, in, out string
	// Presumably whether the case is expected to succeed - confirm against
	// the test bodies.
	isSuccessCase bool
}
// OK and FAIL are typed boolean aliases for true and false, so that test
// tables read as "OK" / "FAIL" instead of bare booleans.
// NOTE(review): presumably meant for the isSuccessCase field of stringbufT;
// confirm against the test tables.
const OK, FAIL bool = true, false
func TestAsLiteralString(t *testing.T) { func TestAsLiteralString(t *testing.T) {
b := lexer.StringBuffer{} b := lexer.StringBuffer{}
for _, c := range []stringbufT{ for _, c := range []stringbufT{
@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
} }
} }
} }
// stringbufT is the element type of the test tables ranged over in the
// StringBuffer tests of this file (e.g. TestAsLiteralString).
type stringbufT struct {
	// Presumably: a label for the case, the input fed to the buffer and
	// the expected result - confirm against the test bodies.
	name, in, out string
	// Presumably whether the case is expected to succeed - confirm against
	// the test bodies.
	isSuccessCase bool
}
// OK and FAIL are typed boolean aliases for true and false, so that test
// tables read as "OK" / "FAIL" instead of bare booleans.
// NOTE(review): presumably meant for the isSuccessCase field of stringbufT;
// confirm against the test tables.
const OK, FAIL bool = true, false