Make short and long UTF8 escape sequences work in strings.
This commit is contained in:
parent
cbc4f04179
commit
dc47ac3b71
|
@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
|
|||
return peeked, true
|
||||
}
|
||||
|
||||
// acceptNext adds the next rune from the input to the string buffer.
|
||||
// If no rune could be read (end of file or invalid UTF8 data),
|
||||
// then false is returned.
|
||||
func (l *Lexer) acceptNext() bool {
|
||||
// acceptNext adds the specified amount of runes from the input to the string buffer.
|
||||
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
|
||||
func (l *Lexer) acceptNext(count int) bool {
|
||||
for i := 0; i < count; i++ {
|
||||
r := l.next()
|
||||
if r == endOfFile || r == utf8.RuneError {
|
||||
return false
|
||||
}
|
||||
l.buffer.WriteRune(r)
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
|
@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
func (l *Lexer) upcoming(runes string) bool {
|
||||
if l.accept(runes) {
|
||||
l.backup()
|
||||
func (l *Lexer) upcoming(runes ...string) bool {
|
||||
if peeked, ok := l.peekMulti(len(runes)); ok {
|
||||
for i, r := range runes {
|
||||
if strings.IndexRune(r, peeked[i]) < 0 {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
return false
|
||||
|
|
|
@ -13,6 +13,7 @@ const (
|
|||
lower string = "abcdefghijklmnopqrstuvwxyz"
|
||||
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
digits string = "0123456789"
|
||||
hex string = digits + "abcdefABCDEF"
|
||||
dot string = "."
|
||||
underscore string = "_"
|
||||
dash string = "-"
|
||||
|
@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
|
|||
l.emitTrimmedLiteral(ItemComment)
|
||||
return stateKeyValuePair
|
||||
default:
|
||||
if !l.acceptNext() {
|
||||
if !l.acceptNext(1) {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -146,14 +147,10 @@ func stateParseBasicString(l *Lexer) stateFn {
|
|||
switch {
|
||||
case l.atEndOfFile():
|
||||
return l.unexpectedEndOfFile("basic string token")
|
||||
case l.accept(doubleQuote):
|
||||
case l.skip(doubleQuote):
|
||||
return l.popState()
|
||||
case l.accept(backslash):
|
||||
case l.upcoming(backslash, escapeChars):
|
||||
// For convenience, some popular characters have a compact escape sequence.
|
||||
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
|
||||
// The escape codes must be valid Unicode scalar values.
|
||||
switch {
|
||||
case l.upcoming(escapeChars):
|
||||
// \b - backspace (U+0008)
|
||||
// \t - tab (U+0009)
|
||||
// \n - linefeed (U+000A)
|
||||
|
@ -161,25 +158,23 @@ func stateParseBasicString(l *Lexer) stateFn {
|
|||
// \r - carriage return (U+000D)
|
||||
// \" - quote (U+0022)
|
||||
// \\ - backslash (U+005C)
|
||||
l.buffer.WriteRune('\\')
|
||||
l.buffer.WriteRune(l.next())
|
||||
case l.upcoming(shortUtf8Escape):
|
||||
l.acceptNext(2)
|
||||
case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
|
||||
// \uXXXX - unicode (U+XXXX)
|
||||
return l.errorf("Not yet implemented: short utf8")
|
||||
case l.upcoming(longUtf8Escape):
|
||||
l.acceptNext(6)
|
||||
case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
|
||||
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
||||
return l.errorf("Not yet implemented: long utf8")
|
||||
default:
|
||||
l.acceptNext(10)
|
||||
case l.upcoming(backslash):
|
||||
// All other escape sequences not listed above are reserved and,
|
||||
// if used, TOML should produce an error.
|
||||
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
|
||||
}
|
||||
return l.errorf("Invalid escape sequence in basic string")
|
||||
case l.upcoming(invalidBasicStringCharacters):
|
||||
// Any Unicode character may be used except those that must be escaped:
|
||||
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
||||
return l.errorf("Invalid character in basic string: %q", l.next())
|
||||
default:
|
||||
l.acceptNext()
|
||||
l.acceptNext(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
|
||||
runStatesT(t, statesT{
|
||||
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
|
||||
runStatesTs(t, []statesT{
|
||||
{"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||
{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||
{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||
{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||
{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
type stringbufT struct {
|
||||
name string
|
||||
in string
|
||||
out string
|
||||
isSuccessCase bool
|
||||
}
|
||||
|
||||
const (
|
||||
OK bool = true
|
||||
FAIL bool = false
|
||||
)
|
||||
|
||||
func TestAsLiteralString(t *testing.T) {
|
||||
b := lexer.StringBuffer{}
|
||||
for _, c := range []stringbufT{
|
||||
|
@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
type stringbufT struct {
|
||||
name string
|
||||
in string
|
||||
out string
|
||||
isSuccessCase bool
|
||||
}
|
||||
|
||||
const (
|
||||
OK bool = true
|
||||
FAIL bool = false
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue