Make short and long UTF8 escape sequences work in strings.
This commit is contained in:
parent
cbc4f04179
commit
dc47ac3b71
|
@ -175,15 +175,16 @@ func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
|
||||||
return peeked, true
|
return peeked, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// acceptNext adds the next rune from the input to the string buffer.
|
// acceptNext adds the specified number of runes from the input to the string buffer.
|
||||||
// If no rune could be read (end of file or invalid UTF8 data),
|
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
|
||||||
// then false is returned.
|
func (l *Lexer) acceptNext(count int) bool {
|
||||||
func (l *Lexer) acceptNext() bool {
|
for i := 0; i < count; i++ {
|
||||||
r := l.next()
|
r := l.next()
|
||||||
if r == endOfFile || r == utf8.RuneError {
|
if r == endOfFile || r == utf8.RuneError {
|
||||||
return false
|
return false
|
||||||
|
}
|
||||||
|
l.buffer.WriteRune(r)
|
||||||
}
|
}
|
||||||
l.buffer.WriteRune(r)
|
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -262,9 +263,13 @@ func (l *Lexer) accept(runes string) bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *Lexer) upcoming(runes string) bool {
|
func (l *Lexer) upcoming(runes ...string) bool {
|
||||||
if l.accept(runes) {
|
if peeked, ok := l.peekMulti(len(runes)); ok {
|
||||||
l.backup()
|
for i, r := range runes {
|
||||||
|
if strings.IndexRune(r, peeked[i]) < 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
|
|
|
@ -13,6 +13,7 @@ const (
|
||||||
lower string = "abcdefghijklmnopqrstuvwxyz"
|
lower string = "abcdefghijklmnopqrstuvwxyz"
|
||||||
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||||
digits string = "0123456789"
|
digits string = "0123456789"
|
||||||
|
hex string = digits + "abcdefABCDEF"
|
||||||
dot string = "."
|
dot string = "."
|
||||||
underscore string = "_"
|
underscore string = "_"
|
||||||
dash string = "-"
|
dash string = "-"
|
||||||
|
@ -46,7 +47,7 @@ func stateComment(l *Lexer) stateFn {
|
||||||
l.emitTrimmedLiteral(ItemComment)
|
l.emitTrimmedLiteral(ItemComment)
|
||||||
return stateKeyValuePair
|
return stateKeyValuePair
|
||||||
default:
|
default:
|
||||||
if !l.acceptNext() {
|
if !l.acceptNext(1) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -146,40 +147,34 @@ func stateParseBasicString(l *Lexer) stateFn {
|
||||||
switch {
|
switch {
|
||||||
case l.atEndOfFile():
|
case l.atEndOfFile():
|
||||||
return l.unexpectedEndOfFile("basic string token")
|
return l.unexpectedEndOfFile("basic string token")
|
||||||
case l.accept(doubleQuote):
|
case l.skip(doubleQuote):
|
||||||
return l.popState()
|
return l.popState()
|
||||||
case l.accept(backslash):
|
case l.upcoming(backslash, escapeChars):
|
||||||
// For convenience, some popular characters have a compact escape sequence.
|
// For convenience, some popular characters have a compact escape sequence.
|
||||||
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
|
// \b - backspace (U+0008)
|
||||||
// The escape codes must be valid Unicode scalar values.
|
// \t - tab (U+0009)
|
||||||
switch {
|
// \n - linefeed (U+000A)
|
||||||
case l.upcoming(escapeChars):
|
// \f - form feed (U+000C)
|
||||||
// \b - backspace (U+0008)
|
// \r - carriage return (U+000D)
|
||||||
// \t - tab (U+0009)
|
// \" - quote (U+0022)
|
||||||
// \n - linefeed (U+000A)
|
// \\ - backslash (U+005C)
|
||||||
// \f - form feed (U+000C)
|
l.acceptNext(2)
|
||||||
// \r - carriage return (U+000D)
|
case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
|
||||||
// \" - quote (U+0022)
|
// \uXXXX - unicode (U+XXXX)
|
||||||
// \\ - backslash (U+005C)
|
l.acceptNext(6)
|
||||||
l.buffer.WriteRune('\\')
|
case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
|
||||||
l.buffer.WriteRune(l.next())
|
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
||||||
case l.upcoming(shortUtf8Escape):
|
l.acceptNext(10)
|
||||||
// \uXXXX - unicode (U+XXXX)
|
case l.upcoming(backslash):
|
||||||
return l.errorf("Not yet implemented: short utf8")
|
// All other escape sequences not listed above are reserved and,
|
||||||
case l.upcoming(longUtf8Escape):
|
// if used, TOML should produce an error.
|
||||||
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
return l.errorf("Invalid escape sequence in basic string")
|
||||||
return l.errorf("Not yet implemented: long utf8")
|
|
||||||
default:
|
|
||||||
// All other escape sequences not listed above are reserved and,
|
|
||||||
// if used, TOML should produce an error.
|
|
||||||
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
|
|
||||||
}
|
|
||||||
case l.upcoming(invalidBasicStringCharacters):
|
case l.upcoming(invalidBasicStringCharacters):
|
||||||
// Any Unicode character may be used except those that must be escaped:
|
// Any Unicode character may be used except those that must be escaped:
|
||||||
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
|
||||||
return l.errorf("Invalid character in basic string: %q", l.next())
|
return l.errorf("Invalid character in basic string: %q", l.next())
|
||||||
default:
|
default:
|
||||||
l.acceptNext()
|
l.acceptNext(1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -107,8 +107,12 @@ func TestBasicString(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
|
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
|
||||||
runStatesT(t, statesT{
|
runStatesTs(t, []statesT{
|
||||||
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
|
{"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||||
|
{"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||||
|
{"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||||
|
{"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||||
|
{"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,18 +23,6 @@ func TestResetResetsBuffer(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
type stringbufT struct {
|
|
||||||
name string
|
|
||||||
in string
|
|
||||||
out string
|
|
||||||
isSuccessCase bool
|
|
||||||
}
|
|
||||||
|
|
||||||
const (
|
|
||||||
OK bool = true
|
|
||||||
FAIL bool = false
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestAsLiteralString(t *testing.T) {
|
func TestAsLiteralString(t *testing.T) {
|
||||||
b := lexer.StringBuffer{}
|
b := lexer.StringBuffer{}
|
||||||
for _, c := range []stringbufT{
|
for _, c := range []stringbufT{
|
||||||
|
@ -85,3 +73,15 @@ func TestAsInterpretedString(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type stringbufT struct {
|
||||||
|
name string
|
||||||
|
in string
|
||||||
|
out string
|
||||||
|
isSuccessCase bool
|
||||||
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
OK bool = true
|
||||||
|
FAIL bool = false
|
||||||
|
)
|
||||||
|
|
Loading…
Reference in New Issue