|
|
|
@ -1,12 +1,10 @@
|
|
|
|
|
package parse
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"fmt"
|
|
|
|
|
"strconv"
|
|
|
|
|
"strings"
|
|
|
|
|
"unicode/utf8"
|
|
|
|
|
|
|
|
|
|
"git.makaay.nl/mauricem/go-parsekit/parse"
|
|
|
|
|
"git.makaay.nl/mauricem/go-parsekit/tokenize"
|
|
|
|
|
"git.makaay.nl/mauricem/go-toml/ast"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
@ -30,11 +28,6 @@ var (
|
|
|
|
|
// Opening and closing character for literal strings.
|
|
|
|
|
literalStringDelimiter = a.SingleQuote
|
|
|
|
|
|
|
|
|
|
// Control characters as defined by TOML (U+0000 to U+001F, U+007F)
|
|
|
|
|
|
|
|
|
|
isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F }
|
|
|
|
|
controlCharacter = a.ByteByCallback(isControlCharacter)
|
|
|
|
|
|
|
|
|
|
// For convenience, some popular characters have a compact escape sequence.
|
|
|
|
|
//
|
|
|
|
|
// \b - backspace (U+0008)
|
|
|
|
@ -96,36 +89,202 @@ func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
|
|
|
|
|
// "All other escape sequences [..] are reserved and, if used, TOML should
|
|
|
|
|
// produce an error."
|
|
|
|
|
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
|
|
|
|
|
if !p.Skip(a.DoubleQuote) {
|
|
|
|
|
p.Expected(`opening quotation marks`)
|
|
|
|
|
if !p.Accept(basicStringHandler) {
|
|
|
|
|
return "", false
|
|
|
|
|
}
|
|
|
|
|
sb := &strings.Builder{}
|
|
|
|
|
return p.Result.String(), true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// stringTokenizerState identifies the current state of the byte-oriented
// state machines that tokenize the TOML string flavors in this file.
type stringTokenizerState int

const (
	// strStart expects the first opening quote. It is the zero value, so a
	// freshly declared state variable starts here.
	strStart stringTokenizerState = iota
	// strStart2 expects the second opening quote of a multi-line string.
	strStart2
	// strStart3 expects the third opening quote of a multi-line string.
	strStart3
	// strStart4 directly follows the opening delimiter of a multi-line
	// string; a newline at this position is skipped.
	strStart4
	// strChar handles regular string contents.
	strChar
	// strEscape handles the byte that follows a backslash.
	strEscape
	// strEscapeUnicode collects the hex digits of a \uXXXX or \UXXXXXXXX
	// escape sequence.
	strEscapeUnicode
	// strEscapeConcatWs1 skips blanks between a backslash and the end of the
	// line in a multi-line basic string.
	strEscapeConcatWs1
	// strEscapeConcatCRLF expects the \n that completes a \r\n line ending
	// while skipping whitespace after a line-ending backslash.
	strEscapeConcatCRLF
	// strEscapeConcatWs2 skips blanks and newlines that follow a line-ending
	// backslash, up to the next other byte.
	strEscapeConcatWs2
	// strCRLF expects the \n that must follow a \r in multi-line contents.
	strCRLF
	// strUTF8 expects a continuation byte of a multi-byte UTF8 sequence.
	strUTF8
	// strEnd2 follows one quote in multi-line contents and checks for a
	// second one.
	strEnd2
	// strEnd3 follows two quotes in multi-line contents; a third one closes
	// the string.
	strEnd3
)
|
|
|
|
|
|
|
|
|
|
// Bit masks that select the payload bits of a UTF8 encoded byte.
const (
	lowest6bits = 0x3F // 0011 1111, payload of a continuation byte (10xxxxxx)
	lowest5bits = 0x1F // 0001 1111, payload of a 2 byte lead byte (110xxxxx)
	lowest4bits = 0x0F // 0000 1111, payload of a 3 byte lead byte (1110xxxx)
	lowest3bits = 0x07 // 0000 0111, payload of a 4 byte lead byte (11110xxx)
)
|
|
|
|
|
|
|
|
|
|
func basicStringHandler(tokenAPI *tokenize.API) bool {
|
|
|
|
|
var state stringTokenizerState
|
|
|
|
|
in := tokenAPI.Input
|
|
|
|
|
out := tokenAPI.Output
|
|
|
|
|
|
|
|
|
|
unicodeReqLen := 0
|
|
|
|
|
unicodeLen := 0
|
|
|
|
|
unicodeHex := make([]byte, 8)
|
|
|
|
|
|
|
|
|
|
utf8ReqLen := 0
|
|
|
|
|
utf8Len := 0
|
|
|
|
|
utf8Rune := rune(0)
|
|
|
|
|
utf8Bytes := make([]byte, 4)
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
bs, _ := in.Byte.PeekBuffered(0)
|
|
|
|
|
bslen := len(bs)
|
|
|
|
|
if bslen == 0 {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
for i := 0; i < bslen; i++ {
|
|
|
|
|
b := bs[i]
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart:
|
|
|
|
|
if b != '"' {
|
|
|
|
|
// No opening quotes found.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strChar
|
|
|
|
|
case strChar:
|
|
|
|
|
switch {
|
|
|
|
|
case p.Peek(controlCharacter):
|
|
|
|
|
p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Byte(0))
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Accept(validEscape):
|
|
|
|
|
if !appendEscapedRune(p, sb) {
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
|
|
|
// Control characters as defined by the TOML specification.
|
|
|
|
|
// These must always be escaped.
|
|
|
|
|
// Unescaped control character
|
|
|
|
|
// TODO error reporting instead of full reject
|
|
|
|
|
return false
|
|
|
|
|
case b == '\\':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscape
|
|
|
|
|
continue
|
|
|
|
|
case b == '"':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
case p.Peek(a.Backslash):
|
|
|
|
|
p.SetError("invalid escape sequence")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Skip(basicStringDelimiter):
|
|
|
|
|
return sb.String(), true
|
|
|
|
|
case p.Peek(a.InvalidRune):
|
|
|
|
|
p.SetError("invalid UTF8 rune")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Accept(a.ValidRune):
|
|
|
|
|
sb.WriteString(p.Result.String())
|
|
|
|
|
switch b >> 4 {
|
|
|
|
|
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 2
|
|
|
|
|
utf8Rune = rune((b & lowest5bits) << 6)
|
|
|
|
|
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 3
|
|
|
|
|
utf8Rune = rune((b & lowest4bits) << 6)
|
|
|
|
|
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 4
|
|
|
|
|
utf8Rune = rune((b & lowest3bits) << 6)
|
|
|
|
|
default: // Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[0] = b
|
|
|
|
|
utf8Len = 1
|
|
|
|
|
state = strUTF8
|
|
|
|
|
case strUTF8:
|
|
|
|
|
// This should be a continuation byte (10xxxxxx)
|
|
|
|
|
if b>>6 != 2 {
|
|
|
|
|
// Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[utf8Len] = b
|
|
|
|
|
utf8Len++
|
|
|
|
|
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
|
|
|
if utf8Len == utf8ReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode character
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
bytes := utf8Bytes[:utf8Len]
|
|
|
|
|
out.AddBytes(bytes...)
|
|
|
|
|
in.Byte.MoveCursorMulti(bytes...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
case strEscape:
|
|
|
|
|
state = strChar
|
|
|
|
|
if escaped, ok := getEscapedChar(b); ok {
|
|
|
|
|
out.AddByte(escaped)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
switch b {
|
|
|
|
|
case 'u', 'U':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
unicodeReqLen = 4
|
|
|
|
|
if b == 'u' {
|
|
|
|
|
unicodeReqLen = 4
|
|
|
|
|
} else {
|
|
|
|
|
unicodeReqLen = 8
|
|
|
|
|
}
|
|
|
|
|
unicodeLen = 0
|
|
|
|
|
utf8Rune = 0
|
|
|
|
|
state = strEscapeUnicode
|
|
|
|
|
default:
|
|
|
|
|
p.Expected(`closing quotation marks`)
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
// Invalid escape sequence used.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
case strEscapeUnicode:
|
|
|
|
|
value, ok := getHexValueForChar(b)
|
|
|
|
|
if !ok {
|
|
|
|
|
// Invalid unicode escape sequence used.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Rune = utf8Rune<<4 + rune(value)
|
|
|
|
|
unicodeHex[unicodeLen] = b
|
|
|
|
|
unicodeLen++
|
|
|
|
|
if unicodeLen == unicodeReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode escape
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
|
|
|
|
|
w := utf8.EncodeRune(utf8Bytes, utf8Rune)
|
|
|
|
|
out.AddBytes(utf8Bytes[:w]...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// getHexValueForChar returns the numeric value (0-15) of the hexadecimal
// digit b. Both lower case (a-f) and upper case (A-F) digits are accepted.
//
// The boolean result is false, and the value 0, when b is not a hexadecimal
// digit.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}
|
|
|
|
|
|
|
|
|
|
// getEscapedChar translates the byte that follows a backslash in a basic
// string into the byte that the escape sequence stands for.
//
// The compact escapes \b, \t, \n, \f, \r, \" and \\ are supported. For any
// other byte, the boolean result is false and the returned byte is 0.
func getEscapedChar(b byte) (byte, bool) {
	switch b {
	case 'b':
		return '\b', true
	case 't':
		return '\t', true
	case 'n':
		return '\n', true
	case 'f':
		return '\f', true
	case 'r':
		return '\r', true
	case '"':
		return '"', true
	case '\\':
		return '\\', true
	}
	return 0, false
}
|
|
|
|
|
|
|
|
|
|
// Specific handling of input for literal strings.
|
|
|
|
|
//
|
|
|
|
@ -135,28 +294,88 @@ func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
|
|
|
|
|
//
|
|
|
|
|
// • Control characters other than tab are not permitted in a literal string.
|
|
|
|
|
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
|
|
|
|
|
if !p.Skip(a.SingleQuote) {
|
|
|
|
|
p.Expected("opening single quote")
|
|
|
|
|
if !p.Accept(literalStringHandler) {
|
|
|
|
|
return "", false
|
|
|
|
|
}
|
|
|
|
|
sb := &strings.Builder{}
|
|
|
|
|
return p.Result.String(), true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func literalStringHandler(tokenAPI *tokenize.API) bool {
|
|
|
|
|
var state stringTokenizerState
|
|
|
|
|
in := tokenAPI.Input
|
|
|
|
|
out := tokenAPI.Output
|
|
|
|
|
|
|
|
|
|
utf8ReqLen := 0
|
|
|
|
|
utf8Len := 0
|
|
|
|
|
utf8Rune := rune(0)
|
|
|
|
|
utf8Bytes := [4]byte{}
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
bs, _ := tokenAPI.Input.Byte.PeekBuffered(0)
|
|
|
|
|
bslen := len(bs)
|
|
|
|
|
if bslen == 0 {
|
|
|
|
|
// Unexpected end of file.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
for i := 0; i < bslen; i++ {
|
|
|
|
|
b := bs[i]
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart:
|
|
|
|
|
if b != '\'' {
|
|
|
|
|
// No opening quote found.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strChar
|
|
|
|
|
case strChar:
|
|
|
|
|
switch {
|
|
|
|
|
case p.Skip(literalStringDelimiter):
|
|
|
|
|
return sb.String(), true
|
|
|
|
|
case p.Skip(a.Tab):
|
|
|
|
|
sb.WriteString("\t")
|
|
|
|
|
case p.Peek(controlCharacter):
|
|
|
|
|
p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Byte(0))
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Peek(a.InvalidRune):
|
|
|
|
|
p.SetError("invalid UTF8 rune")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Accept(a.ValidRune):
|
|
|
|
|
sb.WriteString(p.Result.String())
|
|
|
|
|
default:
|
|
|
|
|
p.Expected("closing single quote")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case (b >= 0x00 && b < 0x09) || (b > 0x09 && b <= 0x1F) || b == 0x7F:
|
|
|
|
|
// Unescaped control character
|
|
|
|
|
return false
|
|
|
|
|
case b == '\'':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
switch b >> 4 {
|
|
|
|
|
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 2
|
|
|
|
|
utf8Rune = rune((b & lowest5bits) << 6)
|
|
|
|
|
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 3
|
|
|
|
|
utf8Rune = rune((b & lowest4bits) << 6)
|
|
|
|
|
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 4
|
|
|
|
|
utf8Rune = rune((b & lowest3bits) << 6)
|
|
|
|
|
default: // Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[0] = b
|
|
|
|
|
utf8Len = 1
|
|
|
|
|
state = strUTF8
|
|
|
|
|
case strUTF8:
|
|
|
|
|
// This should be a continuation byte (10xxxxxx)
|
|
|
|
|
if b>>6 != 2 {
|
|
|
|
|
// Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[utf8Len] = b
|
|
|
|
|
utf8Len++
|
|
|
|
|
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
|
|
|
if utf8Len == utf8ReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode character
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
bytes := utf8Bytes[:utf8Len]
|
|
|
|
|
out.AddBytes(bytes...)
|
|
|
|
|
in.Byte.MoveCursorMulti(bytes...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@ -185,71 +404,258 @@ func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
|
|
|
|
|
// a \, it will be trimmed along with all whitespace (including newlines) up to
|
|
|
|
|
// the next non-whitespace character or closing delimiter.
|
|
|
|
|
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
|
|
|
|
|
if !p.Skip(openingMultiLineBasicString) {
|
|
|
|
|
p.Expected("opening three quotation marks")
|
|
|
|
|
if !p.Accept(multiLineBasicStringHandler) {
|
|
|
|
|
return "", false
|
|
|
|
|
}
|
|
|
|
|
sb := &strings.Builder{}
|
|
|
|
|
for {
|
|
|
|
|
switch {
|
|
|
|
|
case p.Skip(newline):
|
|
|
|
|
sb.WriteString("\n")
|
|
|
|
|
case p.Peek(controlCharacter):
|
|
|
|
|
p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Byte(0))
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Accept(validEscape):
|
|
|
|
|
if !appendEscapedRune(p, sb) {
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
}
|
|
|
|
|
case p.Skip(lineEndingBackslash):
|
|
|
|
|
// NOOP
|
|
|
|
|
case p.Peek(a.Backslash):
|
|
|
|
|
p.SetError("invalid escape sequence")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Skip(closingMultiLineBasicString):
|
|
|
|
|
return sb.String(), true
|
|
|
|
|
case p.Accept(a.ValidRune):
|
|
|
|
|
sb.WriteString(p.Result.String())
|
|
|
|
|
case p.Peek(a.InvalidRune):
|
|
|
|
|
p.SetError("invalid UTF8 rune")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
default:
|
|
|
|
|
p.Expected("closing three quotation marks")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return p.Result.String(), true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
|
|
|
|
|
s := p.Result.String()
|
|
|
|
|
switch s {
|
|
|
|
|
case `\b`:
|
|
|
|
|
sb.WriteRune('\b')
|
|
|
|
|
case `\t`:
|
|
|
|
|
sb.WriteRune('\t')
|
|
|
|
|
case `\n`:
|
|
|
|
|
sb.WriteRune('\n')
|
|
|
|
|
case `\f`:
|
|
|
|
|
sb.WriteRune('\f')
|
|
|
|
|
case `\r`:
|
|
|
|
|
sb.WriteRune('\r')
|
|
|
|
|
case `\"`:
|
|
|
|
|
sb.WriteRune('"')
|
|
|
|
|
case `\\`:
|
|
|
|
|
sb.WriteRune('\\')
|
|
|
|
|
default:
|
|
|
|
|
// UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX.
|
|
|
|
|
hex := s[2:]
|
|
|
|
|
val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser
|
|
|
|
|
r := rune(val)
|
|
|
|
|
if !utf8.ValidRune(r) {
|
|
|
|
|
p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", s))
|
|
|
|
|
func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
|
|
|
|
|
var state stringTokenizerState
|
|
|
|
|
in := tokenAPI.Input
|
|
|
|
|
out := tokenAPI.Output
|
|
|
|
|
|
|
|
|
|
unicodeReqLen := 0
|
|
|
|
|
unicodeLen := 0
|
|
|
|
|
unicodeHex := make([]byte, 8)
|
|
|
|
|
|
|
|
|
|
utf8ReqLen := 0
|
|
|
|
|
utf8Len := 0
|
|
|
|
|
utf8Rune := rune(0)
|
|
|
|
|
utf8Bytes := make([]byte, 4)
|
|
|
|
|
|
|
|
|
|
crlf := false
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
bs, _ := in.Byte.PeekBuffered(0)
|
|
|
|
|
bslen := len(bs)
|
|
|
|
|
if bslen == 0 {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
sb.WriteRune(r)
|
|
|
|
|
for i := 0; i < bslen; i++ {
|
|
|
|
|
b := bs[i]
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart, strStart2, strStart3:
|
|
|
|
|
if b != '"' {
|
|
|
|
|
// No triple opening quotes found.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart:
|
|
|
|
|
state = strStart2
|
|
|
|
|
case strStart2:
|
|
|
|
|
state = strStart3
|
|
|
|
|
case strStart3:
|
|
|
|
|
state = strStart4
|
|
|
|
|
}
|
|
|
|
|
case strStart4:
|
|
|
|
|
if !crlf && b == '\r' {
|
|
|
|
|
crlf = true
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if b == '\n' {
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strChar
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if crlf {
|
|
|
|
|
// Lonely \r without \n.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
state = strChar
|
|
|
|
|
fallthrough
|
|
|
|
|
case strChar:
|
|
|
|
|
switch {
|
|
|
|
|
case b == '\r':
|
|
|
|
|
state = strCRLF
|
|
|
|
|
continue
|
|
|
|
|
case b == '\n':
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
|
|
|
// Unescaped control character
|
|
|
|
|
// TODO error reporting instead of full reject
|
|
|
|
|
return false
|
|
|
|
|
case b == '\\':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscape
|
|
|
|
|
continue
|
|
|
|
|
case b == '"':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEnd2
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
switch b >> 4 {
|
|
|
|
|
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 2
|
|
|
|
|
utf8Rune = rune((b & lowest5bits) << 6)
|
|
|
|
|
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 3
|
|
|
|
|
utf8Rune = rune((b & lowest4bits) << 6)
|
|
|
|
|
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 4
|
|
|
|
|
utf8Rune = rune((b & lowest3bits) << 6)
|
|
|
|
|
default: // Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[0] = b
|
|
|
|
|
utf8Len = 1
|
|
|
|
|
state = strUTF8
|
|
|
|
|
case strUTF8:
|
|
|
|
|
// This should be a continuation byte (10xxxxxx)
|
|
|
|
|
if b>>6 != 2 {
|
|
|
|
|
// Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[utf8Len] = b
|
|
|
|
|
utf8Len++
|
|
|
|
|
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
|
|
|
if utf8Len == utf8ReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode character
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
bytes := utf8Bytes[:utf8Len]
|
|
|
|
|
out.AddBytes(bytes...)
|
|
|
|
|
in.Byte.MoveCursorMulti(bytes...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
case strCRLF:
|
|
|
|
|
if b == '\n' {
|
|
|
|
|
in.Byte.MoveCursorMulti('\r', b)
|
|
|
|
|
out.AddByte('\n')
|
|
|
|
|
state = strChar
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// Lonely \r, should have been escaped.
|
|
|
|
|
return false
|
|
|
|
|
case strEscape:
|
|
|
|
|
state = strChar
|
|
|
|
|
if escaped, ok := getEscapedChar(b); ok {
|
|
|
|
|
out.AddByte(escaped)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
switch b {
|
|
|
|
|
case ' ', '\t':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatWs1
|
|
|
|
|
continue
|
|
|
|
|
case '\r':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatCRLF
|
|
|
|
|
continue
|
|
|
|
|
case '\n':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatWs2
|
|
|
|
|
continue
|
|
|
|
|
case 'u', 'U':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
unicodeReqLen = 4
|
|
|
|
|
if b == 'u' {
|
|
|
|
|
unicodeReqLen = 4
|
|
|
|
|
} else {
|
|
|
|
|
unicodeReqLen = 8
|
|
|
|
|
}
|
|
|
|
|
unicodeLen = 0
|
|
|
|
|
utf8Rune = 0
|
|
|
|
|
state = strEscapeUnicode
|
|
|
|
|
default:
|
|
|
|
|
// Invalid escape sequence used.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
case strEscapeConcatWs1:
|
|
|
|
|
switch b {
|
|
|
|
|
case ' ', '\t':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case '\r':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatCRLF
|
|
|
|
|
continue
|
|
|
|
|
case '\n':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatWs2
|
|
|
|
|
continue
|
|
|
|
|
default:
|
|
|
|
|
// Invalid line concatenation
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
case strEscapeConcatCRLF:
|
|
|
|
|
switch b {
|
|
|
|
|
case '\n':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatWs2
|
|
|
|
|
continue
|
|
|
|
|
default:
|
|
|
|
|
// Invalid line concatenation
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
case strEscapeConcatWs2:
|
|
|
|
|
switch b {
|
|
|
|
|
case ' ', '\t':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case '\r':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatCRLF
|
|
|
|
|
continue
|
|
|
|
|
case '\n':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEscapeConcatWs2
|
|
|
|
|
continue
|
|
|
|
|
default:
|
|
|
|
|
i--
|
|
|
|
|
state = strChar
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
case strEscapeUnicode:
|
|
|
|
|
value, ok := getHexValueForChar(b)
|
|
|
|
|
if !ok {
|
|
|
|
|
// Invalid unicode escape sequence used.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Rune = utf8Rune<<4 + rune(value)
|
|
|
|
|
unicodeHex[unicodeLen] = b
|
|
|
|
|
unicodeLen++
|
|
|
|
|
if unicodeLen == unicodeReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode escape
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
|
|
|
|
|
w := utf8.EncodeRune(utf8Bytes, utf8Rune)
|
|
|
|
|
out.AddBytes(utf8Bytes[:w]...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
case strEnd2:
|
|
|
|
|
if b == '"' {
|
|
|
|
|
state = strEnd3
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
} else {
|
|
|
|
|
state = strChar
|
|
|
|
|
out.AddByte('"')
|
|
|
|
|
i--
|
|
|
|
|
}
|
|
|
|
|
case strEnd3:
|
|
|
|
|
if b == '"' {
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
state = strChar
|
|
|
|
|
out.AddBytes('"', '"')
|
|
|
|
|
i--
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Specific handling of input for multi-line literal strings.
|
|
|
|
|
//
|
|
|
|
@ -265,30 +671,148 @@ func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
|
|
|
|
|
//
|
|
|
|
|
// • Control characters other than tab and newline are not permitted in a multi-line literal string.
|
|
|
|
|
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
|
|
|
|
|
if !p.Skip(openingMultiLineLiteralString) {
|
|
|
|
|
p.Expected("opening three single quotes")
|
|
|
|
|
if !p.Accept(multiLineLiteralStringHandler) {
|
|
|
|
|
return "", false
|
|
|
|
|
}
|
|
|
|
|
sb := &strings.Builder{}
|
|
|
|
|
return p.Result.String(), true
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
|
|
|
|
|
var state stringTokenizerState
|
|
|
|
|
in := tokenAPI.Input
|
|
|
|
|
out := tokenAPI.Output
|
|
|
|
|
|
|
|
|
|
utf8ReqLen := 0
|
|
|
|
|
utf8Len := 0
|
|
|
|
|
utf8Rune := rune(0)
|
|
|
|
|
utf8Bytes := make([]byte, 4)
|
|
|
|
|
|
|
|
|
|
crlf := false
|
|
|
|
|
|
|
|
|
|
for {
|
|
|
|
|
bs, _ := in.Byte.PeekBuffered(0)
|
|
|
|
|
bslen := len(bs)
|
|
|
|
|
if bslen == 0 {
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
for i := 0; i < bslen; i++ {
|
|
|
|
|
b := bs[i]
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart, strStart2, strStart3:
|
|
|
|
|
if b != '\'' {
|
|
|
|
|
// No triple opening quotes found.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
switch state {
|
|
|
|
|
case strStart:
|
|
|
|
|
state = strStart2
|
|
|
|
|
case strStart2:
|
|
|
|
|
state = strStart3
|
|
|
|
|
case strStart3:
|
|
|
|
|
state = strStart4
|
|
|
|
|
}
|
|
|
|
|
case strStart4:
|
|
|
|
|
if !crlf && b == '\r' {
|
|
|
|
|
crlf = true
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if b == '\n' {
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strChar
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if crlf {
|
|
|
|
|
// Lonely \r without \n.
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
state = strChar
|
|
|
|
|
fallthrough
|
|
|
|
|
case strChar:
|
|
|
|
|
switch {
|
|
|
|
|
case p.Skip(closingMultiLineLiteralString):
|
|
|
|
|
return sb.String(), true
|
|
|
|
|
case p.Skip(a.Tab):
|
|
|
|
|
sb.WriteString("\t")
|
|
|
|
|
case p.Skip(newline):
|
|
|
|
|
sb.WriteString("\n")
|
|
|
|
|
case p.Peek(controlCharacter):
|
|
|
|
|
p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Byte(0))
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case p.Accept(a.ValidRune):
|
|
|
|
|
sb.WriteString(p.Result.String())
|
|
|
|
|
case p.Peek(a.InvalidRune):
|
|
|
|
|
p.SetError("invalid UTF8 rune")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
default:
|
|
|
|
|
p.Expected("closing three single quotes")
|
|
|
|
|
return sb.String(), false
|
|
|
|
|
case b == '\r':
|
|
|
|
|
state = strCRLF
|
|
|
|
|
continue
|
|
|
|
|
case b == '\n' || b == '\t':
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
|
|
|
// Unescaped control character
|
|
|
|
|
// TODO error reporting instead of full reject
|
|
|
|
|
return false
|
|
|
|
|
case b == '\'':
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
state = strEnd2
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
switch b >> 4 {
|
|
|
|
|
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
|
|
|
out.AddByte(b)
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
continue
|
|
|
|
|
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 2
|
|
|
|
|
utf8Rune = rune((b & lowest5bits) << 6)
|
|
|
|
|
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 3
|
|
|
|
|
utf8Rune = rune((b & lowest4bits) << 6)
|
|
|
|
|
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
|
|
|
utf8ReqLen = 4
|
|
|
|
|
utf8Rune = rune((b & lowest3bits) << 6)
|
|
|
|
|
default: // Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[0] = b
|
|
|
|
|
utf8Len = 1
|
|
|
|
|
state = strUTF8
|
|
|
|
|
case strUTF8:
|
|
|
|
|
// This should be a continuation byte (10xxxxxx)
|
|
|
|
|
if b>>6 != 2 {
|
|
|
|
|
// Invalid UTF8 rune
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
utf8Bytes[utf8Len] = b
|
|
|
|
|
utf8Len++
|
|
|
|
|
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
|
|
|
if utf8Len == utf8ReqLen {
|
|
|
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
|
|
|
// Invalid unicode character
|
|
|
|
|
return false
|
|
|
|
|
}
|
|
|
|
|
bytes := utf8Bytes[:utf8Len]
|
|
|
|
|
out.AddBytes(bytes...)
|
|
|
|
|
in.Byte.MoveCursorMulti(bytes...)
|
|
|
|
|
state = strChar
|
|
|
|
|
}
|
|
|
|
|
case strCRLF:
|
|
|
|
|
if b == '\n' {
|
|
|
|
|
in.Byte.MoveCursorMulti('\r', b)
|
|
|
|
|
out.AddByte('\n')
|
|
|
|
|
state = strChar
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// Lonely \r, should have been escaped.
|
|
|
|
|
return false
|
|
|
|
|
case strEnd2:
|
|
|
|
|
if b == '\'' {
|
|
|
|
|
state = strEnd3
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
} else {
|
|
|
|
|
state = strChar
|
|
|
|
|
out.AddByte('\'')
|
|
|
|
|
i--
|
|
|
|
|
}
|
|
|
|
|
case strEnd3:
|
|
|
|
|
if b == '\'' {
|
|
|
|
|
in.Byte.MoveCursor(b)
|
|
|
|
|
return true
|
|
|
|
|
}
|
|
|
|
|
state = strChar
|
|
|
|
|
out.AddBytes('\'', '\'')
|
|
|
|
|
i--
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|