954 lines
25 KiB
Go
954 lines
25 KiB
Go
package parse
|
|
|
|
import (
|
|
"unicode/utf8"
|
|
|
|
"git.makaay.nl/mauricem/go-parsekit/parse"
|
|
"git.makaay.nl/mauricem/go-parsekit/tokenize"
|
|
)
|
|
|
|
func (t *parser) parseString(p *parse.API) (string, stringType, bool) {
|
|
if !p.Accept(t.stringHandler) {
|
|
p.Expected("a string value")
|
|
return "", strTypeNone, false
|
|
}
|
|
strType := stringTypeFromFlags(t.strFlags)
|
|
str := p.Result.String()
|
|
return str, strType, true
|
|
}
|
|
|
|
// stringType identifies the notation that was used to write a string value.
type stringType byte

const (
	// strTypeNone is the zero value, returned when no string was parsed.
	strTypeNone stringType = iota
	// strTypeBasic is a basic string ("...").
	strTypeBasic
	// strTypeLiteral is a literal string ('...').
	strTypeLiteral
	// strTypeMultiLineBasic is a multi-line basic string ("""...""").
	strTypeMultiLineBasic
	// strTypeMultiLineLiteral is a multi-line literal string ('''...''').
	strTypeMultiLineLiteral
)
|
|
|
|
func stringTypeFromFlags(flags byte) stringType {
|
|
if flags&strFlagBasic == strFlagBasic {
|
|
if flags&strFlagMultiLine == 0 {
|
|
return strTypeBasic
|
|
}
|
|
return strTypeMultiLineBasic
|
|
}
|
|
if flags&strFlagMultiLine == 0 {
|
|
return strTypeLiteral
|
|
}
|
|
return strTypeMultiLineLiteral
|
|
}
|
|
|
|
// Bit flags that describe the kind of string that is being tokenized and
// what kind of content is acceptable inside it.
const (
	strFlagLiteral      byte = 1 << iota // 1: the string is a literal string
	strFlagBasic                         // 2: the string is a basic string
	strFlagMultiLine                     // 4: the string uses triple delimiters
	strFlagNewlinesOK                    // 8: unescaped newlines are accepted
	strFlagTabsOK                        // 16: unescaped tabs are accepted
	strFlagEscapesOK                     // 32: backslash escapes are interpreted
	strFlagLineConcatOK                  // 64: line ending backslashes are accepted
)
|
|
|
|
func (t *parser) stringHandler(tokenAPI *tokenize.API) bool {
|
|
var state stringTokenizerState
|
|
in := tokenAPI.Input
|
|
out := tokenAPI.Output
|
|
|
|
unicodeReqLen := 0
|
|
unicodeLen := 0
|
|
unicodeHex := make([]byte, 8)
|
|
|
|
utf8ReqLen := 0
|
|
utf8Len := 0
|
|
utf8Rune := rune(0)
|
|
utf8Bytes := make([]byte, 4)
|
|
|
|
flags := byte(0)
|
|
delim := byte(0)
|
|
subState := 0
|
|
|
|
for {
|
|
bs, _ := in.Byte.PeekBuffered(0)
|
|
bslen := len(bs)
|
|
|
|
// End of input reached.
|
|
if bslen == 0 {
|
|
// We might be at the second delimiter of a basic or literal string.
|
|
if state == strStateStart && subState == 2 {
|
|
return true
|
|
}
|
|
// Unexpected end of input.
|
|
return false
|
|
}
|
|
|
|
for i := 0; i < bslen; i++ {
|
|
b := bs[i]
|
|
switch state {
|
|
|
|
// Parse the string opener.
|
|
// There are four ways to express strings: basic, multi-line basic, literal and
|
|
// multi-line literal. Basic strings are surrounded by quotation marks ("...").
|
|
// Literal strings are surrounded by single quotes ('...').
|
|
// Multi-line basic strings are surrounded by three quotation marks on each
|
|
// side and allow newlines ("""..."""). Multi-line literal strings are surrounded
|
|
// by three single quotes on each side and allow newlines as well ('''...''').
|
|
case strStateStart:
|
|
if subState == 0 {
|
|
if b != '"' && b != '\'' {
|
|
// Expected an opener quote here.
|
|
return false
|
|
}
|
|
if b == '\'' {
|
|
flags |= strFlagLiteral | strFlagTabsOK
|
|
} else {
|
|
flags |= strFlagBasic | strFlagEscapesOK
|
|
}
|
|
t.strFlags = flags
|
|
subState = 1
|
|
delim = b
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
}
|
|
if subState == 1 {
|
|
// Not a second quote, so this is the start of
|
|
// single-line string content.
|
|
if b != delim {
|
|
i--
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
in.Byte.MoveCursor(b)
|
|
subState = 2
|
|
continue
|
|
}
|
|
if subState == 2 {
|
|
// Not a third quote, so this is an empty string ('' or "").
|
|
if b != delim {
|
|
return true
|
|
}
|
|
// Third quote, so this is a multi-line string (''' or """).
|
|
flags |= strFlagMultiLine | strFlagNewlinesOK
|
|
if flags&strFlagBasic == strFlagBasic {
|
|
flags |= strFlagLineConcatOK
|
|
}
|
|
t.strFlags = flags
|
|
in.Byte.MoveCursor(b)
|
|
subState = 3
|
|
continue
|
|
}
|
|
if subState == 3 {
|
|
// We're in a multi-line string. From the TOML spec:
|
|
// A newline immediately following the opening delimiter will be trimmed.
|
|
// All other whitespace and newline characters remain intact.
|
|
if b == '\n' {
|
|
in.Byte.MoveCursor(b)
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
if b == '\r' {
|
|
in.Byte.MoveCursor(b)
|
|
subState = 4
|
|
continue
|
|
}
|
|
// Not a newline, so this byte is part of the content.
|
|
i--
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
if subState == 4 {
|
|
// We've seen a \r, so here we should see a \n for a newline
|
|
// after a multi-line opener.
|
|
if b == '\n' {
|
|
in.Byte.MoveCursor(b)
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
// Lonely \r found. Pass it to the content handler.
|
|
i -= 2
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
|
|
// Parse string contents.
|
|
case strStateContent:
|
|
switch {
|
|
case b == '\r' && flags&strFlagNewlinesOK == strFlagNewlinesOK:
|
|
state = strStateCRLF
|
|
continue
|
|
case b == '\n' && flags&strFlagNewlinesOK == strFlagNewlinesOK:
|
|
out.AddByte(b)
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
case b == '\t' && flags&strFlagTabsOK == strFlagTabsOK:
|
|
out.AddByte(b)
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
// Control characters must be escaped.
|
|
return false
|
|
case b == '\\':
|
|
in.Byte.MoveCursor(b)
|
|
// Handle escape codes, when they are allowed.
|
|
if flags&strFlagEscapesOK == strFlagEscapesOK {
|
|
state = strStateEscape
|
|
continue
|
|
}
|
|
// Otherwise, add the backslash as plain output.
|
|
out.AddByte(b)
|
|
continue
|
|
case b == delim:
|
|
// Single-line string.
|
|
if flags&strFlagMultiLine == 0 {
|
|
in.Byte.MoveCursor(b)
|
|
return true
|
|
}
|
|
// Multi-line string
|
|
in.Byte.MoveCursor(b)
|
|
state = strStateMultiLineEnd
|
|
subState = 0
|
|
continue
|
|
}
|
|
|
|
// At this point, we must have a UTF8 character on the input.
|
|
// Here we check what length the character must have in bytes.
|
|
// Then the rest of the work is offloaded to the strUTF8 state.
|
|
switch b >> 4 {
|
|
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
out.AddByte(b)
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
utf8ReqLen = 2
|
|
utf8Rune = rune((b & lowest5bits) << 6)
|
|
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
utf8ReqLen = 3
|
|
utf8Rune = rune((b & lowest4bits) << 6)
|
|
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
utf8ReqLen = 4
|
|
utf8Rune = rune((b & lowest3bits) << 6)
|
|
default: // Invalid UTF8 rune
|
|
return false
|
|
}
|
|
utf8Bytes[0] = b
|
|
utf8Len = 1
|
|
state = strStateUTF8
|
|
|
|
// Parse followup bytes of a UTF8 byte sequence.
|
|
case strStateUTF8:
|
|
// The input byte must be a continuation byte (10xxxxxx)
|
|
if b>>6 != 2 {
|
|
// Invalid UTF8 rune
|
|
return false
|
|
}
|
|
utf8Bytes[utf8Len] = b
|
|
utf8Len++
|
|
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
if utf8Len == utf8ReqLen {
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
// Invalid unicode character
|
|
return false
|
|
}
|
|
bytes := utf8Bytes[:utf8Len]
|
|
out.AddBytes(bytes...)
|
|
in.Byte.MoveCursorMulti(bytes...)
|
|
state = strStateContent
|
|
}
|
|
|
|
// Parse the \n in a \r\n sequence.
|
|
case strStateCRLF:
|
|
// \r\n is normalized to just \n here (as allowed by the TOML spec).
|
|
if b == '\n' {
|
|
in.Byte.MoveCursorMulti('\r', b)
|
|
out.AddByte('\n')
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
// Lonely \r, should have been escaped.
|
|
return false
|
|
|
|
// Parse escape byte sequences.
|
|
// For convenience, some popular characters have a compact escape sequence.
|
|
//
|
|
// \b - backspace (U+0008)
|
|
// \t - tab (U+0009)
|
|
// \n - LF (U+000A)
|
|
// \f - form feed (U+000C)
|
|
// \r - carriage return (U+000D)
|
|
// \" - quote (U+0022)
|
|
// \\ - backslash (U+005C)
|
|
// \uXXXX - unicode (U+XXXX)
|
|
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
|
case strStateEscape:
|
|
// Handle short control character escape sequence (\t, \a, etc).
|
|
if escaped, ok := getEscapedChar(b); ok {
|
|
out.AddByte(escaped)
|
|
in.Byte.MoveCursor(b)
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
switch b {
|
|
case ' ', '\t', '\r', '\n':
|
|
// Handle line concatenation escape sequence.
|
|
if flags&strFlagLineConcatOK == 0 {
|
|
// Invalid escape.
|
|
return false
|
|
}
|
|
// Point the parser at an appropriate subState of
|
|
// the strEscapeConcat state.
|
|
switch b {
|
|
case ' ', '\t':
|
|
subState = 0
|
|
case '\r':
|
|
subState = 1
|
|
case '\n':
|
|
subState = 2
|
|
}
|
|
in.Byte.MoveCursor(b)
|
|
state = strStateEscapeConcat
|
|
continue
|
|
case 'u', 'U':
|
|
// Handle unicode escape sequence (\uXXXX, \UXXXXXXXX).
|
|
in.Byte.MoveCursor(b)
|
|
unicodeReqLen = 4
|
|
if b == 'u' {
|
|
unicodeReqLen = 4
|
|
} else {
|
|
unicodeReqLen = 8
|
|
}
|
|
unicodeLen = 0
|
|
utf8Rune = 0
|
|
state = strStateEscapeUnicode
|
|
default:
|
|
// Invalid escape sequence used.
|
|
return false
|
|
}
|
|
|
|
// For writing long strings without introducing extraneous whitespace, use a
|
|
// "line ending backslash". When the last non-whitespace character on a line is
|
|
// a \, it will be trimmed along with all whitespace (including newlines) up to
|
|
// the next non-whitespace character or closing delimiter.
|
|
case strStateEscapeConcat:
|
|
// Skip over whitespace until the end of the line is found.
|
|
if subState == 0 {
|
|
switch b {
|
|
case ' ', '\t':
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
case '\r':
|
|
in.Byte.MoveCursor(b)
|
|
subState = 1
|
|
continue
|
|
case '\n':
|
|
in.Byte.MoveCursor(b)
|
|
subState = 2
|
|
continue
|
|
default:
|
|
// Invalid escape sequence used. Expected whitespace or newline.
|
|
return false
|
|
}
|
|
}
|
|
// We've seen a \r at the same line as the escape char,
|
|
// skip over the following \n.
|
|
if subState == 1 {
|
|
if b == '\n' {
|
|
in.Byte.MoveCursor(b)
|
|
subState = 2
|
|
continue
|
|
}
|
|
// Invalid escape sequence used. Expected newline.
|
|
return false
|
|
}
|
|
// We've seen a \n at the same line as the escape char,
|
|
// skip over all whitespace and newlines from here on.
|
|
if subState == 2 {
|
|
if b == ' ' || b == '\t' || b == '\n' {
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
}
|
|
if b == '\r' {
|
|
in.Byte.MoveCursor(b)
|
|
subState = 3
|
|
continue
|
|
}
|
|
}
|
|
// We've seen a \r, skip over the following \n.
|
|
if subState == 3 {
|
|
if b == '\n' {
|
|
in.Byte.MoveCursor(b)
|
|
subState = 2
|
|
continue
|
|
}
|
|
}
|
|
// End of concat escape. Let the strContent state reprocess the byte.
|
|
i--
|
|
state = strStateContent
|
|
continue
|
|
|
|
// Parse unicode escape sequence (\uXXXX, \UXXXXXXXX).
|
|
case strStateEscapeUnicode:
|
|
value, ok := getHexValueForChar(b)
|
|
if !ok {
|
|
// Invalid unicode escape sequence used.
|
|
return false
|
|
}
|
|
utf8Rune = utf8Rune<<4 + rune(value)
|
|
unicodeHex[unicodeLen] = b
|
|
unicodeLen++
|
|
if unicodeLen == unicodeReqLen {
|
|
if !utf8.ValidRune(utf8Rune) {
|
|
// Invalid unicode escape
|
|
return false
|
|
}
|
|
in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
|
|
w := utf8.EncodeRune(utf8Bytes, utf8Rune)
|
|
out.AddBytes(utf8Bytes[:w]...)
|
|
state = strStateContent
|
|
}
|
|
|
|
// Parse the end of the string.
|
|
// One delimiter has already been seen by the strContent state.
|
|
// Here we check if we have a full set of 3 delimiters to end
|
|
// the string.
|
|
case strStateMultiLineEnd: // TODO rename to strEndMultiLine
|
|
if subState == 0 {
|
|
// Second delimiter found.
|
|
if b == delim {
|
|
subState = 1
|
|
in.Byte.MoveCursor(b)
|
|
continue
|
|
}
|
|
// No delimiter found, so we're looking at a single
|
|
// delimiter within the multi-line body. Add the delimiter
|
|
// to the output and feed the current byte back to the
|
|
// strContent state.
|
|
out.AddByte(delim)
|
|
i--
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
if subState == 1 {
|
|
// Third delimiter found. This ends the string.
|
|
if b == delim {
|
|
in.Byte.MoveCursor(b)
|
|
return true
|
|
}
|
|
// No delimiter found, so we're looking at two delimiters
|
|
// within the multi-line body. Add the delimiters to the
|
|
// output and feed the current byte back to the strContent state.
|
|
out.AddBytes(delim, delim)
|
|
i--
|
|
state = strStateContent
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Specific handling of input for basic strings.
|
|
//
|
|
// • Any Unicode character may be used except those that must be escaped:
|
|
// quotation mark, backslash, and the control characters (U+0000 to
|
|
// U+001F, U+007F).
|
|
//
|
|
// • No additional \escape sequences are allowed. What the spec says about this:
|
|
// "All other escape sequences [..] are reserved and, if used, TOML should
|
|
// produce an error."
|
|
// func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
|
|
// if !p.Accept(basicStringHandler) {
|
|
// return "", false
|
|
// }
|
|
// return p.Result.String(), true
|
|
// }
|
|
|
|
// stringTokenizerState enumerates the main states of the state machine
// that is implemented by the string tokenizer.
type stringTokenizerState int

const (
	// strStateStart handles the opening delimiter(s) of a string.
	strStateStart stringTokenizerState = iota
	// strStateContent handles the body of a string.
	strStateContent
	// strStateEscape handles the byte that follows a backslash.
	strStateEscape
	// strStateEscapeUnicode handles the hex digits of \uXXXX and \UXXXXXXXX.
	strStateEscapeUnicode
	// strStateEscapeConcat handles whitespace after a line ending backslash.
	strStateEscapeConcat
	// strStateCRLF handles the byte that follows a \r in the string body.
	strStateCRLF
	// strStateUTF8 handles the followup bytes of a multi-byte UTF8 sequence.
	strStateUTF8
	// strStateMultiLineEnd handles possible closing delimiters of a
	// multi-line string.
	strStateMultiLineEnd
)
|
|
|
|
// Bit masks for selecting the payload bits from the bytes of a UTF8 sequence.
const (
	lowest6bits = 1<<6 - 1 // 0x3F, 0011 1111
	lowest5bits = 1<<5 - 1 // 0x1F, 0001 1111
	lowest4bits = 1<<4 - 1 // 0x0F, 0000 1111
	lowest3bits = 1<<3 - 1 // 0x07, 0000 0111
)
|
|
|
|
// getHexValueForChar converts a single hexadecimal digit character to its
// numeric value (0-15). Both lowercase (a-f) and uppercase (A-F) digits are
// accepted. For any other byte, the returned boolean is false and the
// returned value is 0.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	// Hexadecimal digits stop at 'f'. Accepting the full alphabet would
	// let a sequence like \uzzzz pass as a valid unicode escape.
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}
|
|
|
|
// getEscapedChar resolves the character that follows a backslash in a
// compact escape sequence (\b, \t, \n, \f, \r, \" and \\) to the byte that
// the escape sequence stands for. The returned boolean is false (and the
// returned byte 0) when the character does not form a compact escape.
func getEscapedChar(b byte) (byte, bool) {
	var resolved byte
	switch b {
	case 'b':
		resolved = '\b'
	case 't':
		resolved = '\t'
	case 'n':
		resolved = '\n'
	case 'f':
		resolved = '\f'
	case 'r':
		resolved = '\r'
	case '"', '\\':
		// These two characters stand for themselves.
		resolved = b
	default:
		return 0, false
	}
	return resolved, true
}
|
|
|
|
// Specific handling of input for multi-line basic strings.
|
|
//
|
|
// • Multi-line basic strings are surrounded by three quotation marks on
|
|
// each side and allow newlines.
|
|
//
|
|
// • A newline immediately following the opening delimiter will be trimmed.
|
|
// All other whitespace and newline characters remain intact.
|
|
//
|
|
// • TOML parsers should feel free to normalize newline to whatever makes
|
|
// sense for their platform.
|
|
//
|
|
// • All of the escape sequences that are valid for basic strings are also valid
|
|
// for multi-line basic strings.
|
|
//
|
|
// • Any Unicode character may be used except those that must be escaped:
|
|
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
|
|
// marks need not be escaped unless their presence would create a premature
|
|
// closing delimiter.
|
|
//
|
|
// • For writing long strings without introducing extraneous whitespace, use a
|
|
// "line ending backslash". When the last non-whitespace character on a line is
|
|
// a \, it will be trimmed along with all whitespace (including newlines) up to
|
|
// the next non-whitespace character or closing delimiter.
|
|
// func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
|
|
// if !p.Accept(multiLineBasicStringHandler) {
|
|
// return "", false
|
|
// }
|
|
// return p.Result.String(), true
|
|
// }
|
|
|
|
// func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
|
|
// var state stringTokenizerState
|
|
// in := tokenAPI.Input
|
|
// out := tokenAPI.Output
|
|
|
|
// unicodeReqLen := 0
|
|
// unicodeLen := 0
|
|
// unicodeHex := make([]byte, 8)
|
|
|
|
// utf8ReqLen := 0
|
|
// utf8Len := 0
|
|
// utf8Rune := rune(0)
|
|
// utf8Bytes := make([]byte, 4)
|
|
|
|
// crlf := false
|
|
|
|
// for {
|
|
// bs, _ := in.Byte.PeekBuffered(0)
|
|
// bslen := len(bs)
|
|
// if bslen == 0 {
|
|
// return false
|
|
// }
|
|
// for i := 0; i < bslen; i++ {
|
|
// b := bs[i]
|
|
// switch state {
|
|
// case strStart, strStart2, strStart3:
|
|
// if b != '"' {
|
|
// // No triple opening quotes found.
|
|
// return false
|
|
// }
|
|
// in.Byte.MoveCursor(b)
|
|
// switch state {
|
|
// case strStart:
|
|
// state = strStart2
|
|
// case strStart2:
|
|
// state = strStart3
|
|
// case strStart3:
|
|
// state = strStart4
|
|
// }
|
|
// case strStart4:
|
|
// if !crlf && b == '\r' {
|
|
// crlf = true
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// }
|
|
// if b == '\n' {
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strContent
|
|
// continue
|
|
// }
|
|
// if crlf {
|
|
// // Lonely \r without \n.
|
|
// return false
|
|
// }
|
|
// state = strContent
|
|
// fallthrough
|
|
// case strContent:
|
|
// switch {
|
|
// case b == '\r':
|
|
// state = strCRLF
|
|
// continue
|
|
// case b == '\n':
|
|
// out.AddByte(b)
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
// // Unescaped control character
|
|
// // TODO error reporting instead of full reject
|
|
// return false
|
|
// case b == '\\':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscape
|
|
// continue
|
|
// case b == '"':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEnd
|
|
// continue
|
|
// }
|
|
// switch b >> 4 {
|
|
// case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
// out.AddByte(b)
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
// utf8ReqLen = 2
|
|
// utf8Rune = rune((b & lowest5bits) << 6)
|
|
// case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
// utf8ReqLen = 3
|
|
// utf8Rune = rune((b & lowest4bits) << 6)
|
|
// case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
// utf8ReqLen = 4
|
|
// utf8Rune = rune((b & lowest3bits) << 6)
|
|
// default: // Invalid UTF8 rune
|
|
// return false
|
|
// }
|
|
// utf8Bytes[0] = b
|
|
// utf8Len = 1
|
|
// state = strUTF8
|
|
// case strUTF8:
|
|
// // This should be a continuation byte (10xxxxxx)
|
|
// if b>>6 != 2 {
|
|
// // Invalid UTF8 rune
|
|
// return false
|
|
// }
|
|
// utf8Bytes[utf8Len] = b
|
|
// utf8Len++
|
|
// utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
// if utf8Len == utf8ReqLen {
|
|
// if !utf8.ValidRune(utf8Rune) {
|
|
// // Invalid unicode character
|
|
// return false
|
|
// }
|
|
// bytes := utf8Bytes[:utf8Len]
|
|
// out.AddBytes(bytes...)
|
|
// in.Byte.MoveCursorMulti(bytes...)
|
|
// state = strContent
|
|
// }
|
|
// case strCRLF:
|
|
// if b == '\n' {
|
|
// in.Byte.MoveCursorMulti('\r', b)
|
|
// out.AddByte('\n')
|
|
// state = strContent
|
|
// continue
|
|
// }
|
|
// // Lonely \r, should have been escaped.
|
|
// return false
|
|
// case strEscape:
|
|
// state = strContent
|
|
// if escaped, ok := getEscapedChar(b); ok {
|
|
// out.AddByte(escaped)
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// }
|
|
// switch b {
|
|
// case ' ', '\t':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcat
|
|
// continue
|
|
// case '\r':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatCRLF
|
|
// continue
|
|
// case '\n':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatWs2
|
|
// continue
|
|
// case 'u', 'U':
|
|
// in.Byte.MoveCursor(b)
|
|
// unicodeReqLen = 4
|
|
// if b == 'u' {
|
|
// unicodeReqLen = 4
|
|
// } else {
|
|
// unicodeReqLen = 8
|
|
// }
|
|
// unicodeLen = 0
|
|
// utf8Rune = 0
|
|
// state = strEscapeUnicode
|
|
// default:
|
|
// // Invalid escape sequence used.
|
|
// return false
|
|
// }
|
|
// case strEscapeConcat:
|
|
// switch b {
|
|
// case ' ', '\t':
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case '\r':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatCRLF
|
|
// continue
|
|
// case '\n':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatWs2
|
|
// continue
|
|
// default:
|
|
// // Invalid line concatenation
|
|
// return false
|
|
// }
|
|
// case strEscapeConcatCRLF:
|
|
// switch b {
|
|
// case '\n':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatWs2
|
|
// continue
|
|
// default:
|
|
// // Invalid line concatenation
|
|
// return false
|
|
// }
|
|
// case strEscapeConcatWs2:
|
|
// switch b {
|
|
// case ' ', '\t':
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case '\r':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatCRLF
|
|
// continue
|
|
// case '\n':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEscapeConcatWs2
|
|
// continue
|
|
// default:
|
|
// i--
|
|
// state = strContent
|
|
// continue
|
|
// }
|
|
// case strEscapeUnicode:
|
|
// value, ok := getHexValueForChar(b)
|
|
// if !ok {
|
|
// // Invalid unicode escape sequence used.
|
|
// return false
|
|
// }
|
|
// utf8Rune = utf8Rune<<4 + rune(value)
|
|
// unicodeHex[unicodeLen] = b
|
|
// unicodeLen++
|
|
// if unicodeLen == unicodeReqLen {
|
|
// if !utf8.ValidRune(utf8Rune) {
|
|
// // Invalid unicode escape
|
|
// return false
|
|
// }
|
|
// in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
|
|
// w := utf8.EncodeRune(utf8Bytes, utf8Rune)
|
|
// out.AddBytes(utf8Bytes[:w]...)
|
|
// state = strContent
|
|
// }
|
|
// case strEnd:
|
|
// if b == '"' {
|
|
// state = strEnd3
|
|
// in.Byte.MoveCursor(b)
|
|
// } else {
|
|
// state = strContent
|
|
// out.AddByte('"')
|
|
// i--
|
|
// }
|
|
// case strEnd3:
|
|
// if b == '"' {
|
|
// in.Byte.MoveCursor(b)
|
|
// return true
|
|
// }
|
|
// state = strContent
|
|
// out.AddBytes('"', '"')
|
|
// i--
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
|
|
// Specific handling of input for multi-line literal strings.
|
|
//
|
|
// • Multi-line literal strings are surrounded by three single quotes on
|
|
// each side and allow newlines.
|
|
//
|
|
// • A newline immediately following the opening delimiter will be trimmed.
|
|
//
|
|
// • All other content between the delimiters is interpreted as-is without modification.
|
|
//
|
|
// • TOML parsers should feel free to normalize newline to whatever makes
|
|
// sense for their platform.
|
|
//
|
|
// • Control characters other than tab and newline are not permitted in a multi-line literal string.
|
|
// func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
|
|
// if !p.Accept(multiLineLiteralStringHandler) {
|
|
// return "", false
|
|
// }
|
|
// return p.Result.String(), true
|
|
// }
|
|
|
|
// func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
|
|
// var state stringTokenizerState
|
|
// in := tokenAPI.Input
|
|
// out := tokenAPI.Output
|
|
|
|
// utf8ReqLen := 0
|
|
// utf8Len := 0
|
|
// utf8Rune := rune(0)
|
|
// utf8Bytes := make([]byte, 4)
|
|
|
|
// crlf := false
|
|
|
|
// for {
|
|
// bs, _ := in.Byte.PeekBuffered(0)
|
|
// bslen := len(bs)
|
|
// if bslen == 0 {
|
|
// return false
|
|
// }
|
|
// for i := 0; i < bslen; i++ {
|
|
// b := bs[i]
|
|
// switch state {
|
|
// case strStart, strStart2, strStart3:
|
|
// if b != '\'' {
|
|
// // No triple opening quotes found.
|
|
// return false
|
|
// }
|
|
// in.Byte.MoveCursor(b)
|
|
// switch state {
|
|
// case strStart:
|
|
// state = strStart2
|
|
// case strStart2:
|
|
// state = strStart3
|
|
// case strStart3:
|
|
// state = strStart4
|
|
// }
|
|
// case strStart4:
|
|
// if !crlf && b == '\r' {
|
|
// crlf = true
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// }
|
|
// if b == '\n' {
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strContent
|
|
// continue
|
|
// }
|
|
// if crlf {
|
|
// // Lonely \r without \n.
|
|
// return false
|
|
// }
|
|
// state = strContent
|
|
// fallthrough
|
|
// case strContent:
|
|
// switch {
|
|
// case b == '\r':
|
|
// state = strCRLF
|
|
// continue
|
|
// case b == '\n' || b == '\t':
|
|
// out.AddByte(b)
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
|
|
// // Unescaped control character
|
|
// // TODO error reporting instead of full reject
|
|
// return false
|
|
// case b == '\'':
|
|
// in.Byte.MoveCursor(b)
|
|
// state = strEnd
|
|
// continue
|
|
// }
|
|
// switch b >> 4 {
|
|
// case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
|
|
// out.AddByte(b)
|
|
// in.Byte.MoveCursor(b)
|
|
// continue
|
|
// case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
|
|
// utf8ReqLen = 2
|
|
// utf8Rune = rune((b & lowest5bits) << 6)
|
|
// case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
|
|
// utf8ReqLen = 3
|
|
// utf8Rune = rune((b & lowest4bits) << 6)
|
|
// case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
// utf8ReqLen = 4
|
|
// utf8Rune = rune((b & lowest3bits) << 6)
|
|
// default: // Invalid UTF8 rune
|
|
// return false
|
|
// }
|
|
// utf8Bytes[0] = b
|
|
// utf8Len = 1
|
|
// state = strUTF8
|
|
// case strUTF8:
|
|
// // This should be a continuation byte (10xxxxxx)
|
|
// if b>>6 != 2 {
|
|
// // Invalid UTF8 rune
|
|
// return false
|
|
// }
|
|
// utf8Bytes[utf8Len] = b
|
|
// utf8Len++
|
|
// utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
|
|
// if utf8Len == utf8ReqLen {
|
|
// if !utf8.ValidRune(utf8Rune) {
|
|
// // Invalid unicode character
|
|
// return false
|
|
// }
|
|
// bytes := utf8Bytes[:utf8Len]
|
|
// out.AddBytes(bytes...)
|
|
// in.Byte.MoveCursorMulti(bytes...)
|
|
// state = strContent
|
|
// }
|
|
// case strCRLF:
|
|
// if b == '\n' {
|
|
// in.Byte.MoveCursorMulti('\r', b)
|
|
// out.AddByte('\n')
|
|
// state = strContent
|
|
// continue
|
|
// }
|
|
// // Lonely \r, should have been escaped.
|
|
// return false
|
|
// case strEnd:
|
|
// if b == '\'' {
|
|
// state = strEnd3
|
|
// in.Byte.MoveCursor(b)
|
|
// } else {
|
|
// state = strContent
|
|
// out.AddByte('\'')
|
|
// i--
|
|
// }
|
|
// case strEnd3:
|
|
// if b == '\'' {
|
|
// in.Byte.MoveCursor(b)
|
|
// return true
|
|
// }
|
|
// state = strContent
|
|
// out.AddBytes('\'', '\'')
|
|
// i--
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|