go-toml/parse/value_string.go

819 lines
20 KiB
Go

package parse
import (
"unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/parse"
"git.makaay.nl/mauricem/go-parsekit/tokenize"
"git.makaay.nl/mauricem/go-toml/ast"
)
// Tokenizer handlers for the delimiters and escape sequences of the four TOML
// string flavours. The a, c and m values and the newline / whitespace handlers
// are declared elsewhere in this package (presumably the parsekit atom,
// combinator and modifier shortcuts - confirm against their declaration).
var (
// Multi-line basic strings are surrounded by three quotation marks on each
// side and allow newlines. A newline directly after the opening delimiter is
// made part of the opening handler.
multiLineBasicStringDelimiter = a.Str(`"""`)
openingMultiLineBasicString = multiLineBasicStringDelimiter.Then(newline.Optional())
closingMultiLineBasicString = m.Drop(multiLineBasicStringDelimiter)
// Multi-line literal strings are surrounded by three single quotes on each
// side and allow newlines. A newline directly after the opening delimiter is
// made part of the opening handler.
multiLineLiteralStringDelimiter = a.Str(`'''`)
openingMultiLineLiteralString = multiLineLiteralStringDelimiter.Then(newline.Optional())
closingMultiLineLiteralString = m.Drop(multiLineLiteralStringDelimiter)
// Opening and closing character for basic strings.
basicStringDelimiter = a.DoubleQuote
// Opening and closing character for literal strings.
literalStringDelimiter = a.SingleQuote
// For convenience, some popular characters have a compact escape sequence.
//
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - LF (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
// \uXXXX - unicode (U+XXXX)
// \UXXXXXXXX - unicode (U+XXXXXXXX)
//
// validEscapeChar and shortEscape cover the single-character escapes; the
// two unicode forms are matched by shortUTF8Escape (4 hex digits) and
// longUTF8Escape (8 hex digits). validEscape accepts any of the three.
validEscapeChar = a.Char('b', 't', 'n', 'f', 'r', '"', '\\')
shortEscape = c.Seq(a.Backslash, validEscapeChar)
shortUTF8Escape = c.Seq(a.Backslash, a.Char('u'), a.HexDigit.Times(4))
longUTF8Escape = c.Seq(a.Backslash, a.Char('U'), a.HexDigit.Times(8))
validEscape = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)
// For writing long strings without introducing extraneous whitespace, use a
// "line ending backslash". When the last non-whitespace character on a line is
// a \, it will be trimmed along with all whitespace (including newlines) up to
// the next non-whitespace character or closing delimiter.
lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
)
// parseString parses a TOML string value.
//
// There are four ways to express strings: basic, multi-line basic, literal
// and multi-line literal. All strings must contain only valid UTF-8
// characters.
//
// The multi-line variants are probed before their single-line counterparts,
// because their delimiters start with the single-line delimiter. When none
// of the four opening delimiters is found, an "expected" error is registered
// on p.
//
// On success, a string-typed AST value and true are returned. On failure,
// the result is (nil, false).
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
	var (
		str    string
		parsed bool
	)

	if p.Peek(openingMultiLineBasicString) {
		str, parsed = t.parseMultiLineBasicString(p)
	} else if p.Peek(basicStringDelimiter) {
		str, parsed = t.parseBasicString("string value", p)
	} else if p.Peek(openingMultiLineLiteralString) {
		str, parsed = t.parseMultiLineLiteralString(p)
	} else if p.Peek(literalStringDelimiter) {
		str, parsed = t.parseLiteralString("string value", p)
	} else {
		p.Expected("a string value")
	}

	if !parsed {
		return nil, false
	}
	return ast.NewValue(ast.TypeString, str), true
}
// parseBasicString reads a basic string from the input, using
// basicStringHandler, and returns its unescaped contents.
//
// • Basic strings are surrounded by quotation marks.
//
// • Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to
// U+001F, U+007F).
//
// • No additional \escape sequences are allowed. What the spec says about
// this: "All other escape sequences [..] are reserved and, if used, TOML
// should produce an error."
//
// The name argument is not used by the current implementation. The boolean
// result tells whether or not a basic string was accepted; when it is false,
// the returned string is empty.
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
	if p.Accept(basicStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}
// stringTokenizerState identifies the state of the hand-written, byte-wise
// state machines that are used by the string handlers in this file.
type stringTokenizerState int
const (
// strStart expects the (first) opening quote. It is the zero value and
// therefore the state in which every handler starts.
strStart stringTokenizerState = iota
// strStart2 expects the second opening quote of a multi-line string.
strStart2
// strStart3 expects the third opening quote of a multi-line string.
strStart3
// strStart4 follows the opening delimiter of a multi-line string. A single
// newline (LF or CRLF) at this position is skipped.
strStart4
// strChar handles regular string content.
strChar
// strEscape handles the byte directly following a backslash.
strEscape
// strEscapeUnicode collects the hex digits of a \uXXXX or \UXXXXXXXX escape.
strEscapeUnicode
// strEscapeConcatWs1 follows a backslash plus whitespace. Only more
// whitespace or a line ending is accepted here.
strEscapeConcatWs1
// strEscapeConcatCRLF follows a '\r' in a line ending backslash sequence.
// A '\n' must follow.
strEscapeConcatCRLF
// strEscapeConcatWs2 follows the line ending of a line ending backslash.
// Whitespace and newlines are skipped until other input is found.
strEscapeConcatWs2
// strCRLF follows a '\r' in the content of a multi-line string.
// A '\n' must follow.
strCRLF
// strUTF8 collects the continuation bytes of a multi-byte UTF-8 sequence.
strUTF8
// strEnd2 follows a single quote character in a multi-line string, which
// might be the start of the closing delimiter.
strEnd2
// strEnd3 follows two successive quote characters in a multi-line string.
strEnd3
)
// Bit masks that select the N least-significant bits of a byte.
const (
lowest6bits = 0x3F // 0011 1111
lowest5bits = 0x1F // 0001 1111
lowest4bits = 0x0F // 0000 1111
lowest3bits = 0x07 // 0000 0111
)
// basicStringHandler is a tokenize handler for single-line basic strings
// ("..."). It is implemented as a byte-wise state machine.
//
// The delimiting quotes are consumed, but not written to the output. The
// output receives the string contents, with escape sequences replaced by the
// characters that they represent. The handler returns true when the closing
// quote has been read. It returns false when:
//
// • the input does not start with a double quote;
//
// • an unescaped control character (U+0000 to U+001F, U+007F) is found;
//
// • an unsupported or malformed escape sequence is found;
//
// • the input contains bytes that are not well-formed UTF-8;
//
// • no more input is available before the closing quote is found.
//
// Multi-byte UTF-8 sequences are validated using utf8.Valid, which rejects
// overlong encodings, surrogate halves and values above U+10FFFF.
//
// NOTE(review): a multi-byte character or unicode escape is only consumed
// (cursor moved) once it is complete. This assumes that the full sequence is
// available within a single PeekBuffered result - confirm how PeekBuffered
// behaves when a sequence straddles a buffer boundary.
func basicStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for \uXXXX and \UXXXXXXXX escape sequences.
	unicodeReqLen := 0
	unicodeLen := 0
	unicodeRune := rune(0)
	unicodeHex := make([]byte, 8)

	// Bookkeeping for multi-byte UTF-8 sequences. The buffer is also used as
	// scratch space for encoding unicode escapes.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Bytes := make([]byte, 4)

	for {
		// The error is not inspected here: running out of input, for whatever
		// reason, shows up as an empty buffer and ends the scan.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart:
				if b != '"' {
					// No opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				state = strChar
			case strChar:
				switch {
				case b <= 0x1F || b == 0x7F:
					// Control characters as defined by the TOML specification.
					// These must always be escaped.
					// TODO error reporting instead of full reject
					return false
				case b == '\\':
					in.Byte.MoveCursor(b)
					state = strEscape
					continue
				case b == '"':
					in.Byte.MoveCursor(b)
					return true
				}
				// The high nibble of the byte tells how many bytes the
				// UTF-8 sequence takes.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					// Lead bytes 0xF8 and up end up here as well. Those are
					// rejected by the validation of the complete sequence.
					utf8ReqLen = 4
				default: // Unexpected continuation byte (10xxxxxx)
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				if utf8Len == utf8ReqLen {
					seq := utf8Bytes[:utf8Len]
					if !utf8.Valid(seq) {
						// Overlong encoding, surrogate half or a value that
						// is out of the Unicode range.
						return false
					}
					out.AddBytes(seq...)
					in.Byte.MoveCursorMulti(seq...)
					state = strChar
				}
			case strEscape:
				state = strChar
				if escaped, ok := getEscapedChar(b); ok {
					out.AddByte(escaped)
					in.Byte.MoveCursor(b)
					continue
				}
				switch b {
				case 'u':
					unicodeReqLen = 4
				case 'U':
					unicodeReqLen = 8
				default:
					// Invalid escape sequence used.
					return false
				}
				in.Byte.MoveCursor(b)
				unicodeLen = 0
				unicodeRune = 0
				state = strEscapeUnicode
			case strEscapeUnicode:
				value, ok := getHexValueForChar(b)
				if !ok {
					// Invalid unicode escape sequence used.
					return false
				}
				unicodeRune = unicodeRune<<4 + rune(value)
				unicodeHex[unicodeLen] = b
				unicodeLen++
				if unicodeLen == unicodeReqLen {
					if !utf8.ValidRune(unicodeRune) {
						// Invalid unicode escape
						return false
					}
					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
					w := utf8.EncodeRune(utf8Bytes, unicodeRune)
					out.AddBytes(utf8Bytes[:w]...)
					state = strChar
				}
			}
		}
	}
}
// getHexValueForChar returns the numeric value (0 to 15) of the hexadecimal
// digit b. Both lower case and upper case digits are accepted. The boolean
// result is false when b is not a hexadecimal digit, in which case the
// returned value is 0.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}
// getEscapedChar translates the character b, as found directly after a
// backslash, into the byte that the compact escape sequence stands for
// (e.g. 'n' becomes a line feed). The boolean result is false when b is not
// one of the compact escape characters, in which case the returned byte is 0.
func getEscapedChar(b byte) (byte, bool) {
	var unescaped byte
	switch b {
	case '"', '\\':
		// Quote and backslash represent themselves.
		unescaped = b
	case 'b':
		unescaped = '\b'
	case 't':
		unescaped = '\t'
	case 'n':
		unescaped = '\n'
	case 'f':
		unescaped = '\f'
	case 'r':
		unescaped = '\r'
	default:
		return 0, false
	}
	return unescaped, true
}
// parseLiteralString reads a literal string from the input, using
// literalStringHandler, and returns its contents.
//
// • Literal strings are surrounded by single quotes.
//
// • Like basic strings, they must appear on a single line.
//
// • Control characters other than tab are not permitted in a literal string.
//
// The name argument is not used by the current implementation. The boolean
// result tells whether or not a literal string was accepted; when it is
// false, the returned string is empty.
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
	if p.Accept(literalStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}
// literalStringHandler is a tokenize handler for single-line literal strings
// ('...'). It is implemented as a byte-wise state machine.
//
// The delimiting quotes are consumed, but not written to the output. The
// output receives the string contents as-is; literal strings have no escape
// sequences. The handler returns true when the closing quote has been read.
// It returns false when:
//
// • the input does not start with a single quote;
//
// • a control character other than tab is found;
//
// • the input contains bytes that are not well-formed UTF-8;
//
// • no more input is available before the closing quote is found.
//
// Multi-byte UTF-8 sequences are validated using utf8.Valid, which rejects
// overlong encodings, surrogate halves and values above U+10FFFF.
//
// NOTE(review): a multi-byte character is only consumed (cursor moved) once
// it is complete. This assumes that the full sequence is available within a
// single PeekBuffered result - confirm how PeekBuffered behaves when a
// sequence straddles a buffer boundary.
func literalStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for multi-byte UTF-8 sequences.
	utf8ReqLen := 0
	utf8Len := 0
	var utf8Bytes [4]byte

	for {
		// The error is not inspected here: running out of input, for whatever
		// reason, shows up as an empty buffer and ends the scan.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			// Unexpected end of file.
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart:
				if b != '\'' {
					// No opening quote found.
					return false
				}
				in.Byte.MoveCursor(b)
				state = strChar
			case strChar:
				switch {
				case (b <= 0x1F && b != '\t') || b == 0x7F:
					// Control character other than tab.
					return false
				case b == '\'':
					in.Byte.MoveCursor(b)
					return true
				}
				// The high nibble of the byte tells how many bytes the
				// UTF-8 sequence takes.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					// Lead bytes 0xF8 and up end up here as well. Those are
					// rejected by the validation of the complete sequence.
					utf8ReqLen = 4
				default: // Unexpected continuation byte (10xxxxxx)
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				if utf8Len == utf8ReqLen {
					seq := utf8Bytes[:utf8Len]
					if !utf8.Valid(seq) {
						// Overlong encoding, surrogate half or a value that
						// is out of the Unicode range.
						return false
					}
					out.AddBytes(seq...)
					in.Byte.MoveCursorMulti(seq...)
					state = strChar
				}
			}
		}
	}
}
// parseMultiLineBasicString reads a multi-line basic string from the input,
// using multiLineBasicStringHandler, and returns its unescaped contents.
//
// • Multi-line basic strings are surrounded by three quotation marks on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
// All other whitespace and newline characters remain intact.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform.
//
// • All of the escape sequences that are valid for basic strings are also
// valid for multi-line basic strings.
//
// • Any Unicode character may be used except those that must be escaped:
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
// marks need not be escaped unless their presence would create a premature
// closing delimiter.
//
// • For writing long strings without introducing extraneous whitespace, use
// a "line ending backslash". When the last non-whitespace character on a
// line is a \, it will be trimmed along with all whitespace (including
// newlines) up to the next non-whitespace character or closing delimiter.
//
// The boolean result tells whether or not a multi-line basic string was
// accepted; when it is false, the returned string is empty.
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
	if p.Accept(multiLineBasicStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}
// multiLineBasicStringHandler is a tokenize handler for multi-line basic
// strings ("""..."""). It is implemented as a byte-wise state machine.
//
// The delimiters are consumed, but not written to the output. The output
// receives the string contents, with the following processing applied:
//
// • a newline (LF or CRLF) directly after the opening delimiter is dropped;
//
// • CRLF line endings in the contents are written as a single LF;
//
// • escape sequences are replaced by the characters that they represent;
//
// • a line ending backslash is dropped, together with all whitespace and
// newlines that follow it;
//
// • one or two successive quotes that do not form a closing delimiter are
// written as regular contents.
//
// The handler returns true when the closing delimiter has been read. It
// returns false when the opening delimiter is missing, when a '\r' is not
// followed by '\n', when an unescaped control character other than a line
// ending is found, when an escape sequence or line ending backslash is
// malformed, when the input is not well-formed UTF-8 or when no more input
// is available before the closing delimiter is found.
//
// Multi-byte UTF-8 sequences are validated using utf8.Valid, which rejects
// overlong encodings, surrogate halves and values above U+10FFFF.
//
// NOTE(review): a multi-byte character, a unicode escape and the '\r' of a
// CRLF in the contents are only consumed (cursor moved) once the sequence is
// complete. This assumes that the full sequence is available within a single
// PeekBuffered result - confirm how PeekBuffered behaves when a sequence
// straddles a buffer boundary.
func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for \uXXXX and \UXXXXXXXX escape sequences.
	unicodeReqLen := 0
	unicodeLen := 0
	unicodeRune := rune(0)
	unicodeHex := make([]byte, 8)

	// Bookkeeping for multi-byte UTF-8 sequences. The buffer is also used as
	// scratch space for encoding unicode escapes.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Bytes := make([]byte, 4)

	// Tells whether a '\r' was seen directly after the opening delimiter.
	crlf := false

	for {
		// The error is not inspected here: running out of input, for whatever
		// reason, shows up as an empty buffer and ends the scan.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart, strStart2, strStart3:
				if b != '"' {
					// No triple opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				switch state {
				case strStart:
					state = strStart2
				case strStart2:
					state = strStart3
				case strStart3:
					state = strStart4
				}
			case strStart4:
				// A newline directly after the opening delimiter is trimmed.
				if !crlf && b == '\r' {
					crlf = true
					in.Byte.MoveCursor(b)
					continue
				}
				if b == '\n' {
					in.Byte.MoveCursor(b)
					state = strChar
					continue
				}
				if crlf {
					// Lonely \r without \n.
					return false
				}
				// No newline to trim: handle the byte as regular contents.
				state = strChar
				fallthrough
			case strChar:
				switch {
				case b == '\r':
					// The cursor is moved when the '\n' is found.
					state = strCRLF
					continue
				case b == '\n':
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case b <= 0x1F || b == 0x7F:
					// Unescaped control character
					// TODO error reporting instead of full reject
					return false
				case b == '\\':
					in.Byte.MoveCursor(b)
					state = strEscape
					continue
				case b == '"':
					in.Byte.MoveCursor(b)
					state = strEnd2
					continue
				}
				// The high nibble of the byte tells how many bytes the
				// UTF-8 sequence takes.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					// Lead bytes 0xF8 and up end up here as well. Those are
					// rejected by the validation of the complete sequence.
					utf8ReqLen = 4
				default: // Unexpected continuation byte (10xxxxxx)
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				if utf8Len == utf8ReqLen {
					seq := utf8Bytes[:utf8Len]
					if !utf8.Valid(seq) {
						// Overlong encoding, surrogate half or a value that
						// is out of the Unicode range.
						return false
					}
					out.AddBytes(seq...)
					in.Byte.MoveCursorMulti(seq...)
					state = strChar
				}
			case strCRLF:
				if b == '\n' {
					// CRLF is normalized to LF in the output.
					in.Byte.MoveCursorMulti('\r', b)
					out.AddByte('\n')
					state = strChar
					continue
				}
				// Lonely \r, should have been escaped.
				return false
			case strEscape:
				state = strChar
				if escaped, ok := getEscapedChar(b); ok {
					out.AddByte(escaped)
					in.Byte.MoveCursor(b)
					continue
				}
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs1
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				case 'u':
					unicodeReqLen = 4
				case 'U':
					unicodeReqLen = 8
				default:
					// Invalid escape sequence used.
					return false
				}
				in.Byte.MoveCursor(b)
				unicodeLen = 0
				unicodeRune = 0
				state = strEscapeUnicode
			case strEscapeConcatWs1:
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// Invalid line concatenation
					return false
				}
			case strEscapeConcatCRLF:
				switch b {
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// Invalid line concatenation
					return false
				}
			case strEscapeConcatWs2:
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// End of the trimmed whitespace. Handle this byte
					// once more, as regular contents.
					i--
					state = strChar
					continue
				}
			case strEscapeUnicode:
				value, ok := getHexValueForChar(b)
				if !ok {
					// Invalid unicode escape sequence used.
					return false
				}
				unicodeRune = unicodeRune<<4 + rune(value)
				unicodeHex[unicodeLen] = b
				unicodeLen++
				if unicodeLen == unicodeReqLen {
					if !utf8.ValidRune(unicodeRune) {
						// Invalid unicode escape
						return false
					}
					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
					w := utf8.EncodeRune(utf8Bytes, unicodeRune)
					out.AddBytes(utf8Bytes[:w]...)
					state = strChar
				}
			case strEnd2:
				if b == '"' {
					state = strEnd3
					in.Byte.MoveCursor(b)
				} else {
					// A single quote was regular contents after all. Emit it
					// and handle this byte once more.
					state = strChar
					out.AddByte('"')
					i--
				}
			case strEnd3:
				if b == '"' {
					in.Byte.MoveCursor(b)
					return true
				}
				// Two quotes were regular contents after all. Emit them and
				// handle this byte once more.
				state = strChar
				out.AddBytes('"', '"')
				i--
			}
		}
	}
}
// parseMultiLineLiteralString reads a multi-line literal string from the
// input, using multiLineLiteralStringHandler, and returns its contents.
//
// • Multi-line literal strings are surrounded by three single quotes on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
//
// • All other content between the delimiters is interpreted as-is without
// modification.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform.
//
// • Control characters other than tab and newline are not permitted in a
// multi-line literal string.
//
// The boolean result tells whether or not a multi-line literal string was
// accepted; when it is false, the returned string is empty.
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
	if p.Accept(multiLineLiteralStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}
// multiLineLiteralStringHandler is a tokenize handler for multi-line literal
// strings ('''...'''). It is implemented as a byte-wise state machine.
//
// The delimiters are consumed, but not written to the output. The output
// receives the string contents, with the following processing applied:
//
// • a newline (LF or CRLF) directly after the opening delimiter is dropped;
//
// • CRLF line endings in the contents are written as a single LF;
//
// • one or two successive single quotes that do not form a closing
// delimiter are written as regular contents.
//
// The handler returns true when the closing delimiter has been read. It
// returns false when the opening delimiter is missing, when a '\r' is not
// followed by '\n', when a control character other than tab or a line ending
// is found, when the input is not well-formed UTF-8 or when no more input is
// available before the closing delimiter is found.
//
// Multi-byte UTF-8 sequences are validated using utf8.Valid, which rejects
// overlong encodings, surrogate halves and values above U+10FFFF.
//
// NOTE(review): a multi-byte character and the '\r' of a CRLF in the
// contents are only consumed (cursor moved) once the sequence is complete.
// This assumes that the full sequence is available within a single
// PeekBuffered result - confirm how PeekBuffered behaves when a sequence
// straddles a buffer boundary.
func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for multi-byte UTF-8 sequences.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Bytes := make([]byte, 4)

	// Tells whether a '\r' was seen directly after the opening delimiter.
	crlf := false

	for {
		// The error is not inspected here: running out of input, for whatever
		// reason, shows up as an empty buffer and ends the scan.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart, strStart2, strStart3:
				if b != '\'' {
					// No triple opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				switch state {
				case strStart:
					state = strStart2
				case strStart2:
					state = strStart3
				case strStart3:
					state = strStart4
				}
			case strStart4:
				// A newline directly after the opening delimiter is trimmed.
				if !crlf && b == '\r' {
					crlf = true
					in.Byte.MoveCursor(b)
					continue
				}
				if b == '\n' {
					in.Byte.MoveCursor(b)
					state = strChar
					continue
				}
				if crlf {
					// Lonely \r without \n.
					return false
				}
				// No newline to trim: handle the byte as regular contents.
				state = strChar
				fallthrough
			case strChar:
				switch {
				case b == '\r':
					// The cursor is moved when the '\n' is found.
					state = strCRLF
					continue
				case b == '\n' || b == '\t':
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case b <= 0x1F || b == 0x7F:
					// Control character other than tab or newline.
					// TODO error reporting instead of full reject
					return false
				case b == '\'':
					in.Byte.MoveCursor(b)
					state = strEnd2
					continue
				}
				// The high nibble of the byte tells how many bytes the
				// UTF-8 sequence takes.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					// Lead bytes 0xF8 and up end up here as well. Those are
					// rejected by the validation of the complete sequence.
					utf8ReqLen = 4
				default: // Unexpected continuation byte (10xxxxxx)
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				if utf8Len == utf8ReqLen {
					seq := utf8Bytes[:utf8Len]
					if !utf8.Valid(seq) {
						// Overlong encoding, surrogate half or a value that
						// is out of the Unicode range.
						return false
					}
					out.AddBytes(seq...)
					in.Byte.MoveCursorMulti(seq...)
					state = strChar
				}
			case strCRLF:
				if b == '\n' {
					// CRLF is normalized to LF in the output.
					in.Byte.MoveCursorMulti('\r', b)
					out.AddByte('\n')
					state = strChar
					continue
				}
				// Lonely \r, should have been escaped.
				return false
			case strEnd2:
				if b == '\'' {
					state = strEnd3
					in.Byte.MoveCursor(b)
				} else {
					// A single quote was regular contents after all. Emit it
					// and handle this byte once more.
					state = strChar
					out.AddByte('\'')
					i--
				}
			case strEnd3:
				if b == '\'' {
					in.Byte.MoveCursor(b)
					return true
				}
				// Two quotes were regular contents after all. Emit them and
				// handle this byte once more.
				state = strChar
				out.AddBytes('\'', '\'')
				i--
			}
		}
	}
}