go-toml/parse/value_string.go

295 lines
9.5 KiB
Go

package parse
import (
"fmt"
"strconv"
"strings"
"unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/parse"
"git.makaay.nl/mauricem/go-toml/ast"
)
var (
	// Multi-line basic strings are surrounded by three quotation marks on each
	// side and allow newlines. The opening handler also consumes an optional
	// newline directly after the delimiter.
	multiLineBasicStringDelimiter = a.Str(`"""`)
	openingMultiLineBasicString   = multiLineBasicStringDelimiter.Then(newline.Optional())
	closingMultiLineBasicString   = m.Drop(multiLineBasicStringDelimiter)

	// Multi-line literal strings are surrounded by three single quotes on each
	// side and allow newlines. The opening handler also consumes an optional
	// newline directly after the delimiter.
	multiLineLiteralStringDelimiter = a.Str(`'''`)
	openingMultiLineLiteralString   = multiLineLiteralStringDelimiter.Then(newline.Optional())
	closingMultiLineLiteralString   = m.Drop(multiLineLiteralStringDelimiter)

	// Opening and closing character for basic strings.
	basicStringDelimiter = m.Drop(a.DoubleQuote)

	// Opening and closing character for literal strings.
	literalStringDelimiter = m.Drop(a.SingleQuote)

	// Control characters as defined by TOML (U+0000 to U+001F, U+007F).
	// A byte is unsigned, so only the upper bound of the first range has to
	// be checked.
	isControlCharacter = func(b byte) bool { return b <= 0x1F || b == 0x7F }
	controlCharacter   = a.ByteByCallback(isControlCharacter)

	// For convenience, some popular characters have a compact escape sequence.
	//
	// \b         - backspace       (U+0008)
	// \t         - tab             (U+0009)
	// \n         - LF              (U+000A)
	// \f         - form feed       (U+000C)
	// \r         - carriage return (U+000D)
	// \"         - quote           (U+0022)
	// \\         - backslash       (U+005C)
	// \uXXXX     - unicode         (U+XXXX)
	// \UXXXXXXXX - unicode         (U+XXXXXXXX)
	validEscapeChar = a.Bytes('b', 't', 'n', 'f', 'r', '"', '\\')
	shortEscape     = c.Seq(a.Backslash, validEscapeChar)
	shortUTF8Escape = c.Seq(a.Backslash, a.Byte('u'), a.HexDigit.Times(4))
	longUTF8Escape  = c.Seq(a.Backslash, a.Byte('U'), a.HexDigit.Times(8))
	validEscape     = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)

	// For writing long strings without introducing extraneous whitespace, use a
	// "line ending backslash". When the last non-whitespace character on a line is
	// a \, it will be trimmed along with all whitespace (including newlines) up to
	// the next non-whitespace character or closing delimiter.
	lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
)
// parseString parses a TOML string value from the input.
//
// There are four ways to express strings: basic, multi-line basic, literal and
// multi-line literal. All strings must contain only valid UTF-8 characters.
//
// The multi-line openers are checked before their single-character
// counterparts, because a three-quote delimiter also starts with a single
// quote character of the same kind.
//
// On success, a string-typed ast.Value and true are returned. When the input
// does not start with a string delimiter, or when the selected string parser
// fails, nil and false are returned.
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
	var (
		str    string
		parsed bool
	)

	switch {
	case p.Peek(openingMultiLineBasicString):
		str, parsed = t.parseMultiLineBasicString(p)
	case p.Peek(basicStringDelimiter):
		str, parsed = t.parseBasicString("string value", p)
	case p.Peek(openingMultiLineLiteralString):
		str, parsed = t.parseMultiLineLiteralString(p)
	case p.Peek(literalStringDelimiter):
		str, parsed = t.parseLiteralString("string value", p)
	default:
		p.Expected("a string value")
		return nil, false
	}

	if !parsed {
		return nil, false
	}
	return ast.NewValue(ast.TypeString, str), true
}
// parseBasicString handles the input for basic strings.
//
// • Basic strings are surrounded by quotation marks.
//
// • Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to
// U+001F, U+007F).
//
// • No additional \escape sequences are allowed. What the spec says about this:
// "All other escape sequences [..] are reserved and, if used, TOML should
// produce an error."
//
// The name argument is used in the error message for an unescaped control
// character. The returned bool tells whether parsing succeeded; on failure,
// the text that was collected up to the point of failure is returned with it.
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
	if !p.Skip(a.DoubleQuote) {
		p.Expected(`opening quotation marks`)
		return "", false
	}

	var out strings.Builder
	for {
		// The checks below are order-sensitive: the first one that matches
		// decides how the upcoming input is handled.
		if p.Peek(controlCharacter) {
			p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Bytes[0])
			return out.String(), false
		}
		if p.Accept(validEscape) {
			if !appendEscapedRune(p, &out) {
				return out.String(), false
			}
			continue
		}
		// A backslash that did not start a valid escape sequence.
		if p.Peek(a.Backslash) {
			p.SetError("invalid escape sequence")
			return out.String(), false
		}
		if p.Skip(basicStringDelimiter) {
			return out.String(), true
		}
		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}

		// Nothing matched at all.
		p.Expected(`closing quotation marks`)
		return out.String(), false
	}
}
// parseLiteralString handles the input for literal strings.
//
// • Literal strings are surrounded by single quotes.
//
// • Like basic strings, they must appear on a single line.
//
// • Control characters other than tab are not permitted in a literal string.
//
// The name argument is used in the error message for a disallowed control
// character. The returned bool tells whether parsing succeeded; on failure,
// the text that was collected up to the point of failure is returned with it.
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
	if !p.Skip(a.SingleQuote) {
		p.Expected("opening single quote")
		return "", false
	}

	var out strings.Builder
	for {
		// The checks below are order-sensitive: the first one that matches
		// decides how the upcoming input is handled.
		if p.Skip(literalStringDelimiter) {
			return out.String(), true
		}
		// Tab is handled before the control character check, since it is
		// the one control character that is allowed here.
		if p.Skip(a.Tab) {
			out.WriteByte('\t')
			continue
		}
		if p.Peek(controlCharacter) {
			p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Bytes[0])
			return out.String(), false
		}
		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}

		// Nothing matched at all.
		p.Expected("closing single quote")
		return out.String(), false
	}
}
// parseMultiLineBasicString handles the input for multi-line basic strings.
//
// • Multi-line basic strings are surrounded by three quotation marks on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
// All other whitespace and newline characters remain intact.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform. This implementation writes "\n" for every
// newline that it reads.
//
// • All of the escape sequences that are valid for basic strings are also valid
// for multi-line basic strings.
//
// • Any Unicode character may be used except those that must be escaped:
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
// marks need not be escaped unless their presence would create a premature
// closing delimiter.
//
// • For writing long strings without introducing extraneous whitespace, use a
// "line ending backslash". When the last non-whitespace character on a line is
// a \, it will be trimmed along with all whitespace (including newlines) up to
// the next non-whitespace character or closing delimiter.
//
// The returned bool tells whether parsing succeeded; on failure, the text that
// was collected up to the point of failure is returned with it.
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
	if !p.Skip(openingMultiLineBasicString) {
		p.Expected("opening three quotation marks")
		return "", false
	}

	var out strings.Builder
	for {
		// The checks below are order-sensitive: the first one that matches
		// decides how the upcoming input is handled.

		// Newlines are handled before the control character check, since
		// they are allowed inside a multi-line string.
		if p.Skip(newline) {
			out.WriteByte('\n')
			continue
		}
		if p.Peek(controlCharacter) {
			p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Bytes[0])
			return out.String(), false
		}
		if p.Accept(validEscape) {
			if !appendEscapedRune(p, &out) {
				return out.String(), false
			}
			continue
		}
		// A line ending backslash contributes nothing to the string value.
		if p.Skip(lineEndingBackslash) {
			continue
		}
		// A backslash that started neither a valid escape sequence nor a
		// line ending backslash.
		if p.Peek(a.Backslash) {
			p.SetError("invalid escape sequence")
			return out.String(), false
		}
		if p.Skip(closingMultiLineBasicString) {
			return out.String(), true
		}
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}
		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		// Nothing matched at all.
		p.Expected("closing three quotation marks")
		return out.String(), false
	}
}
// appendEscapedRune decodes the escape sequence that is held in p.Result and
// appends the rune that it represents to sb.
//
// The callers in this file invoke it directly after a successful
// p.Accept(validEscape), so the result is one of the short escapes
// (\b \t \n \f \r \" \\) or a unicode escape (\uXXXX or \UXXXXXXXX).
//
// It returns false, after setting an error on p, when a unicode escape does
// not represent a valid rune. In that case nothing is written to sb.
func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
	escape := p.Result.String()

	var decoded rune
	switch escape {
	case `\b`:
		decoded = '\b'
	case `\t`:
		decoded = '\t'
	case `\n`:
		decoded = '\n'
	case `\f`:
		decoded = '\f'
	case `\r`:
		decoded = '\r'
	case `\"`:
		decoded = '"'
	case `\\`:
		decoded = '\\'
	default:
		// UTF8 escape code: \uXXXX or \UXXXXXXXX. Strip the two-character
		// prefix and interpret the remainder as a hexadecimal code point.
		// The error is ignored, since the hex format was already validated
		// by the parser.
		code, _ := strconv.ParseUint(escape[2:], 16, 32)
		decoded = rune(code)
		if !utf8.ValidRune(decoded) {
			p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", escape))
			return false
		}
	}

	sb.WriteRune(decoded)
	return true
}
// parseMultiLineLiteralString handles the input for multi-line literal strings.
//
// • Multi-line literal strings are surrounded by three single quotes on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
//
// • All other content between the delimiters is interpreted as-is without modification.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform. This implementation writes "\n" for every
// newline that it reads.
//
// • Control characters other than tab and newline are not permitted in a multi-line literal string.
//
// The returned bool tells whether parsing succeeded; on failure, the text that
// was collected up to the point of failure is returned with it.
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
	if !p.Skip(openingMultiLineLiteralString) {
		p.Expected("opening three single quotes")
		return "", false
	}

	var out strings.Builder
	for {
		// The checks below are order-sensitive: the first one that matches
		// decides how the upcoming input is handled.
		if p.Skip(closingMultiLineLiteralString) {
			return out.String(), true
		}
		// Tab and newline are handled before the control character check,
		// since they are the control characters that are allowed here.
		if p.Skip(a.Tab) {
			out.WriteByte('\t')
			continue
		}
		if p.Skip(newline) {
			out.WriteByte('\n')
			continue
		}
		if p.Peek(controlCharacter) {
			p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Bytes[0])
			return out.String(), false
		}
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}
		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		// Nothing matched at all.
		p.Expected("closing three single quotes")
		return out.String(), false
	}
}