285 lines
8.9 KiB
Go
285 lines
8.9 KiB
Go
package parse
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"git.makaay.nl/mauricem/go-parsekit/parse"
|
|
"git.makaay.nl/mauricem/go-toml/ast"
|
|
)
|
|
|
|
var (
|
|
// Multi-line basic strings are surrounded by three quotation marks on each
|
|
// side and allow newlines.
|
|
|
|
doubleQuote3 = a.Str(`"""`)
|
|
|
|
// Multi-line literal strings are surrounded by three single quotes on each side and allow newlines.
|
|
|
|
singleQuote3 = a.Str(`'''`)
|
|
|
|
// Control characters as defined by TOML (U+0000 to U+001F, U+007F)
|
|
|
|
isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F }
|
|
controlCharacter = a.ByteByCallback(isControlCharacter)
|
|
|
|
// For convenience, some popular characters have a compact escape sequence.
|
|
//
|
|
// \b - backspace (U+0008)
|
|
// \t - tab (U+0009)
|
|
// \n - LF (U+000A)
|
|
// \f - form feed (U+000C)
|
|
// \r - carriage return (U+000D)
|
|
// \" - quote (U+0022)
|
|
// \\ - backslash (U+005C)
|
|
// \uXXXX - unicode (U+XXXX)
|
|
// \UXXXXXXXX - unicode (U+XXXXXXXX)
|
|
|
|
validEscapeChar = a.Bytes('b', 't', 'n', 'f', 'r', '"', '\\')
|
|
shortEscape = c.Seq(a.Backslash, validEscapeChar)
|
|
shortUTF8Escape = c.Seq(a.Backslash, a.Byte('u'), a.HexDigit.Times(4))
|
|
longUTF8Escape = c.Seq(a.Backslash, a.Byte('U'), a.HexDigit.Times(8))
|
|
validEscape = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)
|
|
|
|
// For writing long strings without introducing extraneous whitespace, use a
|
|
// "line ending backslash". When the last non-whitespace character on a line is
|
|
// a \, it will be trimmed along with all whitespace (including newlines) up to
|
|
// the next non-whitespace character or closing delimiter.
|
|
|
|
lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
|
|
)
|
|
|
|
// There are four ways to express strings: basic, multi-line basic, literal and
|
|
// multi-line literal. All strings must parse/value_array.gocontain only valid UTF-8 characters.
|
|
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
|
|
var value string
|
|
var ok bool
|
|
switch {
|
|
case p.Peek(doubleQuote3):
|
|
value, ok = t.parseMultiLineBasicString(p)
|
|
case p.Peek(a.DoubleQuote):
|
|
value, ok = t.parseBasicString("string value", p)
|
|
case p.Peek(singleQuote3):
|
|
value, ok = t.parseMultiLineLiteralString(p)
|
|
case p.Peek(a.SingleQuote):
|
|
value, ok = t.parseLiteralString("string value", p)
|
|
default:
|
|
p.Expected("a string value")
|
|
}
|
|
if ok {
|
|
return ast.NewValue(ast.TypeString, value), ok
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
// Specific handling of input for basic strings.
|
|
//
|
|
// • Basic strings are surrounded by quotation marks.
|
|
//
|
|
// • Any Unicode character may be used except those that must be escaped:
|
|
// quotation mark, backslash, and the control characters (U+0000 to
|
|
// U+001F, U+007F).
|
|
//
|
|
// • No additional \escape sequences are allowed. What the spec say about this:
|
|
// "All other escape sequences [..] are reserved and, if used, TOML should
|
|
// produce an error.""
|
|
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
|
|
if !p.Accept(a.DoubleQuote) {
|
|
p.Expected(`opening quotation marks`)
|
|
return "", false
|
|
}
|
|
sb := &strings.Builder{}
|
|
for {
|
|
switch {
|
|
case p.Peek(controlCharacter):
|
|
p.Error("invalid character in %s: %q (must be escaped)", name, p.Result.Runes[0])
|
|
return sb.String(), false
|
|
case p.Accept(validEscape):
|
|
if !appendEscapedRune(p, sb) {
|
|
return sb.String(), false
|
|
}
|
|
case p.Peek(a.Backslash):
|
|
p.Error("invalid escape sequence")
|
|
return sb.String(), false
|
|
case p.Accept(m.Drop(a.DoubleQuote)):
|
|
return sb.String(), true
|
|
case p.Peek(a.InvalidRune):
|
|
p.Error("invalid UTF8 rune")
|
|
return sb.String(), false
|
|
case p.Accept(a.ValidRune):
|
|
sb.WriteString(p.Result.String())
|
|
default:
|
|
p.Expected(`closing quotation marks`)
|
|
return sb.String(), false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Specific handling of input for literal strings.
|
|
//
|
|
// • Literal strings are surrounded by single quotes.
|
|
//
|
|
// • Like basic strings, they must appear on a single line.
|
|
//
|
|
// • Control characters other than tab are not permitted in a literal string.
|
|
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
|
|
if !p.Accept(a.SingleQuote) {
|
|
p.Expected("opening single quote")
|
|
return "", false
|
|
}
|
|
sb := &strings.Builder{}
|
|
for {
|
|
switch {
|
|
case p.Accept(m.Drop(a.SingleQuote)):
|
|
return sb.String(), true
|
|
case p.Accept(a.Tab):
|
|
sb.WriteString("\t")
|
|
case p.Peek(controlCharacter):
|
|
p.Error("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Runes[0])
|
|
return sb.String(), false
|
|
case p.Peek(a.InvalidRune):
|
|
p.Error("invalid UTF8 rune")
|
|
return sb.String(), false
|
|
case p.Accept(a.ValidRune):
|
|
sb.WriteString(p.Result.String())
|
|
default:
|
|
p.Expected("closing single quote")
|
|
return sb.String(), false
|
|
}
|
|
}
|
|
}
|
|
|
|
// Specific handling of input for multi-line basic strings.
|
|
//
|
|
// • Multi-line basic strings are surrounded by three quotation marks on
|
|
// each side and allow newlines.
|
|
//
|
|
// • A newline immediately following the opening delimiter will be trimmed.
|
|
// All other whitespace and newline characters remain intact.
|
|
//
|
|
// • TOML parsers should feel free to normalize newline to whatever makes
|
|
// sense for their platform.
|
|
//
|
|
// • All of the escape sequences that are valid for basic strings are also valid
|
|
// for multi-line basic strings.
|
|
//
|
|
// • Any Unicode character may be used except those that must be escaped:
|
|
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
|
|
// marks need not be escaped unless their presence would create a premature
|
|
// closing delimiter.
|
|
//
|
|
// • For writing long strings without introducing extraneous whitespace, use a
|
|
// "line ending backslash". When the last non-whitespace character on a line is
|
|
// a \, it will be trimmed along with all whitespace (including newlines) up to
|
|
// the next non-whitespace character or closing delimiter.
|
|
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
|
|
if !p.Accept(doubleQuote3.Then(newline.Optional())) {
|
|
p.Expected("opening three quotation marks")
|
|
return "", false
|
|
}
|
|
sb := &strings.Builder{}
|
|
for {
|
|
switch {
|
|
case p.Accept(newline):
|
|
sb.WriteString("\n")
|
|
case p.Peek(controlCharacter):
|
|
p.Error("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Runes[0])
|
|
return sb.String(), false
|
|
case p.Accept(validEscape):
|
|
if !appendEscapedRune(p, sb) {
|
|
return sb.String(), false
|
|
}
|
|
case p.Accept(lineEndingBackslash):
|
|
// NOOP, the line-ending backslash sequence is skipped.
|
|
case p.Peek(a.Backslash):
|
|
p.Error("invalid escape sequence")
|
|
return sb.String(), false
|
|
case p.Accept(m.Drop(doubleQuote3)):
|
|
return sb.String(), true
|
|
case p.Accept(a.ValidRune):
|
|
sb.WriteString(p.Result.String())
|
|
case p.Peek(a.InvalidRune):
|
|
p.Error("invalid UTF8 rune")
|
|
return sb.String(), false
|
|
default:
|
|
p.Expected("closing three quotation marks")
|
|
return sb.String(), false
|
|
}
|
|
}
|
|
}
|
|
|
|
func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
|
|
s := p.Result.String()
|
|
switch s {
|
|
case `\b`:
|
|
sb.WriteRune('\b')
|
|
case `\t`:
|
|
sb.WriteRune('\t')
|
|
case `\n`:
|
|
sb.WriteRune('\n')
|
|
case `\f`:
|
|
sb.WriteRune('\f')
|
|
case `\r`:
|
|
sb.WriteRune('\r')
|
|
case `\"`:
|
|
sb.WriteRune('"')
|
|
case `\\`:
|
|
sb.WriteRune('\\')
|
|
default:
|
|
// UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX.
|
|
hex := s[2:]
|
|
val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser
|
|
r := rune(val)
|
|
if !utf8.ValidRune(r) {
|
|
p.Error(fmt.Sprintf("invalid UTF8 escape '%s'", s))
|
|
return false
|
|
}
|
|
sb.WriteRune(r)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// Specific handling of input for multi-line literal strings.
|
|
//
|
|
// • Multi-line literal strings are surrounded by three single quotes on
|
|
// each side and allow newlines.
|
|
//
|
|
// • A newline immediately following the opening delimiter will be trimmed.
|
|
//
|
|
// • All other content between the delimiters is interpreted as-is without modification.
|
|
//
|
|
// • TOML parsers should feel free to normalize newline to whatever makes
|
|
// sense for their platform.
|
|
//
|
|
// • Control characters other than tab and newline are not permitted in a multi-line literal string.
|
|
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
|
|
if !p.Accept(singleQuote3.Then(newline.Optional())) {
|
|
p.Expected("opening three single quotes")
|
|
return "", false
|
|
}
|
|
sb := &strings.Builder{}
|
|
for {
|
|
switch {
|
|
case p.Accept(m.Drop(singleQuote3)):
|
|
return sb.String(), true
|
|
case p.Accept(a.Tab):
|
|
sb.WriteString("\t")
|
|
case p.Accept(newline):
|
|
sb.WriteString("\n")
|
|
case p.Peek(controlCharacter):
|
|
p.Error("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Runes[0])
|
|
return sb.String(), false
|
|
case p.Accept(a.ValidRune):
|
|
sb.WriteString(p.Result.String())
|
|
case p.Peek(a.InvalidRune):
|
|
p.Error("invalid UTF8 rune")
|
|
return sb.String(), false
|
|
default:
|
|
p.Expected("closing three single quotes")
|
|
return sb.String(), false
|
|
}
|
|
}
|
|
}
|