// go-toml/parse/value_string.go

package parse

import (
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/parse"
	"git.makaay.nl/mauricem/go-toml/ast"
)

var (
	// Multi-line basic strings are surrounded by three quotation marks on each
	// side and allow newlines. The opening matcher also takes an optional
	// newline that directly follows the delimiter.

	multiLineBasicStringDelimiter = a.Str(`"""`)
	openingMultiLineBasicString   = multiLineBasicStringDelimiter.Then(newline.Optional())
	closingMultiLineBasicString   = m.Drop(multiLineBasicStringDelimiter)

	// Multi-line literal strings are surrounded by three single quotes on each
	// side and allow newlines. The opening matcher also takes an optional
	// newline that directly follows the delimiter.

	multiLineLiteralStringDelimiter = a.Str(`'''`)
	openingMultiLineLiteralString   = multiLineLiteralStringDelimiter.Then(newline.Optional())
	closingMultiLineLiteralString   = m.Drop(multiLineLiteralStringDelimiter)

	// Opening and closing character for basic strings.
	basicStringDelimiter = m.Drop(a.DoubleQuote)

	// Opening and closing character for literal strings.
	literalStringDelimiter = m.Drop(a.SingleQuote)

	// Control characters as defined by TOML (U+0000 to U+001F, U+007F).
	// A byte is unsigned, so only the upper bound of the first range needs
	// to be checked.

	isControlCharacter = func(b byte) bool { return b <= 0x1F || b == 0x7F }
	controlCharacter   = a.ByteByCallback(isControlCharacter)

	// For convenience, some popular characters have a compact escape sequence.
	//
	// \b         - backspace       (U+0008)
	// \t         - tab             (U+0009)
	// \n         - linefeed        (U+000A)
	// \f         - form feed       (U+000C)
	// \r         - carriage return (U+000D)
	// \"         - quote           (U+0022)
	// \\         - backslash       (U+005C)
	// \uXXXX     - unicode         (U+XXXX)
	// \UXXXXXXXX - unicode         (U+XXXXXXXX)
	//
	// The matchers only check the shape of an escape sequence (a backslash,
	// followed by a known escape character or by 'u' / 'U' and exactly 4 / 8
	// hex digits). Whether or not the hex digits form a usable code point is
	// checked when the escape sequence is decoded.

	validEscapeChar = a.Bytes('b', 't', 'n', 'f', 'r', '"', '\\')
	shortEscape     = c.Seq(a.Backslash, validEscapeChar)
	shortUTF8Escape = c.Seq(a.Backslash, a.Byte('u'), a.HexDigit.Times(4))
	longUTF8Escape  = c.Seq(a.Backslash, a.Byte('U'), a.HexDigit.Times(8))
	validEscape     = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)

	// For writing long strings without introducing extraneous whitespace, use a
	// "line ending backslash". When the last non-whitespace character on a line is
	// a \, it will be trimmed along with all whitespace (including newlines) up to
	// the next non-whitespace character or closing delimiter.

	lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
)

// parseString parses a TOML string value. There are four ways to express
// strings: basic, multi-line basic, literal and multi-line literal. All
// strings must contain only valid UTF-8 characters.
//
// The opening delimiter decides which of the specialized string parsers is
// used. The multi-line delimiters are probed before their single-line
// counterparts, since a triple quote starts with the single-line quote
// character as well.
//
// On success, the parsed text is returned as an ast.TypeString value,
// together with true. On failure, nil and false are returned. When the input
// does not start with any of the string delimiters, this is reported through
// p.Expected.
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
	var (
		text   string
		parsed bool
	)

	if p.Peek(openingMultiLineBasicString) {
		text, parsed = t.parseMultiLineBasicString(p)
	} else if p.Peek(basicStringDelimiter) {
		text, parsed = t.parseBasicString("string value", p)
	} else if p.Peek(openingMultiLineLiteralString) {
		text, parsed = t.parseMultiLineLiteralString(p)
	} else if p.Peek(literalStringDelimiter) {
		text, parsed = t.parseLiteralString("string value", p)
	} else {
		p.Expected("a string value")
	}

	if !parsed {
		return nil, false
	}
	return ast.NewValue(ast.TypeString, text), true
}

// parseBasicString handles the input for basic strings.
//
// • Basic strings are surrounded by quotation marks.
//
// • Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to
// U+001F, U+007F).
//
// • No additional \escape sequences are allowed. What the spec says about
// this: "All other escape sequences [..] are reserved and, if used, TOML
// should produce an error."
//
// The name argument is used in error messages to describe what kind of
// string is being parsed. The returned string holds the decoded content
// that was collected so far; the boolean tells whether or not the string
// was parsed completely, up to and including its closing quotation mark.
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
	if !p.Skip(a.DoubleQuote) {
		p.Expected(`opening quotation marks`)
		return "", false
	}

	var out strings.Builder
	for {
		// Raw control characters are never allowed in a basic string.
		if p.Peek(controlCharacter) {
			p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Bytes[0])
			return out.String(), false
		}

		// A well-formed escape sequence gets decoded into the output.
		if p.Accept(validEscape) {
			if !appendEscapedRune(p, &out) {
				return out.String(), false
			}
			continue
		}

		// Any backslash that is left at this point starts an unsupported
		// escape sequence.
		if p.Peek(a.Backslash) {
			p.SetError("invalid escape sequence")
			return out.String(), false
		}

		// The closing quotation mark completes the string.
		if p.Skip(basicStringDelimiter) {
			return out.String(), true
		}

		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		// Everything else must be a regular rune, which is copied as-is.
		if !p.Accept(a.ValidRune) {
			p.Expected(`closing quotation marks`)
			return out.String(), false
		}
		out.WriteString(p.Result.String())
	}
}

// parseLiteralString handles the input for literal strings.
//
// • Literal strings are surrounded by single quotes.
//
// • Like basic strings, they must appear on a single line.
//
// • Control characters other than tab are not permitted in a literal string.
//
// No escape sequences are processed: the content is copied as-is. The name
// argument is used in error messages to describe what kind of string is
// being parsed. The returned string holds the content that was collected so
// far; the boolean tells whether or not the string was parsed completely, up
// to and including its closing single quote.
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
	if !p.Skip(a.SingleQuote) {
		p.Expected("opening single quote")
		return "", false
	}

	var out strings.Builder
	for {
		// The closing single quote completes the string.
		if p.Skip(literalStringDelimiter) {
			return out.String(), true
		}

		// Tab is the one control character that is allowed, so it must be
		// handled before the generic control character check.
		if p.Skip(a.Tab) {
			out.WriteString("\t")
			continue
		}

		if p.Peek(controlCharacter) {
			p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Bytes[0])
			return out.String(), false
		}

		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		// Everything else must be a regular rune, which is copied as-is.
		if !p.Accept(a.ValidRune) {
			p.Expected("closing single quote")
			return out.String(), false
		}
		out.WriteString(p.Result.String())
	}
}

// parseMultiLineBasicString handles the input for multi-line basic strings.
//
// • Multi-line basic strings are surrounded by three quotation marks on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
// All other whitespace and newline characters remain intact.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform. This parser writes every matched newline to
// the output as "\n".
//
// • All of the escape sequences that are valid for basic strings are also
// valid for multi-line basic strings.
//
// • Any Unicode character may be used except those that must be escaped:
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
// marks need not be escaped unless their presence would create a premature
// closing delimiter.
//
// • For writing long strings without introducing extraneous whitespace, use a
// "line ending backslash". When the last non-whitespace character on a line is
// a \, it will be trimmed along with all whitespace (including newlines) up to
// the next non-whitespace character or closing delimiter.
//
// The returned string holds the decoded content that was collected so far;
// the boolean tells whether or not the string was parsed completely, up to
// and including its closing delimiter.
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
	if !p.Skip(openingMultiLineBasicString) {
		p.Expected("opening three quotation marks")
		return "", false
	}

	var out strings.Builder
	for {
		// Newlines are checked before the control characters, because they
		// are allowed in a multi-line string.
		if p.Skip(newline) {
			out.WriteString("\n")
			continue
		}

		if p.Peek(controlCharacter) {
			p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Bytes[0])
			return out.String(), false
		}

		// A well-formed escape sequence gets decoded into the output.
		if p.Accept(validEscape) {
			if !appendEscapedRune(p, &out) {
				return out.String(), false
			}
			continue
		}

		// A line ending backslash is dropped from the input, without
		// producing any output.
		if p.Skip(lineEndingBackslash) {
			continue
		}

		// Any backslash that is left at this point starts an unsupported
		// escape sequence.
		if p.Peek(a.Backslash) {
			p.SetError("invalid escape sequence")
			return out.String(), false
		}

		// The closing delimiter completes the string.
		if p.Skip(closingMultiLineBasicString) {
			return out.String(), true
		}

		// Regular runes are copied as-is.
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}

		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		p.Expected("closing three quotation marks")
		return out.String(), false
	}
}

// appendEscapedRune decodes the escape sequence that is held by p.Result
// and appends the decoded rune to sb.
//
// The compact escapes (\b, \t, \n, \f, \r, \" and \\) translate directly
// into their character. Every other input is handled as a Unicode escape
// (\uXXXX or \UXXXXXXXX), for which the hex digits after the two-character
// prefix are used as the code point.
//
// When the code point is not a valid rune according to utf8.ValidRune (out
// of range or a surrogate half), an error is set on p, nothing is appended
// to sb and false is returned. In all other cases, true is returned.
func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
	escape := p.Result.String()

	var decoded rune
	switch escape {
	case `\b`:
		decoded = '\b'
	case `\t`:
		decoded = '\t'
	case `\n`:
		decoded = '\n'
	case `\f`:
		decoded = '\f'
	case `\r`:
		decoded = '\r'
	case `\"`:
		decoded = '"'
	case `\\`:
		decoded = '\\'
	default:
		// Unicode escape code: \uXXXX or \UXXXXXXXX. The parse error is
		// ignored, because the hex format was already validated by the parser.
		codePoint, _ := strconv.ParseUint(escape[2:], 16, 32)
		decoded = rune(codePoint)
		if !utf8.ValidRune(decoded) {
			p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", escape))
			return false
		}
	}

	sb.WriteRune(decoded)
	return true
}

// parseMultiLineLiteralString handles the input for multi-line literal
// strings.
//
// • Multi-line literal strings are surrounded by three single quotes on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
//
// • All other content between the delimiters is interpreted as-is without
// modification.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform. This parser writes every matched newline to
// the output as "\n".
//
// • Control characters other than tab and newline are not permitted in a
// multi-line literal string.
//
// The returned string holds the content that was collected so far; the
// boolean tells whether or not the string was parsed completely, up to and
// including its closing delimiter.
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
	if !p.Skip(openingMultiLineLiteralString) {
		p.Expected("opening three single quotes")
		return "", false
	}

	var out strings.Builder
	for {
		// The closing delimiter completes the string.
		if p.Skip(closingMultiLineLiteralString) {
			return out.String(), true
		}

		// Tab and newline are the control characters that are allowed, so
		// they must be handled before the generic control character check.
		if p.Skip(a.Tab) {
			out.WriteString("\t")
			continue
		}
		if p.Skip(newline) {
			out.WriteString("\n")
			continue
		}

		if p.Peek(controlCharacter) {
			p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Bytes[0])
			return out.String(), false
		}

		// Regular runes are copied as-is.
		if p.Accept(a.ValidRune) {
			out.WriteString(p.Result.String())
			continue
		}

		if p.Peek(a.InvalidRune) {
			p.SetError("invalid UTF8 rune")
			return out.String(), false
		}

		p.Expected("closing three single quotes")
		return out.String(), false
	}
}