go-toml/parse/value_string.go

package parse

import (
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/parse"
	"git.makaay.nl/mauricem/go-parsekit/tokenize"
	"git.makaay.nl/mauricem/go-toml/ast"
)

var (
	// Multi-line basic strings are surrounded by three quotation marks on each
	// side and allow newlines. The opening matcher also takes an optional
	// newline that directly follows the delimiter.

	multiLineBasicStringDelimiter = a.Str(`"""`)
	openingMultiLineBasicString   = multiLineBasicStringDelimiter.Then(newline.Optional())
	closingMultiLineBasicString   = m.Drop(multiLineBasicStringDelimiter)

	// Multi-line literal strings are surrounded by three single quotes on each
	// side and allow newlines. The opening matcher also takes an optional
	// newline that directly follows the delimiter.

	multiLineLiteralStringDelimiter = a.Str(`'''`)
	openingMultiLineLiteralString   = multiLineLiteralStringDelimiter.Then(newline.Optional())
	closingMultiLineLiteralString   = m.Drop(multiLineLiteralStringDelimiter)

	// Opening and closing character for basic strings.
	basicStringDelimiter = a.DoubleQuote

	// Opening and closing character for literal strings.
	literalStringDelimiter = a.SingleQuote

	// For convenience, some popular characters have a compact escape sequence.
	//
	// \b         - backspace       (U+0008)
	// \t         - tab             (U+0009)
	// \n         - linefeed        (U+000A)
	// \f         - form feed       (U+000C)
	// \r         - carriage return (U+000D)
	// \"         - quote           (U+0022)
	// \\         - backslash       (U+005C)
	// \uXXXX     - unicode         (U+XXXX)
	// \UXXXXXXXX - unicode         (U+XXXXXXXX)
	//
	// validEscape matches any one of these: a backslash followed by a single
	// escape character, or by 'u' plus 4 hex digits, or by 'U' plus 8 hex
	// digits.

	validEscapeChar = a.Char('b', 't', 'n', 'f', 'r', '"', '\\')
	shortEscape     = c.Seq(a.Backslash, validEscapeChar)
	shortUTF8Escape = c.Seq(a.Backslash, a.Char('u'), a.HexDigit.Times(4))
	longUTF8Escape  = c.Seq(a.Backslash, a.Char('U'), a.HexDigit.Times(8))
	validEscape     = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)

	// For writing long strings without introducing extraneous whitespace, use a
	// "line ending backslash". When the last non-whitespace character on a line is
	// a \, it will be trimmed along with all whitespace (including newlines) up to
	// the next non-whitespace character or closing delimiter.
	//
	// The matcher is the sequence: backslash, whitespace, newline and then an
	// optional run of whitespace that may include newlines.

	lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
)

// parseString parses a TOML string value at the current input position.
//
// There are four ways to express strings: basic, multi-line basic, literal and
// multi-line literal. All strings must contain only valid UTF-8 characters.
//
// The multi-line openers are checked before their single-line counterparts.
// When none of the openers matches, an "expected" error is registered on p.
//
// On success, the string is returned wrapped in an ast.Value of type
// ast.TypeString together with true. On failure, nil and false are returned.
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
	var (
		str    string
		parsed bool
	)

	switch {
	case p.Peek(openingMultiLineBasicString):
		str, parsed = t.parseMultiLineBasicString(p)
	case p.Peek(basicStringDelimiter):
		str, parsed = t.parseBasicString("string value", p)
	case p.Peek(openingMultiLineLiteralString):
		str, parsed = t.parseMultiLineLiteralString(p)
	case p.Peek(literalStringDelimiter):
		str, parsed = t.parseLiteralString("string value", p)
	default:
		p.Expected("a string value")
	}

	if !parsed {
		return nil, false
	}
	return ast.NewValue(ast.TypeString, str), true
}

// parseBasicString parses a basic string from the input by means of
// basicStringHandler.
//
// • Basic strings are surrounded by quotation marks.
//
// • Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to
// U+001F, U+007F).
//
// • No additional \escape sequences are allowed. What the spec says about
// this: "All other escape sequences [..] are reserved and, if used, TOML
// should produce an error."
//
// The name argument is currently not used by this method. The returned
// boolean tells whether a basic string was accepted; when it is false, the
// returned string is empty.
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
	if p.Accept(basicStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}

// stringTokenizerState identifies the state of the byte-by-byte state
// machines that are used by the string handlers in this file.
type stringTokenizerState int

const (
	// strStart is the initial state: the first opening quote is expected.
	strStart stringTokenizerState = iota
	// strStart2 expects the second opening quote of a multi-line string.
	strStart2
	// strStart3 expects the third opening quote of a multi-line string.
	strStart3
	// strStart4 handles the optional newline (LF or CRLF) that may directly
	// follow the opening delimiter of a multi-line string.
	strStart4
	// strChar reads regular string content.
	strChar
	// strEscape is entered after a backslash; the escape character is expected.
	strEscape
	// strEscapeUnicode collects the hex digits of a \u or \U escape.
	strEscapeUnicode
	// strEscapeConcatWs1 skips blanks between a line ending backslash and
	// the newline that must follow it.
	strEscapeConcatWs1
	// strEscapeConcatCRLF expects the LF that completes a CRLF newline
	// after a line ending backslash.
	strEscapeConcatCRLF
	// strEscapeConcatWs2 skips blanks and newlines that follow the newline
	// after a line ending backslash.
	strEscapeConcatWs2
	// strCRLF is entered after a CR in string content; an LF is expected.
	strCRLF
	// strUTF8 reads the continuation bytes of a multi-byte UTF-8 sequence.
	strUTF8
	// strEnd2 is entered after one quote in multi-line content; a second
	// quote is needed to continue towards a closing delimiter.
	strEnd2
	// strEnd3 is entered after two consecutive quotes in multi-line content;
	// a third quote completes the closing delimiter.
	strEnd3
)

// Bit masks that select the lowest 6, 5, 4 or 3 bits of a byte.
const (
	lowest6bits = 0x3F // 0011 1111
	lowest5bits = 0x1F // 0001 1111
	lowest4bits = 0x0F // 0000 1111
	lowest3bits = 0x07 // 0000 0111
)

// basicStringHandler is a tokenize handler that matches a basic string: a
// single-line string that is surrounded by double quotes.
//
// The handler walks the buffered input byte by byte using a small state
// machine. The contents of the string (without the surrounding quotes and
// with escape sequences replaced by the bytes that they represent) are
// written to the output of tokenAPI.
//
// It returns true when a complete basic string was read. It returns false on
// end of input before the closing quote, on an unescaped control character
// (U+0000 to U+001F, U+007F), on an invalid escape sequence and on input
// that is not valid UTF-8 (which includes overlong encodings, surrogates
// and code points beyond U+10FFFF).
//
// NOTE(review): the bytes of a pending multi-byte sequence or unicode escape
// are only consumed once the sequence is complete. Confirm that the
// semantics of PeekBuffered make this safe when a sequence spans two
// buffered chunks.
func basicStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for \uXXXX and \UXXXXXXXX escape sequences.
	unicodeReqLen := 0
	unicodeLen := 0
	unicodeHex := make([]byte, 8)

	// Bookkeeping for multi-byte UTF-8 sequences. utf8Rune is also used to
	// accumulate the code point of a unicode escape sequence.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Rune := rune(0)
	utf8Bytes := make([]byte, 4)

	for {
		// The error is not inspected here; an empty result is treated as
		// the end of the input.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart:
				if b != '"' {
					// No opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				state = strChar
			case strChar:
				switch {
				case b <= 0x1F || b == 0x7F:
					// Control characters as defined by the TOML specification.
					// These must always be escaped.
					// TODO error reporting instead of full reject
					return false
				case b == '\\':
					in.Byte.MoveCursor(b)
					state = strEscape
					continue
				case b == '"':
					in.Byte.MoveCursor(b)
					return true
				}
				// For a multi-byte lead byte, only the payload bits are
				// stored here. The shifting is done when the continuation
				// bytes are added, so no bits are lost in byte arithmetic.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
					utf8Rune = rune(b & lowest5bits)
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
					utf8Rune = rune(b & lowest4bits)
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					if b > 0xF7 {
						// 11111xxx is never a valid lead byte.
						return false
					}
					utf8ReqLen = 4
					utf8Rune = rune(b & lowest3bits)
				default: // Invalid UTF8 rune
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
				if utf8Len == utf8ReqLen {
					// RuneLen returns -1 for surrogates and for values
					// beyond U+10FFFF, and a shorter length for overlong
					// encodings. Only the shortest valid form is accepted.
					if utf8.RuneLen(utf8Rune) != utf8ReqLen {
						// Invalid unicode character
						return false
					}
					bytes := utf8Bytes[:utf8Len]
					out.AddBytes(bytes...)
					in.Byte.MoveCursorMulti(bytes...)
					state = strChar
				}
			case strEscape:
				state = strChar
				if escaped, ok := getEscapedChar(b); ok {
					out.AddByte(escaped)
					in.Byte.MoveCursor(b)
					continue
				}
				switch b {
				case 'u', 'U':
					in.Byte.MoveCursor(b)
					unicodeReqLen = 4
					if b == 'U' {
						unicodeReqLen = 8
					}
					unicodeLen = 0
					utf8Rune = 0
					state = strEscapeUnicode
				default:
					// Invalid escape sequence used.
					return false
				}
			case strEscapeUnicode:
				value, ok := getHexValueForChar(b)
				if !ok {
					// Invalid unicode escape sequence used.
					return false
				}
				utf8Rune = utf8Rune<<4 + rune(value)
				unicodeHex[unicodeLen] = b
				unicodeLen++
				if unicodeLen == unicodeReqLen {
					if !utf8.ValidRune(utf8Rune) {
						// Invalid unicode escape
						return false
					}
					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
					// utf8Bytes is free to use as scratch space here,
					// because no multi-byte sequence is pending.
					w := utf8.EncodeRune(utf8Bytes, utf8Rune)
					out.AddBytes(utf8Bytes[:w]...)
					state = strChar
				}
			}
		}
	}
}

// getHexValueForChar returns the numeric value (0 to 15) of the hexadecimal
// digit b. Both lowercase (a-f) and uppercase (A-F) digits are supported.
// The returned boolean is false when b is not a hexadecimal digit, in which
// case the returned value is 0.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}

// getEscapedChar translates the character b, as found directly after a
// backslash, into the byte that the escape sequence stands for. The returned
// boolean is false when b does not form a supported single-character escape
// sequence, in which case the returned byte is 0.
func getEscapedChar(b byte) (byte, bool) {
	var unescaped byte
	switch b {
	case '"', '\\':
		// These escape sequences stand for the character itself.
		unescaped = b
	case 'b':
		unescaped = '\b'
	case 't':
		unescaped = '\t'
	case 'n':
		unescaped = '\n'
	case 'f':
		unescaped = '\f'
	case 'r':
		unescaped = '\r'
	default:
		return 0, false
	}
	return unescaped, true
}

// parseLiteralString parses a literal string from the input by means of
// literalStringHandler.
//
// • Literal strings are surrounded by single quotes.
//
// • Like basic strings, they must appear on a single line.
//
// • Control characters other than tab are not permitted in a literal string.
//
// The name argument is currently not used by this method. The returned
// boolean tells whether a literal string was accepted; when it is false, the
// returned string is empty.
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
	if p.Accept(literalStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}

// literalStringHandler is a tokenize handler that matches a literal string:
// a single-line string that is surrounded by single quotes, in which no
// escape sequences are interpreted.
//
// The handler walks the buffered input byte by byte using a small state
// machine. The contents of the string (without the surrounding quotes) are
// written to the output of tokenAPI.
//
// It returns true when a complete literal string was read. It returns false
// on end of input before the closing quote, on a control character other
// than tab and on input that is not valid UTF-8 (which includes overlong
// encodings, surrogates and code points beyond U+10FFFF).
//
// NOTE(review): the bytes of a pending multi-byte sequence are only consumed
// once the sequence is complete. Confirm that the semantics of PeekBuffered
// make this safe when a sequence spans two buffered chunks.
func literalStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for multi-byte UTF-8 sequences.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Rune := rune(0)
	utf8Bytes := [4]byte{}

	for {
		// The error is not inspected here; an empty result is treated as
		// the end of the input.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			// Unexpected end of file.
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart:
				if b != '\'' {
					// No opening quote found.
					return false
				}
				in.Byte.MoveCursor(b)
				state = strChar
			case strChar:
				switch {
				case (b <= 0x1F && b != '\t') || b == 0x7F:
					// Control character other than tab.
					return false
				case b == '\'':
					in.Byte.MoveCursor(b)
					return true
				}
				// For a multi-byte lead byte, only the payload bits are
				// stored here. The shifting is done when the continuation
				// bytes are added, so no bits are lost in byte arithmetic.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
					utf8Rune = rune(b & lowest5bits)
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
					utf8Rune = rune(b & lowest4bits)
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					if b > 0xF7 {
						// 11111xxx is never a valid lead byte.
						return false
					}
					utf8ReqLen = 4
					utf8Rune = rune(b & lowest3bits)
				default: // Invalid UTF8 rune
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
				if utf8Len == utf8ReqLen {
					// RuneLen returns -1 for surrogates and for values
					// beyond U+10FFFF, and a shorter length for overlong
					// encodings. Only the shortest valid form is accepted.
					if utf8.RuneLen(utf8Rune) != utf8ReqLen {
						// Invalid unicode character
						return false
					}
					bytes := utf8Bytes[:utf8Len]
					out.AddBytes(bytes...)
					in.Byte.MoveCursorMulti(bytes...)
					state = strChar
				}
			}
		}
	}
}

// parseMultiLineBasicString parses a multi-line basic string from the input
// by means of multiLineBasicStringHandler.
//
// • Multi-line basic strings are surrounded by three quotation marks on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
// All other whitespace and newline characters remain intact.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform.
//
// • All of the escape sequences that are valid for basic strings are also
// valid for multi-line basic strings.
//
// • Any Unicode character may be used except those that must be escaped:
// backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
// marks need not be escaped unless their presence would create a premature
// closing delimiter.
//
// • For writing long strings without introducing extraneous whitespace, use a
// "line ending backslash". When the last non-whitespace character on a line is
// a \, it will be trimmed along with all whitespace (including newlines) up to
// the next non-whitespace character or closing delimiter.
//
// The returned boolean tells whether a multi-line basic string was accepted;
// when it is false, the returned string is empty.
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
	if p.Accept(multiLineBasicStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}

// multiLineBasicStringHandler is a tokenize handler that matches a
// multi-line basic string: a string that is surrounded by three double
// quotes on each side.
//
// The handler walks the buffered input byte by byte using a small state
// machine. The contents of the string are written to the output of tokenAPI,
// with the following processing applied:
//
// • The delimiters are not included in the output.
//
// • A newline (LF or CRLF) directly after the opening delimiter is dropped.
//
// • A CRLF newline in the contents is written as a single LF.
//
// • Escape sequences are replaced by the bytes that they represent.
//
// • A line ending backslash is dropped, together with the newline and all
// blanks and newlines that follow it.
//
// • One or two consecutive quotes that are not followed by a third one are
// written to the output as regular content.
//
// It returns true when a complete string was read. It returns false on end
// of input before the closing delimiter, on an unescaped control character
// other than a newline, on a CR that is not followed by LF, on an invalid
// escape sequence and on input that is not valid UTF-8 (which includes
// overlong encodings, surrogates and code points beyond U+10FFFF).
//
// NOTE(review): the bytes of a pending multi-byte sequence, unicode escape
// or CR are only consumed once the sequence is complete. Confirm that the
// semantics of PeekBuffered make this safe when a sequence spans two
// buffered chunks.
func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for \uXXXX and \UXXXXXXXX escape sequences.
	unicodeReqLen := 0
	unicodeLen := 0
	unicodeHex := make([]byte, 8)

	// Bookkeeping for multi-byte UTF-8 sequences. utf8Rune is also used to
	// accumulate the code point of a unicode escape sequence.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Rune := rune(0)
	utf8Bytes := make([]byte, 4)

	// Set when a CR was seen directly after the opening delimiter.
	crlf := false

	for {
		// The error is not inspected here; an empty result is treated as
		// the end of the input.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart, strStart2, strStart3:
				if b != '"' {
					// No triple opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				switch state {
				case strStart:
					state = strStart2
				case strStart2:
					state = strStart3
				case strStart3:
					state = strStart4
				}
			case strStart4:
				// An optional newline directly after the opening delimiter
				// is consumed without writing it to the output.
				if !crlf && b == '\r' {
					crlf = true
					in.Byte.MoveCursor(b)
					continue
				}
				if b == '\n' {
					in.Byte.MoveCursor(b)
					state = strChar
					continue
				}
				if crlf {
					// Lonely \r without \n.
					return false
				}
				// No newline here, so this byte is regular content.
				state = strChar
				fallthrough
			case strChar:
				switch {
				case b == '\r':
					// The CR is consumed together with its LF in strCRLF.
					state = strCRLF
					continue
				case b == '\n':
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case b <= 0x1F || b == 0x7F:
					// Unescaped control character
					// TODO error reporting instead of full reject
					return false
				case b == '\\':
					in.Byte.MoveCursor(b)
					state = strEscape
					continue
				case b == '"':
					// Possibly the start of the closing delimiter.
					in.Byte.MoveCursor(b)
					state = strEnd2
					continue
				}
				// For a multi-byte lead byte, only the payload bits are
				// stored here. The shifting is done when the continuation
				// bytes are added, so no bits are lost in byte arithmetic.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
					utf8Rune = rune(b & lowest5bits)
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
					utf8Rune = rune(b & lowest4bits)
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					if b > 0xF7 {
						// 11111xxx is never a valid lead byte.
						return false
					}
					utf8ReqLen = 4
					utf8Rune = rune(b & lowest3bits)
				default: // Invalid UTF8 rune
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
				if utf8Len == utf8ReqLen {
					// RuneLen returns -1 for surrogates and for values
					// beyond U+10FFFF, and a shorter length for overlong
					// encodings. Only the shortest valid form is accepted.
					if utf8.RuneLen(utf8Rune) != utf8ReqLen {
						// Invalid unicode character
						return false
					}
					bytes := utf8Bytes[:utf8Len]
					out.AddBytes(bytes...)
					in.Byte.MoveCursorMulti(bytes...)
					state = strChar
				}
			case strCRLF:
				if b == '\n' {
					// CRLF is normalized to a single LF in the output.
					in.Byte.MoveCursorMulti('\r', b)
					out.AddByte('\n')
					state = strChar
					continue
				}
				// Lonely \r, should have been escaped.
				return false
			case strEscape:
				state = strChar
				if escaped, ok := getEscapedChar(b); ok {
					out.AddByte(escaped)
					in.Byte.MoveCursor(b)
					continue
				}
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs1
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				case 'u', 'U':
					in.Byte.MoveCursor(b)
					unicodeReqLen = 4
					if b == 'U' {
						unicodeReqLen = 8
					}
					unicodeLen = 0
					utf8Rune = 0
					state = strEscapeUnicode
				default:
					// Invalid escape sequence used.
					return false
				}
			case strEscapeConcatWs1:
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// Invalid line concatenation
					return false
				}
			case strEscapeConcatCRLF:
				switch b {
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// Invalid line concatenation
					return false
				}
			case strEscapeConcatWs2:
				switch b {
				case ' ', '\t':
					in.Byte.MoveCursor(b)
					continue
				case '\r':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatCRLF
					continue
				case '\n':
					in.Byte.MoveCursor(b)
					state = strEscapeConcatWs2
					continue
				default:
					// End of the trimmed whitespace. Step back, so this
					// byte is processed again as regular content.
					i--
					state = strChar
					continue
				}
			case strEscapeUnicode:
				value, ok := getHexValueForChar(b)
				if !ok {
					// Invalid unicode escape sequence used.
					return false
				}
				utf8Rune = utf8Rune<<4 + rune(value)
				unicodeHex[unicodeLen] = b
				unicodeLen++
				if unicodeLen == unicodeReqLen {
					if !utf8.ValidRune(utf8Rune) {
						// Invalid unicode escape
						return false
					}
					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
					// utf8Bytes is free to use as scratch space here,
					// because no multi-byte sequence is pending.
					w := utf8.EncodeRune(utf8Bytes, utf8Rune)
					out.AddBytes(utf8Bytes[:w]...)
					state = strChar
				}
			case strEnd2:
				if b == '"' {
					state = strEnd3
					in.Byte.MoveCursor(b)
				} else {
					// A single quote was regular content. Write it and
					// step back to process this byte again as content.
					state = strChar
					out.AddByte('"')
					i--
				}
			case strEnd3:
				if b == '"' {
					in.Byte.MoveCursor(b)
					return true
				}
				// Two quotes were regular content. Write them and step
				// back to process this byte again as content.
				state = strChar
				out.AddBytes('"', '"')
				i--
			}
		}
	}
}

// parseMultiLineLiteralString parses a multi-line literal string from the
// input by means of multiLineLiteralStringHandler.
//
// • Multi-line literal strings are surrounded by three single quotes on
// each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
//
// • All other content between the delimiters is interpreted as-is without
// modification.
//
// • TOML parsers should feel free to normalize newline to whatever makes
// sense for their platform.
//
// • Control characters other than tab and newline are not permitted in a
// multi-line literal string.
//
// The returned boolean tells whether a multi-line literal string was
// accepted; when it is false, the returned string is empty.
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
	if p.Accept(multiLineLiteralStringHandler) {
		return p.Result.String(), true
	}
	return "", false
}

// multiLineLiteralStringHandler is a tokenize handler that matches a
// multi-line literal string: a string that is surrounded by three single
// quotes on each side, in which no escape sequences are interpreted.
//
// The handler walks the buffered input byte by byte using a small state
// machine. The contents of the string are written to the output of tokenAPI,
// with the following processing applied:
//
// • The delimiters are not included in the output.
//
// • A newline (LF or CRLF) directly after the opening delimiter is dropped.
//
// • A CRLF newline in the contents is written as a single LF.
//
// • One or two consecutive quotes that are not followed by a third one are
// written to the output as regular content.
//
// It returns true when a complete string was read. It returns false on end
// of input before the closing delimiter, on a control character other than
// tab or a newline, on a CR that is not followed by LF and on input that is
// not valid UTF-8 (which includes overlong encodings, surrogates and code
// points beyond U+10FFFF).
//
// NOTE(review): the bytes of a pending multi-byte sequence or CR are only
// consumed once the sequence is complete. Confirm that the semantics of
// PeekBuffered make this safe when a sequence spans two buffered chunks.
func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
	var state stringTokenizerState
	in := tokenAPI.Input
	out := tokenAPI.Output

	// Bookkeeping for multi-byte UTF-8 sequences.
	utf8ReqLen := 0
	utf8Len := 0
	utf8Rune := rune(0)
	utf8Bytes := make([]byte, 4)

	// Set when a CR was seen directly after the opening delimiter.
	crlf := false

	for {
		// The error is not inspected here; an empty result is treated as
		// the end of the input.
		bs, _ := in.Byte.PeekBuffered(0)
		bslen := len(bs)
		if bslen == 0 {
			return false
		}
		for i := 0; i < bslen; i++ {
			b := bs[i]
			switch state {
			case strStart, strStart2, strStart3:
				if b != '\'' {
					// No triple opening quotes found.
					return false
				}
				in.Byte.MoveCursor(b)
				switch state {
				case strStart:
					state = strStart2
				case strStart2:
					state = strStart3
				case strStart3:
					state = strStart4
				}
			case strStart4:
				// An optional newline directly after the opening delimiter
				// is consumed without writing it to the output.
				if !crlf && b == '\r' {
					crlf = true
					in.Byte.MoveCursor(b)
					continue
				}
				if b == '\n' {
					in.Byte.MoveCursor(b)
					state = strChar
					continue
				}
				if crlf {
					// Lonely \r without \n.
					return false
				}
				// No newline here, so this byte is regular content.
				state = strChar
				fallthrough
			case strChar:
				switch {
				case b == '\r':
					// The CR is consumed together with its LF in strCRLF.
					state = strCRLF
					continue
				case b == '\n' || b == '\t':
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case b <= 0x1F || b == 0x7F:
					// Unescaped control character
					// TODO error reporting instead of full reject
					return false
				case b == '\'':
					// Possibly the start of the closing delimiter.
					in.Byte.MoveCursor(b)
					state = strEnd2
					continue
				}
				// For a multi-byte lead byte, only the payload bits are
				// stored here. The shifting is done when the continuation
				// bytes are added, so no bits are lost in byte arithmetic.
				switch b >> 4 {
				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
					out.AddByte(b)
					in.Byte.MoveCursor(b)
					continue
				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
					utf8ReqLen = 2
					utf8Rune = rune(b & lowest5bits)
				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
					utf8ReqLen = 3
					utf8Rune = rune(b & lowest4bits)
				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
					if b > 0xF7 {
						// 11111xxx is never a valid lead byte.
						return false
					}
					utf8ReqLen = 4
					utf8Rune = rune(b & lowest3bits)
				default: // Invalid UTF8 rune
					return false
				}
				utf8Bytes[0] = b
				utf8Len = 1
				state = strUTF8
			case strUTF8:
				// This should be a continuation byte (10xxxxxx)
				if b>>6 != 2 {
					// Invalid UTF8 rune
					return false
				}
				utf8Bytes[utf8Len] = b
				utf8Len++
				utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
				if utf8Len == utf8ReqLen {
					// RuneLen returns -1 for surrogates and for values
					// beyond U+10FFFF, and a shorter length for overlong
					// encodings. Only the shortest valid form is accepted.
					if utf8.RuneLen(utf8Rune) != utf8ReqLen {
						// Invalid unicode character
						return false
					}
					bytes := utf8Bytes[:utf8Len]
					out.AddBytes(bytes...)
					in.Byte.MoveCursorMulti(bytes...)
					state = strChar
				}
			case strCRLF:
				if b == '\n' {
					// CRLF is normalized to a single LF in the output.
					in.Byte.MoveCursorMulti('\r', b)
					out.AddByte('\n')
					state = strChar
					continue
				}
				// Lonely \r, should have been escaped.
				return false
			case strEnd2:
				if b == '\'' {
					state = strEnd3
					in.Byte.MoveCursor(b)
				} else {
					// A single quote was regular content. Write it and
					// step back to process this byte again as content.
					state = strChar
					out.AddByte('\'')
					i--
				}
			case strEnd3:
				if b == '\'' {
					in.Byte.MoveCursor(b)
					return true
				}
				// Two quotes were regular content. Write them and step
				// back to process this byte again as content.
				state = strChar
				out.AddBytes('\'', '\'')
				i--
			}
		}
	}
}