package parse import ( "fmt" "strconv" "strings" "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/parse" "git.makaay.nl/mauricem/go-toml/ast" ) var ( // Multi-line basic strings are surrounded by three quotation marks on each // side and allow newlines. multiLineBasicStringDelimiter = a.Str(`"""`) openingMultiLineBasicString = multiLineBasicStringDelimiter.Then(newline.Optional()) closingMultiLineBasicString = m.Drop(multiLineBasicStringDelimiter) // Multi-line literal strings are surrounded by three single quotes on each side and allow newlines. multiLineLiteralStringDelimiter = a.Str(`'''`) openingMultiLineLiteralString = multiLineLiteralStringDelimiter.Then(newline.Optional()) closingMultiLineLiteralString = m.Drop(multiLineLiteralStringDelimiter) // Opening and closing character for basic strings. basicStringDelimiter = m.Drop(a.DoubleQuote) // Opening and losing character for literal strings. literalStringDelimiter = m.Drop(a.SingleQuote) // Control characters as defined by TOML (U+0000 to U+001F, U+007F) isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F } controlCharacter = a.ByteByCallback(isControlCharacter) // For convenience, some popular characters have a compact escape sequence. // // \b - backspace (U+0008) // \t - tab (U+0009) // \n - LF (U+000A) // \f - form feed (U+000C) // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) // \uXXXX - unicode (U+XXXX) // \UXXXXXXXX - unicode (U+XXXXXXXX) validEscapeChar = a.Bytes('b', 't', 'n', 'f', 'r', '"', '\\') shortEscape = c.Seq(a.Backslash, validEscapeChar) shortUTF8Escape = c.Seq(a.Backslash, a.Byte('u'), a.HexDigit.Times(4)) longUTF8Escape = c.Seq(a.Backslash, a.Byte('U'), a.HexDigit.Times(8)) validEscape = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape) // For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional()) ) // There are four ways to express strings: basic, multi-line basic, literal and // multi-line literal. All strings must parse/value_array.gocontain only valid UTF-8 characters. func (t *parser) parseString(p *parse.API) (*ast.Value, bool) { var value string var ok bool switch { case p.Peek(openingMultiLineBasicString): value, ok = t.parseMultiLineBasicString(p) case p.Peek(basicStringDelimiter): value, ok = t.parseBasicString("string value", p) case p.Peek(openingMultiLineLiteralString): value, ok = t.parseMultiLineLiteralString(p) case p.Peek(literalStringDelimiter): value, ok = t.parseLiteralString("string value", p) default: p.Expected("a string value") } if ok { return ast.NewValue(ast.TypeString, value), ok } return nil, false } // Specific handling of input for basic strings. // // • Basic strings are surrounded by quotation marks. // // • Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to // U+001F, U+007F). // // • No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { if !p.Skip(a.DoubleQuote) { p.Expected(`opening quotation marks`) return "", false } sb := &strings.Builder{} for { switch { case p.Peek(controlCharacter): p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Byte(0)) return sb.String(), false case p.Accept(validEscape): if !appendEscapedRune(p, sb) { return sb.String(), false } case p.Peek(a.Backslash): p.SetError("invalid escape sequence") return sb.String(), false case p.Skip(basicStringDelimiter): return sb.String(), true case p.Peek(a.InvalidRune): p.SetError("invalid UTF8 rune") return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) default: p.Expected(`closing quotation marks`) return sb.String(), false } } } // Specific handling of input for literal strings. // // • Literal strings are surrounded by single quotes. // // • Like basic strings, they must appear on a single line. // // • Control characters other than tab are not permitted in a literal string. func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { if !p.Skip(a.SingleQuote) { p.Expected("opening single quote") return "", false } sb := &strings.Builder{} for { switch { case p.Skip(literalStringDelimiter): return sb.String(), true case p.Skip(a.Tab): sb.WriteString("\t") case p.Peek(controlCharacter): p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Byte(0)) return sb.String(), false case p.Peek(a.InvalidRune): p.SetError("invalid UTF8 rune") return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) default: p.Expected("closing single quote") return sb.String(), false } } } // Specific handling of input for multi-line basic strings. // // • Multi-line basic strings are surrounded by three quotation marks on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // All other whitespace and newline characters remain intact. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • All of the escape sequences that are valid for basic strings are also valid // for multi-line basic strings. // // • Any Unicode character may be used except those that must be escaped: // backslash and the control characters (U+0000 to U+001F, U+007F). Quotation // marks need not be escaped unless their presence would create a premature // closing delimiter. // // • For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { if !p.Skip(openingMultiLineBasicString) { p.Expected("opening three quotation marks") return "", false } sb := &strings.Builder{} for { switch { case p.Skip(newline): sb.WriteString("\n") case p.Peek(controlCharacter): p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Byte(0)) return sb.String(), false case p.Accept(validEscape): if !appendEscapedRune(p, sb) { return sb.String(), false } case p.Skip(lineEndingBackslash): // NOOP case p.Peek(a.Backslash): p.SetError("invalid escape sequence") return sb.String(), false case p.Skip(closingMultiLineBasicString): return sb.String(), true case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) case p.Peek(a.InvalidRune): p.SetError("invalid UTF8 rune") return sb.String(), false default: p.Expected("closing three quotation marks") return sb.String(), false } } } func appendEscapedRune(p *parse.API, sb *strings.Builder) bool { s := p.Result.String() switch s { case `\b`: sb.WriteRune('\b') case `\t`: sb.WriteRune('\t') case `\n`: sb.WriteRune('\n') case `\f`: sb.WriteRune('\f') case `\r`: sb.WriteRune('\r') case `\"`: sb.WriteRune('"') case `\\`: sb.WriteRune('\\') default: // UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX. hex := s[2:] val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser r := rune(val) if !utf8.ValidRune(r) { p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", s)) return false } sb.WriteRune(r) } return true } // Specific handling of input for multi-line literal strings. // // • Multi-line literal strings are surrounded by three single quotes on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // // • All other content between the delimiters is interpreted as-is without modification. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • Control characters other than tab and newline are not permitted in a multi-line literal string. func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { if !p.Skip(openingMultiLineLiteralString) { p.Expected("opening three single quotes") return "", false } sb := &strings.Builder{} for { switch { case p.Skip(closingMultiLineLiteralString): return sb.String(), true case p.Skip(a.Tab): sb.WriteString("\t") case p.Skip(newline): sb.WriteString("\n") case p.Peek(controlCharacter): p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Byte(0)) return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) case p.Peek(a.InvalidRune): p.SetError("invalid UTF8 rune") return sb.String(), false default: p.Expected("closing three single quotes") return sb.String(), false } } }