package parse

import (
	"fmt"
	"strconv"
	"strings"
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/parse"
	"git.makaay.nl/mauricem/go-toml/ast"
)

// Tokenizer handlers used by the string parsing functions in this file.
// The identifiers a, c, whitespace, newline and whitespaceInclNewlines are
// not declared in this file.
var (
	// Multi-line basic strings are surrounded by three quotation marks on each
	// side and allow newlines.
	doubleQuote3 = a.Str(`"""`)

	// Multi-line literal strings are surrounded by three single quotes on each
	// side and allow newlines.
	singleQuote3 = a.Str(`'''`)

	// Control characters as defined by TOML (U+0000 to U+001F, U+007F).
	// Note that this range includes tab (U+0009), LF (U+000A) and CR (U+000D).
	controlCharacter = a.RuneRange('\u0000', '\u001F').Or(a.Rune('\u007F'))

	// For convenience, some popular characters have a compact escape sequence.
	//
	// \b         - backspace       (U+0008)
	// \t         - tab             (U+0009)
	// \n         - LF              (U+000A)
	// \f         - form feed       (U+000C)
	// \r         - carriage return (U+000D)
	// \"         - quote           (U+0022)
	// \\         - backslash       (U+005C)
	// \uXXXX     - unicode         (U+XXXX)
	// \UXXXXXXXX - unicode         (U+XXXXXXXX)
	//
	// validEscape matches any one of the three escape forms: a short escape
	// (backslash + one of the characters above), a backslash + 'u' + 4 hex
	// digits, or a backslash + 'U' + 8 hex digits.
	validEscapeChar = a.Runes('b', 't', 'n', 'f', 'r', '"', '\\')
	shortEscape     = c.Seq(a.Backslash, validEscapeChar)
	shortUTF8Escape = c.Seq(a.Backslash, a.Rune('u'), a.HexDigit.Times(4))
	longUTF8Escape  = c.Seq(a.Backslash, a.Rune('U'), a.HexDigit.Times(8))
	validEscape     = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape)

	// For writing long strings without introducing extraneous whitespace, use a
	// "line ending backslash". When the last non-whitespace character on a line is
	// a \, it will be trimmed along with all whitespace (including newlines) up to
	// the next non-whitespace character or closing delimiter.
	lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional())
)

// There are four ways to express strings: basic, multi-line basic, literal and
// multi-line literal. All strings must contain only valid UTF-8 characters.
func (t *parser) parseString(p *parse.API) (*ast.Value, bool) { var value string var ok bool switch { case p.Peek(doubleQuote3): value, ok = t.parseMultiLineBasicString(p) case p.Peek(a.DoubleQuote): value, ok = t.parseBasicString("string value", p) case p.Peek(singleQuote3): value, ok = t.parseMultiLineLiteralString(p) case p.Peek(a.SingleQuote): value, ok = t.parseLiteralString("string value", p) default: p.Expected("a string value") } if ok { return ast.NewValue(ast.TypeString, value), ok } return nil, false } // Specific handling of input for basic strings. // // • Basic strings are surrounded by quotation marks. // // • Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to // U+001F, U+007F). // // • No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { if !p.Accept(a.DoubleQuote) { p.Expected(`opening quotation marks`) return "", false } sb := &strings.Builder{} for { switch { case p.Peek(controlCharacter): p.Error("invalid character in %s: %q (must be escaped)", name, p.Result.Runes[0]) return sb.String(), false case p.Accept(validEscape): if !appendEscapedRune(p, sb) { return sb.String(), false } case p.Peek(a.Backslash): p.Error("invalid escape sequence") return sb.String(), false case p.Accept(m.Drop(a.DoubleQuote)): return sb.String(), true case p.Peek(a.InvalidRune): p.Error("invalid UTF8 rune") return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) default: p.Expected(`closing quotation marks`) return sb.String(), false } } } // Specific handling of input for literal strings. // // • Literal strings are surrounded by single quotes. // // • Like basic strings, they must appear on a single line. 
// // • Control characters other than tab are not permitted in a literal string. func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { if !p.Accept(a.SingleQuote) { p.Expected("opening single quote") return "", false } sb := &strings.Builder{} for { switch { case p.Accept(m.Drop(a.SingleQuote)): return sb.String(), true case p.Accept(a.Tab): sb.WriteString("\t") case p.Peek(controlCharacter): p.Error("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Runes[0]) return sb.String(), false case p.Peek(a.InvalidRune): p.Error("invalid UTF8 rune") return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) default: p.Expected("closing single quote") return sb.String(), false } } } // Specific handling of input for multi-line basic strings. // // • Multi-line basic strings are surrounded by three quotation marks on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // All other whitespace and newline characters remain intact. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • All of the escape sequences that are valid for basic strings are also valid // for multi-line basic strings. // // • Any Unicode character may be used except those that must be escaped: // backslash and the control characters (U+0000 to U+001F, U+007F). Quotation // marks need not be escaped unless their presence would create a premature // closing delimiter. // // • For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. 
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { if !p.Accept(doubleQuote3.Then(newline.Optional())) { p.Expected("opening three quotation marks") return "", false } sb := &strings.Builder{} for { switch { case p.Accept(newline): sb.WriteString("\n") case p.Peek(controlCharacter): p.Error("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Runes[0]) return sb.String(), false case p.Accept(validEscape): if !appendEscapedRune(p, sb) { return sb.String(), false } case p.Accept(lineEndingBackslash): // NOOP, the line-ending backslash sequence is skipped. case p.Peek(a.Backslash): p.Error("invalid escape sequence") return sb.String(), false case p.Accept(m.Drop(doubleQuote3)): return sb.String(), true case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) case p.Peek(a.InvalidRune): p.Error("invalid UTF8 rune") return sb.String(), false default: p.Expected("closing three quotation marks") return sb.String(), false } } } func appendEscapedRune(p *parse.API, sb *strings.Builder) bool { s := p.Result.String() switch s { case `\b`: sb.WriteRune('\b') case `\t`: sb.WriteRune('\t') case `\n`: sb.WriteRune('\n') case `\f`: sb.WriteRune('\f') case `\r`: sb.WriteRune('\r') case `\"`: sb.WriteRune('"') case `\\`: sb.WriteRune('\\') default: // UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX. hex := s[2:] val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser r := rune(val) if !utf8.ValidRune(r) { p.Error(fmt.Sprintf("invalid UTF8 escape '%s'", s)) return false } sb.WriteRune(r) } return true } // Specific handling of input for multi-line literal strings. // // • Multi-line literal strings are surrounded by three single quotes on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // // • All other content between the delimiters is interpreted as-is without modification. 
// // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • Control characters other than tab and newline are not permitted in a multi-line literal string. func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { if !p.Accept(singleQuote3.Then(newline.Optional())) { p.Expected("opening three single quotes") return "", false } sb := &strings.Builder{} for { switch { case p.Accept(m.Drop(singleQuote3)): return sb.String(), true case p.Accept(a.Tab): sb.WriteString("\t") case p.Accept(newline): sb.WriteString("\n") case p.Peek(controlCharacter): p.Error("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Runes[0]) return sb.String(), false case p.Accept(a.ValidRune): sb.WriteString(p.Result.String()) case p.Peek(a.InvalidRune): p.Error("invalid UTF8 rune") return sb.String(), false default: p.Expected("closing three single quotes") return sb.String(), false } } }