package parse import ( "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/parse" "git.makaay.nl/mauricem/go-parsekit/tokenize" "git.makaay.nl/mauricem/go-toml/ast" ) var ( // Multi-line basic strings are surrounded by three quotation marks on each // side and allow newlines. multiLineBasicStringDelimiter = a.Str(`"""`) openingMultiLineBasicString = multiLineBasicStringDelimiter.Then(newline.Optional()) closingMultiLineBasicString = m.Drop(multiLineBasicStringDelimiter) // Multi-line literal strings are surrounded by three single quotes on each side and allow newlines. multiLineLiteralStringDelimiter = a.Str(`'''`) openingMultiLineLiteralString = multiLineLiteralStringDelimiter.Then(newline.Optional()) closingMultiLineLiteralString = m.Drop(multiLineLiteralStringDelimiter) // Opening and closing character for basic strings. basicStringDelimiter = a.DoubleQuote // Opening and losing character for literal strings. literalStringDelimiter = a.SingleQuote // For convenience, some popular characters have a compact escape sequence. // // \b - backspace (U+0008) // \t - tab (U+0009) // \n - LF (U+000A) // \f - form feed (U+000C) // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) // \uXXXX - unicode (U+XXXX) // \UXXXXXXXX - unicode (U+XXXXXXXX) validEscapeChar = a.Char('b', 't', 'n', 'f', 'r', '"', '\\') shortEscape = c.Seq(a.Backslash, validEscapeChar) shortUTF8Escape = c.Seq(a.Backslash, a.Char('u'), a.HexDigit.Times(4)) longUTF8Escape = c.Seq(a.Backslash, a.Char('U'), a.HexDigit.Times(8)) validEscape = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape) // For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. lineEndingBackslash = c.Seq(a.Backslash, whitespace, newline, whitespaceInclNewlines.Optional()) ) // There are four ways to express strings: basic, multi-line basic, literal and // multi-line literal. All strings must parse/value_array.gocontain only valid UTF-8 characters. func (t *parser) parseString(p *parse.API) (*ast.Value, bool) { var value string var ok bool switch { case p.Peek(openingMultiLineBasicString): value, ok = t.parseMultiLineBasicString(p) case p.Peek(basicStringDelimiter): value, ok = t.parseBasicString("string value", p) case p.Peek(openingMultiLineLiteralString): value, ok = t.parseMultiLineLiteralString(p) case p.Peek(literalStringDelimiter): value, ok = t.parseLiteralString("string value", p) default: p.Expected("a string value") } if ok { return ast.NewValue(ast.TypeString, value), ok } return nil, false } // Specific handling of input for basic strings. // // • Basic strings are surrounded by quotation marks. // // • Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to // U+001F, U+007F). // // • No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { if !p.Accept(basicStringHandler) { return "", false } return p.Result.String(), true } type stringTokenizerState int const ( strStart stringTokenizerState = iota strStart2 strStart3 strStart4 strChar strEscape strEscapeUnicode strEscapeConcatWs1 strEscapeConcatCRLF strEscapeConcatWs2 strCRLF strUTF8 strEnd2 strEnd3 ) const ( lowest6bits = 0x3F // 0011 1111 lowest5bits = 0x1F // 0001 1111 lowest4bits = 0x0F // 0000 1111 lowest3bits = 0x07 // 0000 0111 ) func basicStringHandler(tokenAPI *tokenize.API) bool { var state stringTokenizerState in := tokenAPI.Input out := tokenAPI.Output unicodeReqLen := 0 unicodeLen := 0 unicodeHex := make([]byte, 8) utf8ReqLen := 0 utf8Len := 0 utf8Rune := rune(0) utf8Bytes := make([]byte, 4) for { bs, _ := in.Byte.PeekBuffered(0) bslen := len(bs) if bslen == 0 { return false } for i := 0; i < bslen; i++ { b := bs[i] switch state { case strStart: if b != '"' { // No opening quotes found. return false } in.Byte.MoveCursor(b) state = strChar case strChar: switch { case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // Control characters as defined by the TOML specification. // These must always be escaped. // Unescaped control character // TODO error reporting instead of full reject return false case b == '\\': in.Byte.MoveCursor(b) state = strEscape continue case b == '"': in.Byte.MoveCursor(b) return true } switch b >> 4 { case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) out.AddByte(b) in.Byte.MoveCursor(b) continue case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) utf8ReqLen = 2 utf8Rune = rune((b & lowest5bits) << 6) case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 3 utf8Rune = rune((b & lowest4bits) << 6) case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 4 utf8Rune = rune((b & lowest3bits) << 6) default: // Invalid UTF8 rune return false } utf8Bytes[0] = b utf8Len = 1 state = strUTF8 case strUTF8: // This should be a continuation byte (10xxxxxx) if b>>6 != 2 { // Invalid UTF8 rune return false } utf8Bytes[utf8Len] = b utf8Len++ utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) if utf8Len == utf8ReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode character return false } bytes := utf8Bytes[:utf8Len] out.AddBytes(bytes...) in.Byte.MoveCursorMulti(bytes...) state = strChar } case strEscape: state = strChar if escaped, ok := getEscapedChar(b); ok { out.AddByte(escaped) in.Byte.MoveCursor(b) continue } switch b { case 'u', 'U': in.Byte.MoveCursor(b) unicodeReqLen = 4 if b == 'u' { unicodeReqLen = 4 } else { unicodeReqLen = 8 } unicodeLen = 0 utf8Rune = 0 state = strEscapeUnicode default: // Invalid escape sequence used. return false } case strEscapeUnicode: value, ok := getHexValueForChar(b) if !ok { // Invalid unicode escape sequence used. return false } utf8Rune = utf8Rune<<4 + rune(value) unicodeHex[unicodeLen] = b unicodeLen++ if unicodeLen == unicodeReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode escape return false } in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) w := utf8.EncodeRune(utf8Bytes, utf8Rune) out.AddBytes(utf8Bytes[:w]...) state = strChar } } } } } func getHexValueForChar(b byte) (byte, bool) { switch { case '0' <= b && b <= '9': return b - '0', true case 'a' <= b && b <= 'z': return b - 'a' + 10, true case 'A' <= b && b <= 'Z': return b - 'A' + 10, true default: return 0, false } } func getEscapedChar(b byte) (byte, bool) { switch b { case 'b': return '\b', true case 't': return '\t', true case 'n': return '\n', true case 'f': return '\f', true case 'r': return '\r', true case '"': return '"', true case '\\': return '\\', true } return 0, false } // Specific handling of input for literal strings. // // • Literal strings are surrounded by single quotes. // // • Like basic strings, they must appear on a single line. // // • Control characters other than tab are not permitted in a literal string. func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { if !p.Accept(literalStringHandler) { return "", false } return p.Result.String(), true } func literalStringHandler(tokenAPI *tokenize.API) bool { var state stringTokenizerState in := tokenAPI.Input out := tokenAPI.Output utf8ReqLen := 0 utf8Len := 0 utf8Rune := rune(0) utf8Bytes := [4]byte{} for { bs, _ := tokenAPI.Input.Byte.PeekBuffered(0) bslen := len(bs) if bslen == 0 { // Unexpected end of file. return false } for i := 0; i < bslen; i++ { b := bs[i] switch state { case strStart: if b != '\'' { // No opening quote found. return false } in.Byte.MoveCursor(b) state = strChar case strChar: switch { case (b >= 0x00 && b < 0x09) || (b > 0x09 && b <= 0x1F) || b == 0x7F: // Unescaped control character return false case b == '\'': in.Byte.MoveCursor(b) return true } switch b >> 4 { case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) out.AddByte(b) in.Byte.MoveCursor(b) continue case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) utf8ReqLen = 2 utf8Rune = rune((b & lowest5bits) << 6) case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 3 utf8Rune = rune((b & lowest4bits) << 6) case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 4 utf8Rune = rune((b & lowest3bits) << 6) default: // Invalid UTF8 rune return false } utf8Bytes[0] = b utf8Len = 1 state = strUTF8 case strUTF8: // This should be a continuation byte (10xxxxxx) if b>>6 != 2 { // Invalid UTF8 rune return false } utf8Bytes[utf8Len] = b utf8Len++ utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) if utf8Len == utf8ReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode character return false } bytes := utf8Bytes[:utf8Len] out.AddBytes(bytes...) in.Byte.MoveCursorMulti(bytes...) state = strChar } } } } } // Specific handling of input for multi-line basic strings. // // • Multi-line basic strings are surrounded by three quotation marks on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // All other whitespace and newline characters remain intact. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • All of the escape sequences that are valid for basic strings are also valid // for multi-line basic strings. // // • Any Unicode character may be used except those that must be escaped: // backslash and the control characters (U+0000 to U+001F, U+007F). Quotation // marks need not be escaped unless their presence would create a premature // closing delimiter. // // • For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { if !p.Accept(multiLineBasicStringHandler) { return "", false } return p.Result.String(), true } func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool { var state stringTokenizerState in := tokenAPI.Input out := tokenAPI.Output unicodeReqLen := 0 unicodeLen := 0 unicodeHex := make([]byte, 8) utf8ReqLen := 0 utf8Len := 0 utf8Rune := rune(0) utf8Bytes := make([]byte, 4) crlf := false for { bs, _ := in.Byte.PeekBuffered(0) bslen := len(bs) if bslen == 0 { return false } for i := 0; i < bslen; i++ { b := bs[i] switch state { case strStart, strStart2, strStart3: if b != '"' { // No triple opening quotes found. return false } in.Byte.MoveCursor(b) switch state { case strStart: state = strStart2 case strStart2: state = strStart3 case strStart3: state = strStart4 } case strStart4: if !crlf && b == '\r' { crlf = true in.Byte.MoveCursor(b) continue } if b == '\n' { in.Byte.MoveCursor(b) state = strChar continue } if crlf { // Lonely \r without \n. return false } state = strChar fallthrough case strChar: switch { case b == '\r': state = strCRLF continue case b == '\n': out.AddByte(b) in.Byte.MoveCursor(b) continue case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // Unescaped control character // TODO error reporting instead of full reject return false case b == '\\': in.Byte.MoveCursor(b) state = strEscape continue case b == '"': in.Byte.MoveCursor(b) state = strEnd2 continue } switch b >> 4 { case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) out.AddByte(b) in.Byte.MoveCursor(b) continue case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) utf8ReqLen = 2 utf8Rune = rune((b & lowest5bits) << 6) case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 3 utf8Rune = rune((b & lowest4bits) << 6) case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 4 utf8Rune = rune((b & lowest3bits) << 6) default: // Invalid UTF8 rune return false } utf8Bytes[0] = b utf8Len = 1 state = strUTF8 case strUTF8: // This should be a continuation byte (10xxxxxx) if b>>6 != 2 { // Invalid UTF8 rune return false } utf8Bytes[utf8Len] = b utf8Len++ utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) if utf8Len == utf8ReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode character return false } bytes := utf8Bytes[:utf8Len] out.AddBytes(bytes...) in.Byte.MoveCursorMulti(bytes...) state = strChar } case strCRLF: if b == '\n' { in.Byte.MoveCursorMulti('\r', b) out.AddByte('\n') state = strChar continue } // Lonely \r, should have been escaped. return false case strEscape: state = strChar if escaped, ok := getEscapedChar(b); ok { out.AddByte(escaped) in.Byte.MoveCursor(b) continue } switch b { case ' ', '\t': in.Byte.MoveCursor(b) state = strEscapeConcatWs1 continue case '\r': in.Byte.MoveCursor(b) state = strEscapeConcatCRLF continue case '\n': in.Byte.MoveCursor(b) state = strEscapeConcatWs2 continue case 'u', 'U': in.Byte.MoveCursor(b) unicodeReqLen = 4 if b == 'u' { unicodeReqLen = 4 } else { unicodeReqLen = 8 } unicodeLen = 0 utf8Rune = 0 state = strEscapeUnicode default: // Invalid escape sequence used. return false } case strEscapeConcatWs1: switch b { case ' ', '\t': in.Byte.MoveCursor(b) continue case '\r': in.Byte.MoveCursor(b) state = strEscapeConcatCRLF continue case '\n': in.Byte.MoveCursor(b) state = strEscapeConcatWs2 continue default: // Invalid line concatenation return false } case strEscapeConcatCRLF: switch b { case '\n': in.Byte.MoveCursor(b) state = strEscapeConcatWs2 continue default: // Invalid line concatenation return false } case strEscapeConcatWs2: switch b { case ' ', '\t': in.Byte.MoveCursor(b) continue case '\r': in.Byte.MoveCursor(b) state = strEscapeConcatCRLF continue case '\n': in.Byte.MoveCursor(b) state = strEscapeConcatWs2 continue default: i-- state = strChar continue } case strEscapeUnicode: value, ok := getHexValueForChar(b) if !ok { // Invalid unicode escape sequence used. return false } utf8Rune = utf8Rune<<4 + rune(value) unicodeHex[unicodeLen] = b unicodeLen++ if unicodeLen == unicodeReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode escape return false } in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) w := utf8.EncodeRune(utf8Bytes, utf8Rune) out.AddBytes(utf8Bytes[:w]...) state = strChar } case strEnd2: if b == '"' { state = strEnd3 in.Byte.MoveCursor(b) } else { state = strChar out.AddByte('"') i-- } case strEnd3: if b == '"' { in.Byte.MoveCursor(b) return true } state = strChar out.AddBytes('"', '"') i-- } } } } // Specific handling of input for multi-line literal strings. // // • Multi-line literal strings are surrounded by three single quotes on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. // // • All other content between the delimiters is interpreted as-is without modification. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • Control characters other than tab and newline are not permitted in a multi-line literal string. func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { if !p.Accept(multiLineLiteralStringHandler) { return "", false } return p.Result.String(), true } func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool { var state stringTokenizerState in := tokenAPI.Input out := tokenAPI.Output utf8ReqLen := 0 utf8Len := 0 utf8Rune := rune(0) utf8Bytes := make([]byte, 4) crlf := false for { bs, _ := in.Byte.PeekBuffered(0) bslen := len(bs) if bslen == 0 { return false } for i := 0; i < bslen; i++ { b := bs[i] switch state { case strStart, strStart2, strStart3: if b != '\'' { // No triple opening quotes found. return false } in.Byte.MoveCursor(b) switch state { case strStart: state = strStart2 case strStart2: state = strStart3 case strStart3: state = strStart4 } case strStart4: if !crlf && b == '\r' { crlf = true in.Byte.MoveCursor(b) continue } if b == '\n' { in.Byte.MoveCursor(b) state = strChar continue } if crlf { // Lonely \r without \n. return false } state = strChar fallthrough case strChar: switch { case b == '\r': state = strCRLF continue case b == '\n' || b == '\t': out.AddByte(b) in.Byte.MoveCursor(b) continue case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // Unescaped control character // TODO error reporting instead of full reject return false case b == '\'': in.Byte.MoveCursor(b) state = strEnd2 continue } switch b >> 4 { case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) out.AddByte(b) in.Byte.MoveCursor(b) continue case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) utf8ReqLen = 2 utf8Rune = rune((b & lowest5bits) << 6) case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 3 utf8Rune = rune((b & lowest4bits) << 6) case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 4 utf8Rune = rune((b & lowest3bits) << 6) default: // Invalid UTF8 rune return false } utf8Bytes[0] = b utf8Len = 1 state = strUTF8 case strUTF8: // This should be a continuation byte (10xxxxxx) if b>>6 != 2 { // Invalid UTF8 rune return false } utf8Bytes[utf8Len] = b utf8Len++ utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) if utf8Len == utf8ReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode character return false } bytes := utf8Bytes[:utf8Len] out.AddBytes(bytes...) in.Byte.MoveCursorMulti(bytes...) state = strChar } case strCRLF: if b == '\n' { in.Byte.MoveCursorMulti('\r', b) out.AddByte('\n') state = strChar continue } // Lonely \r, should have been escaped. return false case strEnd2: if b == '\'' { state = strEnd3 in.Byte.MoveCursor(b) } else { state = strChar out.AddByte('\'') i-- } case strEnd3: if b == '\'' { in.Byte.MoveCursor(b) return true } state = strChar out.AddBytes('\'', '\'') i-- } } } }