package parse import ( "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/parse" "git.makaay.nl/mauricem/go-parsekit/tokenize" ) func (t *parser) parseString(p *parse.API) (string, stringType, bool) { if !p.Accept(t.stringHandler) { p.Expected("a string value") return "", strTypeNone, false } strType := stringTypeFromFlags(t.strFlags) str := p.Result.String() return str, strType, true } type stringType byte const ( strTypeNone stringType = iota strTypeBasic strTypeLiteral strTypeMultiLineBasic strTypeMultiLineLiteral ) func stringTypeFromFlags(flags byte) stringType { if flags&strFlagBasic == strFlagBasic { if flags&strFlagMultiLine == 0 { return strTypeBasic } return strTypeMultiLineBasic } if flags&strFlagMultiLine == 0 { return strTypeLiteral } return strTypeMultiLineLiteral } const ( strFlagLiteral byte = 1 strFlagBasic byte = 2 strFlagMultiLine byte = 4 strFlagNewlinesOK byte = 8 strFlagTabsOK byte = 16 strFlagEscapesOK byte = 32 strFlagLineConcatOK byte = 64 ) func (t *parser) stringHandler(tokenAPI *tokenize.API) bool { var state stringTokenizerState in := tokenAPI.Input out := tokenAPI.Output unicodeReqLen := 0 unicodeLen := 0 unicodeHex := make([]byte, 8) utf8ReqLen := 0 utf8Len := 0 utf8Rune := rune(0) utf8Bytes := make([]byte, 4) flags := byte(0) delim := byte(0) subState := 0 for { bs, _ := in.Byte.PeekBuffered(0) bslen := len(bs) // End of input reached. if bslen == 0 { // We might be at the second delimiter of a basic or literal string. if state == strStateStart && subState == 2 { return true } // Unexpected end of input. return false } for i := 0; i < bslen; i++ { b := bs[i] switch state { // Parse the string opener. // There are four ways to express strings: basic, multi-line basic, literal and // multi-line literal. Basic strings are surrounded by quotation marks ("..."). // Literal strings are surrounded by single quotes ('...'). // Multi-line basic strings are surrounded by three quotation marks on each // side and allow newlines ("""..."""). 
Multi-line literal strings are surrounded // by three single quotes on each side and allow newlines as well ('''...'''). case strStateStart: if subState == 0 { if b != '"' && b != '\'' { // Expected an opener quote here. return false } if b == '\'' { flags |= strFlagLiteral | strFlagTabsOK } else { flags |= strFlagBasic | strFlagEscapesOK } t.strFlags = flags subState = 1 delim = b in.Byte.MoveCursor(b) continue } if subState == 1 { // Not a second quote, so this is the start of // single-line string content. if b != delim { i-- state = strStateContent continue } in.Byte.MoveCursor(b) subState = 2 continue } if subState == 2 { // Not a third quote, so this is an empty string ('' or ""). if b != delim { return true } // Third quote, so this is a multi-line string (''' or """). flags |= strFlagMultiLine | strFlagNewlinesOK if flags&strFlagBasic == strFlagBasic { flags |= strFlagLineConcatOK } t.strFlags = flags in.Byte.MoveCursor(b) subState = 3 continue } if subState == 3 { // We're in a multi-line string. From the TOML spec: // A newline immediately following the opening delimiter will be trimmed. // All other whitespace and newline characters remain intact. if b == '\n' { in.Byte.MoveCursor(b) state = strStateContent continue } if b == '\r' { in.Byte.MoveCursor(b) subState = 4 continue } // Not a newline, so this byte is part of the content. i-- state = strStateContent continue } if subState == 4 { // We've seen a \r, so here we should see a \n for a newline // after a multi-line opener. if b == '\n' { in.Byte.MoveCursor(b) state = strStateContent continue } // Lonely \r found. Pass it to the content handler. i -= 2 state = strStateContent continue } // Parse string contents. 
case strStateContent: switch { case b == '\r' && flags&strFlagNewlinesOK == strFlagNewlinesOK: state = strStateCRLF continue case b == '\n' && flags&strFlagNewlinesOK == strFlagNewlinesOK: out.AddByte(b) in.Byte.MoveCursor(b) continue case b == '\t' && flags&strFlagTabsOK == strFlagTabsOK: out.AddByte(b) in.Byte.MoveCursor(b) continue case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // Control characters must be escaped. return false case b == '\\': in.Byte.MoveCursor(b) // Handle escape codes, when they are allowed. if flags&strFlagEscapesOK == strFlagEscapesOK { state = strStateEscape continue } // Otherwise, add the backslash as plain output. out.AddByte(b) continue case b == delim: // Single-line string. if flags&strFlagMultiLine == 0 { in.Byte.MoveCursor(b) return true } // Multi-line string in.Byte.MoveCursor(b) state = strStateMultiLineEnd subState = 0 continue } // At this point, we must have a UTF8 character on the input. // Here we check what length the character must have in bytes. // Then the rest of the work is offloaded to the strUTF8 state. switch b >> 4 { case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) out.AddByte(b) in.Byte.MoveCursor(b) continue case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) utf8ReqLen = 2 utf8Rune = rune((b & lowest5bits) << 6) case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 3 utf8Rune = rune((b & lowest4bits) << 6) case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) utf8ReqLen = 4 utf8Rune = rune((b & lowest3bits) << 6) default: // Invalid UTF8 rune return false } utf8Bytes[0] = b utf8Len = 1 state = strStateUTF8 // Parse followup bytes of a UTF8 byte sequence. 
case strStateUTF8: // The input byte must be a continuation byte (10xxxxxx) if b>>6 != 2 { // Invalid UTF8 rune return false } utf8Bytes[utf8Len] = b utf8Len++ utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) if utf8Len == utf8ReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode character return false } bytes := utf8Bytes[:utf8Len] out.AddBytes(bytes...) in.Byte.MoveCursorMulti(bytes...) state = strStateContent } // Parse the \n in a \r\n sequence. case strStateCRLF: // \r\n is normalized to just \n here (as allowed by the TOML spec). if b == '\n' { in.Byte.MoveCursorMulti('\r', b) out.AddByte('\n') state = strStateContent continue } // Lonely \r, should have been escaped. return false // Parse escape byte sequences. // For convenience, some popular characters have a compact escape sequence. // // \b - backspace (U+0008) // \t - tab (U+0009) // \n - LF (U+000A) // \f - form feed (U+000C) // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) // \uXXXX - unicode (U+XXXX) // \UXXXXXXXX - unicode (U+XXXXXXXX) case strStateEscape: // Handle short control character escape sequence (\t, \a, etc). if escaped, ok := getEscapedChar(b); ok { out.AddByte(escaped) in.Byte.MoveCursor(b) state = strStateContent continue } switch b { case ' ', '\t', '\r', '\n': // Handle line concatenation escape sequence. if flags&strFlagLineConcatOK == 0 { // Invalid escape. return false } // Point the parser at an appropriate subState of // the strEscapeConcat state. switch b { case ' ', '\t': subState = 0 case '\r': subState = 1 case '\n': subState = 2 } in.Byte.MoveCursor(b) state = strStateEscapeConcat continue case 'u', 'U': // Handle unicode escape sequence (\uXXXX, \UXXXXXXXX). in.Byte.MoveCursor(b) unicodeReqLen = 4 if b == 'u' { unicodeReqLen = 4 } else { unicodeReqLen = 8 } unicodeLen = 0 utf8Rune = 0 state = strStateEscapeUnicode default: // Invalid escape sequence used. 
return false } // For writing long strings without introducing extraneous whitespace, use a // "line ending backslash". When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. case strStateEscapeConcat: // Skip over whitespace until the end of the line is found. if subState == 0 { switch b { case ' ', '\t': in.Byte.MoveCursor(b) continue case '\r': in.Byte.MoveCursor(b) subState = 1 continue case '\n': in.Byte.MoveCursor(b) subState = 2 continue default: // Invalid escape sequence used. Expected whitespace or newline. return false } } // We've seen a \r at the same line as the escape char, // skip over the following \n. if subState == 1 { if b == '\n' { in.Byte.MoveCursor(b) subState = 2 continue } // Invalid escape sequence used. Expected newline. return false } // We've seen a \n at the same line as the escape char, // skip over all whitespace and newlines from here on. if subState == 2 { if b == ' ' || b == '\t' || b == '\n' { in.Byte.MoveCursor(b) continue } if b == '\r' { in.Byte.MoveCursor(b) subState = 3 continue } } // We've seen a \r, skip over the following \n. if subState == 3 { if b == '\n' { in.Byte.MoveCursor(b) subState = 2 continue } } // End of concat escape. Let the strContent state reprocess the byte. i-- state = strStateContent continue // Parse unicode escape sequence (\uXXXX, \UXXXXXXXX). case strStateEscapeUnicode: value, ok := getHexValueForChar(b) if !ok { // Invalid unicode escape sequence used. return false } utf8Rune = utf8Rune<<4 + rune(value) unicodeHex[unicodeLen] = b unicodeLen++ if unicodeLen == unicodeReqLen { if !utf8.ValidRune(utf8Rune) { // Invalid unicode escape return false } in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) w := utf8.EncodeRune(utf8Bytes, utf8Rune) out.AddBytes(utf8Bytes[:w]...) state = strStateContent } // Parse the end of the string. 
// One delimiter has already been seen by the strContent state. // Here we check if we have a full set of 3 delimiters to end // the string. case strStateMultiLineEnd: // TODO rename to strEndMultiLine if subState == 0 { // Second delimiter found. if b == delim { subState = 1 in.Byte.MoveCursor(b) continue } // No delimiter found, so we're looking at a single // delimiter within the multi-line body. Add the delimiter // to the output and feed the current byte back to the // strContent state. out.AddByte(delim) i-- state = strStateContent continue } if subState == 1 { // Third delimiter found. This ends the string. if b == delim { in.Byte.MoveCursor(b) return true } // No delimiter found, so we're looking at two delimiters // within the multi-line body. Add the delimiters to the // output and feed the current byte back to the strContent state. out.AddBytes(delim, delim) i-- state = strStateContent continue } } } } } // Specific handling of input for basic strings. // // • Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to // U+001F, U+007F). // // • No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] 
//   are reserved and, if used, TOML should
//   produce an error."

// func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
// 	if !p.Accept(basicStringHandler) {
// 		return "", false
// 	}
// 	return p.Result.String(), true
// }

// stringTokenizerState is the main state of the string tokenizer's
// state machine.
type stringTokenizerState int

const (
	strStateStart         stringTokenizerState = iota // reading the opening delimiter(s)
	strStateContent                                   // reading string content
	strStateEscape                                    // right after a backslash
	strStateEscapeUnicode                             // reading the hex digits of \uXXXX or \UXXXXXXXX
	strStateEscapeConcat                              // skipping whitespace after a line ending backslash
	strStateCRLF                                      // seen \r, expecting \n
	strStateUTF8                                      // reading continuation bytes of a UTF8 character
	strStateMultiLineEnd                              // seen a delimiter inside a multi-line string
)

// Bit masks for extracting the payload bits from UTF8 encoded bytes.
const (
	lowest6bits = 0x3F // 0011 1111
	lowest5bits = 0x1F // 0001 1111
	lowest4bits = 0x0F // 0000 1111
	lowest3bits = 0x07 // 0000 0111
)

// getHexValueForChar returns the numeric value (0-15) of the hexadecimal
// digit b, which may be written in upper or lower case. The boolean result
// is false when b is not a hexadecimal digit.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}

// getEscapedChar translates the character that follows a backslash in a
// compact escape sequence (\b, \t, \n, \f, \r, \" and \\) into the byte that
// the sequence stands for. The boolean result is false when b does not form
// one of these sequences.
func getEscapedChar(b byte) (byte, bool) {
	switch b {
	case 'b':
		return '\b', true
	case 't':
		return '\t', true
	case 'n':
		return '\n', true
	case 'f':
		return '\f', true
	case 'r':
		return '\r', true
	case '"':
		return '"', true
	case '\\':
		return '\\', true
	}
	return 0, false
}

// Specific handling of input for multi-line basic strings.
//
// • Multi-line basic strings are surrounded by three quotation marks on
//   each side and allow newlines.
//
// • A newline immediately following the opening delimiter will be trimmed.
//   All other whitespace and newline characters remain intact.
//
// • TOML parsers should feel free to normalize newline to whatever makes
//   sense for their platform.
//
// • All of the escape sequences that are valid for basic strings are also valid
//   for multi-line basic strings.
//
// • Any Unicode character may be used except those that must be escaped:
//   backslash and the control characters (U+0000 to U+001F, U+007F). Quotation
//   marks need not be escaped unless their presence would create a premature
//   closing delimiter.
//
// • For writing long strings without introducing extraneous whitespace, use a
//   "line ending backslash".
When the last non-whitespace character on a line is // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. // func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { // if !p.Accept(multiLineBasicStringHandler) { // return "", false // } // return p.Result.String(), true // } // func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool { // var state stringTokenizerState // in := tokenAPI.Input // out := tokenAPI.Output // unicodeReqLen := 0 // unicodeLen := 0 // unicodeHex := make([]byte, 8) // utf8ReqLen := 0 // utf8Len := 0 // utf8Rune := rune(0) // utf8Bytes := make([]byte, 4) // crlf := false // for { // bs, _ := in.Byte.PeekBuffered(0) // bslen := len(bs) // if bslen == 0 { // return false // } // for i := 0; i < bslen; i++ { // b := bs[i] // switch state { // case strStart, strStart2, strStart3: // if b != '"' { // // No triple opening quotes found. // return false // } // in.Byte.MoveCursor(b) // switch state { // case strStart: // state = strStart2 // case strStart2: // state = strStart3 // case strStart3: // state = strStart4 // } // case strStart4: // if !crlf && b == '\r' { // crlf = true // in.Byte.MoveCursor(b) // continue // } // if b == '\n' { // in.Byte.MoveCursor(b) // state = strContent // continue // } // if crlf { // // Lonely \r without \n. // return false // } // state = strContent // fallthrough // case strContent: // switch { // case b == '\r': // state = strCRLF // continue // case b == '\n': // out.AddByte(b) // in.Byte.MoveCursor(b) // continue // case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // // Unescaped control character // // TODO error reporting instead of full reject // return false // case b == '\\': // in.Byte.MoveCursor(b) // state = strEscape // continue // case b == '"': // in.Byte.MoveCursor(b) // state = strEnd // continue // } // switch b >> 4 { // case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. 
ASCII) // out.AddByte(b) // in.Byte.MoveCursor(b) // continue // case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) // utf8ReqLen = 2 // utf8Rune = rune((b & lowest5bits) << 6) // case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) // utf8ReqLen = 3 // utf8Rune = rune((b & lowest4bits) << 6) // case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) // utf8ReqLen = 4 // utf8Rune = rune((b & lowest3bits) << 6) // default: // Invalid UTF8 rune // return false // } // utf8Bytes[0] = b // utf8Len = 1 // state = strUTF8 // case strUTF8: // // This should be a continuation byte (10xxxxxx) // if b>>6 != 2 { // // Invalid UTF8 rune // return false // } // utf8Bytes[utf8Len] = b // utf8Len++ // utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) // if utf8Len == utf8ReqLen { // if !utf8.ValidRune(utf8Rune) { // // Invalid unicode character // return false // } // bytes := utf8Bytes[:utf8Len] // out.AddBytes(bytes...) // in.Byte.MoveCursorMulti(bytes...) // state = strContent // } // case strCRLF: // if b == '\n' { // in.Byte.MoveCursorMulti('\r', b) // out.AddByte('\n') // state = strContent // continue // } // // Lonely \r, should have been escaped. // return false // case strEscape: // state = strContent // if escaped, ok := getEscapedChar(b); ok { // out.AddByte(escaped) // in.Byte.MoveCursor(b) // continue // } // switch b { // case ' ', '\t': // in.Byte.MoveCursor(b) // state = strEscapeConcat // continue // case '\r': // in.Byte.MoveCursor(b) // state = strEscapeConcatCRLF // continue // case '\n': // in.Byte.MoveCursor(b) // state = strEscapeConcatWs2 // continue // case 'u', 'U': // in.Byte.MoveCursor(b) // unicodeReqLen = 4 // if b == 'u' { // unicodeReqLen = 4 // } else { // unicodeReqLen = 8 // } // unicodeLen = 0 // utf8Rune = 0 // state = strEscapeUnicode // default: // // Invalid escape sequence used. 
// return false // } // case strEscapeConcat: // switch b { // case ' ', '\t': // in.Byte.MoveCursor(b) // continue // case '\r': // in.Byte.MoveCursor(b) // state = strEscapeConcatCRLF // continue // case '\n': // in.Byte.MoveCursor(b) // state = strEscapeConcatWs2 // continue // default: // // Invalid line concatenation // return false // } // case strEscapeConcatCRLF: // switch b { // case '\n': // in.Byte.MoveCursor(b) // state = strEscapeConcatWs2 // continue // default: // // Invalid line concatenation // return false // } // case strEscapeConcatWs2: // switch b { // case ' ', '\t': // in.Byte.MoveCursor(b) // continue // case '\r': // in.Byte.MoveCursor(b) // state = strEscapeConcatCRLF // continue // case '\n': // in.Byte.MoveCursor(b) // state = strEscapeConcatWs2 // continue // default: // i-- // state = strContent // continue // } // case strEscapeUnicode: // value, ok := getHexValueForChar(b) // if !ok { // // Invalid unicode escape sequence used. // return false // } // utf8Rune = utf8Rune<<4 + rune(value) // unicodeHex[unicodeLen] = b // unicodeLen++ // if unicodeLen == unicodeReqLen { // if !utf8.ValidRune(utf8Rune) { // // Invalid unicode escape // return false // } // in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) // w := utf8.EncodeRune(utf8Bytes, utf8Rune) // out.AddBytes(utf8Bytes[:w]...) // state = strContent // } // case strEnd: // if b == '"' { // state = strEnd3 // in.Byte.MoveCursor(b) // } else { // state = strContent // out.AddByte('"') // i-- // } // case strEnd3: // if b == '"' { // in.Byte.MoveCursor(b) // return true // } // state = strContent // out.AddBytes('"', '"') // i-- // } // } // } // } // Specific handling of input for multi-line literal strings. // // • Multi-line literal strings are surrounded by three single quotes on // each side and allow newlines. // // • A newline immediately following the opening delimiter will be trimmed. 
// // • All other content between the delimiters is interpreted as-is without modification. // // • TOML parsers should feel free to normalize newline to whatever makes // sense for their platform. // // • Control characters other than tab and newline are not permitted in a multi-line literal string. // func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { // if !p.Accept(multiLineLiteralStringHandler) { // return "", false // } // return p.Result.String(), true // } // func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool { // var state stringTokenizerState // in := tokenAPI.Input // out := tokenAPI.Output // utf8ReqLen := 0 // utf8Len := 0 // utf8Rune := rune(0) // utf8Bytes := make([]byte, 4) // crlf := false // for { // bs, _ := in.Byte.PeekBuffered(0) // bslen := len(bs) // if bslen == 0 { // return false // } // for i := 0; i < bslen; i++ { // b := bs[i] // switch state { // case strStart, strStart2, strStart3: // if b != '\'' { // // No triple opening quotes found. // return false // } // in.Byte.MoveCursor(b) // switch state { // case strStart: // state = strStart2 // case strStart2: // state = strStart3 // case strStart3: // state = strStart4 // } // case strStart4: // if !crlf && b == '\r' { // crlf = true // in.Byte.MoveCursor(b) // continue // } // if b == '\n' { // in.Byte.MoveCursor(b) // state = strContent // continue // } // if crlf { // // Lonely \r without \n. // return false // } // state = strContent // fallthrough // case strContent: // switch { // case b == '\r': // state = strCRLF // continue // case b == '\n' || b == '\t': // out.AddByte(b) // in.Byte.MoveCursor(b) // continue // case (b >= 0x00 && b <= 0x1F) || b == 0x7F: // // Unescaped control character // // TODO error reporting instead of full reject // return false // case b == '\'': // in.Byte.MoveCursor(b) // state = strEnd // continue // } // switch b >> 4 { // case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. 
ASCII) // out.AddByte(b) // in.Byte.MoveCursor(b) // continue // case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) // utf8ReqLen = 2 // utf8Rune = rune((b & lowest5bits) << 6) // case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) // utf8ReqLen = 3 // utf8Rune = rune((b & lowest4bits) << 6) // case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) // utf8ReqLen = 4 // utf8Rune = rune((b & lowest3bits) << 6) // default: // Invalid UTF8 rune // return false // } // utf8Bytes[0] = b // utf8Len = 1 // state = strUTF8 // case strUTF8: // // This should be a continuation byte (10xxxxxx) // if b>>6 != 2 { // // Invalid UTF8 rune // return false // } // utf8Bytes[utf8Len] = b // utf8Len++ // utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) // if utf8Len == utf8ReqLen { // if !utf8.ValidRune(utf8Rune) { // // Invalid unicode character // return false // } // bytes := utf8Bytes[:utf8Len] // out.AddBytes(bytes...) // in.Byte.MoveCursorMulti(bytes...) // state = strContent // } // case strCRLF: // if b == '\n' { // in.Byte.MoveCursorMulti('\r', b) // out.AddByte('\n') // state = strContent // continue // } // // Lonely \r, should have been escaped. // return false // case strEnd: // if b == '\'' { // state = strEnd3 // in.Byte.MoveCursor(b) // } else { // state = strContent // out.AddByte('\'') // i-- // } // case strEnd3: // if b == '\'' { // in.Byte.MoveCursor(b) // return true // } // state = strContent // out.AddBytes('\'', '\'') // i-- // } // } // } // }