diff --git a/cmd/burntsushi-tester/test.toml b/cmd/burntsushi-tester/test.toml new file mode 100644 index 0000000..bc88494 --- /dev/null +++ b/cmd/burntsushi-tester/test.toml @@ -0,0 +1,7 @@ +regex2 = '''I [dw]on't need \d{2} apples''' +lines = ''' +The first newline is +trimmed in raw strings. + All other whitespace + is preserved. +''' diff --git a/parse/benchmark_test.go b/parse/benchmark_test.go new file mode 100644 index 0000000..e974a98 --- /dev/null +++ b/parse/benchmark_test.go @@ -0,0 +1,76 @@ +package parse_test + +import ( + "testing" +) + +func A(b byte) (byte, bool) { + if b > 'b' { + switch b { + case 't': + return '\t', true + case 'n': + return '\n', true + case 'r': + return '\r', true + case 'f': + return '\f', true + } + } else { + switch b { + case '"': + return '"', true + case '\\': + return '\\', true + case 'b': + return '\b', true + } + } + return 0x00, false +} + +func B(b byte) (byte, bool) { + switch b { + case 'r': + return '\r', true + case 'n': + return '\n', true + case 't': + return '\t', true + case 'b': + return '\b', true + case 'f': + return '\f', true + case '"': + return '"', true + case '\\': + return '\\', true + } + return 0x00, false +} + +// Benchmark_A times A, the escape lookup split into two switches on b > 'b'. TODO cleanup unused benchmark. +func Benchmark_A(b *testing.B) { + for i := 0; i < b.N; i++ { + A('b') + A('t') + A('n') + A('f') + A('r') + A('"') + A('\\') + } +} + +// Benchmark_B times B, the escape lookup done with one flat switch. TODO cleanup unused benchmark.
+func Benchmark_B(b *testing.B) { + for i := 0; i < b.N; i++ { + B('b') + B('t') + B('n') + B('f') + B('r') + B('"') + B('\\') + } +} diff --git a/parse/keyvaluepair.go b/parse/keyvaluepair.go index 4465221..f92a060 100644 --- a/parse/keyvaluepair.go +++ b/parse/keyvaluepair.go @@ -52,6 +52,8 @@ func (t *parser) startKeyValuePair(p *parse.API) { } else if !p.Skip(endOfLineOrComment) { p.Expected("end of line") } + } else { + p.Expected("a value") } } } diff --git a/parse/value.go b/parse/value.go index 0d46342..dac8251 100644 --- a/parse/value.go +++ b/parse/value.go @@ -6,9 +6,9 @@ import ( ) var ( - detectString = a.SingleQuote.Or(a.DoubleQuote) - detectBoolean = a.Str("true").Or(a.Str("false")) - detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan")) + detectString = a.Char('\'', '"') + detectBoolean = a.Str("true").Or(a.Str("false")) // TODO detect on 't' or 'f' only and let the boolean handler format errors on mismatch + detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan")) // TODO same simplification as the detectBoolean TODO above detectDateTime = a.Digits.Then(a.Minus.Or(a.Colon)) detectNumber = a.Digit detectArray = a.SquareOpen diff --git a/parse/value_string.go b/parse/value_string.go index 46bb2b1..12c1c42 100644 --- a/parse/value_string.go +++ b/parse/value_string.go @@ -1,12 +1,10 @@ package parse import ( - "fmt" - "strconv" - "strings" "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/parse" + "git.makaay.nl/mauricem/go-parsekit/tokenize" "git.makaay.nl/mauricem/go-toml/ast" ) @@ -30,11 +28,6 @@ var ( // Opening and losing character for literal strings. literalStringDelimiter = a.SingleQuote - // Control characters as defined by TOML (U+0000 to U+001F, U+007F) - - isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F } - controlCharacter = a.ByteByCallback(isControlCharacter) - // For convenience, some popular characters have a compact escape sequence. 
// // \b - backspace (U+0008) @@ -96,37 +89,203 @@ func (t *parser) parseString(p *parse.API) (*ast.Value, bool) { // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { - if !p.Skip(a.DoubleQuote) { - p.Expected(`opening quotation marks`) + if !p.Accept(basicStringHandler) { return "", false } - sb := &strings.Builder{} + return p.Result.String(), true +} + +type stringTokenizerState int + +const ( + strStart stringTokenizerState = iota + strStart2 + strStart3 + strStart4 + strChar + strEscape + strEscapeUnicode + strEscapeConcatWs1 + strEscapeConcatCRLF + strEscapeConcatWs2 + strCRLF + strUTF8 + strEnd2 + strEnd3 +) + +const ( + lowest6bits = 0x3F // 0011 1111 + lowest5bits = 0x1F // 0001 1111 + lowest4bits = 0x0F // 0000 1111 + lowest3bits = 0x07 // 0000 0111 +) + +func basicStringHandler(tokenAPI *tokenize.API) bool { + var state stringTokenizerState + in := tokenAPI.Input + out := tokenAPI.Output + + unicodeReqLen := 0 + unicodeLen := 0 + unicodeHex := make([]byte, 8) + + utf8ReqLen := 0 + utf8Len := 0 + utf8Rune := rune(0) + utf8Bytes := make([]byte, 4) + for { - switch { - case p.Peek(controlCharacter): - p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Byte(0)) - return sb.String(), false - case p.Accept(validEscape): - if !appendEscapedRune(p, sb) { - return sb.String(), false + bs, _ := in.Byte.PeekBuffered(0) + bslen := len(bs) + if bslen == 0 { + return false + } + for i := 0; i < bslen; i++ { + b := bs[i] + switch state { + case strStart: + if b != '"' { + // No opening quotes found. + return false + } + in.Byte.MoveCursor(b) + state = strChar + case strChar: + switch { + case (b >= 0x00 && b <= 0x1F) || b == 0x7F: + // Control characters as defined by the TOML specification. + // These must always be escaped.
+ // Unescaped control character + // TODO error reporting instead of full reject + return false + case b == '\\': + in.Byte.MoveCursor(b) + state = strEscape + continue + case b == '"': + in.Byte.MoveCursor(b) + return true + } + switch b >> 4 { + case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) + utf8ReqLen = 2 + utf8Rune = rune(b & lowest5bits) // payload bits only; strUTF8 shifts in each continuation byte + case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 3 + utf8Rune = rune(b & lowest4bits) // payload bits only, no pre-shift (it truncated in byte width) + case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 4 + utf8Rune = rune(b & lowest3bits) // payload bits only, no pre-shift (it truncated in byte width) + default: // Invalid UTF8 rune + return false + } + utf8Bytes[0] = b + utf8Len = 1 + state = strUTF8 + case strUTF8: + // This should be a continuation byte (10xxxxxx) + if b>>6 != 2 { + // Invalid UTF8 rune + return false + } + utf8Bytes[utf8Len] = b + utf8Len++ + utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) + if utf8Len == utf8ReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode character + return false + } + bytes := utf8Bytes[:utf8Len] + out.AddBytes(bytes...) + in.Byte.MoveCursorMulti(bytes...) + state = strChar + } + case strEscape: + state = strChar + if escaped, ok := getEscapedChar(b); ok { + out.AddByte(escaped) + in.Byte.MoveCursor(b) + continue + } + switch b { + case 'u', 'U': + in.Byte.MoveCursor(b) + unicodeReqLen = 4 + if b == 'u' { + unicodeReqLen = 4 + } else { + unicodeReqLen = 8 + } + unicodeLen = 0 + utf8Rune = 0 + state = strEscapeUnicode + default: + // Invalid escape sequence used. + return false + } + case strEscapeUnicode: + value, ok := getHexValueForChar(b) + if !ok { + // Invalid unicode escape sequence used.
+ return false + } + utf8Rune = utf8Rune<<4 + rune(value) + unicodeHex[unicodeLen] = b + unicodeLen++ + if unicodeLen == unicodeReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode escape + return false + } + in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) + w := utf8.EncodeRune(utf8Bytes, utf8Rune) + out.AddBytes(utf8Bytes[:w]...) + state = strChar + } } - case p.Peek(a.Backslash): - p.SetError("invalid escape sequence") - return sb.String(), false - case p.Skip(basicStringDelimiter): - return sb.String(), true - case p.Peek(a.InvalidRune): - p.SetError("invalid UTF8 rune") - return sb.String(), false - case p.Accept(a.ValidRune): - sb.WriteString(p.Result.String()) - default: - p.Expected(`closing quotation marks`) - return sb.String(), false } } } +func getHexValueForChar(b byte) (byte, bool) { + switch { + case '0' <= b && b <= '9': + return b - '0', true + case 'a' <= b && b <= 'f': // hex digits end at f; g-z must be rejected + return b - 'a' + 10, true + case 'A' <= b && b <= 'F': // likewise for upper case: only A-F are hex digits + return b - 'A' + 10, true + default: + return 0, false + } +} + +func getEscapedChar(b byte) (byte, bool) { + switch b { + case 'b': + return '\b', true + case 't': + return '\t', true + case 'n': + return '\n', true + case 'f': + return '\f', true + case 'r': + return '\r', true + case '"': + return '"', true + case '\\': + return '\\', true + } + return 0, false +} + // Specific handling of input for literal strings. // // • Literal strings are surrounded by single quotes. @@ -135,28 +294,88 @@ func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { // // • Control characters other than tab are not permitted in a literal string. 
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { - if !p.Skip(a.SingleQuote) { - p.Expected("opening single quote") + if !p.Accept(literalStringHandler) { return "", false } - sb := &strings.Builder{} + return p.Result.String(), true +} + +func literalStringHandler(tokenAPI *tokenize.API) bool { + var state stringTokenizerState + in := tokenAPI.Input + out := tokenAPI.Output + + utf8ReqLen := 0 + utf8Len := 0 + utf8Rune := rune(0) + utf8Bytes := [4]byte{} + for { - switch { - case p.Skip(literalStringDelimiter): - return sb.String(), true - case p.Skip(a.Tab): - sb.WriteString("\t") - case p.Peek(controlCharacter): - p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Byte(0)) - return sb.String(), false - case p.Peek(a.InvalidRune): - p.SetError("invalid UTF8 rune") - return sb.String(), false - case p.Accept(a.ValidRune): - sb.WriteString(p.Result.String()) - default: - p.Expected("closing single quote") - return sb.String(), false + bs, _ := tokenAPI.Input.Byte.PeekBuffered(0) + bslen := len(bs) + if bslen == 0 { + // Unexpected end of file. + return false + } + for i := 0; i < bslen; i++ { + b := bs[i] + switch state { + case strStart: + if b != '\'' { + // No opening quote found. + return false + } + in.Byte.MoveCursor(b) + state = strChar + case strChar: + switch { + case (b >= 0x00 && b < 0x09) || (b > 0x09 && b <= 0x1F) || b == 0x7F: + // Unescaped control character + return false + case b == '\'': + in.Byte.MoveCursor(b) + return true + } + switch b >> 4 { + case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a.
ASCII) + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) + utf8ReqLen = 2 + utf8Rune = rune(b & lowest5bits) // payload bits only; strUTF8 shifts in each continuation byte + case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 3 + utf8Rune = rune(b & lowest4bits) // payload bits only, no pre-shift (it truncated in byte width) + case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 4 + utf8Rune = rune(b & lowest3bits) // payload bits only, no pre-shift (it truncated in byte width) + default: // Invalid UTF8 rune + return false + } + utf8Bytes[0] = b + utf8Len = 1 + state = strUTF8 + case strUTF8: + // This should be a continuation byte (10xxxxxx) + if b>>6 != 2 { + // Invalid UTF8 rune + return false + } + utf8Bytes[utf8Len] = b + utf8Len++ + utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) + if utf8Len == utf8ReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode character + return false + } + bytes := utf8Bytes[:utf8Len] + out.AddBytes(bytes...) + in.Byte.MoveCursorMulti(bytes...) + state = strChar + } + } } } } @@ -185,70 +404,257 @@ func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { // a \, it will be trimmed along with all whitespace (including newlines) up to // the next non-whitespace character or closing delimiter. 
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { - if !p.Skip(openingMultiLineBasicString) { - p.Expected("opening three quotation marks") + if !p.Accept(multiLineBasicStringHandler) { return "", false } - sb := &strings.Builder{} - for { - switch { - case p.Skip(newline): - sb.WriteString("\n") - case p.Peek(controlCharacter): - p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Byte(0)) - return sb.String(), false - case p.Accept(validEscape): - if !appendEscapedRune(p, sb) { - return sb.String(), false - } - case p.Skip(lineEndingBackslash): - // NOOP - case p.Peek(a.Backslash): - p.SetError("invalid escape sequence") - return sb.String(), false - case p.Skip(closingMultiLineBasicString): - return sb.String(), true - case p.Accept(a.ValidRune): - sb.WriteString(p.Result.String()) - case p.Peek(a.InvalidRune): - p.SetError("invalid UTF8 rune") - return sb.String(), false - default: - p.Expected("closing three quotation marks") - return sb.String(), false - } - } + return p.Result.String(), true } -func appendEscapedRune(p *parse.API, sb *strings.Builder) bool { - s := p.Result.String() - switch s { - case `\b`: - sb.WriteRune('\b') - case `\t`: - sb.WriteRune('\t') - case `\n`: - sb.WriteRune('\n') - case `\f`: - sb.WriteRune('\f') - case `\r`: - sb.WriteRune('\r') - case `\"`: - sb.WriteRune('"') - case `\\`: - sb.WriteRune('\\') - default: - // UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX.
- hex := s[2:] - val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser - r := rune(val) - if !utf8.ValidRune(r) { - p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", s)) +func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool { + var state stringTokenizerState + in := tokenAPI.Input + out := tokenAPI.Output + + unicodeReqLen := 0 + unicodeLen := 0 + unicodeHex := make([]byte, 8) + + utf8ReqLen := 0 + utf8Len := 0 + utf8Rune := rune(0) + utf8Bytes := make([]byte, 4) + + crlf := false + + for { + bs, _ := in.Byte.PeekBuffered(0) + bslen := len(bs) + if bslen == 0 { return false } - sb.WriteRune(r) + for i := 0; i < bslen; i++ { + b := bs[i] + switch state { + case strStart, strStart2, strStart3: + if b != '"' { + // No triple opening quotes found. + return false + } + in.Byte.MoveCursor(b) + switch state { + case strStart: + state = strStart2 + case strStart2: + state = strStart3 + case strStart3: + state = strStart4 + } + case strStart4: + if !crlf && b == '\r' { + crlf = true + in.Byte.MoveCursor(b) + continue + } + if b == '\n' { + in.Byte.MoveCursor(b) + state = strChar + continue + } + if crlf { + // Lonely \r without \n. + return false + } + state = strChar + fallthrough + case strChar: + switch { + case b == '\r': + state = strCRLF + continue + case b == '\n': + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case (b >= 0x00 && b <= 0x1F) || b == 0x7F: + // Unescaped control character + // TODO error reporting instead of full reject + return false + case b == '\\': + in.Byte.MoveCursor(b) + state = strEscape + continue + case b == '"': + in.Byte.MoveCursor(b) + state = strEnd2 + continue + } + switch b >> 4 { + case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a.
ASCII) + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) + utf8ReqLen = 2 + utf8Rune = rune(b & lowest5bits) // payload bits only; strUTF8 shifts in each continuation byte + case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 3 + utf8Rune = rune(b & lowest4bits) // payload bits only, no pre-shift (it truncated in byte width) + case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 4 + utf8Rune = rune(b & lowest3bits) // payload bits only, no pre-shift (it truncated in byte width) + default: // Invalid UTF8 rune + return false + } + utf8Bytes[0] = b + utf8Len = 1 + state = strUTF8 + case strUTF8: + // This should be a continuation byte (10xxxxxx) + if b>>6 != 2 { + // Invalid UTF8 rune + return false + } + utf8Bytes[utf8Len] = b + utf8Len++ + utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) + if utf8Len == utf8ReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode character + return false + } + bytes := utf8Bytes[:utf8Len] + out.AddBytes(bytes...) + in.Byte.MoveCursorMulti(bytes...) + state = strChar + } + case strCRLF: + if b == '\n' { + in.Byte.MoveCursorMulti('\r', b) + out.AddByte('\n') + state = strChar + continue + } + // Lonely \r, should have been escaped. + return false + case strEscape: + state = strChar + if escaped, ok := getEscapedChar(b); ok { + out.AddByte(escaped) + in.Byte.MoveCursor(b) + continue + } + switch b { + case ' ', '\t': + in.Byte.MoveCursor(b) + state = strEscapeConcatWs1 + continue + case '\r': + in.Byte.MoveCursor(b) + state = strEscapeConcatCRLF + continue + case '\n': + in.Byte.MoveCursor(b) + state = strEscapeConcatWs2 + continue + case 'u', 'U': + in.Byte.MoveCursor(b) + unicodeReqLen = 4 + if b == 'u' { + unicodeReqLen = 4 + } else { + unicodeReqLen = 8 + } + unicodeLen = 0 + utf8Rune = 0 + state = strEscapeUnicode + default: + // Invalid escape sequence used.
+ return false + } + case strEscapeConcatWs1: + switch b { + case ' ', '\t': + in.Byte.MoveCursor(b) + continue + case '\r': + in.Byte.MoveCursor(b) + state = strEscapeConcatCRLF + continue + case '\n': + in.Byte.MoveCursor(b) + state = strEscapeConcatWs2 + continue + default: + // Invalid line concatenation + return false + } + case strEscapeConcatCRLF: + switch b { + case '\n': + in.Byte.MoveCursor(b) + state = strEscapeConcatWs2 + continue + default: + // Invalid line concatenation + return false + } + case strEscapeConcatWs2: + switch b { + case ' ', '\t': + in.Byte.MoveCursor(b) + continue + case '\r': + in.Byte.MoveCursor(b) + state = strEscapeConcatCRLF + continue + case '\n': + in.Byte.MoveCursor(b) + state = strEscapeConcatWs2 + continue + default: + i-- + state = strChar + continue + } + case strEscapeUnicode: + value, ok := getHexValueForChar(b) + if !ok { + // Invalid unicode escape sequence used. + return false + } + utf8Rune = utf8Rune<<4 + rune(value) + unicodeHex[unicodeLen] = b + unicodeLen++ + if unicodeLen == unicodeReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode escape + return false + } + in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...) + w := utf8.EncodeRune(utf8Bytes, utf8Rune) + out.AddBytes(utf8Bytes[:w]...) + state = strChar + } + case strEnd2: + if b == '"' { + state = strEnd3 + in.Byte.MoveCursor(b) + } else { + state = strChar + out.AddByte('"') + i-- + } + case strEnd3: + if b == '"' { + in.Byte.MoveCursor(b) + return true + } + state = strChar + out.AddBytes('"', '"') + i-- + } + } } - return true } // Specific handling of input for multi-line literal strings. @@ -265,30 +671,148 @@ func appendEscapedRune(p *parse.API, sb *strings.Builder) bool { // // • Control characters other than tab and newline are not permitted in a multi-line literal string. 
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { - if !p.Skip(openingMultiLineLiteralString) { - p.Expected("opening three single quotes") + if !p.Accept(multiLineLiteralStringHandler) { return "", false } - sb := &strings.Builder{} + return p.Result.String(), true +} + +func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool { + var state stringTokenizerState + in := tokenAPI.Input + out := tokenAPI.Output + + utf8ReqLen := 0 + utf8Len := 0 + utf8Rune := rune(0) + utf8Bytes := make([]byte, 4) + + crlf := false + for { - switch { - case p.Skip(closingMultiLineLiteralString): - return sb.String(), true - case p.Skip(a.Tab): - sb.WriteString("\t") - case p.Skip(newline): - sb.WriteString("\n") - case p.Peek(controlCharacter): - p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Byte(0)) - return sb.String(), false - case p.Accept(a.ValidRune): - sb.WriteString(p.Result.String()) - case p.Peek(a.InvalidRune): - p.SetError("invalid UTF8 rune") - return sb.String(), false - default: - p.Expected("closing three single quotes") - return sb.String(), false + bs, _ := in.Byte.PeekBuffered(0) + bslen := len(bs) + if bslen == 0 { + return false + } + for i := 0; i < bslen; i++ { + b := bs[i] + switch state { + case strStart, strStart2, strStart3: + if b != '\'' { + // No triple opening quotes found. + return false + } + in.Byte.MoveCursor(b) + switch state { + case strStart: + state = strStart2 + case strStart2: + state = strStart3 + case strStart3: + state = strStart4 + } + case strStart4: + if !crlf && b == '\r' { + crlf = true + in.Byte.MoveCursor(b) + continue + } + if b == '\n' { + in.Byte.MoveCursor(b) + state = strChar + continue + } + if crlf { + // Lonely \r without \n.
+ return false + } + state = strChar + fallthrough + case strChar: + switch { + case b == '\r': + state = strCRLF + continue + case b == '\n' || b == '\t': + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case (b >= 0x00 && b <= 0x1F) || b == 0x7F: + // Unescaped control character + // TODO error reporting instead of full reject + return false + case b == '\'': + in.Byte.MoveCursor(b) + state = strEnd2 + continue + } + switch b >> 4 { + case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII) + out.AddByte(b) + in.Byte.MoveCursor(b) + continue + case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx) + utf8ReqLen = 2 + utf8Rune = rune(b & lowest5bits) // payload bits only; strUTF8 shifts in each continuation byte + case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 3 + utf8Rune = rune(b & lowest4bits) // payload bits only, no pre-shift (it truncated in byte width) + case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + utf8ReqLen = 4 + utf8Rune = rune(b & lowest3bits) // payload bits only, no pre-shift (it truncated in byte width) + default: // Invalid UTF8 rune + return false + } + utf8Bytes[0] = b + utf8Len = 1 + state = strUTF8 + case strUTF8: + // This should be a continuation byte (10xxxxxx) + if b>>6 != 2 { + // Invalid UTF8 rune + return false + } + utf8Bytes[utf8Len] = b + utf8Len++ + utf8Rune = utf8Rune<<6 + rune(b&lowest6bits) + if utf8Len == utf8ReqLen { + if !utf8.ValidRune(utf8Rune) { + // Invalid unicode character + return false + } + bytes := utf8Bytes[:utf8Len] + out.AddBytes(bytes...) + in.Byte.MoveCursorMulti(bytes...) + state = strChar + } + case strCRLF: + if b == '\n' { + in.Byte.MoveCursorMulti('\r', b) + out.AddByte('\n') + state = strChar + continue + } + // Lonely \r, should have been escaped.
+ return false + case strEnd2: + if b == '\'' { + state = strEnd3 + in.Byte.MoveCursor(b) + } else { + state = strChar + out.AddByte('\'') + i-- + } + case strEnd3: + if b == '\'' { + in.Byte.MoveCursor(b) + return true + } + state = strChar + out.AddBytes('\'', '\'') + i-- + } } } } diff --git a/parse/value_string_test.go b/parse/value_string_test.go index c5445ad..2a336af 100644 --- a/parse/value_string_test.go +++ b/parse/value_string_test.go @@ -79,7 +79,8 @@ func TestMultiLineBasicString(t *testing.T) { {"x=\"\"\"\n\"\"\"", `{"x": ""}`, ``}, {"x=\"\"\"\r\n\r\n\"\"\"", `{"x": "\n"}`, ``}, {`x="""\"\"\"\""""`, `{"x": "\"\"\"\""}`, ``}, - {"x=\"\"\"\nThe quick brown \\\n\n\n \t fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``}, + {"x=\"\"\"\nThe quick brown \\\r\n\r\n\n \t fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``}, + {"x=\"\"\"\r\nThe quick brown \\\r\n\r\n\n \t\r\n \n\n fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``}, {"x=\"\"\"No control chars \f allowed\"\"\"", `{}`, `invalid character in multi-line basic string: '\f' (must be escaped) at line 1, column 23`}, {"x=\"\"\"Escaping control chars\\nis valid\"\"\"", `{"x": "Escaping control chars\nis valid"}`, ``}, {"x=\"\"\"Invalid escaping \\is not allowed\"\"\"", `{}`, `invalid escape sequence at line 1, column 23`},