Wrote a first crude version of specific tokenizer handlers for string parsing. Still missing is good error reporting from the tokenizer handler code (this has been a TODO for a while, so it is a nice one to implement after this).

This commit is contained in:
Maurice Makaay 2019-07-31 07:51:37 +00:00
parent 5ff6f20ab7
commit ed846c7e53
6 changed files with 742 additions and 132 deletions

View File

@ -0,0 +1,7 @@
regex2 = '''I [dw]on't need \d{2} apples'''
lines = '''
The first newline is
trimmed in raw strings.
All other whitespace
is preserved.
'''

76
parse/benchmark_test.go Normal file
View File

@ -0,0 +1,76 @@
package parse_test
import (
"testing"
)
// A translates an escape letter to the byte it stands for ('t' becomes a tab,
// 'n' a newline, and so on). The boolean result is false, and the byte 0x00,
// when b is not one of the seven recognized characters: b, t, n, f, r, " and \.
//
// This variant first splits the input on b > 'b' and then runs a smaller
// switch for each half. It is benchmarked against B, which returns the same
// results using a single flat switch.
func A(b byte) (byte, bool) {
if b > 'b' {
// Upper half: 'f', 'n', 'r' and 't' all sort after 'b'.
switch b {
case 't':
return '\t', true
case 'n':
return '\n', true
case 'r':
return '\r', true
case 'f':
return '\f', true
}
} else {
// Lower half: '"' and '\\' sort before 'b', and 'b' itself ends up here too.
switch b {
case '"':
return '"', true
case '\\':
return '\\', true
case 'b':
return '\b', true
}
}
// Not a recognized escape character.
return 0x00, false
}
// B translates an escape letter to the byte it stands for ('t' becomes a tab,
// 'n' a newline, and so on). The boolean result is false, and the byte 0x00,
// when b is not one of the seven recognized characters: b, t, n, f, r, " and \.
//
// This variant uses one flat switch over all recognized characters. It is
// benchmarked against A, which returns the same results using a two-level
// lookup.
func B(b byte) (byte, bool) {
switch b {
case 'r':
return '\r', true
case 'n':
return '\n', true
case 't':
return '\t', true
case 'b':
return '\b', true
case 'f':
return '\f', true
case '"':
return '"', true
case '\\':
return '\\', true
}
// Not a recognized escape character.
return 0x00, false
}
// benchInputsA holds the escape characters that Benchmark_A feeds to A, in
// the same order as the original hard-coded calls. Reading them from a
// package-level variable prevents the compiler from constant-folding the
// calls to A.
var benchInputsA = []byte{'b', 't', 'n', 'f', 'r', '"', '\\'}

// SinkA receives the accumulated results of Benchmark_A. Storing the result
// in a package-level variable prevents the compiler from eliminating the
// calls to A as dead code.
var SinkA byte

// Benchmark_A measures the time it takes A to translate each of the seven
// recognized escape characters once per iteration.
//
// TODO cleanup unused benchmark.
func Benchmark_A(b *testing.B) {
	var acc byte
	for i := 0; i < b.N; i++ {
		for _, c := range benchInputsA {
			r, _ := A(c)
			// Fold every result into acc, so that no call can be dropped.
			acc ^= r
		}
	}
	SinkA = acc
}
// benchInputsB holds the escape characters that Benchmark_B feeds to B, in
// the same order as the original hard-coded calls. Reading them from a
// package-level variable prevents the compiler from constant-folding the
// calls to B.
var benchInputsB = []byte{'b', 't', 'n', 'f', 'r', '"', '\\'}

// SinkB receives the accumulated results of Benchmark_B. Storing the result
// in a package-level variable prevents the compiler from eliminating the
// calls to B as dead code.
var SinkB byte

// Benchmark_B measures the time it takes B to translate each of the seven
// recognized escape characters once per iteration.
//
// TODO cleanup unused benchmark.
func Benchmark_B(b *testing.B) {
	var acc byte
	for i := 0; i < b.N; i++ {
		for _, c := range benchInputsB {
			r, _ := B(c)
			// Fold every result into acc, so that no call can be dropped.
			acc ^= r
		}
	}
	SinkB = acc
}

View File

@ -52,6 +52,8 @@ func (t *parser) startKeyValuePair(p *parse.API) {
} else if !p.Skip(endOfLineOrComment) { } else if !p.Skip(endOfLineOrComment) {
p.Expected("end of line") p.Expected("end of line")
} }
} else {
p.Expected("a value")
} }
} }
} }

View File

@ -6,9 +6,9 @@ import (
) )
var ( var (
detectString = a.SingleQuote.Or(a.DoubleQuote) detectString = a.Char('\'', '"')
detectBoolean = a.Str("true").Or(a.Str("false")) detectBoolean = a.Str("true").Or(a.Str("false")) // TODO use 't' or 'f' and let the boolean handler format errors on mismatch
detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan")) detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan")) // TODO likewise as for boolean
detectDateTime = a.Digits.Then(a.Minus.Or(a.Colon)) detectDateTime = a.Digits.Then(a.Minus.Or(a.Colon))
detectNumber = a.Digit detectNumber = a.Digit
detectArray = a.SquareOpen detectArray = a.SquareOpen

View File

@ -1,12 +1,10 @@
package parse package parse
import ( import (
"fmt"
"strconv"
"strings"
"unicode/utf8" "unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/parse" "git.makaay.nl/mauricem/go-parsekit/parse"
"git.makaay.nl/mauricem/go-parsekit/tokenize"
"git.makaay.nl/mauricem/go-toml/ast" "git.makaay.nl/mauricem/go-toml/ast"
) )
@ -30,11 +28,6 @@ var (
// Opening and closing character for literal strings. // Opening and closing character for literal strings.
literalStringDelimiter = a.SingleQuote literalStringDelimiter = a.SingleQuote
// Control characters as defined by TOML (U+0000 to U+001F, U+007F)
isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F }
controlCharacter = a.ByteByCallback(isControlCharacter)
// For convenience, some popular characters have a compact escape sequence. // For convenience, some popular characters have a compact escape sequence.
// //
// \b - backspace (U+0008) // \b - backspace (U+0008)
@ -96,37 +89,203 @@ func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
// "All other escape sequences [..] are reserved and, if used, TOML should // "All other escape sequences [..] are reserved and, if used, TOML should
// produce an error." // produce an error."
func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
if !p.Skip(a.DoubleQuote) { if !p.Accept(basicStringHandler) {
p.Expected(`opening quotation marks`)
return "", false return "", false
} }
sb := &strings.Builder{} return p.Result.String(), true
}
// stringTokenizerState identifies the current state of the byte-by-byte
// state machines that are run by the string tokenizer handlers in this file.
type stringTokenizerState int
const (
// strStart is the initial state: the (first) opening quote is expected.
strStart stringTokenizerState = iota
// strStart2 expects the second opening quote of a multi-line string.
strStart2
// strStart3 expects the third opening quote of a multi-line string.
strStart3
// strStart4 follows the three opening quotes of a multi-line string.
// A newline (LF or CRLF) directly after the opening quotes is skipped.
strStart4
// strChar reads regular string content.
strChar
// strEscape follows a backslash: the next byte selects the escape.
strEscape
// strEscapeUnicode collects the 4 (\u) or 8 (\U) hex digits of a
// unicode escape.
strEscapeUnicode
// strEscapeConcatWs1 skips spaces and tabs that follow a backslash,
// up to the newline that must end the line.
strEscapeConcatWs1
// strEscapeConcatCRLF follows a \r that was read while handling a
// line ending backslash: a \n must come next.
strEscapeConcatCRLF
// strEscapeConcatWs2 skips whitespace and newlines after the newline
// of a line ending backslash, until string content resumes.
strEscapeConcatWs2
// strCRLF follows a \r in the string content: a \n must come next.
strCRLF
// strUTF8 collects the continuation bytes of a multi-byte UTF8 rune.
strUTF8
// strEnd2 follows one quote inside a multi-line string, which might be
// the start of the closing delimiter.
strEnd2
// strEnd3 follows two consecutive quotes inside a multi-line string.
strEnd3
)
// Bit masks for extracting payload bits from the bytes of a UTF8 sequence.
// In the string handlers, lowest6bits is applied to continuation bytes and
// lowest5bits, lowest4bits and lowest3bits to the leading byte of
// respectively a 2, 3 and 4 byte sequence.
const (
lowest6bits = 0x3F // 0011 1111
lowest5bits = 0x1F // 0001 1111
lowest4bits = 0x0F // 0000 1111
lowest3bits = 0x07 // 0000 0111
)
func basicStringHandler(tokenAPI *tokenize.API) bool {
var state stringTokenizerState
in := tokenAPI.Input
out := tokenAPI.Output
unicodeReqLen := 0
unicodeLen := 0
unicodeHex := make([]byte, 8)
utf8ReqLen := 0
utf8Len := 0
utf8Rune := rune(0)
utf8Bytes := make([]byte, 4)
for { for {
switch { bs, _ := in.Byte.PeekBuffered(0)
case p.Peek(controlCharacter): bslen := len(bs)
p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Byte(0)) if bslen == 0 {
return sb.String(), false return false
case p.Accept(validEscape): }
if !appendEscapedRune(p, sb) { for i := 0; i < bslen; i++ {
return sb.String(), false b := bs[i]
switch state {
case strStart:
if b != '"' {
// No opening quotes found.
return false
}
in.Byte.MoveCursor(b)
state = strChar
case strChar:
switch {
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
// Control characters as defined by the TOML specification.
// These must always be escaped.
// Unescaped control character
// TODO error reporting instead of full reject
return false
case b == '\\':
in.Byte.MoveCursor(b)
state = strEscape
continue
case b == '"':
in.Byte.MoveCursor(b)
return true
}
switch b >> 4 {
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
utf8ReqLen = 2
utf8Rune = rune((b & lowest5bits) << 6)
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 3
utf8Rune = rune((b & lowest4bits) << 6)
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 4
utf8Rune = rune((b & lowest3bits) << 6)
default: // Invalid UTF8 rune
return false
}
utf8Bytes[0] = b
utf8Len = 1
state = strUTF8
case strUTF8:
// This should be a continuation byte (10xxxxxx)
if b>>6 != 2 {
// Invalid UTF8 rune
return false
}
utf8Bytes[utf8Len] = b
utf8Len++
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
if utf8Len == utf8ReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode character
return false
}
bytes := utf8Bytes[:utf8Len]
out.AddBytes(bytes...)
in.Byte.MoveCursorMulti(bytes...)
state = strChar
}
case strEscape:
state = strChar
if escaped, ok := getEscapedChar(b); ok {
out.AddByte(escaped)
in.Byte.MoveCursor(b)
continue
}
switch b {
case 'u', 'U':
in.Byte.MoveCursor(b)
unicodeReqLen = 4
if b == 'u' {
unicodeReqLen = 4
} else {
unicodeReqLen = 8
}
unicodeLen = 0
utf8Rune = 0
state = strEscapeUnicode
default:
// Invalid escape sequence used.
return false
}
case strEscapeUnicode:
value, ok := getHexValueForChar(b)
if !ok {
// Invalid unicode escape sequence used.
return false
}
utf8Rune = utf8Rune<<4 + rune(value)
unicodeHex[unicodeLen] = b
unicodeLen++
if unicodeLen == unicodeReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode escape
return false
}
in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
w := utf8.EncodeRune(utf8Bytes, utf8Rune)
out.AddBytes(utf8Bytes[:w]...)
state = strChar
}
} }
case p.Peek(a.Backslash):
p.SetError("invalid escape sequence")
return sb.String(), false
case p.Skip(basicStringDelimiter):
return sb.String(), true
case p.Peek(a.InvalidRune):
p.SetError("invalid UTF8 rune")
return sb.String(), false
case p.Accept(a.ValidRune):
sb.WriteString(p.Result.String())
default:
p.Expected(`closing quotation marks`)
return sb.String(), false
} }
} }
} }
// getHexValueForChar returns the numeric value (0 - 15) of the hexadecimal
// digit b. Both lower case (a - f) and upper case (A - F) letters are
// accepted. The boolean result is false, and the value 0, when b is not a
// hexadecimal digit.
func getHexValueForChar(b byte) (byte, bool) {
	switch {
	case '0' <= b && b <= '9':
		return b - '0', true
	case 'a' <= b && b <= 'f':
		// Only a - f are hex digits; g - z must be rejected, otherwise
		// input like \uzzzz would be taken as a valid unicode escape.
		return b - 'a' + 10, true
	case 'A' <= b && b <= 'F':
		return b - 'A' + 10, true
	default:
		return 0, false
	}
}
// getEscapedChar resolves the character that follows a backslash in a compact
// escape sequence to the byte that the sequence represents. The boolean
// result is false, and the byte 0, when b does not form a compact escape.
func getEscapedChar(b byte) (byte, bool) {
	var unescaped byte
	switch b {
	case '"', '\\':
		// These two characters stand for themselves.
		unescaped = b
	case 'r':
		unescaped = '\r'
	case 'f':
		unescaped = '\f'
	case 'n':
		unescaped = '\n'
	case 't':
		unescaped = '\t'
	case 'b':
		unescaped = '\b'
	default:
		return 0, false
	}
	return unescaped, true
}
// Specific handling of input for literal strings. // Specific handling of input for literal strings.
// //
// • Literal strings are surrounded by single quotes. // • Literal strings are surrounded by single quotes.
@ -135,28 +294,88 @@ func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
// //
// • Control characters other than tab are not permitted in a literal string. // • Control characters other than tab are not permitted in a literal string.
func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
if !p.Skip(a.SingleQuote) { if !p.Accept(literalStringHandler) {
p.Expected("opening single quote")
return "", false return "", false
} }
sb := &strings.Builder{} return p.Result.String(), true
}
func literalStringHandler(tokenAPI *tokenize.API) bool {
var state stringTokenizerState
in := tokenAPI.Input
out := tokenAPI.Output
utf8ReqLen := 0
utf8Len := 0
utf8Rune := rune(0)
utf8Bytes := [4]byte{}
for { for {
switch { bs, _ := tokenAPI.Input.Byte.PeekBuffered(0)
case p.Skip(literalStringDelimiter): bslen := len(bs)
return sb.String(), true if bslen == 0 {
case p.Skip(a.Tab): // Unexpected end of file.
sb.WriteString("\t") return false
case p.Peek(controlCharacter): }
p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Byte(0)) for i := 0; i < bslen; i++ {
return sb.String(), false b := bs[i]
case p.Peek(a.InvalidRune): switch state {
p.SetError("invalid UTF8 rune") case strStart:
return sb.String(), false if b != '\'' {
case p.Accept(a.ValidRune): // No opening quote found.
sb.WriteString(p.Result.String()) return false
default: }
p.Expected("closing single quote") in.Byte.MoveCursor(b)
return sb.String(), false state = strChar
case strChar:
switch {
case (b >= 0x00 && b < 0x09) || (b > 0x09 && b <= 0x1F) || b == 0x7F:
// Unescaped control character
return false
case b == '\'':
in.Byte.MoveCursor(b)
return true
}
switch b >> 4 {
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
utf8ReqLen = 2
utf8Rune = rune((b & lowest5bits) << 6)
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 3
utf8Rune = rune((b & lowest4bits) << 6)
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 4
utf8Rune = rune((b & lowest3bits) << 6)
default: // Invalid UTF8 rune
return false
}
utf8Bytes[0] = b
utf8Len = 1
state = strUTF8
case strUTF8:
// This should be a continuation byte (10xxxxxx)
if b>>6 != 2 {
// Invalid UTF8 rune
return false
}
utf8Bytes[utf8Len] = b
utf8Len++
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
if utf8Len == utf8ReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode character
return false
}
bytes := utf8Bytes[:utf8Len]
out.AddBytes(bytes...)
in.Byte.MoveCursorMulti(bytes...)
state = strChar
}
}
} }
} }
} }
@ -185,70 +404,257 @@ func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
// a \, it will be trimmed along with all whitespace (including newlines) up to // a \, it will be trimmed along with all whitespace (including newlines) up to
// the next non-whitespace character or closing delimiter. // the next non-whitespace character or closing delimiter.
func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) { func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
if !p.Skip(openingMultiLineBasicString) { if !p.Accept(multiLineBasicStringHandler) {
p.Expected("opening three quotation marks")
return "", false return "", false
} }
sb := &strings.Builder{} return p.Result.String(), true
for {
switch {
case p.Skip(newline):
sb.WriteString("\n")
case p.Peek(controlCharacter):
p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Byte(0))
return sb.String(), false
case p.Accept(validEscape):
if !appendEscapedRune(p, sb) {
return sb.String(), false
}
case p.Skip(lineEndingBackslash):
// NOOP
case p.Peek(a.Backslash):
p.SetError("invalid escape sequence")
return sb.String(), false
case p.Skip(closingMultiLineBasicString):
return sb.String(), true
case p.Accept(a.ValidRune):
sb.WriteString(p.Result.String())
case p.Peek(a.InvalidRune):
p.SetError("invalid UTF8 rune")
return sb.String(), false
default:
p.Expected("closing three quotation marks")
return sb.String(), false
}
}
} }
func appendEscapedRune(p *parse.API, sb *strings.Builder) bool { func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
s := p.Result.String() var state stringTokenizerState
switch s { in := tokenAPI.Input
case `\b`: out := tokenAPI.Output
sb.WriteRune('\b')
case `\t`: unicodeReqLen := 0
sb.WriteRune('\t') unicodeLen := 0
case `\n`: unicodeHex := make([]byte, 8)
sb.WriteRune('\n')
case `\f`: utf8ReqLen := 0
sb.WriteRune('\f') utf8Len := 0
case `\r`: utf8Rune := rune(0)
sb.WriteRune('\r') utf8Bytes := make([]byte, 4)
case `\"`:
sb.WriteRune('"') crlf := false
case `\\`:
sb.WriteRune('\\') for {
default: bs, _ := in.Byte.PeekBuffered(0)
// UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX. bslen := len(bs)
hex := s[2:] if bslen == 0 {
val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser
r := rune(val)
if !utf8.ValidRune(r) {
p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", s))
return false return false
} }
sb.WriteRune(r) for i := 0; i < bslen; i++ {
b := bs[i]
switch state {
case strStart, strStart2, strStart3:
if b != '"' {
// No triple opening quotes found.
return false
}
in.Byte.MoveCursor(b)
switch state {
case strStart:
state = strStart2
case strStart2:
state = strStart3
case strStart3:
state = strStart4
}
case strStart4:
if !crlf && b == '\r' {
crlf = true
in.Byte.MoveCursor(b)
continue
}
if b == '\n' {
in.Byte.MoveCursor(b)
state = strChar
continue
}
if crlf {
// Lonely \r without \n.
return false
}
state = strChar
fallthrough
case strChar:
switch {
case b == '\r':
state = strCRLF
continue
case b == '\n':
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
// Unescaped control character
// TODO error reporting instead of full reject
return false
case b == '\\':
in.Byte.MoveCursor(b)
state = strEscape
continue
case b == '"':
in.Byte.MoveCursor(b)
state = strEnd2
continue
}
switch b >> 4 {
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
utf8ReqLen = 2
utf8Rune = rune((b & lowest5bits) << 6)
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 3
utf8Rune = rune((b & lowest4bits) << 6)
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 4
utf8Rune = rune((b & lowest3bits) << 6)
default: // Invalid UTF8 rune
return false
}
utf8Bytes[0] = b
utf8Len = 1
state = strUTF8
case strUTF8:
// This should be a continuation byte (10xxxxxx)
if b>>6 != 2 {
// Invalid UTF8 rune
return false
}
utf8Bytes[utf8Len] = b
utf8Len++
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
if utf8Len == utf8ReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode character
return false
}
bytes := utf8Bytes[:utf8Len]
out.AddBytes(bytes...)
in.Byte.MoveCursorMulti(bytes...)
state = strChar
}
case strCRLF:
if b == '\n' {
in.Byte.MoveCursorMulti('\r', b)
out.AddByte('\n')
state = strChar
continue
}
// Lonely \r, should have been escaped.
return false
case strEscape:
state = strChar
if escaped, ok := getEscapedChar(b); ok {
out.AddByte(escaped)
in.Byte.MoveCursor(b)
continue
}
switch b {
case ' ', '\t':
in.Byte.MoveCursor(b)
state = strEscapeConcatWs1
continue
case '\r':
in.Byte.MoveCursor(b)
state = strEscapeConcatCRLF
continue
case '\n':
in.Byte.MoveCursor(b)
state = strEscapeConcatWs2
continue
case 'u', 'U':
in.Byte.MoveCursor(b)
unicodeReqLen = 4
if b == 'u' {
unicodeReqLen = 4
} else {
unicodeReqLen = 8
}
unicodeLen = 0
utf8Rune = 0
state = strEscapeUnicode
default:
// Invalid escape sequence used.
return false
}
case strEscapeConcatWs1:
switch b {
case ' ', '\t':
in.Byte.MoveCursor(b)
continue
case '\r':
in.Byte.MoveCursor(b)
state = strEscapeConcatCRLF
continue
case '\n':
in.Byte.MoveCursor(b)
state = strEscapeConcatWs2
continue
default:
// Invalid line concatenation
return false
}
case strEscapeConcatCRLF:
switch b {
case '\n':
in.Byte.MoveCursor(b)
state = strEscapeConcatWs2
continue
default:
// Invalid line concatenation
return false
}
case strEscapeConcatWs2:
switch b {
case ' ', '\t':
in.Byte.MoveCursor(b)
continue
case '\r':
in.Byte.MoveCursor(b)
state = strEscapeConcatCRLF
continue
case '\n':
in.Byte.MoveCursor(b)
state = strEscapeConcatWs2
continue
default:
i--
state = strChar
continue
}
case strEscapeUnicode:
value, ok := getHexValueForChar(b)
if !ok {
// Invalid unicode escape sequence used.
return false
}
utf8Rune = utf8Rune<<4 + rune(value)
unicodeHex[unicodeLen] = b
unicodeLen++
if unicodeLen == unicodeReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode escape
return false
}
in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
w := utf8.EncodeRune(utf8Bytes, utf8Rune)
out.AddBytes(utf8Bytes[:w]...)
state = strChar
}
case strEnd2:
if b == '"' {
state = strEnd3
in.Byte.MoveCursor(b)
} else {
state = strChar
out.AddByte('"')
i--
}
case strEnd3:
if b == '"' {
in.Byte.MoveCursor(b)
return true
}
state = strChar
out.AddBytes('"', '"')
i--
}
}
} }
return true
} }
// Specific handling of input for multi-line literal strings. // Specific handling of input for multi-line literal strings.
@ -265,30 +671,148 @@ func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
// //
// • Control characters other than tab and newline are not permitted in a multi-line literal string. // • Control characters other than tab and newline are not permitted in a multi-line literal string.
func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) { func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
if !p.Skip(openingMultiLineLiteralString) { if !p.Accept(multiLineLiteralStringHandler) {
p.Expected("opening three single quotes")
return "", false return "", false
} }
sb := &strings.Builder{} return p.Result.String(), true
}
func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
var state stringTokenizerState
in := tokenAPI.Input
out := tokenAPI.Output
utf8ReqLen := 0
utf8Len := 0
utf8Rune := rune(0)
utf8Bytes := make([]byte, 4)
crlf := false
for { for {
switch { bs, _ := in.Byte.PeekBuffered(0)
case p.Skip(closingMultiLineLiteralString): bslen := len(bs)
return sb.String(), true if bslen == 0 {
case p.Skip(a.Tab): return false
sb.WriteString("\t") }
case p.Skip(newline): for i := 0; i < bslen; i++ {
sb.WriteString("\n") b := bs[i]
case p.Peek(controlCharacter): switch state {
p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Byte(0)) case strStart, strStart2, strStart3:
return sb.String(), false if b != '\'' {
case p.Accept(a.ValidRune): // No triple opening quotes found.
sb.WriteString(p.Result.String()) return false
case p.Peek(a.InvalidRune): }
p.SetError("invalid UTF8 rune") in.Byte.MoveCursor(b)
return sb.String(), false switch state {
default: case strStart:
p.Expected("closing three single quotes") state = strStart2
return sb.String(), false case strStart2:
state = strStart3
case strStart3:
state = strStart4
}
case strStart4:
if !crlf && b == '\r' {
crlf = true
in.Byte.MoveCursor(b)
continue
}
if b == '\n' {
in.Byte.MoveCursor(b)
state = strChar
continue
}
if crlf {
// Lonely \r without \n.
return false
}
state = strChar
fallthrough
case strChar:
switch {
case b == '\r':
state = strCRLF
continue
case b == '\n' || b == '\t':
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case (b >= 0x00 && b <= 0x1F) || b == 0x7F:
// Unescaped control character
// TODO error reporting instead of full reject
return false
case b == '\'':
in.Byte.MoveCursor(b)
state = strEnd2
continue
}
switch b >> 4 {
case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
out.AddByte(b)
in.Byte.MoveCursor(b)
continue
case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
utf8ReqLen = 2
utf8Rune = rune((b & lowest5bits) << 6)
case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 3
utf8Rune = rune((b & lowest4bits) << 6)
case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
utf8ReqLen = 4
utf8Rune = rune((b & lowest3bits) << 6)
default: // Invalid UTF8 rune
return false
}
utf8Bytes[0] = b
utf8Len = 1
state = strUTF8
case strUTF8:
// This should be a continuation byte (10xxxxxx)
if b>>6 != 2 {
// Invalid UTF8 rune
return false
}
utf8Bytes[utf8Len] = b
utf8Len++
utf8Rune = utf8Rune<<6 + rune(b&lowest6bits)
if utf8Len == utf8ReqLen {
if !utf8.ValidRune(utf8Rune) {
// Invalid unicode character
return false
}
bytes := utf8Bytes[:utf8Len]
out.AddBytes(bytes...)
in.Byte.MoveCursorMulti(bytes...)
state = strChar
}
case strCRLF:
if b == '\n' {
in.Byte.MoveCursorMulti('\r', b)
out.AddByte('\n')
state = strChar
continue
}
// Lonely \r, should have been escaped.
return false
case strEnd2:
if b == '\'' {
state = strEnd3
in.Byte.MoveCursor(b)
} else {
state = strChar
out.AddByte('\'')
i--
}
case strEnd3:
if b == '\'' {
in.Byte.MoveCursor(b)
return true
}
state = strChar
out.AddBytes('\'', '\'')
i--
}
} }
} }
} }

View File

@ -79,7 +79,8 @@ func TestMultiLineBasicString(t *testing.T) {
{"x=\"\"\"\n\"\"\"", `{"x": ""}`, ``}, {"x=\"\"\"\n\"\"\"", `{"x": ""}`, ``},
{"x=\"\"\"\r\n\r\n\"\"\"", `{"x": "\n"}`, ``}, {"x=\"\"\"\r\n\r\n\"\"\"", `{"x": "\n"}`, ``},
{`x="""\"\"\"\""""`, `{"x": "\"\"\"\""}`, ``}, {`x="""\"\"\"\""""`, `{"x": "\"\"\"\""}`, ``},
{"x=\"\"\"\nThe quick brown \\\n\n\n \t fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``}, {"x=\"\"\"\nThe quick brown \\\r\n\r\n\n \t fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``},
{"x=\"\"\"\r\nThe quick brown \\\r\n\r\n\n \t\r\n \n\n fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``},
{"x=\"\"\"No control chars \f allowed\"\"\"", `{}`, `invalid character in multi-line basic string: '\f' (must be escaped) at line 1, column 23`}, {"x=\"\"\"No control chars \f allowed\"\"\"", `{}`, `invalid character in multi-line basic string: '\f' (must be escaped) at line 1, column 23`},
{"x=\"\"\"Escaping control chars\\nis valid\"\"\"", `{"x": "Escaping control chars\nis valid"}`, ``}, {"x=\"\"\"Escaping control chars\\nis valid\"\"\"", `{"x": "Escaping control chars\nis valid"}`, ``},
{"x=\"\"\"Invalid escaping \\is not allowed\"\"\"", `{}`, `invalid escape sequence at line 1, column 23`}, {"x=\"\"\"Invalid escaping \\is not allowed\"\"\"", `{}`, `invalid escape sequence at line 1, column 23`},