Wrote a first crude version of dedicated tokenizer handlers for string parsing. What is still missing is good error reporting from the tokenizer handler code (this has been a TODO for a while, so it is a nice one to implement after this).

2019-07-31 07:51:37 +00:00 · 2019-07-31 07:51:37 +00:00 · ed846c7e53
parent 5ff6f20ab7
commit ed846c7e53
6 changed files with 742 additions and 132 deletions
--- a/cmd/burntsushi-tester/test.toml
+++ b/cmd/burntsushi-tester/test.toml
@ -0,0 +1,7 @@
+regex2 = '''I [dw]on't need \d{2} apples'''
+lines  = '''
+The first newline is
+trimmed in raw strings.
+   All other whitespace
+   is preserved.
+'''
--- a/parse/benchmark_test.go
+++ b/parse/benchmark_test.go
@ -0,0 +1,76 @@
+package parse_test
+
+import (
+	"testing"
+)
+
+// A translates the character that follows the backslash of a compact escape
+// sequence into the byte that it represents. The boolean result is false
+// (and the byte 0x00) when b is not one of b, t, n, f, r, " or \.
+//
+// This variant first splits the candidates on b > 'b', so each of the two
+// switch statements only has to consider a part of the candidates.
+func A(b byte) (byte, bool) {
+	if b > 'b' {
+		switch b {
+		case 't':
+			return '\t', true
+		case 'n':
+			return '\n', true
+		case 'r':
+			return '\r', true
+		case 'f':
+			return '\f', true
+		}
+	} else {
+		switch b {
+		case '"':
+			return '"', true
+		case '\\':
+			return '\\', true
+		case 'b':
+			return '\b', true
+		}
+	}
+	return 0x00, false
+}
+
+// B translates the character that follows the backslash of a compact escape
+// sequence into the byte that it represents. The boolean result is false
+// (and the byte 0x00) when b is not one of r, n, t, b, f, " or \.
+//
+// This variant uses a single flat switch over all candidates.
+func B(b byte) (byte, bool) {
+	switch b {
+	case 'r':
+		return '\r', true
+	case 'n':
+		return '\n', true
+	case 't':
+		return '\t', true
+	case 'b':
+		return '\b', true
+	case 'f':
+		return '\f', true
+	case '"':
+		return '"', true
+	case '\\':
+		return '\\', true
+	}
+	return 0x00, false
+}
+
+// benchmarkSinkA receives the results of A inside Benchmark_A. Storing the
+// results in a package-level variable keeps the compiler from discarding
+// the calls as dead code, which would make the measurement meaningless.
+var benchmarkSinkA byte
+
+// Benchmark_A measures A for each of the supported escape characters.
+//
+// TODO cleanup unused benchmark.
+func Benchmark_A(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkA, _ = A('b')
+		benchmarkSinkA, _ = A('t')
+		benchmarkSinkA, _ = A('n')
+		benchmarkSinkA, _ = A('f')
+		benchmarkSinkA, _ = A('r')
+		benchmarkSinkA, _ = A('"')
+		benchmarkSinkA, _ = A('\\')
+	}
+}
+
+// benchmarkSinkB receives the results of B inside Benchmark_B. Storing the
+// results in a package-level variable keeps the compiler from discarding
+// the calls as dead code, which would make the measurement meaningless.
+var benchmarkSinkB byte
+
+// Benchmark_B measures B for each of the supported escape characters.
+//
+// TODO cleanup unused benchmark.
+func Benchmark_B(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		benchmarkSinkB, _ = B('b')
+		benchmarkSinkB, _ = B('t')
+		benchmarkSinkB, _ = B('n')
+		benchmarkSinkB, _ = B('f')
+		benchmarkSinkB, _ = B('r')
+		benchmarkSinkB, _ = B('"')
+		benchmarkSinkB, _ = B('\\')
+	}
+}
--- a/parse/keyvaluepair.go
+++ b/parse/keyvaluepair.go
@ -52,6 +52,8 @@ func (t *parser) startKeyValuePair(p *parse.API) {
 			} else if !p.Skip(endOfLineOrComment) {
 				p.Expected("end of line")
 			}
+		} else {
+			p.Expected("a value")
 		}
 	}
 }
--- a/parse/value.go
+++ b/parse/value.go
@ -6,9 +6,9 @@ import (
 )

 var (
-	detectString         = a.SingleQuote.Or(a.DoubleQuote)
-	detectBoolean        = a.Str("true").Or(a.Str("false"))
-	detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan"))
+	detectString         = a.Char('\'', '"')
+	detectBoolean        = a.Str("true").Or(a.Str("false"))                   // TODO use 't' or 'f' and let the boolean handler format errors on mismatch
+	detectNumberSpecials = c.Any(a.Plus, a.Minus, a.Str("inf"), a.Str("nan")) // TODO likewise as for boolean
 	detectDateTime       = a.Digits.Then(a.Minus.Or(a.Colon))
 	detectNumber         = a.Digit
 	detectArray          = a.SquareOpen
--- a/parse/value_string.go
+++ b/parse/value_string.go
@ -1,12 +1,10 @@
 package parse

 import (
-	"fmt"
-	"strconv"
-	"strings"
 	"unicode/utf8"

 	"git.makaay.nl/mauricem/go-parsekit/parse"
+	"git.makaay.nl/mauricem/go-parsekit/tokenize"
 	"git.makaay.nl/mauricem/go-toml/ast"
 )

@ -30,11 +28,6 @@ var (
 	// Opening and closing character for literal strings.
 	literalStringDelimiter = a.SingleQuote

-	// Control characters as defined by TOML (U+0000 to U+001F, U+007F)
-
-	isControlCharacter = func(b byte) bool { return (b >= 0x00 && b <= 0x1F) || b == 0x7F }
-	controlCharacter   = a.ByteByCallback(isControlCharacter)
-
 	// For convenience, some popular characters have a compact escape sequence.
 	//
 	// \b         - backspace       (U+0008)
@ -96,36 +89,202 @@ func (t *parser) parseString(p *parse.API) (*ast.Value, bool) {
 // "All other escape sequences [..] are reserved and, if used, TOML should
 // produce an error."
 func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
-	if !p.Skip(a.DoubleQuote) {
-		p.Expected(`opening quotation marks`)
+	// The complete string, delimiters included, is matched by one dedicated
+	// tokenizer handler. On a mismatch, no error is set from here.
+	// NOTE(review): name is not used anymore, since the handler does not
+	// report errors yet. The result is presumably the handler's output.
+	if !p.Accept(basicStringHandler) {
 		return "", false
 	}
-	sb := &strings.Builder{}
+	return p.Result.String(), true
+}
+
+// stringTokenizerState identifies the state of the state machines that are
+// implemented by the string tokenizer handlers.
+type stringTokenizerState int
+
+const (
+	strStart            stringTokenizerState = iota // expecting the 1st opening quote
+	strStart2                                       // expecting the 2nd opening quote (multi-line)
+	strStart3                                       // expecting the 3rd opening quote (multi-line)
+	strStart4                                       // directly after the opening quotes (multi-line), where a first newline is skipped
+	strChar                                         // reading regular string contents
+	strEscape                                       // a backslash was read, expecting the escape character
+	strEscapeUnicode                                // reading the hex digits of a \u or \U escape
+	strEscapeConcatWs1                              // line ending backslash: whitespace before the newline
+	strEscapeConcatCRLF                             // line ending backslash: \r was read, expecting \n
+	strEscapeConcatWs2                              // line ending backslash: whitespace and newlines after the first newline
+	strCRLF                                         // \r was read in the string contents, expecting \n
+	strUTF8                                         // reading the continuation bytes of a multi-byte UTF8 character
+	strEnd2                                         // one closing quote was read (multi-line)
+	strEnd3                                         // two closing quotes were read (multi-line)
+)
+
+// Bit masks that select the lowest N bits of a byte.
+const (
+	lowest6bits = 0x3F // 0011 1111
+	lowest5bits = 0x1F // 0001 1111
+	lowest4bits = 0x0F // 0000 1111
+	lowest3bits = 0x07 // 0000 0111
+)
+
+// basicStringHandler is a tokenizer handler that matches a complete basic
+// string: an opening double quote, the string contents and a closing
+// double quote.
+//
+// The decoded contents are written to the tokenizer output: compact escapes
+// (\b, \t, \n, \f, \r, \" and \\) and unicode escapes (\uXXXX, \UXXXXXXXX)
+// are replaced by the bytes that they represent, all other bytes are copied
+// as-is. The delimiters are not part of the output.
+//
+// It returns true when the closing quote was read. It returns false on a
+// missing opening quote, an unescaped control character, an unknown escape
+// sequence, an invalid unicode escape, malformed UTF8 or when no buffered
+// input is available.
+//
+// NOTE(review): the bytes of a multi-byte UTF8 character and the hex digits
+// of a unicode escape are only consumed from the input once the sequence is
+// complete. This presumably relies on PeekBuffered returning such a
+// sequence in one piece - verify against the tokenize package.
+func basicStringHandler(tokenAPI *tokenize.API) bool {
+	var state stringTokenizerState
+	in := tokenAPI.Input
+	out := tokenAPI.Output
+
+	unicodeReqLen := 0
+	unicodeLen := 0
+	unicodeHex := make([]byte, 8)
+
+	utf8ReqLen := 0
+	utf8Len := 0
+	escapedRune := rune(0)
+	utf8Bytes := make([]byte, 4)
+
 	for {
+		bs, _ := in.Byte.PeekBuffered(0)
+		bslen := len(bs)
+		if bslen == 0 {
+			return false
+		}
+		for i := 0; i < bslen; i++ {
+			b := bs[i]
+			switch state {
+			case strStart:
+				if b != '"' {
+					// No opening quotes found.
+					return false
+				}
+				in.Byte.MoveCursor(b)
+				state = strChar
+			case strChar:
 				switch {
-		case p.Peek(controlCharacter):
-			p.SetError("invalid character in %s: %q (must be escaped)", name, p.Result.Byte(0))
-			return sb.String(), false
-		case p.Accept(validEscape):
-			if !appendEscapedRune(p, sb) {
-				return sb.String(), false
+				case b <= 0x1F || b == 0x7F:
+					// Control characters as defined by the TOML specification
+					// (U+0000 to U+001F, U+007F). These must always be escaped.
+					// TODO error reporting instead of full reject
+					return false
+				case b == '\\':
+					in.Byte.MoveCursor(b)
+					state = strEscape
+					continue
+				case b == '"':
+					in.Byte.MoveCursor(b)
+					return true
 				}
-		case p.Peek(a.Backslash):
-			p.SetError("invalid escape sequence")
-			return sb.String(), false
-		case p.Skip(basicStringDelimiter):
-			return sb.String(), true
-		case p.Peek(a.InvalidRune):
-			p.SetError("invalid UTF8 rune")
-			return sb.String(), false
-		case p.Accept(a.ValidRune):
-			sb.WriteString(p.Result.String())
+				// The high nibble of the first byte tells how many bytes
+				// the UTF8 character takes.
+				switch b >> 4 {
+				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
+					utf8ReqLen = 2
+				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 3
+				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 4
+				default: // Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[0] = b
+				utf8Len = 1
+				state = strUTF8
+			case strUTF8:
+				// This should be a continuation byte (10xxxxxx)
+				if b>>6 != 2 {
+					// Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[utf8Len] = b
+				utf8Len++
+				if utf8Len == utf8ReqLen {
+					bytes := utf8Bytes[:utf8Len]
+					// Validating the collected bytes as a whole also rejects
+					// overlong encodings, encoded surrogates, code points
+					// beyond U+10FFFF and invalid lead bytes.
+					if !utf8.Valid(bytes) {
+						// Invalid unicode character
+						return false
+					}
+					out.AddBytes(bytes...)
+					in.Byte.MoveCursorMulti(bytes...)
+					state = strChar
+				}
+			case strEscape:
+				state = strChar
+				if escaped, ok := getEscapedChar(b); ok {
+					out.AddByte(escaped)
+					in.Byte.MoveCursor(b)
+					continue
+				}
+				switch b {
+				case 'u', 'U':
+					in.Byte.MoveCursor(b)
+					// \uXXXX takes 4 hex digits, \UXXXXXXXX takes 8.
+					unicodeReqLen = 4
+					if b == 'U' {
+						unicodeReqLen = 8
+					}
+					unicodeLen = 0
+					escapedRune = 0
+					state = strEscapeUnicode
 				default:
-			p.Expected(`closing quotation marks`)
-			return sb.String(), false
+					// Invalid escape sequence used.
+					return false
+				}
+			case strEscapeUnicode:
+				value, ok := getHexValueForChar(b)
+				if !ok {
+					// Invalid unicode escape sequence used.
+					return false
+				}
+				escapedRune = escapedRune<<4 + rune(value)
+				unicodeHex[unicodeLen] = b
+				unicodeLen++
+				if unicodeLen == unicodeReqLen {
+					if !utf8.ValidRune(escapedRune) {
+						// Invalid unicode escape
+						return false
+					}
+					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
+					w := utf8.EncodeRune(utf8Bytes, escapedRune)
+					out.AddBytes(utf8Bytes[:w]...)
+					state = strChar
 				}
 			}
 		}
+	}
+}
+
+// getHexValueForChar returns the numeric value (0 - 15) of the hexadecimal
+// digit b. Both lower case and upper case digits are supported. The boolean
+// result is false when b is not a hexadecimal digit.
+func getHexValueForChar(b byte) (byte, bool) {
+	switch {
+	case '0' <= b && b <= '9':
+		return b - '0', true
+	case 'a' <= b && b <= 'f':
+		return b - 'a' + 10, true
+	case 'A' <= b && b <= 'F':
+		return b - 'A' + 10, true
+	default:
+		return 0, false
+	}
+}
+
+// getEscapedChar resolves the character that follows the backslash of a
+// compact escape sequence (\b, \t, \n, \f, \r, \" or \\) into the byte that
+// the sequence stands for. For any other character, it returns 0 and false.
+func getEscapedChar(b byte) (byte, bool) {
+	var unescaped byte
+	switch b {
+	case '"', '\\':
+		// These two escape sequences stand for the character itself.
+		unescaped = b
+	case 'r':
+		unescaped = '\r'
+	case 'f':
+		unescaped = '\f'
+	case 'n':
+		unescaped = '\n'
+	case 't':
+		unescaped = '\t'
+	case 'b':
+		unescaped = '\b'
+	default:
+		return 0, false
+	}
+	return unescaped, true
+}

 // Specific handling of input for literal strings.
 //
@ -135,28 +294,88 @@ func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) {
 //
 // • Control characters other than tab are not permitted in a literal string.
 func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
-	if !p.Skip(a.SingleQuote) {
-		p.Expected("opening single quote")
+	// The complete string, delimiters included, is matched by one dedicated
+	// tokenizer handler. On a mismatch, no error is set from here.
+	// NOTE(review): name is not used anymore, since the handler does not
+	// report errors yet. The result is presumably the handler's output.
+	if !p.Accept(literalStringHandler) {
 		return "", false
 	}
-	sb := &strings.Builder{}
+	return p.Result.String(), true
+}
+
+// literalStringHandler is a tokenizer handler that matches a complete
+// literal string: an opening single quote, the string contents and a
+// closing single quote.
+//
+// The contents are copied to the tokenizer output as-is (there is no
+// escaping in a literal string). The delimiters are not part of the output.
+//
+// It returns true when the closing quote was read. It returns false on a
+// missing opening quote, a control character other than tab, malformed UTF8
+// or when no buffered input is available.
+//
+// NOTE(review): the bytes of a multi-byte UTF8 character are only consumed
+// from the input once the character is complete. This presumably relies on
+// PeekBuffered returning such a character in one piece - verify against the
+// tokenize package.
+func literalStringHandler(tokenAPI *tokenize.API) bool {
+	var state stringTokenizerState
+	in := tokenAPI.Input
+	out := tokenAPI.Output
+
+	utf8ReqLen := 0
+	utf8Len := 0
+	utf8Bytes := [4]byte{}
+
 	for {
+		bs, _ := in.Byte.PeekBuffered(0)
+		bslen := len(bs)
+		if bslen == 0 {
+			// Unexpected end of file.
+			return false
+		}
+		for i := 0; i < bslen; i++ {
+			b := bs[i]
+			switch state {
+			case strStart:
+				if b != '\'' {
+					// No opening quote found.
+					return false
+				}
+				in.Byte.MoveCursor(b)
+				state = strChar
+			case strChar:
 				switch {
-		case p.Skip(literalStringDelimiter):
-			return sb.String(), true
-		case p.Skip(a.Tab):
-			sb.WriteString("\t")
-		case p.Peek(controlCharacter):
-			p.SetError("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result.Byte(0))
-			return sb.String(), false
-		case p.Peek(a.InvalidRune):
-			p.SetError("invalid UTF8 rune")
-			return sb.String(), false
-		case p.Accept(a.ValidRune):
-			sb.WriteString(p.Result.String())
-		default:
-			p.Expected("closing single quote")
-			return sb.String(), false
+				case b < 0x09 || (b > 0x09 && b <= 0x1F) || b == 0x7F:
+					// Control character other than tab (0x09).
+					return false
+				case b == '\'':
+					in.Byte.MoveCursor(b)
+					return true
+				}
+				// The high nibble of the first byte tells how many bytes
+				// the UTF8 character takes.
+				switch b >> 4 {
+				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
+					utf8ReqLen = 2
+				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 3
+				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 4
+				default: // Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[0] = b
+				utf8Len = 1
+				state = strUTF8
+			case strUTF8:
+				// This should be a continuation byte (10xxxxxx)
+				if b>>6 != 2 {
+					// Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[utf8Len] = b
+				utf8Len++
+				if utf8Len == utf8ReqLen {
+					bytes := utf8Bytes[:utf8Len]
+					// Validating the collected bytes as a whole also rejects
+					// overlong encodings, encoded surrogates, code points
+					// beyond U+10FFFF and invalid lead bytes.
+					if !utf8.Valid(bytes) {
+						// Invalid unicode character
+						return false
+					}
+					out.AddBytes(bytes...)
+					in.Byte.MoveCursorMulti(bytes...)
+					state = strChar
+				}
+			}
 		}
 	}
 }
@ -185,71 +404,258 @@ func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) {
 // a \, it will be trimmed along with all whitespace (including newlines) up to
 // the next non-whitespace character or closing delimiter.
 func (t *parser) parseMultiLineBasicString(p *parse.API) (string, bool) {
-	if !p.Skip(openingMultiLineBasicString) {
-		p.Expected("opening three quotation marks")
+	// The complete string, delimiters included, is matched by one dedicated
+	// tokenizer handler. On a mismatch, no error is set from here.
+	// NOTE(review): the result is presumably the handler's output.
+	if !p.Accept(multiLineBasicStringHandler) {
 		return "", false
 	}
-	sb := &strings.Builder{}
-	for {
-		switch {
-		case p.Skip(newline):
-			sb.WriteString("\n")
-		case p.Peek(controlCharacter):
-			p.SetError("invalid character in multi-line basic string: %q (must be escaped)", p.Result.Byte(0))
-			return sb.String(), false
-		case p.Accept(validEscape):
-			if !appendEscapedRune(p, sb) {
-				return sb.String(), false
-			}
-		case p.Skip(lineEndingBackslash):
-			// NOOP
-		case p.Peek(a.Backslash):
-			p.SetError("invalid escape sequence")
-			return sb.String(), false
-		case p.Skip(closingMultiLineBasicString):
-			return sb.String(), true
-		case p.Accept(a.ValidRune):
-			sb.WriteString(p.Result.String())
-		case p.Peek(a.InvalidRune):
-			p.SetError("invalid UTF8 rune")
-			return sb.String(), false
-		default:
-			p.Expected("closing three quotation marks")
-			return sb.String(), false
-		}
-	}
+	return p.Result.String(), true
 }

-func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
-	s := p.Result.String()
-	switch s {
-	case `\b`:
-		sb.WriteRune('\b')
-	case `\t`:
-		sb.WriteRune('\t')
-	case `\n`:
-		sb.WriteRune('\n')
-	case `\f`:
-		sb.WriteRune('\f')
-	case `\r`:
-		sb.WriteRune('\r')
-	case `\"`:
-		sb.WriteRune('"')
-	case `\\`:
-		sb.WriteRune('\\')
-	default:
-		// UTF8 escape code: \uXXXX or \UXXXXXXXXXXXX.
-		hex := s[2:]
-		val, _ := strconv.ParseUint(hex, 16, 32) // hex format already validated by parser
-		r := rune(val)
-		if !utf8.ValidRune(r) {
-			p.SetError(fmt.Sprintf("invalid UTF8 escape '%s'", s))
+// multiLineBasicStringHandler is a tokenizer handler that matches a
+// complete multi-line basic string: three opening double quotes, the string
+// contents and three closing double quotes.
+//
+// The decoded contents are written to the tokenizer output:
+//
+// • A newline (\n or \r\n) directly after the opening quotes is skipped.
+//
+// • A \r\n newline in the contents is written as \n.
+//
+// • Compact escapes (\b, \t, \n, \f, \r, \" and \\) and unicode escapes
+// (\uXXXX, \UXXXXXXXX) are replaced by the bytes that they represent.
+//
+// • A line ending backslash is dropped, together with all whitespace and
+// newlines that follow it.
+//
+// • One or two double quotes that are not followed by a third one are
+// part of the contents.
+//
+// It returns true when the closing quotes were read. It returns false on
+// missing opening quotes, an unescaped control character, a \r without \n,
+// an unknown escape sequence, an invalid unicode escape, malformed UTF8 or
+// when no buffered input is available.
+//
+// NOTE(review): some bytes (multi-byte UTF8 characters, the hex digits of
+// a unicode escape, the \r of a \r\n) are only consumed from the input once
+// their sequence is complete. This presumably relies on PeekBuffered
+// returning such a sequence in one piece - verify against the tokenize
+// package.
+func multiLineBasicStringHandler(tokenAPI *tokenize.API) bool {
+	var state stringTokenizerState
+	in := tokenAPI.Input
+	out := tokenAPI.Output
+
+	unicodeReqLen := 0
+	unicodeLen := 0
+	unicodeHex := make([]byte, 8)
+
+	utf8ReqLen := 0
+	utf8Len := 0
+	escapedRune := rune(0)
+	utf8Bytes := make([]byte, 4)
+
+	crlf := false
+
+	for {
+		bs, _ := in.Byte.PeekBuffered(0)
+		bslen := len(bs)
+		if bslen == 0 {
 			return false
 		}
-		sb.WriteRune(r)
+		for i := 0; i < bslen; i++ {
+			b := bs[i]
+			switch state {
+			case strStart, strStart2, strStart3:
+				if b != '"' {
+					// No triple opening quotes found.
+					return false
 				}
+				in.Byte.MoveCursor(b)
+				switch state {
+				case strStart:
+					state = strStart2
+				case strStart2:
+					state = strStart3
+				case strStart3:
+					state = strStart4
+				}
+			case strStart4:
+				// A newline directly after the opening quotes is skipped.
+				if !crlf && b == '\r' {
+					crlf = true
+					in.Byte.MoveCursor(b)
+					continue
+				}
+				if b == '\n' {
+					in.Byte.MoveCursor(b)
+					state = strChar
+					continue
+				}
+				if crlf {
+					// Lonely \r without \n.
+					return false
+				}
+				// No newline here, handle the byte as regular contents.
+				state = strChar
+				fallthrough
+			case strChar:
+				switch {
+				case b == '\r':
+					// The \r is consumed together with its \n.
+					state = strCRLF
+					continue
+				case b == '\n':
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case b <= 0x1F || b == 0x7F:
+					// Unescaped control character
+					// TODO error reporting instead of full reject
+					return false
+				case b == '\\':
+					in.Byte.MoveCursor(b)
+					state = strEscape
+					continue
+				case b == '"':
+					in.Byte.MoveCursor(b)
+					state = strEnd2
+					continue
+				}
+				// The high nibble of the first byte tells how many bytes
+				// the UTF8 character takes.
+				switch b >> 4 {
+				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
+					utf8ReqLen = 2
+				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 3
+				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 4
+				default: // Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[0] = b
+				utf8Len = 1
+				state = strUTF8
+			case strUTF8:
+				// This should be a continuation byte (10xxxxxx)
+				if b>>6 != 2 {
+					// Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[utf8Len] = b
+				utf8Len++
+				if utf8Len == utf8ReqLen {
+					bytes := utf8Bytes[:utf8Len]
+					// Validating the collected bytes as a whole also rejects
+					// overlong encodings, encoded surrogates, code points
+					// beyond U+10FFFF and invalid lead bytes.
+					if !utf8.Valid(bytes) {
+						// Invalid unicode character
+						return false
+					}
+					out.AddBytes(bytes...)
+					in.Byte.MoveCursorMulti(bytes...)
+					state = strChar
+				}
+			case strCRLF:
+				if b == '\n' {
+					in.Byte.MoveCursorMulti('\r', b)
+					out.AddByte('\n')
+					state = strChar
+					continue
+				}
+				// Lonely \r, should have been escaped.
+				return false
+			case strEscape:
+				state = strChar
+				if escaped, ok := getEscapedChar(b); ok {
+					out.AddByte(escaped)
+					in.Byte.MoveCursor(b)
+					continue
+				}
+				switch b {
+				case ' ', '\t':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatWs1
+					continue
+				case '\r':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatCRLF
+					continue
+				case '\n':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatWs2
+					continue
+				case 'u', 'U':
+					in.Byte.MoveCursor(b)
+					// \uXXXX takes 4 hex digits, \UXXXXXXXX takes 8.
+					unicodeReqLen = 4
+					if b == 'U' {
+						unicodeReqLen = 8
+					}
+					unicodeLen = 0
+					escapedRune = 0
+					state = strEscapeUnicode
+				default:
+					// Invalid escape sequence used.
+					return false
+				}
+			case strEscapeConcatWs1:
+				switch b {
+				case ' ', '\t':
+					in.Byte.MoveCursor(b)
+					continue
+				case '\r':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatCRLF
+					continue
+				case '\n':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatWs2
+					continue
+				default:
+					// Invalid line concatenation
+					return false
+				}
+			case strEscapeConcatCRLF:
+				switch b {
+				case '\n':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatWs2
+					continue
+				default:
+					// Invalid line concatenation
+					return false
+				}
+			case strEscapeConcatWs2:
+				switch b {
+				case ' ', '\t':
+					in.Byte.MoveCursor(b)
+					continue
+				case '\r':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatCRLF
+					continue
+				case '\n':
+					in.Byte.MoveCursor(b)
+					state = strEscapeConcatWs2
+					continue
+				default:
+					// End of the trimmed whitespace. Step back, to handle
+					// this byte as regular contents in the next iteration.
+					i--
+					state = strChar
+					continue
+				}
+			case strEscapeUnicode:
+				value, ok := getHexValueForChar(b)
+				if !ok {
+					// Invalid unicode escape sequence used.
+					return false
+				}
+				escapedRune = escapedRune<<4 + rune(value)
+				unicodeHex[unicodeLen] = b
+				unicodeLen++
+				if unicodeLen == unicodeReqLen {
+					if !utf8.ValidRune(escapedRune) {
+						// Invalid unicode escape
+						return false
+					}
+					in.Byte.MoveCursorMulti(unicodeHex[:unicodeLen]...)
+					w := utf8.EncodeRune(utf8Bytes, escapedRune)
+					out.AddBytes(utf8Bytes[:w]...)
+					state = strChar
+				}
+			case strEnd2:
+				if b == '"' {
+					state = strEnd3
+					in.Byte.MoveCursor(b)
+				} else {
+					// A single quote is part of the contents. Step back, to
+					// handle this byte as regular contents.
+					state = strChar
+					out.AddByte('"')
+					i--
+				}
+			case strEnd3:
+				if b == '"' {
+					in.Byte.MoveCursor(b)
 					return true
 				}
+				// Two quotes are part of the contents. Step back, to
+				// handle this byte as regular contents.
+				state = strChar
+				out.AddBytes('"', '"')
+				i--
+			}
+		}
+	}
+}

 // Specific handling of input for multi-line literal strings.
 //
@ -265,30 +671,148 @@ func appendEscapedRune(p *parse.API, sb *strings.Builder) bool {
 //
 // • Control characters other than tab and newline are not permitted in a multi-line literal string.
 func (t *parser) parseMultiLineLiteralString(p *parse.API) (string, bool) {
-	if !p.Skip(openingMultiLineLiteralString) {
-		p.Expected("opening three single quotes")
+	// The complete string, delimiters included, is matched by one dedicated
+	// tokenizer handler. On a mismatch, no error is set from here.
+	// NOTE(review): the result is presumably the handler's output.
+	if !p.Accept(multiLineLiteralStringHandler) {
 		return "", false
 	}
-	sb := &strings.Builder{}
+	return p.Result.String(), true
+}
+
+// multiLineLiteralStringHandler is a tokenizer handler that matches a
+// complete multi-line literal string: three opening single quotes, the
+// string contents and three closing single quotes.
+//
+// The contents are copied to the tokenizer output, with these exceptions:
+//
+// • A newline (\n or \r\n) directly after the opening quotes is skipped.
+//
+// • A \r\n newline in the contents is written as \n.
+//
+// • One or two single quotes that are not followed by a third one are
+// part of the contents.
+//
+// It returns true when the closing quotes were read. It returns false on
+// missing opening quotes, a control character other than tab and newline,
+// a \r without \n, malformed UTF8 or when no buffered input is available.
+//
+// NOTE(review): some bytes (multi-byte UTF8 characters, the \r of a \r\n)
+// are only consumed from the input once their sequence is complete. This
+// presumably relies on PeekBuffered returning such a sequence in one
+// piece - verify against the tokenize package.
+func multiLineLiteralStringHandler(tokenAPI *tokenize.API) bool {
+	var state stringTokenizerState
+	in := tokenAPI.Input
+	out := tokenAPI.Output
+
+	utf8ReqLen := 0
+	utf8Len := 0
+	utf8Bytes := make([]byte, 4)
+
+	crlf := false
+
 	for {
+		bs, _ := in.Byte.PeekBuffered(0)
+		bslen := len(bs)
+		if bslen == 0 {
+			return false
+		}
+		for i := 0; i < bslen; i++ {
+			b := bs[i]
+			switch state {
+			case strStart, strStart2, strStart3:
+				if b != '\'' {
+					// No triple opening quotes found.
+					return false
+				}
+				in.Byte.MoveCursor(b)
+				switch state {
+				case strStart:
+					state = strStart2
+				case strStart2:
+					state = strStart3
+				case strStart3:
+					state = strStart4
+				}
+			case strStart4:
+				// A newline directly after the opening quotes is skipped.
+				if !crlf && b == '\r' {
+					crlf = true
+					in.Byte.MoveCursor(b)
+					continue
+				}
+				if b == '\n' {
+					in.Byte.MoveCursor(b)
+					state = strChar
+					continue
+				}
+				if crlf {
+					// Lonely \r without \n.
+					return false
+				}
+				// No newline here, handle the byte as regular contents.
+				state = strChar
+				fallthrough
+			case strChar:
 				switch {
-		case p.Skip(closingMultiLineLiteralString):
-			return sb.String(), true
-		case p.Skip(a.Tab):
-			sb.WriteString("\t")
-		case p.Skip(newline):
-			sb.WriteString("\n")
-		case p.Peek(controlCharacter):
-			p.SetError("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result.Byte(0))
-			return sb.String(), false
-		case p.Accept(a.ValidRune):
-			sb.WriteString(p.Result.String())
-		case p.Peek(a.InvalidRune):
-			p.SetError("invalid UTF8 rune")
-			return sb.String(), false
-		default:
-			p.Expected("closing three single quotes")
-			return sb.String(), false
+				case b == '\r':
+					// The \r is consumed together with its \n.
+					state = strCRLF
+					continue
+				case b == '\n' || b == '\t':
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case b <= 0x1F || b == 0x7F:
+					// Unescaped control character
+					// TODO error reporting instead of full reject
+					return false
+				case b == '\'':
+					in.Byte.MoveCursor(b)
+					state = strEnd2
+					continue
+				}
+				// The high nibble of the first byte tells how many bytes
+				// the UTF8 character takes.
+				switch b >> 4 {
+				case 0, 1, 2, 3, 4, 5, 6, 7: // 1 byte UTF8 (0xxxxxxx, a.k.a. ASCII)
+					out.AddByte(b)
+					in.Byte.MoveCursor(b)
+					continue
+				case 12, 13: // 2 byte UTF8 (110xxxxx 10xxxxxx)
+					utf8ReqLen = 2
+				case 14: // 3 byte UTF8 (1110xxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 3
+				case 15: // 4 byte UTF8 (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+					utf8ReqLen = 4
+				default: // Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[0] = b
+				utf8Len = 1
+				state = strUTF8
+			case strUTF8:
+				// This should be a continuation byte (10xxxxxx)
+				if b>>6 != 2 {
+					// Invalid UTF8 rune
+					return false
+				}
+				utf8Bytes[utf8Len] = b
+				utf8Len++
+				if utf8Len == utf8ReqLen {
+					bytes := utf8Bytes[:utf8Len]
+					// Validating the collected bytes as a whole also rejects
+					// overlong encodings, encoded surrogates, code points
+					// beyond U+10FFFF and invalid lead bytes.
+					if !utf8.Valid(bytes) {
+						// Invalid unicode character
+						return false
+					}
+					out.AddBytes(bytes...)
+					in.Byte.MoveCursorMulti(bytes...)
+					state = strChar
+				}
+			case strCRLF:
+				if b == '\n' {
+					in.Byte.MoveCursorMulti('\r', b)
+					out.AddByte('\n')
+					state = strChar
+					continue
+				}
+				// Lonely \r, should have been escaped.
+				return false
+			case strEnd2:
+				if b == '\'' {
+					state = strEnd3
+					in.Byte.MoveCursor(b)
+				} else {
+					// A single quote is part of the contents. Step back, to
+					// handle this byte as regular contents.
+					state = strChar
+					out.AddByte('\'')
+					i--
+				}
+			case strEnd3:
+				if b == '\'' {
+					in.Byte.MoveCursor(b)
+					return true
+				}
+				// Two quotes are part of the contents. Step back, to
+				// handle this byte as regular contents.
+				state = strChar
+				out.AddBytes('\'', '\'')
+				i--
+			}
 		}
 	}
 }
--- a/parse/value_string_test.go
+++ b/parse/value_string_test.go
@ -79,7 +79,8 @@ func TestMultiLineBasicString(t *testing.T) {
 		{"x=\"\"\"\n\"\"\"", `{"x": ""}`, ``},
 		{"x=\"\"\"\r\n\r\n\"\"\"", `{"x": "\n"}`, ``},
 		{`x="""\"\"\"\""""`, `{"x": "\"\"\"\""}`, ``},
-		{"x=\"\"\"\nThe quick brown \\\n\n\n  \t  fox jumps over \\\n\t the lazy dog.\\\n   \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``},
+		{"x=\"\"\"\nThe quick brown \\\r\n\r\n\n  \t  fox jumps over \\\n\t the lazy dog.\\\n   \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``},
+		{"x=\"\"\"\r\nThe quick brown \\\r\n\r\n\n  \t\r\n  \n\n  fox jumps over \\\n\t the lazy dog.\\\n   \"\"\"", `{"x": "The quick brown fox jumps over the lazy dog."}`, ``},
 		{"x=\"\"\"No control chars \f allowed\"\"\"", `{}`, `invalid character in multi-line basic string: '\f' (must be escaped) at line 1, column 23`},
 		{"x=\"\"\"Escaping control chars\\nis valid\"\"\"", `{"x": "Escaping control chars\nis valid"}`, ``},
 		{"x=\"\"\"Invalid escaping \\is not allowed\"\"\"", `{}`, `invalid escape sequence at line 1, column 23`},