Simplify, simplify, simplify, and make handling of invalid UTF8 or unexpected end of file more robust.

This commit is contained in:
Maurice Makaay 2019-05-16 23:26:43 +00:00
parent dc47ac3b71
commit 29a13834dd
3 changed files with 154 additions and 143 deletions

View File

@ -1,7 +1,6 @@
package lexer
import (
"errors"
"fmt"
"strings"
"unicode/utf8"
@ -12,12 +11,28 @@ type Lexer struct {
input string // the scanned input string
state stateFn // a function that handles the current state
stack []stateFn // state function stack, for nested parsing
pos int // current scanning position in the input
pos int // current byte scanning position in the input
newline bool // keep track of when we have scanned a newline
linenr int // current line number in the input
linepos int // current position in the input line
width int // width of the last rune read, for supporting backup()
buffer StringBuffer // an efficient buffer, used to build string values
items chan Item // channel of resulting lexer items
nextItem Item // the current item as reached by Next() and retrieved by Get()
err error // an error message when lexing failed, retrieved by Error()
err *Error // an error when lexing failed, retrieved by Error()
}
// Error is the error type that the lexer reports when lexing fails.
// Besides the message, it carries the position information that was
// recorded for the failure, so callers can build useful error messages
// for the user.
type Error struct {
	Message string // description of what went wrong
	LineNr  int    // line number recorded for the error
	LinePos int    // position within that line recorded for the error
}

// Error implements the built-in error interface. Only the message is
// returned; the position is available through the LineNr and LinePos
// fields.
func (e *Error) Error() string {
	return e.Message
}
// Lex takes an input string and initializes the TOML lexer for it.
@ -53,7 +68,7 @@ func (l *Lexer) Next() bool {
return false
}
if i.Type == ItemError {
l.err = errors.New(i.Value)
l.err = &Error{i.Value, l.linenr, l.linepos}
return false
}
l.nextItem = i
@ -64,7 +79,7 @@ func (l *Lexer) Next() bool {
}
}
func (l *Lexer) Error() error {
func (l *Lexer) Error() *Error {
return l.err
}
@ -76,7 +91,7 @@ func (l *Lexer) Get() Item {
// ToArray returns lexer items as an array.
// When an error occurs during scanning, a partial result will be
// returned, accompanied by the error that occurred.
func (l *Lexer) ToArray() ([]Item, error) {
func (l *Lexer) ToArray() ([]Item, *Error) {
var items []Item
for l.Next() {
items = append(items, l.Get())
@ -136,10 +151,16 @@ func (l *Lexer) emitInterpreted(t itemType) error {
return nil
}
// emitError emits a lexer error item back to the client.
// The message is sent as the value of an ItemError item, through the
// same emit call that is used for regular lexer items.
func (l *Lexer) emitError(message string) {
l.emit(ItemError, message)
}
// backup steps back one rune: the byte position is moved back by the
// width of the last rune read, and the line position is decremented
// by one.
// Can be called only once per call of next.
// NOTE(review): the line number and the newline flag are not touched
// here - confirm that backing up over a line break never happens.
func (l *Lexer) backup() {
l.pos -= l.width
l.linepos--
}
// peek returns but does not advance to the next rune(s) in the input.
@ -159,31 +180,31 @@ func (l *Lexer) peek() (rune, int, bool) {
// Returns a slice of runes and a boolean. The boolean will be false in case
// fewer upcoming runes can be peeked than the requested amount
// (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
offset := 0
func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) {
width := 0
var peeked []rune
for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:])
r, w := utf8.DecodeRuneInString(l.input[l.pos+width:])
switch {
case r == utf8.RuneError:
return peeked, false
return peeked, 0, false
default:
offset += w
width += w
peeked = append(peeked, r)
}
}
return peeked, true
return peeked, width, true
}
// acceptNext adds the specified amount of runes from the input to the string buffer.
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
func (l *Lexer) acceptNext(count int) bool {
for i := 0; i < count; i++ {
r := l.next()
if r == endOfFile || r == utf8.RuneError {
if r, ok := l.next(); ok {
l.buffer.WriteRune(r)
} else {
return false
}
l.buffer.WriteRune(r)
}
return true
}
@ -191,22 +212,22 @@ func (l *Lexer) acceptNext(count int) bool {
// acceptFrom adds the next rune from the input to the string buffer
// when it matches in the provided runes. If the next rune does
// not match, false is returned.
func (l *Lexer) acceptFrom(runes string) bool {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
l.buffer.WriteRune(r)
return true
}
l.backup()
return false
}
// func (l *Lexer) acceptFrom(runes string) bool {
// r, ok := l.next()
// if strings.IndexRune(runes, r) >= 0 {
// l.buffer.WriteRune(r)
// return true
// }
// l.backup()
// return false
// }
// acceptRun adds consecutive runes from the input to the string
// buffer when they match the provided runes. If no runes were added
// at all, false is returned.
func (l *Lexer) acceptRun(runes string) bool {
func (l *Lexer) acceptRun(match string) bool {
accepted := false
for l.acceptFrom(runes) {
for l.accept(match) {
accepted = true
}
return accepted
@ -215,38 +236,49 @@ func (l *Lexer) acceptRun(runes string) bool {
// TODO meh... ugly rune.
var endOfFile rune = -1
// next returns the next rune from the input.
func (l *Lexer) next() rune {
// next returns the next rune from the input and a boolean indicating if
// reading the input was successful.
// When the end of input is reached, or an invalid UTF8 character is
// read, then false is returned.
func (l *Lexer) next() (rune, bool) {
if l.newline {
l.linepos = 0
l.linenr++
} else {
l.linepos++
}
l.width = 0
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch {
case r == utf8.RuneError && w == 0:
return endOfFile
l.emitError("unexpected end of file")
return utf8.RuneError, false
case r == utf8.RuneError:
return utf8.RuneError
l.emitError("invalid UTF8 character")
return utf8.RuneError, false
default:
l.width = w
l.pos += w
return r
l.newline = r == '\n'
return r, true
}
}
// skip skips a rune from the set of accepted runes.
// Returns true when a rune was skipped.
func (l *Lexer) skip(runes string) bool {
r, w, _ := l.peek()
if strings.IndexRune(runes, r) >= 0 {
// skipMatching skips runes when all provided matches are satisfied.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipMatching(matches ...string) bool {
if _, w, ok := l.match(matches...); ok {
l.pos += w
return true
}
return false
}
// skipRun skips a run of runes from the set of accepted runes.
// skipConsecutive skips consecutive runes from the provided match.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipRun(runes string) bool {
func (l *Lexer) skipConsecutive(match string) bool {
didSkip := false
for l.skip(runes) {
for l.skipMatching(match) {
didSkip = true
}
return didSkip
@ -254,65 +286,33 @@ func (l *Lexer) skipRun(runes string) bool {
// accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(runes string) bool {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
func (l *Lexer) accept(match string) bool {
if r, ok := l.next(); ok {
if strings.IndexRune(match, r) >= 0 {
l.buffer.WriteRune(r)
return true
}
}
l.backup()
return false
}
func (l *Lexer) upcoming(runes ...string) bool {
if peeked, ok := l.peekMulti(len(runes)); ok {
for i, r := range runes {
_, _, ok := l.match(runes...)
return ok
}
func (l *Lexer) match(matches ...string) ([]rune, int, bool) {
peeked, width, ok := l.peekMulti(len(matches))
if ok {
for i, r := range matches {
if strings.IndexRune(r, peeked[i]) < 0 {
return false
return peeked, width, false
}
}
return true
return peeked, width, true
}
return false
}
// TODO still needed with the string buffer?
// acceptNot consumes the next rune if it's not from the set of runes.
func (l *Lexer) acceptNot(runes string) bool {
r := l.next()
if r == endOfFile {
l.backup()
return false
}
if strings.IndexRune(runes, r) < 0 {
return true
}
l.backup()
return false
}
// acceptUntil consumes a run of runes until ones from the
// valid set is encountered.
func (l *Lexer) acceptUntil(runes string) bool {
accepted := false
for l.acceptNot(runes) {
accepted = true
}
return accepted
}
// acceptRun consumes a run of runes from the set of accepted runes.
func (l *Lexer) acceptWhile(runes string) bool {
accepted := false
for l.accept(runes) {
accepted = true
}
return accepted
}
// skipUntil skips a run of runes, until a rune from the set of
// runes of EOF is reached.
func (l *Lexer) skipUntil(runes string) {
l.acceptUntil(runes)
return peeked, width, false
}
// error returns an error token and terminates the scan
@ -326,17 +326,11 @@ func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
}
func (l *Lexer) unexpectedInputError(expected string) stateFn {
var actual string
switch {
case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring?
actual = "end of file"
case !utf8.ValidString(l.input[l.pos:]):
actual = "non-UTF8 data"
default:
r, _, _ := l.peek()
actual = fmt.Sprintf("token '%c'", r)
// next() takes care of error messages for ok == false.
if r, ok := l.next(); ok {
l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
}
return l.errorf("Unexpected %s (expected %s)", actual, expected)
return nil
}
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {

View File

@ -29,8 +29,8 @@ const (
)
func stateKeyValuePair(l *Lexer) stateFn {
l.skipRun(whitespace + carriageReturn + newline)
if l.skip(hash) {
l.skipConsecutive(whitespace + carriageReturn + newline)
if l.skipMatching(hash) {
return stateComment
}
if l.upcoming(startOfKey) {
@ -43,12 +43,12 @@ func stateKeyValuePair(l *Lexer) stateFn {
func stateComment(l *Lexer) stateFn {
for {
switch {
case l.atEndOfFile() || l.skip(newline):
case l.atEndOfFile() || l.skipMatching(newline):
l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair
default:
if !l.acceptNext(1) {
return nil
return l.unexpectedInputError("comment")
}
}
}
@ -56,7 +56,7 @@ func stateComment(l *Lexer) stateFn {
// A key may be either bare, quoted or dotted.
func stateKey(l *Lexer) stateFn {
if l.acceptFrom(bareKeyChars) {
if l.accept(bareKeyChars) {
return statebareKeyChars
}
return l.unexpectedInputError("a valid key name")
@ -77,10 +77,10 @@ func statebareKeyChars(l *Lexer) stateFn {
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.skipRun(whitespace)
if l.skip(dot) {
l.skipConsecutive(whitespace)
if l.skipMatching(dot) {
l.emit(ItemKeyDot, "")
l.skipRun(whitespace)
l.skipConsecutive(whitespace)
return stateKey
}
return stateKeyAssignment
@ -91,10 +91,10 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *Lexer) stateFn {
l.skipRun(whitespace)
if l.skip(equal) {
l.skipConsecutive(whitespace)
if l.skipMatching(equal) {
l.emit(ItemAssignment, "")
l.skipRun(whitespace)
l.skipConsecutive(whitespace)
return stateValue
}
return l.unexpectedInputError("a value assignment")
@ -103,7 +103,7 @@ func stateKeyAssignment(l *Lexer) stateFn {
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *Lexer) stateFn {
l.skipRun(whitespace)
l.skipConsecutive(whitespace)
if l.upcoming(quoteChars) {
return stateStringValue
}
@ -113,25 +113,21 @@ func stateValue(l *Lexer) stateFn {
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *Lexer) stateFn {
switch {
case l.skipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.skipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
if l.skip(doubleQuote) {
return stateBasicStringValue
}
return l.unexpectedInputError("a string value")
}
func stateBasicStringValue(l *Lexer) stateFn {
// Possibly a """ multi-line string start,
// possibly the end of an "" empty string.
if l.skip(doubleQuote) {
// It's a """ multi-line string.
if l.skip(doubleQuote) {
if l.upcoming(doubleQuote, doubleQuote) {
return stateMultiLineBasicString
}
// It's an "" empty string.
l.emit(ItemString, "")
return stateKeyValuePair
}
return stateBasicString
}
@ -147,7 +143,7 @@ func stateParseBasicString(l *Lexer) stateFn {
switch {
case l.atEndOfFile():
return l.unexpectedEndOfFile("basic string token")
case l.skip(doubleQuote):
case l.skipMatching(doubleQuote):
return l.popState()
case l.upcoming(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
@ -172,9 +168,12 @@ func stateParseBasicString(l *Lexer) stateFn {
case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next())
r, _ := l.next()
return l.errorf("Invalid character in basic string: %q", r)
default:
l.acceptNext(1)
if !l.acceptNext(1) {
return l.unexpectedInputError("string value")
}
}
}
}
@ -197,7 +196,8 @@ func stateMultiLineBasicString(l *Lexer) stateFn {
func stateEndOfFile(l *Lexer) stateFn {
if l.atEndOfFile() {
l.emit(ItemEOF, "EOF")
} else {
l.unexpectedInputError("end of file")
}
return nil
}
return l.unexpectedInputError("end of file")
}

View File

@ -8,10 +8,26 @@ import (
"github.com/mmakaay/toml/lexer"
)
// TestErrorsIncludeLineAndRowPosition checks that a lexing error carries
// the line number and line position that the lexer recorded for it.
// The input ends its third line with an invalid UTF8 byte (\xbc), which
// must make lexing fail.
func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
	const (
		wantLineNr  = 2
		wantLinePos = 2
	)

	_, err := lexer.Lex("# 12345\n# 67890\r\n# 12345\xbc").ToArray()
	// Without an error there is nothing to inspect; bail out instead of
	// dereferencing a nil *Error below.
	if err == nil {
		t.Fatal("Expected a lexer error for invalid UTF8 input, but got none")
	}
	t.Logf("Got error: %s", err.Error())

	if err.LineNr != wantLineNr {
		t.Errorf("Unexpected line number: %d (expected %d)", err.LineNr, wantLineNr)
	}
	// The failure message reports the same value that is compared against.
	// NOTE(review): the message used to claim an expected position of 6
	// while the comparison used 2 - confirm which position is intended.
	if err.LinePos != wantLinePos {
		t.Errorf("Unexpected line position: %d (expected %d)", err.LinePos, wantLinePos)
	}
}
func TestInvalidUtf8Data(t *testing.T) {
runStatesT(t, statesT{
"invalid UTF8 data", "\xbc", "",
"Unexpected non-UTF8 data (expected end of file)"})
runStatesTs(t, []statesT{
{"inside comment", "# \xbc", "", "invalid UTF8 character"},
{"bare key 1", "\xbc", "", "invalid UTF8 character"},
{"bare key 2", "key\xbc", "", "invalid UTF8 character"},
{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
})
}
func TestEmptyInput(t *testing.T) {
@ -42,25 +58,25 @@ func TestComments(t *testing.T) {
}
func TestKeyWithoutAssignment(t *testing.T) {
err := "Unexpected end of file (expected a value assignment)"
err := "unexpected end of file"
runStatesTs(t, []statesT{
{"bare with whitespace", " a ", []string{"[a]"}, err},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", []string{"[abcdefghijklmnopqrstuvwxyz]"}, err},
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", []string{"[ABCDEFGHIJKLMNOPQRSTUVWXYZ]"}, err},
{"bare numbers", "0123456789", []string{"[0123456789]"}, err},
{"bare underscore", "_", []string{"[_]"}, err},
{"bare dash", "-", []string{"[-]"}, err},
{"bare big mix", "-hey_good_Lookin123-", []string{"[-hey_good_Lookin123-]"}, err},
{"bare dotted", "a._.c", []string{"[a]", ".", "[_]", ".", "[c]"}, err},
{"bare dotted with whitespace", " a .\t\t b\t ", []string{"[a]", ".", "[b]"}, err},
{"bare with whitespace", " a ", "[a]", err},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err},
// {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err},
// {"bare numbers", "0123456789", "[0123456789]", err},
// {"bare underscore", "_", "[_]", err},
// {"bare dash", "-", "[-]", err},
// {"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err},
// {"bare dotted", "a._.c", "[a].[_].[c]", err},
// {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
})
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)"
err := "unexpected end of file"
runStatesTs(t, []statesT{
{"bare", "a=", "[a]=", err},
{"double equal sign", "a==", "[a]=", "Unexpected token '=' (expected a value)"},
{"double equal sign", "a==", "[a]=", "unexpected character '=' (expected a value)"},
{"bare dotted", "a.b=", "[a].[b]=", err},
{"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err},
})
@ -128,6 +144,7 @@ func TestBasicStringEscapes(t *testing.T) {
{"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""},
{"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""},
{"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""},
{"UTF8 vertical tab", `_="\u000B"`, "[_]=STR(\v)", ""},
})
}
@ -172,7 +189,7 @@ func runStatesT(t *testing.T, c statesT) {
}
actual := strings.Join(a, "")
if actual != expected {
t.Errorf("[%s] Unexpected lexer output:\nexpected; %s\nactual: %s\n", c.name, expected, actual)
t.Errorf("[%s] Unexpected lexer output:\nexpected: %s\nactual: %s\n", c.name, expected, actual)
}
}
}