Simplify, simplify, simplify, and make handling of invalid UTF8 or unexpected end of file more robust.

This commit is contained in:
Maurice Makaay 2019-05-16 23:26:43 +00:00
parent dc47ac3b71
commit 29a13834dd
3 changed files with 154 additions and 143 deletions
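
For orientation, a rough usage sketch of the behavior this commit is after. It assumes the lexer package from this repository and borrows an input from the tests further down, so treat it as an illustration rather than shipped code:

package main

import (
	"fmt"

	"github.com/mmakaay/toml/lexer"
)

func main() {
	// The trailing \xbc byte is not valid UTF8, so lexing should fail there.
	input := "# 12345\n# 67890\r\n# 12345\xbc"

	items, err := lexer.Lex(input).ToArray()
	if err != nil {
		// err is the new *lexer.Error, which carries position metadata.
		fmt.Printf("lexing failed at line %d, position %d: %s\n",
			err.LineNr, err.LinePos, err.Message)
	}
	fmt.Printf("partial result: %d items\n", len(items))
}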

View File

@ -1,7 +1,6 @@
package lexer package lexer
import ( import (
"errors"
"fmt" "fmt"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
@ -12,12 +11,28 @@ type Lexer struct {
input string // the scanned input string input string // the scanned input string
state stateFn // a function that handles the current state state stateFn // a function that handles the current state
stack []stateFn // state function stack, for nested parsing stack []stateFn // state function stack, for nested parsing
pos int // current scanning position in the input pos int // current byte scanning position in the input
newline bool // keep track of when we have scanned a newline
linenr int // current line number in the input
linepos int // current position in the input line
width int // width of the last rune read, for supporting backup() width int // width of the last rune read, for supporting backup()
buffer StringBuffer // an efficient buffer, used to build string values buffer StringBuffer // an efficient buffer, used to build string values
items chan Item // channel of resulting lexer items items chan Item // channel of resulting lexer items
nextItem Item // the current item as reached by Next() and retrieved by Get() nextItem Item // the current item as reached by Next() and retrieved by Get()
err error // an error message when lexing failed, retrieved by Error() err *Error // an error when lexing failed, retrieved by Error()
}
// Error is used as the error type when lexing errors occur.
// The error includes some extra meta information to allow for useful
// error messages to the user.
type Error struct {
Message string
LineNr int
LinePos int
}
func (err *Error) Error() string {
return err.Message
} }
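
A minimal sketch of the streaming counterpart to ToArray, using Next, Get and the new Error method from this diff; the input string and the printed fields Type and Value (as used by Next above) are only examples:

package main

import (
	"fmt"

	"github.com/mmakaay/toml/lexer"
)

func main() {
	l := lexer.Lex("key = \"value\"")
	for l.Next() {
		item := l.Get()
		fmt.Println(item.Type, item.Value)
	}
	// Next() returns false on both EOF and error; Error() tells them apart.
	if err := l.Error(); err != nil {
		fmt.Printf("stopped at %d:%d: %s\n", err.LineNr, err.LinePos, err.Message)
	}
}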
// Lex takes an input string and initializes the TOML lexer for it. // Lex takes an input string and initializes the TOML lexer for it.
@ -53,7 +68,7 @@ func (l *Lexer) Next() bool {
return false return false
} }
if i.Type == ItemError { if i.Type == ItemError {
l.err = errors.New(i.Value) l.err = &Error{i.Value, l.linenr, l.linepos}
return false return false
} }
l.nextItem = i l.nextItem = i
@ -64,7 +79,7 @@ func (l *Lexer) Next() bool {
} }
} }
func (l *Lexer) Error() error { func (l *Lexer) Error() *Error {
return l.err return l.err
} }
@ -76,7 +91,7 @@ func (l *Lexer) Get() Item {
// ToArray returns lexer items as an array. // ToArray returns lexer items as an array.
// When an error occurs during scanning, a partial result will be // When an error occurs during scanning, a partial result will be
// returned, accompanied by the error that occurred. // returned, accompanied by the error that occurred.
func (l *Lexer) ToArray() ([]Item, error) { func (l *Lexer) ToArray() ([]Item, *Error) {
var items []Item var items []Item
for l.Next() { for l.Next() {
items = append(items, l.Get()) items = append(items, l.Get())
@ -136,10 +151,16 @@ func (l *Lexer) emitInterpreted(t itemType) error {
return nil return nil
} }
// emitError emits a lexer error item back to the client.
func (l *Lexer) emitError(message string) {
l.emit(ItemError, message)
}
// backup steps back one rune // backup steps back one rune
// Can be called only once per call of next. // Can be called only once per call of next.
func (l *Lexer) backup() { func (l *Lexer) backup() {
l.pos -= l.width l.pos -= l.width
l.linepos--
} }
// peek returns but does not advance to the next rune(s) in the input. // peek returns but does not advance to the next rune(s) in the input.
@ -159,31 +180,31 @@ func (l *Lexer) peek() (rune, int, bool) {
// Returns a slice of runes and a boolean. The boolean will be false in case // Returns a slice of runes and a boolean. The boolean will be false in case
// fewer upcoming runes can be peeked than the requested amount // fewer upcoming runes can be peeked than the requested amount
// (end of data or invalid UTF8 character). // (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, bool) { func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) {
offset := 0 width := 0
var peeked []rune var peeked []rune
for i := 0; i < amount; i++ { for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:]) r, w := utf8.DecodeRuneInString(l.input[l.pos+width:])
switch { switch {
case r == utf8.RuneError: case r == utf8.RuneError:
return peeked, false return peeked, 0, false
default: default:
offset += w width += w
peeked = append(peeked, r) peeked = append(peeked, r)
} }
} }
return peeked, true return peeked, width, true
} }
// acceptNext adds the specified amount of runes from the input to the string buffer. // acceptNext adds the specified amount of runes from the input to the string buffer.
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned. // If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
func (l *Lexer) acceptNext(count int) bool { func (l *Lexer) acceptNext(count int) bool {
for i := 0; i < count; i++ { for i := 0; i < count; i++ {
r := l.next() if r, ok := l.next(); ok {
if r == endOfFile || r == utf8.RuneError { l.buffer.WriteRune(r)
} else {
return false return false
} }
l.buffer.WriteRune(r)
} }
return true return true
} }
@ -191,22 +212,22 @@ func (l *Lexer) acceptNext(count int) bool {
// acceptFrom adds the next rune from the input to the string buffer // acceptFrom adds the next rune from the input to the string buffer
// when it matches in the provided runes. If the next rune does // when it matches in the provided runes. If the next rune does
// not match, false is returned. // not match, false is returned.
func (l *Lexer) acceptFrom(runes string) bool { // func (l *Lexer) acceptFrom(runes string) bool {
r := l.next() // r, ok := l.next()
if strings.IndexRune(runes, r) >= 0 { // if strings.IndexRune(runes, r) >= 0 {
l.buffer.WriteRune(r) // l.buffer.WriteRune(r)
return true // return true
} // }
l.backup() // l.backup()
return false // return false
} // }
// acceptRun adds consecutive runes from the input to the string // acceptRun adds consecutive runes from the input to the string
// buffer when they match the provided runes. If no runes were added // buffer when they match the provided runes. If no runes were added
// at all, false is returned. // at all, false is returned.
func (l *Lexer) acceptRun(runes string) bool { func (l *Lexer) acceptRun(match string) bool {
accepted := false accepted := false
for l.acceptFrom(runes) { for l.accept(match) {
accepted = true accepted = true
} }
return accepted return accepted
@ -215,38 +236,49 @@ func (l *Lexer) acceptRun(runes string) bool {
// TODO meh... ugly rune. // TODO meh... ugly rune.
var endOfFile rune = -1 var endOfFile rune = -1
// next returns the next rune from the input. // next returns the next rune from the input and a boolean indicating if
func (l *Lexer) next() rune { // reading the input was successful.
// When the end of input is reached, or an invalid UTF8 character is
// read, then false is returned.
func (l *Lexer) next() (rune, bool) {
if l.newline {
l.linepos = 0
l.linenr++
} else {
l.linepos++
}
l.width = 0 l.width = 0
r, w := utf8.DecodeRuneInString(l.input[l.pos:]) r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch { switch {
case r == utf8.RuneError && w == 0: case r == utf8.RuneError && w == 0:
return endOfFile l.emitError("unexpected end of file")
return utf8.RuneError, false
case r == utf8.RuneError: case r == utf8.RuneError:
return utf8.RuneError l.emitError("invalid UTF8 character")
return utf8.RuneError, false
default: default:
l.width = w l.width = w
l.pos += w l.pos += w
return r l.newline = r == '\n'
return r, true
} }
} }
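
The two error cases in next() lean on how utf8.DecodeRuneInString reports failure: width 0 means the input is exhausted, width 1 with RuneError means a broken byte sequence. A standalone sketch of those return values (a property of the standard library, not of this package):

package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	r, w := utf8.DecodeRuneInString("")
	fmt.Println(r == utf8.RuneError, w) // true 0: input exhausted -> "unexpected end of file"

	r, w = utf8.DecodeRuneInString("\xbc")
	fmt.Println(r == utf8.RuneError, w) // true 1: broken encoding -> "invalid UTF8 character"

	r, w = utf8.DecodeRuneInString("⌘ follows")
	fmt.Println(string(r), w) // ⌘ 3: a normal multi-byte rune
}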
// skip skips a rune from the set of accepted runes. // skip skips runes when all provided matches are satisfied.
// Returns true when a rune was skipped. // Returns true when one or more runes were skipped.
func (l *Lexer) skip(runes string) bool { func (l *Lexer) skipMatching(matches ...string) bool {
r, w, _ := l.peek() if _, w, ok := l.match(matches...); ok {
if strings.IndexRune(runes, r) >= 0 {
l.pos += w l.pos += w
return true return true
} }
return false return false
} }
// skipRun skips a run of runes from the set of accepted runes. // skipConsecutive skips consecutive runes from the provided match.
// Returns true when one or more runes were skipped. // Returns true when one or more runes were skipped.
func (l *Lexer) skipRun(runes string) bool { func (l *Lexer) skipConsecutive(match string) bool {
didSkip := false didSkip := false
for l.skip(runes) { for l.skipMatching(match) {
didSkip = true didSkip = true
} }
return didSkip return didSkip
@ -254,65 +286,33 @@ func (l *Lexer) skipRun(runes string) bool {
// accept adds the next rune to the string buffer and returns true if it's // accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned. // from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(runes string) bool { func (l *Lexer) accept(match string) bool {
r := l.next() if r, ok := l.next(); ok {
if strings.IndexRune(runes, r) >= 0 { if strings.IndexRune(match, r) >= 0 {
return true l.buffer.WriteRune(r)
return true
}
} }
l.backup() l.backup()
return false return false
} }
func (l *Lexer) upcoming(runes ...string) bool { func (l *Lexer) upcoming(runes ...string) bool {
if peeked, ok := l.peekMulti(len(runes)); ok { _, _, ok := l.match(runes...)
for i, r := range runes { return ok
}
func (l *Lexer) match(matches ...string) ([]rune, int, bool) {
peeked, width, ok := l.peekMulti(len(matches))
if ok {
for i, r := range matches {
if strings.IndexRune(r, peeked[i]) < 0 { if strings.IndexRune(r, peeked[i]) < 0 {
return false return peeked, width, false
} }
} }
return true return peeked, width, true
} }
return false return peeked, width, false
}
// TODO still needed now that we have the string buffer?
// acceptNot consumes the next rune if it's not from the set of runes.
func (l *Lexer) acceptNot(runes string) bool {
r := l.next()
if r == endOfFile {
l.backup()
return false
}
if strings.IndexRune(runes, r) < 0 {
return true
}
l.backup()
return false
}
// acceptUntil consumes a run of runes until ones from the
// valid set is encountered.
func (l *Lexer) acceptUntil(runes string) bool {
accepted := false
for l.acceptNot(runes) {
accepted = true
}
return accepted
}
// acceptRun consumes a run of runes from the set of accepted runes.
func (l *Lexer) acceptWhile(runes string) bool {
accepted := false
for l.accept(runes) {
accepted = true
}
return accepted
}
// skipUntil skips a run of runes, until a rune from the set of
// runes of EOF is reached.
func (l *Lexer) skipUntil(runes string) {
l.acceptUntil(runes)
} }
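
The new match() and upcoming() helpers do per-position, set-based matching: the i-th argument is a set of runes that the i-th upcoming rune must belong to. A self-contained sketch of that idea (not the lexer's own code; the helper name matchAt is made up for illustration):

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// matchAt reports whether the input starts with one rune from each set,
// in order, mirroring what Lexer.match does against l.input[l.pos:].
func matchAt(input string, sets ...string) bool {
	pos := 0
	for _, set := range sets {
		r, w := utf8.DecodeRuneInString(input[pos:])
		if r == utf8.RuneError || strings.IndexRune(set, r) < 0 {
			return false
		}
		pos += w
	}
	return true
}

func main() {
	fmt.Println(matchAt(`"""multi-line"""`, `"`, `"`, `"`)) // true: three opening quotes
	fmt.Println(matchAt(`"basic"`, `"`, `"`, `"`))          // false: only one quote up front
	fmt.Println(matchAt("a_1", "abc", "_-", "0123456789"))  // true: each rune matches its set
}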
// error returns an error token and terminates the scan // error returns an error token and terminates the scan
@ -326,17 +326,11 @@ func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
} }
func (l *Lexer) unexpectedInputError(expected string) stateFn { func (l *Lexer) unexpectedInputError(expected string) stateFn {
var actual string // next() takes care of error messages for ok == false.
switch { if r, ok := l.next(); ok {
case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring? l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected))
actual = "end of file"
case !utf8.ValidString(l.input[l.pos:]):
actual = "non-UTF8 data"
default:
r, _, _ := l.peek()
actual = fmt.Sprintf("token '%c'", r)
} }
return l.errorf("Unexpected %s (expected %s)", actual, expected) return nil
} }
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn { func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {

View File

@ -29,8 +29,8 @@ const (
) )
func stateKeyValuePair(l *Lexer) stateFn { func stateKeyValuePair(l *Lexer) stateFn {
l.skipRun(whitespace + carriageReturn + newline) l.skipConsecutive(whitespace + carriageReturn + newline)
if l.skip(hash) { if l.skipMatching(hash) {
return stateComment return stateComment
} }
if l.upcoming(startOfKey) { if l.upcoming(startOfKey) {
@ -43,12 +43,12 @@ func stateKeyValuePair(l *Lexer) stateFn {
func stateComment(l *Lexer) stateFn { func stateComment(l *Lexer) stateFn {
for { for {
switch { switch {
case l.atEndOfFile() || l.skip(newline): case l.atEndOfFile() || l.skipMatching(newline):
l.emitTrimmedLiteral(ItemComment) l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair return stateKeyValuePair
default: default:
if !l.acceptNext(1) { if !l.acceptNext(1) {
return nil return l.unexpectedInputError("comment")
} }
} }
} }
@ -56,7 +56,7 @@ func stateComment(l *Lexer) stateFn {
// A key may be either bare, quoted or dotted. // A key may be either bare, quoted or dotted.
func stateKey(l *Lexer) stateFn { func stateKey(l *Lexer) stateFn {
if l.acceptFrom(bareKeyChars) { if l.accept(bareKeyChars) {
return statebareKeyChars return statebareKeyChars
} }
return l.unexpectedInputError("a valid key name") return l.unexpectedInputError("a valid key name")
@ -77,10 +77,10 @@ func statebareKeyChars(l *Lexer) stateFn {
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// Whitespace around dot-separated parts is ignored, however, // Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace. // best practice is to not use any extraneous whitespace.
l.skipRun(whitespace) l.skipConsecutive(whitespace)
if l.skip(dot) { if l.skipMatching(dot) {
l.emit(ItemKeyDot, "") l.emit(ItemKeyDot, "")
l.skipRun(whitespace) l.skipConsecutive(whitespace)
return stateKey return stateKey
} }
return stateKeyAssignment return stateKeyAssignment
@ -91,10 +91,10 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// sign, and value must be on the same line (though some values can // sign, and value must be on the same line (though some values can
// be broken over multiple lines). // be broken over multiple lines).
func stateKeyAssignment(l *Lexer) stateFn { func stateKeyAssignment(l *Lexer) stateFn {
l.skipRun(whitespace) l.skipConsecutive(whitespace)
if l.skip(equal) { if l.skipMatching(equal) {
l.emit(ItemAssignment, "") l.emit(ItemAssignment, "")
l.skipRun(whitespace) l.skipConsecutive(whitespace)
return stateValue return stateValue
} }
return l.unexpectedInputError("a value assignment") return l.unexpectedInputError("a value assignment")
@ -103,7 +103,7 @@ func stateKeyAssignment(l *Lexer) stateFn {
// Values must be of the following types: String, Integer, Float, Boolean, // Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid. // Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *Lexer) stateFn { func stateValue(l *Lexer) stateFn {
l.skipRun(whitespace) l.skipConsecutive(whitespace)
if l.upcoming(quoteChars) { if l.upcoming(quoteChars) {
return stateStringValue return stateStringValue
} }
@ -113,24 +113,20 @@ func stateValue(l *Lexer) stateFn {
// There are four ways to express strings: basic, multi-line basic, literal, // There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters. // and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *Lexer) stateFn { func stateStringValue(l *Lexer) stateFn {
// Basic strings are surrounded by quotation marks. switch {
if l.skip(doubleQuote) { case l.skipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.skipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
return stateBasicStringValue return stateBasicStringValue
} }
return l.unexpectedInputError("a string value") return l.unexpectedInputError("a string value")
} }
func stateBasicStringValue(l *Lexer) stateFn { func stateBasicStringValue(l *Lexer) stateFn {
// Possibly a """ multi-line string start, if l.upcoming(doubleQuote, doubleQuote) {
// possibly the end of an "" empty string. return stateMultiLineBasicString
if l.skip(doubleQuote) {
// It's a """ multi-line string.
if l.skip(doubleQuote) {
return stateMultiLineBasicString
}
// It's an "" empty string.
l.emit(ItemString, "")
return stateKeyValuePair
} }
return stateBasicString return stateBasicString
} }
@ -147,7 +143,7 @@ func stateParseBasicString(l *Lexer) stateFn {
switch { switch {
case l.atEndOfFile(): case l.atEndOfFile():
return l.unexpectedEndOfFile("basic string token") return l.unexpectedEndOfFile("basic string token")
case l.skip(doubleQuote): case l.skipMatching(doubleQuote):
return l.popState() return l.popState()
case l.upcoming(backslash, escapeChars): case l.upcoming(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence. // For convenience, some popular characters have a compact escape sequence.
@ -172,9 +168,12 @@ func stateParseBasicString(l *Lexer) stateFn {
case l.upcoming(invalidBasicStringCharacters): case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped: // Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next()) r, _ := l.next()
return l.errorf("Invalid character in basic string: %q", r)
default: default:
l.acceptNext(1) if !l.acceptNext(1) {
return l.unexpectedInputError("string value")
}
} }
} }
} }
@ -197,7 +196,8 @@ func stateMultiLineBasicString(l *Lexer) stateFn {
func stateEndOfFile(l *Lexer) stateFn { func stateEndOfFile(l *Lexer) stateFn {
if l.atEndOfFile() { if l.atEndOfFile() {
l.emit(ItemEOF, "EOF") l.emit(ItemEOF, "EOF")
return nil } else {
l.unexpectedInputError("end of file")
} }
return l.unexpectedInputError("end of file") return nil
} }
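
The state functions above all follow the same shape: do a bit of work, then return the next state function, or nil to stop the scan. A tiny generic sketch of that pattern, separate from this lexer (the machine, stateStart and stateEnd names are invented for the example):

package main

import "fmt"

type machine struct{ trace []string }

// stateFn mirrors the lexer's pattern: each state returns the next state,
// or nil when there is nothing left to do.
type stateFn func(*machine) stateFn

func stateStart(m *machine) stateFn { m.trace = append(m.trace, "start"); return stateEnd }
func stateEnd(m *machine) stateFn   { m.trace = append(m.trace, "end"); return nil }

func main() {
	m := &machine{}
	for state := stateStart; state != nil; {
		state = state(m)
	}
	fmt.Println(m.trace) // [start end]
}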

View File

@ -8,10 +8,26 @@ import (
"github.com/mmakaay/toml/lexer" "github.com/mmakaay/toml/lexer"
) )
func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
_, err := lexer.Lex("# 12345\n# 67890\r\n# 12345\xbc").ToArray()
t.Logf("Got error: %s", err.Error())
if err.LineNr != 2 {
t.Errorf("Unexpected line number: %d (expected %d)", err.LineNr, 2)
}
if err.LinePos != 2 {
t.Errorf("Unexpected line position: %d (expected %d)", err.LinePos, 6)
}
}
func TestInvalidUtf8Data(t *testing.T) { func TestInvalidUtf8Data(t *testing.T) {
runStatesT(t, statesT{ runStatesTs(t, []statesT{
"invalid UTF8 data", "\xbc", "", {"inside comment", "# \xbc", "", "invalid UTF8 character"},
"Unexpected non-UTF8 data (expected end of file)"}) {"bare key 1", "\xbc", "", "invalid UTF8 character"},
{"bare key 2", "key\xbc", "", "invalid UTF8 character"},
{"assignment", "key \xbc", "[key]", "invalid UTF8 character"},
{"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"},
{"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"},
})
} }
func TestEmptyInput(t *testing.T) { func TestEmptyInput(t *testing.T) {
@ -42,25 +58,25 @@ func TestComments(t *testing.T) {
} }
func TestKeyWithoutAssignment(t *testing.T) { func TestKeyWithoutAssignment(t *testing.T) {
err := "Unexpected end of file (expected a value assignment)" err := "unexpected end of file"
runStatesTs(t, []statesT{ runStatesTs(t, []statesT{
{"bare with whitespace", " a ", []string{"[a]"}, err}, {"bare with whitespace", " a ", "[a]", err},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", []string{"[abcdefghijklmnopqrstuvwxyz]"}, err}, {"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err},
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", []string{"[ABCDEFGHIJKLMNOPQRSTUVWXYZ]"}, err}, // {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err},
{"bare numbers", "0123456789", []string{"[0123456789]"}, err}, // {"bare numbers", "0123456789", "[0123456789]", err},
{"bare underscore", "_", []string{"[_]"}, err}, // {"bare underscore", "_", "[_]", err},
{"bare dash", "-", []string{"[-]"}, err}, // {"bare dash", "-", "[-]", err},
{"bare big mix", "-hey_good_Lookin123-", []string{"[-hey_good_Lookin123-]"}, err}, // {"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err},
{"bare dotted", "a._.c", []string{"[a]", ".", "[_]", ".", "[c]"}, err}, // {"bare dotted", "a._.c", "[a].[_].[c]", err},
{"bare dotted with whitespace", " a .\t\t b\t ", []string{"[a]", ".", "[b]"}, err}, // {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err},
}) })
} }
func TestKeyWithAssignmentButNoValue(t *testing.T) { func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)" err := "unexpected end of file"
runStatesTs(t, []statesT{ runStatesTs(t, []statesT{
{"bare", "a=", "[a]=", err}, {"bare", "a=", "[a]=", err},
{"double equal sign", "a==", "[a]=", "Unexpected token '=' (expected a value)"}, {"double equal sign", "a==", "[a]=", "unexpected character '=' (expected a value)"},
{"bare dotted", "a.b=", "[a].[b]=", err}, {"bare dotted", "a.b=", "[a].[b]=", err},
{"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err}, {"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err},
}) })
@ -128,6 +144,7 @@ func TestBasicStringEscapes(t *testing.T) {
{"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""}, {"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""},
{"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""}, {"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""},
{"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""}, {"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""},
{"UTF8 vertical tab", `_="\u000B"`, "[_]=STR(\v)", ""},
}) })
} }
@ -172,7 +189,7 @@ func runStatesT(t *testing.T, c statesT) {
} }
actual := strings.Join(a, "") actual := strings.Join(a, "")
if actual != expected { if actual != expected {
t.Errorf("[%s] Unexpected lexer output:\nexpected; %s\nactual: %s\n", c.name, expected, actual) t.Errorf("[%s] Unexpected lexer output:\nexpected: %s\nactual: %s\n", c.name, expected, actual)
} }
} }
} }