diff --git a/lexer/lexer.go b/lexer/lexer.go index 7594cba..fc54969 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -168,12 +168,7 @@ func (l *Lexer) backup() { // no upcoming rune can be peeked (end of data or invalid UTF8 character). func (l *Lexer) peek() (rune, int, bool) { r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - switch { - case r == utf8.RuneError: - return utf8.RuneError, w, false - default: - return r, w, true - } + return r, w, r != utf8.RuneError } // peekMulti takes a peek at multiple upcoming runes in the input. @@ -187,7 +182,7 @@ func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) { r, w := utf8.DecodeRuneInString(l.input[l.pos+width:]) switch { case r == utf8.RuneError: - return peeked, 0, false + return peeked, width, false default: width += w peeked = append(peeked, r) @@ -209,23 +204,10 @@ func (l *Lexer) acceptNext(count int) bool { return true } -// acceptFrom adds the next rune from the input to the string buffer -// when it matches in the provided runes. If the next rune does -// not match, false is returned. -// func (l *Lexer) acceptFrom(runes string) bool { -// r, ok := l.next() -// if strings.IndexRune(runes, r) >= 0 { -// l.buffer.WriteRune(r) -// return true -// } -// l.backup() -// return false -// } - -// acceptRun adds consecutive runes from the input to the string -// buffer when they match the provided runes. If no runes were added -// at all, false it returned. -func (l *Lexer) acceptRun(match string) bool { +// acceptConsecutive adds consecutive runes from the input to the string +// buffer when they match the rune match. +// If any runes were added then true is returned, false otherwise. +func (l *Lexer) acceptConsecutive(match string) bool { accepted := false for l.accept(match) { accepted = true @@ -233,42 +215,45 @@ func (l *Lexer) acceptRun(match string) bool { return accepted } -// TODO meh... ugly rune. 
-var endOfFile rune = -1 - // next returns the next rune from the input and a boolean indicating if // reading the input was successful. // When the end of input is reached, or an invalid UTF8 character is // read, then false is returned. func (l *Lexer) next() (rune, bool) { + r, w, ok := l.peek() + if ok { + l.width = w + l.pos += w + l.advanceCursor(r) + return r, true + } + l.width = 0 + if r == utf8.RuneError && w == 0 { + l.emitError("unexpected end of file") + } else { + l.emitError("invalid UTF8 character") + } + return r, false +} + +func (l *Lexer) advanceCursor(r rune) { if l.newline { l.linepos = 0 l.linenr++ } else { l.linepos++ } - l.width = 0 - r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - switch { - case r == utf8.RuneError && w == 0: - l.emitError("unexpected end of file") - return utf8.RuneError, false - case r == utf8.RuneError: - l.emitError("invalid UTF8 character") - return utf8.RuneError, false - default: - l.width = w - l.pos += w - l.newline = r == '\n' - return r, true - } + l.newline = r == '\n' } -// skip skips runes when all provided matches are satisfied. +// skipMatching skips runes, but only when all provided matches are satisfied. // Returns true when one or more runes were skipped. func (l *Lexer) skipMatching(matches ...string) bool { - if _, w, ok := l.match(matches...); ok { + if runes, w, ok := l.match(matches...); ok { l.pos += w + for _, r := range runes { + l.advanceCursor(r) + } return true } return false @@ -297,11 +282,18 @@ func (l *Lexer) accept(match string) bool { return false } -func (l *Lexer) upcoming(runes ...string) bool { - _, _, ok := l.match(runes...) +// upcoming checks if the upcoming runes satisfy the provided rune matches. +// This is a lot like the match method, with the difference that +// this one only returns the boolean value. +func (l *Lexer) upcoming(matches ...string) bool { + _, _, ok := l.match(matches...) return ok } +// match checks if the upcoming runes satisfy the provided rune matches. 
+// It returns a slice of runes that were found, their total byte width +// and a boolean indicating whether or not all provided matches matched +// the input data. func (l *Lexer) match(matches ...string) ([]rune, int, bool) { peeked, width, ok := l.peekMulti(len(matches)) if ok { diff --git a/lexer/states.go b/lexer/states.go index 2418f04..eaee205 100644 --- a/lexer/states.go +++ b/lexer/states.go @@ -67,7 +67,7 @@ func stateKey(l *Lexer) stateFn { // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. func statebareKeyChars(l *Lexer) stateFn { - l.acceptRun(bareKeyChars) + l.acceptConsecutive(bareKeyChars) l.emitLiteral(ItemKey) return stateEndOfKeyOrKeyDot } diff --git a/lexer/states_test.go b/lexer/states_test.go index c22d10d..1050a06 100644 --- a/lexer/states_test.go +++ b/lexer/states_test.go @@ -9,12 +9,12 @@ import ( ) func TestErrorsIncludeLineAndRowPosition(t *testing.T) { - _, err := lexer.Lex("# 12345\n# 67890\r\n# 12345\xbc").ToArray() + _, err := lexer.Lex("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() t.Logf("Got error: %s", err.Error()) - if err.LineNr != 2 { - t.Errorf("Unexpected line number: %d (expected %d)", err.LineNr, 2) + if err.LineNr != 4 { + t.Errorf("Unexpected line number: %d (expected %d)", err.LineNr, 4) } - if err.LinePos != 2 { + if err.LinePos != 6 { t.Errorf("Unexpected line position: %d (expected %d)", err.LinePos, 6) } } @@ -62,13 +62,13 @@ func TestKeyWithoutAssignment(t *testing.T) { runStatesTs(t, []statesT{ {"bare with whitespace", " a ", "[a]", err}, {"bare lower", "abcdefghijklmnopqrstuvwxyz", "", err}, - // {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err}, - // {"bare numbers", "0123456789", "[0123456789]", err}, - // {"bare underscore", "_", "[_]", err}, - // {"bare dash", "-", "[-]", err}, - // {"bare big mix", "-hey_good_Lookin123-", "[-hey_good_Lookin123-]", err}, - // {"bare dotted", "a._.c", "[a].[_].[c]", err}, - // 
{"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err}, + {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "", err}, + {"bare numbers", "0123456789", "", err}, + {"bare underscore", "_", "", err}, + {"bare dash", "-", "", err}, + {"bare big mix", "-hey_good_Lookin123-", "", err}, + {"bare dotted", "a._.c", "[a].[_].", err}, + {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err}, }) } @@ -123,12 +123,13 @@ func TestBasicString(t *testing.T) { } func TestBasicStringWithInvalidEscapeSequence(t *testing.T) { + err := "Invalid escape sequence in basic string" runStatesTs(t, []statesT{ - {"invalid escape sequence", `a="\x"`, "[a]=", "Invalid escape sequence in basic string"}, - {"too short \\u UTF8", `a="\u123"`, "[a]=", "Invalid escape sequence in basic string"}, - {"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", "Invalid escape sequence in basic string"}, - {"too short \\U UTF8", `a="\U1234567"`, "[a]=", "Invalid escape sequence in basic string"}, - {"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", "Invalid escape sequence in basic string"}, + {"invalid escape sequence", `a="\x"`, "[a]=", err}, + {"too short \\u UTF8", `a="\u123"`, "[a]=", err}, + {"invalid hex in \\u UTF8", `a="\u000P"`, "[a]=", err}, + {"too short \\U UTF8", `a="\U1234567"`, "[a]=", err}, + {"invalid hex in \\U UTF8", `a="\U0000000P"`, "[a]=", err}, }) }