Code cleanup and refactoring run, both functional code and the tests.

This commit is contained in:
Maurice Makaay 2019-05-16 14:17:06 +00:00
parent 6636a7a672
commit cbc4f04179
7 changed files with 435 additions and 334 deletions

View File

@ -12,6 +12,7 @@ const (
ItemComment // Comment string, starts with # till end of line
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
@ -26,26 +27,26 @@ func (i Item) String() string {
switch i.Type {
case ItemEOF:
return "EOF"
case ItemError:
return "Error: " + i.Value
case ItemKey:
return fmt.Sprintf("[%s]", i.Value)
case ItemKeyDot:
return "."
case ItemAssignment:
return "="
}
return fmt.Sprintf("%s(%q)", i.Type, i.Value)
return fmt.Sprintf("%s(%s)", i.Type, i.Value)
}
// String returns a string representation of the lexer item type.
func (i itemType) String() string {
switch i {
case ItemError:
return "Error"
return "ERR"
case ItemComment:
return "Comment"
case ItemKey:
return "Key"
case ItemKeyDot:
return "KeyDot"
return "#"
case ItemString:
return "String"
return "STR"
default:
return fmt.Sprintf("<type id %d>", i)
panic(fmt.Sprintf("No translation available for type id %d", i))
}
}

View File

@ -12,7 +12,6 @@ type Lexer struct {
input string // the scanned input string
state stateFn // a function that handles the current state
stack []stateFn // state function stack, for nested parsing
start int // start position of the currently scanned item
pos int // current scanning position in the input
width int // width of the last rune read, for supporting backup()
buffer StringBuffer // an efficient buffer, used to build string values
@ -99,29 +98,44 @@ func (l *Lexer) popState() stateFn {
return tail
}
// TODO no longer needed?
// getAcceptedString returns the string as accepted by the
// accept* methods so far.
func (l *Lexer) getAcceptedString() string {
return l.input[l.start:l.pos]
}
// emit passes a scanned item back to the client.
func (l *Lexer) emit(t itemType, v string) {
l.items <- Item{t, v}
l.start = l.pos
}
// TODO no longer needed now that we have the string builder?
// ignore skips over the pending input before the current position.
func (l *Lexer) ignore() {
l.start = l.pos
}
// atEndOfFile returns true when there is no more data available in the input.
func (l *Lexer) atEndOfFile() bool {
return l.pos >= len(l.input)
}
// emit passes a lexer item back to the client, including the provided string.
func (l *Lexer) emit(t itemType, s string) {
l.items <- Item{t, s}
l.buffer.Reset()
}
// emitLiteral passes a lexer item back to the client, including the accumulated
// string buffer data as a literal string.
func (l *Lexer) emitLiteral(t itemType) {
l.emit(t, l.buffer.AsLiteralString())
}
// emitTrimmedLiteral passes a lexer item back to the client, including the
// accumulated string buffer data as a literal string with whitespace
// trimmed from it.
func (l *Lexer) emitTrimmedLiteral(t itemType) {
l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
}
// emitInterpreted passes a lexer item back to the client, including the
// accumulated string buffer data as an interpreted string (handling escape
// codes like \n, \t, \uXXXX, etc.)
// This method might return an error, in case there is data in the
// string buffer that is not valid for string interpretation.
func (l *Lexer) emitInterpreted(t itemType) error {
s, err := l.buffer.AsInterpretedString()
if err != nil {
return err
}
l.emit(t, s)
return nil
}
// backup steps back one rune
// Can be called only once per call of next.
func (l *Lexer) backup() {
@ -129,16 +143,119 @@ func (l *Lexer) backup() {
}
// peek returns but does not advance to the next rune(s) in the input.
func (l *Lexer) peek() rune {
r := l.next()
l.backup()
return r
// Returns the rune, its width and a boolean. The boolean will be false in case
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
func (l *Lexer) peek() (rune, int, bool) {
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch {
case r == utf8.RuneError:
return utf8.RuneError, w, false
default:
return r, w, true
}
}
// TODO still needed now that we have the string buffer?
// accept consumes the next rune if it's from the valid set of runes.
// peekMulti takes a peek at multiple upcoming runes in the input.
// Returns a slice of runes and a boolean. The boolean will be false in case
// fewer upcoming runes can be peeked than the requested amount
// (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
offset := 0
var peeked []rune
for i := 0; i < amount; i++ {
r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:])
switch {
case r == utf8.RuneError:
return peeked, false
default:
offset += w
peeked = append(peeked, r)
}
}
return peeked, true
}
// acceptNext adds the next rune from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data),
// then false is returned.
func (l *Lexer) acceptNext() bool {
r := l.next()
if r == endOfFile || r == utf8.RuneError {
return false
}
l.buffer.WriteRune(r)
return true
}
// acceptFrom adds the next rune from the input to the string buffer
// when it matches in the provided runes. If the next rune does
// not match, false is returned.
func (l *Lexer) acceptFrom(runes string) bool {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
l.buffer.WriteRune(r)
return true
}
l.backup()
return false
}
// acceptRun adds consecutive runes from the input to the string
// buffer when they match the provided runes. If no runes were added
// at all, false is returned.
func (l *Lexer) acceptRun(runes string) bool {
accepted := false
for l.acceptFrom(runes) {
accepted = true
}
return accepted
}
// TODO meh... ugly rune.
var endOfFile rune = -1
// next returns the next rune from the input.
func (l *Lexer) next() rune {
l.width = 0
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
switch {
case r == utf8.RuneError && w == 0:
return endOfFile
case r == utf8.RuneError:
return utf8.RuneError
default:
l.width = w
l.pos += w
return r
}
}
// skip skips a rune from the set of accepted runes.
// Returns true when a rune was skipped.
func (l *Lexer) skip(runes string) bool {
r, w, _ := l.peek()
if strings.IndexRune(runes, r) >= 0 {
l.pos += w
return true
}
return false
}
// skipRun skips a run of runes from the set of accepted runes.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipRun(runes string) bool {
didSkip := false
for l.skip(runes) {
didSkip = true
}
return didSkip
}
// accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(runes string) bool {
if strings.IndexRune(runes, l.next()) >= 0 {
r := l.next()
if strings.IndexRune(runes, r) >= 0 {
return true
}
l.backup()
@ -187,34 +304,10 @@ func (l *Lexer) acceptWhile(runes string) bool {
return accepted
}
// skip skips a run of runes from the set of accepted runes.
func (l *Lexer) skip(runes string) {
if l.acceptWhile(runes) {
l.ignore()
}
}
// skipUntil skips a run of runes, until a rune from the set of
// runes or EOF is reached.
func (l *Lexer) skipUntil(runes string) {
if l.acceptUntil(runes) {
l.ignore()
}
}
// TODO meh... ugly rune.
var endOfFile rune = -1
// next returns the next rune in the input.
func (l *Lexer) next() rune {
if l.atEndOfFile() {
l.width = 0
return endOfFile // TODO phase out this bizarro rune?
}
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
l.width = w
l.pos += w
return r
l.acceptUntil(runes)
}
// error returns an error token and terminates the scan
@ -227,15 +320,16 @@ func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
return nil
}
func (l *Lexer) unexpectedTokenError(expected string) stateFn {
func (l *Lexer) unexpectedInputError(expected string) stateFn {
var actual string
switch {
case l.peek() == endOfFile:
case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring?
actual = "end of file"
case !utf8.ValidString(l.input[l.start:]):
case !utf8.ValidString(l.input[l.pos:]):
actual = "non-UTF8 data"
default:
actual = fmt.Sprintf("token '%c'", l.peek())
r, _, _ := l.peek()
actual = fmt.Sprintf("token '%c'", r)
}
return l.errorf("Unexpected %s (expected %s)", actual, expected)
}

View File

@ -1,175 +0,0 @@
package lexer_test
import (
"fmt"
"testing"
"github.com/mmakaay/toml/lexer"
)
func TestInvalidUtf8Data(t *testing.T) {
assertFailureAndCheck(t, "\xbc", []string{}, "Unexpected non-UTF8 data (expected end of file)")
}
func TestEmptyInput(t *testing.T) {
assertSuccessAndCheck(t, "", []string{})
}
func TestWhiteSpace(t *testing.T) {
assertSuccessAndCheck(t, " ", []string{})
assertSuccessAndCheck(t, "\t", []string{})
assertSuccessAndCheck(t, " \t \t ", []string{})
}
func TestWhiteSpaceAndNewlines(t *testing.T) {
assertSuccessAndCheck(t, "\n", []string{})
assertSuccessAndCheck(t, "\n \t\r\n", []string{})
}
func TestComments(t *testing.T) {
assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`})
assertSuccessAndCheck(t,
"# two lines\n# of comments\n",
[]string{`Comment("# two lines")`, `Comment("# of comments")`})
assertSuccessAndCheck(t,
`# \tcomment\nwith escape-y chars`,
[]string{`Comment("# \\tcomment\\nwith escape-y chars")`})
}
func TestBareKeyWithoutValue(t *testing.T) {
err := "Unexpected end of file (expected an '=' value assignment)"
assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
assertFailureAndCheck(t, "_", []string{`Key("_")`}, err)
assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
assertFailureAndCheck(t, "Ab", []string{`Key("Ab")`}, err)
assertFailureAndCheck(t, "Ab1", []string{`Key("Ab1")`}, err)
assertFailureAndCheck(t, "_Ab1", []string{`Key("_Ab1")`}, err)
assertFailureAndCheck(t, "_-Ab1", []string{`Key("_-Ab1")`}, err)
assertFailureAndCheck(t, "_-Ab1_this-is_GOOD987", []string{`Key("_-Ab1_this-is_GOOD987")`}, err)
}
func TestDottedKey(t *testing.T) {
err := "Unexpected end of file (expected an '=' value assignment)"
assertFailureAndCheck(t, "a.b", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
assertFailureAndCheck(t, " a .\t\t b\t ", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)"
assertFailureAndCheck(t, " some_cool_key = ", []string{`Key("some_cool_key")`}, err)
}
func TestUnterminatedBasicString(t *testing.T) {
assertFailure(t, `key="value`, "Unexpected end of file (expected basic string token)")
}
func TestBasicStringWithNewline(t *testing.T) {
assertFailure(t, "key=\"value\nwith\nnewlines\"", "ohoh")
}
func TestEmptyBasicString(t *testing.T) {
assertSuccessAndCheck(t, `a=""`, []string{`Key("a")`, `String("")`})
assertSuccessAndCheck(t, `a=""#hi`, []string{`Key("a")`, `String("")`, `Comment("#hi")`})
assertSuccessAndCheck(t, `a = ""`, []string{`Key("a")`, `String("")`})
assertSuccessAndCheck(t, `a.b = ""`, []string{`Key("a")`, `KeyDot(".")`, `Key("b")`, `String("")`})
assertSuccessAndCheck(t, `a=""b=""`, []string{`Key("a")`, `String("")`, `Key("b")`, `String("")`})
}
func TestBasicString(t *testing.T) {
assertSuccessAndCheck(t, `_ = "b"`,
[]string{
`Key("_")`,
`String("b")`})
assertSuccessAndCheck(t, `thing = "A cool ʎǝʞ" # huh, it's up-side down!!`,
[]string{
`Key("thing")`,
`String("A cool ʎǝʞ")`,
`Comment("# huh, it's up-side down!!")`})
}
func TestInvalidEscapeSequence(t *testing.T) {
assertFailure(t, `a="\x"`, `Invalid escape sequence \x in string value`)
}
func TestBasicStringEscapes(t *testing.T) {
for in, out := range map[string]string{
`\b`: "\b",
`\t`: "\t",
`\n`: "\n",
`\f`: "\f",
`\r`: "\r",
`\"`: "\"",
`\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"",
`\u2318`: "⌘",
`\U0001014D`: "𐅍",
} {
l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
if out != l[1].Value {
t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value)
}
}
}
// func TestBasicStringUnicodeEscapes(t *testing.T) {
// for in, out := range map[string]string{
// `\u`: "\b",
// } {
// l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
// s := l[2]
// if out != s.Value {
// t.Fatalf("Unexpected result when parsing '%s'", in)
// }
// }
// }
func TestTwoKeyValuePairs(t *testing.T) {
assertSuccessAndCheck(t, "a=\"Hello\" #comment1\nb=\"World!\"#comment2\r\n",
[]string{
`Key("a")`,
`String("Hello")`,
`Comment("#comment1")`,
`Key("b")`,
`String("World!")`,
`Comment("#comment2")`})
}
func assertSuccessAndCheck(t *testing.T, input string, expected []string) {
l := assertSuccess(t, input)
assertItems(t, l, expected)
}
func assertFailureAndCheck(t *testing.T, input string, expected []string, expectedErr string) {
l := assertFailure(t, input, expectedErr)
assertItems(t, l, expected)
}
func assertFailure(t *testing.T, input string, expectedErr string) []lexer.Item {
l, err := lexer.Lex(input).ToArray()
if err == nil {
t.Fatalf("Expected lexer error '%s', but no error occurred", expectedErr)
}
if err.Error() != expectedErr {
t.Fatalf("Mismatch between expected and actual error:\nExpected: %s\nActual: %s\n", expectedErr, err)
}
return l
}
func assertSuccess(t *testing.T, input string) []lexer.Item {
l, err := lexer.Lex(input).ToArray()
if err != nil {
t.Fatalf("Unexpected lexer error: %s", err)
}
return l
}
func assertItems(t *testing.T, l []lexer.Item, expected []string) {
if len(expected) != len(l) {
t.Fatalf("Unexpected number of lexer items: %d (expected: %d)", len(l), len(expected))
}
for i, e := range expected {
if l[i].String() != e {
t.Fatalf("Unexpected lexer item at index %d: %s (expected: %s)", i, l[i], e)
}
}
}

View File

@ -1,6 +1,6 @@
package lexer
// stateFn represents the state of the scanner as a function
// stateFn represents the state of the lexer as a function
// that returns the next state.
type stateFn func(*Lexer) stateFn
@ -19,15 +19,17 @@ const (
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
someQuote string = singleQuote + doubleQuote
bareKey string = lower + upper + digits + underscore + dash
startOfKey string = bareKey + someQuote
quotable string = `btnfr\"`
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
func stateKeyValuePair(l *Lexer) stateFn {
l.skip(whitespace + carriageReturn + newline)
if l.upcoming(hash) {
l.skipRun(whitespace + carriageReturn + newline)
if l.skip(hash) {
return stateComment
}
if l.upcoming(startOfKey) {
@ -38,36 +40,34 @@ func stateKeyValuePair(l *Lexer) stateFn {
// A '#' hash symbol marks the rest of the line as a comment.
func stateComment(l *Lexer) stateFn {
l.buffer.Reset()
for {
switch {
case l.atEndOfFile() || l.accept(newline):
s := l.buffer.AsLiteralString()
l.emit(ItemComment, s)
case l.atEndOfFile() || l.skip(newline):
l.emitTrimmedLiteral(ItemComment)
return stateKeyValuePair
case l.accept(carriageReturn):
l.ignore()
default:
l.buffer.WriteRune(l.next())
if !l.acceptNext() {
return nil
}
}
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *Lexer) stateFn {
if l.upcoming(bareKey) {
return stateBareKey
if l.acceptFrom(bareKeyChars) {
return statebareKeyChars
}
return l.unexpectedTokenError("a valid key name")
return l.unexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func stateBareKey(l *Lexer) stateFn {
l.acceptWhile(bareKey)
l.emit(ItemKey, l.getAcceptedString())
func statebareKeyChars(l *Lexer) stateFn {
l.acceptRun(bareKeyChars)
l.emitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
@ -76,10 +76,10 @@ func stateBareKey(l *Lexer) stateFn {
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.skip(whitespace)
if l.accept(dot) {
l.emit(ItemKeyDot, ".")
l.skip(whitespace)
l.skipRun(whitespace)
if l.skip(dot) {
l.emit(ItemKeyDot, "")
l.skipRun(whitespace)
return stateKey
}
return stateKeyAssignment
@ -90,70 +90,57 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *Lexer) stateFn {
l.skip(whitespace)
if l.accept(equal) {
l.skip(whitespace)
l.skipRun(whitespace)
if l.skip(equal) {
l.emit(ItemAssignment, "")
l.skipRun(whitespace)
return stateValue
}
return l.unexpectedTokenError("an '=' value assignment")
return l.unexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *Lexer) stateFn {
l.skip(whitespace)
if l.upcoming(someQuote) {
l.skipRun(whitespace)
if l.upcoming(quoteChars) {
return stateStringValue
}
return l.unexpectedTokenError("a value")
return l.unexpectedInputError("a value")
}
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *Lexer) stateFn {
if l.accept(doubleQuote) {
// Basic strings are surrounded by quotation marks.
if l.skip(doubleQuote) {
return stateBasicStringValue
}
return l.unexpectedTokenError("a string value")
return l.unexpectedInputError("a string value")
}
// Basic strings are surrounded by quotation marks. Any Unicode character
// may be used except those that must be escaped: quotation mark, backslash,
// and the control characters (U+0000 to U+001F, U+007F).
//
// For convenience, some popular characters have a compact escape sequence.
//
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
// \uXXXX - unicode (U+XXXX)
// \UXXXXXXXX - unicode (U+XXXXXXXX)
//
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
//
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
func stateBasicStringValue(l *Lexer) stateFn {
// Possibly a """ multi-line string start,
// possibly the end of an "" empty string.
if l.accept(doubleQuote) {
if l.skip(doubleQuote) {
// It's a """ multi-line string.
if l.accept(doubleQuote) {
l.ignore()
if l.skip(doubleQuote) {
return stateMultiLineBasicString
}
// It's an "" empty string.
l.ignore()
l.emit(ItemString, "")
return stateKeyValuePair
}
l.ignore()
return stateBasicString
}
const invalidBasicStringCharacters string = "" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *Lexer) stateFn {
for {
switch {
@ -162,26 +149,47 @@ func stateParseBasicString(l *Lexer) stateFn {
case l.accept(doubleQuote):
return l.popState()
case l.accept(backslash):
if l.upcoming(quotable) {
// For convenience, some popular characters have a compact escape sequence.
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
switch {
case l.upcoming(escapeChars):
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
l.buffer.WriteRune('\\')
l.buffer.WriteRune(l.next())
} else {
case l.upcoming(shortUtf8Escape):
// \uXXXX - unicode (U+XXXX)
return l.errorf("Not yet implemented: short utf8")
case l.upcoming(longUtf8Escape):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
return l.errorf("Not yet implemented: long utf8")
default:
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.errorf("Invalid escape sequence \\%c in string value", l.next())
}
case l.upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
return l.errorf("Invalid character in basic string: %q", l.next())
default:
l.buffer.WriteRune(l.next())
l.acceptNext()
}
}
}
func stateBasicString(l *Lexer) stateFn {
l.buffer.Reset()
l.pushState(func(l *Lexer) stateFn {
s, err := l.buffer.AsInterpretedString()
err := l.emitInterpreted(ItemString)
if err != nil {
return l.errorf("Invalid data in string: %s", err)
}
l.emit(ItemString, s)
return stateKeyValuePair
})
return stateParseBasicString
@ -192,10 +200,9 @@ func stateMultiLineBasicString(l *Lexer) stateFn {
}
func stateEndOfFile(l *Lexer) stateFn {
i := l.peek()
if i == endOfFile {
if l.atEndOfFile() {
l.emit(ItemEOF, "EOF")
return nil
}
return l.unexpectedTokenError("end of file")
return l.unexpectedInputError("end of file")
}

174
lexer/states_test.go Normal file
View File

@ -0,0 +1,174 @@
package lexer_test
import (
"fmt"
"strings"
"testing"
"github.com/mmakaay/toml/lexer"
)
func TestInvalidUtf8Data(t *testing.T) {
runStatesT(t, statesT{
"invalid UTF8 data", "\xbc", "",
"Unexpected non-UTF8 data (expected end of file)"})
}
func TestEmptyInput(t *testing.T) {
runStatesT(t, statesT{"empty string", "", "", ""})
}
func TestWhiteSpaceAndNewlines(t *testing.T) {
runStatesTs(t, []statesT{
{"space", " ", "", ""},
{"tab", "\t", "", ""},
{"newline", "\n", "", ""},
{"carriage return", "\r", "", ""},
{"all whitespace and newlines", " \t \t \r\r\n\n \n \t", "", ""},
})
}
func TestComments(t *testing.T) {
runStatesTs(t, []statesT{
{"empty comment", "#", "#()", ""},
{"empty comment with spaces", "# \t \r\n", `#()`, ""},
{"basic comment", "#chicken", "#(chicken)", ""},
{"basic comment starting after whitespace", "# \tchicken", "#(chicken)", ""},
{"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""},
{"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""},
{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""},
{"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""},
})
}
func TestKeyWithoutAssignment(t *testing.T) {
err := "Unexpected end of file (expected a value assignment)"
runStatesTs(t, []statesT{
{"bare with whitespace", " a ", []string{"[a]"}, err},
{"bare lower", "abcdefghijklmnopqrstuvwxyz", []string{"[abcdefghijklmnopqrstuvwxyz]"}, err},
{"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", []string{"[ABCDEFGHIJKLMNOPQRSTUVWXYZ]"}, err},
{"bare numbers", "0123456789", []string{"[0123456789]"}, err},
{"bare underscore", "_", []string{"[_]"}, err},
{"bare dash", "-", []string{"[-]"}, err},
{"bare big mix", "-hey_good_Lookin123-", []string{"[-hey_good_Lookin123-]"}, err},
{"bare dotted", "a._.c", []string{"[a]", ".", "[_]", ".", "[c]"}, err},
{"bare dotted with whitespace", " a .\t\t b\t ", []string{"[a]", ".", "[b]"}, err},
})
}
func TestKeyWithAssignmentButNoValue(t *testing.T) {
err := "Unexpected end of file (expected a value)"
runStatesTs(t, []statesT{
{"bare", "a=", "[a]=", err},
{"double equal sign", "a==", "[a]=", "Unexpected token '=' (expected a value)"},
{"bare dotted", "a.b=", "[a].[b]=", err},
{"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err},
})
}
func TestUnterminatedBasicString(t *testing.T) {
runStatesT(t, statesT{
"missing closing quote", `a="value`, "[a]=",
"Unexpected end of file (expected basic string token)"})
}
func TestBasicStringWithUnescapedControlCharacters(t *testing.T) {
runStatesTs(t, []statesT{
{"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`},
{"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`},
{"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`},
})
// No need to write all test cases for disallowed characters by hand.
for i := 0x00; i <= 0x1F; i++ {
name := fmt.Sprintf("control character %x", rune(i))
runStatesT(
t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=",
fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))})
}
}
func TestEmptyBasicString(t *testing.T) {
runStatesTs(t, []statesT{
{"empty", `a=""`, "[a]=STR()", ""},
{"with comment", `a="" #cool`, "[a]=STR()#(cool)", ""},
{"with whitespaces", ` a = "" `, "[a]=STR()", ""},
{"dotted", ` a.b = "" `, "[a].[b]=STR()", ""},
{"multiple same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""},
{"multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""},
})
}
func TestBasicString(t *testing.T) {
runStatesTs(t, []statesT{
{"ascii value", `_ = "Nothing fancy!"`, "[_]=STR(Nothing fancy!)", ""},
{"UTF8 value", `_ = "A cool ƃuıɹʇs" # what!?`, "[_]=STR(A cool ƃuıɹʇs)#(what!?)", ""},
})
}
func TestBasicStringWithInvalidEscapeSequence(t *testing.T) {
runStatesT(t, statesT{
"invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`,
})
}
func TestBasicStringEscapes(t *testing.T) {
runStatesTs(t, []statesT{
{"bell escape", `_="\b"`, "[_]=STR(\b)", ""},
{"tab escape", `_="\t"`, "[_]=STR(\t)", ""},
{"newline escape", `_="\n"`, "[_]=STR(\n)", ""},
{"form feed escape", `_="\f"`, "[_]=STR(\f)", ""},
{"carriage return escape", `_="\r"`, "[_]=STR(\r)", ""},
{"double quote escape", `_="\""`, `[_]=STR(")`, ""},
{"backslash escape", `_="\\"`, `[_]=STR(\)`, ""},
{"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""},
{"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""},
{"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""},
})
}
type statesT struct {
name string
in string
out interface{}
err string
}
func runStatesTs(t *testing.T, tests []statesT) {
for _, c := range tests {
runStatesT(t, c)
}
}
func runStatesT(t *testing.T, c statesT) {
l, err := lexer.Lex(c.in).ToArray()
if err == nil && c.err != "" {
t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err)
}
if err != nil && c.err == "" {
t.Errorf("[%s] Expected no error, but got error '%s'", c.name, err)
}
if err != nil && c.err != "" && err.Error() != c.err {
t.Errorf("[%s] Got an unexpected error:\nexpected: %s\nactual: %s\n", c.name, c.err, err)
}
switch expected := c.out.(type) {
case []string:
if len(expected) != len(l) {
t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
}
for i, e := range expected {
if l[i].String() != e {
t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i])
}
}
case string:
a := make([]string, len(l))
for _, v := range l {
a = append(a, v.String())
}
actual := strings.Join(a, "")
if actual != expected {
t.Errorf("[%s] Unexpected lexer output:\nexpected; %s\nactual: %s\n", c.name, expected, actual)
}
}
}

View File

@ -19,7 +19,7 @@ func (b *StringBuffer) Reset() *StringBuffer {
return b
}
// AddString adds the runes of the input string to the string buffer.
// WriteString adds the runes of the input string to the string buffer.
func (b *StringBuffer) WriteString(s string) *StringBuffer {
for _, r := range s {
b.WriteRune(r)

View File

@ -23,7 +23,7 @@ func TestResetResetsBuffer(t *testing.T) {
}
}
type testCase struct {
type stringbufT struct {
name string
in string
out string
@ -37,7 +37,7 @@ const (
func TestAsLiteralString(t *testing.T) {
b := lexer.StringBuffer{}
for _, c := range []testCase{
for _, c := range []stringbufT{
{"empty string", ``, ``, OK},
{"simple string", `Simple string!`, `Simple string!`, OK},
{"single quote", `'`, `'`, OK},
@ -57,7 +57,7 @@ func TestAsLiteralString(t *testing.T) {
func TestAsInterpretedString(t *testing.T) {
b := lexer.StringBuffer{}
for _, c := range []testCase{
for _, c := range []stringbufT{
{"empty string", "", "", OK},
{"one character", "Simple string!", "Simple string!", OK},
{"escaped single quote", `\'`, "", FAIL},