diff --git a/lexer/items.go b/lexer/items.go index 9e8ae7f..a6a86a4 100644 --- a/lexer/items.go +++ b/lexer/items.go @@ -7,12 +7,13 @@ type itemType int // Definition of all the lexer item types for the TOML lexer. const ( - ItemError itemType = iota // An error occurred - ItemEOF // End of input reached - ItemComment // Comment string, starts with # till en of line - ItemKey // Key of a key/value pair - ItemKeyDot // Dot for a dotted key - ItemString // A value of type string + ItemError itemType = iota // An error occurred + ItemEOF // End of input reached + ItemComment // Comment string, starts with # till end of line + ItemKey // Key of a key/value pair + ItemKeyDot // Dot for a dotted key + ItemAssignment // Value assignment coming up (=) + ItemString // A value of type string ) // Item represents a lexer item returned from the scanner. @@ -26,26 +27,26 @@ func (i Item) String() string { switch i.Type { case ItemEOF: return "EOF" - case ItemError: - return "Error: " + i.Value + case ItemKey: + return fmt.Sprintf("[%s]", i.Value) + case ItemKeyDot: + return "." + case ItemAssignment: + return "=" } - return fmt.Sprintf("%s(%q)", i.Type, i.Value) + return fmt.Sprintf("%s(%s)", i.Type, i.Value) } // String returns a string representation of the lexer item type. 
func (i itemType) String() string { switch i { case ItemError: - return "Error" + return "ERR" case ItemComment: - return "Comment" - case ItemKey: - return "Key" - case ItemKeyDot: - return "KeyDot" + return "#" case ItemString: - return "String" + return "STR" default: - return fmt.Sprintf("", i) + panic(fmt.Sprintf("No translation available for type id %d", i)) } } diff --git a/lexer/lexer.go b/lexer/lexer.go index c4d1ce2..b5c2256 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -12,7 +12,6 @@ type Lexer struct { input string // the scanned input string state stateFn // a function that handles the current state stack []stateFn // state function stack, for nested parsing - start int // start position of the currently scanned item pos int // current scanning position in the input width int // width of the last rune read, for supporting backup() buffer StringBuffer // an efficient buffer, used to build string values @@ -99,29 +98,44 @@ func (l *Lexer) popState() stateFn { return tail } -// TODO niet meer nodig? -// getAcceptedString returns the string as accepted by the -// accept* methods so far. -func (l *Lexer) getAcceptedString() string { - return l.input[l.start:l.pos] -} - -// emit passes a scanned item back to the client. -func (l *Lexer) emit(t itemType, v string) { - l.items <- Item{t, v} - l.start = l.pos -} - -// TODO niet meer nodig met stringbuilder? -// ignore skips over the pending input before the current position. -func (l *Lexer) ignore() { - l.start = l.pos -} - +// atEndOfFile returns true when there is no more data available in the input. func (l *Lexer) atEndOfFile() bool { return l.pos >= len(l.input) } +// emit passes a lexer item back to the client, including the provided string. +func (l *Lexer) emit(t itemType, s string) { + l.items <- Item{t, s} + l.buffer.Reset() +} + +// emitLiteral passes a lexer item back to the client, including the accumulated +// string buffer data as a literal string. 
+func (l *Lexer) emitLiteral(t itemType) { + l.emit(t, l.buffer.AsLiteralString()) +} + +// emitTrimmedLiteral passes a lexer item back to the client, including the +// accumulated string buffer data as a literal string with whitespace +// trimmed from it. +func (l *Lexer) emitTrimmedLiteral(t itemType) { + l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString())) +} + +// emitInterpreted passes a lexer item back to the client, including the +// accumulated string buffer data as an interpreted string (handling escape +// codes like \n, \t, \uXXXX, etc.) +// This method might return an error, in case there is data in the +// string buffer that is not valid for string interpretation. +func (l *Lexer) emitInterpreted(t itemType) error { + s, err := l.buffer.AsInterpretedString() + if err != nil { + return err + } + l.emit(t, s) + return nil +} + // backup steps back one rune // Can be called only once per call of next. func (l *Lexer) backup() { @@ -129,16 +143,119 @@ } // peek returns but does not advance to the next rune(s) in the input. -func (l *Lexer) peek() rune { - r := l.next() - l.backup() - return r +// Returns the rune, its width and a boolean. The boolean will be false in case +// no upcoming rune can be peeked (end of data or invalid UTF8 character). +func (l *Lexer) peek() (rune, int, bool) { + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + switch { + case r == utf8.RuneError: + return utf8.RuneError, w, false + default: + return r, w, true + } } -// TODO nog nodig met stringbuffer? -// accept consumes the next rune if it's from the valid set of runes. +// peekMulti takes a peek at multiple upcoming runes in the input. +// Returns a slice of runes and a boolean. The boolean will be false in case +// less upcoming runes can be peeked than the requested amount +// (end of data or invalid UTF8 character). 
+func (l *Lexer) peekMulti(amount int) ([]rune, bool) { + offset := 0 + var peeked []rune + for i := 0; i < amount; i++ { + r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:]) + switch { + case r == utf8.RuneError: + return peeked, false + default: + offset += w + peeked = append(peeked, r) + } + } + return peeked, true +} + +// acceptNext adds the next rune from the input to the string buffer. +// If no rune could be read (end of file or invalid UTF8 data), +// then false is returned. +func (l *Lexer) acceptNext() bool { + r := l.next() + if r == endOfFile || r == utf8.RuneError { + return false + } + l.buffer.WriteRune(r) + return true +} + +// acceptFrom adds the next rune from the input to the string buffer +// when it matches in the provided runes. If the next rune does +// not match, false is returned. +func (l *Lexer) acceptFrom(runes string) bool { + r := l.next() + if strings.IndexRune(runes, r) >= 0 { + l.buffer.WriteRune(r) + return true + } + l.backup() + return false +} + +// acceptRun adds consecutive runes from the input to the string +// buffer when they match the provided runes. If no runes were added +// at all, false is returned. +func (l *Lexer) acceptRun(runes string) bool { + accepted := false + for l.acceptFrom(runes) { + accepted = true + } + return accepted +} + +// TODO meh... ugly rune. +var endOfFile rune = -1 + +// next returns the next rune from the input. +func (l *Lexer) next() rune { + l.width = 0 + r, w := utf8.DecodeRuneInString(l.input[l.pos:]) + switch { + case r == utf8.RuneError && w == 0: + return endOfFile + case r == utf8.RuneError: + return utf8.RuneError + default: + l.width = w + l.pos += w + return r + } +} + +// skip skips a rune from the set of accepted runes. +// Returns true when a rune was skipped. 
+func (l *Lexer) skip(runes string) bool { + r, w, _ := l.peek() + if strings.IndexRune(runes, r) >= 0 { + l.pos += w + return true + } + return false +} + +// skipRun skips a run of runes from the set of accepted runes. +// Returns true when one or more runes were skipped. +func (l *Lexer) skipRun(runes string) bool { + didSkip := false + for l.skip(runes) { + didSkip = true + } + return didSkip +} + +// accept consumes the next rune and returns true if it's +// from the valid set of runes. Otherwise false is returned. func (l *Lexer) accept(runes string) bool { - if strings.IndexRune(runes, l.next()) >= 0 { + r := l.next() + if strings.IndexRune(runes, r) >= 0 { return true } l.backup() return false } @@ -187,34 +304,10 @@ func (l *Lexer) acceptWhile(runes string) bool { return accepted } -// skip skips a run of runes from the set of accepted runs. -func (l *Lexer) skip(runes string) { - if l.acceptWhile(runes) { - l.ignore() - } -} - // skipUntil skips a run of runes, until a rune from the set of // runes of EOF is reached. func (l *Lexer) skipUntil(runes string) { - if l.acceptUntil(runes) { - l.ignore() - } -} - -// TODO meh... ugly rune. -var endOfFile rune = -1 - -// next returns the next rune in the input. -func (l *Lexer) next() rune { - if l.atEndOfFile() { - l.width = 0 - return endOfFile // TODO phase out this bizarro rune? - } - r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - l.width = w - l.pos += w - return r + l.acceptUntil(runes) } // error returns an error token and terminates the scan @@ -227,15 +320,16 @@ func (l *Lexer) errorf(format string, args ...interface{}) stateFn { return nil } -func (l *Lexer) unexpectedTokenError(expected string) stateFn { +func (l *Lexer) unexpectedInputError(expected string) stateFn { var actual string switch { - case l.peek() == endOfFile: + case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring? 
actual = "end of file" - case !utf8.ValidString(l.input[l.start:]): + case !utf8.ValidString(l.input[l.pos:]): actual = "non-UTF8 data" default: - actual = fmt.Sprintf("token '%c'", l.peek()) + r, _, _ := l.peek() + actual = fmt.Sprintf("token '%c'", r) } return l.errorf("Unexpected %s (expected %s)", actual, expected) } diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go deleted file mode 100644 index 658060a..0000000 --- a/lexer/lexer_test.go +++ /dev/null @@ -1,175 +0,0 @@ -package lexer_test - -import ( - "fmt" - "testing" - - "github.com/mmakaay/toml/lexer" -) - -func TestInvalidUtf8Data(t *testing.T) { - assertFailureAndCheck(t, "\xbc", []string{}, "Unexpected non-UTF8 data (expected end of file)") -} - -func TestEmptyInput(t *testing.T) { - assertSuccessAndCheck(t, "", []string{}) -} -func TestWhiteSpace(t *testing.T) { - assertSuccessAndCheck(t, " ", []string{}) - assertSuccessAndCheck(t, "\t", []string{}) - assertSuccessAndCheck(t, " \t \t ", []string{}) -} -func TestWhiteSpaceAndNewlines(t *testing.T) { - assertSuccessAndCheck(t, "\n", []string{}) - assertSuccessAndCheck(t, "\n \t\r\n", []string{}) -} -func TestComments(t *testing.T) { - assertSuccessAndCheck(t, "#", []string{`Comment("#")`}) - assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`}) - assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`}) - assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`}) - assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n", []string{`Comment("# AAP")`}) - assertSuccessAndCheck(t, - "# two lines\n# of comments\n", - []string{`Comment("# two lines")`, `Comment("# of comments")`}) - assertSuccessAndCheck(t, - `# \tcomment\nwith escape-y chars`, - []string{`Comment("# \\tcomment\\nwith escape-y chars")`}) -} - -func TestBareKeyWithoutValue(t *testing.T) { - err := "Unexpected end of file (expected an '=' value assignment)" - assertFailureAndCheck(t, "a", []string{`Key("a")`}, err) - 
assertFailureAndCheck(t, "_", []string{`Key("_")`}, err) - assertFailureAndCheck(t, " a", []string{`Key("a")`}, err) - assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err) - assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err) - assertFailureAndCheck(t, "Ab", []string{`Key("Ab")`}, err) - assertFailureAndCheck(t, "Ab1", []string{`Key("Ab1")`}, err) - assertFailureAndCheck(t, "_Ab1", []string{`Key("_Ab1")`}, err) - assertFailureAndCheck(t, "_-Ab1", []string{`Key("_-Ab1")`}, err) - assertFailureAndCheck(t, "_-Ab1_this-is_GOOD987", []string{`Key("_-Ab1_this-is_GOOD987")`}, err) -} - -func TestDottedKey(t *testing.T) { - err := "Unexpected end of file (expected an '=' value assignment)" - assertFailureAndCheck(t, "a.b", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err) - assertFailureAndCheck(t, " a .\t\t b\t ", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err) -} - -func TestKeyWithAssignmentButNoValue(t *testing.T) { - err := "Unexpected end of file (expected a value)" - assertFailureAndCheck(t, " some_cool_key = ", []string{`Key("some_cool_key")`}, err) -} - -func TestUnterminatedBasicString(t *testing.T) { - assertFailure(t, `key="value`, "Unexpected end of file (expected basic string token)") -} - -func TestBasicStringWithNewline(t *testing.T) { - assertFailure(t, "key=\"value\nwith\nnewlines\"", "ohoh") -} - -func TestEmptyBasicString(t *testing.T) { - assertSuccessAndCheck(t, `a=""`, []string{`Key("a")`, `String("")`}) - assertSuccessAndCheck(t, `a=""#hi`, []string{`Key("a")`, `String("")`, `Comment("#hi")`}) - assertSuccessAndCheck(t, `a = ""`, []string{`Key("a")`, `String("")`}) - assertSuccessAndCheck(t, `a.b = ""`, []string{`Key("a")`, `KeyDot(".")`, `Key("b")`, `String("")`}) - assertSuccessAndCheck(t, `a=""b=""`, []string{`Key("a")`, `String("")`, `Key("b")`, `String("")`}) -} -func TestBasicString(t *testing.T) { - assertSuccessAndCheck(t, `_ = "b"`, - []string{ - `Key("_")`, - `String("b")`}) - assertSuccessAndCheck(t, `thing = "A 
cool ʎǝʞ" # huh, it's up-side down!!`, - []string{ - `Key("thing")`, - `String("A cool ʎǝʞ")`, - `Comment("# huh, it's up-side down!!")`}) -} - -func TestInvalidEscapeSequence(t *testing.T) { - assertFailure(t, `a="\x"`, `Invalid escape sequence \x in string value`) -} -func TestBasicStringEscapes(t *testing.T) { - for in, out := range map[string]string{ - `\b`: "\b", - `\t`: "\t", - `\n`: "\n", - `\f`: "\f", - `\r`: "\r", - `\"`: "\"", - `\b\t\nhuh\f\r\"`: "\b\t\nhuh\f\r\"", - `\u2318`: "⌘", - `\U0001014D`: "𐅍", - } { - l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in)) - if out != l[1].Value { - t.Fatalf("Unexpected result when parsing '%s'\nexpected: %q\nactual: %q", in, out, l[1].Value) - } - } -} - -// func TestBasicStringUnicodeEscapes(t *testing.T) { -// for in, out := range map[string]string{ -// `\u`: "\b", -// } { -// l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in)) -// s := l[2] -// if out != s.Value { -// t.Fatalf("Unexpected result when parsing '%s'", in) -// } -// } -// } - -func TestTwoKeyValuePairs(t *testing.T) { - assertSuccessAndCheck(t, "a=\"Hello\" #comment1\nb=\"World!\"#comment2\r\n", - []string{ - `Key("a")`, - `String("Hello")`, - `Comment("#comment1")`, - `Key("b")`, - `String("World!")`, - `Comment("#comment2")`}) -} - -func assertSuccessAndCheck(t *testing.T, input string, expected []string) { - l := assertSuccess(t, input) - assertItems(t, l, expected) -} - -func assertFailureAndCheck(t *testing.T, input string, expected []string, expectedErr string) { - l := assertFailure(t, input, expectedErr) - assertItems(t, l, expected) -} - -func assertFailure(t *testing.T, input string, expectedErr string) []lexer.Item { - l, err := lexer.Lex(input).ToArray() - if err == nil { - t.Fatalf("Expected lexer error '%s', but no error occurred", expectedErr) - } - if err.Error() != expectedErr { - t.Fatalf("Mismatch between expected and actual error:\nExpected: %s\nActual: %s\n", expectedErr, err) - } - return l -} - -func assertSuccess(t *testing.T, 
input string) []lexer.Item { - l, err := lexer.Lex(input).ToArray() - if err != nil { - t.Fatalf("Unexpected lexer error: %s", err) - } - return l -} - -func assertItems(t *testing.T, l []lexer.Item, expected []string) { - if len(expected) != len(l) { - t.Fatalf("Unexpected number of lexer items: %d (expected: %d)", len(l), len(expected)) - } - for i, e := range expected { - if l[i].String() != e { - t.Fatalf("Unexpected lexer item at index %d: %s (expected: %s)", i, l[i], e) - } - } -} diff --git a/lexer/states.go b/lexer/states.go index 4eb335a..ea686bd 100644 --- a/lexer/states.go +++ b/lexer/states.go @@ -1,33 +1,35 @@ package lexer -// stateFn represents the state of the scanner as a function +// stateFn represents the state of the lexer as a function // that returns the next state. type stateFn func(*Lexer) stateFn const ( - whitespace string = " \t" - carriageReturn string = "\r" - newline string = "\n" - hash string = "#" - equal string = "=" - lower string = "abcdefghijklmnopqrstuvwxyz" - upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - digits string = "0123456789" - dot string = "." - underscore string = "_" - dash string = "-" - singleQuote string = "'" - doubleQuote string = "\"" - backslash string = "\\" - someQuote string = singleQuote + doubleQuote - bareKey string = lower + upper + digits + underscore + dash - startOfKey string = bareKey + someQuote - quotable string = `btnfr\"` + whitespace string = " \t" + carriageReturn string = "\r" + newline string = "\n" + hash string = "#" + equal string = "=" + lower string = "abcdefghijklmnopqrstuvwxyz" + upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + digits string = "0123456789" + dot string = "." 
+ underscore string = "_" + dash string = "-" + singleQuote string = "'" + doubleQuote string = "\"" + backslash string = "\\" + quoteChars string = singleQuote + doubleQuote + bareKeyChars string = lower + upper + digits + underscore + dash + startOfKey string = bareKeyChars + quoteChars + escapeChars string = `btnfr"\` + shortUtf8Escape string = "u" + longUtf8Escape string = "U" ) func stateKeyValuePair(l *Lexer) stateFn { - l.skip(whitespace + carriageReturn + newline) - if l.upcoming(hash) { + l.skipRun(whitespace + carriageReturn + newline) + if l.skip(hash) { return stateComment } if l.upcoming(startOfKey) { @@ -38,36 +40,34 @@ func stateKeyValuePair(l *Lexer) stateFn { // A '#' hash symbol marks the rest of the line as a comment. func stateComment(l *Lexer) stateFn { - l.buffer.Reset() for { switch { - case l.atEndOfFile() || l.accept(newline): - s := l.buffer.AsLiteralString() - l.emit(ItemComment, s) + case l.atEndOfFile() || l.skip(newline): + l.emitTrimmedLiteral(ItemComment) return stateKeyValuePair - case l.accept(carriageReturn): - l.ignore() default: - l.buffer.WriteRune(l.next()) + if !l.acceptNext() { + return nil + } } } } // A key may be either bare, quoted or dotted. func stateKey(l *Lexer) stateFn { - if l.upcoming(bareKey) { - return stateBareKey + if l.acceptFrom(bareKeyChars) { + return statebareKeyChars } - return l.unexpectedTokenError("a valid key name") + return l.unexpectedInputError("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. 
-func stateBareKey(l *Lexer) stateFn { - l.acceptWhile(bareKey) - l.emit(ItemKey, l.getAcceptedString()) +func statebareKeyChars(l *Lexer) stateFn { + l.acceptRun(bareKeyChars) + l.emitLiteral(ItemKey) return stateEndOfKeyOrKeyDot } @@ -76,10 +76,10 @@ func stateBareKey(l *Lexer) stateFn { func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. - l.skip(whitespace) - if l.accept(dot) { - l.emit(ItemKeyDot, ".") - l.skip(whitespace) + l.skipRun(whitespace) + if l.skip(dot) { + l.emit(ItemKeyDot, "") + l.skipRun(whitespace) return stateKey } return stateKeyAssignment @@ -90,70 +90,57 @@ func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { // sign, and value must be on the same line (though some values can // be broken over multiple lines). func stateKeyAssignment(l *Lexer) stateFn { - l.skip(whitespace) - if l.accept(equal) { - l.skip(whitespace) + l.skipRun(whitespace) + if l.skip(equal) { + l.emit(ItemAssignment, "") + l.skipRun(whitespace) return stateValue } - return l.unexpectedTokenError("an '=' value assignment") + return l.unexpectedInputError("a value assignment") } +// Values must be of the following types: String, Integer, Float, Boolean, +// Datetime, Array, or Inline Table. Unspecified values are invalid. func stateValue(l *Lexer) stateFn { - l.skip(whitespace) - if l.upcoming(someQuote) { + l.skipRun(whitespace) + if l.upcoming(quoteChars) { return stateStringValue } - return l.unexpectedTokenError("a value") + return l.unexpectedInputError("a value") } // There are four ways to express strings: basic, multi-line basic, literal, // and multi-line literal. All strings must contain only valid UTF-8 characters. func stateStringValue(l *Lexer) stateFn { - if l.accept(doubleQuote) { + // Basic strings are surrounded by quotation marks. 
+ if l.skip(doubleQuote) { return stateBasicStringValue } - return l.unexpectedTokenError("a string value") + return l.unexpectedInputError("a string value") } -// Basic strings are surrounded by quotation marks. Any Unicode character -// may be used except those that must be escaped: quotation mark, backslash, -// and the control characters (U+0000 to U+001F, U+007F). -// -// For convenience, some popular characters have a compact escape sequence. -// -// \b - backspace (U+0008) -// \t - tab (U+0009) -// \n - linefeed (U+000A) -// \f - form feed (U+000C) -// \r - carriage return (U+000D) -// \" - quote (U+0022) -// \\ - backslash (U+005C) -// \uXXXX - unicode (U+XXXX) -// \UXXXXXXXX - unicode (U+XXXXXXXX) -// -// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms. -// The escape codes must be valid Unicode scalar values. -// -// All other escape sequences not listed above are reserved and, -// if used, TOML should produce an error. func stateBasicStringValue(l *Lexer) stateFn { // Possibly a """ multi-line string start, // possibly the end of an "" empty string. - if l.accept(doubleQuote) { + if l.skip(doubleQuote) { // It's a """ multi-line string. - if l.accept(doubleQuote) { - l.ignore() + if l.skip(doubleQuote) { return stateMultiLineBasicString } // It's an "" empty string. 
- l.ignore() l.emit(ItemString, "") return stateKeyValuePair } - l.ignore() return stateBasicString } +const invalidBasicStringCharacters string = "" + + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + + "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + + "\u007F" + func stateParseBasicString(l *Lexer) stateFn { for { switch { @@ -162,26 +149,47 @@ func stateParseBasicString(l *Lexer) stateFn { case l.accept(doubleQuote): return l.popState() case l.accept(backslash): - if l.upcoming(quotable) { + // For convenience, some popular characters have a compact escape sequence. + // Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms. + // The escape codes must be valid Unicode scalar values. + switch { + case l.upcoming(escapeChars): + // \b - backspace (U+0008) + // \t - tab (U+0009) + // \n - linefeed (U+000A) + // \f - form feed (U+000C) + // \r - carriage return (U+000D) + // \" - quote (U+0022) + // \\ - backslash (U+005C) l.buffer.WriteRune('\\') l.buffer.WriteRune(l.next()) - } else { + case l.upcoming(shortUtf8Escape): + // \uXXXX - unicode (U+XXXX) + return l.errorf("Not yet implemented: short utf8") + case l.upcoming(longUtf8Escape): + // \UXXXXXXXX - unicode (U+XXXXXXXX) + return l.errorf("Not yet implemented: long utf8") + default: + // All other escape sequences not listed above are reserved and, + // if used, TOML should produce an error. return l.errorf("Invalid escape sequence \\%c in string value", l.next()) } + case l.upcoming(invalidBasicStringCharacters): + // Any Unicode character may be used except those that must be escaped: + // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). 
+ return l.errorf("Invalid character in basic string: %q", l.next()) default: - l.buffer.WriteRune(l.next()) + l.acceptNext() } } } func stateBasicString(l *Lexer) stateFn { - l.buffer.Reset() l.pushState(func(l *Lexer) stateFn { - s, err := l.buffer.AsInterpretedString() + err := l.emitInterpreted(ItemString) if err != nil { return l.errorf("Invalid data in string: %s", err) } - l.emit(ItemString, s) return stateKeyValuePair }) return stateParseBasicString @@ -192,10 +200,9 @@ func stateMultiLineBasicString(l *Lexer) stateFn { } func stateEndOfFile(l *Lexer) stateFn { - i := l.peek() - if i == endOfFile { + if l.atEndOfFile() { l.emit(ItemEOF, "EOF") return nil } - return l.unexpectedTokenError("end of file") + return l.unexpectedInputError("end of file") } diff --git a/lexer/states_test.go b/lexer/states_test.go new file mode 100644 index 0000000..320207c --- /dev/null +++ b/lexer/states_test.go @@ -0,0 +1,174 @@ +package lexer_test + +import ( + "fmt" + "strings" + "testing" + + "github.com/mmakaay/toml/lexer" +) + +func TestInvalidUtf8Data(t *testing.T) { + runStatesT(t, statesT{ + "invalid UTF8 data", "\xbc", "", + "Unexpected non-UTF8 data (expected end of file)"}) +} + +func TestEmptyInput(t *testing.T) { + runStatesT(t, statesT{"empty string", "", "", ""}) +} + +func TestWhiteSpaceAndNewlines(t *testing.T) { + runStatesTs(t, []statesT{ + {"space", " ", "", ""}, + {"tab", "\t", "", ""}, + {"newline", "\n", "", ""}, + {"carriage return", "\r", "", ""}, + {"all whitespace and newlines", " \t \t \r\r\n\n \n \t", "", ""}, + }) +} + +func TestComments(t *testing.T) { + runStatesTs(t, []statesT{ + {"empty comment", "#", "#()", ""}, + {"empty comment with spaces", "# \t \r\n", `#()`, ""}, + {"basic comment", "#chicken", "#(chicken)", ""}, + {"basic comment starting after whitespace", "# \tchicken", "#(chicken)", ""}, + {"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""}, + {"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""}, + 
{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""}, + {"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""}, + }) +} + +func TestKeyWithoutAssignment(t *testing.T) { + err := "Unexpected end of file (expected a value assignment)" + runStatesTs(t, []statesT{ + {"bare with whitespace", " a ", []string{"[a]"}, err}, + {"bare lower", "abcdefghijklmnopqrstuvwxyz", []string{"[abcdefghijklmnopqrstuvwxyz]"}, err}, + {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", []string{"[ABCDEFGHIJKLMNOPQRSTUVWXYZ]"}, err}, + {"bare numbers", "0123456789", []string{"[0123456789]"}, err}, + {"bare underscore", "_", []string{"[_]"}, err}, + {"bare dash", "-", []string{"[-]"}, err}, + {"bare big mix", "-hey_good_Lookin123-", []string{"[-hey_good_Lookin123-]"}, err}, + {"bare dotted", "a._.c", []string{"[a]", ".", "[_]", ".", "[c]"}, err}, + {"bare dotted with whitespace", " a .\t\t b\t ", []string{"[a]", ".", "[b]"}, err}, + }) +} + +func TestKeyWithAssignmentButNoValue(t *testing.T) { + err := "Unexpected end of file (expected a value)" + runStatesTs(t, []statesT{ + {"bare", "a=", "[a]=", err}, + {"double equal sign", "a==", "[a]=", "Unexpected token '=' (expected a value)"}, + {"bare dotted", "a.b=", "[a].[b]=", err}, + {"bare dotted with whitespace", " a .\tb\t = ", "[a].[b]=", err}, + }) +} + +func TestUnterminatedBasicString(t *testing.T) { + runStatesT(t, statesT{ + "missing closing quote", `a="value`, "[a]=", + "Unexpected end of file (expected basic string token)"}) +} + +func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { + runStatesTs(t, []statesT{ + {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00'`}, + {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n'`}, + {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f'`}, + }) + + // No need to write all test cases 
for disallowed characters by hand. + for i := 0x00; i <= 0x1F; i++ { + name := fmt.Sprintf("control character %x", rune(i)) + runStatesT( + t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=", + fmt.Sprintf(`Invalid character in basic string: %q`, rune(i))}) + } +} + +func TestEmptyBasicString(t *testing.T) { + runStatesTs(t, []statesT{ + {"empty", `a=""`, "[a]=STR()", ""}, + {"with comment", `a="" #cool`, "[a]=STR()#(cool)", ""}, + {"with whitespaces", ` a = "" `, "[a]=STR()", ""}, + {"dotted", ` a.b = "" `, "[a].[b]=STR()", ""}, + {"multiple same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""}, + {"multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""}, + }) +} + +func TestBasicString(t *testing.T) { + runStatesTs(t, []statesT{ + {"ascii value", `_ = "Nothing fancy!"`, "[_]=STR(Nothing fancy!)", ""}, + {"UTF8 value", `_ = "A cool ƃuıɹʇs" # what!?`, "[_]=STR(A cool ƃuıɹʇs)#(what!?)", ""}, + }) +} + +func TestBasicStringWithInvalidEscapeSequence(t *testing.T) { + runStatesT(t, statesT{ + "invalid escape sequence", `a="\x"`, "[a]=", `Invalid escape sequence \x in string value`, + }) +} + +func TestBasicStringEscapes(t *testing.T) { + runStatesTs(t, []statesT{ + {"bell escape", `_="\b"`, "[_]=STR(\b)", ""}, + {"tab escape", `_="\t"`, "[_]=STR(\t)", ""}, + {"newline escape", `_="\n"`, "[_]=STR(\n)", ""}, + {"form feed escape", `_="\f"`, "[_]=STR(\f)", ""}, + {"carriage return escape", `_="\r"`, "[_]=STR(\r)", ""}, + {"double quote escape", `_="\""`, `[_]=STR(")`, ""}, + {"backslash escape", `_="\\"`, `[_]=STR(\)`, ""}, + {"mix of escapes", `_="\b\t\nhuh\f\r\""`, "[_]=STR(\b\t\nhuh\f\r\")", ""}, + {"UTF8 escape short", `_="\u2318"`, "[_]=STR(⌘)", ""}, + {"UTF8 escape long", `_="\U0001014D"`, "[_]=STR(𐅍)", ""}, + }) +} + +type statesT struct { + name string + in string + out interface{} + err string +} + +func runStatesTs(t *testing.T, tests []statesT) { + for _, c := range tests { + runStatesT(t, c) + } +} + +func runStatesT(t *testing.T, c statesT) { + l, 
err := lexer.Lex(c.in).ToArray() + if err == nil && c.err != "" { + t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err) + } + if err != nil && c.err == "" { + t.Errorf("[%s] Expected no error, but got error '%s'", c.name, err) + } + if err != nil && c.err != "" && err.Error() != c.err { + t.Errorf("[%s] Got an unexpected error:\nexpected: %s\nactual: %s\n", c.name, c.err, err) + } + switch expected := c.out.(type) { + case []string: + if len(expected) != len(l) { + t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l)) + } + for i, e := range expected { + if l[i].String() != e { + t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, l[i]) + } + } + case string: + a := make([]string, len(l)) + for _, v := range l { + a = append(a, v.String()) + } + actual := strings.Join(a, "") + if actual != expected { + t.Errorf("[%s] Unexpected lexer output:\nexpected; %s\nactual: %s\n", c.name, expected, actual) + } + } +} diff --git a/lexer/stringbuf.go b/lexer/stringbuf.go index 7c810a1..69030ce 100644 --- a/lexer/stringbuf.go +++ b/lexer/stringbuf.go @@ -19,7 +19,7 @@ func (b *StringBuffer) Reset() *StringBuffer { return b } -// AddString adds the runes of the input string to the string buffer. +// WriteString adds the runes of the input string to the string buffer. 
func (b *StringBuffer) WriteString(s string) *StringBuffer { for _, r := range s { b.WriteRune(r) diff --git a/lexer/stringbuf_test.go b/lexer/stringbuf_test.go index f3caa04..41e59d1 100644 --- a/lexer/stringbuf_test.go +++ b/lexer/stringbuf_test.go @@ -23,7 +23,7 @@ func TestResetResetsBuffer(t *testing.T) { } } -type testCase struct { +type stringbufT struct { name string in string out string @@ -37,7 +37,7 @@ const ( func TestAsLiteralString(t *testing.T) { b := lexer.StringBuffer{} - for _, c := range []testCase{ + for _, c := range []stringbufT{ {"empty string", ``, ``, OK}, {"simple string", `Simple string!`, `Simple string!`, OK}, {"single quote", `'`, `'`, OK}, @@ -57,7 +57,7 @@ func TestAsLiteralString(t *testing.T) { func TestAsInterpretedString(t *testing.T) { b := lexer.StringBuffer{} - for _, c := range []testCase{ + for _, c := range []stringbufT{ {"empty string", "", "", OK}, {"one character", "Simple string!", "Simple string!", OK}, {"escaped single quote", `\'`, "", FAIL},