From f6efd34b31e7cd0606e4db7fc8348dbd4fa67773 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 15 May 2019 09:00:35 +0000
Subject: [PATCH] Initial import, work in progress.

---
 go.mod              |   3 +
 lexer/items.go      |  54 ++++++++++
 lexer/lexer.go      | 244 ++++++++++++++++++++++++++++++++++++++++++++
 lexer/lexer_test.go | 164 +++++++++++++++++++++++++++++
 lexer/states.go     | 219 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 684 insertions(+)
 create mode 100644 go.mod
 create mode 100644 lexer/items.go
 create mode 100644 lexer/lexer.go
 create mode 100644 lexer/lexer_test.go
 create mode 100644 lexer/states.go

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..bb4a415
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/mmakaay/toml
+
+go 1.12
diff --git a/lexer/items.go b/lexer/items.go
new file mode 100644
index 0000000..d7164d9
--- /dev/null
+++ b/lexer/items.go
@@ -0,0 +1,54 @@
+package lexer
+
+import "fmt"
+
+// itemType represents the type of lexer items.
+type itemType int
+
+// Definition of all the lexer item types for the TOML lexer.
+const (
+	ItemError              itemType = iota // An error occurred
+	ItemEOF                                // End of input reached
+	ItemComment                            // Comment string, starts with # till end of line
+	ItemKey                                // Key of a key/value pair
+	ItemKeyDot                             // Dot for a dotted key
+	ItemKeyValueAssignment                 // Equal sign for a key/value pair assignment
+	ItemStringValue                        // A value of type string
+)
+
+// Item represents a lexer item returned from the scanner.
+type Item struct {
+	Type  itemType // Type of the item, e.g. ItemComment, ItemKey
+	Value string   // Value of the item, e.g. "10.42", "["
+}
+
+// String returns a string representation of the lexer item.
+func (i Item) String() string {
+	switch i.Type {
+	case ItemEOF:
+		return "EOF"
+	case ItemError:
+		return "Error: " + i.Value
+	}
+	return fmt.Sprintf("%s(%q)", i.Type, i.Value)
+}
+
+// String returns a string representation of the lexer item type.
+func (i itemType) String() string {
+	switch i {
+	case ItemError:
+		return "Error"
+	case ItemComment:
+		return "Comment"
+	case ItemKey:
+		return "Key"
+	case ItemKeyDot:
+		return "KeyDot"
+	case ItemKeyValueAssignment:
+		return "Assignment"
+	case ItemStringValue:
+		return "StringValue"
+	default:
+		// Fallback for item types without an explicit name; the original
+		// fmt.Sprintf("", i) formatted nothing and dropped the argument
+		// (a go vet error), making unknown types indistinguishable.
+		return fmt.Sprintf("<unknown itemType %d>", int(i))
+	}
+}
diff --git a/lexer/lexer.go b/lexer/lexer.go
new file mode 100644
index 0000000..0f39ba5
--- /dev/null
+++ b/lexer/lexer.go
@@ -0,0 +1,244 @@
+package lexer
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+	"unicode/utf8"
+)
+
+// Lexer holds the state of the scanner.
+type Lexer struct {
+	input    string          // the scanned input string
+	state    stateFn         // the current state
+	stack    []stateFn       // state stack, for nested parsing
+	start    int             // start position of the currently scanned item
+	pos      int             // current scanning position in the input
+	width    int             // width of the last rune read
+	strValue strings.Builder // used to build string values
+	items    chan Item       // channel of scanned items
+	nextItem Item            // the current item as reached by Next() and retrieved by Get()
+	err      error           // an error message when lexing failed, retrieved by Error()
+}
+
+// Lex takes an input string and initializes the TOML lexer for it.
+// Usage:
+//
+//     l := lexer.Lex("...inputstring...")
+//     for l.Next() {
+//         item := l.Get()
+//         ... handle item ...
+//     }
+//     if e := l.Error(); e != nil {
+//         ... handle error message ...
+//     }
+func Lex(input string) *Lexer {
+	return &Lexer{
+		input: input,
+		state: stateKeyValuePair,
+		items: make(chan Item, 2),
+	}
+}
+
+// Next advances to the next lexer item in the input string.
+// When a next item was found, then true is returned.
+// On error or reaching the end of the input, false is returned.
+func (l *Lexer) Next() bool {
+	if l.state == nil {
+		panic("This should not happen: nil state reached, but entering Next()")
+	}
+	for {
+		select {
+		case i := <-l.items:
+			if i.Type == ItemEOF {
+				return false
+			}
+			if i.Type == ItemError {
+				l.err = errors.New(i.Value)
+				return false
+			}
+			l.nextItem = i
+			return true
+		default:
+			// No item buffered yet: run the state machine one step more.
+			l.state = l.state(l)
+		}
+	}
+}
+
+// Error returns the error that the lexer ran into, or nil when
+// no error occurred during scanning.
+func (l *Lexer) Error() error {
+	return l.err
+}
+
+// ToArray returns lexer items as an array.
+// When an error occurs during scanning, a partial result will be
+// returned, accompanied by the error that occurred.
+func (l *Lexer) ToArray() ([]Item, error) {
+	var items []Item
+	for l.Next() {
+		items = append(items, l.Get())
+	}
+	return items, l.Error()
+}
+
+// Get returns the next lexer item, as reached by Next()
+func (l *Lexer) Get() Item {
+	return l.nextItem
+}
+
+// pushState adds the state function to its stack.
+// This is used for implementing nested parsing.
+func (l *Lexer) pushState(state stateFn) {
+	l.stack = append(l.stack, state)
+}
+
+// popState pops the last pushed state from its stack.
+// NOTE(review): panics (index out of range) on an empty stack; callers
+// must balance every popState with an earlier pushState.
+func (l *Lexer) popState() stateFn {
+	last := len(l.stack) - 1
+	head, tail := l.stack[:last], l.stack[last]
+	l.stack = head
+	return tail
+}
+
+// getAcceptedString returns the string as accepted by the
+// accept* methods so far.
+func (l *Lexer) getAcceptedString() string {
+	return l.input[l.start:l.pos]
+}
+
+// emit passes a scanned item back to the client.
+func (l *Lexer) emit(t itemType, v string) {
+	l.items <- Item{t, v}
+	l.start = l.pos
+}
+
+// ignore skips over the pending input before the current position.
+func (l *Lexer) ignore() {
+	l.start = l.pos
+}
+
+// backup steps back one rune.
+// Can be called only once per call of next.
+func (l *Lexer) backup() {
+	l.pos -= l.width
+}
+
+// peek returns but does not advance to the next rune(s) in the input.
+func (l *Lexer) peek() rune {
+	r := l.next()
+	l.backup()
+	return r
+}
+
+// accept consumes the next rune if it's from the valid set of runes.
+func (l *Lexer) accept(runes string) bool {
+	if strings.IndexRune(runes, l.next()) >= 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+// upcoming tells whether the next rune is from the set of runes,
+// without consuming any input.
+func (l *Lexer) upcoming(runes string) bool {
+	if l.accept(runes) {
+		l.backup()
+		return true
+	}
+	return false
+}
+
+// acceptNot consumes the next rune if it's not from the set of runes.
+func (l *Lexer) acceptNot(runes string) bool {
+	r := l.next()
+	if r == endOfFile {
+		l.backup()
+		return false
+	}
+	if strings.IndexRune(runes, r) < 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+// acceptUntil consumes a run of runes until ones from the
+// valid set is encountered.
+func (l *Lexer) acceptUntil(runes string) bool {
+	accepted := false
+	for l.acceptNot(runes) {
+		accepted = true
+	}
+	return accepted
+}
+
+// acceptWhile consumes a run of runes from the set of accepted runes.
+func (l *Lexer) acceptWhile(runes string) bool {
+	accepted := false
+	for l.accept(runes) {
+		accepted = true
+	}
+	return accepted
+}
+
+// skip skips a run of runes from the set of accepted runes.
+func (l *Lexer) skip(runes string) {
+	if l.acceptWhile(runes) {
+		l.ignore()
+	}
+}
+
+// skipUntil skips a run of runes, until a rune from the set of
+// runes or EOF is reached.
+func (l *Lexer) skipUntil(runes string) {
+	if l.acceptUntil(runes) {
+		l.ignore()
+	}
+}
+
+// newString resets the string builder, to start building a new string value.
+func (l *Lexer) newString() {
+	l.strValue.Reset()
+}
+
+// addToString adds a rune to the string value that is being built.
+func (l *Lexer) addToString(r rune) {
+	l.strValue.WriteRune(r)
+}
+
+// getString returns the string value as built up to this point.
+func (l *Lexer) getString() string {
+	return l.strValue.String()
+}
+
+// endOfFile is the sentinel rune that next() returns when the input is exhausted.
+var endOfFile rune = -1
+
+// next returns the next rune in the input.
+func (l *Lexer) next() rune {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return endOfFile
+	}
+	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
+	l.width = w
+	l.pos += w
+	return r
+}
+
+// errorf emits an error item and terminates the scan by
+// returning a nil state function.
+func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
+	l.items <- Item{
+		ItemError,
+		fmt.Sprintf(format, args...),
+	}
+	return nil
+}
+
+// unexpectedTokenError emits an error item that describes what token
+// was found at the current input position, and what was expected instead.
+func (l *Lexer) unexpectedTokenError(expected string) stateFn {
+	var actual string
+	switch {
+	case l.peek() == endOfFile:
+		actual = "end of file"
+	case !utf8.ValidString(l.input[l.start:]):
+		actual = "non-UTF8 data"
+	default:
+		actual = fmt.Sprintf("token '%c'", l.peek())
+	}
+	return l.errorf("Unexpected %s (expected %s)", actual, expected)
+}
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
new file mode 100644
index 0000000..dbf2b26
--- /dev/null
+++ b/lexer/lexer_test.go
@@ -0,0 +1,164 @@
+package lexer_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/mmakaay/toml/lexer"
+)
+
+func TestInvalidUtf8Data(t *testing.T) {
+	assertFailureAndCheck(t, "\xbc", []string{}, "Unexpected non-UTF8 data (expected end of file)")
+}
+
+func TestEmptyInput(t *testing.T) {
+	assertSuccessAndCheck(t, "", []string{})
+}
+func TestWhiteSpace(t *testing.T) {
+	assertSuccessAndCheck(t, " ", []string{})
+	assertSuccessAndCheck(t, "\t", []string{})
+	assertSuccessAndCheck(t, " \t \t ", []string{})
+}
+func TestWhiteSpaceAndNewlines(t *testing.T) {
+	assertSuccessAndCheck(t, "\n", []string{})
+	assertSuccessAndCheck(t, "\n \t\r\n", []string{})
+}
+func TestWhitespacePlusComment(t *testing.T) {
+	assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
+	assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
+	assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
+	assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
+	assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n",
+		[]string{`Comment("# AAP")`})
+	assertSuccessAndCheck(t, "# two lines\n# of comments\n",
+		[]string{`Comment("# two lines")`, `Comment("# of comments")`})
+}
+
+func TestBareKeyWithoutValue(t *testing.T) {
+	err := "Unexpected end of file (expected an '=' value assignment)"
+	// BUGFIX(review): the first input here was "=", which can never yield
+	// Key("a") nor this error message; the single-letter key "a" was meant,
+	// matching the " a" / " a " cases below.
+	assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
+	assertFailureAndCheck(t, "Ab", []string{`Key("Ab")`}, err)
+	assertFailureAndCheck(t, "Ab1", []string{`Key("Ab1")`}, err)
+	assertFailureAndCheck(t, "_Ab1", []string{`Key("_Ab1")`}, err)
+	assertFailureAndCheck(t, "_-Ab1", []string{`Key("_-Ab1")`}, err)
+	assertFailureAndCheck(t, "_-Ab1_this-is_GOOD987", []string{`Key("_-Ab1_this-is_GOOD987")`}, err)
+}
+
+func TestDottedKey(t *testing.T) {
+	err := "Unexpected end of file (expected an '=' value assignment)"
+	assertFailureAndCheck(t, "a.b", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
+	assertFailureAndCheck(t, " a .\t\t b\t ", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
+}
+
+func TestKeyWithAssignmentButNoValue(t *testing.T) {
+	err := "Unexpected end of file (expected a value)"
+	assertFailureAndCheck(t, " some_cool_key = ", []string{`Key("some_cool_key")`, `Assignment("=")`}, err)
+}
+
+func TestEmptyBasicStringValue(t *testing.T) {
+	assertSuccessAndCheck(t, `a=""`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`})
+	assertSuccessAndCheck(t, `a=""#hi`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`, `Comment("#hi")`})
+	assertSuccessAndCheck(t, `a = ""`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`})
+	assertSuccessAndCheck(t, `a.b = ""`, []string{`Key("a")`, `KeyDot(".")`, `Key("b")`, `Assignment("=")`, `StringValue("")`})
+}
+func TestBasicStringValue(t *testing.T) {
+	assertSuccessAndCheck(t, `_ = "b"`,
+		[]string{
+			`Key("_")`,
+			`Assignment("=")`,
+			`StringValue("b")`})
+	assertSuccessAndCheck(t, `thing = "A cool ʎǝʞ" # huh, it's up-side down!!`,
+		[]string{
+			`Key("thing")`,
+			`Assignment("=")`,
+			`StringValue("A cool ʎǝʞ")`,
+			`Comment("# huh, it's up-side down!!")`})
+}
+
+func TestInvalidEscapeSequence(t *testing.T) {
+	assertFailure(t, `a="\x"`, `Invalid escape sequence \x in string value`)
+}
+func TestBasicStringValueEscapes(t *testing.T) {
+	for in, out := range map[string]string{
+		`\b`:           "\b",
+		`\t`:           "\t",
+		`\n`:           "\n",
+		`\f`:           "\f",
+		`\r`:           "\r",
+		`\"`:           "\"",
+		`\b\t\n\f\r\"`: "\b\t\n\f\r\"",
+	} {
+		l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
+		s := l[2]
+		if out != s.Value {
+			t.Fatalf("Unexpected result when parsing '%s'", in)
+		}
+	}
+}
+
+// func TestBasicStringUnicodeEscapes(t *testing.T) {
+// 	for in, out := range map[string]string{
+// 		`\u`: "\b",
+// 	} {
+// 		l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
+// 		s := l[2]
+// 		if out != s.Value {
+// 			t.Fatalf("Unexpected result when parsing '%s'", in)
+// 		}
+// 	}
+// }
+
+func TestTwoKeyValuePairs(t *testing.T) {
+	assertSuccessAndCheck(t, "a=\"Hello\" #comment1\nb=\"World!\"#comment2\r\n",
+		[]string{
+			`Key("a")`,
+			`Assignment("=")`,
+			`StringValue("Hello")`,
+			`Comment("#comment1")`,
+			`Key("b")`,
+			`Assignment("=")`,
+			`StringValue("World!")`,
+			`Comment("#comment2")`})
+}
+
+// assertSuccessAndCheck lexes the input, requires success and
+// compares the resulting items against the expected ones.
+func assertSuccessAndCheck(t *testing.T, input string, expected []string) {
+	l := assertSuccess(t, input)
+	assertItems(t, l, expected)
+}
+
+// assertFailureAndCheck lexes the input, requires the given error and
+// compares the partial result against the expected items.
+func assertFailureAndCheck(t *testing.T, input string, expected []string, expectedErr string) {
+	l := assertFailure(t, input, expectedErr)
+	assertItems(t, l, expected)
+}
+
+func assertFailure(t *testing.T, input string, expectedErr string) []lexer.Item {
+	l, err := lexer.Lex(input).ToArray()
+	if err == nil {
+		t.Fatalf("Expected lexer error '%s', but no error occurred", expectedErr)
+	}
+	if err.Error() != expectedErr {
+		t.Fatalf("Mismatch between expected and actual error:\nExpected: %s\nActual: %s\n", expectedErr, err)
+	}
+	return l
+}
+
+func assertSuccess(t *testing.T, input string) []lexer.Item {
+	l, err := lexer.Lex(input).ToArray()
+	if err != nil {
+		t.Fatalf("Unexpected lexer error: %s", err)
+	}
+	return l
+}
+
+func assertItems(t *testing.T, l []lexer.Item, expected []string) {
+	if len(expected) != len(l) {
+		t.Fatalf("Unexpected number of lexer items: %d (expected: %d)", len(l), len(expected))
+	}
+	for i, e := range expected {
+		if l[i].String() != e {
+			t.Fatalf("Unexpected lexer item at index %d: %s (expected: %s)", i, l[i], e)
+		}
+	}
+}
diff --git a/lexer/states.go b/lexer/states.go
new file mode 100644
index 0000000..be877d4
--- /dev/null
+++ b/lexer/states.go
@@ -0,0 +1,219 @@
+package lexer
+
+// stateFn represents the state of the scanner as a function
+// that returns the next state.
+type stateFn func(*Lexer) stateFn
+
+// Sets of runes, as used by the accept/skip lexer primitives.
+const (
+	whitespace     string = " \t"
+	newline        string = "\r\n"
+	startOfComment string = "#"
+	equal          string = "="
+	lower          string = "abcdefghijklmnopqrstuvwxyz"
+	upper          string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	digits         string = "0123456789"
+	dot            string = "."
+	underscore     string = "_"
+	dash           string = "-"
+	singleQuote    string = "'"
+	doubleQuote    string = "\""
+	backslash      string = "\\"
+	someQuote      string = singleQuote + doubleQuote
+	singleQuote3   string = singleQuote + singleQuote + singleQuote
+	doubleQuote3   string = doubleQuote + doubleQuote + doubleQuote
+	bareKey        string = lower + upper + digits + underscore + dash
+	startOfKey     string = bareKey + someQuote
+)
+
+// stateKeyValuePair is the top-level state: skip blank space,
+// then branch off to a comment, a key/value pair or end of file.
+func stateKeyValuePair(l *Lexer) stateFn {
+	l.skip(whitespace + newline)
+	if l.upcoming(startOfComment) {
+		return stateComment
+	}
+	if l.upcoming(startOfKey) {
+		return stateKey
+	}
+	return stateEndOfFile
+}
+
+// A hash symbol marks the rest of the line as a comment.
+func stateComment(l *Lexer) stateFn {
+	l.acceptUntil(newline)
+	l.emit(ItemComment, l.getAcceptedString())
+	l.skip(newline)
+	return stateKeyValuePair
+}
+
+// A key may be either bare, quoted or dotted.
+func stateKey(l *Lexer) stateFn {
+	if l.upcoming(bareKey) {
+		return stateBareKey
+	}
+	// NOTE(review): quoted keys are mentioned above, but not handled
+	// yet (work in progress); they currently end up in this error.
+	return l.unexpectedTokenError("a valid key name")
+}
+
+// Bare keys may only contain ASCII letters, ASCII digits,
+// underscores, and dashes (A-Za-z0-9_-). Note that bare
+// keys are allowed to be composed of only ASCII digits,
+// e.g. 1234, but are always interpreted as strings.
+func stateBareKey(l *Lexer) stateFn {
+	l.acceptWhile(bareKey)
+	l.emit(ItemKey, l.getAcceptedString())
+	return stateEndOfKeyOrKeyDot
+}
+
+// Dotted keys are a sequence of bare or quoted keys joined with a dot.
+// This allows for grouping similar properties together:
+func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
+	// Whitespace around dot-separated parts is ignored, however,
+	// best practice is to not use any extraneous whitespace.
+	l.skip(whitespace)
+	if l.accept(dot) {
+		l.emit(ItemKeyDot, ".")
+		l.skip(whitespace)
+		return stateKey
+	}
+	return stateKeyAssignment
+}
+
+// Keys are on the left of the equals sign and values are on the right.
+// Whitespace is ignored around key names and values. The key, equals
+// sign, and value must be on the same line (though some values can
+// be broken over multiple lines).
+func stateKeyAssignment(l *Lexer) stateFn {
+	l.skip(whitespace)
+	if l.accept(equal) {
+		l.emit(ItemKeyValueAssignment, "=")
+		l.skip(whitespace)
+		return stateValue
+	}
+	return l.unexpectedTokenError("an '=' value assignment")
+}
+
+// stateValue branches off to the state for the type of value found
+// after the assignment; only quoted strings are supported so far.
+func stateValue(l *Lexer) stateFn {
+	l.skip(whitespace)
+	if l.upcoming(someQuote) {
+		return stateStringValue
+	}
+	return l.unexpectedTokenError("a value")
+}
+
+// There are four ways to express strings: basic, multi-line basic, literal,
+// and multi-line literal. All strings must contain only valid UTF-8 characters.
+func stateStringValue(l *Lexer) stateFn {
+	if l.accept(doubleQuote) {
+		return stateBasicStringValue
+	}
+	// NOTE(review): literal (single quoted) strings are matched by
+	// stateValue's someQuote check, but not implemented yet.
+	return l.unexpectedTokenError("a string value")
+}
+
+// stateBasicStringValue disambiguates between the three forms that
+// start with a double quote: "" (empty), """...""" (multi-line) and
+// "..." (normal basic string). The opening quote was already consumed.
+func stateBasicStringValue(l *Lexer) stateFn {
+	// Possibly a """ multi-line string start,
+	// possibly the end of an "" empty string.
+	if l.accept(doubleQuote) {
+		// A """ multi-line string.
+		if l.accept(doubleQuote) {
+			l.ignore()
+			return stateMultiLineBasicString
+		}
+		// An "" empty string.
+		l.ignore()
+		l.emit(ItemStringValue, "")
+		return stateKeyValuePair
+	}
+	l.ignore()
+	return stateBasicString
+}
+
+// Basic strings are surrounded by quotation marks. Any Unicode character
+// may be used except those that must be escaped: quotation mark, backslash,
+// and the control characters (U+0000 to U+001F, U+007F).
+//
+// For convenience, some popular characters have a compact escape sequence.
+//
+// \b         - backspace       (U+0008)
+// \t         - tab             (U+0009)
+// \n         - linefeed        (U+000A)
+// \f         - form feed       (U+000C)
+// \r         - carriage return (U+000D)
+// \"         - quote           (U+0022)
+// \\         - backslash       (U+005C)
+// \uXXXX     - unicode         (U+XXXX)
+// \UXXXXXXXX - unicode         (U+XXXXXXXX)
+//
+// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
+// The escape codes must be valid Unicode scalar values.
+//
+// All other escape sequences not listed above are reserved and,
+// if used, TOML should produce an error.
+ +var basicEscapes = map[rune]rune{ + 'b': rune(8), + 't': rune(9), + 'n': rune(10), + 'f': rune(12), + 'r': rune(13), + '"': rune(34), + '\\': rune(92), +} + +func stateParseBasicString(l *Lexer) stateFn { + for { + switch { + case l.upcoming(endOfFile): + l.unexpectedTokenError("basic string token") + case l.upcoming(doubleQuote): + return l.popState() + case l.accept(backslash): + r := l.next() + if escaped, ok := basicEscapes[r]; ok { + l.addToString(escaped) + } else { + return l.errorf("Invalid escape sequence \\%c in string value", r) + } + default: + l.addToString(l.next()) + } + } +} + +func stateBasicString(l *Lexer) stateFn { + l.newString() + l.pushState(stateBasicStringEnd) + return stateParseBasicString + +parsing: + for { + r := l.next() + if r == endOfFile { + break + } + if r == '"' { + l.emit(ItemStringValue, l.getString()) + return stateKeyValuePair + } + if r == '\\' { + r = l.next() + if escaped, ok := basicEscapes[r]; ok { + l.addToString(escaped) + continue parsing + } + return l.errorf("Invalid escape sequence \\%c in string value", r) + } + l.addToString(r) + } + return l.unexpectedTokenError("valid basic string rune") +} + +func stateMultiLineBasicString(l *Lexer) stateFn { + return l.errorf("Not yet implemented") +} + +func stateEndOfFile(l *Lexer) stateFn { + i := l.peek() + if i == endOfFile { + l.emit(ItemEOF, "EOF") + return nil + } + return l.unexpectedTokenError("end of file") +}