From f6efd34b31e7cd0606e4db7fc8348dbd4fa67773 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 15 May 2019 09:00:35 +0000
Subject: [PATCH] Initial import, work in progress.

---
 go.mod              |   3 +
 lexer/items.go      |  54 ++++++++++
 lexer/lexer.go      | 244 ++++++++++++++++++++++++++++++++++++++++++++
 lexer/lexer_test.go | 164 +++++++++++++++++++++++++++++
 lexer/states.go     | 219 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 684 insertions(+)
 create mode 100644 go.mod
 create mode 100644 lexer/items.go
 create mode 100644 lexer/lexer.go
 create mode 100644 lexer/lexer_test.go
 create mode 100644 lexer/states.go

diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..bb4a415
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/mmakaay/toml
+
+go 1.12
diff --git a/lexer/items.go b/lexer/items.go
new file mode 100644
index 0000000..d7164d9
--- /dev/null
+++ b/lexer/items.go
@@ -0,0 +1,54 @@
+package lexer
+
+import "fmt"
+
+// itemType represents the type of lexer items.
+type itemType int
+
+// Definition of all the lexer item types for the TOML lexer.
+const (
+	ItemError              itemType = iota // An error occurred
+	ItemEOF                                // End of input reached
+	ItemComment                            // Comment string, starts with # till end of line
+	ItemKey                                // Key of a key/value pair
+	ItemKeyDot                             // Dot for a dotted key
+	ItemKeyValueAssignment                 // Equal sign for a key/value pair assignment
+	ItemStringValue                        // A value of type string
+)
+
+// Item represents a lexer item returned from the scanner.
+type Item struct {
+	Type  itemType // Type of the item, e.g. ItemComment, ItemKey
+	Value string   // Value of the item, e.g. "10.42", "["
+}
+
+// String returns a string representation of the lexer item.
+func (i Item) String() string {
+	switch i.Type {
+	case ItemEOF:
+		return "EOF"
+	case ItemError:
+		return "Error: " + i.Value
+	}
+	return fmt.Sprintf("%s(%q)", i.Type, i.Value)
+}
+
+// String returns a string representation of the lexer item type.
+func (i itemType) String() string {
+	switch i {
+	case ItemError:
+		return "Error"
+	case ItemComment:
+		return "Comment"
+	case ItemKey:
+		return "Key"
+	case ItemKeyDot:
+		return "KeyDot"
+	case ItemKeyValueAssignment:
+		return "Assignment"
+	case ItemStringValue:
+		return "StringValue"
+	default:
+		// Fallback for item types without an explicit name; the original
+		// fmt.Sprintf("", i) formatted nothing and dropped the argument
+		// (a go vet error), making unknown types indistinguishable.
+		return fmt.Sprintf("<unknown itemType %d>", int(i))
+	}
+}
diff --git a/lexer/lexer.go b/lexer/lexer.go
new file mode 100644
index 0000000..0f39ba5
--- /dev/null
+++ b/lexer/lexer.go
@@ -0,0 +1,244 @@
+package lexer
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+	"unicode/utf8"
+)
+
+// Lexer holds the state of the scanner.
+type Lexer struct {
+	input    string          // the scanned input string
+	state    stateFn         // the current state
+	stack    []stateFn       // state stack, for nested parsing
+	start    int             // start position of the currently scanned item
+	pos      int             // current scanning position in the input
+	width    int             // width of the last rune read
+	strValue strings.Builder // used to build string values
+	items    chan Item       // channel of scanned items
+	nextItem Item            // the current item as reached by Next() and retrieved by Get()
+	err      error           // an error message when lexing failed, retrieved by Error()
+}
+
+// Lex takes an input string and initializes the TOML lexer for it.
+// Usage:
+//
+//     l := lexer.Lex("...inputstring...")
+//     for l.Next() {
+//         item := l.Get()
+//         ... handle item ...
+//     }
+//     if e := l.Error(); e != nil {
+//         ... handle error message ...
+//     }
+func Lex(input string) *Lexer {
+	return &Lexer{
+		input: input,
+		state: stateKeyValuePair,
+		items: make(chan Item, 2),
+	}
+}
+
+// Next advances to the next lexer item in the input string.
+// When a next item was found, then true is returned.
+// On error or reaching the end of the input, false is returned.
+func (l *Lexer) Next() bool {
+	if l.state == nil {
+		panic("This should not happen: nil state reached, but entering Next()")
+	}
+	for {
+		select {
+		case i := <-l.items:
+			if i.Type == ItemEOF {
+				return false
+			}
+			if i.Type == ItemError {
+				l.err = errors.New(i.Value)
+				return false
+			}
+			l.nextItem = i
+			return true
+		default:
+			// No item buffered yet: run the state machine one step more.
+			l.state = l.state(l)
+		}
+	}
+}
+
+// Error returns the error that the lexer ran into, or nil when
+// no error occurred during scanning.
+func (l *Lexer) Error() error {
+	return l.err
+}
+
+// ToArray returns lexer items as an array.
+// When an error occurs during scanning, a partial result will be
+// returned, accompanied by the error that occurred.
+func (l *Lexer) ToArray() ([]Item, error) {
+	var items []Item
+	for l.Next() {
+		items = append(items, l.Get())
+	}
+	return items, l.Error()
+}
+
+// Get returns the next lexer item, as reached by Next()
+func (l *Lexer) Get() Item {
+	return l.nextItem
+}
+
+// pushState adds the state function to its stack.
+// This is used for implementing nested parsing.
+func (l *Lexer) pushState(state stateFn) {
+	l.stack = append(l.stack, state)
+}
+
+// popState pops the last pushed state from its stack.
+// NOTE(review): panics (index out of range) on an empty stack; callers
+// must balance every popState with an earlier pushState.
+func (l *Lexer) popState() stateFn {
+	last := len(l.stack) - 1
+	head, tail := l.stack[:last], l.stack[last]
+	l.stack = head
+	return tail
+}
+
+// getAcceptedString returns the string as accepted by the
+// accept* methods so far.
+func (l *Lexer) getAcceptedString() string {
+	return l.input[l.start:l.pos]
+}
+
+// emit passes a scanned item back to the client.
+func (l *Lexer) emit(t itemType, v string) {
+	l.items <- Item{t, v}
+	l.start = l.pos
+}
+
+// ignore skips over the pending input before the current position.
+func (l *Lexer) ignore() {
+	l.start = l.pos
+}
+
+// backup steps back one rune.
+// Can be called only once per call of next.
+func (l *Lexer) backup() {
+	l.pos -= l.width
+}
+
+// peek returns but does not advance to the next rune(s) in the input.
+func (l *Lexer) peek() rune {
+	r := l.next()
+	l.backup()
+	return r
+}
+
+// accept consumes the next rune if it's from the valid set of runes.
+func (l *Lexer) accept(runes string) bool {
+	if strings.IndexRune(runes, l.next()) >= 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+// upcoming tells whether the next rune is from the set of runes,
+// without consuming any input.
+func (l *Lexer) upcoming(runes string) bool {
+	if l.accept(runes) {
+		l.backup()
+		return true
+	}
+	return false
+}
+
+// acceptNot consumes the next rune if it's not from the set of runes.
+func (l *Lexer) acceptNot(runes string) bool {
+	r := l.next()
+	if r == endOfFile {
+		l.backup()
+		return false
+	}
+	if strings.IndexRune(runes, r) < 0 {
+		return true
+	}
+	l.backup()
+	return false
+}
+
+// acceptUntil consumes a run of runes until ones from the
+// valid set is encountered.
+func (l *Lexer) acceptUntil(runes string) bool {
+	accepted := false
+	for l.acceptNot(runes) {
+		accepted = true
+	}
+	return accepted
+}
+
+// acceptWhile consumes a run of runes from the set of accepted runes.
+func (l *Lexer) acceptWhile(runes string) bool {
+	accepted := false
+	for l.accept(runes) {
+		accepted = true
+	}
+	return accepted
+}
+
+// skip skips a run of runes from the set of accepted runes.
+func (l *Lexer) skip(runes string) {
+	if l.acceptWhile(runes) {
+		l.ignore()
+	}
+}
+
+// skipUntil skips a run of runes, until a rune from the set of
+// runes or EOF is reached.
+func (l *Lexer) skipUntil(runes string) {
+	if l.acceptUntil(runes) {
+		l.ignore()
+	}
+}
+
+// newString resets the string builder, to start building a new string value.
+func (l *Lexer) newString() {
+	l.strValue.Reset()
+}
+
+// addToString adds a rune to the string value that is being built.
+func (l *Lexer) addToString(r rune) {
+	l.strValue.WriteRune(r)
+}
+
+// getString returns the string value as built up to this point.
+func (l *Lexer) getString() string {
+	return l.strValue.String()
+}
+
+// endOfFile is the sentinel rune that next() returns when the input is exhausted.
+var endOfFile rune = -1
+
+// next returns the next rune in the input.
+func (l *Lexer) next() rune {
+	if l.pos >= len(l.input) {
+		l.width = 0
+		return endOfFile
+	}
+	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
+	l.width = w
+	l.pos += w
+	return r
+}
+
+// errorf emits an error item and terminates the scan by
+// returning a nil state function.
+func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
+	l.items <- Item{
+		ItemError,
+		fmt.Sprintf(format, args...),
+	}
+	return nil
+}
+
+// unexpectedTokenError emits an error item that describes what token
+// was found at the current input position, and what was expected instead.
+func (l *Lexer) unexpectedTokenError(expected string) stateFn {
+	var actual string
+	switch {
+	case l.peek() == endOfFile:
+		actual = "end of file"
+	case !utf8.ValidString(l.input[l.start:]):
+		actual = "non-UTF8 data"
+	default:
+		actual = fmt.Sprintf("token '%c'", l.peek())
+	}
+	return l.errorf("Unexpected %s (expected %s)", actual, expected)
+}
diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go
new file mode 100644
index 0000000..dbf2b26
--- /dev/null
+++ b/lexer/lexer_test.go
@@ -0,0 +1,164 @@
+package lexer_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/mmakaay/toml/lexer"
+)
+
+func TestInvalidUtf8Data(t *testing.T) {
+	assertFailureAndCheck(t, "\xbc", []string{}, "Unexpected non-UTF8 data (expected end of file)")
+}
+
+func TestEmptyInput(t *testing.T) {
+	assertSuccessAndCheck(t, "", []string{})
+}
+func TestWhiteSpace(t *testing.T) {
+	assertSuccessAndCheck(t, " ", []string{})
+	assertSuccessAndCheck(t, "\t", []string{})
+	assertSuccessAndCheck(t, " \t \t ", []string{})
+}
+func TestWhiteSpaceAndNewlines(t *testing.T) {
+	assertSuccessAndCheck(t, "\n", []string{})
+	assertSuccessAndCheck(t, "\n \t\r\n", []string{})
+}
+func TestWhitespacePlusComment(t *testing.T) {
+	assertSuccessAndCheck(t, "#", []string{`Comment("#")`})
+	assertSuccessAndCheck(t, " \t \t #", []string{`Comment("#")`})
+	assertSuccessAndCheck(t, " \t \t # not empty", []string{`Comment("# not empty")`})
+	assertSuccessAndCheck(t, " \t \t # not empty\r\r\r\n", []string{`Comment("# not empty")`})
+	assertSuccessAndCheck(t, "\n \t\r\n# AAP\r\n",
+		[]string{`Comment("# AAP")`})
+	assertSuccessAndCheck(t, "# two lines\n# of comments\n",
+		[]string{`Comment("# two lines")`, `Comment("# of comments")`})
+}
+
+func TestBareKeyWithoutValue(t *testing.T) {
+	err := "Unexpected end of file (expected an '=' value assignment)"
+	// BUGFIX(review): the first input here was "=", which can never yield
+	// Key("a") nor this error message; the single-letter key "a" was meant,
+	// matching the " a" / " a " cases below.
+	assertFailureAndCheck(t, "a", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, " a", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, " a ", []string{`Key("a")`}, err)
+	assertFailureAndCheck(t, "ab", []string{`Key("ab")`}, err)
+	assertFailureAndCheck(t, "Ab", []string{`Key("Ab")`}, err)
+	assertFailureAndCheck(t, "Ab1", []string{`Key("Ab1")`}, err)
+	assertFailureAndCheck(t, "_Ab1", []string{`Key("_Ab1")`}, err)
+	assertFailureAndCheck(t, "_-Ab1", []string{`Key("_-Ab1")`}, err)
+	assertFailureAndCheck(t, "_-Ab1_this-is_GOOD987", []string{`Key("_-Ab1_this-is_GOOD987")`}, err)
+}
+
+func TestDottedKey(t *testing.T) {
+	err := "Unexpected end of file (expected an '=' value assignment)"
+	assertFailureAndCheck(t, "a.b", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
+	assertFailureAndCheck(t, " a .\t\t b\t ", []string{`Key("a")`, `KeyDot(".")`, `Key("b")`}, err)
+}
+
+func TestKeyWithAssignmentButNoValue(t *testing.T) {
+	err := "Unexpected end of file (expected a value)"
+	assertFailureAndCheck(t, " some_cool_key = ", []string{`Key("some_cool_key")`, `Assignment("=")`}, err)
+}
+
+func TestEmptyBasicStringValue(t *testing.T) {
+	assertSuccessAndCheck(t, `a=""`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`})
+	assertSuccessAndCheck(t, `a=""#hi`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`, `Comment("#hi")`})
+	assertSuccessAndCheck(t, `a = ""`, []string{`Key("a")`, `Assignment("=")`, `StringValue("")`})
+	assertSuccessAndCheck(t, `a.b = ""`, []string{`Key("a")`, `KeyDot(".")`, `Key("b")`, `Assignment("=")`, `StringValue("")`})
+}
+func TestBasicStringValue(t *testing.T) {
+	assertSuccessAndCheck(t, `_ = "b"`,
+		[]string{
+			`Key("_")`,
+			`Assignment("=")`,
+			`StringValue("b")`})
+	assertSuccessAndCheck(t, `thing = "A cool ʎǝʞ" # huh, it's up-side down!!`,
+		[]string{
+			`Key("thing")`,
+			`Assignment("=")`,
+			`StringValue("A cool ʎǝʞ")`,
+			`Comment("# huh, it's up-side down!!")`})
+}
+
+func TestInvalidEscapeSequence(t *testing.T) {
+	assertFailure(t, `a="\x"`, `Invalid escape sequence \x in string value`)
+}
+func TestBasicStringValueEscapes(t *testing.T) {
+	for in, out := range map[string]string{
+		`\b`:           "\b",
+		`\t`:           "\t",
+		`\n`:           "\n",
+		`\f`:           "\f",
+		`\r`:           "\r",
+		`\"`:           "\"",
+		`\b\t\n\f\r\"`: "\b\t\n\f\r\"",
+	} {
+		l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
+		s := l[2]
+		if out != s.Value {
+			t.Fatalf("Unexpected result when parsing '%s'", in)
+		}
+	}
+}
+
+// func TestBasicStringUnicodeEscapes(t *testing.T) {
+// 	for in, out := range map[string]string{
+// 		`\u`: "\b",
+// 	} {
+// 		l := assertSuccess(t, fmt.Sprintf(`x="%s"`, in))
+// 		s := l[2]
+// 		if out != s.Value {
+// 			t.Fatalf("Unexpected result when parsing '%s'", in)
+// 		}
+// 	}
+// }
+
+func TestTwoKeyValuePairs(t *testing.T) {
+	assertSuccessAndCheck(t, "a=\"Hello\" #comment1\nb=\"World!\"#comment2\r\n",
+		[]string{
+			`Key("a")`,
+			`Assignment("=")`,
+			`StringValue("Hello")`,
+			`Comment("#comment1")`,
+			`Key("b")`,
+			`Assignment("=")`,
+			`StringValue("World!")`,
+			`Comment("#comment2")`})
+}
+
+// assertSuccessAndCheck lexes the input, requires success and
+// compares the resulting items against the expected ones.
+func assertSuccessAndCheck(t *testing.T, input string, expected []string) {
+	l := assertSuccess(t, input)
+	assertItems(t, l, expected)
+}
+
+// assertFailureAndCheck lexes the input, requires the given error and
+// compares the partial result against the expected items.
+func assertFailureAndCheck(t *testing.T, input string, expected []string, expectedErr string) {
+	l := assertFailure(t, input, expectedErr)
+	assertItems(t, l, expected)
+}
+
+func assertFailure(t *testing.T, input string, expectedErr string) []lexer.Item {
+	l, err := lexer.Lex(input).ToArray()
+	if err == nil {
+		t.Fatalf("Expected lexer error '%s', but no error occurred", expectedErr)
+	}
+	if err.Error() != expectedErr {
+		t.Fatalf("Mismatch between expected and actual error:\nExpected: %s\nActual: %s\n", expectedErr, err)
+	}
+	return l
+}
+
+func assertSuccess(t *testing.T, input string) []lexer.Item {
+	l, err := lexer.Lex(input).ToArray()
+	if err != nil {
+		t.Fatalf("Unexpected lexer error: %s", err)
+	}
+	return l
+}
+
+func assertItems(t *testing.T, l []lexer.Item, expected []string) {
+	if len(expected) != len(l) {
+		t.Fatalf("Unexpected number of lexer items: %d (expected: %d)", len(l), len(expected))
+	}
+	for i, e := range expected {
+		if l[i].String() != e {
+			t.Fatalf("Unexpected lexer item at index %d: %s (expected: %s)", i, l[i], e)
+		}
+	}
+}
diff --git a/lexer/states.go b/lexer/states.go
new file mode 100644
index 0000000..be877d4
--- /dev/null
+++ b/lexer/states.go
@@ -0,0 +1,219 @@
+package lexer
+
+// stateFn represents the state of the scanner as a function
+// that returns the next state.
+type stateFn func(*Lexer) stateFn
+
+// Sets of runes, as used by the accept/skip lexer primitives.
+const (
+	whitespace     string = " \t"
+	newline        string = "\r\n"
+	startOfComment string = "#"
+	equal          string = "="
+	lower          string = "abcdefghijklmnopqrstuvwxyz"
+	upper          string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	digits         string = "0123456789"
+	dot            string = "."
+	underscore     string = "_"
+	dash           string = "-"
+	singleQuote    string = "'"
+	doubleQuote    string = "\""
+	backslash      string = "\\"
+	someQuote      string = singleQuote + doubleQuote
+	singleQuote3   string = singleQuote + singleQuote + singleQuote
+	doubleQuote3   string = doubleQuote + doubleQuote + doubleQuote
+	bareKey        string = lower + upper + digits + underscore + dash
+	startOfKey     string = bareKey + someQuote
+)
+
+// stateKeyValuePair is the top-level state: skip blank space,
+// then branch off to a comment, a key/value pair or end of file.
+func stateKeyValuePair(l *Lexer) stateFn {
+	l.skip(whitespace + newline)
+	if l.upcoming(startOfComment) {
+		return stateComment
+	}
+	if l.upcoming(startOfKey) {
+		return stateKey
+	}
+	return stateEndOfFile
+}
+
+// A hash symbol marks the rest of the line as a comment.
+func stateComment(l *Lexer) stateFn {
+	l.acceptUntil(newline)
+	l.emit(ItemComment, l.getAcceptedString())
+	l.skip(newline)
+	return stateKeyValuePair
+}
+
+// A key may be either bare, quoted or dotted.
+func stateKey(l *Lexer) stateFn {
+	if l.upcoming(bareKey) {
+		return stateBareKey
+	}
+	// NOTE(review): quoted keys are mentioned above, but not handled
+	// yet (work in progress); they currently end up in this error.
+	return l.unexpectedTokenError("a valid key name")
+}
+
+// Bare keys may only contain ASCII letters, ASCII digits,
+// underscores, and dashes (A-Za-z0-9_-). Note that bare
+// keys are allowed to be composed of only ASCII digits,
+// e.g. 1234, but are always interpreted as strings.
+func stateBareKey(l *Lexer) stateFn {
+	l.acceptWhile(bareKey)
+	l.emit(ItemKey, l.getAcceptedString())
+	return stateEndOfKeyOrKeyDot
+}
+
+// Dotted keys are a sequence of bare or quoted keys joined with a dot.
+// This allows for grouping similar properties together:
+func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
+	// Whitespace around dot-separated parts is ignored, however,
+	// best practice is to not use any extraneous whitespace.
+	l.skip(whitespace)
+	if l.accept(dot) {
+		l.emit(ItemKeyDot, ".")
+		l.skip(whitespace)
+		return stateKey
+	}
+	return stateKeyAssignment
+}
+
+// Keys are on the left of the equals sign and values are on the right.
+// Whitespace is ignored around key names and values. The key, equals
+// sign, and value must be on the same line (though some values can
+// be broken over multiple lines).
+func stateKeyAssignment(l *Lexer) stateFn {
+	l.skip(whitespace)
+	if l.accept(equal) {
+		l.emit(ItemKeyValueAssignment, "=")
+		l.skip(whitespace)
+		return stateValue
+	}
+	return l.unexpectedTokenError("an '=' value assignment")
+}
+
+// stateValue branches off to the state for the type of value found
+// after the assignment; only quoted strings are supported so far.
+func stateValue(l *Lexer) stateFn {
+	l.skip(whitespace)
+	if l.upcoming(someQuote) {
+		return stateStringValue
+	}
+	return l.unexpectedTokenError("a value")
+}
+
+// There are four ways to express strings: basic, multi-line basic, literal,
+// and multi-line literal. All strings must contain only valid UTF-8 characters.
+func stateStringValue(l *Lexer) stateFn {
+	if l.accept(doubleQuote) {
+		return stateBasicStringValue
+	}
+	// NOTE(review): literal (single quoted) strings are matched by
+	// stateValue's someQuote check, but not implemented yet.
+	return l.unexpectedTokenError("a string value")
+}
+
+// stateBasicStringValue disambiguates between the three forms that
+// start with a double quote: "" (empty), """...""" (multi-line) and
+// "..." (normal basic string). The opening quote was already consumed.
+func stateBasicStringValue(l *Lexer) stateFn {
+	// Possibly a """ multi-line string start,
+	// possibly the end of an "" empty string.
+	if l.accept(doubleQuote) {
+		// A """ multi-line string.
+		if l.accept(doubleQuote) {
+			l.ignore()
+			return stateMultiLineBasicString
+		}
+		// An "" empty string.
+		l.ignore()
+		l.emit(ItemStringValue, "")
+		return stateKeyValuePair
+	}
+	l.ignore()
+	return stateBasicString
+}
+
+// Basic strings are surrounded by quotation marks. Any Unicode character
+// may be used except those that must be escaped: quotation mark, backslash,
+// and the control characters (U+0000 to U+001F, U+007F).
+//
+// For convenience, some popular characters have a compact escape sequence.
+//
+// \b         - backspace       (U+0008)
+// \t         - tab             (U+0009)
+// \n         - linefeed        (U+000A)
+// \f         - form feed       (U+000C)
+// \r         - carriage return (U+000D)
+// \"         - quote           (U+0022)
+// \\         - backslash       (U+005C)
+// \uXXXX     - unicode         (U+XXXX)
+// \UXXXXXXXX - unicode         (U+XXXXXXXX)
+//
+// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
+// The escape codes must be valid Unicode scalar values.
+//
+// All other escape sequences not listed above are reserved and,
+// if used, TOML should produce an error.
+ +var basicEscapes = map[rune]rune{ + 'b': rune(8), + 't': rune(9), + 'n': rune(10), + 'f': rune(12), + 'r': rune(13), + '"': rune(34), + '\\': rune(92), +} + +func stateParseBasicString(l *Lexer) stateFn { + for { + switch { + case l.upcoming(endOfFile): + l.unexpectedTokenError("basic string token") + case l.upcoming(doubleQuote): + return l.popState() + case l.accept(backslash): + r := l.next() + if escaped, ok := basicEscapes[r]; ok { + l.addToString(escaped) + } else { + return l.errorf("Invalid escape sequence \\%c in string value", r) + } + default: + l.addToString(l.next()) + } + } +} + +func stateBasicString(l *Lexer) stateFn { + l.newString() + l.pushState(stateBasicStringEnd) + return stateParseBasicString + +parsing: + for { + r := l.next() + if r == endOfFile { + break + } + if r == '"' { + l.emit(ItemStringValue, l.getString()) + return stateKeyValuePair + } + if r == '\\' { + r = l.next() + if escaped, ok := basicEscapes[r]; ok { + l.addToString(escaped) + continue parsing + } + return l.errorf("Invalid escape sequence \\%c in string value", r) + } + l.addToString(r) + } + return l.unexpectedTokenError("valid basic string rune") +} + +func stateMultiLineBasicString(l *Lexer) stateFn { + return l.errorf("Not yet implemented") +} + +func stateEndOfFile(l *Lexer) stateFn { + i := l.peek() + if i == endOfFile { + l.emit(ItemEOF, "EOF") + return nil + } + return l.unexpectedTokenError("end of file") +}