Now that the parser code is out of the way, we can split the state functions that describe the TOML syntax into separate files, while keeping the lexer maintainable.

Maurice Makaay 2019-05-17 14:13:25 +00:00
parent db4a8f7942
commit 9f19add210
9 changed files with 297 additions and 246 deletions

lexer/comments.go (new file, +21)

@@ -0,0 +1,21 @@
package lexer
import "github.com/mmakaay/toml/parser"
// A '#' hash symbol marks the rest of the line as a comment.
func stateCommentStart(p *parser.Parser) parser.StateFn {
p.SkipConsecutive(hash)
return stateCommentContent
}
// All characters up to the end of the line are included in the comment.
func stateCommentContent(p *parser.Parser) parser.StateFn {
switch {
case p.AtEndOfLine():
p.EmitLiteralTrim(ItemComment)
return p.ToParentState()
default:
p.AcceptAny()
return stateCommentContent
}
}
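For reference, a few comment inputs together with the ItemComment values these two states produce for them, taken from the lexer test cases further down and written in the tests' #(...) notation (the Go wrapper is only for illustration):

package lexer_examples // illustrative scratch package, not part of the repository

// Comment inputs and the ItemComment values the states above emit for them.
var commentExamples = map[string]string{
	"#\t cow \t":      "#(cow)",        // surrounding whitespace is trimmed
	"# one \r\n#two":  "#(one)#(two)",  // one ItemComment per line
	"#### Just Jack!": "#(Just Jack!)", // consecutive hashes are skipped as one marker
}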

lexer/end_of_file.go (new file, +12)

@@ -0,0 +1,12 @@
package lexer
import "github.com/mmakaay/toml/parser"
func stateEndOfFile(l *parser.Parser) parser.StateFn {
if l.AtEndOfFile() {
l.Emit(parser.ItemEOF, "EOF") // TODO: automate this within the parser?
} else {
l.UnexpectedInputError("end of file")
}
return nil
}

(deleted file)

@@ -1,35 +0,0 @@
package lexer
import (
"fmt"
"github.com/mmakaay/toml/parser"
)
// Definition of all the lexer item types for the TOML lexer.
const (
ItemComment parser.ItemType = iota // An error occurred
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
// ParserItemToString returns a string representation of the
// parser.Item. This is used for unit testing purposes.
func ParserItemToString(i parser.Item) string {
switch i.Type {
case ItemComment:
return fmt.Sprintf("#(%s)", i.Value)
case ItemKey:
return fmt.Sprintf("[%s]", i.Value)
case ItemString:
return fmt.Sprintf("STR(%s)", i.Value)
case ItemKeyDot:
return "."
case ItemAssignment:
return "="
default:
panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
}
}

lexer/key_value_pairs.go (new file, +73)

@@ -0,0 +1,73 @@
package lexer
import "github.com/mmakaay/toml/parser"
// The primary building block of a TOML document is the key/value pair.
func stateKeyValuePair(l *parser.Parser) parser.StateFn {
switch {
case l.SkipConsecutive(whitespace + carriageReturn + newline):
return stateKeyValuePair
case l.Upcoming(hash):
return l.ToChildState(stateCommentStart)
case l.Upcoming(startOfKey):
return l.ToChildState(stateKey)
default:
return stateEndOfFile
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *parser.Parser) parser.StateFn {
if l.AcceptMatching(bareKeyChars) {
return statebareKeyChars
}
return l.UnexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(l *parser.Parser) parser.StateFn {
l.AcceptConsecutive(bareKeyChars)
l.EmitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.SkipConsecutive(whitespace)
if l.SkipMatching(dot) {
l.Emit(ItemKeyDot, "")
l.SkipConsecutive(whitespace)
return stateKey
}
return stateKeyAssignment
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.SkipMatching(equal) {
l.Emit(ItemAssignment, "")
l.SkipConsecutive(whitespace)
return stateValue
}
return l.UnexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.Upcoming(quoteChars) {
return stateStringValue
}
return l.UnexpectedInputError("a value")
}
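To make the flow through these states concrete, a small sketch of the item stream they are meant to produce for a dotted key assignment, written in the notation used by the lexer tests (the input line is illustrative, not taken from the repository):

package lexer_examples // illustrative scratch package, not part of the repository

// Flow: stateKeyValuePair -> stateKey -> statebareKeyChars
//   -> stateEndOfKeyOrKeyDot (emits ".") -> stateKey -> ...
//   -> stateKeyAssignment (emits "=") -> stateValue -> string states.
// The trailing EOF item is left out here.
const (
	dottedKeyInput = `fruit.flavor = "banana"`
	dottedKeyItems = `[fruit].[flavor]=STR(banana)`
)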

lexer/main.go (new file, +42)

@@ -0,0 +1,42 @@
package lexer
import "github.com/mmakaay/toml/parser"
// Definition of the item types that are emitted by this parser.
const (
ItemComment parser.ItemType = iota // A comment
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
// NewParser creates a new parser, using the provided input string
// as the data to parse.
func NewParser(input string) *parser.Parser {
return parser.New(input, stateKeyValuePair)
}
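A minimal usage sketch for NewParser. Reading the emitted items requires some accessor on the parser; the Items() call below is hypothetical and only stands in for whatever the parser package actually exposes:

package main

import (
	"fmt"

	"github.com/mmakaay/toml/lexer"
)

func main() {
	// NewParser wires stateKeyValuePair in as the start state; the
	// state functions in the other lexer files take over from there.
	p := lexer.NewParser(`answer = "42" # the answer`)

	// Items() is a hypothetical accessor for the emitted parser.Item
	// values, used here only to show the shape of the output.
	for _, item := range p.Items() {
		fmt.Printf("%d: %q\n", item.Type, item.Value)
	}
}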

(deleted file)

@@ -1,209 +0,0 @@
package lexer
import "github.com/mmakaay/toml/parser"
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
// NewParser creates a new parser, using the provided input string
// as the data to parse.
func NewParser(input string) *parser.Parser {
return parser.New(input, stateKeyValuePair)
}
func stateKeyValuePair(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace + carriageReturn + newline)
if l.SkipMatching(hash) {
return stateComment
}
if l.Upcoming(startOfKey) {
return stateKey
}
return stateEndOfFile
}
// A '#' hash symbol marks the rest of the line as a comment.
func stateComment(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile() || l.SkipMatching(newline):
l.EmitLiteralTrim(ItemComment)
return stateKeyValuePair
default:
if !l.AcceptAny() {
return nil
}
}
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *parser.Parser) parser.StateFn {
if l.AcceptMatching(bareKeyChars) {
return statebareKeyChars
}
return l.UnexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(l *parser.Parser) parser.StateFn {
l.AcceptConsecutive(bareKeyChars)
l.EmitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.SkipConsecutive(whitespace)
if l.SkipMatching(dot) {
l.Emit(ItemKeyDot, "")
l.SkipConsecutive(whitespace)
return stateKey
}
return stateKeyAssignment
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.SkipMatching(equal) {
l.Emit(ItemAssignment, "")
l.SkipConsecutive(whitespace)
return stateValue
}
return l.UnexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.Upcoming(quoteChars) {
return stateStringValue
}
return l.UnexpectedInputError("a value")
}
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *parser.Parser) parser.StateFn {
switch {
case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.SkipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
return stateSingleLineBasicString
}
return l.UnexpectedInputError("a string value")
}
func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
if l.Upcoming(doubleQuote, doubleQuote) {
return stateMultiLineBasicString
}
return stateBasicString
}
func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
l.EmitError("Not yet implemented")
return nil
}
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
const invalidBasicStringCharacters string = "\"\\" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile():
return l.UnexpectedEndOfFile("basic string token")
case l.SkipMatching(doubleQuote):
return l.PopState()
case l.AcceptMatching(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \uXXXX - unicode (U+XXXX)
case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
case l.Upcoming(backslash):
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.EmitError("Invalid escape sequence in basic string")
case l.Upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
r, _, _ := l.Match(invalidBasicStringCharacters)
l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
return nil
default:
if !l.AcceptAny() {
return nil
}
}
}
}
func stateBasicString(l *parser.Parser) parser.StateFn {
l.PushState(func(l *parser.Parser) parser.StateFn {
err := l.EmitInterpreted(ItemString)
if err != nil {
l.EmitError("Invalid data in string: %s", err)
return nil
}
return stateKeyValuePair
})
return stateParseBasicString
}
func stateEndOfFile(l *parser.Parser) parser.StateFn {
if l.AtEndOfFile() {
l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser?
} else {
l.UnexpectedInputError("end of file")
}
return nil
}

(modified file, lexer tests)

@@ -6,6 +6,7 @@ import (
 	"testing"
 	"github.com/mmakaay/toml/lexer"
+	"github.com/mmakaay/toml/parser"
 )
 func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
@@ -52,6 +53,8 @@ func TestComments(t *testing.T) {
 		{"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""},
 		{"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""},
 		{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""},
+		{"comment with multiple hashes", `#### Just Jack!`, `#(Just Jack!)`, ""},
+		{"comment with hashes inside", `# Follow #me2`, `#(Follow #me2)`, ""},
 		{"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""},
 	})
 }
@@ -178,7 +181,7 @@ func runStatesT(t *testing.T, c statesT) {
 		t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
 	}
 	for i, e := range expected {
-		v := lexer.ParserItemToString(l[i])
+		v := ParserItemToString(l[i])
 		if v != e {
 			t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
 		}
@@ -186,7 +189,7 @@ func runStatesT(t *testing.T, c statesT) {
 	case string:
 		a := make([]string, len(l))
 		for _, v := range l {
-			a = append(a, lexer.ParserItemToString(v))
+			a = append(a, ParserItemToString(v))
 		}
 		actual := strings.Join(a, "")
 		if actual != expected {
@@ -194,3 +197,22 @@ func runStatesT(t *testing.T, c statesT) {
 		}
 	}
 }
+
+// ParserItemToString returns a string representation of the
+// parser.Item. This is used for unit testing purposes.
+func ParserItemToString(i parser.Item) string {
+	switch i.Type {
+	case lexer.ItemComment:
+		return fmt.Sprintf("#(%s)", i.Value)
+	case lexer.ItemKey:
+		return fmt.Sprintf("[%s]", i.Value)
+	case lexer.ItemString:
+		return fmt.Sprintf("STR(%s)", i.Value)
+	case lexer.ItemKeyDot:
+		return "."
+	case lexer.ItemAssignment:
+		return "="
+	default:
+		panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
+	}
+}

lexer/strings.go (new file, +88)

@@ -0,0 +1,88 @@
package lexer
import "github.com/mmakaay/toml/parser"
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *parser.Parser) parser.StateFn {
switch {
case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.SkipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
return stateSingleLineBasicString
}
return l.UnexpectedInputError("a string value")
}
func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
if l.Upcoming(doubleQuote, doubleQuote) {
return stateMultiLineBasicString
}
return stateBasicString
}
func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
l.EmitError("Not yet implemented")
return nil
}
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
const invalidBasicStringCharacters string = "\"\\" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile():
return l.UnexpectedEndOfFile("basic string token")
case l.SkipMatching(doubleQuote):
return l.PopState()
case l.AcceptMatching(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \uXXXX - unicode (U+XXXX)
case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
case l.Upcoming(backslash):
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.EmitError("Invalid escape sequence in basic string")
case l.Upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
r, _, _ := l.Match(invalidBasicStringCharacters)
l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
return nil
default:
if !l.AcceptAny() {
return nil
}
}
}
}
func stateBasicString(l *parser.Parser) parser.StateFn {
l.PushState(func(l *parser.Parser) parser.StateFn {
err := l.EmitInterpreted(ItemString)
if err != nil {
l.EmitError("Invalid data in string: %s", err)
return nil
}
return stateKeyValuePair
})
return stateParseBasicString
}
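For quick reference, the escape forms that stateParseBasicString accepts, collected from the escapeChars, shortUtf8Escape and longUtf8Escape sets above (the concrete code points in the examples are illustrative):

package lexer_examples // illustrative scratch package, not part of the repository

// Backslash sequences accepted inside a basic string; anything else after
// a backslash triggers "Invalid escape sequence in basic string".
var acceptedEscapes = []string{
	`\b`, `\t`, `\n`, `\f`, `\r`, `\"`, `\\`, // compact escapes (escapeChars)
	`\u00E9`,     // \uXXXX     - exactly 4 hex digits (shortUtf8Escape)
	`\U0001F600`, // \UXXXXXXXX - exactly 8 hex digits (longUtf8Escape)
}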

(modified file, parser package)

@@ -17,6 +17,16 @@ func New(input string, startState StateFn) *Parser {
 	}
 }
 
+func (p *Parser) ToChildState(state StateFn) StateFn {
+	p.PushState(p.state)
+	return state
+}
+
+func (p *Parser) ToParentState() StateFn {
+	state := p.PopState()
+	return state
+}
+
 // PushState adds the state function to the state stack.
 // This is used for implementing nested parsing.
 func (l *Parser) PushState(state StateFn) {
@@ -36,6 +46,33 @@ func (l *Parser) AtEndOfFile() bool {
 	return l.pos >= l.len
 }
 
+func (p *Parser) AtEndOfLine() bool {
+	return p.AtEndOfFile() ||
+		p.Upcoming("\r", "\n") ||
+		p.Upcoming("\n")
+}
+
+func (p *Parser) SkipEndOfLine() bool {
+	return p.AtEndOfFile() ||
+		p.SkipMatching("\r", "\n") ||
+		p.SkipMatching("\n")
+}
+
+func (p *Parser) AcceptEndOfLine() bool {
+	// No newline, but we're definitely at the end of the line here.
+	if p.AtEndOfFile() {
+		return true
+	}
+	// If we see some kind of end of line, then we accept a
+	// normalized newline, which is just a '\n'. This will normalize
+	// '\r\n' into '\n'.
+	if p.SkipEndOfLine() {
+		p.buffer.WriteRune('\n')
+		return true
+	}
+	return false
+}
+
 // Emit passes a Parser item to the client, including the provided string.
 func (l *Parser) Emit(t ItemType, s string) {
 	l.items <- Item{t, s}
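Conceptually, the new ToChildState and ToParentState helpers are thin wrappers around the existing state stack. A stripped-down stand-in (not the actual parser code) showing the mechanism that the comment and key/value states rely on:

// Package statestack is an illustrative stand-in for the parser
// package, reduced to the ToChildState / ToParentState mechanics.
package statestack

// StateFn mirrors the parser package's state-function signature.
type StateFn func(*Machine) StateFn

// Machine stands in for parser.Parser; only the state stack is shown.
type Machine struct {
	state StateFn   // the state function currently being executed
	stack []StateFn // parent states to return to later
}

// ToChildState remembers the current state and hands control to a
// child state, which later returns via ToParentState.
func (m *Machine) ToChildState(child StateFn) StateFn {
	m.stack = append(m.stack, m.state)
	return child
}

// ToParentState pops and resumes the most recently pushed parent state.
// It panics if no parent state was pushed, which is fine for a sketch.
func (m *Machine) ToParentState() StateFn {
	last := len(m.stack) - 1
	parent := m.stack[last]
	m.stack = m.stack[:last]
	return parent
}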