Now that the parser code is out of the way, we can split the state functions that describe the TOML syntax into separate files, while keeping the lexer maintainable.

Maurice Makaay 2019-05-17 14:13:25 +00:00
parent db4a8f7942
commit 9f19add210
9 changed files with 297 additions and 246 deletions

lexer/comments.go (new file, +21)

@@ -0,0 +1,21 @@
package lexer
import "github.com/mmakaay/toml/parser"
// A '#' hash symbol marks the rest of the line as a comment.
func stateCommentStart(p *parser.Parser) parser.StateFn {
p.SkipConsecutive(hash)
return stateCommentContent
}
// All characters up to the end of the line are included in the comment.
func stateCommentContent(p *parser.Parser) parser.StateFn {
switch {
case p.AtEndOfLine():
p.EmitLiteralTrim(ItemComment)
return p.ToParentState()
default:
p.AcceptAny()
return stateCommentContent
}
}
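For reference, a few comment inputs together with the ItemComment values these two states produce for them, taken from the lexer test cases further down and written in the tests' #(...) notation (the Go wrapper is only for illustration):

package lexer_examples // illustrative scratch package, not part of the repository

// Comment inputs and the ItemComment values the states above emit for them.
var commentExamples = map[string]string{
	"#\t cow \t":      "#(cow)",        // surrounding whitespace is trimmed
	"# one \r\n#two":  "#(one)#(two)",  // one ItemComment per line
	"#### Just Jack!": "#(Just Jack!)", // consecutive hashes are skipped as one marker
}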

lexer/end_of_file.go (new file, +12)

@@ -0,0 +1,12 @@
package lexer
import "github.com/mmakaay/toml/parser"
func stateEndOfFile(l *parser.Parser) parser.StateFn {
if l.AtEndOfFile() {
l.Emit(parser.ItemEOF, "EOF") // TODO: automate this within the parser?
} else {
l.UnexpectedInputError("end of file")
}
return nil
}

(deleted file)

@@ -1,35 +0,0 @@
package lexer
import (
"fmt"
"github.com/mmakaay/toml/parser"
)
// Definition of all the lexer item types for the TOML lexer.
const (
ItemComment parser.ItemType = iota // An error occurred
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
// ParserItemToString returns a string representation of the
// parser.Item. This is used for unit testing purposes.
func ParserItemToString(i parser.Item) string {
switch i.Type {
case ItemComment:
return fmt.Sprintf("#(%s)", i.Value)
case ItemKey:
return fmt.Sprintf("[%s]", i.Value)
case ItemString:
return fmt.Sprintf("STR(%s)", i.Value)
case ItemKeyDot:
return "."
case ItemAssignment:
return "="
default:
panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
}
}

lexer/key_value_pairs.go (new file, +73)

@@ -0,0 +1,73 @@
package lexer
import "github.com/mmakaay/toml/parser"
// The primary building block of a TOML document is the key/value pair.
func stateKeyValuePair(l *parser.Parser) parser.StateFn {
switch {
case l.SkipConsecutive(whitespace + carriageReturn + newline):
return stateKeyValuePair
case l.Upcoming(hash):
return l.ToChildState(stateCommentStart)
case l.Upcoming(startOfKey):
return l.ToChildState(stateKey)
default:
return stateEndOfFile
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *parser.Parser) parser.StateFn {
if l.AcceptMatching(bareKeyChars) {
return statebareKeyChars
}
return l.UnexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(l *parser.Parser) parser.StateFn {
l.AcceptConsecutive(bareKeyChars)
l.EmitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.SkipConsecutive(whitespace)
if l.SkipMatching(dot) {
l.Emit(ItemKeyDot, "")
l.SkipConsecutive(whitespace)
return stateKey
}
return stateKeyAssignment
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.SkipMatching(equal) {
l.Emit(ItemAssignment, "")
l.SkipConsecutive(whitespace)
return stateValue
}
return l.UnexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.Upcoming(quoteChars) {
return stateStringValue
}
return l.UnexpectedInputError("a value")
}
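To make the flow through these states concrete, a small sketch of the item stream they are meant to produce for a dotted key assignment, written in the notation used by the lexer tests (the input line is illustrative, not taken from the repository):

package lexer_examples // illustrative scratch package, not part of the repository

// Flow: stateKeyValuePair -> stateKey -> statebareKeyChars
//   -> stateEndOfKeyOrKeyDot (emits ".") -> stateKey -> ...
//   -> stateKeyAssignment (emits "=") -> stateValue -> string states.
// The trailing EOF item is left out here.
const (
	dottedKeyInput = `fruit.flavor = "banana"`
	dottedKeyItems = `[fruit].[flavor]=STR(banana)`
)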

lexer/main.go (new file, +42)

@@ -0,0 +1,42 @@
package lexer
import "github.com/mmakaay/toml/parser"
// Definition of the item types that are emitted by this parser.
const (
ItemComment parser.ItemType = iota // A comment
ItemKey // Key of a key/value pair
ItemKeyDot // Dot for a dotted key
ItemAssignment // Value assignment coming up (=)
ItemString // A value of type string
)
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
// NewParser creates a new parser, using the provided input string
// as the data to parse.
func NewParser(input string) *parser.Parser {
return parser.New(input, stateKeyValuePair)
}
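A minimal usage sketch for NewParser. Reading the emitted items requires some accessor on the parser; the Items() call below is hypothetical and only stands in for whatever the parser package actually exposes:

package main

import (
	"fmt"

	"github.com/mmakaay/toml/lexer"
)

func main() {
	// NewParser wires stateKeyValuePair in as the start state; the
	// state functions in the other lexer files take over from there.
	p := lexer.NewParser(`answer = "42" # the answer`)

	// Items() is a hypothetical accessor for the emitted parser.Item
	// values, used here only to show the shape of the output.
	for _, item := range p.Items() {
		fmt.Printf("%d: %q\n", item.Type, item.Value)
	}
}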

(deleted file)

@@ -1,209 +0,0 @@
package lexer
import "github.com/mmakaay/toml/parser"
const (
whitespace string = " \t"
carriageReturn string = "\r"
newline string = "\n"
hash string = "#"
equal string = "="
lower string = "abcdefghijklmnopqrstuvwxyz"
upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits string = "0123456789"
hex string = digits + "abcdefABCDEF"
dot string = "."
underscore string = "_"
dash string = "-"
singleQuote string = "'"
doubleQuote string = "\""
backslash string = "\\"
quoteChars string = singleQuote + doubleQuote
bareKeyChars string = lower + upper + digits + underscore + dash
startOfKey string = bareKeyChars + quoteChars
escapeChars string = `btnfr"\`
shortUtf8Escape string = "u"
longUtf8Escape string = "U"
)
// NewParser creates a new parser, using the provided input string
// as the data to parse.
func NewParser(input string) *parser.Parser {
return parser.New(input, stateKeyValuePair)
}
func stateKeyValuePair(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace + carriageReturn + newline)
if l.SkipMatching(hash) {
return stateComment
}
if l.Upcoming(startOfKey) {
return stateKey
}
return stateEndOfFile
}
// A '#' hash symbol marks the rest of the line as a comment.
func stateComment(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile() || l.SkipMatching(newline):
l.EmitLiteralTrim(ItemComment)
return stateKeyValuePair
default:
if !l.AcceptAny() {
return nil
}
}
}
}
// A key may be either bare, quoted or dotted.
func stateKey(l *parser.Parser) parser.StateFn {
if l.AcceptMatching(bareKeyChars) {
return statebareKeyChars
}
return l.UnexpectedInputError("a valid key name")
}
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(l *parser.Parser) parser.StateFn {
l.AcceptConsecutive(bareKeyChars)
l.EmitLiteral(ItemKey)
return stateEndOfKeyOrKeyDot
}
// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together:
func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn {
// Whitespace around dot-separated parts is ignored, however,
// best practice is to not use any extraneous whitespace.
l.SkipConsecutive(whitespace)
if l.SkipMatching(dot) {
l.Emit(ItemKeyDot, "")
l.SkipConsecutive(whitespace)
return stateKey
}
return stateKeyAssignment
}
// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
func stateKeyAssignment(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.SkipMatching(equal) {
l.Emit(ItemAssignment, "")
l.SkipConsecutive(whitespace)
return stateValue
}
return l.UnexpectedInputError("a value assignment")
}
// Values must be of the following types: String, Integer, Float, Boolean,
// Datetime, Array, or Inline Table. Unspecified values are invalid.
func stateValue(l *parser.Parser) parser.StateFn {
l.SkipConsecutive(whitespace)
if l.Upcoming(quoteChars) {
return stateStringValue
}
return l.UnexpectedInputError("a value")
}
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *parser.Parser) parser.StateFn {
switch {
case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.SkipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
return stateSingleLineBasicString
}
return l.UnexpectedInputError("a string value")
}
func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
if l.Upcoming(doubleQuote, doubleQuote) {
return stateMultiLineBasicString
}
return stateBasicString
}
func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
l.EmitError("Not yet implemented")
return nil
}
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
const invalidBasicStringCharacters string = "\"\\" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile():
return l.UnexpectedEndOfFile("basic string token")
case l.SkipMatching(doubleQuote):
return l.PopState()
case l.AcceptMatching(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \uXXXX - unicode (U+XXXX)
case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
case l.Upcoming(backslash):
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.EmitError("Invalid escape sequence in basic string")
case l.Upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
r, _, _ := l.Match(invalidBasicStringCharacters)
l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
return nil
default:
if !l.AcceptAny() {
return nil
}
}
}
}
func stateBasicString(l *parser.Parser) parser.StateFn {
l.PushState(func(l *parser.Parser) parser.StateFn {
err := l.EmitInterpreted(ItemString)
if err != nil {
l.EmitError("Invalid data in string: %s", err)
return nil
}
return stateKeyValuePair
})
return stateParseBasicString
}
func stateEndOfFile(l *parser.Parser) parser.StateFn {
if l.AtEndOfFile() {
l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser?
} else {
l.UnexpectedInputError("end of file")
}
return nil
}

(modified file, lexer tests)

@@ -6,6 +6,7 @@ import (
 	"testing"
 	"github.com/mmakaay/toml/lexer"
+	"github.com/mmakaay/toml/parser"
 )
 func TestErrorsIncludeLineAndRowPosition(t *testing.T) {
@@ -52,6 +53,8 @@ func TestComments(t *testing.T) {
 		{"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""},
 		{"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""},
 		{"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""},
+		{"comment with multiple hashes", `#### Just Jack!`, `#(Just Jack!)`, ""},
+		{"comment with hashes inside", `# Follow #me2`, `#(Follow #me2)`, ""},
 		{"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""},
 	})
 }
@@ -178,7 +181,7 @@ func runStatesT(t *testing.T, c statesT) {
 		t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l))
 	}
 	for i, e := range expected {
-		v := lexer.ParserItemToString(l[i])
+		v := ParserItemToString(l[i])
 		if v != e {
 			t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v)
 		}
@@ -186,7 +189,7 @@ func runStatesT(t *testing.T, c statesT) {
 	case string:
 		a := make([]string, len(l))
 		for _, v := range l {
-			a = append(a, lexer.ParserItemToString(v))
+			a = append(a, ParserItemToString(v))
 		}
 		actual := strings.Join(a, "")
 		if actual != expected {
@@ -194,3 +197,22 @@ func runStatesT(t *testing.T, c statesT) {
 		}
 	}
 }
+
+// ParserItemToString returns a string representation of the
+// parser.Item. This is used for unit testing purposes.
+func ParserItemToString(i parser.Item) string {
+	switch i.Type {
+	case lexer.ItemComment:
+		return fmt.Sprintf("#(%s)", i.Value)
+	case lexer.ItemKey:
+		return fmt.Sprintf("[%s]", i.Value)
+	case lexer.ItemString:
+		return fmt.Sprintf("STR(%s)", i.Value)
+	case lexer.ItemKeyDot:
+		return "."
+	case lexer.ItemAssignment:
+		return "="
+	default:
+		panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type))
+	}
+}

lexer/strings.go (new file, +88)

@@ -0,0 +1,88 @@
package lexer
import "github.com/mmakaay/toml/parser"
// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
func stateStringValue(l *parser.Parser) parser.StateFn {
switch {
case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote):
// Multi-line basic strings are surrounded by three quotation marks on each side.
return stateMultiLineBasicString
case l.SkipMatching(doubleQuote):
// Basic strings are surrounded by quotation marks.
return stateSingleLineBasicString
}
return l.UnexpectedInputError("a string value")
}
func stateSingleLineBasicString(l *parser.Parser) parser.StateFn {
if l.Upcoming(doubleQuote, doubleQuote) {
return stateMultiLineBasicString
}
return stateBasicString
}
func stateMultiLineBasicString(l *parser.Parser) parser.StateFn {
l.EmitError("Not yet implemented")
return nil
}
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
const invalidBasicStringCharacters string = "\"\\" +
"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
"\u007F"
func stateParseBasicString(l *parser.Parser) parser.StateFn {
for {
switch {
case l.AtEndOfFile():
return l.UnexpectedEndOfFile("basic string token")
case l.SkipMatching(doubleQuote):
return l.PopState()
case l.AcceptMatching(backslash, escapeChars):
// For convenience, some popular characters have a compact escape sequence.
// \b - backspace (U+0008)
// \t - tab (U+0009)
// \n - linefeed (U+000A)
// \f - form feed (U+000C)
// \r - carriage return (U+000D)
// \" - quote (U+0022)
// \\ - backslash (U+005C)
case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex):
// \uXXXX - unicode (U+XXXX)
case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
// \UXXXXXXXX - unicode (U+XXXXXXXX)
case l.Upcoming(backslash):
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
return l.EmitError("Invalid escape sequence in basic string")
case l.Upcoming(invalidBasicStringCharacters):
// Any Unicode character may be used except those that must be escaped:
// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
r, _, _ := l.Match(invalidBasicStringCharacters)
l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0])
return nil
default:
if !l.AcceptAny() {
return nil
}
}
}
}
func stateBasicString(l *parser.Parser) parser.StateFn {
l.PushState(func(l *parser.Parser) parser.StateFn {
err := l.EmitInterpreted(ItemString)
if err != nil {
l.EmitError("Invalid data in string: %s", err)
return nil
}
return stateKeyValuePair
})
return stateParseBasicString
}
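For quick reference, the escape forms that stateParseBasicString accepts, collected from the escapeChars, shortUtf8Escape and longUtf8Escape sets above (the concrete code points in the examples are illustrative):

package lexer_examples // illustrative scratch package, not part of the repository

// Backslash sequences accepted inside a basic string; anything else after
// a backslash triggers "Invalid escape sequence in basic string".
var acceptedEscapes = []string{
	`\b`, `\t`, `\n`, `\f`, `\r`, `\"`, `\\`, // compact escapes (escapeChars)
	`\u00E9`,     // \uXXXX     - exactly 4 hex digits (shortUtf8Escape)
	`\U0001F600`, // \UXXXXXXXX - exactly 8 hex digits (longUtf8Escape)
}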

(modified file, parser package)

@@ -17,6 +17,16 @@ func New(input string, startState StateFn) *Parser {
 	}
 }
 
+func (p *Parser) ToChildState(state StateFn) StateFn {
+	p.PushState(p.state)
+	return state
+}
+
+func (p *Parser) ToParentState() StateFn {
+	state := p.PopState()
+	return state
+}
+
 // PushState adds the state function to the state stack.
 // This is used for implementing nested parsing.
 func (l *Parser) PushState(state StateFn) {
@@ -36,6 +46,33 @@ func (l *Parser) AtEndOfFile() bool {
 	return l.pos >= l.len
 }
 
+func (p *Parser) AtEndOfLine() bool {
+	return p.AtEndOfFile() ||
+		p.Upcoming("\r", "\n") ||
+		p.Upcoming("\n")
+}
+
+func (p *Parser) SkipEndOfLine() bool {
+	return p.AtEndOfFile() ||
+		p.SkipMatching("\r", "\n") ||
+		p.SkipMatching("\n")
+}
+
+func (p *Parser) AcceptEndOfLine() bool {
+	// No newline, but we're definitely at the end of the line here.
+	if p.AtEndOfFile() {
+		return true
+	}
+	// If we see some kind of end of line, then we accept a
+	// normalized newline, which is just a '\n'. This will normalize
+	// '\r\n' into '\n'.
+	if p.SkipEndOfLine() {
+		p.buffer.WriteRune('\n')
+		return true
+	}
+	return false
+}
+
 // Emit passes a Parser item to the client, including the provided string.
 func (l *Parser) Emit(t ItemType, s string) {
 	l.items <- Item{t, s}
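Conceptually, the new ToChildState and ToParentState helpers are thin wrappers around the existing state stack. A stripped-down stand-in (not the actual parser code) showing the mechanism that the comment and key/value states rely on:

// Package statestack is an illustrative stand-in for the parser
// package, reduced to the ToChildState / ToParentState mechanics.
package statestack

// StateFn mirrors the parser package's state-function signature.
type StateFn func(*Machine) StateFn

// Machine stands in for parser.Parser; only the state stack is shown.
type Machine struct {
	state StateFn   // the state function currently being executed
	stack []StateFn // parent states to return to later
}

// ToChildState remembers the current state and hands control to a
// child state, which later returns via ToParentState.
func (m *Machine) ToChildState(child StateFn) StateFn {
	m.stack = append(m.stack, m.state)
	return child
}

// ToParentState pops and resumes the most recently pushed parent state.
// It panics if no parent state was pushed, which is fine for a sketch.
func (m *Machine) ToParentState() StateFn {
	last := len(m.stack) - 1
	parent := m.stack[last]
	m.stack = m.stack[:last]
	return parent
}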