diff --git a/lexer/comments.go b/lexer/comments.go new file mode 100644 index 0000000..cb7082d --- /dev/null +++ b/lexer/comments.go @@ -0,0 +1,21 @@ +package lexer + +import "github.com/mmakaay/toml/parser" + +// A '#' hash symbol marks the rest of the line as a comment. +func stateCommentStart(p *parser.Parser) parser.StateFn { + p.SkipConsecutive(hash) + return stateCommentContent +} + +// All characters up to the end of the line are included in the comment. +func stateCommentContent(p *parser.Parser) parser.StateFn { + switch { + case p.AtEndOfLine(): + p.EmitLiteralTrim(ItemComment) + return p.ToParentState() + default: + p.AcceptAny() + return stateCommentContent + } +} diff --git a/lexer/end_of_file.go b/lexer/end_of_file.go new file mode 100644 index 0000000..06ab965 --- /dev/null +++ b/lexer/end_of_file.go @@ -0,0 +1,12 @@ +package lexer + +import "github.com/mmakaay/toml/parser" + +func stateEndOfFile(l *parser.Parser) parser.StateFn { + if l.AtEndOfFile() { + l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser? + } else { + l.UnexpectedInputError("end of file") + } + return nil +} diff --git a/lexer/items.go b/lexer/items.go deleted file mode 100644 index 026aadc..0000000 --- a/lexer/items.go +++ /dev/null @@ -1,35 +0,0 @@ -package lexer - -import ( - "fmt" - - "github.com/mmakaay/toml/parser" -) - -// Definition of all the lexer item types for the TOML lexer. -const ( - ItemComment parser.ItemType = iota // An error occurred - ItemKey // Key of a key/value pair - ItemKeyDot // Dot for a dotted key - ItemAssignment // Value assignment coming up (=) - ItemString // A value of type string -) - -// ParserItemToString returns a string representation of the -// parser.Item. This is used for unit testing purposes. 
-func ParserItemToString(i parser.Item) string { - switch i.Type { - case ItemComment: - return fmt.Sprintf("#(%s)", i.Value) - case ItemKey: - return fmt.Sprintf("[%s]", i.Value) - case ItemString: - return fmt.Sprintf("STR(%s)", i.Value) - case ItemKeyDot: - return "." - case ItemAssignment: - return "=" - default: - panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type)) - } -} diff --git a/lexer/key_value_pairs.go b/lexer/key_value_pairs.go new file mode 100644 index 0000000..c64213f --- /dev/null +++ b/lexer/key_value_pairs.go @@ -0,0 +1,73 @@ +package lexer + +import "github.com/mmakaay/toml/parser" + +// The primary building block of a TOML document is the key/value pair. +func stateKeyValuePair(l *parser.Parser) parser.StateFn { + switch { + case l.SkipConsecutive(whitespace + carriageReturn + newline): + return stateKeyValuePair + case l.Upcoming(hash): + return l.ToChildState(stateCommentStart) + case l.Upcoming(startOfKey): + return l.ToChildState(stateKey) + default: + return stateEndOfFile + } +} + +// A key may be either bare, quoted or dotted. +func stateKey(l *parser.Parser) parser.StateFn { + if l.AcceptMatching(bareKeyChars) { + return statebareKeyChars + } + return l.UnexpectedInputError("a valid key name") +} + +// Bare keys may only contain ASCII letters, ASCII digits, +// underscores, and dashes (A-Za-z0-9_-). Note that bare +// keys are allowed to be composed of only ASCII digits, +// e.g. 1234, but are always interpreted as strings. +func statebareKeyChars(l *parser.Parser) parser.StateFn { + l.AcceptConsecutive(bareKeyChars) + l.EmitLiteral(ItemKey) + return stateEndOfKeyOrKeyDot +} + +// Dotted keys are a sequence of bare or quoted keys joined with a dot. +// This allows for grouping similar properties together: +func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn { + // Whitespace around dot-separated parts is ignored, however, + // best practice is to not use any extraneous whitespace. 
+ l.SkipConsecutive(whitespace) + if l.SkipMatching(dot) { + l.Emit(ItemKeyDot, "") + l.SkipConsecutive(whitespace) + return stateKey + } + return stateKeyAssignment +} + +// Keys are on the left of the equals sign and values are on the right. +// Whitespace is ignored around key names and values. The key, equals +// sign, and value must be on the same line (though some values can +// be broken over multiple lines). +func stateKeyAssignment(l *parser.Parser) parser.StateFn { + l.SkipConsecutive(whitespace) + if l.SkipMatching(equal) { + l.Emit(ItemAssignment, "") + l.SkipConsecutive(whitespace) + return stateValue + } + return l.UnexpectedInputError("a value assignment") +} + +// Values must be of the following types: String, Integer, Float, Boolean, +// Datetime, Array, or Inline Table. Unspecified values are invalid. +func stateValue(l *parser.Parser) parser.StateFn { + l.SkipConsecutive(whitespace) + if l.Upcoming(quoteChars) { + return stateStringValue + } + return l.UnexpectedInputError("a value") +} diff --git a/lexer/main.go b/lexer/main.go new file mode 100644 index 0000000..a218c65 --- /dev/null +++ b/lexer/main.go @@ -0,0 +1,42 @@ +package lexer + +import "github.com/mmakaay/toml/parser" + +// Definition of the item types that are emitted by this parser. +const ( + ItemComment parser.ItemType = iota // A comment (the text following a '#') + ItemKey // Key of a key/value pair + ItemKeyDot // Dot for a dotted key + ItemAssignment // Value assignment coming up (=) + ItemString // A value of type string +) + +const ( + whitespace string = " \t" + carriageReturn string = "\r" + newline string = "\n" + hash string = "#" + equal string = "=" + lower string = "abcdefghijklmnopqrstuvwxyz" + upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + digits string = "0123456789" + hex string = digits + "abcdefABCDEF" + dot string = "." 
+ underscore string = "_" + dash string = "-" + singleQuote string = "'" + doubleQuote string = "\"" + backslash string = "\\" + quoteChars string = singleQuote + doubleQuote + bareKeyChars string = lower + upper + digits + underscore + dash + startOfKey string = bareKeyChars + quoteChars + escapeChars string = `btnfr"\` + shortUtf8Escape string = "u" + longUtf8Escape string = "U" +) + +// NewParser creates a new parser, using the provided input string +// as the data to parse. +func NewParser(input string) *parser.Parser { + return parser.New(input, stateKeyValuePair) +} diff --git a/lexer/states.go b/lexer/states.go deleted file mode 100644 index 4378982..0000000 --- a/lexer/states.go +++ /dev/null @@ -1,209 +0,0 @@ -package lexer - -import "github.com/mmakaay/toml/parser" - -const ( - whitespace string = " \t" - carriageReturn string = "\r" - newline string = "\n" - hash string = "#" - equal string = "=" - lower string = "abcdefghijklmnopqrstuvwxyz" - upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - digits string = "0123456789" - hex string = digits + "abcdefABCDEF" - dot string = "." - underscore string = "_" - dash string = "-" - singleQuote string = "'" - doubleQuote string = "\"" - backslash string = "\\" - quoteChars string = singleQuote + doubleQuote - bareKeyChars string = lower + upper + digits + underscore + dash - startOfKey string = bareKeyChars + quoteChars - escapeChars string = `btnfr"\` - shortUtf8Escape string = "u" - longUtf8Escape string = "U" -) - -// NewParser creates a new parser, using the provided input string -// as the data to parse. -func NewParser(input string) *parser.Parser { - return parser.New(input, stateKeyValuePair) -} - -func stateKeyValuePair(l *parser.Parser) parser.StateFn { - l.SkipConsecutive(whitespace + carriageReturn + newline) - if l.SkipMatching(hash) { - return stateComment - } - if l.Upcoming(startOfKey) { - return stateKey - } - return stateEndOfFile -} - -// A '#' hash symbol marks the rest of the line as a comment. 
-func stateComment(l *parser.Parser) parser.StateFn { - for { - switch { - case l.AtEndOfFile() || l.SkipMatching(newline): - l.EmitLiteralTrim(ItemComment) - return stateKeyValuePair - default: - if !l.AcceptAny() { - return nil - } - } - } -} - -// A key may be either bare, quoted or dotted. -func stateKey(l *parser.Parser) parser.StateFn { - if l.AcceptMatching(bareKeyChars) { - return statebareKeyChars - } - return l.UnexpectedInputError("a valid key name") -} - -// Bare keys may only contain ASCII letters, ASCII digits, -// underscores, and dashes (A-Za-z0-9_-). Note that bare -// keys are allowed to be composed of only ASCII digits, -// e.g. 1234, but are always interpreted as strings. -func statebareKeyChars(l *parser.Parser) parser.StateFn { - l.AcceptConsecutive(bareKeyChars) - l.EmitLiteral(ItemKey) - return stateEndOfKeyOrKeyDot -} - -// Dotted keys are a sequence of bare or quoted keys joined with a dot. -// This allows for grouping similar properties together: -func stateEndOfKeyOrKeyDot(l *parser.Parser) parser.StateFn { - // Whitespace around dot-separated parts is ignored, however, - // best practice is to not use any extraneous whitespace. - l.SkipConsecutive(whitespace) - if l.SkipMatching(dot) { - l.Emit(ItemKeyDot, "") - l.SkipConsecutive(whitespace) - return stateKey - } - return stateKeyAssignment -} - -// Keys are on the left of the equals sign and values are on the right. -// Whitespace is ignored around key names and values. The key, equals -// sign, and value must be on the same line (though some values can -// be broken over multiple lines). -func stateKeyAssignment(l *parser.Parser) parser.StateFn { - l.SkipConsecutive(whitespace) - if l.SkipMatching(equal) { - l.Emit(ItemAssignment, "") - l.SkipConsecutive(whitespace) - return stateValue - } - return l.UnexpectedInputError("a value assignment") -} - -// Values must be of the following types: String, Integer, Float, Boolean, -// Datetime, Array, or Inline Table. 
Unspecified values are invalid. -func stateValue(l *parser.Parser) parser.StateFn { - l.SkipConsecutive(whitespace) - if l.Upcoming(quoteChars) { - return stateStringValue - } - return l.UnexpectedInputError("a value") -} - -// There are four ways to express strings: basic, multi-line basic, literal, -// and multi-line literal. All strings must contain only valid UTF-8 characters. -func stateStringValue(l *parser.Parser) parser.StateFn { - switch { - case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote): - // Multi-line basic strings are surrounded by three quotation marks on each side. - return stateMultiLineBasicString - case l.SkipMatching(doubleQuote): - // Basic strings are surrounded by quotation marks. - return stateSingleLineBasicString - } - return l.UnexpectedInputError("a string value") -} - -func stateSingleLineBasicString(l *parser.Parser) parser.StateFn { - if l.Upcoming(doubleQuote, doubleQuote) { - return stateMultiLineBasicString - } - return stateBasicString -} - -func stateMultiLineBasicString(l *parser.Parser) parser.StateFn { - l.EmitError("Not yet implemented") - return nil -} - -// Any Unicode character may be used except those that must be escaped: -// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). -const invalidBasicStringCharacters string = "\"\\" + - "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + - "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + - "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + - "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + - "\u007F" - -func stateParseBasicString(l *parser.Parser) parser.StateFn { - for { - switch { - case l.AtEndOfFile(): - return l.UnexpectedEndOfFile("basic string token") - case l.SkipMatching(doubleQuote): - return l.PopState() - case l.AcceptMatching(backslash, escapeChars): - // For convenience, some popular characters have a compact escape sequence. 
- // \b - backspace (U+0008) - // \t - tab (U+0009) - // \n - linefeed (U+000A) - // \f - form feed (U+000C) - // \r - carriage return (U+000D) - // \" - quote (U+0022) - // \\ - backslash (U+005C) - case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex): - // \uXXXX - unicode (U+XXXX) - case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): - // \UXXXXXXXX - unicode (U+XXXXXXXX) - case l.Upcoming(backslash): - // All other escape sequences not listed above are reserved and, - // if used, TOML should produce an error. - return l.EmitError("Invalid escape sequence in basic string") - case l.Upcoming(invalidBasicStringCharacters): - // Any Unicode character may be used except those that must be escaped: - // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). - r, _, _ := l.Match(invalidBasicStringCharacters) - l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) - return nil - default: - if !l.AcceptAny() { - return nil - } - } - } -} - -func stateBasicString(l *parser.Parser) parser.StateFn { - l.PushState(func(l *parser.Parser) parser.StateFn { - err := l.EmitInterpreted(ItemString) - if err != nil { - l.EmitError("Invalid data in string: %s", err) - return nil - } - return stateKeyValuePair - }) - return stateParseBasicString -} - -func stateEndOfFile(l *parser.Parser) parser.StateFn { - if l.AtEndOfFile() { - l.Emit(parser.ItemEOF, "EOF") // todo Automate within parser? 
- } else { - l.UnexpectedInputError("end of file") - } - return nil -} diff --git a/lexer/states_test.go b/lexer/states_test.go index 680b6bd..5a242ff 100644 --- a/lexer/states_test.go +++ b/lexer/states_test.go @@ -6,6 +6,7 @@ import ( "testing" "github.com/mmakaay/toml/lexer" + "github.com/mmakaay/toml/parser" ) func TestErrorsIncludeLineAndRowPosition(t *testing.T) { @@ -52,6 +53,8 @@ func TestComments(t *testing.T) { {"basic comment with surrounding whitespace", "#\t cow \t", "#(cow)", ""}, {"two lines of comments", "# one \r\n#two", "#(one)#(two)", ""}, {"comment with escape-y chars", `# \xxx/ \u can't escape/`, `#(\xxx/ \u can't escape/)`, ""}, + {"comment with multiple hashes", `#### Just Jack!`, `#(Just Jack!)`, ""}, + {"comment with hashes inside", `# Follow #me2`, `#(Follow #me2)`, ""}, {"carriage returns in comment", "# \tlexe\r accepts embedded ca\r\riage \returns\r", "#(lexe\r accepts embedded ca\r\riage \returns)", ""}, }) } @@ -178,7 +181,7 @@ func runStatesT(t *testing.T, c statesT) { t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l)) } for i, e := range expected { - v := lexer.ParserItemToString(l[i]) + v := ParserItemToString(l[i]) if v != e { t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v) } @@ -186,7 +189,7 @@ func runStatesT(t *testing.T, c statesT) { case string: a := make([]string, len(l)) for _, v := range l { - a = append(a, lexer.ParserItemToString(v)) + a = append(a, ParserItemToString(v)) } actual := strings.Join(a, "") if actual != expected { @@ -194,3 +197,22 @@ func runStatesT(t *testing.T, c statesT) { } } } + +// ParserItemToString returns a string representation of the +// parser.Item. This is used for unit testing purposes. 
+func ParserItemToString(i parser.Item) string { + switch i.Type { + case lexer.ItemComment: + return fmt.Sprintf("#(%s)", i.Value) + case lexer.ItemKey: + return fmt.Sprintf("[%s]", i.Value) + case lexer.ItemString: + return fmt.Sprintf("STR(%s)", i.Value) + case lexer.ItemKeyDot: + return "." + case lexer.ItemAssignment: + return "=" + default: + panic(fmt.Sprintf("No string representation available for parser.Item id %d", i.Type)) + } +} diff --git a/lexer/strings.go b/lexer/strings.go new file mode 100644 index 0000000..960273f --- /dev/null +++ b/lexer/strings.go @@ -0,0 +1,88 @@ +package lexer + +import "github.com/mmakaay/toml/parser" + +// There are four ways to express strings: basic, multi-line basic, literal, +// and multi-line literal. All strings must contain only valid UTF-8 characters. +func stateStringValue(l *parser.Parser) parser.StateFn { + switch { + case l.SkipMatching(doubleQuote, doubleQuote, doubleQuote): + // Multi-line basic strings are surrounded by three quotation marks on each side. + return stateMultiLineBasicString + case l.SkipMatching(doubleQuote): + // Basic strings are surrounded by quotation marks. + return stateSingleLineBasicString + } + return l.UnexpectedInputError("a string value") +} + +func stateSingleLineBasicString(l *parser.Parser) parser.StateFn { + if l.Upcoming(doubleQuote, doubleQuote) { + return stateMultiLineBasicString + } + return stateBasicString +} + +func stateMultiLineBasicString(l *parser.Parser) parser.StateFn { + l.EmitError("Not yet implemented") + return nil +} + +// Any Unicode character may be used except those that must be escaped: +// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). 
+const invalidBasicStringCharacters string = "\"\\" + + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + + "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + + "\u007F" + +func stateParseBasicString(l *parser.Parser) parser.StateFn { + for { + switch { + case l.AtEndOfFile(): + return l.UnexpectedEndOfFile("basic string token") + case l.SkipMatching(doubleQuote): + return l.PopState() + case l.AcceptMatching(backslash, escapeChars): + // For convenience, some popular characters have a compact escape sequence. + // \b - backspace (U+0008) + // \t - tab (U+0009) + // \n - linefeed (U+000A) + // \f - form feed (U+000C) + // \r - carriage return (U+000D) + // \" - quote (U+0022) + // \\ - backslash (U+005C) + case l.AcceptMatching(backslash, shortUtf8Escape, hex, hex, hex, hex): + // \uXXXX - unicode (U+XXXX) + case l.AcceptMatching(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): + // \UXXXXXXXX - unicode (U+XXXXXXXX) + case l.Upcoming(backslash): + // All other escape sequences not listed above are reserved and, + // if used, TOML should produce an error. + return l.EmitError("Invalid escape sequence in basic string") + case l.Upcoming(invalidBasicStringCharacters): + // Any Unicode character may be used except those that must be escaped: + // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). 
+ r, _, _ := l.Match(invalidBasicStringCharacters) + l.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) + return nil + default: + if !l.AcceptAny() { + return nil + } + } + } +} + +func stateBasicString(l *parser.Parser) parser.StateFn { + l.PushState(func(l *parser.Parser) parser.StateFn { + err := l.EmitInterpreted(ItemString) + if err != nil { + l.EmitError("Invalid data in string: %s", err) + return nil + } + return stateKeyValuePair + }) + return stateParseBasicString +} diff --git a/parser/parser.go b/parser/parser.go index 160ae0b..f6af569 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -17,6 +17,16 @@ func New(input string, startState StateFn) *Parser { } } +func (p *Parser) ToChildState(state StateFn) StateFn { + p.PushState(p.state) + return state +} + +func (p *Parser) ToParentState() StateFn { + state := p.PopState() + return state +} + // PushState adds the state function to the state stack. // This is used for implementing nested parsing. func (l *Parser) PushState(state StateFn) { @@ -36,6 +46,33 @@ func (l *Parser) AtEndOfFile() bool { return l.pos >= l.len } +func (p *Parser) AtEndOfLine() bool { + return p.AtEndOfFile() || + p.Upcoming("\r", "\n") || + p.Upcoming("\n") +} + +func (p *Parser) SkipEndOfLine() bool { + return p.AtEndOfFile() || + p.SkipMatching("\r", "\n") || + p.SkipMatching("\n") +} + +func (p *Parser) AcceptEndOfLine() bool { + // No newline, but we're definitely at the end of the line here. + if p.AtEndOfFile() { + return true + } + // If we see some kind of end of line, then we accept a + // normalized newline, which is just a '\n'. This will normalize + // '\r\n' into '\n'. + if p.SkipEndOfLine() { + p.buffer.WriteRune('\n') + return true + } + return false +} + // Emit passes a Parser item to the client, including the provided string. func (l *Parser) Emit(t ItemType, s string) { l.items <- Item{t, s}