diff --git a/parsekit/emitting.go b/parsekit/emitting.go index 293b7ec..9f534a3 100644 --- a/parsekit/emitting.go +++ b/parsekit/emitting.go @@ -3,6 +3,7 @@ package parsekit import ( "fmt" "strings" + "unicode/utf8" ) // Emit passes a Parser item to the client, including the provided string. @@ -51,8 +52,16 @@ func (p *P) EmitError(format string, args ...interface{}) { func (p *P) UnexpectedInput(expected string) { // next() takes care of error messages in cases where ok == false. // Therefore, we only provide an error message for the ok case here. - if r, ok := p.next(); ok { - p.EmitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) + r, _, ok := p.peek(0) + switch { + case ok: + p.EmitError("unexpected character %q (expected %s)", r, expected) + case r == EOF: + p.EmitError("unexpected end of file (expected %s)", expected) + case r == utf8.RuneError: + p.EmitError("invalid UTF8 character in input (expected %s)", expected) + default: + panic("Unhandled output from peek()") } } diff --git a/parsekit/internals.go b/parsekit/internals.go index 776b19e..17b49d1 100644 --- a/parsekit/internals.go +++ b/parsekit/internals.go @@ -4,32 +4,13 @@ import ( "unicode/utf8" ) -// next returns the next rune from the input and a boolean indicating if -// reading the input was successful. -// When the end of input is reached, or an invalid UTF8 character is -// read, then false is returned. Both are considered error cases, -// and for that reason these automatically emit an error to the client. -func (p *P) next() (rune, bool) { - r, w, ok := p.peek(0) - if ok { - p.advanceCursor(r, w) - return r, true - } - if r == utf8.RuneError && w == 0 { - p.EmitError("unexpected end of file") - } else { - p.EmitError("invalid UTF8 character") - } - return r, false -} - // peek returns but does not advance the cursor to the next rune(s) in the input. // Returns the rune, its width in bytes and a boolean. 
// The boolean will be false in case no upcoming rune can be peeked // (end of data or invalid UTF8 character). func (p *P) peek(offsetInBytes int) (rune, int, bool) { - peeked, width := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) - return peeked, width, peeked != utf8.RuneError + r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) + return handleRuneError(r, w) } // peekMulti takes a peek at multiple upcoming runes in the input. @@ -43,13 +24,12 @@ func (p *P) peekMulti(amount int) ([]rune, []int, bool) { offset := 0 for i := 0; i < amount; i++ { r, w := utf8.DecodeRuneInString(p.input[p.pos+offset:]) - switch { - case r == utf8.RuneError: + r, w, ok := handleRuneError(r, w) + runes = append(runes, r) + widths = append(widths, w) + offset += w + if !ok { return runes, widths, false - default: - offset += w - runes = append(runes, r) - widths = append(widths, w) } } return runes, widths, true @@ -86,3 +66,21 @@ func (p *P) advanceCursor(r rune, w int) { } p.newline = r == '\n' } + +// handleRuneError is used to normalize the rune value in case of errors. +// When an error occurs, then utf8.RuneError will be in the rune. +// This can however indicate one of two situations: +// * w == 0: end of file is reached +// * w == 1: invalid UTF8 character on input +// This function lets these two cases return respectively the +// package's own EOF or INVALID runes, to make it easy for client +// code to distinguish between these two cases.
+func handleRuneError(r rune, w int) (rune, int, bool) { + if r == utf8.RuneError { + if w == 0 { + return EOF, 0, false + } + return INVALID, w, false + } + return r, w, true +} diff --git a/parsekit/matchers.go b/parsekit/matchers.go new file mode 100644 index 0000000..74a87cb --- /dev/null +++ b/parsekit/matchers.go @@ -0,0 +1,218 @@ +package parsekit + +import "unicode/utf8" + +// Not in need of it myself, but nice to have I guess: +// - NotFollowedBy +// - Discard +// - Separated + +type MatchDialog struct { + p *P + runes []rune + widths []int + offset int + curRune rune + curWidth int + forked bool +} + +func (m *MatchDialog) Fork() *MatchDialog { + fork := &MatchDialog{ + p: m.p, + offset: m.offset, + forked: true, + } + return fork +} + +func (m *MatchDialog) Join(fork *MatchDialog) bool { + if !fork.forked { + panic("Cannot join a non-forked MatchDialog") + } + m.runes = append(m.runes, fork.runes...) + m.widths = append(m.widths, fork.widths...) + m.offset = fork.offset + fork.runes = []rune{} + fork.widths = []int{} + return true +} + +func (m *MatchDialog) NextRune() (rune, bool) { + if m.curRune == utf8.RuneError { + panic("Matcher must not call NextRune() after it returned false") + } + r, w := utf8.DecodeRuneInString(m.p.input[m.p.pos+m.offset:]) + m.offset += w + m.curRune = r + m.curWidth = w + m.runes = append(m.runes, r) + m.widths = append(m.widths, w) + return r, r != EOF && r != INVALID +} + +// Matcher is the interface that can be implemented to provide +// a matching strategy for the match() function. +// A MatchDialog is provided as input. This implements a +// specific set of methods that a Matcher needs to retrieve data +// from the parser and to report back results.
+type Matcher interface { + Match(*MatchDialog) bool +} + +type MatcherConstructors struct { + Any func() MatchAny + Rune func(rune rune) MatchRune + RuneRange func(start rune, end rune) MatchRuneRange + Runes func(runes ...rune) MatchAnyOf + AnyOf func(matchers ...Matcher) MatchAnyOf + Repeat func(count int, matcher Matcher) MatchRepeat + Sequence func(matchers ...Matcher) MatchSequence + ZeroOrMore func(matcher Matcher) MatchZeroOrMore + OneOrMore func(matcher Matcher) MatchOneOrMore + Optional func(matcher Matcher) MatchOptional +} + +var C = MatcherConstructors{ + Any: func() MatchAny { + return MatchAny{} + }, + Rune: func(rune rune) MatchRune { + return MatchRune{rune} + }, + RuneRange: func(start rune, end rune) MatchRuneRange { + return MatchRuneRange{start, end} + }, + Runes: func(runes ...rune) MatchAnyOf { + m := make([]Matcher, len(runes)) + for i, r := range runes { + m[i] = MatchRune{r} + } + return MatchAnyOf{m} + }, + AnyOf: func(matchers ...Matcher) MatchAnyOf { + return MatchAnyOf{matchers} + }, + Repeat: func(count int, matcher Matcher) MatchRepeat { + return MatchRepeat{count, matcher} + }, + Sequence: func(matchers ...Matcher) MatchSequence { + return MatchSequence{matchers} + }, + OneOrMore: func(matcher Matcher) MatchOneOrMore { + return MatchOneOrMore{matcher} + }, + ZeroOrMore: func(matcher Matcher) MatchZeroOrMore { + return MatchZeroOrMore{matcher} + }, + Optional: func(matcher Matcher) MatchOptional { + return MatchOptional{matcher} + }, +} + +type MatchAny struct{} + +func (c MatchAny) Match(m *MatchDialog) bool { + _, ok := m.NextRune() + return ok +} + +type MatchRune struct { + match rune +} + +func (c MatchRune) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return ok && r == c.match +} + +type MatchRuneRange struct { + start rune + end rune +} + +func (c MatchRuneRange) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return ok && r >= c.start && r <= c.end +} + +type MatchAnyOf struct { + matcher []Matcher +} + +func 
(c MatchAnyOf) Match(m *MatchDialog) bool { + for _, matcher := range c.matcher { + mc := m.Fork() + if matcher.Match(mc) { + return m.Join(mc) + } + } + return false +} + +type MatchRepeat struct { + count int + matcher Matcher +} + +func (c MatchRepeat) Match(m *MatchDialog) bool { + mc := m.Fork() + for i := 0; i < c.count; i++ { + if !c.matcher.Match(mc) { + return false + } + } + m.Join(mc) + return true +} + +type MatchSequence struct { + matchers []Matcher +} + +func (c MatchSequence) Match(m *MatchDialog) bool { + mPart := m.Fork() + for _, matcher := range c.matchers { + if !matcher.Match(mPart) { + return false + } + } + m.Join(mPart) + return true +} + +type MatchOneOrMore struct { + matcher Matcher +} + +func (c MatchOneOrMore) Match(m *MatchDialog) bool { + mc := m.Fork() + for c.matcher.Match(mc) { + m.Join(mc) + } + return len(m.runes) > 0 +} + +type MatchZeroOrMore struct { + matcher Matcher +} + +func (c MatchZeroOrMore) Match(m *MatchDialog) bool { + mc := m.Fork() + for c.matcher.Match(mc) { + m.Join(mc) + } + return true +} + +type MatchOptional struct { + matcher Matcher +} + +func (c MatchOptional) Match(m *MatchDialog) bool { + mc := m.Fork() + if c.matcher.Match(mc) { + m.Join(mc) + } + return true +} diff --git a/parsekit/matchers_test.go b/parsekit/matchers_test.go new file mode 100644 index 0000000..7f1d474 --- /dev/null +++ b/parsekit/matchers_test.go @@ -0,0 +1,260 @@ +package parsekit_test + +import ( + "testing" + + p "github.com/mmakaay/toml/parsekit" +) + +var c = p.C + +const TestItem p.ItemType = 1 + +func newParser(input string, matcher p.Matcher) *p.P { + stateFn := func(p *p.P) { + if p.On(matcher).Accept() { + p.EmitLiteral(TestItem) + p.Repeat() + } else { + p.UnexpectedInput("MATCH") + } + } + return p.New(input, stateFn) +} + +func TestMatchAny(t *testing.T) { + p := newParser("o", c.Any()) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not 
expected TestTitem") + } + if r.Value != "o" { + t.Errorf("Parser item value is %q instead of expected \"o\"", r.Value) + } +} + +func TestMatchAny_AtEndOfFile(t *testing.T) { + p := newParser("", c.Any()) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "unexpected end of file (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchAny_AtInvalidUtf8Rune(t *testing.T) { + p := newParser("\xcd", c.Any()) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "invalid UTF8 character in input (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchRune(t *testing.T) { + p := newParser("xxx", c.Rune('x')) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "x" { + t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) + } +} + +func TestMatchRune_OnMismatch(t *testing.T) { + p := newParser("x ", c.Rune(' ')) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing did not fail unexpectedly") + } + expected := "unexpected character 'x' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchRuneRange(t *testing.T) { + m := c.RuneRange('b', 'y') + s := "mnopqrstuvwxybcdefghijkl" + p := newParser(s, m) + for i := 0; i < len(s); i++ { + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if s[i] != r.Value[0] { + t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", i+1, s[i], r.Value[0]) + } + } + if _, _, ok := newParser("a", m).Next(); ok { + t.Fatalf("Unexpected parse 
success for input 'a'") + } + if _, _, ok := newParser("z", m).Next(); ok { + t.Fatalf("Unexpected parse success for input 'z'") + } +} + +func TestMatchRunes(t *testing.T) { + m := c.Runes('+', '-', '*', '/') + s := "-+/*+++" + p := newParser(s, m) + for i := 0; i < len(s); i++ { + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if s[i] != r.Value[0] { + t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", i+1, s[i], r.Value[0]) + } + } + if _, _, ok := newParser("^", m).Next(); ok { + t.Fatalf("Unexpected parse success for input '^'") + } + if _, _, ok := newParser("x", m).Next(); ok { + t.Fatalf("Unexpected parse success for input 'x'") + } +} + +func TestMatchAnyOf(t *testing.T) { + p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "a" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } + + r, err, ok = p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "b" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } +} + +func TestMatchRepeat(t *testing.T) { + p := newParser("xxxxyyyy", c.Repeat(4, c.Rune('x'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "xxxx" { + t.Errorf("Parser item value is %q instead of expected \"xxxx\"", r.Value) + } +} + +func TestMatchRepeat_Mismatch(t *testing.T) { + p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x'))) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing did not fail unexpectedly") + } + expected := "unexpected character 'x' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: 
%s\n", expected, err.Error()) + } +} + +func TestMatchOneOrMore(t *testing.T) { + p := newParser("xxxxxxxxyyyy", c.OneOrMore(c.Rune('x'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "xxxxxxxx" { + t.Errorf("Parser item value is %q instead of expected \"xxxxxxxx\"", r.Value) + } +} + +func TestMatchSequence(t *testing.T) { + p := newParser("10101", c.Sequence(c.Rune('1'), c.Rune('0'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "10" { + t.Errorf("Parser item value is %q instead of expected \"10\"", r.Value) + } +} + +func TestMatchSequence_CombinedWithOneOrMore(t *testing.T) { + p := newParser("101010987", c.OneOrMore(c.Sequence(c.Rune('1'), c.Rune('0')))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "101010" { + t.Errorf("Parser item value is %q instead of expected \"101010\"", r.Value) + } +} + +func TestSequence_WithRepeatedRunes(t *testing.T) { + whitespace := c.Optional(c.OneOrMore(c.Rune(' '))) + equal := c.Rune('=') + assignment := c.Sequence(whitespace, equal, whitespace) + p := newParser(" == 10", assignment) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != " =" { + t.Errorf("Parser item value is %q instead of expected \" =\"", r.Value) + } +} + +func TestMatchOptional(t *testing.T) { + p := newParser("xyz", c.Optional(c.Rune('x'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "x" { + t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) + } + + p = newParser("xyz", c.Optional(c.Rune('y'))) + r, err, ok = p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column 
%d\n", err, err.Row, err.Column) + } + if r.Value != "" { + t.Errorf("Parser item value is %q instead of expected \"\"", r.Value) + } +} + +func TestMixAndMatch(t *testing.T) { + hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) + backslash := c.Rune('\\') + x := c.Rune('x') + hexbyte := c.Sequence(backslash, x, c.Repeat(2, hex)) + + p := newParser(`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.Repeat(4, hexbyte)) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != `\x9a\x01\xF0\xfC` { + t.Errorf("Parser item value is %q instead of expected \"%q\"", r.Value, `\x9a\x01\xF0\xfC`) + } +} diff --git a/parsekit/matching.go b/parsekit/matching.go index 81b9a8e..f092128 100644 --- a/parsekit/matching.go +++ b/parsekit/matching.go @@ -64,6 +64,13 @@ func (p *P) match(offset int, patterns ...interface{}) ([]rune, []int, bool) { return runes, widths, false } switch pattern := pattern.(type) { + case Matcher: + m := &MatchDialog{p: p} + if pattern.Match(m) { + return m.runes, m.widths, true + } else { + return m.runes, m.widths, false + } case []interface{}: rs, ws, matched := p.match(offset, pattern...) for i, r := range rs { @@ -98,17 +105,6 @@ func (p *P) Upcoming(patterns ...interface{}) bool { return ok } -// AcceptAny adds the next rune from the input to the string buffer. -// If no rune could be read (end of file or invalid UTF8 data), -// then false is returned. 
-func (p *P) AcceptAny() bool { - if r, ok := p.next(); ok { - p.buffer.writeRune(r) - return true - } - return false -} - type action struct { p *P runes []rune @@ -129,6 +125,10 @@ func (a *action) Accept() bool { func (a *action) Skip() bool { if a.ok { for i, r := range a.runes { + type C struct { + Rune MatchRune + } + a.p.advanceCursor(r, a.widths[i]) } } @@ -159,20 +159,10 @@ func (p *P) On(patterns ...interface{}) *action { // AcceptMatching adds the next runes to the string buffer, but only // if the upcoming runes satisfy the provided patterns. // When runes were added then true is returned, false otherwise. -func (p *P) AcceptMatching(patterns ...interface{}) bool { - return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...) -} - -// AcceptConsecutive adds consecutive runes from the input to the string -// buffer, as long as they exist in the pattern. -// If any runes were added then true is returned, false otherwise. -func (p *P) AcceptConsecutive(pattern string) bool { - accepted := false - for p.AcceptMatching(pattern) { - accepted = true - } - return accepted -} +// TODO not needed anymore +// func (p *P) AcceptMatching(patterns ...interface{}) bool { +// return p.progress(func(r rune) { p.buffer.writeRune(r) }, patterns...) +// } // SkipMatching skips runes, but only when all provided patterns are satisfied. // Returns true when one or more runes were skipped. @@ -185,13 +175,3 @@ func (p *P) SkipMatching(patterns ...interface{}) bool { } return false } - -// SkipConsecutive skips consecutive runes from the provided pattern. -// Returns true when one or more runes were skipped. 
-func (p *P) SkipConsecutive(pattern string) bool { - didSkip := false - for p.SkipMatching(pattern) { - didSkip = true - } - return didSkip -} diff --git a/parsekit/types.go b/parsekit/types.go index a8d3500..ff5f3ed 100644 --- a/parsekit/types.go +++ b/parsekit/types.go @@ -1,5 +1,9 @@ package parsekit +import ( + "unicode/utf8" +) + // P holds the internal state of the parser. type P struct { state StateFn // the function that handles the current state @@ -50,3 +54,14 @@ type Error struct { func (err *Error) Error() string { return err.Message } + +// EOF is a special rune, which is used to indicate an end of file when +// reading a character from the input. +// It can be treated as a rune when writing parsing rules, so a valid way to +// say 'I now expect the end of the file' is using something like: +// if (p.On(c.Rune(EOF)).Skip()) { ... } +const EOF rune = -1 + +// INVALID is a special rune, which is used to indicate an invalid UTF8 +// rune on the input. +const INVALID rune = utf8.RuneError diff --git a/parser/helpers_test.go b/parser/helpers_test.go index 2f23a1b..fa07691 100644 --- a/parser/helpers_test.go +++ b/parser/helpers_test.go @@ -6,7 +6,7 @@ import ( "testing" "github.com/mmakaay/toml/parsekit" - lexer "github.com/mmakaay/toml/parser" + "github.com/mmakaay/toml/parser" ) type statesT struct { @@ -23,7 +23,7 @@ func runStatesTs(t *testing.T, tests []statesT) { } func runStatesT(t *testing.T, c statesT) { - l, err := lexer.NewParser(c.in).ToArray() + l, err := parser.NewParser(c.in).ToArray() if err == nil && c.err != "" { t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err) } @@ -36,12 +36,12 @@ func runStatesT(t *testing.T, c statesT) { switch expected := c.out.(type) { case []string: if len(expected) != len(l) { - t.Errorf("[%s] Unexpected number of lexer items:\nexpected: %d\nactual: %d\n", c.name, len(expected), len(l)) + t.Errorf("[%s] Unexpected number of parser items:\nexpected: %d\nactual: %d\n", c.name, 
len(expected), len(l)) } for i, e := range expected { v := ParserItemToString(l[i]) if v != e { - t.Errorf("[%s] Unexpected lexer item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v) + t.Errorf("[%s] Unexpected parser item at index %d:\nexpected: %s\nactual: %s\n", c.name, i, e, v) } } case string: @@ -51,7 +51,7 @@ func runStatesT(t *testing.T, c statesT) { } actual := strings.Join(a, "") if actual != expected { - t.Errorf("[%s] Unexpected lexer output:\nexpected: %s\nactual: %s\n", c.name, expected, actual) + t.Errorf("[%s] Unexpected parser output:\nexpected: %s\nactual: %s\n", c.name, expected, actual) } } } @@ -59,15 +59,15 @@ func runStatesT(t *testing.T, c statesT) { // ParserItemToString returns a string representation of the parsekit.Item. func ParserItemToString(i parsekit.Item) string { switch i.Type { - case lexer.ItemComment: + case parser.ItemComment: return fmt.Sprintf("#(%s)", i.Value) - case lexer.ItemKey: + case parser.ItemKey: return fmt.Sprintf("[%s]", i.Value) - case lexer.ItemString: + case parser.ItemString: return fmt.Sprintf("STR(%s)", i.Value) - case lexer.ItemKeyDot: + case parser.ItemKeyDot: return "." - case lexer.ItemAssignment: + case parser.ItemAssignment: return "=" default: panic(fmt.Sprintf("No string representation available for parsekit.Item id %d", i.Type)) diff --git a/parser/parser.go b/parser/parser.go index 9f1e563..ca4d48a 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -11,40 +11,28 @@ const ( ItemString // A value of type string ) -const ( - whitespace string = " \t" - carriageReturn string = "\r" - newline string = "\n" - hash string = "#" - equal string = "=" - lower string = "abcdefghijklmnopqrstuvwxyz" - upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - digits string = "0123456789" - hex string = digits + "abcdefABCDEF" - dot string = "." 
- underscore string = "_" - dash string = "-" - singleQuote string = "'" - doubleQuote string = "\"" - backslash string = "\\" - quoteChars string = singleQuote + doubleQuote - bareKeyChars string = lower + upper + digits + underscore + dash - startOfKey string = bareKeyChars + quoteChars - validEscapeChars string = `btnfr"\` - mustBeEscaped string = "" + - "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + - "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + - "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + - "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + - "\u007F" -) - var ( - keySeparatorDot = []interface{}{whitespace, dot, whitespace} - doubleQuote3 = []interface{}{doubleQuote, doubleQuote, doubleQuote} - hex4 = []interface{}{hex, hex, hex, hex} - shortUtf8Match = []interface{}{backslash, 'u', hex4} - longUtf8Match = []interface{}{backslash, 'U', hex4, hex4} + c = parsekit.C + space = c.Rune(' ') + tab = c.Rune('\t') + carriageReturn = c.Rune('\r') + lineFeed = c.Rune('\n') + hash = c.Rune('#') + underscore = c.Rune('_') + dash = c.Rune('-') + equal = c.Rune('=') + dot = c.Rune('.') + singleQuote = c.Rune('\'') + doubleQuote = c.Rune('"') + any = c.Any() + anyQuote = c.AnyOf(singleQuote, doubleQuote) + backslash = c.Rune('\\') + lower = c.RuneRange('a', 'z') + upper = c.RuneRange('A', 'Z') + digit = c.RuneRange('0', '9') + whitespace = c.OneOrMore(c.AnyOf(space, tab)) + whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed)) + optionalWhitespace = c.Optional(whitespace) ) // NewParser creates a new parser, using the provided input string diff --git a/parser/parser_test.go b/parser/parser_test.go index c20471d..9561a8a 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -6,6 +6,10 @@ import ( "github.com/mmakaay/toml/parser" ) +func TestEmptyInput(t *testing.T) { + runStatesT(t, statesT{"empty string", "", "", ""}) +} + func TestErrorsIncludeLineAndRowPosition(t *testing.T) { _, err := parser.NewParser("# 
12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() t.Logf("Got error: %s", err.Error()) @@ -17,18 +21,13 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) { } } -func TestEmptyInput(t *testing.T) { - runStatesT(t, statesT{"empty string", "", "", ""}) -} - func TestInvalidUtf8Data(t *testing.T) { runStatesTs(t, []statesT{ - {"inside comment", "# \xbc", "", "invalid UTF8 character"}, - {"bare key 1", "\xbc", "", "invalid UTF8 character"}, - {"bare key 2", "key\xbc", "[key]", "invalid UTF8 character"}, - {"assignment", "key \xbc", "[key]", "invalid UTF8 character"}, - {"start of value", "key=\xbc", "[key]=", "invalid UTF8 character"}, - {"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character"}, + {"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected comment contents)"}, + {"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"}, + {"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"}, + {"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"}, + {"basic string value", "a=\"\xbc\"", "[a]=", "invalid UTF8 character in input (expected string contents)"}, }) } diff --git a/parser/syn_comments.go b/parser/syn_comments.go index 0f00be1..9052987 100644 --- a/parser/syn_comments.go +++ b/parser/syn_comments.go @@ -6,7 +6,7 @@ import ( // A '#' hash symbol marks the rest of the line as a comment. 
func startComment(p *parsekit.P) { - p.SkipConsecutive(hash) + p.On(c.OneOrMore(hash)).Skip() p.RouteTo(commentContents) } @@ -16,8 +16,9 @@ func commentContents(p *parsekit.P) { case p.AtEndOfLine(): p.EmitLiteralTrim(ItemComment) p.RouteReturn() - default: - p.AcceptAny() + case p.On(any).Accept(): p.Repeat() + default: + p.UnexpectedInput("comment contents") } } diff --git a/parser/syn_key.go b/parser/syn_key.go deleted file mode 100644 index 4a3c363..0000000 --- a/parser/syn_key.go +++ /dev/null @@ -1,65 +0,0 @@ -package parser - -import "github.com/mmakaay/toml/parsekit" - -// The primary building block of a TOML document is the key/value pair. -func startKeyValuePair(p *parsekit.P) { - switch { - case p.On(whitespace + carriageReturn + newline).Skip(): - p.Repeat() - case p.On(hash).Stay(): - p.RouteTo(startComment).ThenReturnHere() - case p.On(startOfKey).RouteTo(startKey): - default: - p.RouteTo(endOfFile) - } -} - -// A key may be either bare, quoted or dotted. -func startKey(p *parsekit.P) { - switch { - case p.On(bareKeyChars).RouteTo(startBareKey): - default: - p.UnexpectedInput("a valid key name") - } -} - -// Bare keys may only contain ASCII letters, ASCII digits, -// underscores, and dashes (A-Za-z0-9_-). Note that bare -// keys are allowed to be composed of only ASCII digits, -// e.g. 1234, but are always interpreted as strings. -func startBareKey(p *parsekit.P) { - p.AcceptConsecutive(bareKeyChars) // TODO make a plan for adding this to After() - p.EmitLiteral(ItemKey) - p.RouteTo(endOfKeyOrDot) -} - -// Dotted keys are a sequence of bare or quoted keys joined with a dot. -// This allows for grouping similar properties together: -func endOfKeyOrDot(p *parsekit.P) { - // Whitespace around dot-separated parts is ignored, however, - // best practice is to not use any extraneous whitespace. 
- p.SkipConsecutive(whitespace) - if p.On(dot).Accept() { - p.SkipConsecutive(whitespace) - p.EmitLiteral(ItemKeyDot) - p.RouteTo(startKey) - } else { - p.RouteTo(startKeyAssignment) - } -} - -// Keys are on the left of the equals sign and values are on the right. -// Whitespace is ignored around key names and values. The key, equals -// sign, and value must be on the same line (though some values can -// be broken over multiple lines). -func startKeyAssignment(p *parsekit.P) { - p.SkipConsecutive(whitespace) - if p.On(equal).Accept() { - p.EmitLiteral(ItemAssignment) - p.SkipConsecutive(whitespace) - p.RouteTo(startValue) - } else { - p.UnexpectedInput("a value assignment") - } -} diff --git a/parser/syn_keyvaluepair.go b/parser/syn_keyvaluepair.go new file mode 100644 index 0000000..b91d01f --- /dev/null +++ b/parser/syn_keyvaluepair.go @@ -0,0 +1,88 @@ +package parser + +import "github.com/mmakaay/toml/parsekit" + +// The primary building block of a TOML document is the key/value pair. + +var ( + // Keys are on the left of the equals sign and values are on the right. + // Whitespace is ignored around key names and values. The key, equals + // sign, and value must be on the same line (though some values can be + // broken over multiple lines). + keyAssignment = c.Sequence(optionalWhitespace, equal, optionalWhitespace) + + // A key may be either bare, quoted or dotted. + // Bare keys may only contain ASCII letters, ASCII digits, + // underscores, and dashes (A-Za-z0-9_-). Note that bare + // keys are allowed to be composed of only ASCII digits, + // e.g. 1234, but are always interpreted as strings. + bareKeyRune = c.AnyOf(lower, upper, digit, underscore, dash) + bareKey = c.OneOrMore(bareKeyRune) + + // Quoted keys follow the exact same rules as either basic + // strings or literal strings and allow you to use a much broader + // set of key names. Best practice is to use bare keys except + // when absolutely necessary. 
+ // A bare key must be non-empty, but an empty quoted key is + // allowed (though discouraged). + startOfKey = c.AnyOf(bareKeyRune, anyQuote) + + // Dotted keys are a sequence of bare or quoted keys joined with a dot. + // This allows for grouping similar properties together. + // Whitespace around dot-separated parts is ignored, however, best + // practice is to not use any extraneous whitespace. + keySeparatordDot = c.Sequence(optionalWhitespace, dot, optionalWhitespace) +) + +func startKeyValuePair(p *parsekit.P) { + p.On(whitespaceOrNewlines).Skip() + switch { + case p.On(hash).Stay(): + p.RouteTo(startComment).ThenReturnHere() + case p.On(startOfKey).RouteTo(startKey): + default: + p.RouteTo(endOfFile) // TODO Make end of file a Matcher, so this can be simpler. + } +} + +func startKey(p *parsekit.P) { + switch { + case p.On(bareKeyRune).RouteTo(startBareKey): + default: + p.UnexpectedInput("a valid key name") + } +} + +func startBareKey(p *parsekit.P) { + p.On(bareKey).Accept() + p.EmitLiteral(ItemKey) + p.RouteTo(endOfKeyOrDot) +} + +func endOfKeyOrDot(p *parsekit.P) { + if p.On(keySeparatordDot).Skip() { + p.Emit(ItemKeyDot, ".") + p.RouteTo(startKey) + } else { + p.RouteTo(startKeyAssignment) + } +} + +func startKeyAssignment(p *parsekit.P) { + if p.On(keyAssignment).Skip() { + p.Emit(ItemAssignment, "=") + p.RouteTo(startValue) + } else { + p.UnexpectedInput("a value assignment") + } +} + +// Values must be of the following types: String, Integer, Float, Boolean, +// Datetime, Array, or Inline Table. Unspecified values are invalid. 
+func startValue(p *parsekit.P) { + switch { + case p.On(anyQuote).RouteTo(startString): + default: + p.UnexpectedInput("a value") + } +} diff --git a/parser/syn_key_test.go b/parser/syn_keyvaluepair_test.go similarity index 66% rename from parser/syn_key_test.go rename to parser/syn_keyvaluepair_test.go index e3a7702..4dc4ad0 100644 --- a/parser/syn_key_test.go +++ b/parser/syn_keyvaluepair_test.go @@ -5,9 +5,9 @@ import ( ) func TestKeyWithoutAssignment(t *testing.T) { - err := "unexpected end of file" + err := "unexpected end of file (expected a value assignment)" runStatesTs(t, []statesT{ - {"bare with whitespace", " a ", "[a]", err}, + {"bare with whitespace", " a ", "[a]", "unexpected character ' ' (expected a value assignment)"}, {"bare lower", "abcdefghijklmnopqrstuvwxyz", "[abcdefghijklmnopqrstuvwxyz]", err}, {"bare upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "[ABCDEFGHIJKLMNOPQRSTUVWXYZ]", err}, {"bare numbers", "0123456789", "[0123456789]", err}, @@ -18,15 +18,14 @@ func TestKeyWithoutAssignment(t *testing.T) { } func TestDottedKey(t *testing.T) { - err := "unexpected end of file" runStatesTs(t, []statesT{ - {"bare dotted", "a._.c", "[a].[_].[c]", err}, - {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", err}, + {"bare dotted", "a._.c", "[a].[_].[c]", "unexpected end of file (expected a value assignment)"}, + {"bare dotted with whitespace", " a .\t\t b\t ", "[a].[b]", `unexpected character '\t' (expected a value assignment)`}, }) } func TestKeyWithAssignmentButNoValue(t *testing.T) { - err := "unexpected end of file" + err := "unexpected end of file (expected a value)" runStatesTs(t, []statesT{ {"bare", "a=", "[a]=", err}, {"double equal sign", "a==", "[a]=", "unexpected character '=' (expected a value)"}, diff --git a/parser/syn_strings.go b/parser/syn_strings.go index 90775b9..a0f33ab 100644 --- a/parser/syn_strings.go +++ b/parser/syn_strings.go @@ -2,10 +2,36 @@ package parser import "github.com/mmakaay/toml/parsekit" -// There are four ways to 
express strings: basic, multi-line basic, literal, -// and multi-line literal. All strings must contain only valid UTF-8 characters. -// * Multi-line basic strings are surrounded by three quotation marks on each side. -// * Basic strings are surrounded by quotation marks. +var ( + // There are four ways to express strings: basic, multi-line basic, literal, + // and multi-line literal. All strings must contain only valid UTF-8 characters. + // * Multi-line basic strings are surrounded by three quotation marks on each side. + // * Basic strings are surrounded by quotation marks. + doubleQuote3 = c.Repeat(3, doubleQuote) + + // Any Unicode character may be used except those that must be escaped: + // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). + charThatMustBeEscaped = c.AnyOf(c.RuneRange('\u0000', '\u001F'), c.Rune('\u007F')) + + // For convenience, some popular characters have a compact escape sequence. + // + // \b - backspace (U+0008) + // \t - tab (U+0009) + // \n - linefeed (U+000A) + // \f - form feed (U+000C) + // \r - carriage return (U+000D) + // \" - quote (U+0022) + // \\ - backslash (U+005C) + // \uXXXX - unicode (U+XXXX) + // \UXXXXXXXX - unicode (U+XXXXXXXX) + validEscapeChar = c.AnyOf(c.Runes('b', 't', 'n', 'f', 'r'), doubleQuote, backslash) + shortEscape = c.Sequence(backslash, validEscapeChar) + hex = c.AnyOf(digit, c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) + shortUtf8Escape = c.Sequence(backslash, c.Rune('u'), c.Repeat(4, hex)) + longUtf8Escape = c.Sequence(backslash, c.Rune('U'), c.Repeat(8, hex)) + validEscape = c.AnyOf(shortEscape, shortUtf8Escape, longUtf8Escape) +) + func startString(p *parsekit.P) { switch { case p.On(doubleQuote3).RouteTo(startMultiLineBasicString): @@ -15,36 +41,21 @@ func startString(p *parsekit.P) { } } -// For convenience, some popular characters have a compact escape sequence. 
-// -// \b - backspace (U+0008) -// \t - tab (U+0009) -// \n - linefeed (U+000A) -// \f - form feed (U+000C) -// \r - carriage return (U+000D) -// \" - quote (U+0022) -// \\ - backslash (U+005C) -// \uXXXX - unicode (U+XXXX) -// \UXXXXXXXX - unicode (U+XXXXXXXX) -// -// Any Unicode character may be used except those that must be escaped: -// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). func parseBasicString(p *parsekit.P) { switch { - case p.AtEndOfFile(): + case p.On(parsekit.EOF).Stay(): p.UnexpectedEndOfFile("basic string token") - case p.On(backslash, validEscapeChars).Accept() || - p.On(shortUtf8Match).Accept() || - p.On(longUtf8Match).Accept(): + case p.On(validEscape).Accept(): p.Repeat() - case p.On(mustBeEscaped).Stay(): - r, _, _ := p.Match(mustBeEscaped) + case p.On(charThatMustBeEscaped).Stay(): + r, _, _ := p.Match(charThatMustBeEscaped) p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) case p.On(backslash).Stay() || p.On(doubleQuote).Stay(): p.RouteReturn() - default: - p.AcceptAny() + case p.On(any).Accept(): p.Repeat() + default: + p.UnexpectedInput("string contents") } } @@ -69,7 +80,7 @@ func basicStringSpecifics(p *parsekit.P) { case p.On(backslash).Stay(): p.EmitError("Invalid escape sequence") default: - p.RouteTo(startBasicString) + panic("String parsing should not have ended up here") } } diff --git a/parser/syn_strings_test.go b/parser/syn_strings_test.go index 0598550..562386a 100644 --- a/parser/syn_strings_test.go +++ b/parser/syn_strings_test.go @@ -33,8 +33,8 @@ func TestEmptyBasicString(t *testing.T) { {"with comment", `a="" #cool`, "[a]=STR()#(cool)", ""}, {"with whitespaces", ` a = "" `, "[a]=STR()", ""}, {"dotted", ` a.b = "" `, "[a].[b]=STR()", ""}, - {"multiple same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""}, - {"multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""}, + {"multiple on same line", `a=""b=""`, "[a]=STR()[b]=STR()", ""}, + {"multiple on 
multiple lines", "a=\"\" \n b = \"\" ", "[a]=STR()[b]=STR()", ""}, }) } diff --git a/parser/syn_value.go b/parser/syn_value.go deleted file mode 100644 index 6692ad5..0000000 --- a/parser/syn_value.go +++ /dev/null @@ -1,14 +0,0 @@ -package parser - -import "github.com/mmakaay/toml/parsekit" - -// Values must be of the following types: String, Integer, Float, Boolean, -// Datetime, Array, or Inline Table. Unspecified values are invalid. -func startValue(p *parsekit.P) { - p.SkipConsecutive(whitespace) - if p.Upcoming(quoteChars) { - p.RouteTo(startString) - } else { - p.UnexpectedInput("a value") - } -}