From eca2adc9fa565d96cac97d799e418e3cb89139c6 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Tue, 21 May 2019 14:49:44 +0000 Subject: [PATCH] Initial import of the parsekit module. --- README.md | 3 +- go.mod | 3 + go.sum | 1 + parsekit.go | 129 +++++++++++++ parser_combinators.go | 340 +++++++++++++++++++++++++++++++++++ parser_combinators_test.go | 360 +++++++++++++++++++++++++++++++++++++ peek.go | 43 +++++ statehandler_emit.go | 107 +++++++++++ statehandler_expects.go | 15 ++ statehandler_on.go | 58 ++++++ statehandler_on_match.go | 64 +++++++ statehandler_on_route.go | 59 ++++++ statehandler_routing.go | 42 +++++ stringbuf.go | 62 +++++++ stringbuf_test.go | 88 +++++++++ 15 files changed, 1373 insertions(+), 1 deletion(-) create mode 100644 go.mod create mode 100644 go.sum create mode 100644 parsekit.go create mode 100644 parser_combinators.go create mode 100644 parser_combinators_test.go create mode 100644 peek.go create mode 100644 statehandler_emit.go create mode 100644 statehandler_expects.go create mode 100644 statehandler_on.go create mode 100644 statehandler_on_match.go create mode 100644 statehandler_on_route.go create mode 100644 statehandler_routing.go create mode 100644 stringbuf.go create mode 100644 stringbuf_test.go diff --git a/README.md b/README.md index 4e10b19..f36ecfc 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ # go-parsekit -A toolkit that facilitates writing text parsers, based on a flexible combination of parser/combinator technology and a parser state machine. \ No newline at end of file +A toolkit that facilitates writing text parsers, based on a flexible +combination of parser/combinator technology and a parser state machine. 
diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f57b23c --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/mmakaay/go-parsekit + +go 1.12 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..85502b6 --- /dev/null +++ b/go.sum @@ -0,0 +1 @@ +github.com/mmakaay/toml v0.3.1 h1:2uKRPvA/smKM8YuYGxWnW4KximMkWOMfunJOXgM5Zos= diff --git a/parsekit.go b/parsekit.go new file mode 100644 index 0000000..b266cf3 --- /dev/null +++ b/parsekit.go @@ -0,0 +1,129 @@ +package parsekit + +import ( + "fmt" + "reflect" + "runtime" +) + +// P holds the internal state of the parser. +type P struct { + state StateHandler // the function that handles the current state + nextState StateHandler // the function that will handle the next state + routeStack []StateHandler // route stack, for handling nested parsing + input string // the scanned input + len int // the total length of the input in bytes + pos int // current byte scanning position in the input + newline bool // keep track of when we have scanned a newline + cursorLine int // current row number in the input + cursorColumn int // current column position in the input + expecting string // a description of what the current state expects to find + buffer stringBuffer // an efficient buffer, used to build string values + LastMatch string // a string representation of the last matched input data + items chan Item // channel of resulting Parser items + item Item // the current item as reached by Next() and retrieved by Get() + err *Error // an error when lexing failed, retrieved by Error() +} + +// StateHandler defines the type of function that can be used to +// handle a parser state. +type StateHandler func(*P) + +// New takes an input string and a start state, +// and initializes the parser for it. 
+func New(input string, start StateHandler) *P { + return &P{ + input: input, + len: len(input), + cursorLine: 1, + cursorColumn: 1, + nextState: start, + items: make(chan Item, 2), + } +} + +// Next retrieves the next parsed item. +// When a valid item was found, then the boolean return parameter will be true. +// On error or when successfully reaching the end of the input, false is returned. +// When an error occurred, it will be set in the error return value, nil otherwise. +func (p *P) Next() (Item, *Error, bool) { + for { + select { + case i := <-p.items: + return p.makeReturnValues(i) + default: + p.runStatusHandler() + } + } +} + +// runStatusHandler moves the parser, which is basically a state machine, +// to its next status. It does so by invoking a function of the +// type StateHandler. This function represents the current status. +func (p *P) runStatusHandler() { + if state, ok := p.getNextStateHandler(); ok { + p.invokeNextStatusHandler(state) + } +} + +// getNextStateHandler determines the next StateHandler to invoke in order +// to move the parsing state machine one step further. +// +// When implementing a parser, the StateHandler functions must provide +// a routing decision in every invocation. A routing decision is one +// of the following: +// +// * A route is specified explicitly, which means that the next StateHandler +// function to invoke is registered during the StateHandler function +// invocation. For example: p.RouteTo(nextStatus) +// +// * A route is specified implicitly, which means that a previous StateHandler +// invocation has registered the followup route for the current state. +// For example: p.RouteTo(nextStatus).ThenTo(otherStatus) +// In this example, the nextStatus StateHandler will not have to specify +// a route explicitly, but otherStatus will be used implicitly after +// the nextStatus function has returned. +// +// * An expectation is registered by the StateHandler.
+// For example: p.Expects("a cool thing") +// When the StateHandler returns without having specified a route, this +// expectation is used to generate an "unexpected input" error message. +// +// When no routing decision is provided by a StateHandler, then this is +// considered a bug in the state handler, and the parser will panic. +func (p *P) getNextStateHandler() (StateHandler, bool) { + switch { + case p.nextState != nil: + return p.nextState, true + case len(p.routeStack) > 0: + return p.popRoute(), true + case p.expecting != "": + p.UnexpectedInput() + return nil, false + default: + name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name() + panic(fmt.Sprintf("StateHandler %s did not provide a routing decision", name)) + } +} + +// invokeNextStatusHandler moves the parser state to the provided state +// and invokes the StateHandler function. +func (p *P) invokeNextStatusHandler(state StateHandler) { + p.state = state + p.nextState = nil + p.expecting = "" + p.state(p) +} + +func (p *P) makeReturnValues(i Item) (Item, *Error, bool) { + switch { + case i.Type == ItemEOF: + return i, nil, false + case i.Type == ItemError: + p.err = &Error{i.Value, p.cursorLine, p.cursorColumn} + return i, p.err, false + default: + p.item = i + return i, nil, true + } +} diff --git a/parser_combinators.go b/parser_combinators.go new file mode 100644 index 0000000..a4e02b3 --- /dev/null +++ b/parser_combinators.go @@ -0,0 +1,340 @@ +package parsekit + +import ( + "unicode" + "unicode/utf8" +) + +// Not in need of it myself, but nice to have I guess: +// - LookAhead + +// MatchDialog is used by Matcher implementations as a means +// to retrieve data to match against and to report back +// successful matches.
+type MatchDialog struct { + p *P + runes []rune + widths []int + offset int + curRune rune + curWidth int + parent *MatchDialog +} + +// Fork splits off a child MatchDialog, containing the same +// offset as the parent MatchDialog, but with all other data +// in a new state. +// By forking, a Matcher implementation can freely work with +// a MatchDialog, without affecting the parent MatchDialog. +// When the Matcher decides that a match was found, it can +// use the Merge() method on the child to merge the child's +// matching data into the parent MatchDialog. +func (m *MatchDialog) Fork() *MatchDialog { + child := &MatchDialog{ + p: m.p, + offset: m.offset, + parent: m, + } + return child +} + +// Merge merges the data for a forked child MatchDialog back +// into its parent: +// * the runes that are accumulated in the child are added +// to the parent's runes +// * the parent's offset is set to the child's offset +// After a Merge, the child MatchDialog is reset so it can +// immediately be reused for performing another match. +func (m *MatchDialog) Merge() bool { + if m.parent == nil { + panic("Cannot call Merge a a non-forked MatchDialog") + } + m.parent.runes = append(m.parent.runes, m.runes...) + m.parent.widths = append(m.parent.widths, m.widths...) + m.parent.offset = m.offset + m.Clear() + return true +} + +// NextRune can be called by a Matcher on a MatchDialog in order +// to receive the next rune from the input. +// The rune is automatically added to the MatchDialog's runes. +// Returns the rune and a boolean. The boolean will be false in +// case an invalid UTF8 rune or the end of the file was encountered.
+func (m *MatchDialog) NextRune() (rune, bool) { + if m.curRune == utf8.RuneError { + panic("Matcher must not call NextRune() after it returned false") + } + r, w, ok := m.p.peek(m.offset) + m.offset += w + m.curRune = r + m.curWidth = w + m.runes = append(m.runes, r) + m.widths = append(m.widths, w) + return r, ok +} + +// Clear empties out the accumulated runes that are stored +// in the MatchDialog. The offset is kept as-is. +func (m *MatchDialog) Clear() { + m.runes = []rune{} + m.widths = []int{} +} + +// Matcher is the interface that must be implemented to provide +// a matching strategy for the On() method. +// A MatchDialog is provided as input. This implements a +// specific set of methods that a Matcher needs to retrieve data +// from the parser and to report back results. +type Matcher interface { + Match(*MatchDialog) bool +} + +type MatcherConstructors struct { + EndOfFile func() MatchEndOfFile + AnyRune func() MatchAny + Rune func(rune) MatchRune + RuneRange func(rune, rune) MatchRuneRange + Runes func(...rune) MatchAnyOf + String func(string) MatchSequence + StringNoCase func(string) MatchSequence + AnyOf func(...Matcher) MatchAnyOf + Not func(Matcher) MatchNot + Optional func(Matcher) MatchOptional + Sequence func(...Matcher) MatchSequence + Repeat func(int, Matcher) MatchRepeat + Min func(int, Matcher) MatchRepeat + Max func(int, Matcher) MatchRepeat + Bounded func(int, int, Matcher) MatchRepeat + ZeroOrMore func(Matcher) MatchRepeat + OneOrMore func(Matcher) MatchRepeat + Separated func(Matcher, Matcher) MatchSeparated + Drop func(Matcher) MatchDrop +} + +// C provides access to a wide range of parser/combinator +// constructors that can be used to build matching expressions.
+// When using C in your own parser, then it is advised to create +// an alias in your own package for easy reference: +// var c = parsekit.C +var C = MatcherConstructors{ + EndOfFile: func() MatchEndOfFile { + return MatchEndOfFile{} + }, + AnyRune: func() MatchAny { + return MatchAny{} + }, + Rune: func(rune rune) MatchRune { + return MatchRune(rune) + }, + RuneRange: func(start rune, end rune) MatchRuneRange { + return MatchRuneRange{start, end} + }, + Runes: func(runes ...rune) MatchAnyOf { + m := make([]Matcher, len(runes)) + for i, r := range runes { + m[i] = MatchRune(r) + } + return MatchAnyOf{m} + }, + String: func(s string) MatchSequence { + var m = []Matcher{} + for _, r := range s { + m = append(m, MatchRune(r)) + } + return MatchSequence{m} + }, + StringNoCase: func(s string) MatchSequence { + var m = []Matcher{} + for _, r := range s { + u := MatchRune(unicode.ToUpper(r)) + l := MatchRune(unicode.ToLower(r)) + m = append(m, MatchAnyOf{[]Matcher{u, l}}) + } + return MatchSequence{m} + }, + Optional: func(Matcher Matcher) MatchOptional { + return MatchOptional{Matcher} + }, + Not: func(Matcher Matcher) MatchNot { + return MatchNot{Matcher} + }, + AnyOf: func(Matchers ...Matcher) MatchAnyOf { + return MatchAnyOf{Matchers} + }, + Sequence: func(Matchers ...Matcher) MatchSequence { + return MatchSequence{Matchers} + }, + Repeat: func(count int, Matcher Matcher) MatchRepeat { + return MatchRepeat{count, count, Matcher} + }, + Min: func(min int, Matcher Matcher) MatchRepeat { + return MatchRepeat{min, -1, Matcher} + }, + Max: func(max int, Matcher Matcher) MatchRepeat { + return MatchRepeat{-1, max, Matcher} + }, + Bounded: func(min int, max int, Matcher Matcher) MatchRepeat { + return MatchRepeat{min, max, Matcher} + }, + OneOrMore: func(Matcher Matcher) MatchRepeat { + return MatchRepeat{1, -1, Matcher} + }, + ZeroOrMore: func(Matcher Matcher) MatchRepeat { + return MatchRepeat{0, -1, Matcher} + }, + Separated: func(separator Matcher, Matcher Matcher) 
MatchSeparated { + return MatchSeparated{separator, Matcher} + }, + Drop: func(Matcher Matcher) MatchDrop { + return MatchDrop{Matcher} + }, +} + +type MatchEndOfFile struct{} + +func (c MatchEndOfFile) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return !ok && r == EOF +} + +type MatchAny struct{} + +func (c MatchAny) Match(m *MatchDialog) bool { + _, ok := m.NextRune() + return ok +} + +type MatchNot struct { + Matcher Matcher +} + +func (c MatchNot) Match(m *MatchDialog) bool { + child := m.Fork() + if !c.Matcher.Match(child) { + child.Merge() + return true + } + return false +} + +type MatchOptional struct { + Matcher Matcher +} + +func (c MatchOptional) Match(m *MatchDialog) bool { + child := m.Fork() + if c.Matcher.Match(child) { + child.Merge() + } + return true +} + +type MatchRune rune + +func (c MatchRune) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return ok && r == rune(c) +} + +type MatchRuneRange struct { + start rune + end rune +} + +func (c MatchRuneRange) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return ok && r >= c.start && r <= c.end +} + +type MatchAnyOf struct { + Matcher []Matcher +} + +func (c MatchAnyOf) Match(m *MatchDialog) bool { + for _, Matcher := range c.Matcher { + child := m.Fork() + if Matcher.Match(child) { + return child.Merge() + } + } + return false +} + +type MatchRepeat struct { + min int + max int + Matcher Matcher +} + +func (c MatchRepeat) Match(m *MatchDialog) bool { + child := m.Fork() + if c.min >= 0 && c.max >= 0 && c.min > c.max { + panic("MatchRepeat definition error: max must not be < min") + } + total := 0 + // Specified min: check for the minimal required amount of matches. + for total < c.min { + total++ + if !c.Matcher.Match(child) { + return false + } + } + // No specified max: include the rest of the available matches. 
+ if c.max < 0 { + child.Merge() + for c.Matcher.Match(child) { + child.Merge() + } + return true + } + // Specified max: include the rest of the availble matches, up to the max. + child.Merge() + for total < c.max { + total++ + if !c.Matcher.Match(child) { + break + } + child.Merge() + } + return true +} + +type MatchSequence struct { + Matchers []Matcher +} + +func (c MatchSequence) Match(m *MatchDialog) bool { + child := m.Fork() + for _, Matcher := range c.Matchers { + if !Matcher.Match(child) { + return false + } + } + child.Merge() + return true +} + +type MatchSeparated struct { + separator Matcher + Matcher Matcher +} + +func (c MatchSeparated) Match(m *MatchDialog) bool { + seq := C.Sequence(c.Matcher, C.ZeroOrMore(C.Sequence(c.separator, c.Matcher))) + return seq.Match(m) +} + +type MatchDrop struct { + Matcher Matcher +} + +func (c MatchDrop) Match(m *MatchDialog) bool { + child := m.Fork() + if c.Matcher.Match(child) { + child.Clear() + child.Merge() + return true + } + return false +} diff --git a/parser_combinators_test.go b/parser_combinators_test.go new file mode 100644 index 0000000..4492de3 --- /dev/null +++ b/parser_combinators_test.go @@ -0,0 +1,360 @@ +package parsekit_test + +import ( + "testing" + + p "github.com/mmakaay/go-parsekit" +) + +var c = p.C + +const TestItem p.ItemType = 1 + +func newParser(input string, Matcher p.Matcher) *p.P { + stateFn := func(p *p.P) { + p.Expects("MATCH") + if p.On(Matcher).Accept().End() { + p.EmitLiteral(TestItem) + p.RouteRepeat() + } + } + return p.New(input, stateFn) +} + +func TestMatchAnyRune(t *testing.T) { + p := newParser("o", c.AnyRune()) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "o" { + t.Errorf("Parser item value is %q instead of expected \"o\"", r.Value) + } +} + +func TestMatchAnyRune_AtEndOfFile(t *testing.T) { + p := newParser("", c.AnyRune()) + _, err, ok := 
p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "unexpected end of file (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchAnyRune_AtInvalidUtf8Rune(t *testing.T) { + p := newParser("\xcd", c.AnyRune()) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "invalid UTF8 character in input (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchRune(t *testing.T) { + p := newParser("xxx", c.Rune('x')) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "x" { + t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) + } +} + +func TestMatchRune_OnMismatch(t *testing.T) { + p := newParser("x ", c.Rune(' ')) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing did not fail unexpectedly") + } + expected := "unexpected character 'x' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchRuneRange(t *testing.T) { + m := c.RuneRange('b', 'y') + s := "mnopqrstuvwxybcdefghijkl" + p := newParser(s, m) + for i := 0; i < len(s); i++ { + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if s[i] != r.Value[0] { + t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", i+1, s[i], r.Value[0]) + } + } + if _, _, ok := newParser("a", m).Next(); ok { + t.Fatalf("Unexpected parse success for input 'a'") + } + if _, _, ok := newParser("z", m).Next(); ok { + t.Fatalf("Unexpected parse success for input 'z'") + } +} + +func TestMatchString(t *testing.T) { + p := newParser("Hello, world!", 
c.String("Hello")) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "Hello" { + t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value) + } +} + +func TestMatchStringNoCase(t *testing.T) { + p := newParser("HellÖ, world!", c.StringNoCase("hellö")) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "HellÖ" { + t.Errorf("Parser item value is %q instead of expected \"HellÖ\"", r.Value) + } +} + +func TestMatchRunes(t *testing.T) { + m := c.Runes('+', '-', '*', '/') + s := "-+/*+++" + p := newParser(s, m) + for i := 0; i < len(s); i++ { + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if s[i] != r.Value[0] { + t.Fatalf("Unexpected parse output on cycle %d:\nexpected: %q\nactual: %q\n", i+1, s[i], r.Value[0]) + } + } + if _, _, ok := newParser("^", m).Next(); ok { + t.Fatalf("Unexpected parse success for input '^'") + } + if _, _, ok := newParser("x", m).Next(); ok { + t.Fatalf("Unexpected parse success for input 'x'") + } +} + +func TestMatchNot(t *testing.T) { + p := newParser("aabc", c.Not(c.Rune('b'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Value != "a" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } +} + +func TestMatchNot_Mismatch(t *testing.T) { + p := newParser("aabc", c.Not(c.Rune('a'))) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "unexpected character 'a' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchAnyOf(t *testing.T) { + p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b'))) + r, err, ok := p.Next() + if !ok { 
+ t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "a" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } + + r, err, ok = p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "b" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } +} + +func TestMatchRepeat(t *testing.T) { + p := newParser("xxxxyyyy", c.Repeat(4, c.Rune('x'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "xxxx" { + t.Errorf("Parser item value is %q instead of expected \"xxxx\"", r.Value) + } +} + +func TestMatchRepeat_Min(t *testing.T) { + p := newParser("1111112345", c.Min(4, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value != "111111" { + t.Errorf("Parser item value is %q instead of expected \"111111\"", r.Value) + } +} + +func TestMatchRepeat_Max(t *testing.T) { + p := newParser("1111112345", c.Max(4, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value != "1111" { + t.Errorf("Parser item value is %q instead of expected \"1111\"", r.Value) + } +} + +func TestMatchRepeat_Bounded(t *testing.T) { + p := newParser("1111112345", c.Bounded(3, 5, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value != "11111" { + t.Errorf("Parser item value is %q instead of expected \"11111\"", r.Value) + } +} + +func TestMatchRepeat_Mismatch(t *testing.T) { + p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x'))) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing did not fail unexpectedly") + } + expected := "unexpected character 'x' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + +func TestMatchOneOrMore(t *testing.T) { + p := newParser("xxxxxxxxyyyy", c.OneOrMore(c.Rune('x'))) + 
r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "xxxxxxxx" { + t.Errorf("Parser item value is %q instead of expected \"xxxxxxxx\"", r.Value) + } +} + +func TestMatchSequence(t *testing.T) { + p := newParser("10101", c.Sequence(c.Rune('1'), c.Rune('0'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "10" { + t.Errorf("Parser item value is %q instead of expected \"10\"", r.Value) + } +} + +func TestMatchSequence_CombinedWithOneOrMore(t *testing.T) { + p := newParser("101010987", c.OneOrMore(c.Sequence(c.Rune('1'), c.Rune('0')))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "101010" { + t.Errorf("Parser item value is %q instead of expected \"101010\"", r.Value) + } +} + +func TestSequence_WithRepeatedRunes(t *testing.T) { + whitespace := c.Optional(c.OneOrMore(c.Rune(' '))) + equal := c.Rune('=') + assignment := c.Sequence(whitespace, equal, whitespace) + p := newParser(" == 10", assignment) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != " =" { + t.Errorf("Parser item value is %q instead of expected \" =\"", r.Value) + } +} + +func TestMatchOptional(t *testing.T) { + p := newParser("xyz", c.Optional(c.Rune('x'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "x" { + t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) + } + + p = newParser("xyz", c.Optional(c.Rune('y'))) + r, err, ok = p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "" { + t.Errorf("Parser item value is %q instead of expected \"\"", r.Value) + } +} + 
+func TestMatchDrop(t *testing.T) { + dashes := c.OneOrMore(c.Rune('-')) + p := newParser("---X---", c.Sequence(c.Drop(dashes), c.AnyRune(), c.Drop(dashes))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "X" { + t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) + } +} + +func TestMatchSeparated(t *testing.T) { + number := c.Bounded(1, 3, c.RuneRange('0', '9')) + separators := c.Runes('|', ';', ',') + separated_numbers := c.Separated(separators, number) + p := newParser("1,2;3|44,55|66;777,abc", separated_numbers) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != "1,2;3|44,55|66;777" { + t.Errorf("Parser item value is %q instead of expected \"1,2;3|44,55|66;777\"", r.Value) + } +} + +func TestMixAndMatch(t *testing.T) { + hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) + backslash := c.Rune('\\') + x := c.Rune('x') + hexbyte := c.Sequence(backslash, x, c.Repeat(2, hex)) + + p := newParser(`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.Repeat(4, hexbyte)) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) + } + if r.Value != `\x9a\x01\xF0\xfC` { + t.Errorf("Parser item value is %q instead of expected \"%q\"", r.Value, `\x9a\x01\xF0\xfC`) + } +} diff --git a/peek.go b/peek.go new file mode 100644 index 0000000..c45e2d5 --- /dev/null +++ b/peek.go @@ -0,0 +1,43 @@ +package parsekit + +import ( + "unicode/utf8" +) + +// peek returns but does not advance the cursor to the next rune(s) in the input. +// Returns the rune, its width in bytes and a boolean. +// The boolean will be false in case no upcoming rune can be peeked +// (end of data or invalid UTF8 character). 
+func (p *P) peek(offsetInBytes int) (rune, int, bool) { + r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) + return handleRuneError(r, w) +} + +// handleRuneError is used to normalize the rune value in case of errors. +// When an error occurs, then utf8.RuneError will be in the rune. +// This can however indicate one of two situations: +// * w == 0: end of file is reached +// * w == 1: invalid UTF8 character on input (NOTE(review): any w != 0 is treated as invalid, so a correctly encoded U+FFFD with w == 3 is reported as INVALID too) +// This function lets these two cases return respectively the +// package's own EOF or INVALID runes, to make it easy for client +// code to distinguish between these two cases. +func handleRuneError(r rune, w int) (rune, int, bool) { + if r == utf8.RuneError { + if w == 0 { + return EOF, 0, false + } + return INVALID, w, false + } + return r, w, true +} + +// EOF is a special rune, which is used to indicate an end of file when +// reading a character from the input. +// NextRune() returns false along with it, so c.Rune(EOF) never matches. A way +// to say 'I now expect the end of the file' is using something like: +// if p.On(c.EndOfFile()).Skip().End() { ... } +const EOF rune = -1 + +// INVALID is a special rune, which is used to indicate an invalid UTF8 +// rune on the input. +const INVALID rune = utf8.RuneError diff --git a/statehandler_emit.go b/statehandler_emit.go new file mode 100644 index 0000000..646f342 --- /dev/null +++ b/statehandler_emit.go @@ -0,0 +1,107 @@ +package parsekit + +import ( + "fmt" + "strings" +) + +// ItemType represents the type of a parser Item. +type ItemType int + +// ItemEOF is a built-in parser item type that is used for flagging that the +// end of the input was reached. +const ItemEOF ItemType = -1 + +// ItemError is a built-in parser item type that is used for flagging that +// an error has occurred during parsing. +const ItemError ItemType = -2 + +// Item represents an item that can be emitted from the parser.
+type Item struct { + Type ItemType + Value string +} + +// Emit passes a Parser item to the client, including the provided string. Afterwards, the string buffer is reset. +func (p *P) Emit(t ItemType, s string) { + p.items <- Item{t, s} + p.buffer.reset() +} + +// EmitLiteral passes a Parser item to the client, including accumulated +// string buffer data as a literal string. +func (p *P) EmitLiteral(t ItemType) { + p.Emit(t, p.buffer.asLiteralString()) +} + +// EmitLiteralTrim passes a Parser item to the client, including +// accumulated string buffer data as a literal string with whitespace +// trimmed from it. +func (p *P) EmitLiteralTrim(t ItemType) { + p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) +} + +// EmitInterpreted passes a Parser item to the client, including +// accumulated string buffer data as a Go double quoted interpreted string +// (handling escape codes like \n, \t, \uXXXX, etc.) +// This method might return an error, in case there is data in the +// string buffer that is not valid for string interpretation. +func (p *P) EmitInterpreted(t ItemType) error { + s, err := p.buffer.asInterpretedString() + if err != nil { + return err + } + p.Emit(t, s) + return nil +} + +// Error is used as the error type when parsing errors occur. +// The error includes some extra meta information to allow for useful +// error messages to the user. +type Error struct { + Message string + Line int + Column int +} + +func (err *Error) Error() string { + if err == nil { + panic("Error method called on the parser, but no error was set") + } + return err.Message +} + +func (err *Error) ErrorFull() string { + message := err.Error() + return fmt.Sprintf("%s after line %d, column %d", message, err.Line, err.Column) +} + +// EmitError emits a Parser error item to the client. +func (p *P) EmitError(format string, args ...interface{}) { + message := fmt.Sprintf(format, args...)
+ p.Emit(ItemError, message) +} + +// UnexpectedInput is used by a parser implementation to emit an +// error item that tells the client that an unexpected rune was +// encountered in the input. +func (p *P) UnexpectedInput() { + r, _, ok := p.peek(0) + switch { + case ok: + p.EmitError("unexpected character %q%s", r, fmtExpects(p)) + case r == EOF: + p.EmitError("unexpected end of file%s", fmtExpects(p)) + case r == INVALID: + p.EmitError("invalid UTF8 character in input%s", fmtExpects(p)) + default: + panic("Unhandled output from peek()") + } +} + +func fmtExpects(p *P) string { + if p.expecting == "" { + return "" + } + return fmt.Sprintf(" (expected %s)", p.expecting) +} diff --git a/statehandler_expects.go b/statehandler_expects.go new file mode 100644 index 0000000..adc66ae --- /dev/null +++ b/statehandler_expects.go @@ -0,0 +1,15 @@ +package parsekit + +// Expects is used to let a state function describe what input it is expecting. +// This expectation is used in error messages to make them more descriptive. +// +// Also, when defining an expectation inside a StateHandler, you do not need +// to handle unexpected input yourself. When the end of the function is +// reached without setting the next state, an automatic error will be +// emitted. This error differentiates between issues: +// * there is valid data on input, but it was not accepted by the function +// * there is an invalid UTF8 character on input +// * the end of the file was reached. +func (p *P) Expects(description string) { + p.expecting = description +} diff --git a/statehandler_on.go b/statehandler_on.go new file mode 100644 index 0000000..bbc7a49 --- /dev/null +++ b/statehandler_on.go @@ -0,0 +1,58 @@ +package parsekit + +// On checks if the current input matches the provided Matcher. 
+// +// This method is the start of a method chain in which multiple things can +// be arranged in one go: +// +// * Checking whether or not there is a match (this is what On does) +// * Deciding what to do with the match (Stay(): do nothing, Skip(): only move +// the cursor forward, Accept(): move cursor forward and add the match in +// the parser string buffer) +// * Deciding where to route to (e.g. using RouteTo() to route to a +// StateHandler by name) +// * Followup routing after that, when applicable (e.g. using something like +// RouteTo(...).ThenTo(...)) +// +// For every step of this chain, you can end the chain using the +// End() method. This will return a boolean value, indicating whether or +// not the initial On() method found a match in the input. +// End() is not mandatory. It is merely provided as a means to use +// a chain as an expression for a switch/case or if statement (since those +// require a boolean expression). +// +// You can omit "what to do with the match" and go straight into a routing +// method, e.g. On(...).RouteTo(...). This is functionally the same as +// using On(...).Stay().RouteTo(...). +// +// Here's a complete example chain: +// p.On(something).Accept().RouteTo(stateB).ThenTo(stateC).End() +func (p *P) On(Matcher Matcher) *matchAction { + m := &MatchDialog{p: p} + ok := Matcher.Match(m) + + // Keep track of the last match, to allow parser implementations + // to access it in an easy way. Typical use would be something like: + // if p.On(somethingBad).End() { + // p.EmitError("This was bad: %s", p.LastMatch) + // } + p.LastMatch = string(m.runes) + + return &matchAction{ + routeAction: routeAction{chainAction{p, ok}}, + runes: m.runes, + widths: m.widths, + } +} + +// chainAction is used for building method chains for the On() method. +type chainAction struct { + p *P + ok bool +} + +// End ends the method chain and returns a boolean indicating whether +// or not a match was found in the input.
+func (a *chainAction) End() bool {
+	return a.ok
+}
diff --git a/statehandler_on_match.go b/statehandler_on_match.go
new file mode 100644
index 0000000..874e661
--- /dev/null
+++ b/statehandler_on_match.go
@@ -0,0 +1,60 @@
+package parsekit
+
+// matchAction is a struct that is used for building On()-method chains.
+//
+// It embeds the routeAction struct, to make it possible to go right into
+// a route action, which is basically a simple way of aliasing a chain
+// like p.On(...).Stay().RouteTo(...) into p.On(...).RouteTo(...).
+type matchAction struct {
+	routeAction
+	runes  []rune
+	widths []int
+}
+
+// Accept tells the parser to move the cursor past a match that was found,
+// and to store the input that matched in the string buffer.
+// When no match was found, then no action is taken.
+// It returns a routeAction struct, which provides methods that can be used
+// to tell the parser what state to go to next.
+func (a *matchAction) Accept() *routeAction {
+	if a.ok {
+		for i, r := range a.runes {
+			a.p.buffer.writeRune(r)
+			a.p.advanceCursor(r, a.widths[i])
+		}
+	}
+	return &routeAction{chainAction: chainAction{a.p, a.ok}}
+}
+
+// Skip tells the parser to move the cursor past a match that was found,
+// without storing the actual match in the string buffer.
+// When no match was found, then no action is taken.
+// It returns a routeAction struct that can be used for follow-up routing.
+func (a *matchAction) Skip() *routeAction {
+	if a.ok {
+		for i, r := range a.runes {
+			a.p.advanceCursor(r, a.widths[i])
+		}
+	}
+	return &routeAction{chainAction: chainAction{a.p, a.ok}}
+}
+
+// Stay tells the parser to not move the cursor after finding a match.
+// It returns a routeAction struct that can be used for follow-up routing.
+func (a *matchAction) Stay() *routeAction {
+	return &routeAction{chainAction: chainAction{a.p, a.ok}}
+}
+
+// advanceCursor advances the rune cursor one position in the input data.
+// While doing so, it keeps track of newlines, so we can report on
+// row and column positions on error.
+func (p *P) advanceCursor(r rune, w int) {
+	p.pos += w
+	if p.newline {
+		p.cursorLine++
+		p.cursorColumn = 1
+	} else {
+		p.cursorColumn++
+	}
+	p.newline = r == '\n'
+}
diff --git a/statehandler_on_route.go b/statehandler_on_route.go
new file mode 100644
index 0000000..26f927c
--- /dev/null
+++ b/statehandler_on_route.go
@@ -0,0 +1,59 @@
+package parsekit
+
+// routeAction is a struct that is used for building On() method chains.
+type routeAction struct {
+	chainAction
+}
+
+// RouteRepeat indicates that on the next parsing cycle,
+// the current StateHandler must be reinvoked.
+func (a *routeAction) RouteRepeat() *chainAction {
+	if a.ok {
+		return a.p.RouteRepeat()
+	}
+	return &chainAction{nil, false}
+}
+
+// RouteTo tells the parser what StateHandler function to invoke
+// in the next parsing cycle.
+func (a *routeAction) RouteTo(state StateHandler) *routeFollowupAction {
+	if a.ok {
+		return a.p.RouteTo(state)
+	}
+	return &routeFollowupAction{chainAction: chainAction{nil, false}}
+}
+
+// RouteReturn tells the parser that on the next cycle the next scheduled
+// route must be invoked.
+func (a *routeAction) RouteReturn() *chainAction {
+	if a.ok {
+		return a.p.RouteReturn()
+	}
+	return &chainAction{nil, false}
+}
+
+// routeFollowupAction chains parsing routes.
+// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB).
+type routeFollowupAction struct {
+	chainAction
+}
+
+// ThenTo schedules a StateHandler that must be invoked after the RouteTo
+// StateHandler has been completed.
+// For example: p.RouteTo(handlerA).ThenTo(handlerB)
+func (a *routeFollowupAction) ThenTo(state StateHandler) *chainAction {
+	if a.ok {
+		a.p.pushRoute(state)
+	}
+	return &chainAction{nil, a.ok}
+}
+
+// ThenReturnHere schedules the current StateHandler to be invoked after
+// the RouteTo StateHandler has been completed.
+// For example: p.RouteTo(handlerA).ThenReturnHere()
+func (a *routeFollowupAction) ThenReturnHere() *chainAction {
+	if a.ok {
+		a.p.pushRoute(a.p.state)
+	}
+	return &chainAction{nil, a.ok}
+}
diff --git a/statehandler_routing.go b/statehandler_routing.go
new file mode 100644
index 0000000..9142da9
--- /dev/null
+++ b/statehandler_routing.go
@@ -0,0 +1,42 @@
+package parsekit
+
+// RouteTo tells the parser what StateHandler function to invoke
+// in the next parsing cycle.
+func (p *P) RouteTo(state StateHandler) *routeFollowupAction {
+	p.nextState = state
+	return &routeFollowupAction{chainAction: chainAction{p, true}}
+}
+
+// RouteRepeat indicates that on the next parsing cycle, the current
+// StateHandler must be reinvoked.
+func (p *P) RouteRepeat() *chainAction {
+	p.RouteTo(p.state)
+	return &chainAction{nil, true}
+}
+
+// RouteReturn tells the parser that on the next cycle the last
+// StateHandler that was pushed on the route stack must be invoked.
+//
+// Using this method is optional. When implementing a StateHandler that
+// is used as a sort of subroutine (using constructions like
+// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from
+// providing an explicit routing decision from that handler. The parser will
+// automatically assume a RouteReturn() in that case.
+func (p *P) RouteReturn() *chainAction {
+	p.nextState = p.popRoute()
+	return &chainAction{nil, true}
+}
+
+// pushRoute adds the StateHandler to the route stack.
+// This is used for implementing nested parsing.
+func (p *P) pushRoute(state StateHandler) {
+	p.routeStack = append(p.routeStack, state)
+}
+
+// popRoute pops the last pushed StateHandler from the route stack.
+func (p *P) popRoute() StateHandler {
+	last := len(p.routeStack) - 1
+	head, tail := p.routeStack[:last], p.routeStack[last]
+	p.routeStack = head
+	return tail
+}
diff --git a/stringbuf.go b/stringbuf.go
new file mode 100644
index 0000000..8df4659
--- /dev/null
+++ b/stringbuf.go
@@ -0,0 +1,62 @@
+package parsekit
+
+import (
+	"bytes"
+	"strconv"
+	"strings"
+)
+
+// stringBuffer is a string buffer implementation, which is used by the parser
+// to efficiently accumulate runes from the input and eventually turn these
+// into a string, either literal or interpreted.
+type stringBuffer struct {
+	buffer bytes.Buffer
+}
+
+// reset resets the string buffer, in order to build a new string.
+func (b *stringBuffer) reset() *stringBuffer {
+	b.buffer.Reset()
+	return b
+}
+
+// writeString adds the runes of the input string to the string buffer.
+func (b *stringBuffer) writeString(s string) *stringBuffer {
+	for _, r := range s {
+		b.writeRune(r)
+	}
+	return b
+}
+
+// writeRune adds a single rune to the string buffer.
+func (b *stringBuffer) writeRune(r rune) *stringBuffer {
+	b.buffer.WriteRune(r)
+	return b
+}
+
+// asLiteralString returns the string buffer as a literal string.
+// Literal means that no escape sequences are processed.
+func (b *stringBuffer) asLiteralString() string {
+	return b.buffer.String()
+}
+
+// asInterpretedString returns the string in its interpreted form.
+// Interpreted means that escape sequences are handled in the way that Go would
+// have, had it been inside double quotes. It translates for example escape
+// sequences like "\n", "\t", "\uXXXX" and "\UXXXXXXXX" into their string
+// representations.
+// Since the input might contain invalid escape sequences, this method
+// also returns an error. When an error is returned, the returned string will
+// contain the string as far as it could be interpreted.
+func (b *stringBuffer) asInterpretedString() (string, error) {
+	var sb strings.Builder
+	tail := b.buffer.String()
+	for len(tail) > 0 {
+		r, _, newtail, err := strconv.UnquoteChar(tail, '"')
+		if err != nil {
+			return sb.String(), err
+		}
+		tail = newtail
+		sb.WriteRune(r)
+	}
+	return sb.String(), nil
+}
diff --git a/stringbuf_test.go b/stringbuf_test.go
new file mode 100644
index 0000000..0140688
--- /dev/null
+++ b/stringbuf_test.go
@@ -0,0 +1,88 @@
+package parsekit
+
+import (
+	"testing"
+)
+
+func TestGeneratingStringDoesNotResetBuffer(t *testing.T) {
+	var b stringBuffer
+	s1, _ := b.writeString(`hi\nthere`).asInterpretedString()
+	s2 := b.asLiteralString()
+	if s1 != "hi\nthere" {
+		t.Fatalf("Did not get expected string %q for try 1, but %q", "hi\nthere", s1)
+	}
+	if s2 != "hi\\nthere" {
+		t.Fatalf("Did not get expected string %q for try 2, but %q", "hi\\nthere", s2)
+	}
+}
+
+func TestResetResetsBuffer(t *testing.T) {
+	var b stringBuffer
+	s := b.writeRune('X').reset().asLiteralString()
+	if s != "" {
+		t.Fatalf("Did not get expected empty string, but %q", s)
+	}
+}
+
+func TestAsLiteralString(t *testing.T) {
+	b := stringBuffer{}
+	for _, c := range []stringbufT{
+		{"empty string", ``, ``, OK},
+		{"simple string", `Simple string!`, `Simple string!`, OK},
+		{"single quote", `'`, `'`, OK},
+		{"double quote", `"`, `"`, OK},
+		{"escaped single quote", `\'`, `\'`, OK},
+		{"escaped double quote", `\"`, `\"`, OK},
+		{"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK},
+		{"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK},
+		{"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK},
+	} {
+		s := b.reset().writeString(c.in).asLiteralString()
+		if s != c.out {
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
+		}
+	}
+}
+
+func TestAsInterpretedString(t *testing.T) {
+	b := stringBuffer{}
+	for _, c := range []stringbufT{
+		{"empty string", "", "", OK},
+		{"simple string", "Simple string!", "Simple string!", OK},
+		{"escaped single quote", `\'`, "", FAIL},
+		{"escaped double quote", `\"`, `"`, OK},
+		{"bare single quote", `'`, "'", OK},
+		{"string in single quotes", `'Hello'`, `'Hello'`, OK},
+		{"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK},
+		{"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK},
+		{"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK},
+		{"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK},
+		{"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK},
+		{"example from spec",
+			`I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`,
+			"I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK},
+	} {
+		s, err := b.reset().writeString(c.in).asInterpretedString()
+		if c.isSuccessCase && err != nil {
+			t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err)
+		}
+		if !c.isSuccessCase && err == nil {
+			t.Fatalf("[%s] expected a failure, but no failure occurred", c.name)
+		}
+		if s != c.out && c.isSuccessCase {
+			t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s)
+		}
+	}
+}
+
+type stringbufT struct {
+	name          string
+	in            string
+	out           string
+	isSuccessCase bool
+}
+
+const (
+	OK   bool = true
+	FAIL bool = false
+)