From 8a09b7ca493de1249fe672fad552ce16c110dbc6 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Wed, 22 May 2019 12:44:29 +0000 Subject: [PATCH] Implemented a lot of atoms in the parsekit library, ready for use by a parser implementation. --- atoms.go | 114 +++++++++++++++ atoms_test.go | 131 ++++++++++++++++++ parser_combinators.go => combinators.go | 26 +--- ...combinators_test.go => combinators_test.go | 21 ++- parsekit_test.go | 11 ++ statehandler_emit.go | 3 + statehandler_on.go | 31 +++-- 7 files changed, 292 insertions(+), 45 deletions(-) create mode 100644 atoms.go create mode 100644 atoms_test.go rename parser_combinators.go => combinators.go (93%) rename parser_combinators_test.go => combinators_test.go (95%) create mode 100644 parsekit_test.go diff --git a/atoms.go b/atoms.go new file mode 100644 index 0000000..38785cb --- /dev/null +++ b/atoms.go @@ -0,0 +1,114 @@ +package parsekit + +// A provides convenient access to a range of atoms that can be used to +// build combinators or parsing rules. 
+var A = struct { + EndOfFile Matcher + AnyRune Matcher + Space Matcher + Tab Matcher + CarriageRet Matcher + Newline Matcher + Excl Matcher + DoubleQuote Matcher + Hash Matcher + Dollar Matcher + Percent Matcher + Amp Matcher + SingleQuote Matcher + RoundOpen Matcher + RoundClose Matcher + Asterisk Matcher + Plus Matcher + Comma Matcher + Minus Matcher + Dot Matcher + Slash Matcher + Colon Matcher + Semicolon Matcher + AngleOpen Matcher + Equal Matcher + AngleClose Matcher + Question Matcher + At Matcher + SquareOpen Matcher + Backslash Matcher + SquareClose Matcher + Caret Matcher + Underscore Matcher + Backquote Matcher + CurlyOpen Matcher + Pipe Matcher + CurlyClose Matcher + Tilde Matcher + Whitespace Matcher + WhitespaceAndNewlines Matcher + EndOfLine Matcher + Digit Matcher + ASCII Matcher + ASCIILower Matcher + ASCIIUpper Matcher + HexDigit Matcher +}{ + EndOfFile: MatchEndOfFile(), + AnyRune: MatchAnyRune(), + Space: C.Rune(' '), + Tab: C.Rune('\t'), + CarriageRet: C.Rune('\r'), + Newline: C.Rune('\n'), + Excl: C.Rune('!'), + DoubleQuote: C.Rune('"'), + Hash: C.Rune('#'), + Dollar: C.Rune('$'), + Percent: C.Rune('%'), + Amp: C.Rune('&'), + SingleQuote: C.Rune('\''), + RoundOpen: C.Rune('('), + RoundClose: C.Rune(')'), + Asterisk: C.Rune('*'), + Plus: C.Rune('+'), + Comma: C.Rune(','), + Minus: C.Rune('-'), + Dot: C.Rune('.'), + Slash: C.Rune('/'), + Colon: C.Rune(':'), + Semicolon: C.Rune(';'), + AngleOpen: C.Rune('<'), + Equal: C.Rune('='), + AngleClose: C.Rune('>'), + Question: C.Rune('?'), + At: C.Rune('@'), + SquareOpen: C.Rune('['), + Backslash: C.Rune('\\'), + SquareClose: C.Rune(']'), + Caret: C.Rune('^'), + Underscore: C.Rune('_'), + Backquote: C.Rune('`'), + CurlyOpen: C.Rune('{'), + Pipe: C.Rune('|'), + CurlyClose: C.Rune('}'), + Tilde: C.Rune('~'), + Whitespace: C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'))), + WhitespaceAndNewlines: C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'), C.Rune('\r'), C.Rune('\n'))), + EndOfLine: 
C.AnyOf(C.String("\r\n"), C.Rune('\n'), MatchEndOfFile()), + Digit: C.RuneRange('0', '9'), + ASCII: C.RuneRange('\x00', '\x7F'), + ASCIILower: C.RuneRange('a', 'z'), + ASCIIUpper: C.RuneRange('A', 'Z'), + HexDigit: C.AnyOf(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), +} + +func MatchEndOfFile() Matcher { + return func(m *MatchDialog) bool { + fork := m.Fork() + input, ok := fork.NextRune() + return !ok && input == EOF + } +} + +func MatchAnyRune() Matcher { + return func(m *MatchDialog) bool { + _, ok := m.NextRune() + return ok + } +} diff --git a/atoms_test.go b/atoms_test.go new file mode 100644 index 0000000..dd63535 --- /dev/null +++ b/atoms_test.go @@ -0,0 +1,131 @@ +package parsekit_test + +import ( + "testing" + + "git.makaay.nl/mauricem/go-parsekit" +) + +func TestAtoms(t *testing.T) { + for i, c := range []struct { + input string + matcher parsekit.Matcher + mustMatch bool + }{ + {"", a.EndOfFile, true}, + {"⌘", a.AnyRune, true}, + {"\xbc", a.AnyRune, false}, // invalid UTF8 rune + {"", a.AnyRune, false}, // end of file + {" ", a.Space, true}, + {"X", a.Space, false}, + {"\t", a.Tab, true}, + {"\r", a.CarriageRet, true}, + {"\n", a.Newline, true}, + {"!", a.Excl, true}, + {"\"", a.DoubleQuote, true}, + {"#", a.Hash, true}, + {"$", a.Dollar, true}, + {"%", a.Percent, true}, + {"&", a.Amp, true}, + {"'", a.SingleQuote, true}, + {"(", a.RoundOpen, true}, + {")", a.RoundClose, true}, + {"*", a.Asterisk, true}, + {"+", a.Plus, true}, + {",", a.Comma, true}, + {"-", a.Minus, true}, + {".", a.Dot, true}, + {"/", a.Slash, true}, + {":", a.Colon, true}, + {";", a.Semicolon, true}, + {"<", a.AngleOpen, true}, + {"=", a.Equal, true}, + {">", a.AngleClose, true}, + {"?", a.Question, true}, + {"@", a.At, true}, + {"[", a.SquareOpen, true}, + {"\\", a.Backslash, true}, + {"]", a.SquareClose, true}, + {"^", a.Caret, true}, + {"_", a.Underscore, true}, + {"`", a.Backquote, true}, + {"{", a.CurlyOpen, true}, + {"|", a.Pipe, true}, + {"}", 
a.CurlyClose, true}, + {"~", a.Tilde, true}, + {" \t \t ", a.Whitespace, true}, + {" \t\r\n ", a.WhitespaceAndNewlines, true}, + {"", a.EndOfLine, true}, + {"\r\n", a.EndOfLine, true}, + {"\n", a.EndOfLine, true}, + {"0", a.Digit, true}, + {"1", a.Digit, true}, + {"2", a.Digit, true}, + {"3", a.Digit, true}, + {"4", a.Digit, true}, + {"5", a.Digit, true}, + {"6", a.Digit, true}, + {"7", a.Digit, true}, + {"8", a.Digit, true}, + {"9", a.Digit, true}, + {"X", a.Digit, false}, + {"a", a.ASCIILower, true}, + {"z", a.ASCIILower, true}, + {"A", a.ASCIILower, false}, + {"Z", a.ASCIILower, false}, + {"A", a.ASCIIUpper, true}, + {"Z", a.ASCIIUpper, true}, + {"a", a.ASCIIUpper, false}, + {"z", a.ASCIIUpper, false}, + {"0", a.HexDigit, true}, + {"9", a.HexDigit, true}, + {"a", a.HexDigit, true}, + {"f", a.HexDigit, true}, + {"A", a.HexDigit, true}, + {"F", a.HexDigit, true}, + {"g", a.HexDigit, false}, + {"G", a.HexDigit, false}, + } { + parser := parsekit.New(c.input, func(p *parsekit.P) { + if p.On(c.matcher).Accept().End() { + p.EmitLiteral(SuccessItem) + } else { + p.EmitLiteral(FailItem) + } + }) + item, err, ok := parser.Next() + if !ok { + t.Fatalf("Test [%d] %q failed with error: %s", i+1, c.input, err) + } + if c.mustMatch && item.Type != SuccessItem { + t.Fatalf("Test [%d] %q failed: should match, but it didn't", i+1, c.input) + } + if !c.mustMatch && item.Type != FailItem { + t.Fatalf("Test [%d] %q failed: should not match, but it did", i+1, c.input) + } + } +} + +func TestSequenceOfRunes(t *testing.T) { + sequence := c.Sequence( + a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, + a.RoundClose, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash, + a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question, + a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore, + a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde, + ) + input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + parser := parsekit.New(input, func(p *parsekit.P) { 
+ p.Expects("Sequence of runes") + if p.On(sequence).Accept().End() { + p.EmitLiteral(TestItem) + } + }) + item, err, ok := parser.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if item.Value != input { + t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) + } +} diff --git a/parser_combinators.go b/combinators.go similarity index 93% rename from parser_combinators.go rename to combinators.go index 004376e..25b3851 100644 --- a/parser_combinators.go +++ b/combinators.go @@ -5,8 +5,10 @@ import ( "unicode/utf8" ) -// Not in need of it myself, but nice to have I guess: +// Nice to have I guess: // - LookAhead +// - Ready to go combinators for various number notations +// - Ready to go atoms (C.space, C.tab, C.digits, C.asciiUpper, etc...) type Matcher func(m *MatchDialog) bool @@ -49,7 +51,7 @@ func (m *MatchDialog) NextRune() (rune, bool) { // 3 runes returned from NextRune() which match the expectations, then the // slice of runes inside the MatchDialog will contain these 3 runes. // When after this the 4th rune turns out to be a mismatch, the forked -// MatchDialog can simply be disarded, and the state in the parent will be +// MatchDialog can simply be discarded, and the state in the parent will be // kept as-is. // // When a forked MatchDialog is in use, and the Matcher decides that a @@ -87,15 +89,13 @@ func (m *MatchDialog) Clear() { m.widths = []int{} } -// C provides convenient access to a wide range of parser/combinator +// C provides convenient access to a range of parser/combinator // constructors that can be used to build matching expressions. // // When using C in your own parser, then it is advised to create // a variable in your own package to reference it (var c = parsekit.C). // This saves a lot of typing, and it makes your code a lot cleaner. 
var C = struct { - EndOfFile func() Matcher - AnyRune func() Matcher Rune func(rune) Matcher Runes func(...rune) Matcher RuneRange func(rune, rune) Matcher @@ -114,8 +114,6 @@ var C = struct { Separated func(Matcher, Matcher) Matcher Drop func(Matcher) Matcher }{ - EndOfFile: MatchEndOfFile, - AnyRune: MatchAnyRune, Rune: MatchRune, Runes: MatchRunes, RuneRange: MatchRuneRange, @@ -135,20 +133,6 @@ var C = struct { Drop: MatchDrop, } -func MatchEndOfFile() Matcher { - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - return !ok && input == EOF - } -} - -func MatchAnyRune() Matcher { - return func(m *MatchDialog) bool { - _, ok := m.NextRune() - return ok - } -} - func MatchRune(r rune) Matcher { return func(m *MatchDialog) bool { input, ok := m.NextRune() diff --git a/parser_combinators_test.go b/combinators_test.go similarity index 95% rename from parser_combinators_test.go rename to combinators_test.go index 449f0b1..0c5e798 100644 --- a/parser_combinators_test.go +++ b/combinators_test.go @@ -5,29 +5,24 @@ import ( "testing" "git.makaay.nl/mauricem/go-parsekit" - p "git.makaay.nl/mauricem/go-parsekit" ) -var c = p.C - -const TestItem p.ItemType = 1 - -func newParser(input string, Matcher p.Matcher) *p.P { - stateFn := func(p *p.P) { +func newParser(input string, Matcher parsekit.Matcher) *parsekit.P { + stateFn := func(p *parsekit.P) { p.Expects("MATCH") if p.On(Matcher).Accept().End() { p.EmitLiteral(TestItem) p.RouteRepeat() } } - return p.New(input, stateFn) + return parsekit.New(input, stateFn) } func ExampleTestMatchAny(t *testing.T) { parser := parsekit.New( "¡Any / valid / character will dö!", func(p *parsekit.P) { - p.On(parsekit.MatchAnyRune()).Accept() + p.On(a.AnyRune).Accept() p.EmitLiteral(TestItem) }) match, _, ok := parser.Next() @@ -37,7 +32,7 @@ func ExampleTestMatchAny(t *testing.T) { } func TestMatchAnyRune(t *testing.T) { - p := newParser("o", c.AnyRune()) + p := newParser("o", a.AnyRune) r, err, ok := p.Next() if !ok { 
t.Fatalf("Parsing failed: %s", err) @@ -51,7 +46,7 @@ func TestMatchAnyRune(t *testing.T) { } func TestMatchAnyRune_AtEndOfFile(t *testing.T) { - p := newParser("", c.AnyRune()) + p := newParser("", a.AnyRune) _, err, ok := p.Next() if ok { t.Fatalf("Parsing unexpectedly succeeded") @@ -63,7 +58,7 @@ func TestMatchAnyRune_AtEndOfFile(t *testing.T) { } func TestMatchAnyRune_AtInvalidUtf8Rune(t *testing.T) { - p := newParser("\xcd", c.AnyRune()) + p := newParser("\xcd", a.AnyRune) _, err, ok := p.Next() if ok { t.Fatalf("Parsing unexpectedly succeeded") @@ -335,7 +330,7 @@ func TestMatchOptional(t *testing.T) { func TestMatchDrop(t *testing.T) { dashes := c.OneOrMore(c.Rune('-')) - p := newParser("---X---", c.Sequence(c.Drop(dashes), c.AnyRune(), c.Drop(dashes))) + p := newParser("---X---", c.Sequence(c.Drop(dashes), a.AnyRune, c.Drop(dashes))) r, err, ok := p.Next() if !ok { t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Line, err.Column) diff --git a/parsekit_test.go b/parsekit_test.go new file mode 100644 index 0000000..058b6b5 --- /dev/null +++ b/parsekit_test.go @@ -0,0 +1,11 @@ +package parsekit_test + +import "git.makaay.nl/mauricem/go-parsekit" + +const ( + TestItem parsekit.ItemType = 1 + SuccessItem parsekit.ItemType = 2 + FailItem parsekit.ItemType = 3 +) + +var c, a = parsekit.C, parsekit.A diff --git a/statehandler_emit.go b/statehandler_emit.go index 646f342..2e20966 100644 --- a/statehandler_emit.go +++ b/statehandler_emit.go @@ -49,6 +49,9 @@ func (p *P) EmitLiteralTrim(t ItemType) { func (p *P) EmitInterpreted(t ItemType) error { s, err := p.buffer.asInterpretedString() if err != nil { + p.EmitError( + "invalid string: %s (%s, forgot to escape a double quote or backslash maybe?)", + p.buffer.asLiteralString(), err) return err } p.Emit(t, s) diff --git a/statehandler_on.go b/statehandler_on.go index bd737da..93691f9 100644 --- a/statehandler_on.go +++ b/statehandler_on.go @@ -5,14 +5,17 @@ package parsekit // This method is the start 
of a chain method in which multiple things can // be arranged in one go: // -// * Checking whether or not there is a match (this is what On does) -// * Deciding what to do with the match (Stay(): do nothing, Skip(): only move -// the cursor forward, Accept(): move cursor forward and add the match in -// the parser string buffer) -// * Dedicing where to route to (e.g. using RouteTo() to route to a -// StateHandler by name) -// * Followup routing after that, when applicable (.e.g using something like -// RouteTo(...).ThenTo(...)) +// 1) Checking whether or not there is a match (this is what On does) +// +// 2) Deciding what to do with the match (Stay(): do nothing, Skip(): only move +// the cursor forward, Accept(): move cursor forward and add the match in +// the parser string buffer) +// +// 3) Deciding where to route to (e.g. using RouteTo() to route to a +// StateHandler by name) +// +// 4) Followup routing after that, when applicable (e.g. using something like +// RouteTo(...).ThenTo(...)) // // For every step of this chain, you can end the chain using the // End() method. This will return a boolean value, indicating whether or @@ -22,13 +25,19 @@ package parsekit // require a boolean expression). // // You can omit "what to do with the match" and go straight into a routing -// method, e.g. On(...).RouteTo(...). This is functionally the same as -// using On(...).Stay().RouteTo(...). +// method, e.g. +// On(...).RouteTo(...) +// This is functionally the same as using +// On(...).Stay().RouteTo(...). 
// // Here's a complete example chain: -// p.On(something).Accept().RouteTo(stateB).ThenTo(stateC).End() +// p.On(something).Accept().RouteTo(stateB).ThenTo(stateC).End() func (p *P) On(matcher Matcher) *matchAction { m := &MatchDialog{p: p} + if matcher == nil { + p.EmitError("internal parser error: matcher argument for On() is nil") + return &matchAction{routeAction: routeAction{chainAction: chainAction{nil, false}}} + } ok := matcher(m) // Keep track of the last match, to allow parser implementations