From 6ad449997197e136220d59a015949a8134ab9cc0 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Fri, 24 May 2019 12:41:34 +0000 Subject: [PATCH] Backup work, created a lot of tests for parser combinators and atoms. Pretty solid now! --- atoms.go | 114 ---------- atoms_test.go | 128 ----------- combinators.go | 296 ------------------------ combinators_test.go | 112 --------- matcher.go | 187 +++++++++++++++ matcher_builtin.go | 477 +++++++++++++++++++++++++++++++++++++++ matcher_builtin_test.go | 203 +++++++++++++++++ parsekit.go | 98 ++++---- parsekit_test.go | 41 +++- peek.go | 43 ---- statehandler.go | 128 +++++++++++ statehandler_emit.go | 33 ++- statehandler_expects.go | 15 -- statehandler_on.go | 128 ++++++++++- statehandler_on_match.go | 60 ----- statehandler_on_route.go | 59 ----- statehandler_routing.go | 42 ---- 17 files changed, 1216 insertions(+), 948 deletions(-) delete mode 100644 atoms.go delete mode 100644 atoms_test.go delete mode 100644 combinators.go delete mode 100644 combinators_test.go create mode 100644 matcher.go create mode 100644 matcher_builtin.go create mode 100644 matcher_builtin_test.go delete mode 100644 peek.go create mode 100644 statehandler.go delete mode 100644 statehandler_expects.go delete mode 100644 statehandler_on_match.go delete mode 100644 statehandler_on_route.go delete mode 100644 statehandler_routing.go diff --git a/atoms.go b/atoms.go deleted file mode 100644 index 38785cb..0000000 --- a/atoms.go +++ /dev/null @@ -1,114 +0,0 @@ -package parsekit - -// A provides convenient access to a range of atoms that can be used to -// build combinators or parsing rules. 
-var A = struct { - EndOfFile Matcher - AnyRune Matcher - Space Matcher - Tab Matcher - CarriageRet Matcher - Newline Matcher - Excl Matcher - DoubleQuote Matcher - Hash Matcher - Dollar Matcher - Percent Matcher - Amp Matcher - SingleQuote Matcher - RoundOpen Matcher - RoundClose Matcher - Asterisk Matcher - Plus Matcher - Comma Matcher - Minus Matcher - Dot Matcher - Slash Matcher - Colon Matcher - Semicolon Matcher - AngleOpen Matcher - Equal Matcher - AngleClose Matcher - Question Matcher - At Matcher - SquareOpen Matcher - Backslash Matcher - SquareClose Matcher - Caret Matcher - Underscore Matcher - Backquote Matcher - CurlyOpen Matcher - Pipe Matcher - CurlyClose Matcher - Tilde Matcher - Whitespace Matcher - WhitespaceAndNewlines Matcher - EndOfLine Matcher - Digit Matcher - ASCII Matcher - ASCIILower Matcher - ASCIIUpper Matcher - HexDigit Matcher -}{ - EndOfFile: MatchEndOfFile(), - AnyRune: MatchAnyRune(), - Space: C.Rune(' '), - Tab: C.Rune('\t'), - CarriageRet: C.Rune('\r'), - Newline: C.Rune('\n'), - Excl: C.Rune('!'), - DoubleQuote: C.Rune('"'), - Hash: C.Rune('#'), - Dollar: C.Rune('$'), - Percent: C.Rune('%'), - Amp: C.Rune('&'), - SingleQuote: C.Rune('\''), - RoundOpen: C.Rune('('), - RoundClose: C.Rune(')'), - Asterisk: C.Rune('*'), - Plus: C.Rune('+'), - Comma: C.Rune(','), - Minus: C.Rune('-'), - Dot: C.Rune('.'), - Slash: C.Rune('/'), - Colon: C.Rune(':'), - Semicolon: C.Rune(';'), - AngleOpen: C.Rune('<'), - Equal: C.Rune('='), - AngleClose: C.Rune('>'), - Question: C.Rune('?'), - At: C.Rune('@'), - SquareOpen: C.Rune('['), - Backslash: C.Rune('\\'), - SquareClose: C.Rune(']'), - Caret: C.Rune('^'), - Underscore: C.Rune('_'), - Backquote: C.Rune('`'), - CurlyOpen: C.Rune('{'), - Pipe: C.Rune('|'), - CurlyClose: C.Rune('}'), - Tilde: C.Rune('~'), - Whitespace: C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'))), - WhitespaceAndNewlines: C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'), C.Rune('\r'), C.Rune('\n'))), - EndOfLine: 
C.AnyOf(C.String("\r\n"), C.Rune('\n'), MatchEndOfFile()), - Digit: C.RuneRange('0', '9'), - ASCII: C.RuneRange('\x00', '\x7F'), - ASCIILower: C.RuneRange('a', 'z'), - ASCIIUpper: C.RuneRange('A', 'Z'), - HexDigit: C.AnyOf(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), -} - -func MatchEndOfFile() Matcher { - return func(m *MatchDialog) bool { - fork := m.Fork() - input, ok := fork.NextRune() - return !ok && input == EOF - } -} - -func MatchAnyRune() Matcher { - return func(m *MatchDialog) bool { - _, ok := m.NextRune() - return ok - } -} diff --git a/atoms_test.go b/atoms_test.go deleted file mode 100644 index 2a20c06..0000000 --- a/atoms_test.go +++ /dev/null @@ -1,128 +0,0 @@ -package parsekit_test - -import ( - "testing" - - "git.makaay.nl/mauricem/go-parsekit" -) - -func TestAtoms(t *testing.T) { - for i, c := range []struct { - input string - matcher parsekit.Matcher - mustMatch bool - }{ - {"", a.EndOfFile, true}, - {"⌘", a.AnyRune, true}, - {"\xbc", a.AnyRune, false}, // invalid UTF8 rune - {"", a.AnyRune, false}, // end of file - {" ", a.Space, true}, - {"X", a.Space, false}, - {"\t", a.Tab, true}, - {"\r", a.CarriageRet, true}, - {"\n", a.Newline, true}, - {"!", a.Excl, true}, - {"\"", a.DoubleQuote, true}, - {"#", a.Hash, true}, - {"$", a.Dollar, true}, - {"%", a.Percent, true}, - {"&", a.Amp, true}, - {"'", a.SingleQuote, true}, - {"(", a.RoundOpen, true}, - {")", a.RoundClose, true}, - {"*", a.Asterisk, true}, - {"+", a.Plus, true}, - {",", a.Comma, true}, - {"-", a.Minus, true}, - {".", a.Dot, true}, - {"/", a.Slash, true}, - {":", a.Colon, true}, - {";", a.Semicolon, true}, - {"<", a.AngleOpen, true}, - {"=", a.Equal, true}, - {">", a.AngleClose, true}, - {"?", a.Question, true}, - {"@", a.At, true}, - {"[", a.SquareOpen, true}, - {"\\", a.Backslash, true}, - {"]", a.SquareClose, true}, - {"^", a.Caret, true}, - {"_", a.Underscore, true}, - {"`", a.Backquote, true}, - {"{", a.CurlyOpen, true}, - {"|", a.Pipe, true}, - {"}", 
a.CurlyClose, true}, - {"~", a.Tilde, true}, - {" \t \t ", a.Whitespace, true}, - {" \t\r\n ", a.WhitespaceAndNewlines, true}, - {"", a.EndOfLine, true}, - {"\r\n", a.EndOfLine, true}, - {"\n", a.EndOfLine, true}, - {"0", a.Digit, true}, - {"1", a.Digit, true}, - {"2", a.Digit, true}, - {"3", a.Digit, true}, - {"4", a.Digit, true}, - {"5", a.Digit, true}, - {"6", a.Digit, true}, - {"7", a.Digit, true}, - {"8", a.Digit, true}, - {"9", a.Digit, true}, - {"X", a.Digit, false}, - {"a", a.ASCIILower, true}, - {"z", a.ASCIILower, true}, - {"A", a.ASCIILower, false}, - {"Z", a.ASCIILower, false}, - {"A", a.ASCIIUpper, true}, - {"Z", a.ASCIIUpper, true}, - {"a", a.ASCIIUpper, false}, - {"z", a.ASCIIUpper, false}, - {"0", a.HexDigit, true}, - {"9", a.HexDigit, true}, - {"a", a.HexDigit, true}, - {"f", a.HexDigit, true}, - {"A", a.HexDigit, true}, - {"F", a.HexDigit, true}, - {"g", a.HexDigit, false}, - {"G", a.HexDigit, false}, - } { - parser := parsekit.New(c.matcher).Parse(c.input) - item, err, ok := parser.Next() - if c.mustMatch { - if !ok { - t.Errorf("Test [%d] %q failed with error: %s", i+1, c.input, err) - } - if item.Type != parsekit.MatchedItem { - t.Errorf("Test [%d] %q failed: should match, but it didn't", i+1, c.input) - } - } else { - if ok { - t.Errorf("Test [%d] %q failed: should not match, but it did", i+1, c.input) - } - } - } -} - -func TestSequenceOfRunes(t *testing.T) { - sequence := c.Sequence( - a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, - a.RoundClose, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash, - a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question, - a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore, - a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde, - ) - input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" - parser := parsekit.New(func(p *parsekit.P) { - p.Expects("Sequence of runes") - if p.On(sequence).Accept().End() { - p.EmitLiteral(TestItem) - } - }) - item, err, ok := 
parser.Parse(input).Next() - if !ok { - t.Fatalf("Parsing failed: %s", err) - } - if item.Value != input { - t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) - } -} diff --git a/combinators.go b/combinators.go deleted file mode 100644 index 287d74c..0000000 --- a/combinators.go +++ /dev/null @@ -1,296 +0,0 @@ -package parsekit - -import ( - "unicode" - "unicode/utf8" -) - -// Nice to have I guess: -// - LookAhead -// - Ready to go combinators for various number notations -// - Ready to go atoms (C.space, C.tab, C.digits, C.asciiUpper, etc...) - -type Matcher func(m *MatchDialog) bool - -// MatchDialog is used by Matcher functions to retrieve data from the parser -// input to match against and to report back successful matches. -type MatchDialog struct { - p *P - runes []rune - widths []int - offset int - curRune rune - curWidth int - parent *MatchDialog -} - -// NextRune can be called by a Matcher on a MatchDialog in order -// to receive the next rune from the input. -// The rune is automatically added to the MatchDialog's slice of runes. -// Returns the rune and a boolean. The boolean will be false in -// case an invalid UTF8 rune of the end of the file was encountered. -func (m *MatchDialog) NextRune() (rune, bool) { - if m.curRune == utf8.RuneError { - panic("internal parser error: Matcher must not call NextRune() after it returned false") - } - r, w, ok := m.p.peek(m.offset) - m.offset += w - m.curRune = r - m.curWidth = w - m.runes = append(m.runes, r) - m.widths = append(m.widths, w) - return r, ok -} - -// Fork splits off a child MatchDialog, containing the same offset as the -// parent MatchDialog, but with all other data in a new state. -// -// By forking, a Matcher implementation can freely work with a MatchDialog, -// without affecting the parent MatchDialog. This is for example useful when -// the Matcher is checking for a sequence of runes. 
When there are first -// 3 runes returned from NextRune() which match the expectations, then the -// slice of runes inside the MatchDialog will contain these 3 runes. -// When after this the 4th rune turns out to be a mismatch, the forked -// MatchDialog can simply be discarded, and the state in the parent will be -// kept as-is. -// -// When a forked MatchDialog is in use, and the Matcher decides that a -// successul match was found, then the Merge() method can be called in -// order to transport the collected runes to the parent MatchDialog. -func (m *MatchDialog) Fork() *MatchDialog { - child := &MatchDialog{ - p: m.p, - offset: m.offset, - parent: m, - } - return child -} - -// Merge merges the data from a forked child MatchDialog back into its parent: -// * the runes that are accumulated in the child are added to the parent runes -// * the parent's offset is set to the child's offset -// After a Merge, the child MatchDialog is reset so it can immediately be -// reused for performing another match. -func (m *MatchDialog) Merge() bool { - if m.parent == nil { - panic("internal parser error: Cannot call Merge a a non-forked MatchDialog") - } - m.parent.runes = append(m.parent.runes, m.runes...) - m.parent.widths = append(m.parent.widths, m.widths...) - m.parent.offset = m.offset - m.Clear() - return true -} - -// Clear empties out the accumulated runes that are stored in the MatchDialog. -// The offset is kept as-is. -func (m *MatchDialog) Clear() { - m.runes = []rune{} - m.widths = []int{} -} - -// C provides convenient access to a range of parser/combinator -// constructors that can be used to build matching expressions. -// -// When using C in your own parser, then it is advised to create -// a variable in your own package to reference it (var c = parsekit.C). -// This saves a lot of typing, and it makes your code a lot cleaner. 
-var C = struct { - Rune func(rune) Matcher - Runes func(...rune) Matcher - RuneRange func(rune, rune) Matcher - String func(string) Matcher - StringNoCase func(string) Matcher - AnyOf func(...Matcher) Matcher - Not func(Matcher) Matcher - Optional func(Matcher) Matcher - Sequence func(...Matcher) Matcher - Repeat func(int, Matcher) Matcher - Min func(int, Matcher) Matcher - Max func(int, Matcher) Matcher - ZeroOrMore func(Matcher) Matcher - OneOrMore func(Matcher) Matcher - MinMax func(int, int, Matcher) Matcher - Separated func(Matcher, Matcher) Matcher - Drop func(Matcher) Matcher -}{ - Rune: MatchRune, - Runes: MatchRunes, - RuneRange: MatchRuneRange, - String: MatchString, - StringNoCase: MatchStringNoCase, - Optional: MatchOptional, - AnyOf: MatchAnyOf, - Not: MatchNot, - Sequence: MatchSequence, - Repeat: MatchRepeat, - Min: MatchMin, - Max: MatchMax, - ZeroOrMore: MatchZeroOrMore, - OneOrMore: MatchOneOrMore, - MinMax: MatchMinMax, - Separated: MatchSeparated, - Drop: MatchDrop, -} - -func MatchRune(r rune) Matcher { - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - return ok && input == r - } -} - -func MatchRunes(runes ...rune) Matcher { - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - if ok { - for _, r := range runes { - if input == r { - return true - } - } - } - return false - } -} - -func MatchRuneRange(start rune, end rune) Matcher { - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - return ok && input >= start && input <= end - } -} - -func MatchString(s string) Matcher { - var matchers = []Matcher{} - for _, r := range s { - matchers = append(matchers, MatchRune(r)) - } - return MatchSequence(matchers...) -} - -func MatchStringNoCase(s string) Matcher { - var matchers = []Matcher{} - for _, r := range s { - u := unicode.ToUpper(r) - l := unicode.ToLower(r) - matchers = append(matchers, MatchRunes(u, l)) - } - return MatchSequence(matchers...) 
-} - -func MatchOptional(matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if matcher(child) { - child.Merge() - } - return true - } -} - -func MatchSequence(matchers ...Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - for _, matcher := range matchers { - if !matcher(child) { - return false - } - } - child.Merge() - return true - } -} - -func MatchAnyOf(matchers ...Matcher) Matcher { - return func(m *MatchDialog) bool { - for _, matcher := range matchers { - child := m.Fork() - if matcher(child) { - return child.Merge() - } - } - return false - } -} - -func MatchNot(matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if !matcher(child) { - return child.Merge() - } - return false - } -} - -func MatchRepeat(count int, matcher Matcher) Matcher { - return MatchMinMax(count, count, matcher) -} - -func MatchMin(min int, matcher Matcher) Matcher { - return MatchMinMax(min, -1, matcher) -} - -func MatchMax(max int, matcher Matcher) Matcher { - return MatchMinMax(-1, max, matcher) -} - -func MatchZeroOrMore(matcher Matcher) Matcher { - return MatchMinMax(0, -1, matcher) -} - -func MatchOneOrMore(matcher Matcher) Matcher { - return MatchMinMax(1, -1, matcher) -} - -func MatchMinMax(min int, max int, matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if min >= 0 && max >= 0 && min > max { - panic("internal parser error: MatchRepeat definition error: max must not be < min") - } - total := 0 - // Specified min: check for the minimum required amount of matches. - for min > 0 && total < min { - total++ - if !matcher(child) { - return false - } - } - // No specified max: include the rest of the available matches. - if max < 0 { - child.Merge() - for matcher(child) { - child.Merge() - } - return true - } - // Specified max: include the rest of the availble matches, up to the max. 
- child.Merge() - for total < max { - total++ - if !matcher(child) { - break - } - child.Merge() - } - return true - } -} - -func MatchSeparated(separator Matcher, separated Matcher) Matcher { - return MatchSequence(separated, MatchZeroOrMore(MatchSequence(separator, separated))) -} - -func MatchDrop(matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if matcher(child) { - child.Clear() - child.Merge() - return true - } - return false - } -} diff --git a/combinators_test.go b/combinators_test.go deleted file mode 100644 index 3b3aa16..0000000 --- a/combinators_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package parsekit_test - -import ( - "fmt" - "testing" - - "git.makaay.nl/mauricem/go-parsekit" -) - -func ExampleMatchAnyRune(t *testing.T) { - parser := parsekit.New( - func(p *parsekit.P) { - p.Expects("Any valid rune") - if p.On(a.AnyRune).Accept().End() { - p.EmitLiteral(TestItem) - } - }) - run := parser.Parse("¡Any / valid / character will dö!") - match, _, ok := run.Next() - if ok { - fmt.Printf("Match = %q\n", match) - } -} - -func TestCombinators(t *testing.T) { - for i, c := range []struct { - input string - matcher parsekit.Matcher - mustMatch bool - expected string - }{ - {"xxx", c.Rune('x'), true, "x"}, - {"x ", c.Rune(' '), false, ""}, - {"aa", c.RuneRange('b', 'e'), false, ""}, - {"bb", c.RuneRange('b', 'e'), true, "b"}, - {"cc", c.RuneRange('b', 'e'), true, "c"}, - {"dd", c.RuneRange('b', 'e'), true, "d"}, - {"ee", c.RuneRange('b', 'e'), true, "e"}, - {"ff", c.RuneRange('b', 'e'), false, ""}, - {"Hello, world!", c.String("Hello"), true, "Hello"}, - {"HellÖ, world!", c.StringNoCase("hellö"), true, "HellÖ"}, - {"+X", c.Runes('+', '-', '*', '/'), true, "+"}, - {"-X", c.Runes('+', '-', '*', '/'), true, "-"}, - {"*X", c.Runes('+', '-', '*', '/'), true, "*"}, - {"/X", c.Runes('+', '-', '*', '/'), true, "/"}, - {"!X", c.Runes('+', '-', '*', '/'), false, ""}, - {"abc", c.Not(c.Rune('b')), true, "a"}, - {"bcd", 
c.Not(c.Rune('b')), false, ""}, - {"bcd", c.Not(c.Rune('b')), false, ""}, - {"abc", c.AnyOf(c.Rune('a'), c.Rune('b')), true, "a"}, - {"bcd", c.AnyOf(c.Rune('a'), c.Rune('b')), true, "b"}, - {"cde", c.AnyOf(c.Rune('a'), c.Rune('b')), false, ""}, - {"ababc", c.Repeat(4, c.Runes('a', 'b')), true, "abab"}, - {"ababc", c.Repeat(5, c.Runes('a', 'b')), false, ""}, - {"", c.Min(0, c.Rune('a')), true, ""}, - {"a", c.Min(0, c.Rune('a')), true, "a"}, - {"aaaaa", c.Min(4, c.Rune('a')), true, "aaaaa"}, - {"aaaaa", c.Min(5, c.Rune('a')), true, "aaaaa"}, - {"aaaaa", c.Min(6, c.Rune('a')), false, ""}, - {"", c.Max(4, c.Rune('b')), true, ""}, - {"X", c.Max(4, c.Rune('b')), true, ""}, - {"bbbbbX", c.Max(4, c.Rune('b')), true, "bbbb"}, - {"bbbbbX", c.Max(5, c.Rune('b')), true, "bbbbb"}, - {"bbbbbX", c.Max(6, c.Rune('b')), true, "bbbbb"}, - {"", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"X", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"cccccX", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"cccccX", c.MinMax(0, 1, c.Rune('c')), true, "c"}, - {"cccccX", c.MinMax(0, 5, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(0, 6, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(1, 1, c.Rune('c')), true, "c"}, - {"", c.MinMax(1, 1, c.Rune('c')), false, ""}, - {"X", c.MinMax(1, 1, c.Rune('c')), false, ""}, - {"cccccX", c.MinMax(1, 3, c.Rune('c')), true, "ccc"}, - {"cccccX", c.MinMax(1, 6, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(3, 4, c.Rune('c')), true, "cccc"}, - {"", c.OneOrMore(c.Rune('d')), false, ""}, - {"X", c.OneOrMore(c.Rune('d')), false, ""}, - {"dX", c.OneOrMore(c.Rune('d')), true, "d"}, - {"dddddX", c.OneOrMore(c.Rune('d')), true, "ddddd"}, - {"", c.ZeroOrMore(c.Rune('e')), true, ""}, - {"X", c.ZeroOrMore(c.Rune('e')), true, ""}, - {"eX", c.ZeroOrMore(c.Rune('e')), true, "e"}, - {"eeeeeX", c.ZeroOrMore(c.Rune('e')), true, "eeeee"}, - {"Hello, world!X", c.Sequence(c.String("Hello"), a.Comma, a.Space, c.String("world"), a.Excl), true, "Hello, world!"}, - {"101010123", 
c.OneOrMore(c.Sequence(c.Rune('1'), c.Rune('0'))), true, "101010"}, - {"", c.Optional(c.OneOrMore(c.Rune('f'))), true, ""}, - {"ghijkl", c.Optional(c.Rune('h')), true, ""}, - {"ghijkl", c.Optional(c.Rune('g')), true, "g"}, - {"fffffX", c.Optional(c.OneOrMore(c.Rune('f'))), true, "fffff"}, - {"--cool", c.Sequence(c.Drop(c.OneOrMore(a.Minus)), c.String("cool")), true, "cool"}, - {"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"}, - {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Sequence(a.Backslash, c.Rune('x'), c.Repeat(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, - } { - parser := parsekit.New(c.matcher).Parse(c.input) - item, err, ok := parser.Next() - - if c.mustMatch { - if !ok { - t.Errorf("Test [%d] %q failed with error: %s", i+1, c.input, err) - } else if item.Type != parsekit.MatchedItem { - t.Errorf("Test [%d] %q failed: should match, but it didn't", i+1, c.input) - } else if item.Value != c.expected { - t.Errorf("Test [%d] %q failed: not expected output:\nexpected: %s\nactual: %s\n", i, c.input, c.expected, item.Value) - } - } else { - if ok { - t.Errorf("Test [%d] %q failed: should not match, but it did", i+1, c.input) - } - } - } -} diff --git a/matcher.go b/matcher.go new file mode 100644 index 0000000..87ea504 --- /dev/null +++ b/matcher.go @@ -0,0 +1,187 @@ +package parsekit + +import ( + "fmt" +) + +// Matcher is the function type that must be implemented to create a function +// that can be used in conjunction with parsekit.P.On() or parsekit.New(). +// Its purpose is to check if input data matches some kind of pattern and to +// report back the match. +// +// A Matcher function gets a MatchDialog as its input and returns a boolean to +// indicate whether or not the Matcher found a match on the input. +// The MatchDialog is used for retrieving input data to match against +// and for reporting back results. 
+type Matcher func(m *MatchDialog) bool + +// MatchDialog is used by Matcher functions to retrieve runes from the +// input to match against and to report back results. +// +// Basic operation: +// +// To retrieve the next rune from the input, the Matcher function can call +// the MatchDialog.NextRune() method. +// +// The Matcher function can then evaluate the retrieved rune and either +// accept or skip the rune. When accepting it using MatchDialog.Accept(), +// the rune is added to the output of the MatchDialog. When using +// MatchDialog.Skip(), the rune will not be added to the output. It is +// mandatory for a Matcher to call either Accept() or Skip() after retrieving +// a rune, before calling NextRune() again. +// +// Eventually, the Matcher function must return a boolean value, indicating +// whether or not a match was found. When true, then the calling code will +// use the runes that were accepted into the MatchDialog's resulting output. +// +// Forking operation for easy lookahead support: +// +// Sometimes, a Matcher function must be able to perform a lookahead, which +// might either succeed or fail. In case of a failing lookahead, the state +// of the MatchDialog must be brought back to the original state. +// +// The way in which this is supported, is by forking a MatchDialog by calling +// MatchDialog.Fork(). This will return a child MatchDialog, with an empty +// output buffer, but using the same input offset as the forked parent. +// +// The Matcher function can then use the same interface as described for +// normal operation to retrieve runes from the input and to fill the output +// buffer. When the Matcher function decides that the lookahead was successful, +// then the method MatchDialog.Merge() can be called on the forked child to +// append the resulting output from the child to the parent's resulting output, +// and to update the parent input offset to that of the child. 
+// +// When the Matcher function decides that the lookahead was unsuccessful, then +// it can simply discard the forked child. The parent MatchDialog was never +// modified, so a new match can be safely started using that parent, as if the +// lookahead never happened. +type MatchDialog struct { + p *P // parser state, used to retrieve input data to match against (TODO should be interface) + inputOffset int // the byte offset into the input + input []rune // a slice of runes that represents the retrieved input runes for the Matcher + output []rune // a slice of runes that represents the accepted output runes for the Matcher + currRune *runeToken // hold the last rune that was read from the input + parent *MatchDialog // the parent MatchDialog, in case this one was forked +} + +type runeToken struct { + Rune rune + ByteSize int + OK bool +} + +// NextRune retrieves the next rune from the input. +// +// It returns the rune and a boolean. The boolean will be false in case an +// invalid UTF8 rune or the end of the file was encountered. +// +// After using NextRune() to retrieve a rune, Accept() or Skip() can be called +// to respectively add the rune to the MatchDialog's resulting output or to +// fully ignore it. This way, a Matcher has full control over what runes are +// significant for the resulting output of that matcher. +// +// After using NextRune(), this method can not be reinvoked, until the last read +// rune is explicitly accepted or skipped as described above. +func (m *MatchDialog) NextRune() (rune, bool) { + if m.currRune != nil { + panic("internal Matcher error: NextRune() was called without accepting or skipping the previously read rune") + } + r, w, ok := m.p.peek(m.inputOffset) + m.currRune = &runeToken{r, w, ok} + if ok { + m.input = append(m.input, r) + } + return r, ok +} + +// Fork splits off a child MatchDialog, containing the same offset as the +// parent MatchDialog, but with all other data in a fresh state. 
+// +// By forking, a Matcher function can freely work with a MatchDialog, without +// affecting the parent MatchDialog. This is for example useful when the +// Matcher function must perform some form of lookahead. +// +// When a successful match was found, the Matcher function can call +// child.Merge() to have the resulting output added to the parent MatchDialog. +// When no match was found, the forked child can simply be discarded. +// +// Example case: A Matcher checks for a sequence of runes: 'a', 'b', 'c', 'd'. +// This is done in 4 steps and only after finishing all steps, the Matcher +// function can confirm a successful match. The Matcher function for this +// case could look like this (yes, it's naive, but it shows the point): +// +// func MatchAbcd(m *MatchDialog) bool { +// child := m.Fork() // fork, so the input m is kept untouched +// for _, letter := range []rune{'a', 'b', 'c', 'd'} { +// if r, ok := child.NextRune(); !ok || r != letter { +// return false // report mismatch, m is left untouched +// } +// child.Accept() // add rune to child output +// } +// child.Merge() // we have a match, add resulting output to parent +// return true // and report the successful match +// } +func (m *MatchDialog) Fork() *MatchDialog { + child := &MatchDialog{ + p: m.p, + inputOffset: m.inputOffset, + parent: m, + } + return child +} + +// Accept will add the last rune as read by NextRune() to the resulting +// output of the MatchDialog. +func (m *MatchDialog) Accept() { + m.checkAllowedCall("Accept()") + m.output = append(m.output, m.currRune.Rune) + m.inputOffset += m.currRune.ByteSize + m.currRune = nil +} + +// Skip will ignore the last rune as read by NextRune(). 
+func (m *MatchDialog) Skip() { + m.checkAllowedCall("Skip()") + m.inputOffset += m.currRune.ByteSize + m.currRune = nil +} + +func (m *MatchDialog) checkAllowedCall(name string) { + if m.currRune == nil { + panic(fmt.Sprintf("internal Matcher error: %s was called without a prior call to NextRune()", name)) + } + if !m.currRune.OK { + panic(fmt.Sprintf("internal Matcher error: %s was called, but prior call to NextRun() did not return OK (EOF or invalid rune)", name)) + } +} + +// Merge merges the resulting output from a forked child MatchDialog back into +// its parent: The runes that are accepted in the child are added to the parent +// runes and the parent's offset is advanced to the child's offset. +// +// After the merge, the child MatchDialog is reset so it can immediately be +// reused for performing another match (all data are cleared, except for the +// input offset which is kept at its current position). +func (m *MatchDialog) Merge() bool { + if m.parent == nil { + panic("internal parser error: Cannot call Merge a a non-forked MatchDialog") + } + m.parent.input = append(m.parent.input, m.input...) + m.parent.output = append(m.parent.output, m.output...) + m.parent.inputOffset = m.inputOffset + m.ClearOutput() + m.ClearInput() + return true +} + +// ClearOutput clears the resulting output for the MatchDialog, but it keeps +// the input and input offset as-is. +func (m *MatchDialog) ClearOutput() { + m.output = []rune{} +} + +// ClearInput clears the input for the MatchDialog, but it keeps the output +// and input offset as-is. +func (m *MatchDialog) ClearInput() { + m.input = []rune{} +} diff --git a/matcher_builtin.go b/matcher_builtin.go new file mode 100644 index 0000000..6df87c3 --- /dev/null +++ b/matcher_builtin.go @@ -0,0 +1,477 @@ +package parsekit + +import ( + "fmt" + "strings" + "unicode" +) + +// C provides convenient access to a range of parser/combinators +// that can be used to build Matcher functions. 
+// +// When using C in your own parser, then it is advised to create +// a variable in your own package to reference it: +// +// var c = parsekit.C +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. +var C = struct { + Rune func(rune) Matcher + Runes func(...rune) Matcher + RuneRange func(rune, rune) Matcher + String func(string) Matcher + StringNoCase func(string) Matcher + AnyOf func(...Matcher) Matcher + Not func(Matcher) Matcher + Optional func(Matcher) Matcher + Sequence func(...Matcher) Matcher + Repeat func(int, Matcher) Matcher + Min func(int, Matcher) Matcher + Max func(int, Matcher) Matcher + ZeroOrMore func(Matcher) Matcher + OneOrMore func(Matcher) Matcher + MinMax func(int, int, Matcher) Matcher + Separated func(Matcher, Matcher) Matcher + Drop func(Matcher) Matcher + Trim func(Matcher, string) Matcher + TrimLeft func(Matcher, string) Matcher + TrimRight func(Matcher, string) Matcher +}{ + Rune: MatchRune, + Runes: MatchRunes, + RuneRange: MatchRuneRange, + String: MatchString, + StringNoCase: MatchStringNoCase, + Optional: MatchOptional, + AnyOf: MatchAnyOf, + Not: MatchNot, + Sequence: MatchSequence, + Repeat: MatchRepeat, + Min: MatchMin, + Max: MatchMax, + ZeroOrMore: MatchZeroOrMore, + OneOrMore: MatchOneOrMore, + MinMax: MatchMinMax, + Separated: MatchSeparated, + Drop: MatchDrop, + Trim: MatchTrim, + TrimLeft: MatchTrimLeft, + TrimRight: MatchTrimRight, +} + +// A provides convenient access to a range of atoms that can be used to +// build combinators or parsing rules. +// +// In parsekit, an atom is defined as a ready to go Matcher function. 
+var A = struct {
+	EndOfFile             Matcher
+	AnyRune               Matcher
+	Space                 Matcher
+	Tab                   Matcher
+	CR                    Matcher
+	LF                    Matcher
+	CRLF                  Matcher
+	Excl                  Matcher
+	DoubleQuote           Matcher
+	Hash                  Matcher
+	Dollar                Matcher
+	Percent               Matcher
+	Amp                   Matcher
+	SingleQuote           Matcher
+	RoundOpen             Matcher
+	RoundClose            Matcher
+	Asterisk              Matcher
+	Plus                  Matcher
+	Comma                 Matcher
+	Minus                 Matcher
+	Dot                   Matcher
+	Slash                 Matcher
+	Colon                 Matcher
+	Semicolon             Matcher
+	AngleOpen             Matcher
+	Equal                 Matcher
+	AngleClose            Matcher
+	Question              Matcher
+	At                    Matcher
+	SquareOpen            Matcher
+	Backslash             Matcher
+	SquareClose           Matcher
+	Caret                 Matcher
+	Underscore            Matcher
+	Backquote             Matcher
+	CurlyOpen             Matcher
+	Pipe                  Matcher
+	CurlyClose            Matcher
+	Tilde                 Matcher
+	Newline               Matcher
+	Whitespace            Matcher
+	WhitespaceAndNewlines Matcher
+	EndOfLine             Matcher
+	Digit                 Matcher
+	ASCII                 Matcher
+	ASCIILower            Matcher
+	ASCIIUpper            Matcher
+	HexDigit              Matcher
+}{
+	EndOfFile:             MatchEndOfFile(),
+	AnyRune:               MatchAnyRune(),
+	Space:                 C.Rune(' '),
+	Tab:                   C.Rune('\t'),
+	CR:                    C.Rune('\r'),
+	LF:                    C.Rune('\n'),
+	CRLF:                  C.String("\r\n"),
+	Excl:                  C.Rune('!'),
+	DoubleQuote:           C.Rune('"'),
+	Hash:                  C.Rune('#'),
+	Dollar:                C.Rune('$'),
+	Percent:               C.Rune('%'),
+	Amp:                   C.Rune('&'),
+	SingleQuote:           C.Rune('\''),
+	RoundOpen:             C.Rune('('),
+	RoundClose:            C.Rune(')'),
+	Asterisk:              C.Rune('*'),
+	Plus:                  C.Rune('+'),
+	Comma:                 C.Rune(','),
+	Minus:                 C.Rune('-'),
+	Dot:                   C.Rune('.'),
+	Slash:                 C.Rune('/'),
+	Colon:                 C.Rune(':'),
+	Semicolon:             C.Rune(';'),
+	AngleOpen:             C.Rune('<'),
+	Equal:                 C.Rune('='),
+	AngleClose:            C.Rune('>'),
+	Question:              C.Rune('?'),
+	At:                    C.Rune('@'),
+	SquareOpen:            C.Rune('['),
+	Backslash:             C.Rune('\\'),
+	SquareClose:           C.Rune(']'),
+	Caret:                 C.Rune('^'),
+	Underscore:            C.Rune('_'),
+	Backquote:             C.Rune('`'),
+	CurlyOpen:             C.Rune('{'),
+	Pipe:                  C.Rune('|'),
+	CurlyClose:            C.Rune('}'),
+	Tilde:                 C.Rune('~'),
+	Newline:               C.AnyOf(C.String("\r\n"), C.Rune('\n')), // NOTE(review): field was declared but left nil; CRLF-or-LF is the assumed meaning, confirm
+	Whitespace:            C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'))),
+	WhitespaceAndNewlines: C.OneOrMore(C.AnyOf(C.Rune(' '), C.Rune('\t'), C.String("\r\n"), C.Rune('\n'))),
+	EndOfLine:             C.AnyOf(C.String("\r\n"), C.Rune('\n'), MatchEndOfFile()),
+	Digit:                 C.RuneRange('0', '9'),
+	ASCII:                 C.RuneRange('\x00', '\x7F'),
+	ASCIILower:            C.RuneRange('a', 'z'),
+	ASCIIUpper:            C.RuneRange('A', 'Z'),
+	HexDigit:              C.AnyOf(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')),
+}
+
+// MatchRune creates a Matcher function that checks if the next rune from
+// the input matches the provided rune. On a match, the rune is accepted
+// and true is returned; otherwise nothing is accepted and false is returned.
+func MatchRune(expected rune) Matcher {
+	return func(m *MatchDialog) bool {
+		input, ok := m.NextRune()
+		if ok && input == expected {
+			m.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRunes creates a Matcher function that checks if the next rune
+// from the input is one of the provided runes.
+func MatchRunes(expected ...rune) Matcher {
+	// Build the lookup string once, when the Matcher is defined, so that
+	// every match attempt is a single ContainsRune call.
+	s := string(expected)
+	return func(m *MatchDialog) bool {
+		input, ok := m.NextRune()
+		if ok && strings.ContainsRune(s, input) {
+			m.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRuneRange creates a Matcher function that checks if the next rune
+// from the input is contained by the provided rune range.
+//
+// The rune range is defined by a start and an end rune, inclusive, so:
+//
+//     MatchRuneRange('g', 'k')
+//
+// creates a Matcher that will match any of 'g', 'h', 'i', 'j' or 'k'.
+//
+// When end is lower than start, the Matcher panics on its first use.
+func MatchRuneRange(start rune, end rune) Matcher {
+	return func(m *MatchDialog) bool {
+		// The check is kept inside the Matcher, so the moment at which an
+		// invalid definition panics (first use) does not change.
+		if end < start {
+			panic(fmt.Sprintf("internal parser error: MatchRuneRange definition error: start %q must not be > end %q", start, end))
+		}
+		input, ok := m.NextRune()
+		if ok && input >= start && input <= end {
+			m.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchString creates a Matcher that will check if the upcoming runes on the
+// input match the provided string.
+// TODO make this a more efficient string-level match?
+func MatchString(expected string) Matcher {
+	// len(expected) counts bytes, which is an upper bound for the number
+	// of runes, so the slice never has to grow.
+	matchers := make([]Matcher, 0, len(expected))
+	for _, r := range expected {
+		matchers = append(matchers, MatchRune(r))
+	}
+	return MatchSequence(matchers...)
+}
+
+// MatchStringNoCase creates a Matcher that will check if the upcoming runes
+// on the input match the provided string in a case-insensitive manner.
+// Every rune of the string is matched against both its upper case and its
+// lower case form.
+// TODO make this a more efficient string-level match?
+func MatchStringNoCase(expected string) Matcher {
+	matchers := make([]Matcher, 0, len(expected))
+	for _, r := range expected {
+		u := unicode.ToUpper(r)
+		l := unicode.ToLower(r)
+		matchers = append(matchers, MatchRunes(u, l))
+	}
+	return MatchSequence(matchers...)
+}
+
+// MatchOptional creates a Matcher that makes the provided Matcher optional.
+// When the provided Matcher applies, then its output is used, otherwise
+// no output is generated but still a successful match is reported.
+func MatchOptional(matcher Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		// Work on a fork, so a failed attempt leaves m untouched.
+		child := m.Fork()
+		if matcher(child) {
+			child.Merge()
+		}
+		return true
+	}
+}
+
+// MatchSequence creates a Matcher that checks if the provided Matchers can be
+// applied in their exact order. Only if all Matchers apply, the sequence
+// reports a successful match.
+func MatchSequence(matchers ...Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		// All Matchers run on one shared fork, which is merged back only
+		// after the last one has succeeded.
+		child := m.Fork()
+		for _, matcher := range matchers {
+			if !matcher(child) {
+				return false
+			}
+		}
+		child.Merge()
+		return true
+	}
+}
+
+// MatchAnyOf creates a Matcher that checks if any of the provided Matchers
+// can be applied. They are applied in their provided order. The first Matcher
+// that applies is used for reporting back a match.
+func MatchAnyOf(matchers ...Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		for _, matcher := range matchers {
+			// Every alternative gets a fresh fork, so a failed alternative
+			// cannot influence the next one.
+			child := m.Fork()
+			if matcher(child) {
+				return child.Merge()
+			}
+		}
+		return false
+	}
+}
+
+// MatchNot creates a Matcher that checks if the provided Matcher applies to
+// the current input. If it does, then a failed match will be reported. If it
+// does not, then the next rune from the input will be reported as a match.
+// When no rune can be read from the input, a failed match is reported too.
+func MatchNot(matcher Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		// The probe fork is never merged: it is only used for looking ahead.
+		probe := m.Fork()
+		if matcher(probe) {
+			return false
+		}
+		_, ok := m.NextRune()
+		if ok {
+			m.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRepeat creates a Matcher that checks if the provided Matcher can be
+// applied exactly the provided amount of times.
+//
+// Note that the input can contain more Matches for the provided matcher, e.g.:
+//
+//     MatchRepeat(4, MatchRune('X'))
+//
+// will not match input "XXX", it will match input "XXXX", but also "XXXXXX".
+// In that last case, there will be a remainder "XX" of the input.
+func MatchRepeat(times int, matcher Matcher) Matcher {
+	return matchMinMax(times, times, matcher)
+}
+
+// MatchMin creates a Matcher that checks if the provided Matcher can be
+// applied at least the provided minimum number of times.
+// When more matches are possible, these will be included in the output.
+func MatchMin(min int, matcher Matcher) Matcher {
+	return matchMinMax(min, -1, matcher)
+}
+
+// MatchMax creates a Matcher that checks if the provided Matcher can be
+// applied at most the provided maximum number of times.
+// When more matches are possible, only the first max matches are included
+// in the output; the remainder is left on the input.
+// Zero matches are considered a successful match.
+func MatchMax(max int, matcher Matcher) Matcher {
+	return matchMinMax(0, max, matcher)
+}
+
+// MatchZeroOrMore creates a Matcher that checks if the provided Matcher can
+// be applied zero or more times. All matches will be included in the output.
+// Zero matches are considered a successful match.
+func MatchZeroOrMore(matcher Matcher) Matcher {
+	return matchMinMax(0, -1, matcher)
+}
+
+// MatchOneOrMore creates a Matcher that checks if the provided Matcher can
+// be applied one or more times. All matches will be included in the output.
+func MatchOneOrMore(matcher Matcher) Matcher {
+	return matchMinMax(1, -1, matcher)
+}
+
+// MatchMinMax creates a Matcher that checks if the provided Matcher can
+// be applied between the provided minimum and maximum number of times,
+// inclusive. All matches will be included in the output.
+//
+// A negative min or max is a definition error and results in a panic.
+// When max is lower than min, the Matcher panics on its first use.
+func MatchMinMax(min int, max int, matcher Matcher) Matcher {
+	if max < 0 {
+		panic("internal parser error: MatchMinMax definition error: max must be >= 0 ")
+	}
+	if min < 0 {
+		panic("internal parser error: MatchMinMax definition error: min must be >= 0 ")
+	}
+	return matchMinMax(min, max, matcher)
+}
+
+// matchMinMax is the shared implementation behind MatchRepeat, MatchMin,
+// MatchMax, MatchZeroOrMore, MatchOneOrMore and MatchMinMax.
+// A negative max means that there is no upper bound on the number of matches.
+func matchMinMax(min int, max int, matcher Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		// Of the public constructors, only MatchMinMax can end up here with
+		// min > max, hence that is the name used in the message.
+		if max >= 0 && min > max {
+			panic(fmt.Sprintf("internal parser error: MatchMinMax definition error: max %d must not be < min %d", max, min))
+		}
+		child := m.Fork()
+		total := 0
+		// Check for the minimum required amount of matches. Nothing is
+		// merged when the minimum cannot be reached.
+		for ; total < min; total++ {
+			if !matcher(child) {
+				return false
+			}
+		}
+		child.Merge()
+		// No specified max: include the rest of the available matches.
+		// Specified max: include the rest of the available matches, up to the max.
+		// Every extra match is merged right away, so a failing attempt
+		// cannot undo the matches that were found before it.
+		for ; max < 0 || total < max; total++ {
+			if !matcher(child) {
+				break
+			}
+			child.Merge()
+		}
+		return true
+	}
+}
+
+// MatchSeparated creates a Matcher that checks for a pattern of one or more
+// Matchers of one type (the separated), separated by Matches of another type
+// (the separator). All matches (separated + separator) are included in the
+// output.
+func MatchSeparated(separated Matcher, separator Matcher) Matcher {
+	return MatchSequence(separated, MatchZeroOrMore(MatchSequence(separator, separated)))
+}
+
+// MatchDrop creates a Matcher that checks if the provided Matcher applies.
+// If it does, then a successful match is reported, but its output is not used.
+// If the Matcher does not apply, a successful match is reported as well.
+func MatchDrop(matcher Matcher) Matcher {
+	return func(m *MatchDialog) bool {
+		child := m.Fork()
+		if matcher(child) {
+			// Throw away what was matched before merging the fork back.
+			child.ClearOutput()
+			child.Merge()
+		}
+		// Dropping never fails, whether or not the Matcher applied.
+		return true
+	}
+}
+
+// MatchTrim creates a Matcher that checks if the provided Matcher applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from both the left and the right of the output.
+// The trimmed output is reported back as the match output.
+func MatchTrim(matcher Matcher, cutset string) Matcher {
+	return func(m *MatchDialog) bool {
+		return matchTrim(m, cutset, matcher, true, true)
+	}
+}
+
+// MatchTrimLeft creates a Matcher that checks if the provided Matcher applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from the left of the output.
+// The trimmed output is reported back as the match output.
+func MatchTrimLeft(matcher Matcher, cutset string) Matcher {
+	return func(m *MatchDialog) bool {
+		return matchTrim(m, cutset, matcher, true, false)
+	}
+}
+
+// MatchTrimRight creates a Matcher that checks if the provided Matcher applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from the right of the output.
+// The trimmed output is reported back as the match output.
+func MatchTrimRight(matcher Matcher, cutset string) Matcher {
+	return func(m *MatchDialog) bool {
+		return matchTrim(m, cutset, matcher, false, true)
+	}
+}
+
+// matchTrim is the shared implementation behind MatchTrim, MatchTrimLeft and
+// MatchTrimRight. It applies the matcher on a fork of m and, on a match,
+// trims the cutset characters from the requested side(s) of the output.
+// It returns false, leaving m untouched, when the matcher does not apply.
+func matchTrim(m *MatchDialog, cutset string, matcher Matcher, trimLeft bool, trimRight bool) bool {
+	child := m.Fork()
+	if !matcher(child) {
+		return false
+	}
+	// Trim the output of the fork, before it is merged. Trimming m.output
+	// after the merge would also strip cutset characters from output that
+	// was already collected in m by earlier Matchers.
+	// NOTE(review): this relies on Merge handing the output of the fork over
+	// to m, like MatchDrop relies on it for ClearOutput - confirm in matcher.go.
+	s := string(child.output)
+	if trimLeft {
+		s = strings.TrimLeft(s, cutset)
+	}
+	if trimRight {
+		s = strings.TrimRight(s, cutset)
+	}
+	child.output = []rune(s)
+	child.Merge()
+	return true
+}
+
+// MatchEndOfFile creates a Matcher that checks if the end of the input data
+// has been reached. This Matcher will never produce output.
It only reports +// a successful or a failing match through its boolean return value. +func MatchEndOfFile() Matcher { + return func(m *MatchDialog) bool { + fork := m.Fork() + input, ok := fork.NextRune() + return !ok && input == EOF + } +} + +// MatchAnyRune creates a Matcher function that checks if a valid rune can be +// read from the input. It reports back a successful match if the end of the +// input has not yet been reached and the upcoming input is a valid UTF8 rune. +func MatchAnyRune() Matcher { + return func(m *MatchDialog) bool { + _, ok := m.NextRune() + if ok { + m.Accept() + return true + } + return false + } +} diff --git a/matcher_builtin_test.go b/matcher_builtin_test.go new file mode 100644 index 0000000..963cf4f --- /dev/null +++ b/matcher_builtin_test.go @@ -0,0 +1,203 @@ +package parsekit_test + +import ( + "fmt" + "testing" + + "git.makaay.nl/mauricem/go-parsekit" +) + +func ExampleMatchAnyRune() { + parser := parsekit.New( + func(p *parsekit.P) { + p.Expects("Any valid rune") + if p.On(a.AnyRune).Accept().End() { + p.EmitLiteral(TestItem) + } + }) + run := parser.Parse("¡Any / valid / character will dö!") + match, _, ok := run.Next() + if ok { + fmt.Printf("Match = %q\n", match) + } +} + +func TestCombinators(t *testing.T) { + RunMatcherTests(t, []MatcherTest{ + {"xxx", c.Rune('x'), true, "x"}, + {"x ", c.Rune(' '), false, ""}, + {"aa", c.RuneRange('b', 'e'), false, ""}, + {"bb", c.RuneRange('b', 'e'), true, "b"}, + {"cc", c.RuneRange('b', 'e'), true, "c"}, + {"dd", c.RuneRange('b', 'e'), true, "d"}, + {"ee", c.RuneRange('b', 'e'), true, "e"}, + {"ff", c.RuneRange('b', 'e'), false, ""}, + {"Hello, world!", c.String("Hello"), true, "Hello"}, + {"HellÖ, world!", c.StringNoCase("hellö"), true, "HellÖ"}, + {"+X", c.Runes('+', '-', '*', '/'), true, "+"}, + {"-X", c.Runes('+', '-', '*', '/'), true, "-"}, + {"*X", c.Runes('+', '-', '*', '/'), true, "*"}, + {"/X", c.Runes('+', '-', '*', '/'), true, "/"}, + {"!X", c.Runes('+', '-', '*', '/'), false, 
""}, + {"abc", c.Not(c.Rune('b')), true, "a"}, + {"bcd", c.Not(c.Rune('b')), false, ""}, + {"bcd", c.Not(c.Rune('b')), false, ""}, + {"1010", c.Not(c.Sequence(c.Rune('2'), c.Rune('0'))), true, "1"}, + {"2020", c.Not(c.Sequence(c.Rune('2'), c.Rune('0'))), false, ""}, + {"abc", c.AnyOf(c.Rune('a'), c.Rune('b')), true, "a"}, + {"bcd", c.AnyOf(c.Rune('a'), c.Rune('b')), true, "b"}, + {"cde", c.AnyOf(c.Rune('a'), c.Rune('b')), false, ""}, + {"ababc", c.Repeat(4, c.Runes('a', 'b')), true, "abab"}, + {"ababc", c.Repeat(5, c.Runes('a', 'b')), false, ""}, + {"", c.Min(0, c.Rune('a')), true, ""}, + {"a", c.Min(0, c.Rune('a')), true, "a"}, + {"aaaaa", c.Min(4, c.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(5, c.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(6, c.Rune('a')), false, ""}, + {"", c.Max(4, c.Rune('b')), true, ""}, + {"X", c.Max(4, c.Rune('b')), true, ""}, + {"bbbbbX", c.Max(4, c.Rune('b')), true, "bbbb"}, + {"bbbbbX", c.Max(5, c.Rune('b')), true, "bbbbb"}, + {"bbbbbX", c.Max(6, c.Rune('b')), true, "bbbbb"}, + {"", c.MinMax(0, 0, c.Rune('c')), true, ""}, + {"X", c.MinMax(0, 0, c.Rune('c')), true, ""}, + {"cccccX", c.MinMax(0, 0, c.Rune('c')), true, ""}, + {"cccccX", c.MinMax(0, 1, c.Rune('c')), true, "c"}, + {"cccccX", c.MinMax(0, 5, c.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(0, 6, c.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(1, 1, c.Rune('c')), true, "c"}, + {"", c.MinMax(1, 1, c.Rune('c')), false, ""}, + {"X", c.MinMax(1, 1, c.Rune('c')), false, ""}, + {"cccccX", c.MinMax(1, 3, c.Rune('c')), true, "ccc"}, + {"cccccX", c.MinMax(1, 6, c.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(3, 4, c.Rune('c')), true, "cccc"}, + {"", c.OneOrMore(c.Rune('d')), false, ""}, + {"X", c.OneOrMore(c.Rune('d')), false, ""}, + {"dX", c.OneOrMore(c.Rune('d')), true, "d"}, + {"dddddX", c.OneOrMore(c.Rune('d')), true, "ddddd"}, + {"", c.ZeroOrMore(c.Rune('e')), true, ""}, + {"X", c.ZeroOrMore(c.Rune('e')), true, ""}, + {"eX", c.ZeroOrMore(c.Rune('e')), true, "e"}, + 
{"eeeeeX", c.ZeroOrMore(c.Rune('e')), true, "eeeee"}, + {"Hello, world!X", c.Sequence(c.String("Hello"), a.Comma, a.Space, c.String("world"), a.Excl), true, "Hello, world!"}, + {"101010123", c.OneOrMore(c.Sequence(c.Rune('1'), c.Rune('0'))), true, "101010"}, + {"", c.Optional(c.OneOrMore(c.Rune('f'))), true, ""}, + {"ghijkl", c.Optional(c.Rune('h')), true, ""}, + {"ghijkl", c.Optional(c.Rune('g')), true, "g"}, + {"fffffX", c.Optional(c.OneOrMore(c.Rune('f'))), true, "fffff"}, + {"1,2,3,b,c", c.Separated(a.Digit, a.Comma), true, "1,2,3"}, + {"--cool", c.Sequence(c.Drop(c.OneOrMore(a.Minus)), c.String("cool")), true, "cool"}, + {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Sequence(a.Backslash, c.Rune('x'), c.Repeat(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, + {" ", c.Trim(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" ", c.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" ", c.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" trim ", c.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, + {" \t trim \t ", c.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, + {" trim ", c.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, + {" trim ", c.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, + {" \t trim \t ", c.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, + }) +} + +func TestAtoms(t *testing.T) { + RunMatcherTests(t, []MatcherTest{ + {"", a.EndOfFile, true, ""}, + {"⌘", a.AnyRune, true, "⌘"}, + {"\xbc", a.AnyRune, false, ""}, // invalid UTF8 rune + {"", a.AnyRune, false, ""}, // end of file + {" ", a.Space, true, " "}, + {"X", a.Space, false, ""}, + {"\t", a.Tab, true, "\t"}, + {"\r", a.CR, true, "\r"}, + {"\n", a.LF, true, "\n"}, + {"!", a.Excl, true, "!"}, + {"\"", a.DoubleQuote, true, "\""}, + {"#", a.Hash, true, "#"}, + {"$", a.Dollar, true, "$"}, + {"%", a.Percent, true, "%"}, + {"&", a.Amp, true, "&"}, + {"'", a.SingleQuote, true, "'"}, + {"(", a.RoundOpen, true, "("}, + {")", a.RoundClose, true, ")"}, + {"*", a.Asterisk, 
true, "*"}, + {"+", a.Plus, true, "+"}, + {",", a.Comma, true, ","}, + {"-", a.Minus, true, "-"}, + {".", a.Dot, true, "."}, + {"/", a.Slash, true, "/"}, + {":", a.Colon, true, ":"}, + {";", a.Semicolon, true, ";"}, + {"<", a.AngleOpen, true, "<"}, + {"=", a.Equal, true, "="}, + {">", a.AngleClose, true, ">"}, + {"?", a.Question, true, "?"}, + {"@", a.At, true, "@"}, + {"[", a.SquareOpen, true, "["}, + {"\\", a.Backslash, true, "\\"}, + {"]", a.SquareClose, true, "]"}, + {"^", a.Caret, true, "^"}, + {"_", a.Underscore, true, "_"}, + {"`", a.Backquote, true, "`"}, + {"{", a.CurlyOpen, true, "{"}, + {"|", a.Pipe, true, "|"}, + {"}", a.CurlyClose, true, "}"}, + {"~", a.Tilde, true, "~"}, + {" \t \t \r\n", a.Whitespace, true, " \t \t "}, + {"\r", a.WhitespaceAndNewlines, false, ""}, + {" \t\r\n \r", a.WhitespaceAndNewlines, true, " \t\r\n "}, + {"", a.EndOfLine, true, ""}, + {"\r\n", a.EndOfLine, true, "\r\n"}, + {"\n", a.EndOfLine, true, "\n"}, + {"0", a.Digit, true, "0"}, + {"1", a.Digit, true, "1"}, + {"2", a.Digit, true, "2"}, + {"3", a.Digit, true, "3"}, + {"4", a.Digit, true, "4"}, + {"5", a.Digit, true, "5"}, + {"6", a.Digit, true, "6"}, + {"7", a.Digit, true, "7"}, + {"8", a.Digit, true, "8"}, + {"9", a.Digit, true, "9"}, + {"X", a.Digit, false, ""}, + {"a", a.ASCIILower, true, "a"}, + {"z", a.ASCIILower, true, "z"}, + {"A", a.ASCIILower, false, ""}, + {"Z", a.ASCIILower, false, ""}, + {"A", a.ASCIIUpper, true, "A"}, + {"Z", a.ASCIIUpper, true, "Z"}, + {"a", a.ASCIIUpper, false, ""}, + {"z", a.ASCIIUpper, false, ""}, + {"0", a.HexDigit, true, "0"}, + {"9", a.HexDigit, true, "9"}, + {"a", a.HexDigit, true, "a"}, + {"f", a.HexDigit, true, "f"}, + {"A", a.HexDigit, true, "A"}, + {"F", a.HexDigit, true, "F"}, + {"g", a.HexDigit, false, "g"}, + {"G", a.HexDigit, false, "G"}, + }) +} + +func TestSequenceOfRunes(t *testing.T) { + sequence := c.Sequence( + a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, + a.RoundClose, a.Asterisk, a.Plus, a.Comma, 
a.Minus, a.Dot, a.Slash, + a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question, + a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore, + a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde, + ) + input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + parser := parsekit.New(func(p *parsekit.P) { + p.Expects("Sequence of runes") + if p.On(sequence).Accept().End() { + p.EmitLiteral(TestItem) + } + }) + item, err, ok := parser.Parse(input).Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if item.Value != input { + t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) + } +} diff --git a/parsekit.go b/parsekit.go index 9f18f11..8990542 100644 --- a/parsekit.go +++ b/parsekit.go @@ -6,6 +6,14 @@ import ( "runtime" ) +// Parser is the top-level struct that holds the configuration for a parser. +// The Parser can be instantiated using the parsekit.New() method. +// +// To start parsing input data, use the method Parser.Parse(). +type Parser struct { + startState StateHandler // the function that handles the very first state +} + // New instantiates a new Parser. // The logic parameter provides the parsing logic to apply. This can be: // @@ -55,12 +63,13 @@ func makeParserForMatcher(matcher Matcher) *Parser { })) } -// Parser is the top-level parser. -type Parser struct { - startState StateHandler // the function that handles the very first state +// Run represents a single parse run for a Parser. +type Run struct { + p *P // a struct holding the internal state of a parse run } // Parse starts a parse run on the provided input data. +// To retrieve parse items from the run, make use of the Run.Next() method. func (p *Parser) Parse(input string) *Run { return &Run{ p: &P{ @@ -74,69 +83,59 @@ func (p *Parser) Parse(input string) *Run { } } -// Run represents a single parse run for a Parser. 
-type Run struct { - p *P // a struct holding the internal state of a parse run -} - -// P holds the internal state of a parse run. -type P struct { - state StateHandler // the function that handles the current state - nextState StateHandler // the function that will handle the next state - routeStack []StateHandler // route stack, for handling nested parsing - input string // the scanned input - len int // the total length of the input in bytes - pos int // current byte scanning position in the input - newline bool // keep track of when we have scanned a newline - cursorLine int // current row number in the input - cursorColumn int // current column position in the input - expecting string // a description of what the current state expects to find - buffer stringBuffer // an efficient buffer, used to build string values - items chan Item // channel of resulting Parser items - item Item // the current item as reached by Next() and retrieved by Get() - err *Error // an error when lexing failed, retrieved by Error() - - LastMatch string // a string representation of the last matched input data -} - // Next retrieves the next parsed item for a parse run. +// // When a valid item was found, then the boolean return parameter will be true. // On error or when successfully reaching the end of the input, false is returned. -// When an error occurred, it will be set in the error return value, nil otherwise. +// When an error occurred, false will be returned and the error return value will +// be set (default is nil). func (run *Run) Next() (Item, *Error, bool) { + // State handling loop: we handle states, until an Item is ready to be returned. for { select { + // If a state handler has emitted an (error) Item, then the state handling + // loop is stopped and the Item is returned to the caller. case i := <-run.p.items: return run.makeReturnValues(i) + // Otherwise, the next state handler is looked up and invoked. 
default: - run.runStatusHandler() + run.runNextStateHandler() } } } -// StateHandler defines the type of function that can be used to -// handle a parser state. -type StateHandler func(*P) +func (run *Run) makeReturnValues(i Item) (Item, *Error, bool) { + switch { + case i.Type == ItemEOF: + return i, nil, false + case i.Type == ItemError: + run.p.err = &Error{i.Value, run.p.cursorLine, run.p.cursorColumn} + return i, run.p.err, false + default: + run.p.item = i + return i, nil, true + } +} -// runStatusHandler moves the parser, which is bascially a state machine, +// runNextStateHandler moves the parser, which is bascially a state machine, // to its next status. It does so by invoking a function of the // type StateHandler. This function represents the current status and // is responsible for moving the parser to its next status, depending // on the parsed input data. -func (run *Run) runStatusHandler() { +func (run *Run) runNextStateHandler() { if state, ok := run.getNextStateHandler(); ok { - run.invokeNextStatusHandler(state) + run.invokeNextStateHandler(state) } } -// getNextStateHandler determines the next StatusHandler to invoke in order +// getNextStateHandler determines the next StateHandler to invoke in order // to move the parsing state machine one step further. // // When implementing a parser, the StateHandler functions must provide // a routing decision in every invocation. A routing decision is one // of the following: // -// * A route is specified explicitly, which means that the next StatusHandler +// * A route is specified explicitly, which means that the next StateHandler // function to invoke is registered during the StateHandler function // invocation. For example: p.RouteTo(nextStatus) // @@ -147,9 +146,9 @@ func (run *Run) runStatusHandler() { // a route explicitly, but otherStatus will be used implicitly after // the nextStatus function has returned. // -// * An expectation is registered by the StatusHandler. 
+// * An expectation is registered by the StateHandler. // For example: p.Expects("a cool thing") -// When the StatusHandler returns without having specified a route, this +// When the StateHandler returns without having specified a route, this // expectation is used to generate an "unexpected input" error message. // // When no routing decision is provided by a StateHandler, then this is @@ -169,24 +168,11 @@ func (run *Run) getNextStateHandler() (StateHandler, bool) { } } -// invokeNextStatusHandler moves the parser state to the provided state -// and invokes the StatusHandler function. -func (run *Run) invokeNextStatusHandler(state StateHandler) { +// invokeNextStateHandler moves the parser state to the provided state +// and invokes the StateHandler function. +func (run *Run) invokeNextStateHandler(state StateHandler) { run.p.state = state run.p.nextState = nil run.p.expecting = "" run.p.state(run.p) } - -func (run *Run) makeReturnValues(i Item) (Item, *Error, bool) { - switch { - case i.Type == ItemEOF: - return i, nil, false - case i.Type == ItemError: - run.p.err = &Error{i.Value, run.p.cursorLine, run.p.cursorColumn} - return i, run.p.err, false - default: - run.p.item = i - return i, nil, true - } -} diff --git a/parsekit_test.go b/parsekit_test.go index 2720f28..0ac4586 100644 --- a/parsekit_test.go +++ b/parsekit_test.go @@ -1,7 +1,46 @@ package parsekit_test -import "git.makaay.nl/mauricem/go-parsekit" +// This file only provides building blocks for writing tests. +// No actual tests belong in this file. 
+ +import ( + "testing" + + "git.makaay.nl/mauricem/go-parsekit" +) const TestItem parsekit.ItemType = 1 var c, a = parsekit.C, parsekit.A + +type MatcherTest struct { + input string + matcher parsekit.Matcher + mustMatch bool + expected string +} + +func RunMatcherTests(t *testing.T, testSet []MatcherTest) { + for _, test := range testSet { + RunMatcherTest(t, test) + } +} + +func RunMatcherTest(t *testing.T, test MatcherTest) { + parser := parsekit.New(test.matcher).Parse(test.input) + item, err, ok := parser.Next() + + if test.mustMatch { + if !ok { + t.Errorf("Test %q failed with error: %s", test.input, err) + } else if item.Type != parsekit.MatchedItem { + t.Errorf("Test %q failed: should match, but it didn't", test.input) + } else if item.Value != test.expected { + t.Errorf("Test %q failed: not expected output:\nexpected: %q\nactual: %q\n", test.input, test.expected, item.Value) + } + } else { + if ok { + t.Errorf("Test %q failed: should not match, but it did", test.input) + } + } +} diff --git a/peek.go b/peek.go deleted file mode 100644 index c45e2d5..0000000 --- a/peek.go +++ /dev/null @@ -1,43 +0,0 @@ -package parsekit - -import ( - "unicode/utf8" -) - -// peek returns but does not advance the cursor to the next rune(s) in the input. -// Returns the rune, its width in bytes and a boolean. -// The boolean will be false in case no upcoming rune can be peeked -// (end of data or invalid UTF8 character). -func (p *P) peek(offsetInBytes int) (rune, int, bool) { - r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) - return handleRuneError(r, w) -} - -// handleRuneError is used to normale rune value in case of errors. -// When an error occurs, then utf8.RuneError will be in the rune. 
-// This can however indicate one of two situations: -// * w == 0: end of file is reached -// * w == 1: invalid UTF character on input -// This function lets these two cases return respectively the -// package's own EOF or INVALID runes, to make it easy for client -// code to distinct between these two cases. -func handleRuneError(r rune, w int) (rune, int, bool) { - if r == utf8.RuneError { - if w == 0 { - return EOF, 0, false - } - return INVALID, w, false - } - return r, w, true -} - -// EOF is a special rune, which is used to indicate an end of file when -// reading a character from the input. -// It can be treated as a rune when writing parsing rules, so a valid way to -// say 'I now expect the end of the file' is using something like: -// if (p.On(c.Rune(EOF)).Skip()) { ... } -const EOF rune = -1 - -// INVALID is a special rune, which is used to indicate an invalid UTF8 -// rune on the input. -const INVALID rune = utf8.RuneError diff --git a/statehandler.go b/statehandler.go new file mode 100644 index 0000000..77fec0f --- /dev/null +++ b/statehandler.go @@ -0,0 +1,128 @@ +package parsekit + +import "unicode/utf8" + +// StateHandler defines the type of function that must be implemented to +// handle a parsing state. +// +// A StateHandler function gets a P struct as its input. This struct holds +// all the internal state for the parsing state machine and provides the +// interface that the StateHandler must use to interact with the parser. +type StateHandler func(*P) + +// P holds the internal state of a parse run and provides an API to +// StateHandler methods to communicate with the parser. 
+type P struct { + state StateHandler // the function that handles the current state + nextState StateHandler // the function that will handle the next state + routeStack []StateHandler // route stack, for handling nested parsing + input string // the scanned input + inputPos int // current byte cursor position in the input + cursorLine int // current rune cursor row number in the input + cursorColumn int // current rune cursor column position in the input + len int // the total length of the input in bytes + newline bool // keep track of when we have scanned a newline + expecting string // a description of what the current state expects to find + buffer stringBuffer // an efficient buffer, used to build string values + items chan Item // channel of resulting Parser items + item Item // the current item as reached by Next() and retrieved by Get() + err *Error // an error when lexing failed, retrieved by Error() + + LastMatch string // a string representation of the last matched input data +} + +// Expects is used to let a state function describe what input it is expecting. +// This expectation is used in error messages to make them more descriptive. +// +// When defining an expectation inside a StateHandler, you do not need to +// handle unexpected input yourself. When the end of the function is reached +// without setting the next state, an automatic error will be emitted. +// This error can differentiate between the following issues: +// +// * there is valid data on input, but it was not accepted by the function +// +// * there is an invalid UTF8 character on input +// +// * the end of the file was reached. +func (p *P) Expects(description string) { + p.expecting = description +} + +// peek returns but does not advance the cursor to the next rune(s) in the input. +// Returns the rune, its width in bytes and a boolean. +// The boolean will be false in case no upcoming rune can be peeked +// (end of data or invalid UTF8 character). 
+func (p *P) peek(byteOffset int) (rune, int, bool) { + r, w := utf8.DecodeRuneInString(p.input[p.inputPos+byteOffset:]) + return handleRuneError(r, w) +} + +// EOF is a special rune, which is used to indicate an end of file when +// reading a character from the input. +// It can be treated as a rune when writing parsing rules, so a valid way to +// say 'I now expect the end of the file' is using something like: +// if (p.On(c.Rune(EOF)).Skip()) { ... } +const EOF rune = -1 + +// INVALID is a special rune, which is used to indicate an invalid UTF8 +// rune on the input. +const INVALID rune = utf8.RuneError + +// handleRuneError is used to normale rune value in case of errors. +// When an error occurs, then utf8.RuneError will be in the rune. +// This can however indicate one of two situations: +// * w == 0: end of file is reached +// * w == 1: invalid UTF character on input +// This function lets these two cases return respectively the +// package's own EOF or INVALID runes, to make it easy for client +// code to distinct between these two cases. +func handleRuneError(r rune, w int) (rune, int, bool) { + if r == utf8.RuneError { + if w == 0 { + return EOF, 0, false + } + return INVALID, w, false + } + return r, w, true +} + +// RouteTo tells the parser what StateHandler function to invoke +// in the next parsing cycle. +func (p *P) RouteTo(state StateHandler) *routeFollowupAction { + p.nextState = state + return &routeFollowupAction{chainAction: chainAction{p, true}} +} + +// RouteRepeat indicates that on the next parsing cycle, the current +// StateHandler must be reinvoked. +func (p *P) RouteRepeat() *chainAction { + p.RouteTo(p.state) + return &chainAction{nil, true} +} + +// RouteReturn tells the parser that on the next cycle the last +// StateHandler that was pushed on the route stack must be invoked. +// +// Using this method is optional. 
When implementing a StateHandler that +// is used as a sort of subroutine (using constructions like +// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from +// providing an explicit routing decision from that handler. The parser will +// automatically assume a RouteReturn() in that case. +func (p *P) RouteReturn() *chainAction { + p.nextState = p.popRoute() + return &chainAction{nil, true} +} + +// pushRoute adds the StateHandler to the route stack. +// This is used for implementing nested parsing. +func (p *P) pushRoute(state StateHandler) { + p.routeStack = append(p.routeStack, state) +} + +// popRoute pops the last pushed StateHandler from the route stack. +func (p *P) popRoute() StateHandler { + last := len(p.routeStack) - 1 + head, tail := p.routeStack[:last], p.routeStack[last] + p.routeStack = head + return tail +} diff --git a/statehandler_emit.go b/statehandler_emit.go index 220964a..3cd5d54 100644 --- a/statehandler_emit.go +++ b/statehandler_emit.go @@ -2,7 +2,6 @@ package parsekit import ( "fmt" - "strings" ) // ItemType represents the type of a parser Item. @@ -16,7 +15,7 @@ const ItemEOF ItemType = -1 // an error has occurred during parsing. const ItemError ItemType = -2 -// Item is a built-in parser item type that is used for indicating a +// MatchedItem is a built-in parser item type that is used for indicating a // successful match when using a parser that is based on a Matcher. const MatchedItem ItemType = -3 @@ -27,8 +26,8 @@ type Item struct { } // Emit passes a Parser item to the client, including the provided string. -func (p *P) Emit(t ItemType, s string) { - p.items <- Item{t, s} +func (p *P) Emit(t ItemType, v string) { + p.items <- Item{t, v} p.buffer.reset() } @@ -38,28 +37,22 @@ func (p *P) EmitLiteral(t ItemType) { p.Emit(t, p.buffer.asLiteralString()) } -// EmitLiteralTrim passes a Parser item to the client, including -// accumulated string buffer data as a literal string with whitespace -// trimmed from it.
-func (p *P) EmitLiteralTrim(t ItemType) { - p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) -} - -// EmitInterpreted passes a Parser item to the client, including -// accumulated string buffer data a Go doubled quoted interpreted string -// (handling escape codes like \n, \t, \uXXXX, etc.) -// This method might return an error, in case there is data in the -// string buffer that is not valid for string interpretation. -func (p *P) EmitInterpreted(t ItemType) error { +// EmitInterpreted passes a Parser item to the client, including accumulated +// string buffer data a Go double quoted interpreted string (handling escape +// codes like \n, \t, \uXXXX, etc.) +// This method returns a boolean value, indicating whether or not the string +// interpretation was successful. On invalid string data, an error will +// automatically be emitted and false will be returned. +func (p *P) EmitInterpreted(t ItemType) bool { s, err := p.buffer.asInterpretedString() if err != nil { p.EmitError( "invalid string: %s (%s, forgot to escape a double quote or backslash maybe?)", p.buffer.asLiteralString(), err) - return err + return false } p.Emit(t, s) - return nil + return true } // Error is used as the error type when parsing errors occur. @@ -78,6 +71,8 @@ func (err *Error) Error() string { return err.Message } +// ErrorFull returns the current error message, including information about +// the position in the input where the error occurred. func (err *Error) ErrorFull() string { message := err.Error() return fmt.Sprintf("%s after line %d, column %d", message, err.Line, err.Column) diff --git a/statehandler_expects.go b/statehandler_expects.go deleted file mode 100644 index adc66ae..0000000 --- a/statehandler_expects.go +++ /dev/null @@ -1,15 +0,0 @@ -package parsekit - -// Expects is used to let a state function describe what input it is expecting. -// This expectation is used in error messages to make them more descriptive. 
-// -// Also, when defining an expectation inside a StateHandler, you do not need -// to handle unexpected input yourself. When the end of the function is -// reached without setting the next state, an automatic error will be -// emitted. This error differentiates between issues: -// * there is valid data on input, but it was not accepted by the function -// * there is an invalid UTF8 character on input -// * the end of the file was reached. -func (p *P) Expects(description string) { - p.expecting = description -} diff --git a/statehandler_on.go b/statehandler_on.go index 3ac0c0c..37841ee 100644 --- a/statehandler_on.go +++ b/statehandler_on.go @@ -26,11 +26,15 @@ package parsekit // // You can omit "what to do with the match" and go straight into a routing // method, e.g. +// // On(...).RouteTo(...) +// // This is functionally the same as using +// // On(...).Stay().RouteTo(...). // // Here's a complete example chain: +// // p.On(something).Accept().RouteTo(stateB).ThenTo(stateC).End() func (p *P) On(matcher Matcher) *matchAction { m := &MatchDialog{p: p} @@ -44,16 +48,18 @@ func (p *P) On(matcher Matcher) *matchAction { // if p.On(somethingBad).End() { // p.Errorf("This was bad: %s", p.LastMatch) // } - p.LastMatch = string(m.runes) + p.LastMatch = string(m.input) return &matchAction{ routeAction: routeAction{chainAction{p, ok}}, - runes: m.runes, - widths: m.widths, + input: m.input, + output: m.output, + inputPos: p.inputPos + m.inputOffset, } } // chainAction is used for building method chains for the On() method. +// Every element of the method chain embeds this struct. type chainAction struct { p *P ok bool @@ -64,3 +70,119 @@ type chainAction struct { func (a *chainAction) End() bool { return a.ok } + +// matchAction is a struct that is used for building On()-method chains. 
+// +// It embeds the routeAction struct, to make it possible to go right into +// a route action, which is basically a simple way of aliasing a chain +// like p.On(...).Stay().RouteTo(...) into p.On(...).RouteTo(...). +type matchAction struct { + routeAction + input []rune + output []rune + inputPos int +} + +// Accept tells the parser to move the cursor past a match that was found, +// and to store the input that matched in the string buffer. +// When no match was found, then no action is taken. +// It returns a routeAction struct, which provides methods that can be used +// to tell the parser what state to go to next. +func (a *matchAction) Accept() *routeAction { + if a.ok { + a.p.buffer.writeString(string(a.output)) + a.advanceCursor() + } + return &routeAction{chainAction: chainAction{a.p, a.ok}} +} + +// Skip tells the parser to move the cursor past a match that was found, +// without storing the actual match in the string buffer. +// Returns true in case a match was found. +// When no match was found, then no action is taken and false is returned. +func (a *matchAction) Skip() *routeAction { + if a.ok { + a.advanceCursor() + } + return &routeAction{chainAction: chainAction{a.p, a.ok}} +} + +// advanceCursor advances the rune cursor past the matched input. +// While doing so, it keeps track of newlines, so we can report on +// row + column positions on error. +func (a *matchAction) advanceCursor() { + a.p.inputPos = a.inputPos + for _, r := range a.input { + if a.p.newline { + a.p.cursorLine++ + a.p.cursorColumn = 1 + } else { + a.p.cursorColumn++ + } + a.p.newline = r == '\n' + } +} + +// Stay tells the parser to not move the cursor after finding a match. +// Returns true in case a match was found, false otherwise. +func (a *matchAction) Stay() *routeAction { + return &routeAction{chainAction: chainAction{a.p, a.ok}} +} + +// routeAction is a struct that is used for building On() method chains.
+type routeAction struct { + chainAction +} + +// RouteRepeat indicates that on the next parsing cycle, +// the current StateHandler must be reinvoked. +func (a *routeAction) RouteRepeat() *chainAction { + if a.ok { + return a.p.RouteRepeat() + } + return &chainAction{nil, false} +} + +// RouteTo tells the parser what StateHandler function to invoke +// in the next parsing cycle. +func (a *routeAction) RouteTo(state StateHandler) *routeFollowupAction { + if a.ok { + return a.p.RouteTo(state) + } + return &routeFollowupAction{chainAction: chainAction{nil, false}} +} + +// RouteReturn tells the parser that on the next cycle the next scheduled +// route must be invoked. +func (a *routeAction) RouteReturn() *chainAction { + if a.ok { + return a.p.RouteReturn() + } + return &chainAction{nil, false} +} + +// routeFollowupAction chains parsing routes. +// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). +type routeFollowupAction struct { + chainAction +} + +// ThenTo schedules a StateHandler that must be invoked after the RouteTo +// StateHandler has been completed. +// For example: p.RouteTo(handlerA).ThenTo(handlerB) +func (a *routeFollowupAction) ThenTo(state StateHandler) *chainAction { + if a.ok { + a.p.pushRoute(state) + } + return &chainAction{nil, a.ok} +} + +// ThenReturnHere schedules the current StateHandler to be invoked after +// the RouteTo StateHandler has been completed. +// For example: p.RouteTo(handlerA).ThenReturnHere() +func (a *routeFollowupAction) ThenReturnHere() *chainAction { + if a.ok { + a.p.pushRoute(a.p.state) + } + return &chainAction{nil, a.ok} +} diff --git a/statehandler_on_match.go b/statehandler_on_match.go deleted file mode 100644 index 8beb6cb..0000000 --- a/statehandler_on_match.go +++ /dev/null @@ -1,60 +0,0 @@ -package parsekit - -// matchAction is a struct that is used for building On()-method chains. 
-// -// It embeds the routeAction struct, to make it possible to go right into -// a route action, which is basically a simple way of aliasing a chain -// like p.On(...).Stay().RouteTo(...) into p.On(...).RouteTo(...). -type matchAction struct { - routeAction - runes []rune - widths []int -} - -// Accept tells the parser to move the cursor past a match that was found, -// and to store the input that matched in the string buffer. -// When no match was found, then no action is taken. -// It returns a routeAction struct, which provides methods that can be used -// to tell the parser what state to go to next. -func (a *matchAction) Accept() *routeAction { - if a.ok { - for i, r := range a.runes { - a.p.buffer.writeRune(r) - a.p.advanceCursor(r, a.widths[i]) - } - } - return &routeAction{chainAction: chainAction{a.p, a.ok}} -} - -// Skip tells the parser to move the cursor past a match that was found, -// without storing the actual match in the string buffer. -// Returns true in case a match was found. -// When no match was found, then no action is taken and false is returned. -func (a *matchAction) Skip() *routeAction { - if a.ok { - for i, r := range a.runes { - a.p.advanceCursor(r, a.widths[i]) - } - } - return &routeAction{chainAction: chainAction{a.p, a.ok}} -} - -// Stay tells the parser to not move the cursor after finding a match. -// Returns true in case a match was found, false otherwise. -func (a *matchAction) Stay() *routeAction { - return &routeAction{chainAction: chainAction{a.p, a.ok}} -} - -// advanceCursor advances the rune cursor one position in the input data. -// While doing so, it keeps tracks of newlines, so we can report on -// row + column positions on error. 
-func (p *P) advanceCursor(r rune, w int) { - p.pos += w - if p.newline { - p.cursorLine++ - p.cursorColumn = 1 - } else { - p.cursorColumn++ - } - p.newline = r == '\n' -} diff --git a/statehandler_on_route.go b/statehandler_on_route.go deleted file mode 100644 index 26f927c..0000000 --- a/statehandler_on_route.go +++ /dev/null @@ -1,59 +0,0 @@ -package parsekit - -// routeAction is a struct that is used for building On() method chains. -type routeAction struct { - chainAction -} - -// RouteRepeat indicates that on the next parsing cycle, -// the current StateHandler must be reinvoked. -func (a *routeAction) RouteRepeat() *chainAction { - if a.ok { - return a.p.RouteRepeat() - } - return &chainAction{nil, false} -} - -// RouteTo tells the parser what StateHandler function to invoke -// in the next parsing cycle. -func (a *routeAction) RouteTo(state StateHandler) *routeFollowupAction { - if a.ok { - return a.p.RouteTo(state) - } - return &routeFollowupAction{chainAction: chainAction{nil, false}} -} - -// RouteReturn tells the parser that on the next cycle the next scheduled -// route must be invoked. -func (a *routeAction) RouteReturn() *chainAction { - if a.ok { - return a.p.RouteReturn() - } - return &chainAction{nil, false} -} - -// routeFollowupAction chains parsing routes. -// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). -type routeFollowupAction struct { - chainAction -} - -// ThenTo schedules a StateHandler that must be invoked after the RouteTo -// StateHandler has been completed. -// For example: p.RouteTo(handlerA).ThenTo(handlerB) -func (a *routeFollowupAction) ThenTo(state StateHandler) *chainAction { - if a.ok { - a.p.pushRoute(state) - } - return &chainAction{nil, a.ok} -} - -// ThenReturnHere schedules the current StateHandler to be invoked after -// the RouteTo StateHandler has been completed. 
-// For example: p.RouteTo(handlerA).ThenReturnHere() -func (a *routeFollowupAction) ThenReturnHere() *chainAction { - if a.ok { - a.p.pushRoute(a.p.state) - } - return &chainAction{nil, a.ok} -} diff --git a/statehandler_routing.go b/statehandler_routing.go deleted file mode 100644 index 9142da9..0000000 --- a/statehandler_routing.go +++ /dev/null @@ -1,42 +0,0 @@ -package parsekit - -// RouteTo tells the parser what StateHandler function to invoke -// in the next parsing cycle. -func (p *P) RouteTo(state StateHandler) *routeFollowupAction { - p.nextState = state - return &routeFollowupAction{chainAction: chainAction{p, true}} -} - -// RouteRepeat indicates that on the next parsing cycle, the current -// StateHandler must be reinvoked. -func (p *P) RouteRepeat() *chainAction { - p.RouteTo(p.state) - return &chainAction{nil, true} -} - -// RouteReturn tells the parser that on the next cycle the last -// StateHandler that was pushed on the route stack must be invoked. -// -// Using this method is optional. When implementating a StateHandler that -// is used as a sort of subroutine (using constructions like -// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from -// providing an explicit routing decision from that handler. The parser will -// automatically assume a RouteReturn() in that case. -func (p *P) RouteReturn() *chainAction { - p.nextState = p.popRoute() - return &chainAction{nil, true} -} - -// pushRoute adds the StateHandler to the route stack. -// This is used for implementing nested parsing. -func (p *P) pushRoute(state StateHandler) { - p.routeStack = append(p.routeStack, state) -} - -// popRoute pops the last pushed StateHandler from the route stack. -func (p *P) popRoute() StateHandler { - last := len(p.routeStack) - 1 - head, tail := p.routeStack[:last], p.routeStack[last] - p.routeStack = head - return tail -}