From 6fe3c16a6d3af44441ea8afb37f4ed61a538590e Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Fri, 24 May 2019 15:57:54 +0000 Subject: [PATCH] Added some more modifiers (ModifyReplace and ModifyByCallback). --- matcher_builtin.go | 305 +++++++++++++++++++++------------------- matcher_builtin_test.go | 114 +++++++++++---- parsekit.go | 2 +- statehandler.go | 4 +- statehandler_on.go | 6 +- 5 files changed, 255 insertions(+), 176 deletions(-) diff --git a/matcher_builtin.go b/matcher_builtin.go index f2e61dc..da22837 100644 --- a/matcher_builtin.go +++ b/matcher_builtin.go @@ -51,145 +51,6 @@ var C = struct { Separated: MatchSeparated, } -// A provides convenient access to a range of atoms that can be used to -// build combinators or parsing rules. -// -// In parsekit, an atom is defined as a ready to go Matcher function. -// -// When using A in your own parser, then it is advised to create -// a variable in your own package to reference it: -// -// var a = parsekit.A -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
-var A = struct { - EndOfFile Matcher - AnyRune Matcher - Space Matcher - Tab Matcher - CR Matcher - LF Matcher - CRLF Matcher - Excl Matcher - DoubleQuote Matcher - Hash Matcher - Dollar Matcher - Percent Matcher - Amp Matcher - SingleQuote Matcher - RoundOpen Matcher - RoundClose Matcher - Asterisk Matcher - Plus Matcher - Comma Matcher - Minus Matcher - Dot Matcher - Slash Matcher - Colon Matcher - Semicolon Matcher - AngleOpen Matcher - Equal Matcher - AngleClose Matcher - Question Matcher - At Matcher - SquareOpen Matcher - Backslash Matcher - SquareClose Matcher - Caret Matcher - Underscore Matcher - Backquote Matcher - CurlyOpen Matcher - Pipe Matcher - CurlyClose Matcher - Tilde Matcher - Newline Matcher - Whitespace Matcher - WhitespaceAndNewlines Matcher - EndOfLine Matcher - Digit Matcher - ASCII Matcher - ASCIILower Matcher - ASCIIUpper Matcher - HexDigit Matcher -}{ - EndOfFile: MatchEndOfFile(), - AnyRune: MatchAnyRune(), - Space: C.Rune(' '), - Tab: C.Rune('\t'), - CR: C.Rune('\r'), - LF: C.Rune('\n'), - CRLF: C.Str("\r\n"), - Excl: C.Rune('!'), - DoubleQuote: C.Rune('"'), - Hash: C.Rune('#'), - Dollar: C.Rune('$'), - Percent: C.Rune('%'), - Amp: C.Rune('&'), - SingleQuote: C.Rune('\''), - RoundOpen: C.Rune('('), - RoundClose: C.Rune(')'), - Asterisk: C.Rune('*'), - Plus: C.Rune('+'), - Comma: C.Rune(','), - Minus: C.Rune('-'), - Dot: C.Rune('.'), - Slash: C.Rune('/'), - Colon: C.Rune(':'), - Semicolon: C.Rune(';'), - AngleOpen: C.Rune('<'), - Equal: C.Rune('='), - AngleClose: C.Rune('>'), - Question: C.Rune('?'), - At: C.Rune('@'), - SquareOpen: C.Rune('['), - Backslash: C.Rune('\\'), - SquareClose: C.Rune(']'), - Caret: C.Rune('^'), - Underscore: C.Rune('_'), - Backquote: C.Rune('`'), - CurlyOpen: C.Rune('{'), - Pipe: C.Rune('|'), - CurlyClose: C.Rune('}'), - Tilde: C.Rune('~'), - Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), - WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), - 
EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), - Digit: C.RuneRange('0', '9'), - ASCII: C.RuneRange('\x00', '\x7F'), - ASCIILower: C.RuneRange('a', 'z'), - ASCIIUpper: C.RuneRange('A', 'Z'), - HexDigit: C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), -} - -// M provides convenient access to a range of modifiers that can be -// used when creating Matcher functions. -// -// In parsekit, a modifier is defined as a Matcher function that modifies the -// resulting output of another Matcher in some way. It does not do any matching -// against input of its own. -// -// When using M in your own parser, then it is advised to create -// a variable in your own package to reference it: -// -// var m = parsekit.M -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. -var M = struct { - Drop func(Matcher) Matcher - Trim func(Matcher, string) Matcher - TrimLeft func(Matcher, string) Matcher - TrimRight func(Matcher, string) Matcher - ToLower func(Matcher) Matcher - ToUpper func(Matcher) Matcher -}{ - Drop: ModifyDrop, - Trim: ModifyTrim, - TrimLeft: ModifyTrimLeft, - TrimRight: ModifyTrimRight, - ToLower: ModifyToLower, - ToUpper: ModifyToUpper, -} - // MatchRune creates a Matcher function that checks if the next rune from // the input matches the provided rune. func MatchRune(expected rune) Matcher { @@ -417,6 +278,116 @@ func MatchSeparated(separated Matcher, separator Matcher) Matcher { return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated))) } +// A provides convenient access to a range of atoms that can be used to +// build combinators or parsing rules. +// +// In parsekit, an atom is defined as a ready to go Matcher function. +// +// When using A in your own parser, then it is advised to create +// a variable in your own package to reference it: +// +// var a = parsekit.A +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
+var A = struct { + EndOfFile Matcher + AnyRune Matcher + Space Matcher + Tab Matcher + CR Matcher + LF Matcher + CRLF Matcher + Excl Matcher + DoubleQuote Matcher + Hash Matcher + Dollar Matcher + Percent Matcher + Amp Matcher + SingleQuote Matcher + RoundOpen Matcher + RoundClose Matcher + Asterisk Matcher + Plus Matcher + Comma Matcher + Minus Matcher + Dot Matcher + Slash Matcher + Colon Matcher + Semicolon Matcher + AngleOpen Matcher + Equal Matcher + AngleClose Matcher + Question Matcher + At Matcher + SquareOpen Matcher + Backslash Matcher + SquareClose Matcher + Caret Matcher + Underscore Matcher + Backquote Matcher + CurlyOpen Matcher + Pipe Matcher + CurlyClose Matcher + Tilde Matcher + Newline Matcher /* NOTE(review): the literal below has no Newline entry, so this field stays nil unless it is assigned elsewhere - confirm. */ + Whitespace Matcher + WhitespaceAndNewlines Matcher + EndOfLine Matcher + Digit Matcher + ASCII Matcher + ASCIILower Matcher + ASCIIUpper Matcher + HexDigit Matcher +}{ + EndOfFile: MatchEndOfFile(), + AnyRune: MatchAnyRune(), + Space: C.Rune(' '), + Tab: C.Rune('\t'), + CR: C.Rune('\r'), + LF: C.Rune('\n'), + CRLF: C.Str("\r\n"), + Excl: C.Rune('!'), + DoubleQuote: C.Rune('"'), + Hash: C.Rune('#'), + Dollar: C.Rune('$'), + Percent: C.Rune('%'), + Amp: C.Rune('&'), + SingleQuote: C.Rune('\''), + RoundOpen: C.Rune('('), + RoundClose: C.Rune(')'), + Asterisk: C.Rune('*'), + Plus: C.Rune('+'), + Comma: C.Rune(','), + Minus: C.Rune('-'), + Dot: C.Rune('.'), + Slash: C.Rune('/'), + Colon: C.Rune(':'), + Semicolon: C.Rune(';'), + AngleOpen: C.Rune('<'), + Equal: C.Rune('='), + AngleClose: C.Rune('>'), + Question: C.Rune('?'), + At: C.Rune('@'), + SquareOpen: C.Rune('['), + Backslash: C.Rune('\\'), + SquareClose: C.Rune(']'), + Caret: C.Rune('^'), + Underscore: C.Rune('_'), + Backquote: C.Rune('`'), + CurlyOpen: C.Rune('{'), + Pipe: C.Rune('|'), + CurlyClose: C.Rune('}'), + Tilde: C.Rune('~'), + Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), + WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), + 
EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), + Digit: C.RuneRange('0', '9'), + ASCII: C.RuneRange('\x00', '\x7F'), + ASCIILower: C.RuneRange('a', 'z'), + ASCIIUpper: C.RuneRange('A', 'Z'), + HexDigit: C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), +} + // MatchEndOfFile creates a Matcher that checks if the end of the input data // has been reached. This Matcher will never produce output. It only reports // a successful or a failing match through its boolean return value. @@ -442,6 +413,39 @@ func MatchAnyRune() Matcher { } } +// M provides convenient access to a range of modifiers that can be +// used when creating Matcher functions. +// +// In parsekit, a modifier is defined as a Matcher function that modifies the +// resulting output of another Matcher in some way. It does not do any matching +// against input of its own. +// +// When using M in your own parser, then it is advised to create +// a variable in your own package to reference it: +// +// var m = parsekit.M +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. +var M = struct { + Drop func(Matcher) Matcher + Trim func(Matcher, string) Matcher + TrimLeft func(Matcher, string) Matcher + TrimRight func(Matcher, string) Matcher + ToLower func(Matcher) Matcher + ToUpper func(Matcher) Matcher + Replace func(Matcher, string) Matcher + ModifyByCallback func(Matcher, func(string) string) Matcher +}{ + Drop: ModifyDrop, + Trim: ModifyTrim, + TrimLeft: ModifyTrimLeft, + TrimRight: ModifyTrimRight, + ToLower: ModifyToLower, + ToUpper: ModifyToUpper, + Replace: ModifyReplace, + ModifyByCallback: ModifyByCallback, +} + // ModifyDrop creates a Matcher that checks if the provided Matcher applies. // If it does, then its output is discarded completely. // @@ -459,7 +463,7 @@ func MatchAnyRune() Matcher { // string "bork" would not match against the second form, but " bork" would. // In both cases, it would match the first form. 
func ModifyDrop(matcher Matcher) Matcher { - return modifyStrCallback(matcher, func(s string) string { + return ModifyByCallback(matcher, func(s string) string { return "" }) } @@ -495,24 +499,37 @@ func modifyTrim(matcher Matcher, cutset string, trimLeft bool, trimRight bool) M } return s } - return modifyStrCallback(matcher, modfunc) + return ModifyByCallback(matcher, modfunc) } // ModifyToUpper creates a Matcher that checks if the provided Matcher applies. // If it does, then its output is taken and characters from the provided // cutset are converted into upper case. func ModifyToUpper(matcher Matcher) Matcher { - return modifyStrCallback(matcher, strings.ToUpper) + return ModifyByCallback(matcher, strings.ToUpper) } // ModifyToLower creates a Matcher that checks if the provided Matcher applies. // If it does, then its output is taken and characters from the provided // cutset are converted into lower case. func ModifyToLower(matcher Matcher) Matcher { - return modifyStrCallback(matcher, strings.ToLower) + return ModifyByCallback(matcher, strings.ToLower) } -func modifyStrCallback(matcher Matcher, modfunc func(string) string) Matcher { +// ModifyReplace creates a Matcher that checks if the provided Matcher applies. +// If it does, then its output is replaced by the provided string. +func ModifyReplace(matcher Matcher, s string) Matcher { + return ModifyByCallback(matcher, func(string) string { + return s + }) +} + +// ModifyByCallback creates a Matcher that checks if the provided Matcher applies. +// If it does, then its output is passed to the provided modfunc, a function +// that takes a string as input and returns a possibly modified string as +// output. The return value of the modfunc replaces the output of the provided +// Matcher. 
+func ModifyByCallback(matcher Matcher, modfunc func(string) string) Matcher { return func(m *MatchDialog) bool { child := m.Fork() if matcher(child) { diff --git a/matcher_builtin_test.go b/matcher_builtin_test.go index 4412a27..b12c123 100644 --- a/matcher_builtin_test.go +++ b/matcher_builtin_test.go @@ -7,21 +7,6 @@ import ( "git.makaay.nl/mauricem/go-parsekit" ) -func ExampleMatchAnyRune() { - parser := parsekit.New( - func(p *parsekit.P) { - p.Expects("Any valid rune") - if p.On(a.AnyRune).Accept().End() { - p.EmitLiteral(TestItem) - } - }) - run := parser.Parse("¡Any / valid / character will dö!") - match, _, ok := run.Next() - if ok { - fmt.Printf("Match = %q\n", match) - } -} - func TestCombinators(t *testing.T) { RunMatcherTests(t, []MatcherTest{ {"xxx", c.Rune('x'), true, "x"}, @@ -93,17 +78,6 @@ func TestCombinators(t *testing.T) { }) } -func TestModifiers(t *testing.T) { - RunMatcherTests(t, []MatcherTest{ - {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, - {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, - {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, - {" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, - {" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, - {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"}, - }) -} - func TestAtoms(t *testing.T) { RunMatcherTests(t, []MatcherTest{ {"", a.EndOfFile, true, ""}, @@ -183,6 +157,51 @@ func TestAtoms(t *testing.T) { }) } +func TestModifiers(t *testing.T) { + RunMatcherTests(t, []MatcherTest{ + {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"}, + {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, + {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, + {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, + {" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, + {" \t trim \t ", 
m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, + {"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"}, + {"abcdefghijk", m.ModifyByCallback(c.Str("abc"), func(s string) string { return "X" }), true, "X"}, + {"NoTaLlUpPeR", m.ToUpper(c.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, + {"NoTaLlLoWeR", m.ToLower(c.StrNoCase("NOTALLlower")), true, "notalllower"}, + }) +} + +// I know, this is hell, but that's the whole point for this test :-> +func TestCombination(t *testing.T) { + demonic := c.Seq( + c.Opt(a.SquareOpen), + m.Trim( + c.Seq( + c.Opt(a.Whitespace), + c.Rep(3, a.AngleClose), + m.ModifyByCallback(c.OneOrMore(c.StrNoCase("hello")), func(s string) string { + return fmt.Sprintf("%d", len(s)) + }), + m.Replace(c.Separated(c.Opt(a.Whitespace), a.Comma), ", "), + m.ToUpper(c.Min(1, a.ASCIILower)), + m.Drop(a.Excl), + c.Rep(3, a.AngleOpen), + c.Opt(a.Whitespace), + ), + " \t", + ), + c.Opt(a.SquareClose), + ) + + RunMatcherTests(t, []MatcherTest{ + {"[ \t >>>Hello, world!<<< ]", demonic, true, "[>>>5, WORLD<<<]"}, + {"[ \t >>>Hello, world!<<< ", demonic, true, "[>>>5, WORLD<<<"}, + {">>>HellohellO, world!<<< ]", demonic, true, ">>>10, WORLD<<<]"}, + {"[ \t >>>HellohellO , , , world!<<< ", demonic, true, "[>>>10, WORLD<<<"}, + }) +} + func TestSequenceOfRunes(t *testing.T) { sequence := c.Seq( a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, @@ -206,3 +225,46 @@ func TestSequenceOfRunes(t *testing.T) { t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) } } + +func ExampleMatchAnyRune() { + handler := func(p *parsekit.P) { + p.Expects("Any valid rune") + if p.On(a.AnyRune).Accept().End() { + p.EmitLiteral(TestItem) + } + } + parser := parsekit.New(handler) + run := parser.Parse("¡Any / valid / character will dö!") + match, _, ok := run.Next() + + // This will output '¡', since a.AnyRune matches exactly 1 rune. 
+ if ok { + fmt.Printf("Match = %q\n", match) + } +} + +func ExampleModifyToUpper() { + // A Dutch postcode consists of 4 numbers and 2 letters (1234XX). + // The numbers never start with a zero. + digitNotZero := c.RuneRange('1', '9') + numbers := c.Seq(digitNotZero, c.Rep(3, a.Digit)) + + // It is good form to write the letters in upper case. + letter := c.Any(a.ASCIILower, a.ASCIIUpper) + letters := m.ToUpper(c.Seq(letter, letter)) + + // It is good form to use a single space between letters and numbers, + // but it is not mandatory. + space := m.Replace(c.Opt(a.Whitespace), " ") + + // With all the building blocks, we can now build the postcode parser. + postcode := c.Seq(numbers, space, letters) + + // Create a parser and let it parse some postcode inputs. + // This will print "1234 AB" for every input, because of the built-in normalization. + p := parsekit.New(postcode) + for _, input := range []string{"1234 AB", "1234AB", "1234 ab", "1234ab"} { + r, _, _ := p.Parse(input).Next() + fmt.Printf("Input: %q, output: %q\n", input, r.Value) + } +} diff --git a/parsekit.go b/parsekit.go index 65f52ec..8990542 100644 --- a/parsekit.go +++ b/parsekit.go @@ -57,7 +57,7 @@ func makeParserForStateHandler(handler StateHandler) *Parser { func makeParserForMatcher(matcher Matcher) *Parser { return New(StateHandler(func(p *P) { p.Expects("match") - if p.On(matcher).Accept().RouteRep().End() { + if p.On(matcher).Accept().RouteRepeat().End() { p.EmitLiteral(MatchedItem) } })) diff --git a/statehandler.go b/statehandler.go index 0452429..d8b91bf 100644 --- a/statehandler.go +++ b/statehandler.go @@ -93,9 +93,9 @@ func (p *P) RouteTo(state StateHandler) *routeFollowupAction { return &routeFollowupAction{chainAction: chainAction{p, true}} } -// RouteRep indicates that on the next parsing cycle, the current +// RouteRepeat indicates that on the next parsing cycle, the current // StateHandler must be reinvoked. 
-func (p *P) RouteRep() *chainAction { +func (p *P) RouteRepeat() *chainAction { p.RouteTo(p.state) return &chainAction{nil, true} } diff --git a/statehandler_on.go b/statehandler_on.go index ee19322..37841ee 100644 --- a/statehandler_on.go +++ b/statehandler_on.go @@ -134,11 +134,11 @@ type routeAction struct { chainAction } -// RouteRep indicates that on the next parsing cycle, +// RouteRepeat indicates that on the next parsing cycle, // the current StateHandler must be reinvoked. -func (a *routeAction) RouteRep() *chainAction { +func (a *routeAction) RouteRepeat() *chainAction { if a.ok { - return a.p.RouteRep() + return a.p.RouteRepeat() } return &chainAction{nil, false} }