From 6fe3c16a6d3af44441ea8afb37f4ed61a538590e Mon Sep 17 00:00:00 2001
From: Maurice Makaay <maurice@makaay.nl>
Date: Fri, 24 May 2019 15:57:54 +0000
Subject: [PATCH] Added some more modifiers (ModifyReplace and
 ModifyByCallback).

---
 matcher_builtin.go      | 305 +++++++++++++++++++++-------------------
 matcher_builtin_test.go | 114 +++++++++++----
 parsekit.go             |   2 +-
 statehandler.go         |   4 +-
 statehandler_on.go      |   6 +-
 5 files changed, 255 insertions(+), 176 deletions(-)

diff --git a/matcher_builtin.go b/matcher_builtin.go
index f2e61dc..da22837 100644
--- a/matcher_builtin.go
+++ b/matcher_builtin.go
@@ -51,145 +51,6 @@ var C = struct {
 	Separated:  MatchSeparated,
 }
 
-// A provides convenient access to a range of atoms that can be used to
-// build combinators or parsing rules.
-//
-// In parsekit, an atom is defined as a ready to go Matcher function.
-//
-// When using A in your own parser, then it is advised to create
-// a variable in your own package to reference it:
-//
-//     var a = parsekit.A
-//
-// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
-var A = struct {
-	EndOfFile             Matcher
-	AnyRune               Matcher
-	Space                 Matcher
-	Tab                   Matcher
-	CR                    Matcher
-	LF                    Matcher
-	CRLF                  Matcher
-	Excl                  Matcher
-	DoubleQuote           Matcher
-	Hash                  Matcher
-	Dollar                Matcher
-	Percent               Matcher
-	Amp                   Matcher
-	SingleQuote           Matcher
-	RoundOpen             Matcher
-	RoundClose            Matcher
-	Asterisk              Matcher
-	Plus                  Matcher
-	Comma                 Matcher
-	Minus                 Matcher
-	Dot                   Matcher
-	Slash                 Matcher
-	Colon                 Matcher
-	Semicolon             Matcher
-	AngleOpen             Matcher
-	Equal                 Matcher
-	AngleClose            Matcher
-	Question              Matcher
-	At                    Matcher
-	SquareOpen            Matcher
-	Backslash             Matcher
-	SquareClose           Matcher
-	Caret                 Matcher
-	Underscore            Matcher
-	Backquote             Matcher
-	CurlyOpen             Matcher
-	Pipe                  Matcher
-	CurlyClose            Matcher
-	Tilde                 Matcher
-	Newline               Matcher
-	Whitespace            Matcher
-	WhitespaceAndNewlines Matcher
-	EndOfLine             Matcher
-	Digit                 Matcher
-	ASCII                 Matcher
-	ASCIILower            Matcher
-	ASCIIUpper            Matcher
-	HexDigit              Matcher
-}{
-	EndOfFile:             MatchEndOfFile(),
-	AnyRune:               MatchAnyRune(),
-	Space:                 C.Rune(' '),
-	Tab:                   C.Rune('\t'),
-	CR:                    C.Rune('\r'),
-	LF:                    C.Rune('\n'),
-	CRLF:                  C.Str("\r\n"),
-	Excl:                  C.Rune('!'),
-	DoubleQuote:           C.Rune('"'),
-	Hash:                  C.Rune('#'),
-	Dollar:                C.Rune('$'),
-	Percent:               C.Rune('%'),
-	Amp:                   C.Rune('&'),
-	SingleQuote:           C.Rune('\''),
-	RoundOpen:             C.Rune('('),
-	RoundClose:            C.Rune(')'),
-	Asterisk:              C.Rune('*'),
-	Plus:                  C.Rune('+'),
-	Comma:                 C.Rune(','),
-	Minus:                 C.Rune('-'),
-	Dot:                   C.Rune('.'),
-	Slash:                 C.Rune('/'),
-	Colon:                 C.Rune(':'),
-	Semicolon:             C.Rune(';'),
-	AngleOpen:             C.Rune('<'),
-	Equal:                 C.Rune('='),
-	AngleClose:            C.Rune('>'),
-	Question:              C.Rune('?'),
-	At:                    C.Rune('@'),
-	SquareOpen:            C.Rune('['),
-	Backslash:             C.Rune('\\'),
-	SquareClose:           C.Rune(']'),
-	Caret:                 C.Rune('^'),
-	Underscore:            C.Rune('_'),
-	Backquote:             C.Rune('`'),
-	CurlyOpen:             C.Rune('{'),
-	Pipe:                  C.Rune('|'),
-	CurlyClose:            C.Rune('}'),
-	Tilde:                 C.Rune('~'),
-	Whitespace:            C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))),
-	WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))),
-	EndOfLine:             C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()),
-	Digit:                 C.RuneRange('0', '9'),
-	ASCII:                 C.RuneRange('\x00', '\x7F'),
-	ASCIILower:            C.RuneRange('a', 'z'),
-	ASCIIUpper:            C.RuneRange('A', 'Z'),
-	HexDigit:              C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')),
-}
-
-// M provides convenient access to a range of modifiers that can be
-// used when creating Matcher functions.
-//
-// In parsekit, a modifier is defined as a Matcher function that modifies the
-// resulting output of another Matcher in some way. It does not do any matching
-// against input of its own.
-//
-// When using M in your own parser, then it is advised to create
-// a variable in your own package to reference it:
-//
-//     var m = parsekit.M
-//
-// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
-var M = struct {
-	Drop      func(Matcher) Matcher
-	Trim      func(Matcher, string) Matcher
-	TrimLeft  func(Matcher, string) Matcher
-	TrimRight func(Matcher, string) Matcher
-	ToLower   func(Matcher) Matcher
-	ToUpper   func(Matcher) Matcher
-}{
-	Drop:      ModifyDrop,
-	Trim:      ModifyTrim,
-	TrimLeft:  ModifyTrimLeft,
-	TrimRight: ModifyTrimRight,
-	ToLower:   ModifyToLower,
-	ToUpper:   ModifyToUpper,
-}
-
 // MatchRune creates a Matcher function that checks if the next rune from
 // the input matches the provided rune.
 func MatchRune(expected rune) Matcher {
@@ -417,6 +278,116 @@ func MatchSeparated(separated Matcher, separator Matcher) Matcher {
 	return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated)))
 }
 
+// A provides convenient access to a range of atoms that can be used to
+// build combinators or parsing rules.
+//
+// In parsekit, an atom is defined as a ready to go Matcher function.
+//
+// When using A in your own parser, then it is advised to create
+// a variable in your own package to reference it:
+//
+//     var a = parsekit.A
+//
+// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
+var A = struct {
+	EndOfFile             Matcher
+	AnyRune               Matcher
+	Space                 Matcher
+	Tab                   Matcher
+	CR                    Matcher
+	LF                    Matcher
+	CRLF                  Matcher
+	Excl                  Matcher
+	DoubleQuote           Matcher
+	Hash                  Matcher
+	Dollar                Matcher
+	Percent               Matcher
+	Amp                   Matcher
+	SingleQuote           Matcher
+	RoundOpen             Matcher
+	RoundClose            Matcher
+	Asterisk              Matcher
+	Plus                  Matcher
+	Comma                 Matcher
+	Minus                 Matcher
+	Dot                   Matcher
+	Slash                 Matcher
+	Colon                 Matcher
+	Semicolon             Matcher
+	AngleOpen             Matcher
+	Equal                 Matcher
+	AngleClose            Matcher
+	Question              Matcher
+	At                    Matcher
+	SquareOpen            Matcher
+	Backslash             Matcher
+	SquareClose           Matcher
+	Caret                 Matcher
+	Underscore            Matcher
+	Backquote             Matcher
+	CurlyOpen             Matcher
+	Pipe                  Matcher
+	CurlyClose            Matcher
+	Tilde                 Matcher
+	Newline               Matcher
+	Whitespace            Matcher
+	WhitespaceAndNewlines Matcher
+	EndOfLine             Matcher
+	Digit                 Matcher
+	ASCII                 Matcher
+	ASCIILower            Matcher
+	ASCIIUpper            Matcher
+	HexDigit              Matcher
+}{
+	EndOfFile:             MatchEndOfFile(),
+	AnyRune:               MatchAnyRune(),
+	Space:                 C.Rune(' '),
+	Tab:                   C.Rune('\t'),
+	CR:                    C.Rune('\r'),
+	LF:                    C.Rune('\n'),
+	CRLF:                  C.Str("\r\n"),
+	Excl:                  C.Rune('!'),
+	DoubleQuote:           C.Rune('"'),
+	Hash:                  C.Rune('#'),
+	Dollar:                C.Rune('$'),
+	Percent:               C.Rune('%'),
+	Amp:                   C.Rune('&'),
+	SingleQuote:           C.Rune('\''),
+	RoundOpen:             C.Rune('('),
+	RoundClose:            C.Rune(')'),
+	Asterisk:              C.Rune('*'),
+	Plus:                  C.Rune('+'),
+	Comma:                 C.Rune(','),
+	Minus:                 C.Rune('-'),
+	Dot:                   C.Rune('.'),
+	Slash:                 C.Rune('/'),
+	Colon:                 C.Rune(':'),
+	Semicolon:             C.Rune(';'),
+	AngleOpen:             C.Rune('<'),
+	Equal:                 C.Rune('='),
+	AngleClose:            C.Rune('>'),
+	Question:              C.Rune('?'),
+	At:                    C.Rune('@'),
+	SquareOpen:            C.Rune('['),
+	Backslash:             C.Rune('\\'),
+	SquareClose:           C.Rune(']'),
+	Caret:                 C.Rune('^'),
+	Underscore:            C.Rune('_'),
+	Backquote:             C.Rune('`'),
+	CurlyOpen:             C.Rune('{'),
+	Pipe:                  C.Rune('|'),
+	CurlyClose:            C.Rune('}'),
+	Tilde:                 C.Rune('~'),
+	Whitespace:            C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))),
+	WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))),
+	EndOfLine:             C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()),
+	Digit:                 C.RuneRange('0', '9'),
+	ASCII:                 C.RuneRange('\x00', '\x7F'),
+	ASCIILower:            C.RuneRange('a', 'z'),
+	ASCIIUpper:            C.RuneRange('A', 'Z'),
+	HexDigit:              C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')),
+}
+
 // MatchEndOfFile creates a Matcher that checks if the end of the input data
 // has been reached. This Matcher will never produce output. It only reports
 // a successful or a failing match through its boolean return value.
@@ -442,6 +413,39 @@ func MatchAnyRune() Matcher {
 	}
 }
 
+// M provides convenient access to a range of modifiers that can be
+// used when creating Matcher functions.
+//
+// In parsekit, a modifier is defined as a Matcher function that modifies the
+// resulting output of another Matcher in some way. It does not do any matching
+// against input of its own.
+//
+// When using M in your own parser, then it is advised to create
+// a variable in your own package to reference it:
+//
+//     var m = parsekit.M
+//
+// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
+var M = struct {
+	Drop             func(Matcher) Matcher
+	Trim             func(Matcher, string) Matcher
+	TrimLeft         func(Matcher, string) Matcher
+	TrimRight        func(Matcher, string) Matcher
+	ToLower          func(Matcher) Matcher
+	ToUpper          func(Matcher) Matcher
+	Replace          func(Matcher, string) Matcher
+	ModifyByCallback func(Matcher, func(string) string) Matcher
+}{
+	Drop:             ModifyDrop,
+	Trim:             ModifyTrim,
+	TrimLeft:         ModifyTrimLeft,
+	TrimRight:        ModifyTrimRight,
+	ToLower:          ModifyToLower,
+	ToUpper:          ModifyToUpper,
+	Replace:          ModifyReplace,
+	ModifyByCallback: ModifyByCallback,
+}
+
 // ModifyDrop creates a Matcher that checks if the provided Matcher applies.
 // If it does, then its output is discarded completely.
 //
@@ -459,7 +463,7 @@ func MatchAnyRune() Matcher {
 // string "bork" would not match against the second form, but " bork" would.
 // In both cases, it would match the first form.
 func ModifyDrop(matcher Matcher) Matcher {
-	return modifyStrCallback(matcher, func(s string) string {
+	return ModifyByCallback(matcher, func(s string) string {
 		return ""
 	})
 }
@@ -495,24 +499,37 @@ func modifyTrim(matcher Matcher, cutset string, trimLeft bool, trimRight bool) M
 		}
 		return s
 	}
-	return modifyStrCallback(matcher, modfunc)
+	return ModifyByCallback(matcher, modfunc)
 }
 
 // ModifyToUpper creates a Matcher that checks if the provided Matcher applies.
 // If it does, then its output is taken and characters from the provided
 // cutset are converted into upper case.
 func ModifyToUpper(matcher Matcher) Matcher {
-	return modifyStrCallback(matcher, strings.ToUpper)
+	return ModifyByCallback(matcher, strings.ToUpper)
 }
 
 // ModifyToLower creates a Matcher that checks if the provided Matcher applies.
 // If it does, then its output is taken and characters from the provided
 // cutset are converted into lower case.
 func ModifyToLower(matcher Matcher) Matcher {
-	return modifyStrCallback(matcher, strings.ToLower)
+	return ModifyByCallback(matcher, strings.ToLower)
 }
 
-func modifyStrCallback(matcher Matcher, modfunc func(string) string) Matcher {
+// ModifyReplace creates a Matcher that checks if the provided Matcher applies.
+// If it does, then its output is replaced by the provided string.
+func ModifyReplace(matcher Matcher, s string) Matcher {
+	return ModifyByCallback(matcher, func(string) string {
+		return s
+	})
+}
+
+// ModifyByCallback creates a Matcher that checks if the provided matcher applies.
+// If it does, then its output is taken and it is fed to the provided modfunc.
+// This is a simple function that takes a string on input and returns a possibly
+// modified string on output. The return value of the modfunc will replace the
+// resulting output.
+func ModifyByCallback(matcher Matcher, modfunc func(string) string) Matcher {
 	return func(m *MatchDialog) bool {
 		child := m.Fork()
 		if matcher(child) {
diff --git a/matcher_builtin_test.go b/matcher_builtin_test.go
index 4412a27..b12c123 100644
--- a/matcher_builtin_test.go
+++ b/matcher_builtin_test.go
@@ -7,21 +7,6 @@ import (
 	"git.makaay.nl/mauricem/go-parsekit"
 )
 
-func ExampleMatchAnyRune() {
-	parser := parsekit.New(
-		func(p *parsekit.P) {
-			p.Expects("Any valid rune")
-			if p.On(a.AnyRune).Accept().End() {
-				p.EmitLiteral(TestItem)
-			}
-		})
-	run := parser.Parse("¡Any / valid / character will dö!")
-	match, _, ok := run.Next()
-	if ok {
-		fmt.Printf("Match = %q\n", match)
-	}
-}
-
 func TestCombinators(t *testing.T) {
 	RunMatcherTests(t, []MatcherTest{
 		{"xxx", c.Rune('x'), true, "x"},
@@ -93,17 +78,6 @@ func TestCombinators(t *testing.T) {
 	})
 }
 
-func TestModifiers(t *testing.T) {
-	RunMatcherTests(t, []MatcherTest{
-		{"  trim  ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
-		{" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"},
-		{"  trim  ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim  "},
-		{"  trim  ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "  trim"},
-		{" \t  trim  \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t  trim"},
-		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"},
-	})
-}
-
 func TestAtoms(t *testing.T) {
 	RunMatcherTests(t, []MatcherTest{
 		{"", a.EndOfFile, true, ""},
@@ -183,6 +157,51 @@ func TestAtoms(t *testing.T) {
 	})
 }
 
+func TestModifiers(t *testing.T) {
+	RunMatcherTests(t, []MatcherTest{
+		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"},
+		{"  trim  ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
+		{" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"},
+		{"  trim  ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim  "},
+		{"  trim  ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "  trim"},
+		{" \t  trim  \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t  trim"},
+		{"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"},
+		{"abcdefghijk", m.ModifyByCallback(c.Str("abc"), func(s string) string { return "X" }), true, "X"},
+		{"NoTaLlUpPeR", m.ToUpper(c.StrNoCase("notallUPPER")), true, "NOTALLUPPER"},
+		{"NoTaLlLoWeR", m.ToLower(c.StrNoCase("NOTALLlower")), true, "notalllower"},
+	})
+}
+
+// I know, this is hell, but that's the whole point for this test :->
+func TestCombination(t *testing.T) {
+	demonic := c.Seq(
+		c.Opt(a.SquareOpen),
+		m.Trim(
+			c.Seq(
+				c.Opt(a.Whitespace),
+				c.Rep(3, a.AngleClose),
+				m.ModifyByCallback(c.OneOrMore(c.StrNoCase("hello")), func(s string) string {
+					return fmt.Sprintf("%d", len(s))
+				}),
+				m.Replace(c.Separated(c.Opt(a.Whitespace), a.Comma), ", "),
+				m.ToUpper(c.Min(1, a.ASCIILower)),
+				m.Drop(a.Excl),
+				c.Rep(3, a.AngleOpen),
+				c.Opt(a.Whitespace),
+			),
+			" \t",
+		),
+		c.Opt(a.SquareClose),
+	)
+
+	RunMatcherTests(t, []MatcherTest{
+		{"[ \t >>>Hello, world!<<<   ]", demonic, true, "[>>>5, WORLD<<<]"},
+		{"[ \t >>>Hello, world!<<<   ", demonic, true, "[>>>5, WORLD<<<"},
+		{">>>HellohellO, world!<<<   ]", demonic, true, ">>>10, WORLD<<<]"},
+		{"[ \t >>>HellohellO , , , world!<<<   ", demonic, true, "[>>>10, WORLD<<<"},
+	})
+}
+
 func TestSequenceOfRunes(t *testing.T) {
 	sequence := c.Seq(
 		a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen,
@@ -206,3 +225,46 @@ func TestSequenceOfRunes(t *testing.T) {
 		t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value)
 	}
 }
+
+func ExampleMatchAnyRune() {
+	handler := func(p *parsekit.P) {
+		p.Expects("Any valid rune")
+		if p.On(a.AnyRune).Accept().End() {
+			p.EmitLiteral(TestItem)
+		}
+	}
+	parser := parsekit.New(handler)
+	run := parser.Parse("¡Any / valid / character will dö!")
+	match, _, ok := run.Next()
+
+	// This will output '¡', since a.AnyRune matches exactly 1 rune.
+	if ok {
+		fmt.Printf("Match = %q\n", match)
+	}
+}
+
+func ExampleModifyToUpper() {
+	// A Dutch postcode consists of 4 numbers and 2 letters (1234XX).
+	// The numbers never start with a zero.
+	digitNotZero := c.RuneRange('1', '9')
+	numbers := c.Seq(digitNotZero, c.Rep(3, a.Digit))
+
+	// It is good form to write the letters in upper case.
+	letter := c.Any(a.ASCIILower, a.ASCIIUpper)
+	letters := m.ToUpper(c.Seq(letter, letter))
+
+	// It is good form to use a single space between letters and numbers,
+	// but it is not mandatory.
+	space := m.Replace(c.Opt(a.Whitespace), " ")
+
+	// With all the building blocks, we can now build the postcode parser.
+	postcode := c.Seq(numbers, space, letters)
+
+	// Create a parser and let it parse some postcode inputs. Every input is
+	// normalized to "1234 AB"; all inputs are valid, so errors are not checked.
+	p := parsekit.New(postcode)
+	for _, input := range []string{"1234 AB", "1234AB", "1234 ab", "1234ab"} {
+		r, _, _ := p.Parse(input).Next()
+		fmt.Printf("Input: %q, output: %q\n", input, r.Value)
+	}
+}
diff --git a/parsekit.go b/parsekit.go
index 65f52ec..8990542 100644
--- a/parsekit.go
+++ b/parsekit.go
@@ -57,7 +57,7 @@ func makeParserForStateHandler(handler StateHandler) *Parser {
 func makeParserForMatcher(matcher Matcher) *Parser {
 	return New(StateHandler(func(p *P) {
 		p.Expects("match")
-		if p.On(matcher).Accept().RouteRep().End() {
+		if p.On(matcher).Accept().RouteRepeat().End() {
 			p.EmitLiteral(MatchedItem)
 		}
 	}))
diff --git a/statehandler.go b/statehandler.go
index 0452429..d8b91bf 100644
--- a/statehandler.go
+++ b/statehandler.go
@@ -93,9 +93,9 @@ func (p *P) RouteTo(state StateHandler) *routeFollowupAction {
 	return &routeFollowupAction{chainAction: chainAction{p, true}}
 }
 
-// RouteRep indicates that on the next parsing cycle, the current
+// RouteRepeat indicates that on the next parsing cycle, the current
 // StateHandler must be reinvoked.
-func (p *P) RouteRep() *chainAction {
+func (p *P) RouteRepeat() *chainAction {
 	p.RouteTo(p.state)
 	return &chainAction{nil, true}
 }
diff --git a/statehandler_on.go b/statehandler_on.go
index ee19322..37841ee 100644
--- a/statehandler_on.go
+++ b/statehandler_on.go
@@ -134,11 +134,11 @@ type routeAction struct {
 	chainAction
 }
 
-// RouteRep indicates that on the next parsing cycle,
+// RouteRepeat indicates that on the next parsing cycle,
 // the current StateHandler must be reinvoked.
-func (a *routeAction) RouteRep() *chainAction {
+func (a *routeAction) RouteRepeat() *chainAction {
 	if a.ok {
-		return a.p.RouteRep()
+		return a.p.RouteRepeat()
 	}
 	return &chainAction{nil, false}
 }