diff --git a/examples/example_basiccalculator2_test.go b/examples/example_basiccalculator2_test.go index a768ddf..2a5dc7d 100644 --- a/examples/example_basiccalculator2_test.go +++ b/examples/example_basiccalculator2_test.go @@ -94,9 +94,9 @@ func (calc *calculator) calculation(p *parsekit.ParseAPI) { func (calc *calculator) expr(p *parsekit.ParseAPI) { calc.interpreter.push() - var C, A = parsekit.C, parsekit.A + var A = parsekit.A if p.Handle(calc.term) { - for p.Accept(C.Any(A.Add, A.Subtract)) { + for p.Accept(A.Add.Or(A.Subtract)) { op := p.Result().Rune(0) if !p.Handle(calc.term) { return @@ -112,9 +112,9 @@ func (calc *calculator) expr(p *parsekit.ParseAPI) { func (calc *calculator) term(p *parsekit.ParseAPI) { calc.interpreter.push() - var C, A = parsekit.C, parsekit.A + var A = parsekit.A if p.Handle(calc.factor) { - for p.Accept(C.Any(A.Multiply, A.Divide)) { + for p.Accept(A.Multiply.Or(A.Divide)) { op := p.Result().Rune(0) if !p.Handle(calc.factor) { return diff --git a/examples/example_dutchpostcode_test.go b/examples/example_dutchpostcode_test.go index e93b690..f8e5923 100644 --- a/examples/example_dutchpostcode_test.go +++ b/examples/example_dutchpostcode_test.go @@ -62,9 +62,9 @@ func createPostcodeTokenizer() *parsekit.Tokenizer { // - It is good form to write the letters in upper case. // - It is good form to use a single space between digits and letters. 
digitNotZero := C.Except(A.Rune('0'), A.Digit) - pcDigits := C.Seq(digitNotZero, C.Rep(3, A.Digit)) - pcLetter := C.Any(A.ASCIILower, A.ASCIIUpper) - pcLetters := M.ToUpper(C.Seq(pcLetter, pcLetter)) + pcDigits := C.Seq(digitNotZero, A.Digit.Times(3)) + pcLetter := A.ASCIILower.Or(A.ASCIIUpper) + pcLetters := M.ToUpper(pcLetter.Times(2)) space := M.Replace(C.Opt(A.Blanks), " ") postcode := C.Seq(T.Str("PCD", pcDigits), space, T.Str("PCL", pcLetters), A.EndOfFile) diff --git a/examples/example_helloParserCombinator_test.go b/examples/example_helloParserCombinator_test.go index 2d96f8f..59d3631 100644 --- a/examples/example_helloParserCombinator_test.go +++ b/examples/example_helloParserCombinator_test.go @@ -57,7 +57,11 @@ func createHelloTokenizer() *parsekit.Tokenizer { comma := c.Seq(c.Opt(a.Blank), a.Comma, c.Opt(a.Blank)) separator := c.Any(comma, a.Blank) name := c.OneOrMore(c.Not(a.Excl)) - greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl), a.EndOfFile) + greeting := m.Drop(hello). + Then(m.Drop(separator)). + Then(name). + Then(m.Drop(a.Excl)). + Then(a.EndOfFile) // Create a Tokenizer that wraps the 'greeting' TokenHandler and allows // us to match some input against that handler. diff --git a/parseapi.go b/parseapi.go index c730d1a..9407f5d 100644 --- a/parseapi.go +++ b/parseapi.go @@ -183,7 +183,7 @@ func (p *ParseAPI) ExpectEndOfFile() { } // Expected is used to set an error that tells the user that some -// unexpected input was encountered, and that input was expected. +// unexpected input was encountered, and what input was expected. // // The 'expected' argument can be an empty string. In that case the error // message will not contain a description of the expected input. diff --git a/parser.go b/parser.go index fc29824..903df2b 100644 --- a/parser.go +++ b/parser.go @@ -21,6 +21,9 @@ type ParseHandler func(*ParseAPI) // parsing. 
This style of parser is typically used for parsing programming // languages and structured data formats (like json, xml, toml, etc.) // +// The startHandler argument points the Parser to the ParseHandler function +// that must be executed at the start of the parsing process. +// // To parse input data, use the method Parser.Execute(). func NewParser(startHandler ParseHandler) *Parser { if startHandler == nil { diff --git a/reader/reader.go b/reader/reader.go index 104d08e..b80063a 100644 --- a/reader/reader.go +++ b/reader/reader.go @@ -36,7 +36,7 @@ // 0 6 9 // // So after a flush, the first upcoming rune after the flushed runes -// will always have index 0. +// will always be at offset 0. package reader import ( diff --git a/tokenapi.go b/tokenapi.go index d706af0..5d9c7a4 100644 --- a/tokenapi.go +++ b/tokenapi.go @@ -20,7 +20,7 @@ import ( // // By invoking NextRune() + Accept() multiple times, the result can be extended // with as many runes as needed. Runes collected this way can later on be -// retrieved using the method +// retrieved using the method Result().Runes(). // // It is mandatory to call Accept() after retrieving a rune, before calling // NextRune() again. Failing to do so will result in a panic. diff --git a/tokenapi_example_test.go b/tokenapi_example_test.go index 46caca5..d838c2f 100644 --- a/tokenapi_example_test.go +++ b/tokenapi_example_test.go @@ -7,40 +7,40 @@ import ( ) func ExampleTokenAPI_Fork() { - // This custom TokenHandler checks for a sequence of runes: "abcd" - // This is done in 4 steps and only after finishing all steps, - // the TokenHandler will confirm a successful match. - abcdSequence := func(t *parsekit.TokenAPI) bool { - child := t.Fork() // fork, so we won't change parent t - for _, checkRune := range "abcd" { - readRune, err := child.NextRune() - if err != nil || readRune != checkRune { - return false // report mismatch, parent t is left untouched + // This custom TokenHandler checks for input 'a', 'b' or 'c'. 
+ abcHandler := func(t *parsekit.TokenAPI) bool { + a := parsekit.A + for _, r := range []rune{'a', 'b', 'c'} { + child := t.Fork() // fork, so we won't change parent t + if a.Rune(r)(child) { + child.Merge() // accept results into parent t + return true // and report a successful match } - child.Accept() // add rune to child output } - child.Merge() // we have a match, add resulting output to parent - return true // and report the successful match + // If we get here, then no match was found. Return false to communicate + // this to the caller. + return false } // Note: a custom TokenHandler is normally not what you need. // You can make use of the parser/combinator tooling to do things - // a lot simpler. The handler from above can be replaced with: - simpler := parsekit.A.Str("abcd") + // a lot simpler and take care of forking at the appropriate places. + // The handler from above can be replaced with: + simpler := parsekit.A.RuneRange('a', 'c') - result, err := parsekit.NewTokenizer(abcdSequence).Execute("abcdefgh") + result, err := parsekit.NewTokenizer(abcHandler).Execute("another test") fmt.Println(result, err) - result, err = parsekit.NewTokenizer(simpler).Execute("abcdefgh") + result, err = parsekit.NewTokenizer(simpler).Execute("curious") fmt.Println(result, err) - result, err = parsekit.NewTokenizer(abcdSequence).Execute("abcx") + result, err = parsekit.NewTokenizer(abcHandler).Execute("bang on!") fmt.Println(result, err) - result, err = parsekit.NewTokenizer(abcdSequence).Execute("xyz") + result, err = parsekit.NewTokenizer(abcHandler).Execute("not a match") fmt.Println(result, err) // Output: - // abcd - // abcd - // unexpected input at start of file + // a + // c + // b // unexpected input at start of file } diff --git a/tokenhandler_test.go b/tokenhandler_test.go index cd4e906..e597c4b 100644 --- a/tokenhandler_test.go +++ b/tokenhandler_test.go @@ -75,15 +75,12 @@ func TestUsingTokenParserCombinators_TokensCanBeEmitted(t *testing.T) { func 
TestUsingTokenParserCombinators_TokensCanBeNested(t *testing.T) { var c, m, tok, a = parsekit.C, parsekit.M, parsekit.T, parsekit.A - fooToken := c.Seq( - m.Drop(c.ZeroOrMore(a.Asterisk)), - tok.Str("COMBI", c.Seq( - tok.Str("ASCII", m.TrimSpace(c.OneOrMore(a.ASCII))), - tok.Str("UTF8", m.TrimSpace(c.OneOrMore(c.Except(a.Asterisk, a.AnyRune)))), - )), - m.Drop(c.ZeroOrMore(a.Asterisk)), - ) + ascii := tok.Str("ASCII", m.TrimSpace(c.OneOrMore(a.ASCII))) + utf8 := tok.Str("UTF8", m.TrimSpace(c.OneOrMore(c.Except(a.Asterisk, a.AnyRune)))) + stars := m.Drop(c.ZeroOrMore(a.Asterisk)) + fooToken := c.Seq(stars, tok.Str("COMBI", ascii.Then(utf8)), stars) parser := parsekit.NewTokenizer(fooToken) + input := "*** This is fine ASCII Åltho hère öt endĩt! ***" output := "This is fine ASCIIÅltho hère öt endĩt!" result, err := parser.Execute(input) diff --git a/tokenhandlerresult.go b/tokenhandlerresult.go index c556b25..bef2d32 100644 --- a/tokenhandlerresult.go +++ b/tokenhandlerresult.go @@ -5,8 +5,9 @@ import ( "strings" ) -// TokenHandlerResult is a struct that is used for holding and managing tokenizing results as -// produced by a TokenHandler. +// TokenHandlerResult is a struct that is used for holding tokenizing results +// as produced by a TokenHandler. It also provides the API that TokenHandlers +// and Parsers can use to respectively store and access the results. type TokenHandlerResult struct { lastRune *runeInfo // Information about the last rune read using NextRune() runes []rune diff --git a/tokenhandlers_builtin.go b/tokenhandlers_builtin.go index f7b90bf..b7a1a38 100644 --- a/tokenhandlers_builtin.go +++ b/tokenhandlers_builtin.go @@ -36,7 +36,7 @@ var C = struct { ZeroOrMore func(TokenHandler) TokenHandler OneOrMore func(TokenHandler) TokenHandler MinMax func(min int, max int, handler TokenHandler) TokenHandler - Separated func(separated TokenHandler, separator TokenHandler) TokenHandler // TODO reverse args for consistency, us string? 
+ Separated func(separated TokenHandler, separator TokenHandler) TokenHandler Except func(except TokenHandler, handler TokenHandler) TokenHandler }{ Opt: MatchOpt, @@ -241,13 +241,13 @@ var A = struct { // Doing so saves you a lot of typing, and it makes your code a lot cleaner. var M = struct { Drop func(TokenHandler) TokenHandler - Trim func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? - TrimLeft func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? - TrimRight func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? + Trim func(handler TokenHandler, cutset string) TokenHandler + TrimLeft func(handler TokenHandler, cutset string) TokenHandler + TrimRight func(handler TokenHandler, cutset string) TokenHandler TrimSpace func(handler TokenHandler) TokenHandler ToLower func(TokenHandler) TokenHandler ToUpper func(TokenHandler) TokenHandler - Replace func(handler TokenHandler, replaceWith string) TokenHandler // TODO reverse arguments? + Replace func(handler TokenHandler, replaceWith string) TokenHandler ByCallback func(TokenHandler, func(string) string) TokenHandler }{ Drop: ModifyDrop, @@ -409,13 +409,7 @@ func MatchStrNoCase(expected string) TokenHandler { // no output is generated but still a successful match is reported (but the // result will be empty). func MatchOpt(handler TokenHandler) TokenHandler { - return func(t *TokenAPI) bool { - child := t.Fork() - if handler(child) { - child.Merge() - } - return true - } + return MatchMinMax(0, 1, handler) } // MatchSeq creates a TokenHandler that checks if the provided TokenHandlers can be @@ -457,8 +451,7 @@ func MatchAny(handlers ...TokenHandler) TokenHandler { // does not, then the next rune from the input will be reported as a match. 
func MatchNot(handler TokenHandler) TokenHandler { return func(t *TokenAPI) bool { - probe := t.Fork() - if handler(probe) { + if handler(t.Fork()) { return false } _, err := t.NextRune() @@ -479,6 +472,10 @@ func MatchNot(handler TokenHandler) TokenHandler { // // will not match input "XXX", it will match input "XXXX", but also "XXXXXX". // In that last case, there will be a remainder "XX" on the input. +// +// Another way to use this method, is by applying the following syntactic sugar: +// +// MatchRune('X').Times(4) func MatchRep(times int, handler TokenHandler) TokenHandler { return matchMinMax(times, times, handler, "MatchRep") } @@ -495,7 +492,7 @@ func MatchMin(min int, handler TokenHandler) TokenHandler { // MatchMax creates a TokenHandler that checks if the provided TokenHandler can be // applied at maximum the provided minimum number of times. -// When more matches are possible, these will be included in the output. +// When more matches are possible, these will be included in the output. // Zero matches are considered a successful match. func MatchMax(max int, handler TokenHandler) TokenHandler { if max < 0 { @@ -535,20 +532,22 @@ matchMinMax(min int, max int, handler TokenHandler, name string) TokenHandl callerPanic(2, "TokenHandler: %s definition error at {caller}: max %d must not be < min %d", name, max, min) } return func(t *TokenAPI) bool { - child := t.Fork() total := 0 // Check for the minimum required amount of matches. for total < min { total++ + child := t.Fork() if !handler(child) { return false } + child.Merge() } // No specified max: include the rest of the available matches. // Specified max: include the rest of the availble matches, up to the max. 
- child.Merge() + // Each match is now forked and merged per iteration, so no final merge is needed here. for max < 0 || total < max { total++ + child := t.Fork() if !handler(child) { break } diff --git a/tokenhandlers_builtin_test.go b/tokenhandlers_builtin_test.go index 2115bf2..68e01f5 100644 --- a/tokenhandlers_builtin_test.go +++ b/tokenhandlers_builtin_test.go @@ -378,6 +378,19 @@ func TestTokenMakers(t *testing.T) { }) } +func TestSyntacticSugar(t *testing.T) { + var a = parsekit.A + parsekit.AssertTokenHandlers(t, []parsekit.TokenHandlerT{ + {"aaaaaa", a.Rune('a').Times(4), true, "aaaa"}, + {"ababab", a.Rune('a').Or(a.Rune('b')).Times(4), true, "abab"}, + {"ababab", a.Rune('a').Then(a.Rune('b')), true, "ab"}, + {"bababa", a.Rune('a').Then(a.Rune('b')), false, ""}, + {"cccccc", a.Rune('c').Optional(), true, "c"}, + {"dddddd", a.Rune('c').Optional(), true, ""}, + {"a,b ,c, d|", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma).Then(a.Space.Optional())), true, "a,b ,c, d"}, + }) +} + func TestSequenceOfRunes(t *testing.T) { var c, a = parsekit.C, parsekit.A sequence := c.Seq( diff --git a/tokenizer.go b/tokenizer.go index 791d2b1..75f863d 100644 --- a/tokenizer.go +++ b/tokenizer.go @@ -18,6 +18,36 @@ type Tokenizer struct { // for retrieving input data to match against and for reporting back results. type TokenHandler func(t *TokenAPI) bool +// Or is syntactic sugar that allows you to write a construction like +// MatchAny(tokenHandler1, tokenHandler2) as tokenHandler1.Or(tokenHandler2). +func (handler TokenHandler) Or(otherHandler TokenHandler) TokenHandler { + return MatchAny(handler, otherHandler) +} + +// Times is syntactic sugar that allows you to write a construction like +// MatchRep(3, handler) as handler.Times(3). +func (handler TokenHandler) Times(n int) TokenHandler { + return MatchRep(n, handler) +} + +// Then is syntactic sugar that allows you to write a construction like +// MatchSeq(handler1, handler2, handler3) as handler1.Then(handler2).Then(handler3). 
+func (handler TokenHandler) Then(otherHandler TokenHandler) TokenHandler { + return MatchSeq(handler, otherHandler) +} + +// SeparatedBy is syntactic sugar that allows you to write a construction like +// MatchSeparated(separator, handler) as handler.SeparatedBy(separator). +func (handler TokenHandler) SeparatedBy(separatorHandler TokenHandler) TokenHandler { + return MatchSeparated(separatorHandler, handler) +} + +// Optional is syntactic sugar that allows you to write a construction like +// MatchOpt(handler) as handler.Optional(). +func (handler TokenHandler) Optional() TokenHandler { + return MatchOpt(handler) +} + // NewTokenizer instantiates a new Tokenizer. // // This is a simple wrapper around a TokenHandler function. It can be used to