From c6fde2cf4ed0f2f5cec3614858c3e4b227b74459 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Sun, 26 May 2019 09:25:34 +0000 Subject: [PATCH] A big round of getting-ya-terminology-straight. --- example_basiccalculator_test.go | 34 +- example_dutchpostcode_test.go | 10 +- example_hellomatcher_test.go | 15 +- example_helloparser_test.go | 16 +- examples_test.go | 6 +- matcher.go | 187 ------ matcher_builtin.go | 559 ------------------ parsekit.go | 58 +- parsekit_test.go | 18 +- statehandler.go | 16 +- statehandler_emit.go | 61 +- statehandler_expects.go | 4 +- statehandler_on.go | 53 +- statehandler_route.go | 36 +- tokenhandler.go | 192 ++++++ tokenhandlers_builtin.go | 558 +++++++++++++++++ ...n_test.go => tokenhandlers_builtin_test.go | 56 +- 17 files changed, 945 insertions(+), 934 deletions(-) delete mode 100644 matcher.go delete mode 100644 matcher_builtin.go create mode 100644 tokenhandler.go create mode 100644 tokenhandlers_builtin.go rename matcher_builtin_test.go => tokenhandlers_builtin_test.go (97%) diff --git a/example_basiccalculator_test.go b/example_basiccalculator_test.go index 5d792a9..f953d21 100644 --- a/example_basiccalculator_test.go +++ b/example_basiccalculator_test.go @@ -15,15 +15,15 @@ import ( ) // When writing a parser, it's a good start to use the parser/combinator -// functionality of parsekit to create some Matcher functions. These functions +// functionality of parsekit to create some TokenHandler functions. These functions // can later be used in the parser state machine to check for matching strings // on the input data. // // For the calculator, we only need a definition of "number, surrounded by // optional whitespace". Skipping whitespace could be a part of the StateHandler -// functions below too, but including it in a Matcher makes things really +// functions below too, but including it in a TokenHandler makes things really // practical. 
-func createNumberMatcher() parsekit.Matcher { +func createNumberMatcher() parsekit.TokenHandler { // Easy access to parsekit definition. c, a, m := parsekit.C, parsekit.A, parsekit.M @@ -43,17 +43,17 @@ const ( // We also need to define the state machine for parsing the input. // The state machine is built up from functions that match the StateHandler -// signature: func(*parsekit.P) -// The P struct holds the internal state for the parser and it provides +// signature: func(*parsekit.ParseAPI) +// The ParseAPI struct holds the internal state for the parser and it provides // some methods that form the API for your StateHandler implementation. // State: expect a number. When a number is found on the input, -// it is accepted in the output buffer, after which the output buffer is +// it is accepted in the parser's string buffer, after which that buffer is // emitted as a numberType item. Then we tell the state machine to continue // with the calcWaitForOperatorOrEndOfInput state. // When no number is found, the parser will emit an error, explaining that // "a number" was expected. -func calcWaitForNumber(p *parsekit.P) { +func calcWaitForNumber(p *parsekit.ParseAPI) { p.Expects("a number") if p.On(calcNumber).Accept() { p.EmitLiteral(numberType) @@ -61,13 +61,13 @@ func calcWaitForNumber(p *parsekit.P) { } } -// State: expect a plus or minus operator. When one of those -// is found, the appropriate Item is emitted and the parser is sent back -// to the numberHandler to find the next number on the input. -// When no operator is found, then the parser is told to expect the end of -// the input. When more input data is available (which is obviously wrong -// data since it does not match our syntax), the parser will emit an error. -func calcWaitForOperatorOrEndOfInput(p *parsekit.P) { +// State: expect a plus or minus operator. 
When one of those is found, the +// appropriate Item is emitted and the parser is sent back to the +// numberHandler to find the next number on the input. When no operator is +// found, then the parser is told to expect the end of the input. When more +// input data are available (which are obviously wrong data since they do +// not match our syntax), the parser will emit an error. +func calcWaitForOperatorOrEndOfInput(p *parsekit.ParseAPI) { switch { case p.On(a.Plus).Accept(): p.EmitLiteral(addType) @@ -81,18 +81,20 @@ func calcWaitForOperatorOrEndOfInput(p *parsekit.P) { } // All is ready for our parser. We now can create a new Parser struct. -// We need to tell it what the start state is. In our case, it is the +// We need to tell it what StateHandler to start with. In our case, it is the // calcWaitForNumber state, since the calculation must start with a number. var calcParser = parsekit.NewParser(calcWaitForNumber) func Example_basicCalculator() { - // Let's feed the parser some input to work with. + // Let's feed the parser some input to work with. This provides us with + // a parse run for that input. run := calcParser.Parse(" 153+22 + 31-4 -\t 6+42 ") // We can now step through the results of the parsing process by repeated // calls to run.Next(). Next() returns either the next parse item, a parse // error or an end of file. Let's dump the parse results and handle the // computation while we're at it. + // TODO this is convoluted for people using the parser code I think. Maybe use three output data types instead? 
sum := 0 op := +1 for { diff --git a/example_dutchpostcode_test.go b/example_dutchpostcode_test.go index cd28874..38bad4f 100644 --- a/example_dutchpostcode_test.go +++ b/example_dutchpostcode_test.go @@ -1,5 +1,5 @@ -// In this example, a parser is created which can parse and normalize Dutch postcodes -// The implementation uses only a Matcher function and does not implement a +// In this example, a Parser is created which can parse and normalize Dutch postcodes +// The implementation uses only TokenHandler functions and does not implement a // full-fledged state-based Parser for it. package parsekit_test @@ -9,11 +9,11 @@ import ( "git.makaay.nl/mauricem/go-parsekit" ) -func createPostcodeMatcher() *parsekit.MatcherWrapper { +func createPostcodeMatcher() *parsekit.Matcher { // Easy access to the parsekit definitions. c, a, m := parsekit.C, parsekit.A, parsekit.M - // Matcher functions are created and combined to satisfy these rules: + // TokenHandler functions are created and combined to satisfy these rules: // - A Dutch postcode consists of 4 digits and 2 letters (1234XX). // - The first digit is never a zero. // - A space between letters and digits is optional. @@ -26,6 +26,8 @@ func createPostcodeMatcher() *parsekit.MatcherWrapper { space := m.Replace(c.Opt(a.Whitespace), " ") postcode := c.Seq(pcDigits, space, pcLetters) + // Create a Matcher, which wraps the 'postcode' TokenHandler and allows + // us to match some input against that handler. return parsekit.NewMatcher(postcode, "a Dutch postcode") } diff --git a/example_hellomatcher_test.go b/example_hellomatcher_test.go index 975de0c..23debec 100644 --- a/example_hellomatcher_test.go +++ b/example_hellomatcher_test.go @@ -1,9 +1,9 @@ // In this example, a parser is created that is able to parse input that looks // like "Hello, !", and that extracts the name from it. // -// The implementation uses only a Matcher function and does not implement a -// full-fledged state-based Parser for it. 
If you want to see the same kind of -// functionality, implementated using a Paser, take a look at the +// The implementation uses only parser/combinator TokenHandler functions and does +// not implement a full-fledged state-based Parser for it. If you want to see the +// same kind of functionality, implemented using a Parser, take a look at the // HelloWorldUsingParser example. package parsekit_test @@ -13,12 +13,12 @@ import ( "git.makaay.nl/mauricem/go-parsekit" ) -func createHelloMatcher() *parsekit.MatcherWrapper { +func createHelloMatcher() *parsekit.Matcher { // Easy access to parsekit definition. c, a, m := parsekit.C, parsekit.A, parsekit.M - // Using the parser/combinator support of parsekit, we create a Matcher function - // that does all the work. The 'greeting' Matcher matches the whole input and + // Using the parser/combinator support of parsekit, we create a TokenHandler function + // that does all the work. The 'greeting' TokenHandler matches the whole input and // drops all but the name from it. hello := c.StrNoCase("hello") comma := c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace)) @@ -26,7 +26,8 @@ func createHelloMatcher() *parsekit.MatcherWrapper { name := c.OneOrMore(c.Not(a.Excl)) greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl)) - // Using 'greeting' we can now create the Matcher-based parser. + // Create a Matcher, which wraps the 'greeting' TokenHandler and allows + // us to match some input against that handler. return parsekit.NewMatcher(greeting, "a friendly greeting") } diff --git a/example_helloparser_test.go b/example_helloparser_test.go index 57ee5f6..baa5403 100644 --- a/example_helloparser_test.go +++ b/example_helloparser_test.go @@ -2,14 +2,14 @@ // like "Hello, !", and that extracts the name from it. // // This implementation uses a state-based Parser for it, and it does not implement -// any custom combinator/parser Matcher functions. 
Note that things are much easier to -// implement using custom Matchers (see the other HelloWorldUsingMatcher example -// for this). Doing this fully parser-based implementation is mainly for your +// any custom parser/combinator TokenHandler functions. Note that things are much +// easier to implement using custom TokenHandlers (see the other HelloWorldUsingMatcher +// example for this). Doing this fully parser-based implementation is mainly for your // learning pleasure. // // One big difference between the Matcher-based example and this one, is that the // state-based parser reports errors much more fine-grained. This might or might -// not be useful for your specific application. +// not be useful for your specific use case. package parsekit_test import ( @@ -21,7 +21,7 @@ import ( const greeteeItem parsekit.ItemType = 1 -func stateStartOfGreeting(p *parsekit.P) { +func stateStartOfGreeting(p *parsekit.ParseAPI) { c := parsekit.C p.Expects("hello") if p.On(c.StrNoCase("hello")).Skip() { @@ -29,7 +29,7 @@ func stateStartOfGreeting(p *parsekit.P) { } } -func stateComma(p *parsekit.P) { +func stateComma(p *parsekit.ParseAPI) { a := parsekit.A p.Expects("comma") switch { @@ -40,7 +40,7 @@ func stateComma(p *parsekit.P) { } } -func stateName(p *parsekit.P) { +func stateName(p *parsekit.ParseAPI) { a := parsekit.A p.Expects("name") switch { @@ -51,7 +51,7 @@ func stateName(p *parsekit.P) { } } -func stateEndOfGreeting(p *parsekit.P) { +func stateEndOfGreeting(p *parsekit.ParseAPI) { p.Expects("end of greeting") if p.On(a.EndOfFile).Stay() { name := strings.TrimSpace(p.BufLiteral()) diff --git a/examples_test.go b/examples_test.go index 479d1aa..899e7c2 100644 --- a/examples_test.go +++ b/examples_test.go @@ -28,7 +28,7 @@ func ExampleItem() { // the p.Emit* methods on parsekit.P. // When errors occur, or the end of the file is reached, then the built-in // types parsekit.ItemEOF and parsekit.ItemError will be emitted by parsekit. 
- stateHandler := func(p *parsekit.P) { + stateHandler := func(p *parsekit.ParseAPI) { if p.On(c.Str("question")).Accept() { p.EmitLiteral(QuestionItem) } @@ -99,14 +99,14 @@ func ExampleMatchAnyRune() { // Easy access to the parsekit definitions. a := parsekit.A - handler := func(p *parsekit.P) { + stateHandler := func(p *parsekit.ParseAPI) { p.Expects("Any valid rune") if p.On(a.AnyRune).Accept() { p.EmitLiteral(TestItem) p.RouteRepeat() } } - parser := parsekit.NewParser(handler) + parser := parsekit.NewParser(stateHandler) run := parser.Parse("¡Any / valid / character will dö!") for i := 0; i < 5; i++ { diff --git a/matcher.go b/matcher.go deleted file mode 100644 index 87ea504..0000000 --- a/matcher.go +++ /dev/null @@ -1,187 +0,0 @@ -package parsekit - -import ( - "fmt" -) - -// Matcher is the function type that must be implemented to create a function -// that can be used in conjunction with parsekit.P.On() or parsekit.New(). -// Its purpose is to check if input data matches some kind of pattern and to -// report back the match. -// -// A Matcher function gets a MatchDialog as its input and returns a boolean to -// indicate whether or not the Matcher found a match on the input. -// The MatchDialog is used for retrieving input data to match against -// and for reporting back results. -type Matcher func(m *MatchDialog) bool - -// MatchDialog is used by Matcher functions to retrieve runes from the -// input to match against and to report back results. -// -// Basic operation: -// -// To retrieve the next rune from the input, the Matcher function can call -// the MatchDialog.NextRune() method. -// -// The Matcher function can then evaluate the retrieved rune and either -// accept of skip the rune. When accepting it using MatchDialog.Accept(), -// the rune is added to the output of the MatchDialog. When using -// MatchDialog.Skip(), the rune will not be added to the output. 
It is -// mandatory for a Matcher to call either Accept() or Skip() after retrieving -// a rune, before calling NextRune() again. -// -// Eventually, the Matcher function must return a boolean value, indicating -// whether or not a match was found. When true, then the calling code will -// use the runes that were accepted into the MatchDialog's resulting output. -// -// Forking operation for easy lookahead support: -// -// Sometimes, a Matcher function must be able to perform a lookahead, which -// might either succeed or fail. In case of a failing lookahead, the state -// of the MatchDialog must be brought back to the original state. -// -// The way in which this is supported, is by forking a MatchDialog by calling -// MatchDialog.Fork(). This will return a child MatchDialog, with an empty -// output buffer, but using the same input offset as the forked parent. -// -// The Matcher function can then use the same interface as described for -// normal operation to retrieve runes from the input and to fill the output -// buffer. When the Matcher function decides that the lookahead was successful, -// then the method MatchDialog.Merge() can be called on the forked child to -// append the resulting output from the child to the parent's resulting output, -// and to update the parent input offset to that of the child. -// -// When the Matcher function decides that the lookahead was unsuccessful, then -// it can simply discard the forked child. The parent MatchDialog was never -// modified, so a new match can be safely started using that parent, as if the -// lookahead never happened. 
-type MatchDialog struct { - p *P // parser state, used to retrieve input data to match against (TODO should be interface) - inputOffset int // the byte offset into the input - input []rune // a slice of runes that represents the retrieved input runes for the Matcher - output []rune // a slice of runes that represents the accepted output runes for the Matcher - currRune *runeToken // hold the last rune that was read from the input - parent *MatchDialog // the parent MatchDialog, in case this one was forked -} - -type runeToken struct { - Rune rune - ByteSize int - OK bool -} - -// NextRune retrieves the next rune from the input. -// -// It returns the rune and a boolean. The boolean will be false in case an -// invalid UTF8 rune or the end of the file was encountered. -// -// After using NextRune() to retrieve a rune, Accept() or Skip() can be called -// to respectively add the rune to the MatchDialog's resulting output or to -// fully ignore it. This way, a Matcher has full control over what runes are -// significant for the resulting output of that matcher. -// -// After using NextRune(), this method can not be reinvoked, until the last read -// rune is explicitly accepted or skipped as described above. -func (m *MatchDialog) NextRune() (rune, bool) { - if m.currRune != nil { - panic("internal Matcher error: NextRune() was called without accepting or skipping the previously read rune") - } - r, w, ok := m.p.peek(m.inputOffset) - m.currRune = &runeToken{r, w, ok} - if ok { - m.input = append(m.input, r) - } - return r, ok -} - -// Fork splits off a child MatchDialog, containing the same offset as the -// parent MatchDialog, but with all other data in a fresh state. -// -// By forking, a Matcher function can freely work with a MatchDialog, without -// affecting the parent MatchDialog. This is for example useful when the -// Matcher function must perform some form of lookahead. 
-// -// When a successful match was found, the Matcher function can call -// child.Merge() to have the resulting output added to the parent MatchDialog. -// When no match was found, the forked child can simply be discarded. -// -// Example case: A Matcher checks for a sequence of runes: 'a', 'b', 'c', 'd'. -// This is done in 4 steps and only after finishing all steps, the Matcher -// function can confirm a successful match. The Matcher function for this -// case could look like this (yes, it's naive, but it shows the point): -// -// func MatchAbcd(m *MatchDialog) bool { -// child := m.Fork() // fork to keep m from input untouched -// for _, letter := []rune {'a', 'b', 'c', 'd'} { -// if r, ok := m.NextRune(); !ok || r != letter { -// return false // report mismatch, m is left untouched -// } -// child.Accept() // add rune to child output -// } -// child.Merge() // we have a match, add resulting output to parent -// return true // and report the successful match -// } -func (m *MatchDialog) Fork() *MatchDialog { - child := &MatchDialog{ - p: m.p, - inputOffset: m.inputOffset, - parent: m, - } - return child -} - -// Accept will add the last rune as read by NextRune() to the resulting -// output of the MatchDialog. -func (m *MatchDialog) Accept() { - m.checkAllowedCall("Accept()") - m.output = append(m.output, m.currRune.Rune) - m.inputOffset += m.currRune.ByteSize - m.currRune = nil -} - -// Skip will ignore the last rune as read by NextRune(). 
-func (m *MatchDialog) Skip() { - m.checkAllowedCall("Skip()") - m.inputOffset += m.currRune.ByteSize - m.currRune = nil -} - -func (m *MatchDialog) checkAllowedCall(name string) { - if m.currRune == nil { - panic(fmt.Sprintf("internal Matcher error: %s was called without a prior call to NextRune()", name)) - } - if !m.currRune.OK { - panic(fmt.Sprintf("internal Matcher error: %s was called, but prior call to NextRun() did not return OK (EOF or invalid rune)", name)) - } -} - -// Merge merges the resulting output from a forked child MatchDialog back into -// its parent: The runes that are accepted in the child are added to the parent -// runes and the parent's offset is advanced to the child's offset. -// -// After the merge, the child MatchDialog is reset so it can immediately be -// reused for performing another match (all data are cleared, except for the -// input offset which is kept at its current position). -func (m *MatchDialog) Merge() bool { - if m.parent == nil { - panic("internal parser error: Cannot call Merge a a non-forked MatchDialog") - } - m.parent.input = append(m.parent.input, m.input...) - m.parent.output = append(m.parent.output, m.output...) - m.parent.inputOffset = m.inputOffset - m.ClearOutput() - m.ClearInput() - return true -} - -// ClearOutput clears the resulting output for the MatchDialog, but it keeps -// the input and input offset as-is. -func (m *MatchDialog) ClearOutput() { - m.output = []rune{} -} - -// ClearInput clears the input for the MatchDialog, but it keeps the output -// and input offset as-is. -func (m *MatchDialog) ClearInput() { - m.input = []rune{} -} diff --git a/matcher_builtin.go b/matcher_builtin.go deleted file mode 100644 index fd90142..0000000 --- a/matcher_builtin.go +++ /dev/null @@ -1,559 +0,0 @@ -package parsekit - -import ( - "fmt" - "strings" - "unicode" -) - -// C provides convenient access to a range of parser/combinators -// that can be used to construct Matcher functions. 
-// -// When using C in your own parser, then it is advised to create -// a variable in your own package to reference it: -// -// var c = parsekit.C -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. -var C = struct { - Rune func(rune) Matcher - Runes func(...rune) Matcher - RuneRange func(rune, rune) Matcher - Str func(string) Matcher - StrNoCase func(string) Matcher - Any func(...Matcher) Matcher - Not func(Matcher) Matcher - Opt func(Matcher) Matcher - Seq func(...Matcher) Matcher - Rep func(int, Matcher) Matcher - Min func(int, Matcher) Matcher - Max func(int, Matcher) Matcher - ZeroOrMore func(Matcher) Matcher - OneOrMore func(Matcher) Matcher - MinMax func(int, int, Matcher) Matcher - Separated func(separated Matcher, separator Matcher) Matcher - Except func(except Matcher, matcher Matcher) Matcher -}{ - Rune: MatchRune, - Runes: MatchRunes, - RuneRange: MatchRuneRange, - Str: MatchStr, - StrNoCase: MatchStrNoCase, - Opt: MatchOpt, - Any: MatchAny, - Not: MatchNot, - Seq: MatchSeq, - Rep: MatchRep, - Min: MatchMin, - Max: MatchMax, - ZeroOrMore: MatchZeroOrMore, - OneOrMore: MatchOneOrMore, - MinMax: MatchMinMax, - Separated: MatchSeparated, - Except: MatchExcept, -} - -// MatchRune creates a Matcher function that checks if the next rune from -// the input matches the provided rune. -func MatchRune(expected rune) Matcher { - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - if ok && input == expected { - m.Accept() - return true - } - return false - } -} - -// MatchRunes creates a Matcher function that that checks if the next rune -// from the input is one of the provided runes. 
-func MatchRunes(expected ...rune) Matcher { - s := string(expected) - return func(m *MatchDialog) bool { - input, ok := m.NextRune() - if ok { - if strings.ContainsRune(s, input) { - m.Accept() - return true - } - } - return false - } -} - -// MatchRuneRange creates a Matcher function that that checks if the next rune -// from the input is contained by the provided rune range. -// -// The rune range is defined by a start and an end rune, inclusive, so: -// -// MatchRuneRange('g', 'k') -// -// creates a Matcher that will match any of 'g', 'h', 'i', 'j' or 'k'. -func MatchRuneRange(start rune, end rune) Matcher { - return func(m *MatchDialog) bool { - if end < start { - panic(fmt.Sprintf("internal parser error: MatchRuneRange definition error: start %q must not be < end %q", start, end)) - } - input, ok := m.NextRune() - if ok && input >= start && input <= end { - m.Accept() - return true - } - return false - } -} - -// MatchStr creates a Matcher that will check if the upcoming runes on the -// input match the provided string. -// TODO make this a more efficient string-level match? -func MatchStr(expected string) Matcher { - var matchers = []Matcher{} - for _, r := range expected { - matchers = append(matchers, MatchRune(r)) - } - return MatchSeq(matchers...) -} - -// MatchStrNoCase creates a Matcher that will check if the upcoming runes -// on the input match the provided string in a case-insensitive manner. -// TODO make this a more efficient string-level match? -func MatchStrNoCase(expected string) Matcher { - var matchers = []Matcher{} - for _, r := range expected { - u := unicode.ToUpper(r) - l := unicode.ToLower(r) - matchers = append(matchers, MatchRunes(u, l)) - } - return MatchSeq(matchers...) -} - -// MatchOpt creates a Matcher that makes the provided Matcher optional. -// When the provided Matcher applies, then its output is used, otherwise -// no output is generated but still a successful match is reported. 
-func MatchOpt(matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if matcher(child) { - child.Merge() - } - return true - } -} - -// MatchSeq creates a Matcher that checks if the provided Matchers can be -// applied in their exact order. Only if all matcher apply, the sequence -// reports successful match. -func MatchSeq(matchers ...Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - for _, matcher := range matchers { - if !matcher(child) { - return false - } - } - child.Merge() - return true - } -} - -// MatchAny creates a Matcher that checks if any of the provided Matchers -// can be applied. They are applied in their provided order. The first Matcher -// that applies is used for reporting back a match. -func MatchAny(matchers ...Matcher) Matcher { - return func(m *MatchDialog) bool { - for _, matcher := range matchers { - child := m.Fork() - if matcher(child) { - return child.Merge() - } - } - return false - } -} - -// MatchNot creates a Matcher that checks if the provided Matcher applies to -// the current input. If it does, then a failed match will be reported. If it -// does not, then the next rune from the input will be reported as a match. -func MatchNot(matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - probe := m.Fork() - if matcher(probe) { - return false - } - _, ok := m.NextRune() - if ok { - m.Accept() - return true - } - return false - } -} - -// MatchRep creates a Matcher that checks if the provided Matcher can be -// applied exactly the provided amount of times. -// -// Note that the input can contain more Matches for the provided matcher, e.g.: -// -// MatchRep(4, MatchRune('X')) -// -// will not match input "XXX", it will match input "XXXX", but also "XXXXXX". -// In that last case, there will be a remainder "XX" of the input. 
-func MatchRep(times int, matcher Matcher) Matcher { - return matchMinMax(times, times, matcher) -} - -// MatchMin creates a Matcher that checks if the provided Matcher can be -// applied at least the provided minimum number of times. -// When more matches are possible, these will be included in the output. -func MatchMin(min int, matcher Matcher) Matcher { - return matchMinMax(min, -1, matcher) -} - -// MatchMax creates a Matcher that checks if the provided Matcher can be -// applied at maximum the provided minimum number of times. -// When more matches are possible, these will be included in the output. -// Zero matches are considered a successful match. -func MatchMax(max int, matcher Matcher) Matcher { - return matchMinMax(0, max, matcher) -} - -// MatchZeroOrMore creates a Matcher that checks if the provided Matcher can -// be applied zero or more times. All matches will be included in the output. -// Zero matches are considered a successful match. -func MatchZeroOrMore(matcher Matcher) Matcher { - return matchMinMax(0, -1, matcher) -} - -// MatchOneOrMore creates a Matcher that checks if the provided Matcher can -// be applied one or more times. All matches will be included in the output. -func MatchOneOrMore(matcher Matcher) Matcher { - return matchMinMax(1, -1, matcher) -} - -// MatchMinMax creates a Matcher that checks if the provided Matcher can -// be applied between the provided minimum and maximum number of times, -// inclusive. All matches will be included in the output. 
-func MatchMinMax(min int, max int, matcher Matcher) Matcher { - if max < 0 { - panic("internal parser error: MatchMinMax definition error: max must be >= 0 ") - } - if min < 0 { - panic("internal parser error: MatchMinMax definition error: min must be >= 0 ") - } - return matchMinMax(min, max, matcher) -} - -func matchMinMax(min int, max int, matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if max >= 0 && min > max { - panic(fmt.Sprintf("internal parser error: MatchRep definition error: max %d must not be < min %d", max, min)) - } - total := 0 - // Check for the minimum required amount of matches. - for total < min { - total++ - if !matcher(child) { - return false - } - } - // No specified max: include the rest of the available matches. - // Specified max: include the rest of the availble matches, up to the max. - child.Merge() - for max < 0 || total < max { - total++ - if !matcher(child) { - break - } - child.Merge() - } - return true - } -} - -// MatchSeparated creates a Matcher that checks for a pattern of one or more -// Matchers of one type (the separated), separated by Matches of another type -// (the separator). All matches (separated + separator) are included in the -// output. -func MatchSeparated(separator Matcher, separated Matcher) Matcher { - return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated))) -} - -// MatchExcept creates a Matcher that checks if the provided matcher can be -// applied to the upcoming input. It also checks if the except Matcher can be -// applied. If the matcher applies, but the except Matcher too, then the match -// as a whole will be treated as a mismatch. -func MatchExcept(except Matcher, matcher Matcher) Matcher { - return func(m *MatchDialog) bool { - if except(m.Fork()) { - return false - } - return matcher(m) - } -} - -// A provides convenient access to a range of atoms that can be used to -// build combinators or parsing rules. 
-// -// In parsekit, an atom is defined as a ready to go Matcher function. -// -// When using A in your own parser, then it is advised to create -// a variable in your own package to reference it: -// -// var a = parsekit.A -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. -var A = struct { - EndOfFile Matcher - AnyRune Matcher - Space Matcher - Tab Matcher - CR Matcher - LF Matcher - CRLF Matcher - Excl Matcher - DoubleQuote Matcher - Hash Matcher - Dollar Matcher - Percent Matcher - Amp Matcher - SingleQuote Matcher - RoundOpen Matcher - RoundClose Matcher - Asterisk Matcher - Plus Matcher - Comma Matcher - Minus Matcher - Dot Matcher - Slash Matcher - Colon Matcher - Semicolon Matcher - AngleOpen Matcher - Equal Matcher - AngleClose Matcher - Question Matcher - At Matcher - SquareOpen Matcher - Backslash Matcher - SquareClose Matcher - Caret Matcher - Underscore Matcher - Backquote Matcher - CurlyOpen Matcher - Pipe Matcher - CurlyClose Matcher - Tilde Matcher - Newline Matcher - Whitespace Matcher - WhitespaceAndNewlines Matcher - EndOfLine Matcher - Digit Matcher - ASCII Matcher - ASCIILower Matcher - ASCIIUpper Matcher - HexDigit Matcher -}{ - EndOfFile: MatchEndOfFile(), - AnyRune: MatchAnyRune(), - Space: C.Rune(' '), - Tab: C.Rune('\t'), - CR: C.Rune('\r'), - LF: C.Rune('\n'), - CRLF: C.Str("\r\n"), - Excl: C.Rune('!'), - DoubleQuote: C.Rune('"'), - Hash: C.Rune('#'), - Dollar: C.Rune('$'), - Percent: C.Rune('%'), - Amp: C.Rune('&'), - SingleQuote: C.Rune('\''), - RoundOpen: C.Rune('('), - RoundClose: C.Rune(')'), - Asterisk: C.Rune('*'), - Plus: C.Rune('+'), - Comma: C.Rune(','), - Minus: C.Rune('-'), - Dot: C.Rune('.'), - Slash: C.Rune('/'), - Colon: C.Rune(':'), - Semicolon: C.Rune(';'), - AngleOpen: C.Rune('<'), - Equal: C.Rune('='), - AngleClose: C.Rune('>'), - Question: C.Rune('?'), - At: C.Rune('@'), - SquareOpen: C.Rune('['), - Backslash: C.Rune('\\'), - SquareClose: C.Rune(']'), - Caret: C.Rune('^'), - Underscore: 
C.Rune('_'), - Backquote: C.Rune('`'), - CurlyOpen: C.Rune('{'), - Pipe: C.Rune('|'), - CurlyClose: C.Rune('}'), - Tilde: C.Rune('~'), - Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), - WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), - EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), - Digit: C.RuneRange('0', '9'), - ASCII: C.RuneRange('\x00', '\x7F'), - ASCIILower: C.RuneRange('a', 'z'), - ASCIIUpper: C.RuneRange('A', 'Z'), - HexDigit: C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), -} - -// MatchEndOfFile creates a Matcher that checks if the end of the input data -// has been reached. This Matcher will never produce output. It only reports -// a successful or a failing match through its boolean return value. -func MatchEndOfFile() Matcher { - return func(m *MatchDialog) bool { - fork := m.Fork() - input, ok := fork.NextRune() - return !ok && input == eofRune - } -} - -// MatchAnyRune creates a Matcher function that checks if a valid rune can be -// read from the input. It reports back a successful match if the end of the -// input has not yet been reached and the upcoming input is a valid UTF8 rune. -func MatchAnyRune() Matcher { - return func(m *MatchDialog) bool { - _, ok := m.NextRune() - if ok { - m.Accept() - return true - } - return false - } -} - -// M provides convenient access to a range of modifiers that can be -// used when creating Matcher functions. -// -// In parsekit, a modifier is defined as a Matcher function that modifies the -// resulting output of another Matcher in some way. It does not do any matching -// against input of its own. -// -// When using M in your own parser, then it is advised to create -// a variable in your own package to reference it: -// -// var m = parsekit.M -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
-var M = struct { - Drop func(Matcher) Matcher - Trim func(Matcher, string) Matcher - TrimLeft func(Matcher, string) Matcher - TrimRight func(Matcher, string) Matcher - ToLower func(Matcher) Matcher - ToUpper func(Matcher) Matcher - Replace func(Matcher, string) Matcher - ModifyByCallback func(Matcher, func(string) string) Matcher -}{ - Drop: ModifyDrop, - Trim: ModifyTrim, - TrimLeft: ModifyTrimLeft, - TrimRight: ModifyTrimRight, - ToLower: ModifyToLower, - ToUpper: ModifyToUpper, - Replace: ModifyReplace, - ModifyByCallback: ModifyByCallback, -} - -// ModifyDrop creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is discarded completely. -// -// Note that if the Matcher does not apply, a mismatch will be reported back, -// even though we would have dropped the output anyway. So if you would like -// to drop optional whitespace, then use something like: -// -// M.Drop(C.Opt(A.Whitespace)) -// -// instead of: -// -// M.Drop(A.Whitespace) -// -// Since whitespace is defined as "1 or more spaces and/or tabs", the input -// string "bork" would not match against the second form, but " bork" would. -// In both cases, it would match the first form. -func ModifyDrop(matcher Matcher) Matcher { - return ModifyByCallback(matcher, func(s string) string { - return "" - }) -} - -// ModifyTrim creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is taken and characters from the provided -// cutset are trimmed from both the left and the right of the output. -// TODO move cutset to the left arg -func ModifyTrim(matcher Matcher, cutset string) Matcher { - return modifyTrim(matcher, cutset, true, true) -} - -// ModifyTrimLeft creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is taken and characters from the provided -// cutset are trimmed from the left of the output. 
-func ModifyTrimLeft(matcher Matcher, cutset string) Matcher { - return modifyTrim(matcher, cutset, true, false) -} - -// ModifyTrimRight creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is taken and characters from the provided -// cutset are trimmed from the right of the output. -func ModifyTrimRight(matcher Matcher, cutset string) Matcher { - return modifyTrim(matcher, cutset, false, true) -} - -func modifyTrim(matcher Matcher, cutset string, trimLeft bool, trimRight bool) Matcher { - modfunc := func(s string) string { - if trimLeft { - s = strings.TrimLeft(s, cutset) - } - if trimRight { - s = strings.TrimRight(s, cutset) - } - return s - } - return ModifyByCallback(matcher, modfunc) -} - -// ModifyToUpper creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is taken and characters from the provided -// cutset are converted into upper case. -func ModifyToUpper(matcher Matcher) Matcher { - return ModifyByCallback(matcher, strings.ToUpper) -} - -// ModifyToLower creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is taken and characters from the provided -// cutset are converted into lower case. -func ModifyToLower(matcher Matcher) Matcher { - return ModifyByCallback(matcher, strings.ToLower) -} - -// ModifyReplace creates a Matcher that checks if the provided Matcher applies. -// If it does, then its output is replaced by the provided string. -func ModifyReplace(matcher Matcher, s string) Matcher { - return ModifyByCallback(matcher, func(string) string { - return s - }) -} - -// ModifyByCallback creates a Matcher that checks if the provided matcher applies. -// If it does, then its output is taken and it is fed to the provided modfunc. -// This is a simple function that takes a string on input and returns a possibly -// modified string on output. The return value of the modfunc will replace the -// resulting output. 
-func ModifyByCallback(matcher Matcher, modfunc func(string) string) Matcher { - return func(m *MatchDialog) bool { - child := m.Fork() - if matcher(child) { - s := modfunc(string(child.output)) - child.output = []rune(s) - child.Merge() - return true - } - return false - } -} diff --git a/parsekit.go b/parsekit.go index e312b10..35fd3d4 100644 --- a/parsekit.go +++ b/parsekit.go @@ -24,17 +24,16 @@ func NewParser(startState StateHandler) *Parser { return &Parser{startState: startState} } -// Run represents a single parse run for a Parser. -// TODO rename to ParseRun -type Run struct { - p *P // a struct holding the internal state of a parse run +// ParseRun represents a single parse run for a Parser. +type ParseRun struct { + p *ParseAPI // holds the internal state of a parse run } // Parse starts a parse run on the provided input data. -// To retrieve parse items from the run, make use of the Run.Next() method. -func (p *Parser) Parse(input string) *Run { - return &Run{ - p: &P{ +// To retrieve parser Items from the run, make use of the ParseRun.Next() method. +func (p *Parser) Parse(input string) *ParseRun { + return &ParseRun{ + p: &ParseAPI{ input: input, len: len(input), cursorLine: 1, @@ -51,7 +50,7 @@ func (p *Parser) Parse(input string) *Run { // On error or when successfully reaching the end of the input, false is returned. // When an error occurred, false will be returned and the error return value will // be set (default is nil). -func (run *Run) Next() (Item, *Error, bool) { +func (run *ParseRun) Next() (Item, *Error, bool) { // State handling loop: we handle states, until an Item is ready to be returned. 
for { select { @@ -66,7 +65,7 @@ func (run *Run) Next() (Item, *Error, bool) { } } -func (run *Run) makeReturnValues(i Item) (Item, *Error, bool) { +func (run *ParseRun) makeReturnValues(i Item) (Item, *Error, bool) { switch { case i.Type == ItemEOF: return i, nil, false @@ -84,7 +83,7 @@ func (run *Run) makeReturnValues(i Item) (Item, *Error, bool) { // type StateHandler. This function represents the current status and // is responsible for moving the parser to its next status, depending // on the parsed input data. -func (run *Run) runNextStateHandler() { +func (run *ParseRun) runNextStateHandler() { if state, ok := run.getNextStateHandler(); ok { run.invokeNextStateHandler(state) } @@ -115,7 +114,7 @@ func (run *Run) runNextStateHandler() { // // When no routing decision is provided by a StateHandler, then this is // considered a bug in the state handler, and the parser will panic. -func (run *Run) getNextStateHandler() (StateHandler, bool) { +func (run *ParseRun) getNextStateHandler() (StateHandler, bool) { switch { case run.p.nextState != nil: return run.p.nextState, true @@ -132,42 +131,45 @@ func (run *Run) getNextStateHandler() (StateHandler, bool) { // invokeNextStateHandler moves the parser state to the provided state // and invokes the StateHandler function. -func (run *Run) invokeNextStateHandler(state StateHandler) { +func (run *ParseRun) invokeNextStateHandler(state StateHandler) { run.p.state = state run.p.nextState = nil run.p.expecting = "" run.p.state(run.p) } -// MatcherWrapper is the top-level struct that holds the configuration for -// a parser that is based solely on a Wrapper function. -// The MatcherWrapper can be instantiated using the parsekit.NewMatcher() +// Matcher is the top-level struct that holds the configuration for +// a parser that is based solely on a TokenHandler function. +// The Matcher can be instantiated using the parsekit.NewMatcher() // method. 
// // To match input data against the wrapped Matcher function, use the method -// MatcherWrapper.Parse(). -type MatcherWrapper struct { +// Matcher.Parse(). +type Matcher struct { parser *Parser } -// NewMatcher instantiates a new MatcherWrapper. +// NewMatcher instantiates a new Matcher. // -// This is a simple wrapper around a Matcher function. It can be used to -// match an input string against that Matcher function and retrieve the +// This is a simple wrapper around a TokenHandler function. It can be used to +// match an input string against that TokenHandler function and retrieve the // results in a straight forward way. -func NewMatcher(matcher Matcher, expects string) *MatcherWrapper { - handler := func(p *P) { +// +// The 'expects' parameter is used for creating an error message in case parsed +// input does not match the TokenHandler. +func NewMatcher(tokenHandler TokenHandler, expects string) *Matcher { + stateHandler := func(p *ParseAPI) { p.Expects(expects) - if p.On(matcher).Accept() { + if p.On(tokenHandler).Accept() { p.EmitLiteral(0) // ItemType is irrelevant } } - return &MatcherWrapper{parser: NewParser(handler)} + return &Matcher{parser: NewParser(stateHandler)} } -// Parse runs the wrapped Matcher function against the provided input data. -func (w *MatcherWrapper) Parse(input string) (string, *Error, bool) { - item, err, ok := w.parser.Parse(input).Next() +// Parse checks for a match on the provided input data. +func (m *Matcher) Parse(input string) (string, *Error, bool) { + item, err, ok := m.parser.Parse(input).Next() if !ok { return "", err, false } diff --git a/parsekit_test.go b/parsekit_test.go index 5e43288..f40a4d5 100644 --- a/parsekit_test.go +++ b/parsekit_test.go @@ -14,21 +14,21 @@ const TestItem parsekit.ItemType = 1 // Easy access to the parsekit definitions. 
var c, a, m = parsekit.C, parsekit.A, parsekit.M -type MatcherTest struct { - input string - matcher parsekit.Matcher - mustMatch bool - expected string +type TokenHandlerTest struct { + input string + tokenHandler parsekit.TokenHandler + mustMatch bool + expected string } -func RunMatcherTests(t *testing.T, testSet []MatcherTest) { +func RunTokenHandlerTests(t *testing.T, testSet []TokenHandlerTest) { for _, test := range testSet { - RunMatcherTest(t, test) + RunTokenHandlerTest(t, test) } } -func RunMatcherTest(t *testing.T, test MatcherTest) { - output, err, ok := parsekit.NewMatcher(test.matcher, "a match").Parse(test.input) +func RunTokenHandlerTest(t *testing.T, test TokenHandlerTest) { + output, err, ok := parsekit.NewMatcher(test.tokenHandler, "a match").Parse(test.input) if test.mustMatch { if !ok { diff --git a/statehandler.go b/statehandler.go index a6909e7..c49d16b 100644 --- a/statehandler.go +++ b/statehandler.go @@ -2,17 +2,17 @@ package parsekit import "unicode/utf8" -// StateHandler defines the type of function that must be implemented to -// handle a parsing state. +// StateHandler defines the type of function that must be implemented to handle +// a parsing state in a Parser state machine. // -// A StateHandler function gets a P struct as its input. This struct holds +// A StateHandler function gets a ParseAPI struct as its input. This struct holds // all the internal state for the parsing state machine and provides the -// interface that the StateHandler must use to interact with the parser. -type StateHandler func(*P) +// interface that the StateHandler uses to interact with the parser. +type StateHandler func(*ParseAPI) -// P holds the internal state of a parse run and provides an API to +// ParseAPI holds the internal state of a parse run and provides an API to // StateHandler methods to communicate with the parser. 
-type P struct { +type ParseAPI struct { state StateHandler // the function that handles the current state nextState StateHandler // the function that will handle the next state routeStack []StateHandler // route stack, for handling nested parsing @@ -37,7 +37,7 @@ type P struct { // The boolean will be false in case no upcoming rune can be peeked // (end of data or invalid UTF8 character). In this case, the returned rune // will be one of eofRune or invalidRune. -func (p *P) peek(byteOffset int) (rune, int, bool) { +func (p *ParseAPI) peek(byteOffset int) (rune, int, bool) { r, w := utf8.DecodeRuneInString(p.input[p.inputPos+byteOffset:]) return handleRuneError(r, w) } diff --git a/statehandler_emit.go b/statehandler_emit.go index cf8fd0c..f4a7e0f 100644 --- a/statehandler_emit.go +++ b/statehandler_emit.go @@ -4,6 +4,12 @@ import ( "fmt" ) +// Item represents an item that can be emitted from the parser. +type Item struct { + Type ItemType + Value string +} + // ItemType represents the type of a parser Item. // // When creating your own ItemType values, then make use of positive integer @@ -19,26 +25,14 @@ const ItemEOF ItemType = -1 // an error has occurred during parsing. const ItemError ItemType = -2 -// Item represents an item that can be emitted from the parser. -type Item struct { - Type ItemType - Value string -} - // Emit passes a Parser item to the client, including the provided string. -func (p *P) Emit(t ItemType, v string) { +func (p *ParseAPI) Emit(t ItemType, v string) { p.items <- Item{t, v} p.buffer.reset() } -// EmitLiteral passes a Parser item to the client, including accumulated -// string buffer data as a literal string. -func (p *P) EmitLiteral(t ItemType) { - p.Emit(t, p.buffer.asLiteralString()) -} - -// BufLiteral retrieves the contents of the parser buffer (all the runes that -// were added to it using P.Accept()) as a literal string. 
+// BufLiteral retrieves the contents of the parser's string buffer (all the +// runes that were added to it using ParseAPI.Accept()) as a literal string. // // Literal means that if the input had for example the subsequent runes '\' and 'n' // in it, then the literal string would have a backslash and an 'n' it in, not a @@ -46,12 +40,19 @@ func (p *P) EmitLiteral(t ItemType) { // // Retrieving the buffer contents will not affect the buffer itself. New runes can // still be added to it. Only when calling P.Emit(), the buffer will be cleared. -func (p *P) BufLiteral() string { +func (p *ParseAPI) BufLiteral() string { return p.buffer.asLiteralString() } -// BufInterpreted retrieves the contents of the parser buffer (all the runes that -// were added to it using P.Accept()) as an interpreted string. +// EmitLiteral passes a parser Item to the client, including the accumulated +// string buffer data as a literal string. +func (p *ParseAPI) EmitLiteral(t ItemType) { + p.Emit(t, p.BufLiteral()) +} + +// BufInterpreted retrieves the contents of the parser's string buffer (all +// the runes that were added to it using ParseAPI.Accept()) as an +// interpreted string. // // Interpreted means that the contents are treated as a Go double quoted // interpreted string (handling escape codes like \n, \t, \uXXXX, etc.). if the @@ -64,7 +65,7 @@ func (p *P) BufLiteral() string { // // Retrieving the buffer contents will not affect the buffer itself. New runes can // still be added to it. Only when calling P.Emit(), the buffer will be cleared. -func (p *P) BufInterpreted() (string, bool) { +func (p *ParseAPI) BufInterpreted() (string, bool) { s, err := p.buffer.asInterpretedString() if err != nil { p.EmitError( @@ -81,16 +82,12 @@ func (p *P) BufInterpreted() (string, bool) { // This method returns a boolean value, indicating whether or not the string // interpretation was successful. On invalid string data, an error will // automatically be emitted and false will be returned. 
-func (p *P) EmitInterpreted(t ItemType) bool { - s, err := p.buffer.asInterpretedString() - if err != nil { - p.EmitError( - "invalid string: %s (%s, forgot to escape a double quote or backslash maybe?)", - p.buffer.asLiteralString(), err) - return false +func (p *ParseAPI) EmitInterpreted(t ItemType) bool { + if s, ok := p.BufInterpreted(); ok { + p.Emit(t, s) + return true } - p.Emit(t, s) - return true + return false } // Error is used as the error type when parsing errors occur. @@ -115,15 +112,15 @@ func (err *Error) ErrorFull() string { return fmt.Sprintf("%s after line %d, column %d", err, err.Line, err.Column) } -// EmitError emits a Parser error item to the client. -func (p *P) EmitError(format string, args ...interface{}) { +// EmitError emits a parser error item to the client. +func (p *ParseAPI) EmitError(format string, args ...interface{}) { message := fmt.Sprintf(format, args...) p.Emit(ItemError, message) } // UnexpectedInput is used by a StateHandler function to emit an error item // that tells the client that an unexpected rune was encountered in the input. -func (p *P) UnexpectedInput() { +func (p *ParseAPI) UnexpectedInput() { r, _, ok := p.peek(0) switch { case ok: @@ -137,7 +134,7 @@ func (p *P) UnexpectedInput() { } } -func fmtExpects(p *P) string { +func fmtExpects(p *ParseAPI) string { if p.expecting == "" { return "" } diff --git a/statehandler_expects.go b/statehandler_expects.go index 4f8fe48..fb082e8 100644 --- a/statehandler_expects.go +++ b/statehandler_expects.go @@ -1,6 +1,6 @@ package parsekit -// Expects is used to let a state function describe what input it is expecting. +// Expects is used to let a StateHandler function describe what input it is expecting. // This expectation is used in error messages to make them more descriptive. 
// // When defining an expectation inside a StateHandler, you do not need to @@ -13,6 +13,6 @@ package parsekit // 2) there is an invalid UTF8 character on input // // 3) the end of the file was reached. -func (p *P) Expects(description string) { +func (p *ParseAPI) Expects(description string) { p.expecting = description } diff --git a/statehandler_on.go b/statehandler_on.go index 9ef63a4..737bbbf 100644 --- a/statehandler_on.go +++ b/statehandler_on.go @@ -1,12 +1,12 @@ package parsekit -// On checks if the input at the current cursor position matches the provided Matcher. -// On must be chained with another method, which tells the parser what action to -// perform when a match was found: +// On checks if the input at the current cursor position matches the provided +// TokenHandler. On must be chained with another method, which tells the parser +// what action to perform when a match was found: // // 1) On(...).Skip() - Only move cursor forward, ignore the matched runes. // -// 2) On(...).Accept() - Move cursor forward, add matched runes to the string buffer. +// 2) On(...).Accept() - Move cursor forward, add runes to the parser's string buffer. // // 3) On(...).Stay() - Do nothing, the cursor stays at the same position. // @@ -32,16 +32,16 @@ package parsekit // p.RouteTo(stateHandlerC) // } // -// // When there's a "hi" on input, emit it. +// // When there's a "hi" on input, emit a parser item for it. 
// if p.On(parsekit.C.Str("hi")).Accept() { // p.Emit(SomeItemType, p.BufLiteral()) // } -func (p *P) On(matcher Matcher) *matchAction { - m := &MatchDialog{p: p} - if matcher == nil { - panic("internal parser error: matcher argument for On() is nil") +func (p *ParseAPI) On(tokenHandler TokenHandler) *MatchAction { + m := &TokenAPI{p: p} + if tokenHandler == nil { + panic("internal parser error: tokenHandler argument for On() is nil") } - ok := matcher(m) + ok := tokenHandler(m) // Keep track of the last match, to allow parser implementations // to access it in an easy way. Typical use would be something like: @@ -51,7 +51,7 @@ func (p *P) On(matcher Matcher) *matchAction { // } p.LastMatch = string(m.input) - return &matchAction{ + return &MatchAction{ p: p, ok: ok, input: m.input, @@ -60,9 +60,10 @@ func (p *P) On(matcher Matcher) *matchAction { } } -// matchAction is a struct that is used for building the On()-method chain. -type matchAction struct { - p *P +// MatchAction is a struct that is used for building the On()-method chain. +// The On() method will return an initialized struct of this type. +type MatchAction struct { + p *ParseAPI ok bool input []rune output []rune @@ -70,11 +71,12 @@ type matchAction struct { } // Accept tells the parser to move the cursor past a match that was found, -// and to store the input that matched in the string buffer. +// and to store the input that matched in the parser's string buffer. // When no match was found, then no action is taken. -// It returns a routeAction struct, which provides methods that can be used -// to tell the parser what state to go to next. -func (a *matchAction) Accept() bool { +// +// Returns true in case a match was found. +// When no match was found, then no action is taken and false is returned. 
+func (a *MatchAction) Accept() bool { if a.ok { a.p.buffer.writeString(string(a.output)) a.advanceCursor() @@ -83,10 +85,11 @@ func (a *matchAction) Accept() bool { } // Skip tells the parser to move the cursor past a match that was found, -// without storing the actual match in the string buffer. +// without storing the actual match in the parser's string buffer. +// // Returns true in case a match was found. // When no match was found, then no action is taken and false is returned. -func (a *matchAction) Skip() bool { +func (a *MatchAction) Skip() bool { if a.ok { a.advanceCursor() } @@ -95,14 +98,14 @@ func (a *matchAction) Skip() bool { // Stay tells the parser to not move the cursor after finding a match. // Returns true in case a match was found, false otherwise. -func (a *matchAction) Stay() bool { +func (a *MatchAction) Stay() bool { return a.ok } -// advanceCursor advances the rune cursor one position in the input data. -// While doing so, it keeps tracks of newlines, so we can report on -// row + column positions on error. -func (a *matchAction) advanceCursor() { +// advanceCursor advances the input position in the input data. +// While doing so, it keeps track of newlines that are encountered, so we +// can report on line + column positions on error. +func (a *MatchAction) advanceCursor() { a.p.inputPos = a.inputPos for _, r := range a.input { if a.p.newline { diff --git a/statehandler_route.go b/statehandler_route.go index 47a60d4..907a64e 100644 --- a/statehandler_route.go +++ b/statehandler_route.go @@ -1,34 +1,34 @@ package parsekit -// RouteTo tells the parser what StateHandler function to invoke -// in the next parsing cycle. -func (p *P) RouteTo(state StateHandler) *routeFollowupAction { +// RouteTo tells the parser what StateHandler function to invoke on +// the next parse cycle. 
+func (p *ParseAPI) RouteTo(state StateHandler) *RouteFollowupAction { p.nextState = state - return &routeFollowupAction{p} + return &RouteFollowupAction{p} } -// RouteRepeat indicates that on the next parsing cycle, the current +// RouteRepeat tells the parser that on the next parsing cycle, the current // StateHandler must be reinvoked. -func (p *P) RouteRepeat() { +func (p *ParseAPI) RouteRepeat() { p.RouteTo(p.state) } -// RouteReturn tells the parser that on the next cycle the last -// StateHandler that was pushed on the route stack must be invoked. +// RouteReturn tells the parser that on the next cycle the last StateHandler +// that was pushed on the route stack must be invoked. // // Using this method is optional. When implementating a StateHandler that // is used as a sort of subroutine (using constructions like // p.RouteTo(subroutine).ThenReturnHere()), you can refrain from // providing an explicit routing decision from that handler. The parser will // automatically assume a RouteReturn() in that case. -func (p *P) RouteReturn() { +func (p *ParseAPI) RouteReturn() { p.nextState = p.popRoute() } -// routeFollowupAction chains parsing routes. +// RouteFollowupAction chains parsing routes. // It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). 
-type routeFollowupAction struct { - p *P +type RouteFollowupAction struct { + p *ParseAPI } // ThenTo schedules a StateHandler that must be invoked after the RouteTo @@ -36,7 +36,7 @@ type routeFollowupAction struct { // For example: // // p.RouteTo(handlerA).ThenTo(handlerB) -func (a *routeFollowupAction) ThenTo(state StateHandler) { +func (a *RouteFollowupAction) ThenTo(state StateHandler) { a.p.pushRoute(state) } @@ -45,18 +45,18 @@ func (a *routeFollowupAction) ThenTo(state StateHandler) { // For example: // // p.RouteTo(handlerA).ThenReturnHere() -func (a *routeFollowupAction) ThenReturnHere() { +func (a *RouteFollowupAction) ThenReturnHere() { a.p.pushRoute(a.p.state) } // pushRoute adds the StateHandler to the route stack. // This is used for implementing nested parsing. -func (p *P) pushRoute(state StateHandler) { +func (p *ParseAPI) pushRoute(state StateHandler) { p.routeStack = append(p.routeStack, state) } // popRoute pops the last pushed StateHandler from the route stack. -func (p *P) popRoute() StateHandler { +func (p *ParseAPI) popRoute() StateHandler { last := len(p.routeStack) - 1 head, tail := p.routeStack[:last], p.routeStack[last] p.routeStack = head @@ -66,8 +66,8 @@ func (p *P) popRoute() StateHandler { // ExpectEndOfFile can be used from a StateHandler function to indicate that // your parser expects to be at the end of the file. This will schedule // a parsekit-provided StateHandler which will do the actual check for this. -func (p *P) ExpectEndOfFile() { - p.RouteTo(func(p *P) { +func (p *ParseAPI) ExpectEndOfFile() { + p.RouteTo(func(p *ParseAPI) { p.Expects("end of file") if p.On(A.EndOfFile).Stay() { p.Emit(ItemEOF, "EOF") diff --git a/tokenhandler.go b/tokenhandler.go new file mode 100644 index 0000000..95d0663 --- /dev/null +++ b/tokenhandler.go @@ -0,0 +1,192 @@ +package parsekit + +import ( + "fmt" +) + +// TokenHandler is the function type that is involved in turning a low level +// stream of UTF8 runes into parsing tokens. 
Its purpose is to check if input +// data matches some kind of pattern and to report back the match. +// +// A TokenHandler is to be used in conjunction with parsekit.ParseAPI.On() or +// parsekit.NewMatcher(). +// +// A TokenHandler function gets a TokenAPI as its input and returns a boolean to +// indicate whether or not it found a match on the input. The TokenAPI is used +// for retrieving input data to match against and for reporting back results. +type TokenHandler func(t *TokenAPI) bool + +// TokenAPI is used by TokenHandler functions to retrieve runes from the +// input to match against and to report back results. +// +// Basic operation: +// +// To retrieve the next rune from the input, the TokenHandler function can call +// the TokenAPI.NextRune() method. +// +// The TokenHandler function can then evaluate the retrieved rune and either +// accept or skip the rune. When accepting it using TokenAPI.Accept(), the rune +// is added to the resulting output of the TokenAPI. When using TokenAPI.Skip(), +// the rune will not be added to the output. It is mandatory for a TokenHandler +// to call either Accept() or Skip() after retrieving a rune, before calling +// NextRune() again. +// +// Eventually, the TokenHandler function must return a boolean value, indicating +// whether or not a match was found. When true, then the calling code will +// use the runes that were accepted into the TokenAPI's resulting output. +// +// Forking operation for easy lookahead support: +// +// Sometimes, a TokenHandler function must be able to perform a lookahead, which +// might either succeed or fail. In case of a failing lookahead, the state +// of the TokenAPI must be brought back to the original state. +// +// The way in which this is supported, is by forking a TokenAPI by calling +// TokenAPI.Fork(). This will return a child TokenAPI, with an empty +// output buffer, but using the same input cursor position as the forked parent. 
+// +// The TokenHandler function can then use the same interface as described for +// normal operation to retrieve runes from the input and to fill the resulting +// output. When the TokenHandler function decides that the lookahead was successful, +// then the method TokenAPI.Merge() can be called on the forked child to +// append the resulting output from the child to the parent's resulting output, +// and to update the parent input cursor position to that of the child. +// +// When the TokenHandler function decides that the lookahead was unsuccessful, +// then it can simply discard the forked child. The parent TokenAPI was never +// modified, so a new match can be safely started using that parent, as if the +// lookahead never happened. +type TokenAPI struct { + p *ParseAPI // parser state, used to retrieve input data to match against (TODO should be tiny interface) + inputOffset int // the byte offset into the input + input []rune // a slice of runes that represents all retrieved input runes for the Matcher + output []rune // a slice of runes that represents the accepted output runes for the Matcher + currRune *runeInfo // hold information for the last rune that was read from the input + parent *TokenAPI // the parent MatchDialog, in case this one was forked +} + +// runeInfo describes a single rune and its metadata. +type runeInfo struct { + Rune rune // an UTF8 rune + ByteSize int // the number of bytes in the rune + OK bool // false when the rune represents an invalid UTF8 rune or EOF +} + +// NextRune retrieves the next rune from the input. +// +// It returns the rune and a boolean. The boolean will be false in case an +// invalid UTF8 rune or the end of the file was encountered. +// +// After using NextRune() to retrieve a rune, Accept() or Skip() can be called +// to respectively add the rune to the TokenAPI's resulting output or to +// fully ignore it. 
This way, a TokenHandler has full control over what runes are +// significant for the resulting output of that TokenHandler. +// +// After using NextRune(), this method can not be reinvoked, until the last read +// rune is explicitly accepted or skipped as described above. +func (t *TokenAPI) NextRune() (rune, bool) { + if t.currRune != nil { + panic("internal Matcher error: NextRune() was called without accepting or skipping the previously read rune") + } + r, w, ok := t.p.peek(t.inputOffset) + t.currRune = &runeInfo{r, w, ok} + if ok { + t.input = append(t.input, r) + } + return r, ok +} + +// Fork splits off a child TokenAPI, containing the same input cursor position +// as the parent TokenAPI, but with all other data in a fresh state. +// +// By forking, a TokenHandler function can freely work with a TokenAPI, without +// affecting the parent TokenAPI. This is for example useful when the +// TokenHandler function must perform some form of lookahead. +// +// When a successful match was found, the TokenHandler function can call +// TokenAPI.Merge() on the forked child to have the resulting output added +// to the parent TokenAPI. +// +// When no match was found, the forked child can simply be discarded. +// +// Example case: A TokenHandler checks for a sequence of runes: 'a', 'b', 'c', 'd'. +// This is done in 4 steps and only after finishing all steps, the TokenHandler +// function can confirm a successful match. 
The TokenHandler function for this +// case could look like this (yes, it's naive, but it shows the point): +// TODO make proper tested example +// +// func MatchAbcd(t *TokenAPI) bool { +// child := t.Fork() // fork to keep t from input untouched +// for _, letter := range []rune{'a', 'b', 'c', 'd'} { +// if r, ok := child.NextRune(); !ok || r != letter { +// return false // report mismatch, t is left untouched +// } +// child.Accept() // add rune to child output +// } +// child.Merge() // we have a match, add resulting output to parent +// return true // and report the successful match +// } +func (t *TokenAPI) Fork() *TokenAPI { + return &TokenAPI{ + p: t.p, + inputOffset: t.inputOffset, + parent: t, + } +} + +// Accept will add the last rune as read by TokenAPI.NextRune() to the resulting +// output of the TokenAPI. +func (t *TokenAPI) Accept() { + t.checkAllowedCall("Accept()") + t.output = append(t.output, t.currRune.Rune) + t.inputOffset += t.currRune.ByteSize + t.currRune = nil +} + +// Skip will ignore the last rune as read by NextRune(). +func (t *TokenAPI) Skip() { + t.checkAllowedCall("Skip()") + t.inputOffset += t.currRune.ByteSize + t.currRune = nil +} + +func (t *TokenAPI) checkAllowedCall(name string) { + if t.currRune == nil { + panic(fmt.Sprintf("internal Matcher error: %s was called without a prior call to NextRune()", name)) + } + if !t.currRune.OK { + panic(fmt.Sprintf("internal Matcher error: %s was called, but prior call to NextRun() did not return OK (EOF or invalid rune)", name)) + } +} + +// Merge merges the resulting output from a forked child TokenAPI back into +// its parent: The runes that are accepted in the child are added to the parent +// runes and the parent's input cursor position is advanced to the child's +// cursor position. +// +// After the merge, the child TokenAPI is reset so it can immediately be +// reused for performing another match (all data are cleared, except for the +// input offset which is kept at its current position). 
+//
+// Merge panics when it is called on a TokenAPI that has no parent (i.e. one
+// that was not created using Fork()). Otherwise it always returns true, which
+// lets a TokenHandler report a match using "return child.Merge()".
+func (t *TokenAPI) Merge() bool {
+	if t.parent == nil {
+		panic("internal parser error: Cannot call Merge() on a non-forked TokenAPI")
+	}
+	t.parent.input = append(t.parent.input, t.input...)
+	t.parent.output = append(t.parent.output, t.output...)
+	t.parent.inputOffset = t.inputOffset
+	// Reset the child for reuse; its input offset is deliberately kept.
+	t.ClearOutput()
+	t.ClearInput()
+	return true
+}
+
+// ClearOutput clears the resulting output for the TokenAPI, but it keeps
+// the input and input offset as-is.
+func (t *TokenAPI) ClearOutput() {
+	t.output = []rune{}
+}
+
+// ClearInput clears the input for the TokenAPI, but it keeps the output
+// and input offset as-is.
+func (t *TokenAPI) ClearInput() {
+	t.input = []rune{}
+}
diff --git a/tokenhandlers_builtin.go b/tokenhandlers_builtin.go
new file mode 100644
index 0000000..c565088
--- /dev/null
+++ b/tokenhandlers_builtin.go
@@ -0,0 +1,558 @@
+package parsekit
+
+import (
+	"fmt"
+	"strings"
+	"unicode"
+)
+
+// C provides convenient access to a range of parser/combinators that can be
+// used to construct TokenHandler functions.
+//
+// When using C in your own parser, then it is advised to create a variable
+// to reference it:
+//
+//   var c = parsekit.C
+//
+// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
+var C = struct {
+	Rune       func(rune) TokenHandler
+	Runes      func(...rune) TokenHandler
+	RuneRange  func(rune, rune) TokenHandler
+	Str        func(string) TokenHandler
+	StrNoCase  func(string) TokenHandler
+	Any        func(...TokenHandler) TokenHandler
+	Not        func(TokenHandler) TokenHandler
+	Opt        func(TokenHandler) TokenHandler
+	Seq        func(...TokenHandler) TokenHandler
+	Rep        func(times int, handler TokenHandler) TokenHandler
+	Min        func(min int, handler TokenHandler) TokenHandler
+	Max        func(max int, handler TokenHandler) TokenHandler
+	ZeroOrMore func(TokenHandler) TokenHandler
+	OneOrMore  func(TokenHandler) TokenHandler
+	MinMax     func(min int, max int, handler TokenHandler) TokenHandler
+	// The parameter names follow MatchSeparated, which is what gets assigned
+	// to this field: the separator comes first, the separated handler second.
+	Separated func(separator TokenHandler, separated TokenHandler) TokenHandler // TODO reverse args for consistency
+	Except    func(except TokenHandler, handler TokenHandler) TokenHandler
+}{
+	Rune:       MatchRune,
+	Runes:      MatchRunes,
+	RuneRange:  MatchRuneRange,
+	Str:        MatchStr,
+	StrNoCase:  MatchStrNoCase,
+	Opt:        MatchOpt,
+	Any:        MatchAny,
+	Not:        MatchNot,
+	Seq:        MatchSeq,
+	Rep:        MatchRep,
+	Min:        MatchMin,
+	Max:        MatchMax,
+	ZeroOrMore: MatchZeroOrMore,
+	OneOrMore:  MatchOneOrMore,
+	MinMax:     MatchMinMax,
+	Separated:  MatchSeparated,
+	Except:     MatchExcept,
+}
+
+// MatchRune creates a TokenHandler function that checks if the next rune from
+// the input matches the provided rune. On a match, the rune is accepted into
+// the output.
+func MatchRune(expected rune) TokenHandler {
+	return func(t *TokenAPI) bool {
+		input, ok := t.NextRune()
+		if ok && input == expected {
+			t.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRunes creates a TokenHandler function that checks if the next rune
+// from the input is one of the provided runes.
+func MatchRunes(expected ...rune) TokenHandler {
+	// The expected runes are packed into a string once, so every invocation
+	// of the handler can use a simple ContainsRune check.
+	s := string(expected)
+	return func(t *TokenAPI) bool {
+		input, ok := t.NextRune()
+		if ok && strings.ContainsRune(s, input) {
+			t.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRuneRange creates a TokenHandler function that checks if the next rune
+// from the input is contained by the provided rune range.
+//
+// The rune range is defined by a start and an end rune, inclusive, so:
+//
+//   MatchRuneRange('g', 'k')
+//
+// creates a TokenHandler that will match any of 'g', 'h', 'i', 'j' or 'k'.
+//
+// When the end rune lies before the start rune, the created TokenHandler
+// panics as soon as it is invoked.
+func MatchRuneRange(start rune, end rune) TokenHandler {
+	return func(t *TokenAPI) bool {
+		if end < start {
+			panic(fmt.Sprintf("internal parser error: MatchRuneRange definition error: start %q must not be > end %q", start, end))
+		}
+		input, ok := t.NextRune()
+		if ok && input >= start && input <= end {
+			t.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchStr creates a TokenHandler that will check if the upcoming runes on the
+// input match the provided string. It is built as a sequence of single rune
+// matches, one for every rune in the string.
+// TODO make this a more efficient string-level match?
+func MatchStr(expected string) TokenHandler {
+	var handlers = []TokenHandler{}
+	for _, r := range expected {
+		handlers = append(handlers, MatchRune(r))
+	}
+	return MatchSeq(handlers...)
+}
+
+// MatchStrNoCase creates a TokenHandler that will check if the upcoming runes
+// on the input match the provided string in a case-insensitive manner.
+// TODO make this a more efficient string-level match?
+func MatchStrNoCase(expected string) TokenHandler {
+	var handlers = []TokenHandler{}
+	for _, r := range expected {
+		u := unicode.ToUpper(r)
+		l := unicode.ToLower(r)
+		// The rune itself is accepted as well: a title-case rune is neither
+		// its own upper case nor its own lower case form.
+		handlers = append(handlers, MatchRunes(r, u, l))
+	}
+	return MatchSeq(handlers...)
+}
+
+// MatchOpt creates a TokenHandler that makes the provided TokenHandler optional.
+// When the provided TokenHandler applies, then its output is used, otherwise
+// no output is generated but still a successful match is reported.
+func MatchOpt(handler TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		if attempt := t.Fork(); handler(attempt) {
+			attempt.Merge()
+		}
+		return true
+	}
+}
+
+// MatchSeq creates a TokenHandler that checks if the provided TokenHandlers can be
+// applied in their exact order. Only when all of the handlers apply, the
+// sequence reports a successful match and its output is merged.
+func MatchSeq(handlers ...TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		sequence := t.Fork()
+		for i := range handlers {
+			if applies := handlers[i](sequence); !applies {
+				return false
+			}
+		}
+		sequence.Merge()
+		return true
+	}
+}
+
+// MatchAny creates a TokenHandler that checks if any of the provided TokenHandlers
+// can be applied. They are tried in their provided order, each one on a fresh
+// fork. The first TokenHandler that applies is used for reporting back a match.
+func MatchAny(handlers ...TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		for i := range handlers {
+			candidate := t.Fork()
+			if !handlers[i](candidate) {
+				continue
+			}
+			return candidate.Merge()
+		}
+		return false
+	}
+}
+
+// MatchNot creates a TokenHandler that checks if the provided TokenHandler applies to
+// the current input. If it does, then a failed match will be reported. If it
+// does not, then the next rune from the input will be reported as a match.
+func MatchNot(handler TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		// The handler is only probed on a fork, which is discarded afterwards.
+		if handler(t.Fork()) {
+			return false
+		}
+		if _, ok := t.NextRune(); !ok {
+			return false
+		}
+		t.Accept()
+		return true
+	}
+}
+
+// MatchRep creates a TokenHandler that checks if the provided TokenHandler can be
+// applied exactly the provided amount of times.
+//
+// Note that the input can contain more than the provided number of matches, e.g.:
+//
+//   MatchRep(4, MatchRune('X'))
+//
+// will not match input "XXX", it will match input "XXXX", but also "XXXXXX".
+// In that last case, there will be a remainder "XX" on the input.
+func MatchRep(times int, handler TokenHandler) TokenHandler {
+	min, max := times, times
+	return matchMinMax(min, max, handler)
+}
+
+// MatchMin creates a TokenHandler that checks if the provided TokenHandler can be
+// applied at least the provided minimum number of times.
+// When more matches are possible, these will be included in the output.
+func MatchMin(min int, handler TokenHandler) TokenHandler {
+	const noMax = -1 // a negative max disables the upper bound
+	return matchMinMax(min, noMax, handler)
+}
+
+// MatchMax creates a TokenHandler that checks if the provided TokenHandler can be
+// applied at most the provided maximum number of times.
+// When more matches are possible, these are not consumed by this TokenHandler.
+// Zero matches are considered a successful match.
+func MatchMax(max int, handler TokenHandler) TokenHandler {
+	const noMin = 0
+	return matchMinMax(noMin, max, handler)
+}
+
+// MatchZeroOrMore creates a TokenHandler that checks if the provided TokenHandler can
+// be applied zero or more times. All matches will be included in the output.
+// Zero matches are considered a successful match.
+func MatchZeroOrMore(handler TokenHandler) TokenHandler {
+	const noMin, noMax = 0, -1 // a negative max disables the upper bound
+	return matchMinMax(noMin, noMax, handler)
+}
+
+// MatchOneOrMore creates a TokenHandler that checks if the provided TokenHandler can
+// be applied one or more times. All matches will be included in the output.
+func MatchOneOrMore(handler TokenHandler) TokenHandler {
+	const noMax = -1 // a negative max disables the upper bound
+	return matchMinMax(1, noMax, handler)
+}
+
+// MatchMinMax creates a TokenHandler that checks if the provided TokenHandler can
+// be applied between the provided minimum and maximum number of times,
+// inclusive. All matches will be included in the output.
+//
+// MatchMinMax panics right away when min or max is negative. When max is
+// smaller than min, the created TokenHandler panics as soon as it is invoked.
+func MatchMinMax(min int, max int, handler TokenHandler) TokenHandler {
+	if max < 0 {
+		panic("internal parser error: MatchMinMax definition error: max must be >= 0 ")
+	}
+	if min < 0 {
+		panic("internal parser error: MatchMinMax definition error: min must be >= 0 ")
+	}
+	return matchMinMax(min, max, handler)
+}
+
+// matchMinMax is the shared implementation behind the repetition handlers.
+// It creates a TokenHandler that requires the provided handler to apply at
+// least min times, and that then takes further matches up to max times in
+// total. A negative max means that there is no upper bound.
+func matchMinMax(min int, max int, handler TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		// Of all the exported wrappers, only MatchMinMax can pass a max that
+		// is smaller than min, hence the name used in the message.
+		if max >= 0 && min > max {
+			panic(fmt.Sprintf("internal parser error: MatchMinMax definition error: max %d must not be < min %d", max, min))
+		}
+		child := t.Fork()
+		total := 0
+		// Check for the minimum required amount of matches.
+		for total < min {
+			total++
+			if !handler(child) {
+				return false
+			}
+		}
+		// No specified max: include the rest of the available matches.
+		// Specified max: include the rest of the available matches, up to the max.
+		child.Merge()
+		for max < 0 || total < max {
+			total++
+			start := child.inputOffset
+			if !handler(child) {
+				break
+			}
+			child.Merge()
+			// Without an upper bound, a handler that applies without consuming
+			// any input (e.g. an optional handler) would match at the same
+			// position forever. Take that match and stop.
+			if max < 0 && child.inputOffset == start {
+				break
+			}
+		}
+		return true
+	}
+}
+
+// MatchSeparated creates a TokenHandler that checks for a pattern of one or more
+// TokenHandlers of one type (the separated), separated by TokenHandler of another type
+// (the separator). All matches (separated + separator) are included in the
+// output. Note the argument order: the separator is provided first.
+func MatchSeparated(separator TokenHandler, separated TokenHandler) TokenHandler {
+	return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated)))
+}
+
+// MatchExcept creates a TokenHandler that checks if the provided TokenHandler can be
+// applied to the upcoming input. It also checks if the except TokenHandler can be
+// applied. If the handler applies, but the except TokenHandler as well, then the match
+// as a whole will be treated as a mismatch.
+func MatchExcept(except TokenHandler, handler TokenHandler) TokenHandler {
+	return func(t *TokenAPI) bool {
+		// The except handler is only probed on a fork, which is discarded.
+		if except(t.Fork()) {
+			return false
+		}
+		return handler(t)
+	}
+}
+
+// A provides convenient access to a range of atoms that can be used to
+// build TokenHandlers or parser rules.
+//
+// In parsekit, an atom is defined as a ready for use TokenHandler function.
+//
+// When using A in your own parser, then it is advised to create a variable
+// to reference it:
+//
+//   var a = parsekit.A
+//
+// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
+var A = struct {
+	EndOfFile             TokenHandler
+	AnyRune               TokenHandler
+	Space                 TokenHandler
+	Tab                   TokenHandler
+	CR                    TokenHandler
+	LF                    TokenHandler
+	CRLF                  TokenHandler
+	Excl                  TokenHandler
+	DoubleQuote           TokenHandler
+	Hash                  TokenHandler
+	Dollar                TokenHandler
+	Percent               TokenHandler
+	Amp                   TokenHandler
+	SingleQuote           TokenHandler
+	RoundOpen             TokenHandler
+	RoundClose            TokenHandler
+	Asterisk              TokenHandler
+	Plus                  TokenHandler
+	Comma                 TokenHandler
+	Minus                 TokenHandler
+	Dot                   TokenHandler
+	Slash                 TokenHandler
+	Colon                 TokenHandler
+	Semicolon             TokenHandler
+	AngleOpen             TokenHandler
+	Equal                 TokenHandler
+	AngleClose            TokenHandler
+	Question              TokenHandler
+	At                    TokenHandler
+	SquareOpen            TokenHandler
+	Backslash             TokenHandler
+	SquareClose           TokenHandler
+	Caret                 TokenHandler
+	Underscore            TokenHandler
+	Backquote             TokenHandler
+	CurlyOpen             TokenHandler
+	Pipe                  TokenHandler
+	CurlyClose            TokenHandler
+	Tilde                 TokenHandler
+	Newline               TokenHandler
+	Whitespace            TokenHandler
+	WhitespaceAndNewlines TokenHandler
+	EndOfLine             TokenHandler
+	Digit                 TokenHandler
+	ASCII                 TokenHandler
+	ASCIILower            TokenHandler
+	ASCIIUpper            TokenHandler
+	HexDigit              TokenHandler
+}{
+	EndOfFile:             MatchEndOfFile(),
+	AnyRune:               MatchAnyRune(),
+	Space:                 C.Rune(' '),
+	Tab:                   C.Rune('\t'),
+	CR:                    C.Rune('\r'),
+	LF:                    C.Rune('\n'),
+	CRLF:                  C.Str("\r\n"),
+	Excl:                  C.Rune('!'),
+	DoubleQuote:           C.Rune('"'),
+	Hash:                  C.Rune('#'),
+	Dollar:                C.Rune('$'),
+	Percent:               C.Rune('%'),
+	Amp:                   C.Rune('&'),
+	SingleQuote:           C.Rune('\''),
+	RoundOpen:             C.Rune('('),
+	RoundClose:            C.Rune(')'),
+	Asterisk:              C.Rune('*'),
+	Plus:                  C.Rune('+'),
+	Comma:                 C.Rune(','),
+	Minus:                 C.Rune('-'),
+	Dot:                   C.Rune('.'),
+	Slash:                 C.Rune('/'),
+	Colon:                 C.Rune(':'),
+	Semicolon:             C.Rune(';'),
+	AngleOpen:             C.Rune('<'),
+	Equal:                 C.Rune('='),
+	AngleClose:            C.Rune('>'),
+	Question:              C.Rune('?'),
+	At:                    C.Rune('@'),
+	SquareOpen:            C.Rune('['),
+	Backslash:             C.Rune('\\'),
+	SquareClose:           C.Rune(']'),
+	Caret:                 C.Rune('^'),
+	Underscore:            C.Rune('_'),
+	Backquote:             C.Rune('`'),
+	CurlyOpen:             C.Rune('{'),
+	Pipe:                  C.Rune('|'),
+	CurlyClose:            C.Rune('}'),
+	Tilde:                 C.Rune('~'),
+	// Newline was declared above but never initialized, which left it a nil
+	// TokenHandler that panics when invoked.
+	// NOTE(review): the definition below mirrors the newline alternatives
+	// that are used by WhitespaceAndNewlines and EndOfLine - confirm.
+	Newline:               C.Any(C.Str("\r\n"), C.Rune('\n')),
+	Whitespace:            C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))),
+	WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))),
+	EndOfLine:             C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()),
+	Digit:                 C.RuneRange('0', '9'),
+	ASCII:                 C.RuneRange('\x00', '\x7F'),
+	ASCIILower:            C.RuneRange('a', 'z'),
+	ASCIIUpper:            C.RuneRange('A', 'Z'),
+	HexDigit:              C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')),
+}
+
+// MatchEndOfFile creates a TokenHandler that checks if the end of the input data
+// has been reached. This TokenHandler will never produce output. It only reports
+// a successful or a failing match through its boolean return value.
+func MatchEndOfFile() TokenHandler {
+	return func(t *TokenAPI) bool {
+		// The rune is read on a fork, so t itself is left untouched.
+		fork := t.Fork()
+		input, ok := fork.NextRune()
+		return !ok && input == eofRune
+	}
+}
+
+// MatchAnyRune creates a TokenHandler function that checks if a valid rune can be
+// read from the input. It reports back a successful match if the end of the
+// input has not yet been reached and the upcoming input is a valid UTF8 rune.
+func MatchAnyRune() TokenHandler {
+	return func(t *TokenAPI) bool {
+		if _, ok := t.NextRune(); !ok {
+			return false
+		}
+		t.Accept()
+		return true
+	}
+}
+
+// M provides convenient access to a range of modifiers (which in their nature are
+// parser/combinators) that can be used when creating TokenHandler functions.
+//
+// In parsekit, a modifier is defined as a TokenHandler function that modifies the
+// resulting output of another TokenHandler in some way. It does not do any matching
+// against input of its own.
+//
+// When using M in your own parser, then it is advised to create a variable
+// to reference it:
+//
+//   var m = parsekit.M
+//
+// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
+var M = struct {
+	Drop             func(TokenHandler) TokenHandler
+	Trim             func(handler TokenHandler, cutset string) TokenHandler      // TODO reverse arguments?
+	TrimLeft         func(handler TokenHandler, cutset string) TokenHandler      // TODO reverse arguments?
+	TrimRight        func(handler TokenHandler, cutset string) TokenHandler      // TODO reverse arguments?
+	ToLower          func(TokenHandler) TokenHandler
+	ToUpper          func(TokenHandler) TokenHandler
+	Replace          func(handler TokenHandler, replaceWith string) TokenHandler // TODO reverse arguments?
+	ModifyByCallback func(TokenHandler, func(string) string) TokenHandler
+}{
+	Drop:             ModifyDrop,
+	Trim:             ModifyTrim,
+	TrimLeft:         ModifyTrimLeft,
+	TrimRight:        ModifyTrimRight,
+	ToLower:          ModifyToLower,
+	ToUpper:          ModifyToUpper,
+	Replace:          ModifyReplace,
+	ModifyByCallback: ModifyByCallback,
+}
+
+// ModifyDrop creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then its output is discarded completely.
+//
+// Note that if the TokenHandler does not apply, a mismatch will be reported back,
+// even though we would have dropped the output anyway.
So if you would like
+// to drop optional whitespace, then use something like:
+//
+//   M.Drop(C.Opt(A.Whitespace))
+//
+// instead of:
+//
+//   M.Drop(A.Whitespace)
+//
+// Since whitespace is defined as "1 or more spaces and/or tabs", the input
+// string "bork" would not match against the second form, but " bork" would.
+// In both cases, it would match the first form.
+func ModifyDrop(handler TokenHandler) TokenHandler {
+	discard := func(string) string { return "" }
+	return ModifyByCallback(handler, discard)
+}
+
+// ModifyTrim creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then the characters from the provided cutset are trimmed from
+// both the left and the right side of its output.
+func ModifyTrim(handler TokenHandler, cutset string) TokenHandler {
+	const left, right = true, true
+	return modifyTrim(handler, cutset, left, right)
+}
+
+// ModifyTrimLeft creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then the characters from the provided cutset are trimmed from
+// the left side of its output.
+func ModifyTrimLeft(handler TokenHandler, cutset string) TokenHandler {
+	const left, right = true, false
+	return modifyTrim(handler, cutset, left, right)
+}
+
+// ModifyTrimRight creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then the characters from the provided cutset are trimmed from
+// the right side of its output.
+func ModifyTrimRight(handler TokenHandler, cutset string) TokenHandler {
+	const left, right = false, true
+	return modifyTrim(handler, cutset, left, right)
+}
+
+// modifyTrim is the shared implementation for the trim modifiers. The two
+// booleans select the side(s) of the output from which the characters in the
+// cutset are trimmed.
+func modifyTrim(handler TokenHandler, cutset string, trimLeft bool, trimRight bool) TokenHandler {
+	return ModifyByCallback(handler, func(s string) string {
+		switch {
+		case trimLeft && trimRight:
+			return strings.Trim(s, cutset)
+		case trimLeft:
+			return strings.TrimLeft(s, cutset)
+		case trimRight:
+			return strings.TrimRight(s, cutset)
+		}
+		return s
+	})
+}
+
+// ModifyToUpper creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then its output is taken and converted into upper case.
+func ModifyToUpper(handler TokenHandler) TokenHandler {
+	upper := strings.ToUpper
+	return ModifyByCallback(handler, upper)
+}
+
+// ModifyToLower creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then its output is taken and converted into lower case.
+func ModifyToLower(handler TokenHandler) TokenHandler {
+	lower := strings.ToLower
+	return ModifyByCallback(handler, lower)
+}
+
+// ModifyReplace creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then its output is replaced by the provided string.
+func ModifyReplace(handler TokenHandler, replaceWith string) TokenHandler {
+	replace := func(string) string { return replaceWith }
+	return ModifyByCallback(handler, replace)
+}
+
+// ModifyByCallback creates a TokenHandler that checks if the provided TokenHandler applies.
+// If it does, then its output is taken and it is fed to the provided modfunc.
+// This is a simple function that takes a string on input and returns a possibly
+// modified string on output. The return value of the modfunc will replace the
+// resulting output.
+func ModifyByCallback(handler TokenHandler, modfunc func(string) string) TokenHandler {
+	return func(t *TokenAPI) bool {
+		// The handler runs on a fork, so its output can be rewritten before
+		// it is merged into t.
+		child := t.Fork()
+		if !handler(child) {
+			return false
+		}
+		modified := modfunc(string(child.output))
+		child.output = []rune(modified)
+		child.Merge()
+		return true
+	}
+}
diff --git a/matcher_builtin_test.go b/tokenhandlers_builtin_test.go
similarity index 97%
rename from matcher_builtin_test.go
rename to tokenhandlers_builtin_test.go
index 8579c64..950f513 100644
--- a/matcher_builtin_test.go
+++ b/tokenhandlers_builtin_test.go
@@ -8,7 +8,7 @@ import (
 )
 
 func TestCombinators(t *testing.T) {
-	RunMatcherTests(t, []MatcherTest{
+	RunTokenHandlerTests(t, []TokenHandlerTest{
 		{"xxx", c.Rune('x'), true, "x"},
 		{"x ", c.Rune(' '), false, ""},
 		{"aa", c.RuneRange('b', 'e'), false, ""},
@@ -79,7 +79,7 @@ func TestCombinators(t *testing.T) {
 }
 
 func TestAtoms(t *testing.T) {
-	RunMatcherTests(t, []MatcherTest{
+	RunTokenHandlerTests(t, []TokenHandlerTest{
 		{"", a.EndOfFile, true, ""},
 		{"⌘", a.AnyRune, true, "⌘"},
 		{"\xbc", a.AnyRune, false, ""}, // invalid UTF8 rune
@@ -158,7 +158,7 @@ func TestAtoms(t *testing.T) {
 }
 
 func TestModifiers(t *testing.T) {
-	RunMatcherTests(t, []MatcherTest{
+	RunTokenHandlerTests(t, []TokenHandlerTest{
 		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"},
 		{" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
 		{" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"},
@@ -172,6 +172,30 @@ func TestModifiers(t *testing.T) {
 	})
 }
 
+func TestSequenceOfRunes(t *testing.T) {
+	sequence := c.Seq(
+		a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen,
+		a.RoundClose, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash,
+		a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question,
+		a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore,
+		a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde,
+	)
+	input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+	parser :=
parsekit.NewParser(func(p *parsekit.ParseAPI) { + p.Expects("Sequence of runes") + if p.On(sequence).Accept() { + p.EmitLiteral(TestItem) + } + }) + item, err, ok := parser.Parse(input).Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if item.Value != input { + t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) + } +} + // I know, this is hell, but that's the whole point for this test :-> func TestCombination(t *testing.T) { demonic := c.Seq( @@ -194,34 +218,10 @@ func TestCombination(t *testing.T) { c.Opt(a.SquareClose), ) - RunMatcherTests(t, []MatcherTest{ + RunTokenHandlerTests(t, []TokenHandlerTest{ {"[ \t >>>Hello, world!<<< ]", demonic, true, "[>>>5, WORLD<<<]"}, {"[ \t >>>Hello, world!<<< ", demonic, true, "[>>>5, WORLD<<<"}, {">>>HellohellO, world!<<< ]", demonic, true, ">>>10, WORLD<<<]"}, {"[ \t >>>HellohellO , , , world!<<< ", demonic, true, "[>>>10, WORLD<<<"}, }) } - -func TestSequenceOfRunes(t *testing.T) { - sequence := c.Seq( - a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, - a.RoundClose, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash, - a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question, - a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore, - a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde, - ) - input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" - parser := parsekit.NewParser(func(p *parsekit.P) { - p.Expects("Sequence of runes") - if p.On(sequence).Accept() { - p.EmitLiteral(TestItem) - } - }) - item, err, ok := parser.Parse(input).Next() - if !ok { - t.Fatalf("Parsing failed: %s", err) - } - if item.Value != input { - t.Fatalf("Unexpected output from parser:\nexpected: %s\nactual: %s\n", input, item.Value) - } -}