From 2751c7800376afad3ff94c5d2f9ce1301cec86dd Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Sat, 25 May 2019 22:53:04 +0000 Subject: [PATCH] Got rid of the full On()...etc chains for both code clarity and usability clarity. Working on good examples and shaving the API's accordingly. --- example_basiccalculator_test.go | 140 +++++++++++++++++++++++ example_dutchpostcode_test.go | 18 +-- example_hellomatcher_test.go | 58 ++++++++++ example_helloparser_test.go | 111 ++++++++++++++++++ examples_test.go | 195 +------------------------------- matcher_builtin.go | 2 +- matcher_builtin_test.go | 2 +- parsekit.go | 2 +- statehandler.go | 109 ++++-------------- statehandler_emit.go | 42 ++++++- statehandler_expects.go | 18 +++ statehandler_on.go | 170 ++++++++-------------------- statehandler_route.go | 76 +++++++++++++ 13 files changed, 530 insertions(+), 413 deletions(-) create mode 100644 example_basiccalculator_test.go create mode 100644 example_hellomatcher_test.go create mode 100644 example_helloparser_test.go create mode 100644 statehandler_expects.go create mode 100644 statehandler_route.go diff --git a/example_basiccalculator_test.go b/example_basiccalculator_test.go new file mode 100644 index 0000000..5d792a9 --- /dev/null +++ b/example_basiccalculator_test.go @@ -0,0 +1,140 @@ +// Let's write a small example for parsing a really basic calculator. +// The calculator understands input that looks like: +// +// 10 + 20 - 8+4 +// +// So positive numbers that can be either added or subtracted, and whitespace +// is ignored. +package parsekit_test + +import ( + "fmt" + "strconv" + + "git.makaay.nl/mauricem/go-parsekit" +) + +// When writing a parser, it's a good start to use the parser/combinator +// functionality of parsekit to create some Matcher functions. These functions +// can later be used in the parser state machine to check for matching strings +// on the input data. 
+// +// For the calculator, we only need a definition of "number, surrounded by +// optional whitespace". Skipping whitespace could be a part of the StateHandler +// functions below too, but including it in a Matcher makes things really +// practical. +func createNumberMatcher() parsekit.Matcher { + // Easy access to parsekit definition. + c, a, m := parsekit.C, parsekit.A, parsekit.M + + whitespace := m.Drop(c.Opt(a.Whitespace)) + return c.Seq(whitespace, c.OneOrMore(a.Digit), whitespace) +} + +var calcNumber = createNumberMatcher() + +// We need to define the ItemTypes that we will use for emitting Items +// during the parsing process. +const ( + numberType parsekit.ItemType = iota + addType + subtractType +) + +// We also need to define the state machine for parsing the input. +// The state machine is built up from functions that match the StateHandler +// signature: func(*parsekit.P) +// The P struct holds the internal state for the parser and it provides +// some methods that form the API for your StateHandler implementation. + +// State: expect a number. When a number is found on the input, +// it is accepted in the output buffer, after which the output buffer is +// emitted as a numberType item. Then we tell the state machine to continue +// with the calcWaitForOperatorOrEndOfInput state. +// When no number is found, the parser will emit an error, explaining that +// "a number" was expected. +func calcWaitForNumber(p *parsekit.P) { + p.Expects("a number") + if p.On(calcNumber).Accept() { + p.EmitLiteral(numberType) + p.RouteTo(calcWaitForOperatorOrEndOfInput) + } +} + +// State: expect a plus or minus operator. When one of those +// is found, the appropriate Item is emitted and the parser is sent back +// to the numberHandler to find the next number on the input. +// When no operator is found, then the parser is told to expect the end of +// the input. 
When more input data is available (which is obviously wrong +// data since it does not match our syntax), the parser will emit an error. +func calcWaitForOperatorOrEndOfInput(p *parsekit.P) { + switch { + case p.On(a.Plus).Accept(): + p.EmitLiteral(addType) + p.RouteTo(calcWaitForNumber) + case p.On(a.Minus).Accept(): + p.EmitLiteral(subtractType) + p.RouteTo(calcWaitForNumber) + default: + p.ExpectEndOfFile() + } +} + +// All is ready for our parser. We now can create a new Parser struct. +// We need to tell it what the start state is. In our case, it is the +// calcWaitForNumber state, since the calculation must start with a number. +var calcParser = parsekit.NewParser(calcWaitForNumber) + +func Example_basicCalculator() { + // Let's feed the parser some input to work with. + run := calcParser.Parse(" 153+22 + 31-4 -\t 6+42 ") + + // We can now step through the results of the parsing process by repeated + // calls to run.Next(). Next() returns either the next parse item, a parse + // error or an end of file. Let's dump the parse results and handle the + // computation while we're at it. 
+ sum := 0 + op := +1 + for { + item, err, ok := run.Next() + switch { + case !ok && err == nil: + fmt.Println("End of file reached") + fmt.Println("Outcome of computation:", sum) + return + case !ok: + fmt.Printf("Error: %s\n", err) + return + default: + fmt.Printf("Type: %d, Value: %q\n", item.Type, item.Value) + switch { + case item.Type == addType: + op = +1 + case item.Type == subtractType: + op = -1 + case item.Type == numberType: + nr, err := strconv.Atoi(item.Value) + if err != nil { + fmt.Printf("Error: invalid number %s: %s\n", item.Value, err) + return + } + sum += op * nr + } + } + } + + // Output: + // Type: 0, Value: "153" + // Type: 1, Value: "+" + // Type: 0, Value: "22" + // Type: 1, Value: "+" + // Type: 0, Value: "31" + // Type: 2, Value: "-" + // Type: 0, Value: "4" + // Type: 2, Value: "-" + // Type: 0, Value: "6" + // Type: 1, Value: "+" + // Type: 0, Value: "42" + // End of file reached + // Outcome of computation: 238 +} diff --git a/example_dutchpostcode_test.go b/example_dutchpostcode_test.go index afe9348..cd28874 100644 --- a/example_dutchpostcode_test.go +++ b/example_dutchpostcode_test.go @@ -1,4 +1,6 @@ -// In this example, a parser is created which can parse and normalize Dutch postcodes. +// In this example, a parser is created which can parse and normalize Dutch postcodes +// The implementation uses only a Matcher function and does not implement a +// full-fledged state-based Parser for it. package parsekit_test import ( @@ -9,7 +11,7 @@ import ( func createPostcodeMatcher() *parsekit.MatcherWrapper { // Easy access to the parsekit definitions. - var c, a, m = parsekit.C, parsekit.A, parsekit.M + c, a, m := parsekit.C, parsekit.A, parsekit.M // Matcher functions are created and combined to satisfy these rules: // - A Dutch postcode consists of 4 digits and 2 letters (1234XX). @@ -17,12 +19,12 @@ func createPostcodeMatcher() *parsekit.MatcherWrapper { // - A space between letters and digits is optional. 
// - It is good form to write the letters in upper case. // - It is good form to use a single space between digits and letters. - var digitNotZero = c.Except(c.Rune('0'), a.Digit) - var pcDigits = c.Seq(digitNotZero, c.Rep(3, a.Digit)) - var pcLetter = c.Any(a.ASCIILower, a.ASCIIUpper) - var pcLetters = m.ToUpper(c.Seq(pcLetter, pcLetter)) - var space = m.Replace(c.Opt(a.Whitespace), " ") - var postcode = c.Seq(pcDigits, space, pcLetters) + digitNotZero := c.Except(c.Rune('0'), a.Digit) + pcDigits := c.Seq(digitNotZero, c.Rep(3, a.Digit)) + pcLetter := c.Any(a.ASCIILower, a.ASCIIUpper) + pcLetters := m.ToUpper(c.Seq(pcLetter, pcLetter)) + space := m.Replace(c.Opt(a.Whitespace), " ") + postcode := c.Seq(pcDigits, space, pcLetters) return parsekit.NewMatcher(postcode, "a Dutch postcode") } diff --git a/example_hellomatcher_test.go b/example_hellomatcher_test.go new file mode 100644 index 0000000..975de0c --- /dev/null +++ b/example_hellomatcher_test.go @@ -0,0 +1,58 @@ +// In this example, a parser is created that is able to parse input that looks +// like "Hello, <name>!", and that extracts the name from it. +// +// The implementation uses only a Matcher function and does not implement a +// full-fledged state-based Parser for it. If you want to see the same kind of +// functionality, implemented using a Parser, take a look at the +// HelloWorldUsingParser example. +package parsekit_test + +import ( + "fmt" + + "git.makaay.nl/mauricem/go-parsekit" +) + +func createHelloMatcher() *parsekit.MatcherWrapper { + // Easy access to parsekit definition. + c, a, m := parsekit.C, parsekit.A, parsekit.M + + // Using the parser/combinator support of parsekit, we create a Matcher function + // that does all the work. The 'greeting' Matcher matches the whole input and + // drops all but the name from it. 
+ hello := c.StrNoCase("hello") + comma := c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace)) + separator := c.Any(comma, a.Whitespace) + name := c.OneOrMore(c.Not(a.Excl)) + greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl)) + + // Using 'greeting' we can now create the Matcher-based parser. + return parsekit.NewMatcher(greeting, "a friendly greeting") +} + +func Example_helloWorldUsingMatcher() { + parser := createHelloMatcher() + + for i, input := range []string{ + "Hello, world!", + "HELLO ,Johnny!", + "hello , Bob123!", + "hello Pizza!", + "Oh no!", + "Hello, world", + } { + output, err, ok := parser.Parse(input) + if !ok { + fmt.Printf("[%d] Input: %q Error: %s\n", i, input, err) + } else { + fmt.Printf("[%d] Input: %q Output: %s\n", i, input, output) + } + } + // Output: + // [0] Input: "Hello, world!" Output: world + // [1] Input: "HELLO ,Johnny!" Output: Johnny + // [2] Input: "hello , Bob123!" Output: Bob123 + // [3] Input: "hello Pizza!" Output: Pizza + // [4] Input: "Oh no!" Error: unexpected character 'O' (expected a friendly greeting) + // [5] Input: "Hello, world" Error: unexpected character 'H' (expected a friendly greeting) +} diff --git a/example_helloparser_test.go b/example_helloparser_test.go new file mode 100644 index 0000000..57ee5f6 --- /dev/null +++ b/example_helloparser_test.go @@ -0,0 +1,111 @@ +// In this example, a parser is created that is able to parse input that looks +// like "Hello, !", and that extracts the name from it. +// +// This implementation uses a state-based Parser for it, and it does not implement +// any custom combinator/parser Matcher functions. Note that things are much easier to +// implement using custom Matchers (see the other HelloWorldUsingMatcher example +// for this). Doing this fully parser-based implementation is mainly for your +// learning pleasure. 
+// +// One big difference between the Matcher-based example and this one, is that the +// state-based parser reports errors much more fine-grained. This might or might +// not be useful for your specific application. +package parsekit_test + +import ( + "fmt" + "strings" + + "git.makaay.nl/mauricem/go-parsekit" +) + +const greeteeItem parsekit.ItemType = 1 + +func stateStartOfGreeting(p *parsekit.P) { + c := parsekit.C + p.Expects("hello") + if p.On(c.StrNoCase("hello")).Skip() { + p.RouteTo(stateComma) + } +} + +func stateComma(p *parsekit.P) { + a := parsekit.A + p.Expects("comma") + switch { + case p.On(a.Whitespace).Skip(): + p.RouteRepeat() + case p.On(a.Comma).Skip(): + p.RouteTo(stateName) + } +} + +func stateName(p *parsekit.P) { + a := parsekit.A + p.Expects("name") + switch { + case p.On(a.Excl).Skip(): + p.RouteTo(stateEndOfGreeting) + case p.On(a.AnyRune).Accept(): + p.RouteRepeat() + } +} + +func stateEndOfGreeting(p *parsekit.P) { + p.Expects("end of greeting") + if p.On(a.EndOfFile).Stay() { + name := strings.TrimSpace(p.BufLiteral()) + if name == "" { + p.EmitError("The name cannot be empty") + } else { + p.Emit(greeteeItem, name) + } + } +} + +func createHelloParser() *parsekit.Parser { + return parsekit.NewParser(stateStartOfGreeting) +} + +func Example_helloWorldUsingParser() { + parser := createHelloParser() + + for i, input := range []string{ + "Hello, world!", + "HELLO ,Johnny!", + "hello , Bob123!", + "hello Pizza!", + "", + " ", + "hello", + "hello,", + "hello , ", + "hello , Droopy", + "hello , Droopy!", + "hello , \t \t Droopy \t !", + "Oh no!", + "hello,!", + } { + item, err, ok := parser.Parse(input).Next() + if !ok { + fmt.Printf("[%d] Input: %q Error: %s\n", i, input, err) + } else { + fmt.Printf("[%d] Input: %q Output: %s\n", i, input, item.Value) + } + } + // Output: + // [0] Input: "Hello, world!" Output: world + // [1] Input: "HELLO ,Johnny!" Output: Johnny + // [2] Input: "hello , Bob123!" 
Output: Bob123 + // [3] Input: "hello Pizza!" Error: unexpected character 'P' (expected comma) + // [4] Input: "" Error: unexpected end of file (expected hello) + // [5] Input: " " Error: unexpected character ' ' (expected hello) + // [6] Input: "hello" Error: unexpected end of file (expected comma) + // [7] Input: "hello," Error: unexpected end of file (expected name) + // [8] Input: "hello , " Error: unexpected end of file (expected name) + // [9] Input: "hello , Droopy" Error: unexpected end of file (expected name) + // [10] Input: "hello , Droopy!" Output: Droopy + // [11] Input: "hello , \t \t Droopy \t !" Output: Droopy + // [12] Input: "Oh no!" Error: unexpected character 'O' (expected hello) + // [13] Input: "hello,!" Error: The name cannot be empty +} diff --git a/examples_test.go b/examples_test.go index d579661..479d1aa 100644 --- a/examples_test.go +++ b/examples_test.go @@ -2,194 +2,10 @@ package parsekit_test import ( "fmt" - "strconv" "git.makaay.nl/mauricem/go-parsekit" ) -func Example_helloWorldUsingParser() { -} - -func Example_helloWorldUsingMatcher() { - // In this example, a parser is created that is able to parse input that looks - // like "Hello, !", and that extracts the name from it. - // The implementation uses only a Matcher function and does not implement a - // full-fledged state-based Parser for it. - - // Easy access to parsekit parser/combinators, atoms and modifiers. - var c, a, m = parsekit.C, parsekit.A, parsekit.M - - // Using the parser/combinator support of parsekit, we create a Matcher function - // that does all the work. The 'greeting' Matcher matches the whole input and - // drops all but the name from it. 
- var hello = c.StrNoCase("hello") - var comma = c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace)) - var separator = c.Any(comma, a.Whitespace) - var name = c.OneOrMore(c.Not(a.Excl)) - var greeting = c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl)) - - // Now we can already do some parsing, by using a Matcher. - var parser = parsekit.NewMatcher(greeting, "a friendly greeting") - for i, input := range []string{ - "Hello, world!", - "HELLO ,Johnny!", - "hello , Bob123!", - "hello Pizza!", - "Oh no!", - "Hello, world", - } { - output, err, ok := parser.Parse(input) - if !ok { - fmt.Printf("[%d] Input: %q Error: %s\n", i, input, err) - } else { - fmt.Printf("[%d] Input: %q Output: %s\n", i, input, output) - } - } - // Output: - // [0] Input: "Hello, world!" Output: world - // [1] Input: "HELLO ,Johnny!" Output: Johnny - // [2] Input: "hello , Bob123!" Output: Bob123 - // [3] Input: "hello Pizza!" Output: Pizza - // [4] Input: "Oh no!" Error: unexpected character 'O' (expected a friendly greeting) - // [5] Input: "Hello, world" Error: unexpected character 'H' (expected a friendly greeting) -} - -func Example_basicCalculator() { - // Let's write a small example for parsing a really basic calculator. - // The calculator understands input that looks like: - // - // 10 + 20 - 8+4 - // - // So positive numbers that can be either added or substracted, and whitespace - // is ignored. - - // Easy access to parsekit parser/combinators, atoms and modifiers. - var c, a, m = parsekit.C, parsekit.A, parsekit.M - - // When writing a parser, it's a good start to use the parser/combinator - // functionality of parsekit to create some Matcher functions. These - // functions can later be used in the parser state machine to find the - // matching tokens on the input data. - // - // In our case, we only need a definition of "number, surrounded by - // optional whitespace". 
Skipping whitespace could be a part of the - // StateHandler functions below too, but including it in a Matcher makes - // things really practical here. - var whitespace = m.Drop(c.Opt(a.Whitespace)) - var number = c.Seq(whitespace, c.OneOrMore(a.Digit), whitespace) - - // We also must define the types of items that the parser will emit. - // We only need three of them here, for numbers, plus and minus. - // The recommended way to define these, is using 'iota' for auto numbering. - const ( - numberType parsekit.ItemType = iota - addType - subtractType - ) - - // Now it is time to define the state machine for parsing the input. - // The state machine is built up from functions that match the StateHandler - // signature: func(*parsekit.P) - // The P struct holds the internal state for the parser and it provides - // some methods that form the API for your StateHandler implementation. - // - // (note that normally you'd write normal functions and not anonymous - // functions like I did here. I had to use these to be able to write the - // example code) - - var operatorHandler parsekit.StateHandler - - // In this state, we expect a number. When a number is found on the input, - // it is accepted in the output buffer, after which the output buffer is - // emitted as a numberType item. Then we tell the state machine to continue - // with the operatorHandler state. - // When no number is found, the parser will emit an error, explaining that - // "a number" was expected. - numberHandler := func(p *parsekit.P) { - p.Expects("a number") - if p.On(number).Accept().End() { - p.EmitLiteral(numberType) - p.RouteTo(operatorHandler) - } - } - - // In this state, we expect a plus or minus operator. When one of those - // is found, the appropriate Item is emitted and the parser is sent back - // to the numberHandler to find the next number on the input. - // When no operator is found, then the parser is told to expect the end of - // the input. 
When more input data is available (which is obviously wrong - // data since it does not match our syntax), the parser will emit an error. - operatorHandler = func(p *parsekit.P) { - switch { - case p.On(a.Plus).Accept().End(): - p.EmitLiteral(addType) - p.RouteTo(numberHandler) - case p.On(a.Minus).Accept().End(): - p.EmitLiteral(subtractType) - p.RouteTo(numberHandler) - default: - p.ExpectEndOfFile() - } - } - - // All is ready for our parser. We now can create a new Parser struct. - // We need to tell it what the start state is. In our case, it is - // the number state, since the calculation must start with a number. - parser := parsekit.NewParser(numberHandler) - - // Let's feed the parser some input to work with. - run := parser.Parse(" 153+22 + 31-4 -\t 6+42 ") - - // We can now step through the results of the parsing process by repeated - // calls to run.Next(). Next() returns either the next parse item, a parse - // error or an end of file. Let's dump the parse results and handle the - // computation while we're at it. 
- sum := 0 - op := +1 - for { - item, err, ok := run.Next() - switch { - case !ok && err == nil: - fmt.Println("End of file reached") - fmt.Println("Outcome of computation:", sum) - return - case !ok: - fmt.Printf("Error: %s\n", err) - return - default: - fmt.Printf("Type: %d, Value: %q\n", item.Type, item.Value) - switch { - case item.Type == addType: - op = +1 - case item.Type == subtractType: - op = -1 - case item.Type == numberType: - nr, err := strconv.Atoi(item.Value) - if err != nil { - fmt.Printf("Error: invalid number %s: %s\n", item.Value, err) - return - } - sum += op * nr - } - } - } - - // Output: - // Type: 0, Value: "153" - // Type: 1, Value: "+" - // Type: 0, Value: "22" - // Type: 1, Value: "+" - // Type: 0, Value: "31" - // Type: 2, Value: "-" - // Type: 0, Value: "4" - // Type: 2, Value: "-" - // Type: 0, Value: "6" - // Type: 1, Value: "+" - // Type: 0, Value: "42" - // End of file reached - // Outcome of computation: 238 -} - func ExampleItemType() { // Make use of positive values. Ideally, define your ItemTypes using // iota for easy automatic value management like this: @@ -202,17 +18,18 @@ func ExampleItemType() { } func ExampleItem() { - var c = parsekit.C + // Easy access to the parsekit definitions. + c := parsekit.C // You define your own item types for your specific parser. - var QuestionItem parsekit.ItemType = 42 + const QuestionItem = parsekit.ItemType(42) // A StateHandler function can use the defined item type by means of // the p.Emit* methods on parsekit.P. // When errors occur, or the end of the file is reached, then the built-in // types parsekit.ItemEOF and parsekit.ItemError will be emitted by parsekit. stateHandler := func(p *parsekit.P) { - if p.On(c.Str("question")).Accept().End() { + if p.On(c.Str("question")).Accept() { p.EmitLiteral(QuestionItem) } p.ExpectEndOfFile() @@ -280,11 +97,11 @@ func ExampleError_ErrorFull() { func ExampleMatchAnyRune() { // Easy access to the parsekit definitions. 
- var a = parsekit.A + a := parsekit.A handler := func(p *parsekit.P) { p.Expects("Any valid rune") - if p.On(a.AnyRune).Accept().End() { + if p.On(a.AnyRune).Accept() { p.EmitLiteral(TestItem) p.RouteRepeat() } diff --git a/matcher_builtin.go b/matcher_builtin.go index fe96dcd..fd90142 100644 --- a/matcher_builtin.go +++ b/matcher_builtin.go @@ -410,7 +410,7 @@ func MatchEndOfFile() Matcher { return func(m *MatchDialog) bool { fork := m.Fork() input, ok := fork.NextRune() - return !ok && input == EOF + return !ok && input == eofRune } } diff --git a/matcher_builtin_test.go b/matcher_builtin_test.go index 59878f0..8579c64 100644 --- a/matcher_builtin_test.go +++ b/matcher_builtin_test.go @@ -213,7 +213,7 @@ func TestSequenceOfRunes(t *testing.T) { input := "#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" parser := parsekit.NewParser(func(p *parsekit.P) { p.Expects("Sequence of runes") - if p.On(sequence).Accept().End() { + if p.On(sequence).Accept() { p.EmitLiteral(TestItem) } }) diff --git a/parsekit.go b/parsekit.go index ab3f050..e312b10 100644 --- a/parsekit.go +++ b/parsekit.go @@ -158,7 +158,7 @@ type MatcherWrapper struct { func NewMatcher(matcher Matcher, expects string) *MatcherWrapper { handler := func(p *P) { p.Expects(expects) - if p.On(matcher).Accept().End() { + if p.On(matcher).Accept() { p.EmitLiteral(0) // ItemType is irrelevant } } diff --git a/statehandler.go b/statehandler.go index d8b91bf..a6909e7 100644 --- a/statehandler.go +++ b/statehandler.go @@ -16,125 +16,54 @@ type P struct { state StateHandler // the function that handles the current state nextState StateHandler // the function that will handle the next state routeStack []StateHandler // route stack, for handling nested parsing - input string // the scanned input + input string // the input that is being scanned by the parser inputPos int // current byte cursor position in the input cursorLine int // current rune cursor row number in the input cursorColumn int // current rune cursor column position 
in the input len int // the total length of the input in bytes newline bool // keep track of when we have scanned a newline - expecting string // a description of what the current state expects to find - buffer stringBuffer // an efficient buffer, used to build string values - items chan Item // channel of resulting Parser items + expecting string // a description of what the current state expects to find (see P.Expects()) + buffer stringBuffer // an efficient buffer, used to build string values (see P.Accept()) + items chan Item // channel of resulting Parser items (see P.Emit()) item Item // the current item as reached by Next() and retrieved by Get() err *Error // an error when lexing failed, retrieved by Error() LastMatch string // a string representation of the last matched input data } -// Expects is used to let a state function describe what input it is expecting. -// This expectation is used in error messages to make them more descriptive. -// -// When defining an expectation inside a StateHandler, you do not need to -// handle unexpected input yourself. When the end of the function is reached -// without setting the next state, an automatic error will be emitted. -// This error can differentiate between the following issues: -// -// * there is valid data on input, but it was not accepted by the function -// -// * there is an invalid UTF8 character on input -// -// * the end of the file was reached. -func (p *P) Expects(description string) { - p.expecting = description -} - -// peek returns but does not advance the cursor to the next rune(s) in the input. +// peek returns but does not advance the cursor to the next rune in the input. // Returns the rune, its width in bytes and a boolean. +// // The boolean will be false in case no upcoming rune can be peeked -// (end of data or invalid UTF8 character). +// (end of data or invalid UTF8 character). In this case, the returned rune +// will be one of eofRune or invalidRune. 
func (p *P) peek(byteOffset int) (rune, int, bool) { r, w := utf8.DecodeRuneInString(p.input[p.inputPos+byteOffset:]) return handleRuneError(r, w) } -// EOF is a special rune, which is used to indicate an end of file when +// eofRune is a special rune, which is used to indicate an end of file when // reading a character from the input. -// It can be treated as a rune when writing parsing rules, so a valid way to -// say 'I now expect the end of the file' is using something like: -// if (p.On(c.Rune(EOF)).Skip()) { ... } -const EOF rune = -1 +const eofRune rune = -1 -// INVALID is a special rune, which is used to indicate an invalid UTF8 +// invalidRune is a special rune, which is used to indicate an invalid UTF8 // rune on the input. -const INVALID rune = utf8.RuneError +const invalidRune rune = utf8.RuneError -// handleRuneError is used to normale rune value in case of errors. +// handleRuneError is used to create specific rune values in case of errors. // When an error occurs, then utf8.RuneError will be in the rune. // This can however indicate one of two situations: -// * w == 0: end of file is reached -// * w == 1: invalid UTF character on input +// 1) w == 0: end of file is reached +// 2) w == 1: invalid UTF character on input // This function lets these two cases return respectively the -// package's own EOF or INVALID runes, to make it easy for client -// code to distinct between these two cases. +// package's own eofRune or invalidRune, to make it easy for calling code +// to distinguish between these two cases. func handleRuneError(r rune, w int) (rune, int, bool) { if r == utf8.RuneError { if w == 0 { - return EOF, 0, false + return eofRune, 0, false } - return INVALID, w, false + return invalidRune, w, false } return r, w, true } - -// RouteTo tells the parser what StateHandler function to invoke -// in the next parsing cycle. 
-func (p *P) RouteTo(state StateHandler) *routeFollowupAction { - p.nextState = state - return &routeFollowupAction{chainAction: chainAction{p, true}} -} - -// RouteRepeat indicates that on the next parsing cycle, the current -// StateHandler must be reinvoked. -func (p *P) RouteRepeat() *chainAction { - p.RouteTo(p.state) - return &chainAction{nil, true} -} - -// RouteReturn tells the parser that on the next cycle the last -// StateHandler that was pushed on the route stack must be invoked. -// -// Using this method is optional. When implementating a StateHandler that -// is used as a sort of subroutine (using constructions like -// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from -// providing an explicit routing decision from that handler. The parser will -// automatically assume a RouteReturn() in that case. -func (p *P) RouteReturn() *chainAction { - p.nextState = p.popRoute() - return &chainAction{nil, true} -} - -// pushRoute adds the StateHandler to the route stack. -// This is used for implementing nested parsing. -func (p *P) pushRoute(state StateHandler) { - p.routeStack = append(p.routeStack, state) -} - -// popRoute pops the last pushed StateHandler from the route stack. -func (p *P) popRoute() StateHandler { - last := len(p.routeStack) - 1 - head, tail := p.routeStack[:last], p.routeStack[last] - p.routeStack = head - return tail -} - -// ExpectEndOfFile can be used from a StateHandler function to indicate that -// your parser expects to be at the end of the file. This will schedule -// a parsekit-provided StateHandler which will do the actual check for this. 
-func (p *P) ExpectEndOfFile() { - p.RouteTo(func(p *P) { - p.Expects("end of file") - if p.On(A.EndOfFile).Stay().End() { - p.Emit(ItemEOF, "EOF") - } - }) -} diff --git a/statehandler_emit.go b/statehandler_emit.go index ce6778c..cf8fd0c 100644 --- a/statehandler_emit.go +++ b/statehandler_emit.go @@ -37,6 +37,44 @@ func (p *P) EmitLiteral(t ItemType) { p.Emit(t, p.buffer.asLiteralString()) } +// BufLiteral retrieves the contents of the parser buffer (all the runes that +// were added to it using P.Accept()) as a literal string. +// +// Literal means that if the input had for example the subsequent runes '\' and 'n' +// in it, then the literal string would have a backslash and an 'n' in it, not a +// linefeed (ASCII char 10). +// +// Retrieving the buffer contents will not affect the buffer itself. New runes can +// still be added to it. Only when calling P.Emit(), the buffer will be cleared. +func (p *P) BufLiteral() string { + return p.buffer.asLiteralString() +} + +// BufInterpreted retrieves the contents of the parser buffer (all the runes that +// were added to it using P.Accept()) as an interpreted string. +// +// Interpreted means that the contents are treated as a Go double quoted +// interpreted string (handling escape codes like \n, \t, \uXXXX, etc.). If the +// input had for example the subsequent runes '\' and 'n' in it, then the interpreted +// string would have an actual linefeed (ASCII char 10) in it. +// +// This method returns a boolean value, indicating whether or not the string +// interpretation was successful. On invalid string data, an error will +// automatically be emitted and the boolean return value will be false. +// +// Retrieving the buffer contents will not affect the buffer itself. New runes can +// still be added to it. Only when calling P.Emit(), the buffer will be cleared. 
+func (p *P) BufInterpreted() (string, bool) { + s, err := p.buffer.asInterpretedString() + if err != nil { + p.EmitError( + "invalid string: %s (%s, forgot to escape a double quote or backslash maybe?)", + p.buffer.asLiteralString(), err) + return "", false + } + return s, true +} + // EmitInterpreted passes a Parser item to the client, including accumulated // string buffer data a Go double quoted interpreted string (handling escape // codes like \n, \t, \uXXXX, etc.) @@ -90,9 +128,9 @@ func (p *P) UnexpectedInput() { switch { case ok: p.EmitError("unexpected character %q%s", r, fmtExpects(p)) - case r == EOF: + case r == eofRune: p.EmitError("unexpected end of file%s", fmtExpects(p)) - case r == INVALID: + case r == invalidRune: p.EmitError("invalid UTF8 character in input%s", fmtExpects(p)) default: panic("parsekit bug: Unhandled output from peek()") diff --git a/statehandler_expects.go b/statehandler_expects.go new file mode 100644 index 0000000..4f8fe48 --- /dev/null +++ b/statehandler_expects.go @@ -0,0 +1,18 @@ +package parsekit + +// Expects is used to let a state function describe what input it is expecting. +// This expectation is used in error messages to make them more descriptive. +// +// When defining an expectation inside a StateHandler, you do not need to +// handle unexpected input yourself. When the end of the function is reached +// without setting the next state, an automatic error will be emitted. +// This error can differentiate between the following issues: +// +// 1) there is valid data on input, but it was not accepted by the function +// +// 2) there is an invalid UTF8 character on input +// +// 3) the end of the file was reached. +func (p *P) Expects(description string) { + p.expecting = description +} diff --git a/statehandler_on.go b/statehandler_on.go index 37841ee..9ef63a4 100644 --- a/statehandler_on.go +++ b/statehandler_on.go @@ -1,41 +1,41 @@ package parsekit -// On checks if the current input matches the provided Matcher. 
+// On checks if the input at the current cursor position matches the provided Matcher. +// On must be chained with another method, which tells the parser what action to +// perform when a match was found: // -// This method is the start of a chain method in which multiple things can -// be arranged in one go: +// 1) On(...).Skip() - Only move cursor forward, ignore the matched runes. // -// 1) Checking whether or not there is a match (this is what On does) +// 2) On(...).Accept() - Move cursor forward, add matched runes to the string buffer. // -// 2) Deciding what to do with the match (Stay(): do nothing, Skip(): only move -// the cursor forward, Accept(): move cursor forward and add the match in -// the parser string buffer) +// 3) On(...).Stay() - Do nothing, the cursor stays at the same position. // -// 3) Dedicing where to route to (e.g. using RouteTo() to route to a -// StateHandler by name) +// So an example chain could look like this: // -// 4) Followup routing after that, when applicable (.e.g using something like -// RouteTo(...).ThenTo(...)) +// p.On(parsekit.A.Whitespace).Skip() // -// For every step of this chain, you can end the chain using the -// End() method. This will return a boolean value, indicating whether or -// not the initial On() method found a match in the input. -// End() is not mandatory. It is merely provided as a means to use -// a chain as an expression for a switch/case or if statement (since those -// require a boolean expression). +// The chain as a whole returns a boolean, which indicates whether or not a match +// was found. When no match was found, false is returned and Skip() and Accept() +// will have no effect. Because of this, typical use of an On() chain is as the +// condition of a control flow statement (if, switch/case, for). E.g.: // -// You can omit "what to do with the match" and go straight into a routing -// method, e.g. +// // Skip multiple exclamation marks.
+// for p.On(parsekit.A.Excl).Skip() { } // -// On(...).RouteTo(...) +// // Fork a route based on the input. +// switch { +// case p.On(parsekit.A.Excl).Stay(): +// p.RouteTo(stateHandlerA) +// case p.On(parsekit.A.Colon).Stay(): +// p.RouteTo(stateHandlerB) +// default: +// p.RouteTo(stateHandlerC) +// } // -// This is functionally the same as using -// -// On(...).Stay().RouteTo(...). -// -// Here's a complete example chain: -// -// p.On(something).Accept().RouteTo(stateB).ThenTo(stateC).End() +// // When there's a "hi" on input, emit it. +// if p.On(parsekit.C.Str("hi")).Accept() { +// p.Emit(SomeItemType, p.BufLiteral()) +// } func (p *P) On(matcher Matcher) *matchAction { m := &MatchDialog{p: p} if matcher == nil { @@ -45,39 +45,25 @@ func (p *P) On(matcher Matcher) *matchAction { // Keep track of the last match, to allow parser implementations // to access it in an easy way. Typical use would be something like: - // if p.On(somethingBad).End() { - // p.Errorf("This was bad: %s", p.LastMatch) - // } + // + // if p.On(somethingBad).Stay() { + // p.Errorf("This was bad: %s", p.LastMatch) + // } p.LastMatch = string(m.input) return &matchAction{ - routeAction: routeAction{chainAction{p, ok}}, - input: m.input, - output: m.output, - inputPos: p.inputPos + m.inputOffset, + p: p, + ok: ok, + input: m.input, + output: m.output, + inputPos: p.inputPos + m.inputOffset, } } -// chainAction is used for building method chains for the On() method. -// Every element of the method chain embeds this struct. -type chainAction struct { - p *P - ok bool -} - -// End ends the method chain and returns a boolean indicating whether -// or not a match was found in the input. -func (a *chainAction) End() bool { - return a.ok -} - -// matchAction is a struct that is used for building On()-method chains.
-// -// It embeds the routeAction struct, to make it possible to go right into -// a route action, which is basically a simple way of aliasing a chain -// like p.On(...).Stay().RouteTo(...) into p.On(...).RouteTo(...). +// matchAction is a struct that is used for building the On()-method chain. type matchAction struct { - routeAction + p *P + ok bool input []rune output []rune inputPos int @@ -88,23 +74,29 @@ type matchAction struct { // When no match was found, then no action is taken. // It returns a routeAction struct, which provides methods that can be used // to tell the parser what state to go to next. -func (a *matchAction) Accept() *routeAction { +func (a *matchAction) Accept() bool { if a.ok { a.p.buffer.writeString(string(a.output)) a.advanceCursor() } - return &routeAction{chainAction: chainAction{a.p, a.ok}} + return a.ok } // Skip tells the parser to move the cursor past a match that was found, // without storing the actual match in the string buffer. // Returns true in case a match was found. // When no match was found, then no action is taken and false is returned. -func (a *matchAction) Skip() *routeAction { +func (a *matchAction) Skip() bool { if a.ok { a.advanceCursor() } - return &routeAction{chainAction: chainAction{a.p, a.ok}} + return a.ok +} + +// Stay tells the parser to not move the cursor after finding a match. +// Returns true in case a match was found, false otherwise. +func (a *matchAction) Stay() bool { + return a.ok } // advanceCursor advances the rune cursor one position in the input data. @@ -122,67 +114,3 @@ func (a *matchAction) advanceCursor() { a.p.newline = r == '\n' } } - -// Stay tells the parser to not move the cursor after finding a match. -// Returns true in case a match was found, false otherwise. -func (a *matchAction) Stay() *routeAction { - return &routeAction{chainAction: chainAction{a.p, a.ok}} -} - -// routeAction is a struct that is used for building On() method chains. 
-type routeAction struct { - chainAction -} - -// RouteRepeat indicates that on the next parsing cycle, -// the current StateHandler must be reinvoked. -func (a *routeAction) RouteRepeat() *chainAction { - if a.ok { - return a.p.RouteRepeat() - } - return &chainAction{nil, false} -} - -// RouteTo tells the parser what StateHandler function to invoke -// in the next parsing cycle. -func (a *routeAction) RouteTo(state StateHandler) *routeFollowupAction { - if a.ok { - return a.p.RouteTo(state) - } - return &routeFollowupAction{chainAction: chainAction{nil, false}} -} - -// RouteReturn tells the parser that on the next cycle the next scheduled -// route must be invoked. -func (a *routeAction) RouteReturn() *chainAction { - if a.ok { - return a.p.RouteReturn() - } - return &chainAction{nil, false} -} - -// routeFollowupAction chains parsing routes. -// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). -type routeFollowupAction struct { - chainAction -} - -// ThenTo schedules a StateHandler that must be invoked after the RouteTo -// StateHandler has been completed. -// For example: p.RouteTo(handlerA).ThenTo(handlerB) -func (a *routeFollowupAction) ThenTo(state StateHandler) *chainAction { - if a.ok { - a.p.pushRoute(state) - } - return &chainAction{nil, a.ok} -} - -// ThenReturnHere schedules the current StateHandler to be invoked after -// the RouteTo StateHandler has been completed. -// For example: p.RouteTo(handlerA).ThenReturnHere() -func (a *routeFollowupAction) ThenReturnHere() *chainAction { - if a.ok { - a.p.pushRoute(a.p.state) - } - return &chainAction{nil, a.ok} -} diff --git a/statehandler_route.go b/statehandler_route.go new file mode 100644 index 0000000..47a60d4 --- /dev/null +++ b/statehandler_route.go @@ -0,0 +1,76 @@ +package parsekit + +// RouteTo tells the parser what StateHandler function to invoke +// in the next parsing cycle. 
+func (p *P) RouteTo(state StateHandler) *routeFollowupAction { + p.nextState = state + return &routeFollowupAction{p} +} + +// RouteRepeat indicates that on the next parsing cycle, the current +// StateHandler must be reinvoked. +func (p *P) RouteRepeat() { + p.RouteTo(p.state) +} + +// RouteReturn tells the parser that on the next cycle the last +// StateHandler that was pushed on the route stack must be invoked. +// +// Using this method is optional. When implementing a StateHandler that +// is used as a sort of subroutine (using constructions like +// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from +// providing an explicit routing decision from that handler. The parser will +// automatically assume a RouteReturn() in that case. +func (p *P) RouteReturn() { + p.nextState = p.popRoute() +} + +// routeFollowupAction chains parsing routes. +// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). +type routeFollowupAction struct { + p *P +} + +// ThenTo schedules a StateHandler that must be invoked after the RouteTo +// StateHandler has been completed. +// For example: +// +// p.RouteTo(handlerA).ThenTo(handlerB) +func (a *routeFollowupAction) ThenTo(state StateHandler) { + a.p.pushRoute(state) +} + +// ThenReturnHere schedules the current StateHandler to be invoked after +// the RouteTo StateHandler has been completed. +// For example: +// +// p.RouteTo(handlerA).ThenReturnHere() +func (a *routeFollowupAction) ThenReturnHere() { + a.p.pushRoute(a.p.state) +} + +// pushRoute adds the StateHandler to the route stack. +// This is used for implementing nested parsing. +func (p *P) pushRoute(state StateHandler) { + p.routeStack = append(p.routeStack, state) +} + +// popRoute pops the last pushed StateHandler from the route stack.
+func (p *P) popRoute() StateHandler { + last := len(p.routeStack) - 1 + head, tail := p.routeStack[:last], p.routeStack[last] + p.routeStack = head + return tail +} + +// ExpectEndOfFile can be used from a StateHandler function to indicate that +// your parser expects to be at the end of the file. This will schedule +// a parsekit-provided StateHandler which will do the actual check for this. +func (p *P) ExpectEndOfFile() { + p.RouteTo(func(p *P) { + p.Expects("end of file") + if p.On(A.EndOfFile).Stay() { + p.Emit(ItemEOF, "EOF") + } + }) +}