diff --git a/example_basiccalculator1_test.go b/example_basiccalculator1_test.go new file mode 100644 index 0000000..c1f86a8 --- /dev/null +++ b/example_basiccalculator1_test.go @@ -0,0 +1,109 @@ +// Let's write a parser for a really basic calculator. +// The calculator understands input that looks like: +// +// 10 + 20 - 8+4 +// +// So positive numbers that can be either added or subtracted, and whitespace +// is ignored. +package parsekit_test + +import ( + "fmt" + "strconv" + + "git.makaay.nl/mauricem/go-parsekit" +) + +func Example_basicCalculator1() { + for _, c := range []struct { + input string + expected int64 + }{ + {"0", 0}, + {"1", 1}, + {"1+2+3", 6}, + {" 10 + \t20 - 3 + 7 -10 ", 24}, + {"", 0}, + {" \t ", 0}, + {"+", 0}, + {"10.8 + 12", 0}, + {"42+ ", 0}, + } { + output, err := ComputeSimple(c.input) + if err != nil { + fmt.Printf("Input: %q, got error: %s\n", c.input, err) + } else { + fmt.Printf("Input: %q, got outcome: %d, correct = %t\n", c.input, output, output == c.expected) + } + } + // Output: + // Input: "0", got outcome: 0, correct = true + // Input: "1", got outcome: 1, correct = true + // Input: "1+2+3", got outcome: 6, correct = true + // Input: " 10 + \t20 - 3 + 7 -10 ", got outcome: 24, correct = true + // Input: "", got error: unexpected end of file (expected integer number) + // Input: " \t ", got error: unexpected character ' ' (expected integer number) + // Input: "+", got error: unexpected character '+' (expected integer number) + // Input: "10.8 + 12", got error: unexpected character '.' (expected operator, '+' or '-') + // Input: "42+ ", got error: unexpected character ' ' (expected integer number) +} + +// --------------------------------------------------------------------------- +// Implementation of the calculator +// --------------------------------------------------------------------------- + +// ComputeSimple interprets a simple calculation, consisting of only integers +// and add or subtract operators. 
It returns the result of the calculation. +// An error is returned in case the calculation failed. +func ComputeSimple(calculation string) (int64, *parsekit.Error) { + calculator := &simpleCalculator{op: +1} + parser := parsekit.NewParser(calculator.number) + _, err, _ := parser.Parse(calculation).Next() + return calculator.Result, err +} + +// simpleCalculator defines the parsing state machine. We do this using methods +// on a struct, so the parser can make use of state data inside that struct +// during the parsing. +type simpleCalculator struct { + Result int64 // holds the resulting outcome of the computation + op int64 // represents operation for next term (+1 = add, -1 = subtract) +} + +func (c *simpleCalculator) number(p *parsekit.ParseAPI) { + // A definition of integer, which conveniently drops surrounding whitespace. + pc, a, m := parsekit.C, parsekit.A, parsekit.M + whitespace := m.Drop(pc.Opt(a.Whitespace)) + integer := pc.Seq(whitespace, a.Integer, whitespace) + + if p.On(integer).Accept() { + literal := p.BufLiteral() // keep the matched text; the buffer is cleared before it can be reported + p.BufClear() + if value, err := strconv.ParseInt(literal, 10, 64); err != nil { + p.EmitError("invalid value %q: %s", literal, err) + } else { + c.Result += c.op * value + p.Handle(c.operatorOrEndOfFile) + } + } else { + p.Expects("integer number") + p.UnexpectedInput() + } +} + +func (c *simpleCalculator) operatorOrEndOfFile(p *parsekit.ParseAPI) { + var a = parsekit.A + switch { + case p.On(a.Add).Skip(): + c.op = +1 + p.Handle(c.number) + case p.On(a.Subtract).Skip(): + c.op = -1 + p.Handle(c.number) + case !p.On(a.EndOfFile).Stay(): + p.Expects("operator, '+' or '-'") + p.UnexpectedInput() + default: + p.ExpectEndOfFile() + } +} diff --git a/example_basiccalculator2_test.go b/example_basiccalculator2_test.go new file mode 100644 index 0000000..f1363e8 --- /dev/null +++ b/example_basiccalculator2_test.go @@ -0,0 +1,213 @@ +// Let's write the hello world of parsers: a calculator that can interpret +// calculations that look like: +// +// " -10 + (10.8+ 
(3 *-20-3*(8 +-4.12)) + 10)/5 " +// +// More formally, a calculation is defined as: +// +// calculation : expr EOF +// expr : term ((ADD|SUB) term)* +// term : factor ((MUL|DIV) factor)* +// space : (SPACE|TAB)* +// factor : space (FLOAT | LPAREN expr RPAREN) space +package parsekit_test + +import ( + "fmt" + "math" + "strconv" + + "git.makaay.nl/mauricem/go-parsekit" +) + +func Example_basicCalculator2() { + for _, c := range []struct { + input string + expected float64 + }{ + {"1", 1}, + {"(123.10)", 123.10}, + {"1 + 2 + 3 + 4 + 5", 15}, + {"1 * 2 * 3 * 4 * 5 * 0.6", 72}, + {"(3.05+2)*(4.3+5.12)", 47.571}, + {"8.10 + 999/233", 12.387554}, + {" -10 + (10.8+ (3 *-20-3*(8 +-4.12)) + 10)/5 ", -20.168}, + {"", 0}, + {"(", 0}, + {"10+20-", 0}, + {"10+20-(4*10))", 0}, + {"10+20-((4*10) + 17", 0}, + } { + output, err := Compute(c.input) + output = math.Round(output*1000000) / 1000000 // to make the expectation comparisons usable + if err != nil { + fmt.Printf("Input: %q, got error: %s\n", c.input, err) + } else { + fmt.Printf("Input: %q, got outcome: %f, correct = %t\n", c.input, output, output == c.expected) + } + } + // Output: + // Input: "1", got outcome: 1.000000, correct = true + // Input: "(123.10)", got outcome: 123.100000, correct = true + // Input: "1 + 2 + 3 + 4 + 5", got outcome: 15.000000, correct = true + // Input: "1 * 2 * 3 * 4 * 5 * 0.6", got outcome: 72.000000, correct = true + // Input: "(3.05+2)*(4.3+5.12)", got outcome: 47.571000, correct = true + // Input: "8.10 + 999/233", got outcome: 12.387554, correct = true + // Input: " -10 + (10.8+ (3 *-20-3*(8 +-4.12)) + 10)/5 ", got outcome: -20.168000, correct = true + // Input: "", got error: unexpected end of file + // Input: "(", got error: unexpected end of file + // Input: "10+20-", got error: unexpected end of file + // Input: "10+20-(4*10))", got error: unexpected character ')' (expected end of file) + // Input: "10+20-((4*10) + 17", got error: unexpected end of file (expected ')') +} + +// 
--------------------------------------------------------------------------- +// Implementation of the calculator +// --------------------------------------------------------------------------- + +// calculator implements a recursive descent parser that is responsible for parsing +// the input computation string according to the grammar. +// It offloads the actual computation to a separate interpreter. +type calculator struct { + interpreter interpreter + result float64 +} + +// Compute takes a calculation string as input and returns the interpreted result +// value for the calculation. An error can be returned as well, in case the +// computation fails for some reason. +func Compute(input string) (float64, *parsekit.Error) { + c := &calculator{} + parser := parsekit.NewParser(c.computation) + _, err, _ := parser.Parse(input).Next() + return c.result, err +} + +func (c *calculator) computation(p *parsekit.ParseAPI) { + c.interpreter.push() + + p.Handle(c.expr) + p.ExpectEndOfFile() + p.Handle(c.factor) + + c.result = c.interpreter.pop() +} + +// expr : term ((ADD|SUB) term)* +func (c *calculator) expr(p *parsekit.ParseAPI) { + c.interpreter.push() + + var pc, a = parsekit.C, parsekit.A + p.Handle(c.term) + for p.On(pc.Any(a.Add, a.Subtract)).Skip() { + c.interpreter.pushOperator(p.LastMatch) + p.Handle(c.term) + c.interpreter.eval() + } + + c.interpreter.pop() +} + +// term : factor ((MUL|DIV) factor)* +func (c *calculator) term(p *parsekit.ParseAPI) { + c.interpreter.push() + + var pc, a = parsekit.C, parsekit.A + p.Handle(c.factor) + for p.On(pc.Any(a.Multiply, a.Divide)).Skip() { + c.interpreter.pushOperator(p.LastMatch) + p.Handle(c.factor) + c.interpreter.eval() + } + + c.interpreter.pop() +} + +// factor : space (FLOAT | LPAREN expr RPAREN) space +func (c *calculator) factor(p *parsekit.ParseAPI) { + var pc, a = parsekit.C, parsekit.A + p.On(a.Whitespace).Skip() + switch { + case p.On(pc.Signed(a.Float)).Accept(): + floatStr := p.BufLiteral() + p.BufClear() + 
value, err := strconv.ParseFloat(floatStr, 64) + if err != nil { + p.EmitError("invalid number %s: %s", floatStr, err) + } else { + c.interpreter.pushValue(value) + } + case p.On(a.LeftParen).Skip(): + p.Handle(c.expr) + if !p.On(a.RightParen).Skip() { + p.Expects("')'") + p.UnexpectedInput() + } + default: + p.UnexpectedInput() + } + p.On(a.Whitespace).Skip() +} + +// --------------------------------------------------------------------------- +// The computational interpreter, used by the calculator. +// --------------------------------------------------------------------------- + +type stackFrame struct { + a float64 + b float64 + op func(a, b float64) float64 +} + +type interpreter struct { + stack []*stackFrame + top *stackFrame +} + +func (i *interpreter) push() *stackFrame { + f := &stackFrame{} + i.stack = append(i.stack, f) + i.top = f + i.pushOperator("VAL") + return f +} + +func (i *interpreter) pop() float64 { + value := i.eval() + i.stack = i.stack[0 : len(i.stack)-1] + if len(i.stack) > 0 { + i.top = i.stack[len(i.stack)-1] + i.pushValue(value) + } else { + i.top = nil + } + return value +} + +func (i *interpreter) pushValue(value float64) { + i.top.a, i.top.b = i.top.b, value +} + +func (i *interpreter) pushOperator(op string) { + switch op { + case "VAL": + i.top.op = func(a, b float64) float64 { return b } + case "+": + i.top.op = func(a, b float64) float64 { return a + b } + case "-": + i.top.op = func(a, b float64) float64 { return a - b } + case "*": + i.top.op = func(a, b float64) float64 { return a * b } + case "/": + i.top.op = func(a, b float64) float64 { return a / b } + default: + panic(fmt.Sprintf("Unhandled op name: %s", op)) + } +} + +func (i *interpreter) eval() float64 { + value := i.top.op(i.top.a, i.top.b) + i.pushValue(value) + i.pushOperator("VAL") + return value +} diff --git a/example_basiccalculator_test.go b/example_basiccalculator_test.go deleted file mode 100644 index f953d21..0000000 --- a/example_basiccalculator_test.go +++ 
/dev/null @@ -1,142 +0,0 @@ -// Let's write a small example for parsing a really basic calculator. -// The calculator understands input that looks like: -// -// 10 + 20 - 8+4 -// -// So positive numbers that can be either added or substracted, and whitespace -// is ignored. -package parsekit_test - -import ( - "fmt" - "strconv" - - "git.makaay.nl/mauricem/go-parsekit" -) - -// When writing a parser, it's a good start to use the parser/combinator -// functionality of parsekit to create some TokenHandler functions. These functions -// can later be used in the parser state machine to check for matching strings -// on the input data. -// -// For the calculator, we only need a definition of "number, surrounded by -// optional whitespace". Skipping whitespace could be a part of the StateHandler -// functions below too, but including it in a TokenHandler makes things really -// practical. -func createNumberMatcher() parsekit.TokenHandler { - // Easy access to parsekit definition. - c, a, m := parsekit.C, parsekit.A, parsekit.M - - whitespace := m.Drop(c.Opt(a.Whitespace)) - return c.Seq(whitespace, c.OneOrMore(a.Digit), whitespace) -} - -var calcNumber = createNumberMatcher() - -// We need to define the ItemTypes that we will use for emitting Items -// during the parsing process. -const ( - numberType parsekit.ItemType = iota - addType - subtractType -) - -// We also need to define the state machine for parsing the input. -// The state machine is built up from functions that match the StateHandler -// signature: func(*parsekit.ParseAPI) -// The ParseAPI struct holds the internal state for the parser and it provides -// some methods that form the API for your StateHandler implementation. - -// State: expect a number. When a number is found on the input, -// it is accepted in the parser's string buffer, after which that buffer is -// emitted as a numberType item. Then we tell the state machine to continue -// with the calcWaitForOperatorOrEndOfInput state. 
-// When no number is found, the parser will emit an error, explaining that -// "a number" was expected. -func calcWaitForNumber(p *parsekit.ParseAPI) { - p.Expects("a number") - if p.On(calcNumber).Accept() { - p.EmitLiteral(numberType) - p.RouteTo(calcWaitForOperatorOrEndOfInput) - } -} - -// State: expect a plus or minus operator. When one of those is found, the -// appropriate Item is emitted and the parser is sent back to the -// numberHandler to find the next number on the input. When no operator is -// found, then the parser is told to expect the end of the input. When more -// input data are available (which are obviously wrong data since they do -// not match our syntax), the parser will emit an error. -func calcWaitForOperatorOrEndOfInput(p *parsekit.ParseAPI) { - switch { - case p.On(a.Plus).Accept(): - p.EmitLiteral(addType) - p.RouteTo(calcWaitForNumber) - case p.On(a.Minus).Accept(): - p.EmitLiteral(subtractType) - p.RouteTo(calcWaitForNumber) - default: - p.ExpectEndOfFile() - } -} - -// All is ready for our parser. We now can create a new Parser struct. -// We need to tell it what StateHandler to start with. In our case, it is the -// calcWaitForNumber state, since the calculation must start with a number. -var calcParser = parsekit.NewParser(calcWaitForNumber) - -func Example_basicCalculator() { - // Let's feed the parser some input to work with. This provides us with - // a parse run for that input. - run := calcParser.Parse(" 153+22 + 31-4 -\t 6+42 ") - - // We can now step through the results of the parsing process by repeated - // calls to run.Next(). Next() returns either the next parse item, a parse - // error or an end of file. Let's dump the parse results and handle the - // computation while we're at it. - // TODO this in convoluted for people using the parser code I think. Maybe use three output data types instead? 
- sum := 0 - op := +1 - for { - item, err, ok := run.Next() - switch { - case !ok && err == nil: - fmt.Println("End of file reached") - fmt.Println("Outcome of computation:", sum) - return - case !ok: - fmt.Printf("Error: %s\n", err) - return - default: - fmt.Printf("Type: %d, Value: %q\n", item.Type, item.Value) - switch { - case item.Type == addType: - op = +1 - case item.Type == subtractType: - op = -1 - case item.Type == numberType: - nr, err := strconv.Atoi(item.Value) - if err != nil { - fmt.Printf("Error: invalid number %s: %s\n", item.Value, err) - return - } - sum += op * nr - } - } - } - - // Output: - // Type: 0, Value: "153" - // Type: 1, Value: "+" - // Type: 0, Value: "22" - // Type: 1, Value: "+" - // Type: 0, Value: "31" - // Type: 2, Value: "-" - // Type: 0, Value: "4" - // Type: 2, Value: "-" - // Type: 0, Value: "6" - // Type: 1, Value: "+" - // Type: 0, Value: "42" - // End of file reached - // Outcome of computation: 238 -} diff --git a/example_dutchpostcode_test.go b/example_dutchpostcode_test.go index 36364e6..d96735a 100644 --- a/example_dutchpostcode_test.go +++ b/example_dutchpostcode_test.go @@ -1,4 +1,4 @@ -// In this example, a Paparserrser is created which can parse and normalize Dutch postcodes +// In this example, a Parser is created that can parse and normalize Dutch postcodes // The implementation uses only TokenHandler functions and does not implement a // full-fledged state-based Parser for it. package parsekit_test @@ -26,7 +26,7 @@ func createPostcodeMatcher() *parsekit.Matcher { space := m.Replace(c.Opt(a.Whitespace), " ") postcode := c.Seq(pcDigits, space, pcLetters) - // Create a Matcher, which wraps the 'postcode' TokenHandler and allows + // Create a Matcher that wraps the 'postcode' TokenHandler and allows // us to match some input against that handler. 
return parsekit.NewMatcher(postcode, "a Dutch postcode") } diff --git a/example_hellomatcher_test.go b/example_hellomatcher_test.go index da50d83..11b4988 100644 --- a/example_hellomatcher_test.go +++ b/example_hellomatcher_test.go @@ -26,7 +26,7 @@ func createHelloMatcher() *parsekit.Matcher { name := c.OneOrMore(c.Not(a.Excl)) greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl)) - // Create a Matcher, which wraps the 'greeting' TokenHandler and allows + // Create a Matcher that wraps the 'greeting' TokenHandler and allows // us to match some input against that handler. return parsekit.NewMatcher(greeting, "a friendly greeting") } diff --git a/examples_test.go b/examples_test.go index 899e7c2..a1a1a2d 100644 --- a/examples_test.go +++ b/examples_test.go @@ -24,7 +24,7 @@ func ExampleItem() { // You define your own item types for your specific parser. const QuestionItem = parsekit.ItemType(42) - // A StateHandler function can use the defined item type by means of + // A ParseHandler function can use the defined item type by means of // the p.Emit* methods on parsekit.P. // When errors occur, or the end of the file is reached, then the built-in // types parsekit.ItemEOF and parsekit.ItemError will be emitted by parsekit. diff --git a/statehandler.go b/parsehandler.go similarity index 77% rename from statehandler.go rename to parsehandler.go index 16201be..27b7783 100644 --- a/statehandler.go +++ b/parsehandler.go @@ -2,20 +2,20 @@ package parsekit import "unicode/utf8" -// StateHandler defines the type of function that must be implemented to handle +// ParseHandler defines the type of function that must be implemented to handle // a parsing state in a Parser state machine. // -// A StateHandler function gets a ParseAPI struct as its input. This struct holds +// A ParseHandler function gets a ParseAPI struct as its input. 
This struct holds // all the internal state for the parsing state machine and provides the -// interface that the StateHandler uses to interact with the parser. -type StateHandler func(*ParseAPI) +// interface that the ParseHandler uses to interact with the parser. +type ParseHandler func(*ParseAPI) // ParseAPI holds the internal state of a parse run and provides an API to -// StateHandler methods to communicate with the parser. +// ParseHandler methods to communicate with the parser. type ParseAPI struct { - state StateHandler // the function that handles the current state - nextState StateHandler // the function that will handle the next state - routeStack []StateHandler // route stack, for handling nested parsing + state ParseHandler // the function that handles the current state + nextState ParseHandler // the function that will handle the next state + routeStack []ParseHandler // route stack, for handling nested parsing input string // the input that is being scanned by the parser inputPos int // current byte cursor position in the input cursorLine int // current rune cursor row number in the input @@ -26,7 +26,7 @@ type ParseAPI struct { buffer stringBuffer // an efficient buffer, used to build string values (see P.Accept()) items []Item // a slice of resulting Parser items (see P.Emit()) item Item // the current item as reached by Next() and retrieved by Get() - err *Error // an error when lexing failed, retrieved by Error() + err *Error // an error when parsing failed, can be retrieved by Error() LastMatch string // a string representation of the last matched input data } @@ -42,11 +42,11 @@ func (p *ParseAPI) peek(byteOffset int) (rune, int, bool) { return handleRuneError(r, w) } -// eofRune is a special rune, which is used to indicate an end of file when +// eofRune is a special rune that is used to indicate an end of file when // reading a character from the input. 
const eofRune rune = -1 -// invalidRune is a special rune, which is used to indicate an invalid UTF8 +// invalidRune is a special rune that is used to indicate an invalid UTF8 // rune on the input. const invalidRune rune = utf8.RuneError diff --git a/statehandler_emit.go b/parsehandler_emit.go similarity index 90% rename from statehandler_emit.go rename to parsehandler_emit.go index 0442f77..e8cfb6f 100644 --- a/statehandler_emit.go +++ b/parsehandler_emit.go @@ -4,7 +4,7 @@ import ( "fmt" ) -// Item represents an item that can be emitted from the parser. +// Item represents an item that can be emitted from a ParseHandler function. type Item struct { Type ItemType Value string @@ -50,6 +50,11 @@ func (p *ParseAPI) EmitLiteral(t ItemType) { p.Emit(t, p.BufLiteral()) } +// BufClear clears the contents of the parser string buffer. +func (p *ParseAPI) BufClear() { + p.buffer.reset() +} + // BufInterpreted retrieves the contents of the parser's string buffer (all // the runes that were added to it using ParseAPI.Accept()) as an // interpreted string. @@ -118,9 +123,18 @@ func (p *ParseAPI) EmitError(format string, args ...interface{}) { p.Emit(ItemError, message) } -// UnexpectedInput is used by a StateHandler function to emit an error item +// EmitEOF emits an EOF to the client. In effect, this will stop the parsing process. +func (p *ParseAPI) EmitEOF() { + p.Emit(ItemEOF, "EOF") +} + +// UnexpectedInput is used by a ParseHandler function to emit an error item // that tells the client that an unexpected rune was encountered in the input. func (p *ParseAPI) UnexpectedInput() { + // When some previous parsing step yielded an error, skip this operation. 
+ if p.err != nil { + return + } r, _, ok := p.peek(0) switch { case ok: diff --git a/statehandler_expects.go b/parsehandler_expects.go similarity index 68% rename from statehandler_expects.go rename to parsehandler_expects.go index fb082e8..7e6d80a 100644 --- a/statehandler_expects.go +++ b/parsehandler_expects.go @@ -1,9 +1,9 @@ package parsekit -// Expects is used to let a StateHandler function describe what input it is expecting. +// Expects is used to let a ParseHandler function describe what input it is expecting. // This expectation is used in error messages to make them more descriptive. // -// When defining an expectation inside a StateHandler, you do not need to +// When defining an expectation inside a ParseHandler, you do not need to // handle unexpected input yourself. When the end of the function is reached // without setting the next state, an automatic error will be emitted. // This error can differentiate between the following issues: @@ -14,5 +14,7 @@ package parsekit // // 3) the end of the file was reached. func (p *ParseAPI) Expects(description string) { + // TODO make this into some debugging tool? + // fmt.Printf("Expecting %s @ line %d, col %d\n", description, p.cursorLine, p.cursorColumn) p.expecting = description } diff --git a/statehandler_on.go b/parsehandler_on.go similarity index 91% rename from statehandler_on.go rename to parsehandler_on.go index 737bbbf..7066ac3 100644 --- a/statehandler_on.go +++ b/parsehandler_on.go @@ -1,7 +1,7 @@ package parsekit // On checks if the input at the current cursor position matches the provided -// TokenHandler. On must be chained with another method, which tells the parser +// TokenHandler. On must be chained with another method that tells the parser // what action to perform when a match was found: // // 1) On(...).Skip() - Only move cursor forward, ignore the matched runes. 
@@ -14,7 +14,7 @@ package parsekit // // p.On(parsekit.A.Whitespace).Skip() // -// The chain as a whole returns a boolean, which indicates whether or not at match +// The chain as a whole returns a boolean that indicates whether or not a match // was found. When no match was found, false is returned and Skip() and Accept() // will have no effect. Because of this, typical use of an On() chain is as // expression for a conditional expression (if, switch/case, for). E.g.: @@ -37,6 +37,15 @@ package parsekit // p.Emit(SomeItemType, p.BufLiteral()) // } func (p *ParseAPI) On(tokenHandler TokenHandler) *MatchAction { + // When some previous parsing step yielded an error, skip this operation. + if p.err != nil { + return &MatchAction{ + p: p, + ok: false, + } + } + + // Perform the matching operation. m := &TokenAPI{p: p} if tokenHandler == nil { panic("internal parser error: tokenHandler argument for On() is nil") diff --git a/parsehandler_route.go b/parsehandler_route.go new file mode 100644 index 0000000..d66eae3 --- /dev/null +++ b/parsehandler_route.go @@ -0,0 +1,125 @@ +package parsekit + +// Handle is used to execute other ParseHandler functions from within your +// ParseHandler function. +func (p *ParseAPI) Handle(handlers ...ParseHandler) { + for _, handler := range handlers { + // When some previous parsing step yielded an error, skip this operation. + if p.err != nil { + break + } + handler(p) + } +} + +// RouteTo tells the parser what ParseHandler function to invoke on +// the next parse cycle. +func (p *ParseAPI) RouteTo(handler ParseHandler) *RouteFollowupAction { + p.nextState = handler + return &RouteFollowupAction{p} +} + +// RouteRepeat tells the parser that on the next parsing cycle, the current +// ParseHandler must be reinvoked. +func (p *ParseAPI) RouteRepeat() { + p.RouteTo(p.state) +} + +// RouteReturn tells the parser that on the next cycle the last ParseHandler +// that was pushed on the route stack must be invoked. 
+// +// Using this method is optional. When implementing a ParseHandler that +// is used as a sort of subroutine (using constructions like +// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from +// providing an explicit routing decision from that handler. The parser will +// automatically assume a RouteReturn() in that case. +func (p *ParseAPI) RouteReturn() { + p.nextState = p.popRoute() +} + +// RouteFollowupAction chains parsing routes. +// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). +type RouteFollowupAction struct { + p *ParseAPI +} + +// ThenTo schedules a ParseHandler that must be invoked after the RouteTo +// ParseHandler has been completed. +// For example: +// +// p.RouteTo(handlerA).ThenTo(handlerB) +func (a *RouteFollowupAction) ThenTo(state ParseHandler) { + a.p.pushRoute(state) +} + +// ThenReturnHere schedules the current ParseHandler to be invoked after +// the RouteTo ParseHandler has been completed. +// For example: +// +// p.RouteTo(handlerA).ThenReturnHere() +func (a *RouteFollowupAction) ThenReturnHere() { + a.p.pushRoute(a.p.state) +} + +// pushRoute adds the ParseHandler to the route stack. +// This is used for implementing nested parsing. +func (p *ParseAPI) pushRoute(state ParseHandler) { + p.routeStack = append(p.routeStack, state) +} + +// popRoute pops the last pushed ParseHandler from the route stack. +func (p *ParseAPI) popRoute() ParseHandler { + last := len(p.routeStack) - 1 + head, tail := p.routeStack[:last], p.routeStack[last] + p.routeStack = head + return tail +} + +// ExpectEndOfFile can be used to check if the input is at end of file. +// Intended use: +// +// func yourParseHandler(p *parsekit.ParseAPI) { +// ... +// p.ExpectEndOfFile() +// } +// +// This will execute the end of file test right away. 
If you want to +// use the end of file check as a ParseHandler instead, you can also +// make use of another form, for example: +// +// func yourParseHandler(p *parsekit.ParseAPI) { +// p.RouteTo(yourHandler).ThenTo(parsekit.ExpectEndOfFile) +// } +func (p *ParseAPI) ExpectEndOfFile() { + // When some previous parsing step yielded an error, skip this operation. + if p.err == nil { + if p.On(A.EndOfFile).Stay() { + p.EmitEOF() + } else { + p.Expects("end of file") + p.UnexpectedInput() + } + } +} + +// ExpectEndOfFile can be scheduled as a ParseHandler function. +// It makes sure that the input is at the end of file. +// Intended use: +// +// func yourParseHandler(p *parsekit.ParseAPI) { +// ... +// p.RouteTo(parsekit.ExpectEndOfFile) +// } +// +// It is not mandatory to use this ParseHandler. You can take care of EOF +// yourself too. Simply emit an ItemEOF when the end of the input was reached +// to stop the parser loop: +// +// p.EmitEOF() +// TODO meh, get rid of this one, once we don't use state scheduling anymore. +func ExpectEndOfFile(p *ParseAPI) { + p.Expects("end of file") + if p.On(A.EndOfFile).Stay() { + p.EmitEOF() + } +} diff --git a/parsekit.go b/parsekit.go index fc325d9..a18f939 100644 --- a/parsekit.go +++ b/parsekit.go @@ -9,24 +9,24 @@ import ( // Parser is the top-level struct that holds the configuration for a parser. // The Parser can be instantiated using the parsekit.NewParser() method. type Parser struct { - startState StateHandler // the function that handles the very first state + startState ParseHandler // the function that handles the very first state } // NewParser instantiates a new Parser. // // The Parser is a state machine-style recursive descent parser, in which -// StateHandler functions are used to move the state machine forward during -// parsing. This style of parser is typically used for parsing languages and -// structured data formats (like json, toml, etc.) 
+// ParseHandler functions are used to move the state machine forward during +// parsing. This style of parser is typically used for parsing programming +// languages and structured data formats (like json, xml, toml, etc.) // // To start parsing input data, use the method Parser.Parse(). -func NewParser(startState StateHandler) *Parser { +func NewParser(startState ParseHandler) *Parser { return &Parser{startState: startState} } // ParseRun represents a single parse run for a Parser. type ParseRun struct { - p *ParseAPI // holds the internal state of a parse run + p *ParseAPI // holds parser state and provides an API to ParseHandler functions } // Parse starts a parse run on the provided input data. @@ -60,7 +60,7 @@ func (run *ParseRun) Next() (Item, *Error, bool) { return run.makeReturnValues(item) } // Otherwise, the next state handler is looked up and invoked. - run.runNextStateHandler() + run.runNextParseHandler() } } @@ -77,43 +77,43 @@ func (run *ParseRun) makeReturnValues(i Item) (Item, *Error, bool) { } } -// runNextStateHandler moves the parser, which is bascially a state machine, +// runNextParseHandler moves the parser, which is basically a state machine, // to its next status. It does so by invoking a function of the -// type StateHandler. This function represents the current status and +// type ParseHandler. This function represents the current status and // is responsible for moving the parser to its next status, depending // on the parsed input data. -func (run *ParseRun) runNextStateHandler() { - if state, ok := run.getNextStateHandler(); ok { - run.invokeNextStateHandler(state) +func (run *ParseRun) runNextParseHandler() { + if state, ok := run.getNextParseHandler(); ok { + run.invokeNextParseHandler(state) } } -// getNextStateHandler determines the next StateHandler to invoke in order +// getNextParseHandler determines the next ParseHandler to invoke in order // to move the parsing state machine one step further. 
// -// When implementing a parser, the StateHandler functions must provide +// When implementing a parser, the ParseHandler functions must provide // a routing decision in every invocation. A routing decision is one // of the following: // -// * A route is specified explicitly, which means that the next StateHandler -// function to invoke is registered during the StateHandler function +// * A route is specified explicitly, which means that the next ParseHandler +// function to invoke is registered during the ParseHandler function // invocation. For example: p.RouteTo(nextStatus) // -// * A route is specified implicitly, which means that a previous StateHandler +// * A route is specified implicitly, which means that a previous ParseHandler // invocation has registered the followup route for the current state. // For example: p.RouteTo(nextStatus).ThenTo(otherStatus) -// In this example, the nextStatus StateHandler will not have to specify +// In this example, the nextStatus ParseHandler will not have to specify // a route explicitly, but otherStatus will be used implicitly after // the nextStatus function has returned. // -// * An expectation is registered by the StateHandler. +// * An expectation is registered by the ParseHandler. // For example: p.Expects("a cool thing") -// When the StateHandler returns without having specified a route, this +// When the ParseHandler returns without having specified a route, this // expectation is used to generate an "unexpected input" error message. // -// When no routing decision is provided by a StateHandler, then this is +// When no routing decision is provided by a ParseHandler, then this is // considered a bug in the state handler, and the parser will panic. 
-func (run *ParseRun) getNextStateHandler() (StateHandler, bool) { +func (run *ParseRun) getNextParseHandler() (ParseHandler, bool) { switch { case run.p.nextState != nil: return run.p.nextState, true @@ -124,13 +124,13 @@ func (run *ParseRun) getNextStateHandler() (StateHandler, bool) { return nil, false default: name := runtime.FuncForPC(reflect.ValueOf(run.p.state).Pointer()).Name() - panic(fmt.Sprintf("internal parser error: StateHandler %s did not provide a routing decision", name)) + panic(fmt.Sprintf("internal parser error: ParseHandler %s did not provide a routing decision", name)) } } -// invokeNextStateHandler moves the parser state to the provided state -// and invokes the StateHandler function. -func (run *ParseRun) invokeNextStateHandler(state StateHandler) { +// invokeNextParseHandler moves the parser state to the provided state +// and invokes the ParseHandler function. +func (run *ParseRun) invokeNextParseHandler(state ParseHandler) { run.p.state = state run.p.nextState = nil run.p.expecting = "" diff --git a/statehandler_route.go b/statehandler_route.go deleted file mode 100644 index 907a64e..0000000 --- a/statehandler_route.go +++ /dev/null @@ -1,76 +0,0 @@ -package parsekit - -// RouteTo tells the parser what StateHandler function to invoke on -// the next parse cycle. -func (p *ParseAPI) RouteTo(state StateHandler) *RouteFollowupAction { - p.nextState = state - return &RouteFollowupAction{p} -} - -// RouteRepeat tells the parser that on the next parsing cycle, the current -// StateHandler must be reinvoked. -func (p *ParseAPI) RouteRepeat() { - p.RouteTo(p.state) -} - -// RouteReturn tells the parser that on the next cycle the last StateHandler -// that was pushed on the route stack must be invoked. -// -// Using this method is optional. 
When implementating a StateHandler that -// is used as a sort of subroutine (using constructions like -// p.RouteTo(subroutine).ThenReturnHere()), you can refrain from -// providing an explicit routing decision from that handler. The parser will -// automatically assume a RouteReturn() in that case. -func (p *ParseAPI) RouteReturn() { - p.nextState = p.popRoute() -} - -// RouteFollowupAction chains parsing routes. -// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). -type RouteFollowupAction struct { - p *ParseAPI -} - -// ThenTo schedules a StateHandler that must be invoked after the RouteTo -// StateHandler has been completed. -// For example: -// -// p.RouteTo(handlerA).ThenTo(handlerB) -func (a *RouteFollowupAction) ThenTo(state StateHandler) { - a.p.pushRoute(state) -} - -// ThenReturnHere schedules the current StateHandler to be invoked after -// the RouteTo StateHandler has been completed. -// For example: -// -// p.RouteTo(handlerA).ThenReturnHere() -func (a *RouteFollowupAction) ThenReturnHere() { - a.p.pushRoute(a.p.state) -} - -// pushRoute adds the StateHandler to the route stack. -// This is used for implementing nested parsing. -func (p *ParseAPI) pushRoute(state StateHandler) { - p.routeStack = append(p.routeStack, state) -} - -// popRoute pops the last pushed StateHandler from the route stack. -func (p *ParseAPI) popRoute() StateHandler { - last := len(p.routeStack) - 1 - head, tail := p.routeStack[:last], p.routeStack[last] - p.routeStack = head - return tail -} - -// ExpectEndOfFile can be used from a StateHandler function to indicate that -// your parser expects to be at the end of the file. This will schedule -// a parsekit-provided StateHandler which will do the actual check for this. 
-func (p *ParseAPI) ExpectEndOfFile() { - p.RouteTo(func(p *ParseAPI) { - p.Expects("end of file") - if p.On(A.EndOfFile).Stay() { - p.Emit(ItemEOF, "EOF") - } - }) -} diff --git a/stringbuf.go b/stringbuf.go index 8df4659..727eed6 100644 --- a/stringbuf.go +++ b/stringbuf.go @@ -6,7 +6,7 @@ import ( "strings" ) -// stringBuffer is a string buffer implementation, which is used by the parser +// stringBuffer is a string buffer implementation that is used by the parser // to efficiently accumulate runes from the input and eventually turn these // into a string, either literal or interpreted. type stringBuffer struct { diff --git a/tokenhandlers_builtin.go b/tokenhandlers_builtin.go index c565088..b18ece2 100644 --- a/tokenhandlers_builtin.go +++ b/tokenhandlers_builtin.go @@ -33,6 +33,7 @@ var C = struct { MinMax func(min int, max int, handler TokenHandler) TokenHandler Separated func(separated TokenHandler, separator TokenHandler) TokenHandler // TODO reverse args for consistency Except func(except TokenHandler, handler TokenHandler) TokenHandler + Signed func(TokenHandler) TokenHandler }{ Rune: MatchRune, Runes: MatchRunes, @@ -51,6 +52,7 @@ var C = struct { MinMax: MatchMinMax, Separated: MatchSeparated, Except: MatchExcept, + Signed: MatchSigned, } // MatchRune creates a TokenHandler function that checks if the next rune from @@ -293,6 +295,16 @@ func MatchExcept(except TokenHandler, handler TokenHandler) TokenHandler { } } +// MatchSigned creates a TokenHandler that checks if the provided TokenHandler is +// prefixed by an optional '+' or '-' sign. This can be used to turn numeric +// atoms into a signed version, e.g. +// +// C.Signed(A.Integer) +func MatchSigned(handler TokenHandler) TokenHandler { + sign := MatchOpt(MatchAny(MatchRune('+'), MatchRune('-'))) + return MatchSeq(sign, handler) +} + // A provides convenient access to a range of atoms that can be used to // build TokenHandlers or parser rules. 
// @@ -320,18 +332,26 @@ var A = struct { Amp TokenHandler SingleQuote TokenHandler RoundOpen TokenHandler + LeftParen TokenHandler RoundClose TokenHandler + RightParen TokenHandler Asterisk TokenHandler + Multiply TokenHandler Plus TokenHandler + Add TokenHandler Comma TokenHandler Minus TokenHandler + Subtract TokenHandler Dot TokenHandler Slash TokenHandler + Divide TokenHandler Colon TokenHandler Semicolon TokenHandler AngleOpen TokenHandler + LessThan TokenHandler Equal TokenHandler AngleClose TokenHandler + GreaterThan TokenHandler Question TokenHandler At TokenHandler SquareOpen TokenHandler @@ -349,6 +369,10 @@ var A = struct { WhitespaceAndNewlines TokenHandler EndOfLine TokenHandler Digit TokenHandler + DigitNotZero TokenHandler + Digits TokenHandler + Float TokenHandler + Integer TokenHandler ASCII TokenHandler ASCIILower TokenHandler ASCIIUpper TokenHandler @@ -369,18 +393,26 @@ var A = struct { Amp: C.Rune('&'), SingleQuote: C.Rune('\''), RoundOpen: C.Rune('('), + LeftParen: C.Rune('('), RoundClose: C.Rune(')'), + RightParen: C.Rune(')'), Asterisk: C.Rune('*'), + Multiply: C.Rune('*'), Plus: C.Rune('+'), + Add: C.Rune('+'), Comma: C.Rune(','), Minus: C.Rune('-'), + Subtract: C.Rune('-'), Dot: C.Rune('.'), Slash: C.Rune('/'), + Divide: C.Rune('/'), Colon: C.Rune(':'), Semicolon: C.Rune(';'), AngleOpen: C.Rune('<'), + LessThan: C.Rune('<'), Equal: C.Rune('='), AngleClose: C.Rune('>'), + GreaterThan: C.Rune('>'), Question: C.Rune('?'), At: C.Rune('@'), SquareOpen: C.Rune('['), @@ -396,7 +428,11 @@ var A = struct { Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), - Digit: C.RuneRange('0', '9'), + Digit: MatchDigit(), + DigitNotZero: MatchDigitNotZero(), + Digits: MatchDigits(), + Integer: MatchInteger(), + Float: MatchFloat(), ASCII: C.RuneRange('\x00', '\x7F'), ASCIILower: 
C.RuneRange('a', 'z'), ASCIIUpper: C.RuneRange('A', 'Z'), @@ -428,6 +464,42 @@ func MatchAnyRune() TokenHandler { } } +// MatchDigit creates a TokenHandler that checks if a single digit can be read +// from the input. +func MatchDigit() TokenHandler { + return MatchRuneRange('0', '9') +} + +// MatchDigits creates a TokenHandler that checks if one or more digits can be read +// from the input. +func MatchDigits() TokenHandler { + return MatchOneOrMore(MatchRuneRange('0', '9')) +} + +// MatchDigitNotZero creates a TokenHandler that checks if a single digit not equal +// to zero '0' can be read from the input. +func MatchDigitNotZero() TokenHandler { + return MatchRuneRange('1', '9') +} + +// MatchInteger creates a TokenHandler function that checks if a valid integer +// can be read from the input. In line with Go, an integer cannot start with +// a zero. Starting with a zero is used to indicate other bases, like octal or +// hexadecimal. +func MatchInteger() TokenHandler { + justZero := MatchRune('0') + integer := C.Seq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit())) + return MatchAny(integer, justZero) +} + +// MatchFloat creates a TokenHandler function that checks if a valid float value +// can be read from the input. In case the fractional part is missing, this +// TokenHandler will report a match, so both "123" and "123.123" will match. +func MatchFloat() TokenHandler { + digits := MatchDigits() + return MatchSeq(digits, MatchOpt(MatchSeq(MatchRune('.'), digits))) +} + // M provides convenient access to a range of modifiers (which in their nature are // parser/combinators) that can be used when creating TokenHandler functions.
// diff --git a/tokenhandlers_builtin_test.go b/tokenhandlers_builtin_test.go index 950f513..4d86970 100644 --- a/tokenhandlers_builtin_test.go +++ b/tokenhandlers_builtin_test.go @@ -96,19 +96,27 @@ func TestAtoms(t *testing.T) { {"%", a.Percent, true, "%"}, {"&", a.Amp, true, "&"}, {"'", a.SingleQuote, true, "'"}, + {"(", a.LeftParen, true, "("}, {"(", a.RoundOpen, true, "("}, + {")", a.RightParen, true, ")"}, {")", a.RoundClose, true, ")"}, {"*", a.Asterisk, true, "*"}, + {"*", a.Multiply, true, "*"}, {"+", a.Plus, true, "+"}, + {"+", a.Add, true, "+"}, {",", a.Comma, true, ","}, {"-", a.Minus, true, "-"}, + {"-", a.Subtract, true, "-"}, {".", a.Dot, true, "."}, {"/", a.Slash, true, "/"}, + {"/", a.Divide, true, "/"}, {":", a.Colon, true, ":"}, {";", a.Semicolon, true, ";"}, {"<", a.AngleOpen, true, "<"}, + {"<", a.LessThan, true, "<"}, {"=", a.Equal, true, "="}, {">", a.AngleClose, true, ">"}, + {">", a.GreaterThan, true, ">"}, {"?", a.Question, true, "?"}, {"@", a.At, true, "@"}, {"[", a.SquareOpen, true, "["}, @@ -154,6 +162,22 @@ func TestAtoms(t *testing.T) { {"F", a.HexDigit, true, "F"}, {"g", a.HexDigit, false, "g"}, {"G", a.HexDigit, false, "G"}, + {"0", a.Integer, true, "0"}, + {"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is valid for the integer + {"1", a.Integer, true, "1"}, + {"-10X", a.Integer, false, ""}, + {"+10X", a.Integer, false, ""}, + {"-10X", c.Signed(a.Integer), true, "-10"}, + {"+10X", c.Signed(a.Integer), true, "+10"}, + {"+10.1X", c.Signed(a.Integer), true, "+10"}, + {"0X", a.Float, true, "0"}, + {"0X", a.Float, true, "0"}, + {"1X", a.Float, true, "1"}, + {"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up + {"123.321X", a.Float, true, "123.321"}, + {"-3.14X", a.Float, false, ""}, + {"-3.14X", c.Signed(a.Float), true, "-3.14"}, + {"-003.0014X", c.Signed(a.Float), true, "-003.0014"}, }) } @@ -174,8 +198,8 @@ func TestModifiers(t *testing.T) { func TestSequenceOfRunes(t *testing.T) { 
sequence := c.Seq( - a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.RoundOpen, - a.RoundClose, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash, + a.Hash, a.Dollar, a.Percent, a.Amp, a.SingleQuote, a.LeftParen, + a.RightParen, a.Asterisk, a.Plus, a.Comma, a.Minus, a.Dot, a.Slash, a.Colon, a.Semicolon, a.AngleOpen, a.Equal, a.AngleClose, a.Question, a.At, a.SquareOpen, a.Backslash, a.SquareClose, a.Caret, a.Underscore, a.Backquote, a.CurlyOpen, a.Pipe, a.CurlyClose, a.Tilde,