From 4580962fb8a75b20a706ad362cb23bf3c02a73d5 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Tue, 4 Jun 2019 00:03:08 +0000 Subject: [PATCH] Backup a load of work on typed token support, making it easy to produce tokens directly from parser/combinator-based parsing rules. --- assert/assert_comparison.go | 19 + assert/assert_panic.go | 34 + cursor.go | 29 + cursor_test.go | 42 ++ example_basiccalculator1_test.go | 22 +- example_basiccalculator2_test.go | 34 +- example_dutchpostcode_test.go | 12 +- ...go => example_helloManyStateParser_test.go | 58 +- ...o => example_helloParserCombinator_test.go | 10 +- ... => example_helloSingleStateParser_test.go | 34 +- examples_state_test.go | 11 +- examples_test.go | 36 +- parseapi.go | 68 ++ parsehandler.go | 107 --- parsehandler_error.go | 8 +- parsehandler_on.go | 91 +-- parsehandler_routing.go | 15 +- parsehandler_stringbuf.go | 47 -- parsehandler_test.go | 30 - parsekit.go | 28 +- parsekit_test.go | 41 +- reader.go | 108 +++ reader_test.go | 134 ++++ stringbuf.go | 62 -- stringbuf_test.go | 88 --- tokenapi.go | 188 +++++ tokenapi_result.go | 106 +++ tokenapi_result_test.go | 27 + tokenapi_test.go | 288 +++++++ tokenhandler.go | 224 ++---- tokenhandler_test.go | 143 +++- tokenhandlers_builtin.go | 705 +++++++++++++----- tokenhandlers_builtin_test.go | 264 ++++--- 33 files changed, 2179 insertions(+), 934 deletions(-) create mode 100644 assert/assert_comparison.go create mode 100644 assert/assert_panic.go create mode 100644 cursor.go create mode 100644 cursor_test.go rename example_helloparser1_test.go => example_helloManyStateParser_test.go (70%) rename example_hellomatcher_test.go => example_helloParserCombinator_test.go (91%) rename example_helloparser2_test.go => example_helloSingleStateParser_test.go (80%) create mode 100644 parseapi.go delete mode 100644 parsehandler_stringbuf.go create mode 100644 reader.go create mode 100644 reader_test.go delete mode 100644 stringbuf.go delete mode 100644 stringbuf_test.go create mode 100644 tokenapi.go create mode 100644 tokenapi_result.go create mode 100644 tokenapi_result_test.go create mode 100644 tokenapi_test.go diff --git a/assert/assert_comparison.go b/assert/assert_comparison.go new file mode 100644 index 0000000..5f62ba1 --- /dev/null +++ b/assert/assert_comparison.go @@ -0,0 +1,19 @@ +package assert + +import ( + "testing" +) + +func Equal(t *testing.T, expected interface{}, actual interface{}, forWhat string) { + if expected != actual { + t.Errorf( + "Unexpected value for %s:\nexpected: %q\nactual: %q", + forWhat, expected, actual) + } +} + +func NotEqual(t *testing.T, notExpected interface{}, actual interface{}, forWhat string) { + if notExpected == actual { + t.Errorf("Unexpected value for %s: %q", forWhat, actual) + } +} diff --git a/assert/assert_panic.go b/assert/assert_panic.go new file mode 100644 index 0000000..573d426 --- /dev/null +++ b/assert/assert_panic.go @@ -0,0 +1,34 @@ +package assert + +import ( + "regexp" + "testing" +) + +type PanicT struct { + Function func() + Expect string + Regexp bool +} + +func Panic(t *testing.T, p PanicT) { + defer func() { + if r := recover(); r != nil { + mismatch := false + if p.Regexp && !regexp.MustCompile(p.Expect).MatchString(r.(string)) { + mismatch = true + } + if !p.Regexp && p.Expect != r.(string) { + mismatch = true + } + if mismatch { + t.Errorf( + "Code did panic, but unexpected panic message received:\nexpected: %q\nactual: %q", + p.Expect, r) + } + } else { + t.Errorf("Function did not panic (expected panic message: %s)", 
p.Expect) + } + }() + p.Function() +} diff --git a/cursor.go b/cursor.go new file mode 100644 index 0000000..d5b03c1 --- /dev/null +++ b/cursor.go @@ -0,0 +1,29 @@ +package parsekit + +import "fmt" + +// Cursor represents the position of the input cursor in various ways. +type Cursor struct { + Byte int // The cursor offset in bytes + Rune int // The cursor offset in UTF8 runes + Column int // The column at which the cursor is (0-indexed) + Line int // The line at which the cursor is (0-indexed) +} + +func (c *Cursor) String() string { + return fmt.Sprintf("line %d, column %d", c.Line+1, c.Column+1) +} + +// move updates the position of the cursor, based on the provided input string. +func (c *Cursor) move(input string) { + c.Byte += len(input) + for _, r := range input { + c.Rune++ + if r == '\n' { + c.Column = 0 + c.Line++ + } else { + c.Column++ + } + } +} diff --git a/cursor_test.go b/cursor_test.go new file mode 100644 index 0000000..58b26c8 --- /dev/null +++ b/cursor_test.go @@ -0,0 +1,42 @@ +package parsekit + +import ( + "testing" +) + +func TestGivenCursor_WhenMoving_CursorIsUpdated(t *testing.T) { + for _, test := range []struct { + name string + input []string + byte int + rune int + line int + column int + }{ + {"No input at all", []string{""}, 0, 0, 0, 0}, + {"One ASCII char", []string{"a"}, 1, 1, 0, 1}, + {"Multiple ASCII chars", []string{"abc"}, 3, 3, 0, 3}, + {"One newline", []string{"\n"}, 1, 1, 1, 0}, + {"Carriage return", []string{"\r\r\r"}, 3, 3, 0, 3}, + {"One UTF8 3 byte char", []string{"⌘"}, 3, 1, 0, 1}, + {"Mixture", []string{"Hello\n\npretty\nW⌘O⌘R⌘L⌘D"}, 31, 23, 3, 9}, + {"Multiple calls", []string{"hello", "world"}, 10, 10, 0, 10}, + } { + c := Cursor{} + for _, s := range test.input { + c.move(s) + } + if c.Byte != test.byte { + t.Errorf("[%s] Unexpected byte offset %d (expected %d)", test.name, c.Byte, test.byte) + } + if c.Rune != test.rune { + t.Errorf("[%s] Unexpected rune offset %d (expected %d)", test.name, c.Rune, test.rune) + } + if c.Line != test.line { + t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, c.Line, test.line) + } + if c.Column != test.column { + t.Errorf("[%s] Unexpected column offset %d (expected %d)", test.name, c.Column, test.column) + } + } +} diff --git a/example_basiccalculator1_test.go b/example_basiccalculator1_test.go index a7bb38b..4180aa2 100644 --- a/example_basiccalculator1_test.go +++ b/example_basiccalculator1_test.go @@ -9,7 +9,6 @@ package parsekit_test import ( "fmt" - "strconv" "git.makaay.nl/mauricem/go-parsekit" ) @@ -28,7 +27,6 @@ func Example_basicCalculator1() { {"+", 0}, {"10.8 + 12", 0}, {"42+ ", 0}, - {"9999999999999999999 + 8888888", 0}, } { output, err := ComputeSimple(c.input) if err != nil { @@ -47,7 +45,6 @@ func Example_basicCalculator1() { // Input: "+", got error: unexpected character '+' (expected integer number) // Input: "10.8 + 12", got error: unexpected character '.' (expected operator, '+' or '-') // Input: "42+ ", got error: unexpected character ' ' (expected integer number) - // Input: "9999999999999999999 + 8888888", got error: invalid value: strconv.ParseInt: parsing "9999999999999999999": value out of range } // --------------------------------------------------------------------------- @@ -72,23 +69,16 @@ type simpleCalculator struct { op int64 // represents operation for next term (+1 = add, -1 = subtract) } -// A definition of bareInteger, which conveniently drops surrounding whitespace. +// A definition of an int64, which conveniently drops surrounding whitespace. 
var dropWhitespace = parsekit.M.Drop(parsekit.C.Opt(parsekit.A.Whitespace)) var bareInteger = parsekit.C.Seq(dropWhitespace, parsekit.A.Integer, dropWhitespace) +var int64Token = parsekit.T.Int64(nil, bareInteger) func (c *simpleCalculator) number(p *parsekit.ParseAPI) { - if p.On(bareInteger).Accept() { - value, err := strconv.ParseInt(p.BufLiteral(), 10, 64) - p.BufClear() - if err != nil { - p.Error("invalid value: %s", err) - } else { - c.Result += c.op * value - p.Handle(c.operatorOrEndOfFile) - } - } else { - p.Expects("integer number") - p.UnexpectedInput() + p.Expects("integer number") + if p.On(int64Token).Accept() { + c.Result += c.op * p.Result().Value(0).(int64) + p.Handle(c.operatorOrEndOfFile) } } diff --git a/example_basiccalculator2_test.go b/example_basiccalculator2_test.go index c5356d7..a15fa15 100644 --- a/example_basiccalculator2_test.go +++ b/example_basiccalculator2_test.go @@ -15,7 +15,6 @@ package parsekit_test import ( "fmt" "math" - "strconv" "git.makaay.nl/mauricem/go-parsekit" ) @@ -97,8 +96,8 @@ func (c *calculator) expr(p *parsekit.ParseAPI) { var pc, a = parsekit.C, parsekit.A if p.Handle(c.term) { - for p.On(pc.Any(a.Add, a.Subtract)).Skip() { - op := p.LastMatch + for p.On(pc.Any(a.Add, a.Subtract)).Accept() { + op := p.Result().Rune(0) if !p.Handle(c.term) { return } @@ -115,8 +114,8 @@ func (c *calculator) term(p *parsekit.ParseAPI) { var pc, a = parsekit.C, parsekit.A if p.Handle(c.factor) { - for p.On(pc.Any(a.Multiply, a.Divide)).Skip() { - op := p.LastMatch + for p.On(pc.Any(a.Multiply, a.Divide)).Accept() { + op := p.Result().Rune(0) if !p.Handle(c.factor) { return } @@ -130,19 +129,12 @@ func (c *calculator) term(p *parsekit.ParseAPI) { // = ( (SPACE|TAB) | "") // = (FLOAT | LPAREN RPAREN) func (c *calculator) factor(p *parsekit.ParseAPI) { - var pc, a = parsekit.C, parsekit.A + var a, tok = parsekit.A, parsekit.T p.On(a.Whitespace).Skip() switch { - case p.On(pc.Signed(a.Float)).Accept(): - floatStr := p.BufLiteral() - p.BufClear() - value, err := strconv.ParseFloat(floatStr, 64) - if err != nil { - p.Error("invalid number %s: %s", floatStr, err) - return - } else { - c.interpreter.pushValue(value) - } + case p.On(tok.Float64(nil, a.Signed(a.Float))).Accept(): + value := p.Result().Value(0).(float64) + c.interpreter.pushValue(value) case p.On(a.LeftParen).Skip(): if !p.Handle(c.expr) { return @@ -194,16 +186,16 @@ func (i *interpreter) pushValue(value float64) { i.top.a, i.top.b = i.top.b, value } -func (i *interpreter) eval(op string) float64 { +func (i *interpreter) eval(op rune) float64 { value := i.top.a switch op { - case "+": + case '+': value += i.top.b - case "-": + case '-': value -= i.top.b - case "*": + case '*': value *= i.top.b - case "/": + case '/': value /= i.top.b } i.top.b = value diff --git a/example_dutchpostcode_test.go b/example_dutchpostcode_test.go index 0905446..752dbad 100644 --- a/example_dutchpostcode_test.go +++ b/example_dutchpostcode_test.go @@ -36,11 +36,11 @@ func Example_dutchPostcodeUsingMatcher() { // [1] Input: "2233Ab" Output: 2233 AB // [2] Input: "1001\t\tab" Output: 1001 AB // [3] Input: "1818ab" Output: 1818 AB - // [4] Input: "1212abc" Error: unexpected character '1' (expected a Dutch postcode) at line 1, column 1 - // [5] Input: "1234" Error: unexpected character '1' (expected a Dutch postcode) at line 1, column 1 - // [6] Input: "huh" Error: unexpected character 'h' (expected a Dutch postcode) at line 1, column 1 - // [7] Input: "" Error: unexpected end of file (expected a Dutch postcode) at line 1, 
column 1 - // [8] Input: "\xcd2222AB" Error: invalid UTF8 character in input (expected a Dutch postcode) at line 1, column 1 + // [4] Input: "1212abc" Error: unexpected character '1' (expected a Dutch postcode) at start of file + // [5] Input: "1234" Error: unexpected character '1' (expected a Dutch postcode) at start of file + // [6] Input: "huh" Error: unexpected character 'h' (expected a Dutch postcode) at start of file + // [7] Input: "" Error: unexpected end of file (expected a Dutch postcode) at start of file + // [8] Input: "\xcd2222AB" Error: unexpected character '�' (expected a Dutch postcode) at start of file } // --------------------------------------------------------------------------- @@ -57,7 +57,7 @@ func createPostcodeMatcher() *parsekit.Matcher { // - A space between letters and digits is optional. // - It is good form to write the letters in upper case. // - It is good form to use a single space between digits and letters. - digitNotZero := c.Except(c.Rune('0'), a.Digit) + digitNotZero := c.Except(a.Rune('0'), a.Digit) pcDigits := c.Seq(digitNotZero, c.Rep(3, a.Digit)) pcLetter := c.Any(a.ASCIILower, a.ASCIIUpper) pcLetters := m.ToUpper(c.Seq(pcLetter, pcLetter)) diff --git a/example_helloparser1_test.go b/example_helloManyStateParser_test.go similarity index 70% rename from example_helloparser1_test.go rename to example_helloManyStateParser_test.go index b2cc184..29a2543 100644 --- a/example_helloparser1_test.go +++ b/example_helloManyStateParser_test.go @@ -3,15 +3,15 @@ // // This implementation uses a state-based Parser for it, and it does not // implement any custom parser/combinator TokenHandler functions. Note that -// things are much easier to implement using custom TokenHandlers (see the other -// HelloWorldUsingMatcher example for this). Doing this fully parser-based +// things are much easier to implement using custom TokenHandlers (see the +// helloParserCombinator example for this). Doing this fully parser-based // implementation is mainly for your learning pleasure. // -// One big difference between the Matcher-based example and this one, is that -// this parser reports errors much more fine-grained. This might or might not be -// useful for your specific use case. If you need error reporting like this, -// then also take a look at the HelloWorldUsingParser2 example, which does the -// same thing as this version, only more concise. +// One big difference between the parser/combinator-based example and this one, +// is that this parser reports errors much more fine-grained. This might or +// might not be useful for your specific use case. If you need error reporting +// like this, then also take a look at the helloSingleState example, which does +// the same thing as this version, only more concise. package parsekit_test @@ -56,11 +56,11 @@ func Example_helloWorldUsingParser1() { // [6] Input: "hello" Error: unexpected end of file (expected comma) // [7] Input: "hello," Error: unexpected end of file (expected name) // [8] Input: "hello , " Error: unexpected end of file (expected name) - // [9] Input: "hello , Droopy" Error: unexpected end of file (expected name) + // [9] Input: "hello , Droopy" Error: unexpected end of file (expected exclamation) // [10] Input: "hello , Droopy!" Output: Droopy // [11] Input: "hello , \t \t Droopy \t !" Output: Droopy // [12] Input: "Oh no!" Error: unexpected character 'O' (expected hello) - // [13] Input: "hello,!" Error: The name cannot be empty + // [13] Input: "hello,!" Error: unexpected character '!' 
(expected name) } // --------------------------------------------------------------------------- @@ -78,9 +78,9 @@ func (h *helloparser1) Parse(input string) (string, *parsekit.Error) { } func (h *helloparser1) start(p *parsekit.ParseAPI) { - c := parsekit.C + a := parsekit.A p.Expects("hello") - if p.On(c.StrNoCase("hello")).Skip() { + if p.On(a.StrNoCase("hello")).Skip() { p.Handle(h.comma) } } @@ -88,20 +88,42 @@ func (h *helloparser1) start(p *parsekit.ParseAPI) { func (h *helloparser1) comma(p *parsekit.ParseAPI) { a := parsekit.A p.Expects("comma") - p.On(a.Whitespace).Skip() - if p.On(a.Comma).Skip() { + switch { + case p.On(a.Whitespace).Skip(): + p.Handle(h.comma) + case p.On(a.Comma).Skip(): + p.Handle(h.startName) + } +} + +func (h *helloparser1) startName(p *parsekit.ParseAPI) { + c, a := parsekit.C, parsekit.A + p.Expects("name") + switch { + case p.On(a.Whitespace).Skip(): + p.Handle(h.startName) + case p.On(c.Not(a.Excl)).Stay(): p.Handle(h.name) } } func (h *helloparser1) name(p *parsekit.ParseAPI) { - a := parsekit.A + c, a := parsekit.C, parsekit.A p.Expects("name") switch { - case p.On(a.Excl).Skip(): - p.Handle(h.end) - case p.On(a.AnyRune).Accept(): + case p.On(c.Not(a.Excl)).Accept(): + h.greetee += p.Result().String() p.Handle(h.name) + default: + p.Handle(h.exclamation) + } +} + +func (h *helloparser1) exclamation(p *parsekit.ParseAPI) { + a := parsekit.A + p.Expects("exclamation") + if p.On(a.Excl).Accept() { + p.Handle(h.end) } } @@ -115,7 +137,7 @@ func (h *helloparser1) end(p *parsekit.ParseAPI) { return } - h.greetee = strings.TrimSpace(p.BufLiteral()) + h.greetee = strings.TrimSpace(h.greetee) if h.greetee == "" { p.Error("The name cannot be empty") } else { diff --git a/example_hellomatcher_test.go b/example_helloParserCombinator_test.go similarity index 91% rename from example_hellomatcher_test.go rename to example_helloParserCombinator_test.go index ee3f294..45033b1 100644 --- a/example_hellomatcher_test.go +++ b/example_helloParserCombinator_test.go @@ -4,7 +4,7 @@ // The implementation uses only parser/combinator TokenHandler functions and does // not implement a full-fledged state-based Parser for it. If you want to see the // same kind of functionality, implementated using a Parser, take a look at the -// HelloWorldUsingParser examples. +// other hello examples. package parsekit_test import ( @@ -37,9 +37,9 @@ func Example_helloWorldUsingMatcher() { // [1] Input: "HELLO ,Johnny!" Output: Johnny // [2] Input: "hello , Bob123!" Output: Bob123 // [3] Input: "hello Pizza!" Output: Pizza - // [4] Input: "Oh no!" Error: unexpected character 'O' (expected a friendly greeting) at line 1, column 1 - // [5] Input: "Hello, world" Error: unexpected character 'H' (expected a friendly greeting) at line 1, column 1 - // [6] Input: "Hello,!" Error: unexpected character 'H' (expected a friendly greeting) at line 1, column 1 + // [4] Input: "Oh no!" Error: unexpected character 'O' (expected a friendly greeting) at start of file + // [5] Input: "Hello, world" Error: unexpected character 'H' (expected a friendly greeting) at start of file + // [6] Input: "Hello,!" Error: unexpected character 'H' (expected a friendly greeting) at start of file } // --------------------------------------------------------------------------- @@ -53,7 +53,7 @@ func createHelloMatcher() *parsekit.Matcher { // Using the parser/combinator support of parsekit, we create a TokenHandler function // that does all the work. 
The 'greeting' TokenHandler matches the whole input and // drops all but the name from it. - hello := c.StrNoCase("hello") + hello := a.StrNoCase("hello") comma := c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace)) separator := c.Any(comma, a.Whitespace) name := c.OneOrMore(c.Not(a.Excl)) diff --git a/example_helloparser2_test.go b/example_helloSingleStateParser_test.go similarity index 80% rename from example_helloparser2_test.go rename to example_helloSingleStateParser_test.go index c106835..c5d3208 100644 --- a/example_helloparser2_test.go +++ b/example_helloSingleStateParser_test.go @@ -1,15 +1,15 @@ -// This is the same as the example HelloWorldUsingParser1, except that in this +// This is the same as the other hello examples, except that in this // implementation the state machine is implemented using a combination of some // TokenHandlers and only a single state, in which multiple ParseAPI.On() calls // are combined to do all the work in one go. // // Note that things are much easier to implement using custom TokenHandlers (see -// the other HelloWorldUsingMatcher example for this). Doing this implementation +// the other helloParserCombinator example for this). Doing this implementation // is mainly for your learning pleasure. // -// One big difference between the Matcher-based example and this one, is that -// this parser reports errors much more fine-grained. This might or might not be -// useful for your specific use case.:0 +// One big difference between the parser/combinator-based example and this one, +// is that this parser reports errors much more fine-grained. This might or +// might not be useful for your specific use case. package parsekit_test @@ -80,21 +80,29 @@ func (h *helloparser2) Parse(input string) (string, *parsekit.Error) { func (h *helloparser2) start(p *parsekit.ParseAPI) { c, a, m := parsekit.C, parsekit.A, parsekit.M - if !p.On(c.StrNoCase("hello")).Skip() { + if !p.On(a.StrNoCase("hello")).Skip() { p.Error("the greeting is not being friendly") - } else if !p.On(c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace))).Skip() { + return + } + if !p.On(c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace))).Skip() { p.Error("the greeting is not properly separated") - } else if !p.On(m.TrimSpace(c.OneOrMore(c.Except(a.Excl, a.AnyRune)))).Accept() { + return + } + if p.On(m.TrimSpace(c.OneOrMore(c.Except(a.Excl, a.AnyRune)))).Accept() { + h.greetee = p.Result().String() + if h.greetee == "" { + p.Error("the name cannot be empty") + return + } + } else { p.Error("the greeting is targeted at thin air") - } else if !p.On(a.Excl).Skip() { + return + } + if !p.On(a.Excl).Skip() { p.Error("the greeting is not loud enough") } else if !p.On(a.EndOfFile).Stay() { p.Error("too much stuff going on after the closing '!'") } else { - h.greetee = p.BufLiteral() - if h.greetee == "" { - p.Error("the name cannot be empty") - } p.Stop() } } diff --git a/examples_state_test.go b/examples_state_test.go index 68dfef5..c8e9282 100644 --- a/examples_state_test.go +++ b/examples_state_test.go @@ -1,7 +1,7 @@ // In this example, we show that any type can be extended into a parser, // filling that type with data from the ParseHandler methods. // -// Here, we create a custom type 'letterCollection', which is an alias +// Here, we create a custom type 'Chunks', which is an alias // for []string. We add a ParseHandler method directly to that type // and let the parsing code fill the slice with strings during parsing. 
@@ -21,8 +21,7 @@ func (l *Chunks) AddChopped(s string, chunkSize int) *parsekit.Error { parser := parsekit.NewParser(func(p *parsekit.ParseAPI) { for p.On(chunkOfRunes).Accept() { - *l = append(*l, p.BufLiteral()) - p.BufClear() + *l = append(*l, p.Result().String()) } }) return parser.Execute(s) @@ -30,10 +29,10 @@ func (l *Chunks) AddChopped(s string, chunkSize int) *parsekit.Error { func Example_usingSliceAsParserState() { chunks := &Chunks{} - chunks.AddChopped("This string will", 4) - chunks.AddChopped("be cut to bits!!!!!!", 8) + chunks.AddChopped("123412341234xxx", 4) + chunks.AddChopped("1234567812345678xxxxx", 8) fmt.Printf("Matches = %q", *chunks) // Output: - // Matches = ["This" " str" "ing " "will" "be cut t" "o bits!!" "!!!!"] + // Matches = ["1234" "1234" "1234" "xxx" "12345678" "12345678" "xxxxx"] } diff --git a/examples_test.go b/examples_test.go index 7d1e607..c765bbb 100644 --- a/examples_test.go +++ b/examples_test.go @@ -48,23 +48,47 @@ func ExampleError_Full() { // it broke down at line 10, column 42 } -func ExampleMatchAnyRune() { +func ExampleMatchAnyRune_usingAcceptedRunes() { // Easy access to the parsekit definitions. a := parsekit.A matches := []string{} - stateHandler := func(p *parsekit.ParseAPI) { + parser := parsekit.NewParser(func(p *parsekit.ParseAPI) { for p.On(a.AnyRune).Accept() { - matches = append(matches, p.BufLiteral()) - p.BufClear() + matches = append(matches, p.Result().String()) } p.ExpectEndOfFile() - } - parser := parsekit.NewParser(stateHandler) + }) err := parser.Execute("¡Any will dö!") fmt.Printf("Matches = %q, Error = %s\n", matches, err) // Output: // Matches = ["¡" "A" "n" "y" " " "w" "i" "l" "l" " " "d" "ö" "!"], Error = } + +func ExampleMatchAnyRune_usingTokens() { + // Easy access to the parsekit definitions. + c, a, tok := parsekit.C, parsekit.A, parsekit.T + + var tokens []*parsekit.Token + var accepted string + + parser := parsekit.NewParser(func(p *parsekit.ParseAPI) { + if p.On(c.OneOrMore(tok.Rune("a rune", a.AnyRune))).Accept() { + tokens = p.Result().Tokens() + accepted = p.Result().String() + } + p.ExpectEndOfFile() + }) + parser.Execute("¡Any will dö!") + + fmt.Printf("Runes accepted: %q\n", accepted) + fmt.Printf("Token values: ") + for _, t := range tokens { + fmt.Printf("%c ", t.Value) + } + // Output: + // Runes accepted: "¡Any will dö!" + // Token values: ¡ A n y w i l l d ö ! +} diff --git a/parseapi.go b/parseapi.go new file mode 100644 index 0000000..f2bbc95 --- /dev/null +++ b/parseapi.go @@ -0,0 +1,68 @@ +package parsekit + +import ( + "fmt" + "runtime" + "strings" +) + +// ParseAPI holds the internal state of a parse run and provides an API to +// ParseHandler methods to communicate with the parser. +type ParseAPI struct { + tokenAPI *TokenAPI // the input reader + loopCheck map[string]bool // used for parser loop detection + expecting string // a description of what the current state expects to find (see Expects()) + result *Result // TokenHandler result, as received from On(...).Accept() + err *Error // error during parsing, retrieved by Error(), further ParseAPI calls are ignored + stopped bool // a boolean set to true by Stop(), further ParseAPI calls are ignored +} + +// panicWhenStoppedOrInError will panic when the parser has produced an error +// or when it has been stopped. It is used from the ParseAPI methods, to +// prevent further calls to the ParseAPI on these occasions. +// +// Basically, this guard ensures proper coding of parsers, making sure +// that clean routes are followed. 
You can consider this check a runtime +// unit test. +func (p *ParseAPI) panicWhenStoppedOrInError() { + if !p.isStoppedOrInError() { + return + } + + called, _ := p.getCaller(1) + parts := strings.Split(called, ".") + calledShort := parts[len(parts)-1] + caller, filepos := p.getCaller(2) + + after := "Error()" + if p.stopped { + after = "Stop()" + } + + panic(fmt.Sprintf("Illegal call to ParseAPI.%s() from %s at %s: no calls allowed after ParseAPI.%s", calledShort, caller, filepos, after)) +} + +func (p *ParseAPI) isStoppedOrInError() bool { + return p.stopped || p.err != nil +} + +func (p *ParseAPI) initLoopCheck() { + p.loopCheck = map[string]bool{} +} + +func (p *ParseAPI) checkForLoops() { + caller, filepos := p.getCaller(2) + if _, ok := p.loopCheck[filepos]; ok { + panic(fmt.Sprintf("Loop detected in parser in %s at %s", caller, filepos)) + } + p.loopCheck[filepos] = true +} + +// TODO delete this one +func (p *ParseAPI) getCaller(depth int) (string, string) { + // No error handling, because we call this method ourselves with safe depth values. + pc, file, line, _ := runtime.Caller(depth + 1) + filepos := fmt.Sprintf("%s:%d", file, line) + caller := runtime.FuncForPC(pc) + return caller.Name(), filepos +} diff --git a/parsehandler.go b/parsehandler.go index d85709d..e035439 100644 --- a/parsehandler.go +++ b/parsehandler.go @@ -1,12 +1,5 @@ package parsekit -import ( - "fmt" - "runtime" - "strings" - "unicode/utf8" -) - // ParseHandler defines the type of function that must be implemented to handle // a parsing state in a Parser state machine. // @@ -14,103 +7,3 @@ import ( // all the internal state for the parsing state machine and provides the // interface that the ParseHandler uses to interact with the parser. type ParseHandler func(*ParseAPI) - -// ParseAPI holds the internal state of a parse run and provides an API to -// ParseHandler methods to communicate with the parser. -type ParseAPI struct { - input string // the input that is being scanned by the parser - inputPos int // current byte cursor position in the input - loopCheck map[string]bool // used for parser loop detection - cursorLine int // current rune cursor row number in the input - cursorColumn int // current rune cursor column position in the input - len int // the total length of the input in bytes - newline bool // keep track of when we have scanned a newline - expecting string // a description of what the current state expects to find (see P.Expects()) - buffer stringBuffer // an efficient buffer, used to build string values (see P.Accept()) - err *Error // error during parsing, retrieved by Error(), further ParseAPI calls are ignored - stopped bool // a boolean set to true by Stop(), further ParseAPI calls are ignored - - LastMatch string // a string representation of the last matched input data -} - -// panicWhenStoppedOrInError will panic when the parser has produced an error -// or when it has been stopped. It is used from the ParseAPI methods, to -// prevent further calls to the ParseAPI on these occasions. -// -// Basically, this guard ensures proper coding of parsers, making sure -// that clean routes are followed. You can consider this check a runtime -// unit test. 
-func (p *ParseAPI) panicWhenStoppedOrInError() { - if !p.isStoppedOrInError() { - return - } - - called, _ := p.getCaller(1) - parts := strings.Split(called, ".") - calledShort := parts[len(parts)-1] - caller, filepos := p.getCaller(2) - - after := "Error()" - if p.stopped { - after = "Stop()" - } - - panic(fmt.Sprintf("Illegal call to ParseAPI.%s() from %s at %s: no calls allowed after ParseAPI.%s", calledShort, caller, filepos, after)) -} - -func (p *ParseAPI) isStoppedOrInError() bool { - return p.stopped || p.err != nil -} - -func (p *ParseAPI) checkForLoops() { - caller, filepos := p.getCaller(2) - if _, ok := p.loopCheck[filepos]; ok { - panic(fmt.Sprintf("Loop detected in parser in %s at %s", caller, filepos)) - } - p.loopCheck[filepos] = true -} - -// peek returns but does not advance the cursor to the next rune in the input. -// Returns the rune, its width in bytes and a boolean. -// -// The boolean will be false in case no upcoming rune can be peeked -// (end of data or invalid UTF8 character). In this case, the returned rune -// will be one of eofRune or invalidRune. -func (p *ParseAPI) peek(byteOffset int) (rune, int, bool) { - r, w := utf8.DecodeRuneInString(p.input[p.inputPos+byteOffset:]) - return handleRuneError(r, w) -} - -// eofRune is a special rune that is used to indicate an end of file when -// reading a character from the input. -const eofRune rune = -1 - -// invalidRune is a special rune that is used to indicate an invalid UTF8 -// rune on the input. -const invalidRune rune = utf8.RuneError - -// handleRuneError is used to create specific rune value in case of errors. -// When an error occurs, then utf8.RuneError will be in the rune. -// This can however indicate one of two situations: -// 1) w == 0: end of file is reached -// 2) w == 1: invalid UTF character on input -// This function lets these two cases return respectively the -// package's own eofRune or invalidRune, to make it easy for calling code -// to distinct between these two cases. -func handleRuneError(r rune, w int) (rune, int, bool) { - if r == utf8.RuneError { - if w == 0 { - return eofRune, 0, false - } - return invalidRune, w, false - } - return r, w, true -} - -func (p *ParseAPI) getCaller(depth int) (string, string) { - // No error handling, because we call this method ourselves with safe depth values. - pc, file, line, _ := runtime.Caller(depth + 1) - filepos := fmt.Sprintf("%s:%d", file, line) - caller := runtime.FuncForPC(pc) - return caller.Name(), filepos -} diff --git a/parsehandler_error.go b/parsehandler_error.go index 5a5ac2e..5fdf9ab 100644 --- a/parsehandler_error.go +++ b/parsehandler_error.go @@ -20,7 +20,11 @@ func (err *Error) Error() string { // Full returns the current error message, including information about // the position in the input where the error occurred. func (err *Error) Full() string { - return fmt.Sprintf("%s at line %d, column %d", err, err.Line, err.Column) + if err.Line == 0 { + return fmt.Sprintf("%s at start of file", err) + } else { + return fmt.Sprintf("%s at line %d, column %d", err, err.Line, err.Column) + } } // Error sets the error message in the parser API. This error message @@ -29,5 +33,5 @@ func (p *ParseAPI) Error(format string, args ...interface{}) { // No call to p.panicWhenStoppedOrInError(), to allow a parser to // set a different error message when needed. message := fmt.Sprintf(format, args...) 
- p.err = &Error{message, p.cursorLine, p.cursorColumn} + p.err = &Error{message, p.tokenAPI.cursor.Line, p.tokenAPI.cursor.Column} } diff --git a/parsehandler_on.go b/parsehandler_on.go index 2bc22d5..1773cfb 100644 --- a/parsehandler_on.go +++ b/parsehandler_on.go @@ -1,5 +1,7 @@ package parsekit +import "fmt" + // On checks if the input at the current cursor position matches the provided // TokenHandler. On must be chained with another method that tells the parser // what action to perform when a match was found: @@ -17,7 +19,7 @@ package parsekit // The chain as a whole returns a boolean that indicates whether or not at match // was found. When no match was found, false is returned and Skip() and Accept() // will have no effect. Because of this, typical use of an On() chain is as -// expression for a conditional expression (if, switch/case, for). E.g.: +// expression for a conditional statement (if, switch/case, for). E.g.: // // // Skip multiple exclamation marks. // for p.On(parsekit.A.Excl).Skip() { } @@ -32,70 +34,71 @@ package parsekit // p.RouteTo(stateHandlerC) // } // -// // When there's a "hi" on input, emit a parser item for it. +// // When there's a "hi" on input, then say hello. // if p.On(parsekit.C.Str("hi")).Accept() { -// p.Emit(SomeItemType, p.BufLiteral()) +// fmt.Println("Hello!") // } func (p *ParseAPI) On(tokenHandler TokenHandler) *ParseAPIOnAction { p.panicWhenStoppedOrInError() p.checkForLoops() - - // Perform the matching operation. - m := &TokenAPI{p: p} if tokenHandler == nil { panic("ParseHandler bug: tokenHandler argument for On() is nil") } - ok := tokenHandler(m) - // Keep track of the last match, to allow parser implementations - // to access it in an easy way. Typical use would be something like: - // - // if p.On(somethingBad).End() { - // p.Errorf("This was bad: %s", p.LastMatch) - // } - p.LastMatch = string(m.input) + p.result = nil + p.tokenAPI.result = NewResult() + fork := p.tokenAPI.Fork() + ok := tokenHandler(fork) return &ParseAPIOnAction{ - p: p, + parseAPI: p, + tokenAPI: fork, ok: ok, - input: m.input, - output: m.output, - inputPos: p.inputPos + m.inputOffset, } } // ParseAPIOnAction is a struct that is used for building the On()-method chain. // The On() method will return an initialized struct of this type. type ParseAPIOnAction struct { - p *ParseAPI + parseAPI *ParseAPI + tokenAPI *TokenAPI ok bool - input []rune - output []rune - inputPos int } // Accept tells the parser to move the cursor past a match that was found, -// and to store the input that matched in the parser's string buffer. -// When no match was found, then no action is taken. +// and to make the TokenHandler results available in the ParseAPI through +// the Result() method. // // Returns true in case a match was found. // When no match was found, then no action is taken and false is returned. func (a *ParseAPIOnAction) Accept() bool { if a.ok { - a.p.buffer.writeString(string(a.output)) - a.advanceCursor() + a.tokenAPI.Merge() + a.parseAPI.result = a.tokenAPI.root.result + a.tokenAPI.root.result = NewResult() + a.tokenAPI.root.detachChilds() + if a.tokenAPI.offset > 0 { + a.tokenAPI.root.FlushReaderBuffer(a.tokenAPI.offset) + a.parseAPI.initLoopCheck() + } } return a.ok } // Skip tells the parser to move the cursor past a match that was found, -// without storing the actual match in the parser's string buffer. +// without making the results available through the ParseAPI. // // Returns true in case a match was found. 
// When no match was found, then no action is taken and false is returned. func (a *ParseAPIOnAction) Skip() bool { if a.ok { - a.advanceCursor() + a.tokenAPI.root.cursor = a.tokenAPI.cursor + a.tokenAPI.root.result = NewResult() + a.tokenAPI.root.detachChilds() + if a.tokenAPI.offset > 0 { + a.tokenAPI.root.FlushReaderBuffer(a.tokenAPI.offset) + a.parseAPI.initLoopCheck() + } } return a.ok } @@ -103,25 +106,23 @@ func (a *ParseAPIOnAction) Skip() bool { // Stay tells the parser to not move the cursor after finding a match. // Returns true in case a match was found, false otherwise. func (a *ParseAPIOnAction) Stay() bool { + if a.ok { + a.tokenAPI.root.result = NewResult() + a.tokenAPI.root.detachChilds() + } return a.ok } -// advanceCursor advances the input position in the input data. -// While doing so, it keeps tracks of newlines that are encountered, so we -// can report on line + column positions on error. -func (a *ParseAPIOnAction) advanceCursor() { - if a.p.inputPos == a.inputPos { - return - } - a.p.loopCheck = map[string]bool{} - a.p.inputPos = a.inputPos - for _, r := range a.input { - if a.p.newline { - a.p.cursorLine++ - a.p.cursorColumn = 1 - } else { - a.p.cursorColumn++ - } - a.p.newline = r == '\n' +// Result returns a Result struct, containing results as produced by the +// last ParseAPI.On() call. +func (p *ParseAPI) Result() *Result { + result := p.result + if p.result == nil { + caller, filepos := getCaller(1) + panic(fmt.Sprintf( + "parsekit.ParseAPI.Result(): Result() called without calling "+ + "ParseAPI.Accept() on beforehand to make the result available "+ + "from %s at %s", caller, filepos)) } + return result } diff --git a/parsehandler_routing.go b/parsehandler_routing.go index 4c26c21..862b8f3 100644 --- a/parsehandler_routing.go +++ b/parsehandler_routing.go @@ -1,6 +1,9 @@ package parsekit -import "fmt" +import ( + "fmt" + "io" +) // Handle is used to execute other ParseHandler functions from within your // ParseHandler function. @@ -77,14 +80,14 @@ func (p *ParseAPI) ExpectEndOfFile() { // expectation is included in the error message. func (p *ParseAPI) UnexpectedInput() { p.panicWhenStoppedOrInError() - r, _, ok := p.peek(0) + r, err := p.tokenAPI.NextRune() switch { - case ok: + case err == nil: p.Error("unexpected character %q%s", r, fmtExpects(p)) - case r == eofRune: + case err == io.EOF: p.Error("unexpected end of file%s", fmtExpects(p)) - case r == invalidRune: - p.Error("invalid UTF8 character in input%s", fmtExpects(p)) + default: + p.Error("unexpected error '%s'%s", err, fmtExpects(p)) } } diff --git a/parsehandler_stringbuf.go b/parsehandler_stringbuf.go deleted file mode 100644 index 50f928c..0000000 --- a/parsehandler_stringbuf.go +++ /dev/null @@ -1,47 +0,0 @@ -package parsekit - -// BufLiteral retrieves the contents of the parser's string buffer (all the -// runes that were added to it using ParseAPI.Accept()) as a literal string. -// -// Literal means that if the input had for example the subsequent runes '\' and -// 'n' in it, then the literal string would have a backslash and an 'n' it in, -// not a linefeed (ASCII char 10). -// -// Retrieving the buffer contents will not affect the buffer itself. New runes -// can still be added to it. Only when calling P.BufClear(), the buffer will be -// cleared. 
-func (p *ParseAPI) BufLiteral() string { - return p.buffer.asLiteralString() -} - -// BufInterpreted retrieves the contents of the parser's string buffer (all the -// runes that were added to it using ParseAPI.Accept()) as an interpreted -// string. -// -// Interpreted means that the contents are treated as a Go double quoted -// interpreted string (handling escape codes like \n, \t, \uXXXX, etc.). if the -// input had for example the subsequent runes '\' and 'n' in it, then the -// interpreted string would have an actual linefeed (ASCII char 10) in it. -// -// This method returns a boolean value, indicating whether or not the string -// interpretation was successful. On invalid string data, an error will -// automatically be emitted and the boolean return value will be false. -// -// Retrieving the buffer contents will not affect the buffer itself. New runes -// can still be added to it. Only when calling P.BufClear(), the buffer will be -// cleared. -func (p *ParseAPI) BufInterpreted() (string, bool) { - s, err := p.buffer.asInterpretedString() - if err != nil { - p.Error( - "invalid string: %s (%s, forgot to escape a double quote or backslash maybe?)", - p.buffer.asLiteralString(), err) - return "", false - } - return s, true -} - -// BufClear clears the contents of the parser's string buffer. -func (p *ParseAPI) BufClear() { - p.buffer.reset() -} diff --git a/parsehandler_test.go b/parsehandler_test.go index 2f8a1c7..4ec35b8 100644 --- a/parsehandler_test.go +++ b/parsehandler_test.go @@ -43,35 +43,6 @@ func TestGivenParserWithError_WhenCallingHandle_ParsekitPanics(t *testing.T) { `.*/parsehandler_test\.go:\d+: no calls allowed after ParseAPI\.Error\(\)`}) } -func TestGivenFilledStringBuffer_BufInterpreted_ReturnsInterpretedString(t *testing.T) { - var interpreted string - var literal string - p := parsekit.NewParser(func(p *parsekit.ParseAPI) { - p.On(parsekit.C.OneOrMore(parsekit.A.AnyRune)).Accept() - literal = p.BufLiteral() - interpreted, _ = p.BufInterpreted() - }) - p.Execute(`This\tis\ta\tcool\tstring`) - - if literal != `This\tis\ta\tcool\tstring` { - t.Fatal("literal string is incorrect") - } - if interpreted != "This\tis\ta\tcool\tstring" { - t.Fatal("interpreted string is incorrect") - } -} - -func TestGivenInputInvalidForStringInterpretation_BufInterpreted_SetsError(t *testing.T) { - p := parsekit.NewParser(func(p *parsekit.ParseAPI) { - p.On(parsekit.C.OneOrMore(parsekit.A.AnyRune)).Accept() - p.BufInterpreted() - }) - err := p.Execute(`This \is wrongly escaped`) - if err.Error() != `invalid string: This \is wrongly escaped (invalid syntax, forgot to escape a double quote or backslash maybe?)` { - t.Fatalf("Got unexpected error: %s", err.Error()) - } -} - type parserWithLoop struct { loopCounter int } @@ -119,7 +90,6 @@ func TestGivenLoopingParserDefinition_ParserPanics(t *testing.T) { func TestGivenLoopingParserDefinition2_ParserPanics(t *testing.T) { parser := parsekit.NewParser(func(p *parsekit.ParseAPI) { for p.On(c.Max(5, a.AnyRune)).Accept() { - p.BufClear() } p.Stop() }) diff --git a/parsekit.go b/parsekit.go index 3dcf977..fe258b5 100644 --- a/parsekit.go +++ b/parsekit.go @@ -1,5 +1,9 @@ package parsekit +import ( + "strings" +) + // Parser is the top-level struct that holds the configuration for a parser. // The Parser can be instantiated using the parsekit.NewParser() method. type Parser struct { @@ -22,11 +26,8 @@ func NewParser(startHandler ParseHandler) *Parser { // When an error occurs during parsing, then this error is returned. Nil otherwise. 
func (p *Parser) Execute(input string) *Error { api := &ParseAPI{ - input: input, - len: len(input), - cursorLine: 1, - cursorColumn: 1, - loopCheck: map[string]bool{}, + tokenAPI: NewTokenAPI(strings.NewReader(input)), + loopCheck: map[string]bool{}, } api.Handle(p.startHandler) if !api.stopped && api.err == nil { @@ -39,12 +40,10 @@ func (p *Parser) Execute(input string) *Error { // a parser that is based solely on a TokenHandler function. // The Matcher can be instantiated using the parsekit.NewMatcher() // method. -// -// To match input data against the wrapped Matcher function, use the method -// Matcher.Parse(). +// TODO Rename to Tokenizer type Matcher struct { parser *Parser - match string + result *Result } // NewMatcher instantiates a new Matcher. @@ -55,11 +54,12 @@ type Matcher struct { // // The 'expects' parameter is used for creating an error message in case parsed // input does not match the TokenHandler. +// TODO Rename to NewTokenizer, and make matcher Tokeninzer, also see if we can use a Reader straight away, no ParseAPI. func NewMatcher(tokenHandler TokenHandler, expects string) *Matcher { matcher := &Matcher{} matcher.parser = NewParser(func(p *ParseAPI) { if p.On(tokenHandler).Accept() { - matcher.match = p.BufLiteral() + matcher.result = p.Result() p.Stop() } else { p.Expects(expects) @@ -70,9 +70,9 @@ func NewMatcher(tokenHandler TokenHandler, expects string) *Matcher { } // Execute feeds the input to the wrapped TokenHandler function. -// It returns the matched input string and an error. When an error -// occurred during parsing, the error will be set, nil otherwise. -func (m *Matcher) Execute(input string) (string, *Error) { +// It returns the TokenHandler's results. When an error occurred during parsing, +// the error will be set, nil otherwise. +func (m *Matcher) Execute(input string) (*Result, *Error) { err := m.parser.Execute(input) - return m.match, err + return m.result, err } diff --git a/parsekit_test.go b/parsekit_test.go index 5f4d210..3440e50 100644 --- a/parsekit_test.go +++ b/parsekit_test.go @@ -11,7 +11,7 @@ import ( ) // Easy access to the parsekit definitions. 
-var c, a, m = parsekit.C, parsekit.A, parsekit.M +var c, a, m, tok = parsekit.C, parsekit.A, parsekit.M, parsekit.T type TokenHandlerTest struct { Input string @@ -27,11 +27,11 @@ func RunTokenHandlerTests(t *testing.T, testSet []TokenHandlerTest) { } func RunTokenHandlerTest(t *testing.T, test TokenHandlerTest) { - output, err := parsekit.NewMatcher(test.TokenHandler, "a match").Execute(test.Input) + result, err := parsekit.NewMatcher(test.TokenHandler, "a match").Execute(test.Input) if test.MustMatch { if err != nil { t.Errorf("Test %q failed with error: %s", test.Input, err) - } else if output != test.Expected { + } else if output := result.String(); output != test.Expected { t.Errorf("Test %q failed: not expected output:\nexpected: %q\nactual: %q\n", test.Input, test.Expected, output) } } else { @@ -41,6 +41,41 @@ func RunTokenHandlerTest(t *testing.T, test TokenHandlerTest) { } } +type TokenMakerTest struct { + Input string + TokenHandler parsekit.TokenHandler + Expected []parsekit.Token +} + +func RunTokenMakerTest(t *testing.T, test TokenMakerTest) { + result, err := parsekit.NewMatcher(test.TokenHandler, "a match").Execute(test.Input) + if err != nil { + t.Errorf("Test %q failed with error: %s", test.Input, err) + } else { + if len(result.Tokens()) != len(test.Expected) { + t.Errorf("Unexpected number of tokens in output:\nexpected: %d\nactual: %d", len(test.Expected), len(result.Tokens())) + } + for i, expected := range test.Expected { + actual := result.Token(i) + if expected.Type != actual.Type { + t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type) + } + if string(expected.Runes) != string(actual.Runes) { + t.Errorf("Unexpected Runes in result.Tokens[%d]:\nexpected: %q\nactual: %q", i, expected.Runes, actual.Runes) + } + if expected.Value != actual.Value { + t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value) + } + } + } +} + +func RunTokenMakerTests(t *testing.T, testSet []TokenMakerTest) { + for _, test := range testSet { + RunTokenMakerTest(t, test) + } +} + type PanicTest struct { function func() expected string diff --git a/reader.go b/reader.go new file mode 100644 index 0000000..2d7fb56 --- /dev/null +++ b/reader.go @@ -0,0 +1,108 @@ +package parsekit + +import ( + "bufio" + "fmt" + "io" + "unicode/utf8" +) + +// Reader wraps around an io.Reader and provides buffering to allows us to read +// the same runes over and over again. This is useful for implementing a parser +// that must be able to do lookahead on the input, returning to the original +// input position after finishing that lookahead). +// +// To minimze memory use, it is also possible to flush the buffer when there is +// no more need to go back to previously read runes. +type Reader struct { + bufio *bufio.Reader // Used for ReadRune() + buffer []rune // Input buffer, holding runes that were read from input + bufferOffset int // The offset of the buffer, relative to the start of the input + bufferLen int // Input size, the number of runes in the buffer +} + +// NewReader initializes a new Reader struct, wrapped around the provided io.Reader. +func NewReader(r io.Reader) *Reader { + return &Reader{ + bufio: bufio.NewReader(r), + buffer: []rune{}, + } +} + +// RuneAt reads the rune at the provided rune offset. +// +// This offset is relative to the current starting position of the buffer in +// the reader. 
When starting reading, offset 0 will point at the start of the
+// input. After flushing, offset 0 will point at the input up to where
+// the flush was done.
+//
+// The error return value will be nil when reading was successful.
+// When an invalid rune is encountered on the input, the error will be nil,
+// but the rune will be utf8.RuneError.
+//
+// When reading failed, the rune will be utf8.RuneError. One special read
+// failure is actually a normal situation: end of file reached. In that case,
+// the returned error will be io.EOF.
+func (r *Reader) RuneAt(offset int) (rune, error) {
+	// Rune at provided offset is not yet available in the input buffer.
+	// Read runes until we have enough runes to satisfy the offset.
+	for r.bufferLen <= offset {
+		readRune, _, err := r.bufio.ReadRune()
+
+		// Handle errors.
+		if err != nil {
+			return utf8.RuneError, err
+		}
+
+		// Skip BOM.
+		if readRune == '\uFEFF' && r.bufferOffset == 0 {
+			r.bufferOffset++
+			continue
+		}
+
+		r.buffer = append(r.buffer, readRune)
+		r.bufferLen++
+	}
+	return r.buffer[offset], nil
+}
+
+// RunesAt reads a slice of runes of length 'len', starting from offset 'offset'.
+//
+// This offset is relative to the current starting position of the buffer in
+// the reader. When starting reading, offset 0 will point at the start of the
+// input. After flushing, offset 0 will point at the input up to where
+// the flush was done.
+//
+// When an error is encountered during reading (EOF or other error), then the
+// error return value will be set. In case of an error, any runes that could be
+// successfully read are returned along with the error.
+// TODO Do I actually use this interface?
+func (r *Reader) RunesAt(start int, len int) ([]rune, error) {
+	if len == 0 {
+		return r.buffer[0:0], nil
+	}
+	end := start + len
+	_, err := r.RuneAt(end)
+	if err != nil {
+		if end > r.bufferLen {
+			end = r.bufferLen
+		}
+		return r.buffer[start:end], err
+	}
+	return r.buffer[start:end], nil
+}
+
+// Flush deletes the provided number of runes from the start of the
+// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
+// will point to the rune that comes after the flushed runes.
+// So what this basically does is turn the Reader into a sliding window.
+func (r *Reader) Flush(numberOfRunes int) {
+	if numberOfRunes > r.bufferLen {
+		panic(fmt.Sprintf(
+			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
+				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
+	}
+	r.bufferOffset += numberOfRunes
+	r.bufferLen -= numberOfRunes
+	r.buffer = r.buffer[numberOfRunes:]
+}
diff --git a/reader_test.go b/reader_test.go
new file mode 100644
index 0000000..672fbca
--- /dev/null
+++ b/reader_test.go
@@ -0,0 +1,134 @@
+package parsekit
+
+import (
+	"fmt"
+	"io"
+	"strings"
+	"testing"
+
+	"git.makaay.nl/mauricem/go-parsekit/assert"
+)
+
+func ExampleNewReader() {
+	in := strings.NewReader("Hello, world!")
+	r := NewReader(in)
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	fmt.Printf("%c", at(0))
+	fmt.Printf("%c", at(12))
+
+	// Output:
+	// H!
+}
+
+func ExampleReader_RuneAt() {
+	in := strings.NewReader("Hello, world!")
+	r := NewReader(in)
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	// It is possible to go back and forth while reading the input.
+ fmt.Printf("%c", at(0)) + fmt.Printf("%c", at(12)) + fmt.Printf("%c", at(7)) + fmt.Printf("%c", at(0)) + + // Output: + // H!wH +} + +func ExampleReader_RuneAt_endOfFile() { + in := strings.NewReader("Hello, world!") + r := NewReader(in) + + rn, err := r.RuneAt(13) + fmt.Printf("%q %s %t\n", rn, err, err == io.EOF) + + rn, err = r.RuneAt(20) + fmt.Printf("%q %s %t\n", rn, err, err == io.EOF) + + // Output: + // '�' EOF true + // '�' EOF true +} + +func ExampleReader_RuneAt_invalidRune() { + in := strings.NewReader("Hello, \xcdworld!") + r := NewReader(in) + + rn, err := r.RuneAt(6) + fmt.Printf("%q %t\n", rn, err == nil) + rn, err = r.RuneAt(7) + fmt.Printf("%q %t\n", rn, err == nil) + rn, err = r.RuneAt(8) + fmt.Printf("%q %t\n", rn, err == nil) + rn, err = r.RuneAt(9) + fmt.Printf("%q %t\n", rn, err == nil) + + // Output: + // ' ' true + // '�' true + // 'w' true + // 'o' true +} + +func ExampleReader_RunesAt() { + in := strings.NewReader("Hello, \xcdworld!") + r := NewReader(in) + + rs, err := r.RunesAt(4, 6) + fmt.Printf("%q %t\n", string(rs), err == nil) + rs, err = r.RunesAt(4, 0) + fmt.Printf("%q %t\n", string(rs), err == nil) + rs, err = r.RunesAt(8, 100) + fmt.Printf("%q %t\n", string(rs), err == io.EOF) + + // Output: + // "o, �wo" true + // "" true + // "world!" true +} + +func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) { + in := strings.NewReader("\uFEFFBommetje!") + r := NewReader(in) + b, _ := r.RuneAt(0) + o, _ := r.RuneAt(1) + m, _ := r.RuneAt(2) + bom := fmt.Sprintf("%c%c%c", b, o, m) + assert.Equal(t, "Bom", bom, "first three runes") +} + +func ExampleReader_Flush() { + in := strings.NewReader("Hello, world!") + r := NewReader(in) + at := func(i int) rune { r, _ := r.RuneAt(i); return r } + rb := func(start int, len int) []rune { r, _ := r.RunesAt(start, len); return r } + + // Fills the buffer with the first 8 runes on the input: "Hello, w" + fmt.Printf("%c\n", at(7)) + + // Now flush the first 4 runes from the buffer (dropping "Hell" from it) + r.Flush(4) + + // Rune 0 is now pointing at what originally was rune offset 4. + // We can continue reading from there. + fmt.Printf("%s", string(rb(0, 8))) + + // Output: + // w + // o, world +} + +func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) { + in := strings.NewReader("Hello, world!") + r := NewReader(in) + + // Fill buffer with "Hello, worl", the first 11 runes. + r.RuneAt(10) + + // However, we flush 12 runes, which exceeds the buffer size. + assert.Panic(t, assert.PanicT{ + Function: func() { r.Flush(12) }, + Expect: "parsekit.Input.Reader.Flush(): number of runes to flush (12) exceeds size of the buffer (11)", + }) +} diff --git a/stringbuf.go b/stringbuf.go deleted file mode 100644 index 727eed6..0000000 --- a/stringbuf.go +++ /dev/null @@ -1,62 +0,0 @@ -package parsekit - -import ( - "bytes" - "strconv" - "strings" -) - -// stringBuffer is a string buffer implementation that is used by the parser -// to efficiently accumulate runes from the input and eventually turn these -// into a string, either literal or interpreted. -type stringBuffer struct { - buffer bytes.Buffer -} - -// reset resets the string buffer, in order to build a new string. -func (b *stringBuffer) reset() *stringBuffer { - b.buffer.Reset() - return b -} - -// writeString adds the runes of the input string to the string buffer. -func (b *stringBuffer) writeString(s string) *stringBuffer { - for _, r := range s { - b.writeRune(r) - } - return b -} - -// writeRune adds a single rune to the string buffer. 
-func (b *stringBuffer) writeRune(r rune) *stringBuffer { - b.buffer.WriteRune(r) - return b -} - -// asLiteralString returns the string buffer as a literal string. -// Literal means that no escape sequences are processed. -func (b *stringBuffer) asLiteralString() string { - return b.buffer.String() -} - -// asInterpretedString returns the string in its interpreted form. -// Interpreted means that escape sequences are handled in the way that Go would -// have, had it been inside double quotes. It translates for example escape -// sequences like "\n", "\t", \uXXXX" and "\UXXXXXXXX" into their string -// representations. -// Since the input might contain invalid escape sequences, this method -// also returns an error. When an error is returned, the returned string will -// contain the string as far as it could be interpreted. -func (b *stringBuffer) asInterpretedString() (string, error) { - var sb strings.Builder - tail := b.buffer.String() - for len(tail) > 0 { - r, _, newtail, err := strconv.UnquoteChar(tail, '"') - if err != nil { - return sb.String(), err - } - tail = newtail - sb.WriteRune(r) - } - return sb.String(), nil -} diff --git a/stringbuf_test.go b/stringbuf_test.go deleted file mode 100644 index 0140688..0000000 --- a/stringbuf_test.go +++ /dev/null @@ -1,88 +0,0 @@ -package parsekit - -import ( - "testing" -) - -func TestGeneratingStringDoesNotResetBuffer(t *testing.T) { - var b stringBuffer - s1, _ := b.writeString(`hi\nthere`).asInterpretedString() - s2 := b.asLiteralString() - if s1 != "hi\nthere" { - t.Fatalf("Did not get expected string\"X\" for try 1, but %q", s1) - } - if s2 != "hi\\nthere" { - t.Fatalf("Did not get expected string\"X\" for try 2, but %q", s2) - } -} - -func TestResetResetsBuffer(t *testing.T) { - var b stringBuffer - s := b.writeRune('X').reset().asLiteralString() - if s != "" { - t.Fatalf("Did not get expected empty string, but %q", s) - } -} - -func TestAsLiteralString(t *testing.T) { - b := stringBuffer{} - for _, c := range []stringbufT{ - {"empty string", ``, ``, OK}, - {"simple string", `Simple string!`, `Simple string!`, OK}, - {"single quote", `'`, `'`, OK}, - {"double quote", `"`, `"`, OK}, - {"escaped single quote", `\'`, `\'`, OK}, - {"escaped double quote", `\"`, `\"`, OK}, - {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK}, - {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK}, - {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK}, - } { - s := b.reset().writeString(c.in).asLiteralString() - if s != c.out { - t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) - } - } -} - -func TestAsInterpretedString(t *testing.T) { - b := stringBuffer{} - for _, c := range []stringbufT{ - {"empty string", "", "", OK}, - {"one character", "Simple string!", "Simple string!", OK}, - {"escaped single quote", `\'`, "", FAIL}, - {"escaped double quote", `\"`, `"`, OK}, - {"bare single quote", `'`, "'", OK}, - {"string in single quotes", `'Hello'`, `'Hello'`, OK}, - {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK}, - {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK}, - {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK}, - {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK}, - {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK}, - {"example from spec", - `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`, - "I'm a string. \"You can quote me\". 
Name\tJosé\nLocation\tSF.", OK}, - } { - s, err := b.reset().writeString(c.in).asInterpretedString() - if c.isSuccessCase && err != nil { - t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err) - } - if !c.isSuccessCase && err == nil { - t.Fatalf("[%s] expected a failure, but no failure occurred", c.name) - } - if s != c.out && c.isSuccessCase { - t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) - } - } -} - -type stringbufT struct { - name string - in string - out string - isSuccessCase bool -} - -const ( - OK bool = true - FAIL bool = false -) diff --git a/tokenapi.go b/tokenapi.go new file mode 100644 index 0000000..f085a23 --- /dev/null +++ b/tokenapi.go @@ -0,0 +1,188 @@ +package parsekit + +import ( + "fmt" + "io" +) + +// TokenAPI wraps a parsekit.Reader and its purpose is to retrieve input data and +// to report back results. For easy lookahead support, a forking strategy is +// provided. +// +// BASIC OPERATION: +// +// To retrieve the next rune from the TokenAPI, call the NextRune() method. +// +// When the rune is to be accepted as input, call the method Accept(). The rune +// is then added to the result buffer of the TokenAPI struct. +// It is mandatory to call Accept() after retrieving a rune, before calling +// NextRune() again. Failing to do so will result in a panic. +// +// By invoking NextRune() + Accept() multiple times, the result buffer is extended +// with as many runes as needed. +// +// FORKING OPERATION FOR EASY LOOKEAHEAD SUPPORT: +// +// Sometimes, we must be able to perform a lookahead, which might either +// succeed or fail. In case of a failing lookahead, the state of the TokenAPI must be +// brought back to the original state, so we can try a different route. +// +// The way in which this is supported, is by forking a TokenAPI struct by calling +// Fork(). This will return a forked child TokenAPI, with an empty result buffer, +// but using the same input cursor position as the forked parent. +// +// After forking, the same interface as described for BASIC OPERATION can be +// used to fill the result buffer. When the lookahead was successful, then +// Merge() can be called on the forked child to append the child's result +// buffer to the parent's result buffer, and to move the input cursor position +// to that of the child. +// +// When the lookahead was unsuccessful, then the forked child TokenAPI can simply +// be discarded. The parent TokenAPI was never modified, so it can safely be used +// as if the lookahead never happened. +// +// Note: +// Many tokenizers/parsers take a different approach on lookaheads by using +// peeks and by moving the input cursor position back and forth, or by putting +// read input back on the input stream. That often leads to code that is +// efficient, however, in my opinion, not very untuitive to read. +type TokenAPI struct { + reader *Reader + cursor *Cursor // current read cursor position, rel. to the input start + offset int // current rune offset rel. to the Reader's sliding window + result *Result // results as produced by a TokenHandler (runes, Tokens) + root *TokenAPI // the root TokenAPI + parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child + child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent +} + +// NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader. 
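As an illustration of the fork/merge flow documented above, a minimal TokenHandler could look like the sketch below. This is a hypothetical example (matchABC is not part of this patch); it only uses the NextRune(), Accept(), Fork() and Merge() calls described in the comment.

// matchABC is an illustrative sketch: it matches the literal input "abc"
// by doing the lookahead on a forked child, so a failed match leaves the
// parent TokenAPI untouched.
func matchABC(t *TokenAPI) bool {
	child := t.Fork() // all reading happens on the fork
	for _, want := range "abc" {
		r, err := child.NextRune()
		if err != nil || r != want {
			return false // discard the fork; the parent was never modified
		}
		child.Accept() // add the rune to the fork's result buffer
	}
	child.Merge() // append "abc" to the parent's result and advance its cursor
	return true
}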
+func NewTokenAPI(r io.Reader) *TokenAPI { + input := &TokenAPI{ + reader: NewReader(r), + cursor: &Cursor{}, + result: NewResult(), + } + input.root = input + return input +} + +// NextRune returns the rune at the current read offset. +// +// When an invalid UTF8 rune is encountered on the input, it is replaced with +// the utf.RuneError rune. It's up to the caller to handle this as an error +// when needed. +// +// After reading a rune it must be Accept()-ed to move the read cursor forward +// to the next rune. Doing so is mandatory. When doing a second call to NextRune() +// without explicitly accepting, this method will panic. +func (i *TokenAPI) NextRune() (rune, error) { + if i.result.lastRune != nil { + caller, linepos := getCaller(1) + panic(fmt.Sprintf("parsekit.TokenAPI.NextRune(): NextRune() called without a prior call "+ + "to Accept() from %s at %s", caller, linepos)) + } + i.detachChilds() + + readRune, err := i.reader.RuneAt(i.offset) + i.result.lastRune = &runeInfo{r: readRune, err: err} + return readRune, err +} + +// Accept the last rune as read by NextRune() into the result buffer and move +// the cursor forward. +// +// It is not allowed to call Accept() when the previous call to NextRune() +// returned an error. Calling Accept() in such case will result in a panic. +func (i *TokenAPI) Accept() { + if i.result.lastRune == nil { + caller, linepos := getCaller(1) + panic(fmt.Sprintf( + "parsekit.TokenAPI.Accept(): Accept() called without first "+ + "calling NextRune() from %s at %s", caller, linepos)) + } else if i.result.lastRune.err != nil { + caller, linepos := getCaller(1) + panic(fmt.Sprintf( + "parsekit.TokenAPI.Accept(): Accept() called while the previous "+ + "call to NextRune() failed from %s at %s", caller, linepos)) + } + i.result.runes = append(i.result.runes, i.result.lastRune.r) + i.cursor.move(fmt.Sprintf("%c", i.result.lastRune.r)) + i.offset++ + i.result.lastRune = nil +} + +// Fork forks off a child of the TokenAPI struct. It will reuse the same Reader and +// read cursor position, but for the rest this is a fresh TokenAPI. +func (i *TokenAPI) Fork() *TokenAPI { + i.detachChilds() + + // Create the new fork. + child := &TokenAPI{ + reader: i.reader, + cursor: &Cursor{}, + offset: i.offset, + root: i.root, + parent: i, + } + child.result = NewResult() + *child.cursor = *i.cursor + i.child = child + i.result.lastRune = nil + return child +} + +// Merge adds the data of the forked child TokenAPI that Merge() is called on to the +// data of its parent (results and read cursor position). +func (i *TokenAPI) Merge() { + if i.parent == nil { + panic("parsekit.TokenAPI.Merge(): Cannot call Merge() on a non-forked TokenAPI") + } + + i.parent.result.runes = append(i.parent.result.runes, i.result.runes...) + i.parent.result.tokens = append(i.parent.result.tokens, i.result.tokens...) + i.parent.offset = i.offset + i.parent.cursor = i.cursor + + i.detachChilds() + i.result = NewResult() +} + +// Result returns the result data for the TokenAPI. The returned struct +// can be used to retrieve and modify the result data. +func (i *TokenAPI) Result() *Result { + return i.result +} + +// Cursor retrieves the current read cursor data. +// TODO make this and offset part of Result struct? +func (i *TokenAPI) Cursor() Cursor { + return *i.cursor +} + +// FlushReaderBuffer delegates to the Flush() method of the contained +// parsekit.TokenAPI.Reader. It flushes the provided number of runes from the +// reader cache. 
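A short, hypothetical usage sketch of the flushing behaviour described here (it assumes the standard library strings package for the input): once a chunk of input has been accepted, the consumed runes can be flushed from the reader cache so memory stays bounded on long inputs.

// Illustrative sketch, not part of this patch.
api := NewTokenAPI(strings.NewReader("Testing"))
for j := 0; j < 4; j++ {
	if _, err := api.NextRune(); err == nil {
		api.Accept()
	}
}
api.FlushReaderBuffer(4) // drop "Test" from the reader cache; the offset resets to 0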
+func (i *TokenAPI) FlushReaderBuffer(numberOfRunes int) { + if i != i.root { + panic("parsekit.input.TokenAPI.FlushReaderBuffer(): Flushbuffer() can only be called on the root TokenAPI, not on a forked child") + } + i.detachChilds() + i.reader.Flush(numberOfRunes) + i.offset = 0 +} + +func (i *TokenAPI) detachChilds() { + if i.child != nil { + i.child.detachChildsRecurse() + i.child = nil + } +} + +func (i *TokenAPI) detachChildsRecurse() { + if i.child != nil { + i.child.detachChildsRecurse() + } + i.child = nil + i.parent = nil +} diff --git a/tokenapi_result.go b/tokenapi_result.go new file mode 100644 index 0000000..ef90761 --- /dev/null +++ b/tokenapi_result.go @@ -0,0 +1,106 @@ +package parsekit + +import ( + "fmt" +) + +// Result holds results as produced by a TokenHandler. +type Result struct { + lastRune *runeInfo // Information about the last rune read using NextRune() + runes []rune + tokens []*Token +} + +type runeInfo struct { + r rune + err error +} + +// Token defines a lexical token as produced by TokenHandlers. +type Token struct { + Type interface{} // token type, can be any type that a parser author sees fit + Runes []rune // the runes that make up the token + Value interface{} // an optional value of any type +} + +// NewResult initializes an empty result struct. +func NewResult() *Result { + return &Result{ + runes: []rune{}, + tokens: []*Token{}, + } +} + +// ClearRunes clears the runes in the Result. +func (r *Result) ClearRunes() { + r.runes = []rune{} +} + +// SetRunes replaces the Runes from the Result with the provided input. +func (r *Result) SetRunes(s interface{}) { + r.ClearRunes() + r.AddRunes(s) +} + +// AddRunes is used to add runes to the Result. +func (r *Result) AddRunes(s interface{}) { + switch s := s.(type) { + case string: + r.runes = append(r.runes, []rune(s)...) + case []rune: + r.runes = append(r.runes, s...) + case rune: + r.runes = append(r.runes, s) + default: + panic(fmt.Sprintf("parsekit.Result.SetRunes(): unsupported type '%T' used", s)) + } +} + +// Runes retrieves the Runes from the Result. +func (r *Result) Runes() []rune { + return r.runes +} + +// Rune retrieve a single rune from the Result at the specified index. +func (r *Result) Rune(idx int) rune { + return r.runes[idx] +} + +// String returns the Runes from the Result as a string. +func (r *Result) String() string { + return string(r.runes) +} + +// ClearTokens clears the tokens in the Result. +func (r *Result) ClearTokens() { + r.tokens = []*Token{} +} + +// AddToken is used to add a Token to the results. +func (r *Result) AddToken(t *Token) { + r.tokens = append(r.tokens, t) +} + +// Tokens retrieves the Tokens from the Result. +func (r *Result) Tokens() []*Token { + return r.tokens +} + +// Token retrieves a single Token from the Result at the specified index. +func (r *Result) Token(idx int) *Token { + return r.tokens[idx] +} + +// Values retrieves a slice containing only the Values for the Result Tokens. +func (r *Result) Values() []interface{} { + values := make([]interface{}, len(r.tokens)) + for i, tok := range r.tokens { + values[i] = tok.Value + } + return values +} + +// Value retrieves a single Value from the Result Token at the specified index. 
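To make the Result API above concrete, here is a small hypothetical sketch showing how runes and Tokens are accumulated and read back (the NUMBER token type is made up for the example):

// Illustrative sketch, not part of this patch.
res := NewResult()
res.AddRunes("12") // strings, []rune and single runes are all accepted
res.AddRunes('3')
res.AddToken(&Token{Type: "NUMBER", Runes: res.Runes(), Value: 123})
fmt.Println(res.String()) // prints: 123
fmt.Println(res.Value(0)) // prints: 123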
+func (r *Result) Value(idx int) interface{} { + return r.tokens[idx].Value +} diff --git a/tokenapi_result_test.go b/tokenapi_result_test.go new file mode 100644 index 0000000..3ccec52 --- /dev/null +++ b/tokenapi_result_test.go @@ -0,0 +1,27 @@ +package parsekit + +import ( + "testing" + + "git.makaay.nl/mauricem/go-parsekit/assert" +) + +func TestSetResult_AcceptsVariousTypesAsInput(t *testing.T) { + i := mkInput() + i.Result().SetRunes("string") + assert.Equal(t, "string", string(i.Result().String()), "i.Result() with string input") + i.Result().SetRunes([]rune("rune slice")) + assert.Equal(t, "rune slice", string(i.Result().String()), "i.Result() with rune slice input") + i.Result().SetRunes('X') + assert.Equal(t, "X", string(i.Result().String()), "i.Result() with rune input") +} + +func TestSetResult_PanicsOnUnhandledInput(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + i.Result().SetRunes(1234567) + }, + Expect: "parsekit.Result.SetRunes(): unsupported type 'int' used", + }) +} diff --git a/tokenapi_test.go b/tokenapi_test.go new file mode 100644 index 0000000..59e78f9 --- /dev/null +++ b/tokenapi_test.go @@ -0,0 +1,288 @@ +package parsekit + +import ( + "io" + "strings" + "testing" + "unicode/utf8" + + "git.makaay.nl/mauricem/go-parsekit/assert" +) + +func TestCallingNextRune_ReturnsNextRune(t *testing.T) { + r, _ := mkInput().NextRune() + assert.Equal(t, 'T', r, "first rune") +} + +func TestInputCanAcceptRunesFromReader(t *testing.T) { + i := mkInput() + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + assert.Equal(t, "Tes", i.Result().String(), "i.Result().String()") +} + +func TestCallingNextRuneTwice_Panics(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + i.NextRune() + i.NextRune() + }, + Regexp: true, + Expect: `parsekit\.TokenAPI\.NextRune\(\): NextRune\(\) called without ` + + `a prior call to Accept\(\) from .*TestCallingNextRuneTwice_Panics.* at /.*_test.go:\d+`, + }) +} + +func TestCallingAcceptWithoutCallingNextRune_Panics(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: mkInput().Accept, + Regexp: true, + Expect: `parsekit\.TokenAPI\.Accept\(\): Accept\(\) called without ` + + `first calling NextRune\(\) from .* at /.*:\d+`, + }) +} + +func TestCallingMergeOnNonForkedChild_Panics(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + i.Merge() + }, + Expect: "parsekit.TokenAPI.Merge(): Cannot call Merge() on a non-forked TokenAPI", + }) +} + +func TestCallingNextRuneOnForkedParent_DetachesForkedChild(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + f := i.Fork() + i.NextRune() + f.Merge() + }, + Expect: "parsekit.TokenAPI.Merge(): Cannot call Merge() on a non-forked TokenAPI", + }) +} + +func TestCallingForkOnForkedParent_DetachesForkedChild(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + f := i.Fork() + i.Fork() + f.Merge() + }, + Expect: "parsekit.TokenAPI.Merge(): Cannot call Merge() on a non-forked TokenAPI", + }) +} + +func TestGivenMultipleLevelsOfForks_WhenReturningToRootInput_ForksAreDetached(t *testing.T) { + i := mkInput() + f1 := i.Fork() + f2 := f1.Fork() + f3 := f2.Fork() + f4 := f1.Fork() // secret subtest: this Fork() detaches both forks f2 and f3 + f5 := f4.Fork() + assert.Equal(t, true, i.parent == nil, "i.parent == nil") + assert.Equal(t, true, i.child == f1, "i.child == f1") + assert.Equal(t, true, f1.parent == 
i, "f1.parent == i") + assert.Equal(t, true, f1.child == f4, "f1.child == f4") + assert.Equal(t, true, f2.child == nil, "f2.child == nil") + assert.Equal(t, true, f2.parent == nil, "f2.parent == nil") + assert.Equal(t, true, f3.child == nil, "f3.child == nil") + assert.Equal(t, true, f3.parent == nil, "f3.parent == nil") + assert.Equal(t, true, f4.parent == f1, "f4.parent == f1") + assert.Equal(t, true, f4.child == f5, "f4.child == f5") + assert.Equal(t, true, f5.parent == f4, "f5.parent == f4") + assert.Equal(t, true, f5.child == nil, "f5.child == nil") + + i.NextRune() + + assert.Equal(t, true, i.parent == nil, "i.parent == nil") + assert.Equal(t, true, i.child == nil, "i.child == nil") + assert.Equal(t, true, f1.parent == nil, "f1.parent == nil") + assert.Equal(t, true, f1.child == nil, "f1.child == nil") + assert.Equal(t, true, f2.child == nil, "f2.child == nil") + assert.Equal(t, true, f2.parent == nil, "f2.parent == nil") + assert.Equal(t, true, f3.child == nil, "f3.child == nil") + assert.Equal(t, true, f3.parent == nil, "f3.parent == nil") + assert.Equal(t, true, f4.parent == nil, "f4.parent == nil") + assert.Equal(t, true, f4.child == nil, "f4.child == nil") + assert.Equal(t, true, f5.parent == nil, "f5.parent == nil") + assert.Equal(t, true, f5.child == nil, "f5.child == nil") +} + +func TestForkingInput_ClearsLastRune(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + i.NextRune() + i.Fork() + i.Accept() + }, + Regexp: true, + Expect: `parsekit\.TokenAPI\.Accept\(\): Accept\(\) called without ` + + `first calling NextRune\(\) from .* at /.*:\d+`, + }) +} + +func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *testing.T) { + i := mkInput() + r, _ := i.NextRune() + assert.Equal(t, 'T', r, "result from 1st call to NextRune()") + // TODO still (*runeInfo) case needed? + assert.NotEqual(t, (*runeInfo)(nil), i.result.lastRune, "Input.lastRune after NextRune()") + i.Accept() + assert.Equal(t, (*runeInfo)(nil), i.result.lastRune, "Input.lastRune after Accept()") + assert.Equal(t, 1, i.offset, "Input.offset") + assert.Equal(t, 'T', i.reader.buffer[0], "Input.buffer[0]") + r, _ = i.NextRune() + assert.Equal(t, 'e', r, "result from 2nd call to NextRune()") +} + +func TestCallingMultipleAccepts_FillsInputWithData(t *testing.T) { + i := mkInput() + for j := 0; j < 7; j++ { + i.NextRune() + i.Accept() + } + assert.Equal(t, "Testing", string(i.reader.buffer), "reader input buffer") + assert.Equal(t, "Testing", i.Result().String(), "i.Result().String()") +} + +func TestAccept_UpdatesCursor(t *testing.T) { + i := NewTokenAPI(strings.NewReader("input\r\nwith\r\nnewlines")) + assert.Equal(t, "line 1, column 1", i.cursor.String(), "cursor 1") + for j := 0; j < 6; j++ { // read "input\r", cursor end up at "\n" + i.NextRune() + i.Accept() + } + assert.Equal(t, "line 1, column 7", i.cursor.String(), "cursor 2") + i.NextRune() // read "\n", cursor ends up at start of new line + i.Accept() + assert.Equal(t, "line 2, column 1", i.cursor.String(), "cursor 3") + for j := 0; j < 10; j++ { // read "with\r\nnewl", cursor end up at "i" + i.NextRune() + i.Accept() + } + assert.Equal(t, "line 3, column 5", i.cursor.String(), "cursor 4") + assert.Equal(t, *i.cursor, i.Cursor(), "i.Cursor()") +} + +func TestFork_CreatesForkOfInputAtSameCursorPosition(t *testing.T) { + // Create input, accept the first rune. 
+ i := mkInput() + i.NextRune() + i.Accept() // T + assert.Equal(t, "T", i.Result().String(), "accepted rune in input") + // Fork + f := i.Fork() + assert.Equal(t, f, i.child, "Input.child (must be f)") + assert.Equal(t, i, f.parent, "Input.parent (must be i)") + assert.Equal(t, 1, i.cursor.Byte, "i.child.cursor.Byte") + assert.Equal(t, 1, i.child.cursor.Byte, "i.child.cursor.Byte") + // Accept two runes via fork. + f.NextRune() + f.Accept() // e + f.NextRune() + f.Accept() // s + assert.Equal(t, "es", f.Result().String(), "result runes in fork") + assert.Equal(t, 1, i.cursor.Byte, "i.child.cursor.Byte") + assert.Equal(t, 3, i.child.cursor.Byte, "i.child.cursor.Byte") + // Merge fork back into parent + f.Merge() + assert.Equal(t, "Tes", i.Result().String(), "result runes in parent Input after Merge()") + assert.Equal(t, 3, i.cursor.Byte, "i.child.cursor.Byte") +} + +func TestGivenForkedChildWhichAcceptedRune_AfterMerging_RuneEndsUpInParentResult(t *testing.T) { + i := mkInput() + i.NextRune() + i.Accept() + f1 := i.Fork() + f1.NextRune() + f1.Accept() + f2 := f1.Fork() + f2.NextRune() + f2.Accept() + assert.Equal(t, "T", i.Result().String(), "i.Result().String()") + assert.Equal(t, 1, i.offset, "i.offset") + assert.Equal(t, "e", f1.Result().String(), "f1.Result().String()") + assert.Equal(t, 2, f1.offset, "f1.offset") + assert.Equal(t, "s", f2.Result().String(), "f2.Result().String()") + assert.Equal(t, 3, f2.offset, "f2.offset") + f2.Merge() + assert.Equal(t, "T", i.Result().String(), "i.Result().String()") + assert.Equal(t, 1, i.offset, "i.offset") + assert.Equal(t, "es", f1.Result().String(), "f1.Result().String()") + assert.Equal(t, 3, f1.offset, "f1.offset") + assert.Equal(t, "", f2.Result().String(), "f2.Result().String()") + assert.Equal(t, 3, f2.offset, "f2.offset") + f1.Merge() + assert.Equal(t, "Tes", i.Result().String(), "i.Result().String()") + assert.Equal(t, 3, i.offset, "i.offset") + assert.Equal(t, "", f1.Result().String(), "f1.Result().String()") + assert.Equal(t, 3, f1.offset, "f1.offset") + assert.Equal(t, "", f2.Result().String(), "f2.Result().String()") + assert.Equal(t, 3, f2.offset, "f2.offset") +} + +func TestGivenForkedChild_FlushReaderBuffer_Panics(t *testing.T) { + assert.Panic(t, assert.PanicT{ + Function: func() { + i := mkInput() + f := i.Fork() + f.FlushReaderBuffer(1) + }, + Expect: "parsekit.input.TokenAPI.FlushReaderBuffer(): Flushbuffer() " + + "can only be called on the root TokenAPI, not on a forked child", + }) +} + +func TestGivenRootWithSomeRunesRead_FlushReaderBuffer_ClearsReaderBuffer(t *testing.T) { + i := mkInput() + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + i.FlushReaderBuffer(2) + assert.Equal(t, "Te", i.Result().String(), "i.Result()") + assert.Equal(t, 0, i.offset, "i.offset") + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + assert.Equal(t, 2, i.offset, "i.offset") + i.FlushReaderBuffer(2) + assert.Equal(t, "Test", i.Result().String(), "i.Result()") + assert.Equal(t, 0, i.offset, "i.offset") +} + +func TestWhenCallingNextRuneAtEndOfFile_EOFIsReturned(t *testing.T) { + i := NewTokenAPI(strings.NewReader("X")) + i.NextRune() + i.Accept() + r, err := i.NextRune() + assert.Equal(t, true, r == utf8.RuneError, "returned rune from NextRune()") + assert.Equal(t, true, err == io.EOF, "returned error from NextRune()") +} +func TestAfterReadingRuneAtEndOfFile_EarlierRunesCanStillBeAccessed(t *testing.T) { + i := NewTokenAPI(strings.NewReader("X")) + f := i.Fork() + f.NextRune() + f.Accept() + r, err := f.NextRune() + 
assert.Equal(t, true, r == utf8.RuneError, "returned rune from 2nd NextRune()") + r, err = i.NextRune() + assert.Equal(t, 'X', r, "returned rune from 2nd NextRune()") + assert.Equal(t, true, err == nil, "returned error from 2nd NextRune()") +} + +func mkInput() *TokenAPI { + return NewTokenAPI(strings.NewReader("Testing")) +} diff --git a/tokenhandler.go b/tokenhandler.go index ecc1079..997e4ed 100644 --- a/tokenhandler.go +++ b/tokenhandler.go @@ -2,113 +2,55 @@ package parsekit import ( "fmt" + "runtime" ) // TokenHandler is the function type that is involved in turning a low level -// stream of UTF8 runes into parsing tokens. Its purpose is to check if input -// data matches some kind of pattern and to report back the match. -// -// A TokenHandler is to be used in conjunction with parsekit.P.On() or -// parsekit.Matcher(). +// stream of UTF8 runes into lexical tokens. Its purpose is to check if input +// data matches some kind of pattern and to report back the token(s). // // A TokenHandler function gets a TokenAPI as its input and returns a boolean to // indicate whether or not it found a match on the input. The TokenAPI is used // for retrieving input data to match against and for reporting back results. type TokenHandler func(t *TokenAPI) bool -// TokenAPI is used by TokenHandler functions to retrieve runes from the -// input to match against and to report back results. -// -// Basic operation: -// -// To retrieve the next rune from the input, the TokenHandler function can call -// the TokenAPI.NextRune() method. -// -// The TokenHandler function can then evaluate the retrieved rune and either -// accept of skip the rune. When accepting it using TokenAPI.Accept(), the rune -// is added to the resulting output of the TokenAPI. When using TokenAPI.Skip(), -// the rune will not be added to the output. It is mandatory for a TokenHandler -// to call either Accept() or Skip() after retrieving a rune, before calling -// NextRune() again. -// -// Eventually, the TokenHandler function must return a boolean value, indicating -// whether or not a match was found. When true, then the calling code will -// use the runes that were accepted into the TokenAPI's resulting output. -// -// Forking operation for easy lookahead support: -// -// Sometimes, a TokenHandler function must be able to perform a lookahead, which -// might either succeed or fail. In case of a failing lookahead, the state -// of the TokenAPI must be brought back to the original state. -// -// The way in which this is supported, is by forking a TokenAPI by calling -// TokenAPI.Fork(). This will return a child TokenAPI, with an empty -// output buffer, but using the same input cursor position as the forked parent. -// -// The TokenHandler function can then use the same interface as described for -// normal operation to retrieve runes from the input and to fill the resulting -// output. When the TokenHandler function decides that the lookahead was successful, -// then the method TokenAPI.Merge() can be called on the forked child to -// append the resulting output from the child to the parent's resulting output, -// and to update the parent input cursor position to that of the child. -// -// When the TokenHandler function decides that the lookahead was unsuccessful, -// then it can simply discard the forked child. The parent TokenAPI was never -// modified, so a new match can be safely started using that parent, as if the -// lookahead never happened. 
-type TokenAPI struct { - p *ParseAPI // parser state, used to retrieve input data to match against (TODO should be tiny interface) - inputOffset int // the byte offset into the input - input []rune // a slice of runes that represents all retrieved input runes for the Matcher - output []rune // a slice of runes that represents the accepted output runes for the Matcher - currRune *runeInfo // hold information for the last rune that was read from the input - parent *TokenAPI // the parent MatchDialog, in case this one was forked -} - -// runeInfo describes a single rune and its metadata. -type runeInfo struct { - Rune rune // an UTF8 rune - ByteSize int // the number of bytes in the rune - OK bool // false when the rune represents an invalid UTF8 rune or EOF -} - // NextRune retrieves the next rune from the input. // // It returns the rune and a boolean. The boolean will be false in case an // invalid UTF8 rune or the end of the file was encountered. // -// After using NextRune() to retrieve a rune, Accept() or Skip() can be called -// to respectively add the rune to the TokenAPI's resulting output or to -// fully ignore it. This way, a TokenHandler has full control over what runes are -// significant for the resulting output of that TokenHandler. +// After retrieving a rune, Accept() or Skip() can be called to respectively add +// the rune to the TokenAPIold's string buffer or to fully ignore it. This way, +// a TokenHandler has full control over what runes are significant for the +// resulting output of that TokenHandler. // // After using NextRune(), this method can not be reinvoked, until the last read // rune is explicitly accepted or skipped as described above. -func (t *TokenAPI) NextRune() (rune, bool) { - if t.currRune != nil { - caller, filepos := t.p.getCaller(1) - panic(fmt.Sprintf( - "TokenHandler bug: NextRune() was called from %s at %s "+ - "without accepting or skipping the previously read rune", caller, filepos)) - } - r, w, ok := t.p.peek(t.inputOffset) - t.currRune = &runeInfo{r, w, ok} - if ok { - t.input = append(t.input, r) - } - return r, ok -} +// func (t *TokenAPIold) NextRune() (rune, bool) { +// if t.lastRune != nil { +// caller, filepos := getCaller(1) +// panic(fmt.Sprintf( +// "TokenHandler bug: NextRune() was called from %s at %s "+ +// "without accepting or skipping the previously read rune", caller, filepos)) +// } +// r, w, ok := 'X', 10, true // t.input.peek(t.inputOffset) +// t.lastRune = &runeInfo{r, w, ok} +// if ok { +// t.result.Input = append(t.result.Input, r) +// } +// return r, ok +// } -// Fork splits off a child TokenAPI, containing the same input cursor position -// as the parent TokenAPI, but with all other data in a fresh state. +// Fork splits off a child TokenAPIold, containing the same input cursor position +// as the parent TokenAPIold, but with all other data in a fresh state. // -// By forking, a TokenHandler function can freely work with a TokenAPI, without -// affecting the parent TokenAPI. This is for example useful when the +// By forking, a TokenHandler function can freely work with a TokenAPIold, without +// affecting the parent TokenAPIold. This is for example useful when the // TokenHandler function must perform some form of lookahead. // // When a successful match was found, the TokenHandler function can call -// TokenAPI.Merge() on the forked child to have the resulting output added -// to the parent TokenAPI. +// TokenAPIold.Merge() on the forked child to have the resulting output added +// to the parent TokenAPIold. 
// // When no match was found, the forked child can simply be discarded. // @@ -118,7 +60,7 @@ func (t *TokenAPI) NextRune() (rune, bool) { // case could look like this (yes, it's naive, but it shows the point): // TODO make proper tested example // -// func MatchAbcd(t *TokenAPI) bool { +// func MatchAbcd(t *TokenAPIold) bool { // child := t.Fork() // fork to keep m from input untouched // for _, letter := []rune {'a', 'b', 'c', 'd'} { // if r, ok := t.NextRune(); !ok || r != letter { @@ -129,73 +71,69 @@ func (t *TokenAPI) NextRune() (rune, bool) { // child.Merge() // we have a match, add resulting output to parent // return true // and report the successful match // } -func (t *TokenAPI) Fork() *TokenAPI { - return &TokenAPI{ - p: t.p, - inputOffset: t.inputOffset, - parent: t, - } -} -// Accept will add the last rune as read by TokenAPI.NextRune() to the resulting -// output of the TokenAPI. -func (t *TokenAPI) Accept() { - t.checkAllowedCall("Accept()") - t.output = append(t.output, t.currRune.Rune) - t.inputOffset += t.currRune.ByteSize - t.currRune = nil -} +// Accept will add the last rune as read by TokenAPIold.NextRune() to the resulting +// output of the TokenAPIold. +// func (t *TokenAPIold) Accept() { +// t.checkAllowedCall("Accept()") +// t.buffer = append(t.buffer, t.lastRune.Rune) +// t.result.Accepted = append(t.result.Accepted, t.lastRune.Rune) +// t.inputOffset += t.lastRune.ByteSize +// t.lastRune = nil +// } // Skip will ignore the last rune as read by NextRune(). -func (t *TokenAPI) Skip() { - t.checkAllowedCall("Skip()") - t.inputOffset += t.currRune.ByteSize - t.currRune = nil -} +// func (t *TokenAPIold) Skip() { +// t.checkAllowedCall("Skip()") +// t.inputOffset += t.lastRune.ByteSize +// t.lastRune = nil +// } -func (t *TokenAPI) checkAllowedCall(name string) { - if t.currRune == nil { - caller, filepos := t.p.getCaller(2) - panic(fmt.Sprintf( - "TokenHandler bug: %s was called from %s at %s without a prior call to NextRune()", - name, caller, filepos)) - } - if !t.currRune.OK { - caller, filepos := t.p.getCaller(2) - panic(fmt.Sprintf( - "TokenHandler bug: %s was called from %s at %s, but prior call to NextRune() "+ - "did not return OK (EOF or invalid rune)", name, caller, filepos)) - } -} +// func (t *TokenAPIold) checkAllowedCall(name string) { +// if t.lastRune == nil { +// caller, filepos := getCaller(2) +// panic(fmt.Sprintf( +// "TokenHandler bug: %s was called from %s at %s without a prior call to NextRune()", +// name, caller, filepos)) +// } +// if !t.lastRune.OK { +// caller, filepos := getCaller(2) +// panic(fmt.Sprintf( +// "TokenHandler bug: %s was called from %s at %s, but prior call to NextRune() "+ +// "did not return OK (EOF or invalid rune)", name, caller, filepos)) +// } +// } -// Merge merges the resulting output from a forked child TokenAPI back into +// AddToken is used to add a token to the results of the TokenHandler. +// func (t *TokenAPIold) AddToken(tok *Token) { +// t.result.Tokens = append(t.result.Tokens, tok) +// } + +// Merge merges the resulting output from a forked child TokenAPIold back into // its parent: The runes that are accepted in the child are added to the parent // runes and the parent's input cursor position is advanced to the child's // cursor position. 
// -// After the merge, the child TokenAPI is reset so it can immediately be +// After the merge, the child TokenAPIold is reset so it can immediately be // reused for performing another match (all data are cleared, except for the // input offset which is kept at its current position). -func (t *TokenAPI) Merge() bool { - if t.parent == nil { - panic("TokenHandler bug: Cannot call Merge a a non-forked MatchDialog") - } - t.parent.input = append(t.parent.input, t.input...) - t.parent.output = append(t.parent.output, t.output...) - t.parent.inputOffset = t.inputOffset - t.ClearOutput() - t.ClearInput() - return true -} +// func (t *TokenAPIold) Merge() bool { +// if t.parent == nil { +// panic("TokenHandler bug: Cannot call Merge a a non-forked MatchDialog") +// } +// t.parent.buffer = append(t.parent.buffer, t.result.Accepted...) +// t.parent.result.Input = append(t.parent.result.Input, t.result.Input...) +// t.parent.result.Accepted = append(t.parent.result.Accepted, t.result.Accepted...) +// t.parent.result.Tokens = append(t.parent.result.Tokens, t.result.Tokens...) +// t.parent.inputOffset = t.inputOffset +// t.result = &TokResult{} +// return true +// } -// ClearOutput clears the resulting output for the TokenAPI, but it keeps -// the input and input offset as-is. -func (t *TokenAPI) ClearOutput() { - t.output = []rune{} -} - -// ClearInput clears the input for the TokenAPI, but it keeps the output -// and input offset as-is. -func (t *TokenAPI) ClearInput() { - t.input = []rune{} +func getCaller(depth int) (string, string) { + // No error handling, because we call this method ourselves with safe depth values. + pc, file, line, _ := runtime.Caller(depth + 1) + filepos := fmt.Sprintf("%s:%d", file, line) + caller := runtime.FuncForPC(pc) + return caller.Name(), filepos } diff --git a/tokenhandler_test.go b/tokenhandler_test.go index 9e82769..04f005c 100644 --- a/tokenhandler_test.go +++ b/tokenhandler_test.go @@ -4,34 +4,107 @@ import ( "testing" "git.makaay.nl/mauricem/go-parsekit" + "git.makaay.nl/mauricem/go-parsekit/assert" ) -func TestWithinTokenHandler_AcceptIncludesAndSkipIgnoresRuneInOutput(t *testing.T) { +func TestWithinTokenHandler_AcceptIncludesRuneInOutput(t *testing.T) { parser := parsekit.NewMatcher(func(t *parsekit.TokenAPI) bool { - for i := 0; i < 33; i++ { + for i := 0; i < 20; i++ { t.NextRune() t.Accept() - t.NextRune() - t.Skip() } return true }, "test") - output, _ := parser.Execute("Txhxixsx xsxhxoxuxlxdx xbxexcxoxmxex xqxuxixtxex xrxexaxdxaxbxlxex") - if output != "This should become quite readable" { - t.Fatalf("Got unexpected output from TokenHandler: %s", output) + result, _ := parser.Execute("This is some random data to parse") + if result.String() != "This is some random " { + t.Fatalf("Got unexpected output from TokenHandler: %s", result.String()) } } -func TestGivenNextRuneCalled_WithoutAcceptOrSkip_NextCallToNextRunePanics(t *testing.T) { +func TestWithinTokenHandler_TokensCanBeEmitted(t *testing.T) { parser := parsekit.NewMatcher(func(t *parsekit.TokenAPI) bool { - t.NextRune() - t.NextRune() - return false + t.Result().AddToken(&parsekit.Token{ + Type: "PI", + Runes: []rune("π"), + Value: 3.1415, + }) + t.Result().AddToken(&parsekit.Token{ + Type: nil, + Runes: []rune("yes"), + Value: true, + }) + return true }, "test") - RunPanicTest(t, PanicTest{ - func() { parser.Execute("input string") }, - `TokenHandler bug: NextRune\(\) was called from .*NextCallToNextRunePanics.* ` + - `at .*/tokenhandler_test\.go:\d+ without accepting or skipping the previously 
read rune`}) + result, _ := parser.Execute("doesn't matter") + if len(result.Tokens()) != 2 { + t.Fatalf("Wrong number of tokens in result, expected 2, got %d", len(result.Tokens())) + } + if result.Token(0).Value != 3.1415 { + t.Fatal("Token 0 value not 3.1415") + } + if string(result.Token(0).Runes) != "π" { + t.Fatal("Token 0 runes not \"π\"") + } + if result.Token(0).Type != "PI" { + t.Fatal("Token 0 type not \"PI\"") + } + if result.Token(1).Value != true { + t.Fatal("Token 1 value not true") + } + if string(result.Token(1).Runes) != "yes" { + t.Fatal("Token 1 runes not \"yes\"") + } + if result.Token(1).Type != nil { + t.Fatal("Token 1 type not nil") + } +} + +func TestUsingTokenParserCombinators_TokensCanBeEmitted(t *testing.T) { + fooToken := tok.StrLiteral("ASCII", c.OneOrMore(a.ASCII)) + parser := parsekit.NewMatcher(fooToken, "something") + input := "This is fine ASCII Åltho hère öt endĩt!" + result, err := parser.Execute(input) + + if err != nil { + t.Fatalf("Unexpected error from parser: %s", err) + } + if result.String() != "This is fine ASCII " { + t.Fatalf("result.String() contains unexpected data: %s", result.String()) + } +} + +func TestUsingTokenParserCombinators_TokensCanBeNested(t *testing.T) { + fooToken := c.Seq( + m.Drop(c.ZeroOrMore(a.Asterisk)), + tok.StrLiteral("COMBI", c.Seq( + tok.StrLiteral("ASCII", m.TrimSpace(c.OneOrMore(a.ASCII))), + tok.StrLiteral("UTF8", m.TrimSpace(c.OneOrMore(c.Except(a.Asterisk, a.AnyRune)))), + )), + m.Drop(c.ZeroOrMore(a.Asterisk)), + ) + parser := parsekit.NewMatcher(fooToken, "something") + input := "*** This is fine ASCII Åltho hère öt endĩt! ***" + output := "This is fine ASCIIÅltho hère öt endĩt!" + result, err := parser.Execute(input) + + if err != nil { + t.Fatalf("Unexpected error from parser: %s", err) + } + if result.String() != output { + t.Fatalf("result.String() contains unexpected data: %s", result.String()) + } + if result.Token(0).Type != "COMBI" { + t.Fatalf("Token 0 has unexpected type: %s", result.Token(0).Type) + } + if result.Token(0).Value != "This is fine ASCIIÅltho hère öt endĩt!" { + t.Fatalf("Token 0 has unexpected value: %s", result.Token(0).Value) + } + if result.Token(1).Value != "This is fine ASCII" { + t.Fatalf("Token 1 has unexpected value: %s", result.Token(0).Value) + } + if result.Token(2).Value != "Åltho hère öt endĩt!" 
{ + t.Fatalf("Token 2 has unexpected value: %s", result.Token(0).Value) + } } func TestGivenNextRuneNotCalled_CallToAcceptPanics(t *testing.T) { @@ -39,21 +112,25 @@ func TestGivenNextRuneNotCalled_CallToAcceptPanics(t *testing.T) { t.Accept() return false }, "test") - RunPanicTest(t, PanicTest{ - func() { parser.Execute("input string") }, - `TokenHandler bug: Accept\(\) was called from .*CallToAcceptPanics.* ` + - `at .*/tokenhandler_test\.go:\d+ without a prior call to NextRune\(\)`}) + assert.Panic(t, assert.PanicT{ + Function: func() { parser.Execute("input string") }, + Regexp: true, + Expect: `parsekit.TokenAPI.Accept\(\): Accept\(\) called without first ` + + `calling NextRune\(\) from .*CallToAcceptPanics.* at /.*_test.go`, + }) } -func TestGivenNextRuneNotCalled_CallToSkipPanics(t *testing.T) { +func TestGivenAcceptNotCalled_CallToNextRunePanics(t *testing.T) { parser := parsekit.NewMatcher(func(t *parsekit.TokenAPI) bool { - t.Skip() + t.NextRune() + t.NextRune() return false }, "test") - RunPanicTest(t, PanicTest{ - func() { parser.Execute("input string") }, - `TokenHandler bug: Skip\(\) was called from .*CallToSkipPanics.* ` + - `at .*tokenhandler_test\.go:\d+ without a prior call to NextRune\(\)`}) + assert.Panic(t, assert.PanicT{ + Function: func() { parser.Execute("input string") }, + Regexp: true, + Expect: `parsekit\.TokenAPI\.NextRune\(\): NextRune\(\) called without ` + + `a prior call to Accept\(\) from .*CallToNextRunePanics.* at /.*/tokenhandler_test.go:\d+`}) } func TestGivenNextRuneReturningNotOk_CallToAcceptPanics(t *testing.T) { @@ -62,19 +139,19 @@ func TestGivenNextRuneReturningNotOk_CallToAcceptPanics(t *testing.T) { t.Accept() return false }, "test") - RunPanicTest(t, PanicTest{ - func() { parser.Execute("\xcd") }, - `TokenHandler bug: Accept\(\) was called from .*CallToAcceptPanics.* ` + - `at .*tokenhandler_test\.go:\d+, but prior call to NextRune\(\) did not ` + - `return OK \(EOF or invalid rune\)`}) + assert.Panic(t, assert.PanicT{ + Function: func() { parser.Execute("") }, + Regexp: true, + Expect: `parsekit\.TokenAPI\.Accept\(\): Accept\(\) called while the previous call to ` + + `NextRune\(\) failed from .*CallToAcceptPanics.* at .*_test\.go:\d+`}) } func TestGivenRootTokenAPI_CallingMergePanics(t *testing.T) { - RunPanicTest(t, PanicTest{ - func() { + assert.Panic(t, assert.PanicT{ + Function: func() { a := parsekit.TokenAPI{} a.Merge() }, - `TokenHandler bug: Cannot call Merge a a non-forked MatchDialog`, + Expect: `parsekit.TokenAPI.Merge(): Cannot call Merge() on a non-forked TokenAPI`, }) } diff --git a/tokenhandlers_builtin.go b/tokenhandlers_builtin.go index ecbe8c8..ea003d5 100644 --- a/tokenhandlers_builtin.go +++ b/tokenhandlers_builtin.go @@ -2,6 +2,9 @@ package parsekit import ( "fmt" + "io" + "runtime" + "strconv" "strings" "unicode" ) @@ -9,6 +12,11 @@ import ( // C provides convenient access to a range of parser/combinators that can be // used to construct TokenHandler functions. // +// Parser/combinators are so called higher order functions that take in one +// or more other TokenHandlers and output a new TokenHandler. They can be +// used to combine TokenHandlers in useful ways to create new more complex +// TokenHandlers. +// // When using C in your own parser, then it is advised to create a variable // to reference it: // @@ -16,11 +24,6 @@ import ( // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
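As a hypothetical illustration of combining the handlers listed below (the variable names follow the "var c = parsekit.C" convention from this comment):

// Illustrative sketch, not part of this patch.
var c, a = parsekit.C, parsekit.A

// signedNumber matches input like "42", "-7" or "+100" by combining
// smaller TokenHandlers into a new one.
var signedNumber = c.Seq(c.Opt(c.Any(a.Plus, a.Minus)), c.OneOrMore(a.Digit))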
var C = struct { - Rune func(rune) TokenHandler - Runes func(...rune) TokenHandler - RuneRange func(rune, rune) TokenHandler - Str func(string) TokenHandler - StrNoCase func(string) TokenHandler Any func(...TokenHandler) TokenHandler Not func(TokenHandler) TokenHandler Opt func(TokenHandler) TokenHandler @@ -31,15 +34,9 @@ var C = struct { ZeroOrMore func(TokenHandler) TokenHandler OneOrMore func(TokenHandler) TokenHandler MinMax func(min int, max int, handler TokenHandler) TokenHandler - Separated func(separated TokenHandler, separator TokenHandler) TokenHandler // TODO reverse args for consistency + Separated func(separated TokenHandler, separator TokenHandler) TokenHandler // TODO reverse args for consistency, us string? Except func(except TokenHandler, handler TokenHandler) TokenHandler - Signed func(TokenHandler) TokenHandler }{ - Rune: MatchRune, - Runes: MatchRunes, - RuneRange: MatchRuneRange, - Str: MatchStr, - StrNoCase: MatchStrNoCase, Opt: MatchOpt, Any: MatchAny, Not: MatchNot, @@ -52,15 +49,217 @@ var C = struct { MinMax: MatchMinMax, Separated: MatchSeparated, Except: MatchExcept, - Signed: MatchSigned, +} + +// A provides convenient access to a range of atoms or functions to build atoms. +// +// When using A in your own parser, then it is advised to create a variable +// to reference it: +// +// var a = parsekit.A +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. +var A = struct { + Rune func(rune) TokenHandler + Runes func(...rune) TokenHandler + RuneRange func(rune, rune) TokenHandler + Str func(string) TokenHandler + StrNoCase func(string) TokenHandler + EndOfFile TokenHandler + AnyRune TokenHandler + Space TokenHandler + Tab TokenHandler + CR TokenHandler + LF TokenHandler + CRLF TokenHandler + Excl TokenHandler + DoubleQuote TokenHandler + Hash TokenHandler + Dollar TokenHandler + Percent TokenHandler + Amp TokenHandler + SingleQuote TokenHandler + RoundOpen TokenHandler + LeftParen TokenHandler + RoundClose TokenHandler + RightParen TokenHandler + Asterisk TokenHandler + Multiply TokenHandler + Plus TokenHandler + Add TokenHandler + Comma TokenHandler + Minus TokenHandler + Subtract TokenHandler + Dot TokenHandler + Slash TokenHandler + Divide TokenHandler + Colon TokenHandler + Semicolon TokenHandler + AngleOpen TokenHandler + LessThan TokenHandler + Equal TokenHandler + AngleClose TokenHandler + GreaterThan TokenHandler + Question TokenHandler + At TokenHandler + SquareOpen TokenHandler + Backslash TokenHandler + SquareClose TokenHandler + Caret TokenHandler + Underscore TokenHandler + Backquote TokenHandler + CurlyOpen TokenHandler + Pipe TokenHandler + CurlyClose TokenHandler + Tilde TokenHandler + Newline TokenHandler + Whitespace TokenHandler + WhitespaceAndNewlines TokenHandler + EndOfLine TokenHandler + Digit TokenHandler + DigitNotZero TokenHandler + Digits TokenHandler + Float TokenHandler + Boolean TokenHandler + Integer TokenHandler + Signed func(TokenHandler) TokenHandler + IntegerBetween func(min int64, max int64) TokenHandler + ASCII TokenHandler + ASCIILower TokenHandler + ASCIIUpper TokenHandler + HexDigit TokenHandler + Octet TokenHandler + IPv4 TokenHandler + IPv4MaskBits TokenHandler +}{ + Rune: MatchRune, + Runes: MatchRunes, + RuneRange: MatchRuneRange, + Str: MatchStr, + StrNoCase: MatchStrNoCase, + EndOfFile: MatchEndOfFile(), + AnyRune: MatchAnyRune(), + Space: MatchRune(' '), + Tab: MatchRune('\t'), + CR: MatchRune('\r'), + LF: MatchRune('\n'), + CRLF: MatchStr("\r\n"), + Excl: MatchRune('!'), + DoubleQuote: 
MatchRune('"'), + Hash: MatchRune('#'), + Dollar: MatchRune('$'), + Percent: MatchRune('%'), + Amp: MatchRune('&'), + SingleQuote: MatchRune('\''), + RoundOpen: MatchRune('('), + LeftParen: MatchRune('('), + RoundClose: MatchRune(')'), + RightParen: MatchRune(')'), + Asterisk: MatchRune('*'), + Multiply: MatchRune('*'), + Plus: MatchRune('+'), + Add: MatchRune('+'), + Comma: MatchRune(','), + Minus: MatchRune('-'), + Subtract: MatchRune('-'), + Dot: MatchRune('.'), + Slash: MatchRune('/'), + Divide: MatchRune('/'), + Colon: MatchRune(':'), + Semicolon: MatchRune(';'), + AngleOpen: MatchRune('<'), + LessThan: MatchRune('<'), + Equal: MatchRune('='), + AngleClose: MatchRune('>'), + GreaterThan: MatchRune('>'), + Question: MatchRune('?'), + At: MatchRune('@'), + SquareOpen: MatchRune('['), + Backslash: MatchRune('\\'), + SquareClose: MatchRune(']'), + Caret: MatchRune('^'), + Underscore: MatchRune('_'), + Backquote: MatchRune('`'), + CurlyOpen: MatchRune('{'), + Pipe: MatchRune('|'), + CurlyClose: MatchRune('}'), + Tilde: MatchRune('~'), + Whitespace: MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t'))), + WhitespaceAndNewlines: MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t'), MatchStr("\r\n"), MatchRune('\n'))), + EndOfLine: MatchAny(MatchStr("\r\n"), MatchRune('\n'), MatchEndOfFile()), + Digit: MatchDigit(), + DigitNotZero: MatchDigitNotZero(), + Digits: MatchDigits(), + Integer: MatchInteger(), + Signed: MatchSigned, + IntegerBetween: MatchIntegerBetween, + Float: MatchFloat(), + Boolean: MatchBoolean(), + ASCII: MatchRuneRange('\x00', '\x7F'), + ASCIILower: MatchRuneRange('a', 'z'), + ASCIIUpper: MatchRuneRange('A', 'Z'), + HexDigit: MatchAny(MatchRuneRange('0', '9'), MatchRuneRange('a', 'f'), MatchRuneRange('A', 'F')), + Octet: MatchOctet(false), + IPv4: MatchIPv4(), + IPv4MaskBits: MatchIntegerBetween(0, 32), +} + +// T provides convenient access to a range of Token producers (which in their +// nature are parser/combinators) that can be used when creating TokenHandler +// functions. +// +// When using T in your own parser, then it is advised to create a variable +// to reference it: +// +// var t = parsekit.T +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
+var T = struct { + StrLiteral func(interface{}, TokenHandler) TokenHandler + StrInterpreted func(interface{}, TokenHandler) TokenHandler + Byte func(interface{}, TokenHandler) TokenHandler + Rune func(interface{}, TokenHandler) TokenHandler + Int func(interface{}, TokenHandler) TokenHandler + Int8 func(interface{}, TokenHandler) TokenHandler + Int16 func(interface{}, TokenHandler) TokenHandler + Int32 func(interface{}, TokenHandler) TokenHandler + Int64 func(interface{}, TokenHandler) TokenHandler + Uint func(interface{}, TokenHandler) TokenHandler + Uint8 func(interface{}, TokenHandler) TokenHandler + Uint16 func(interface{}, TokenHandler) TokenHandler + Uint32 func(interface{}, TokenHandler) TokenHandler + Uint64 func(interface{}, TokenHandler) TokenHandler + Float32 func(interface{}, TokenHandler) TokenHandler + Float64 func(interface{}, TokenHandler) TokenHandler + Boolean func(interface{}, TokenHandler) TokenHandler + ByCallback func(TokenHandler, func(t *TokenAPI) *Token) TokenHandler +}{ + StrLiteral: MakeStrLiteralToken, + StrInterpreted: MakeStrInterpretedToken, + Byte: MakeByteToken, + Rune: MakeRuneToken, + Int: MakeIntToken, + Int8: MakeInt8Token, + Int16: MakeInt16Token, + Int32: MakeInt32Token, + Int64: MakeInt64Token, + Uint: MakeUintToken, + Uint8: MakeUint8Token, + Uint16: MakeUint16Token, + Uint32: MakeUint32Token, + Uint64: MakeUint64Token, + Float32: MakeFloat32Token, + Float64: MakeFloat64Token, + Boolean: MakeBooleanToken, + ByCallback: MakeTokenByCallback, } // MatchRune creates a TokenHandler function that checks if the next rune from // the input matches the provided rune. func MatchRune(expected rune) TokenHandler { return func(t *TokenAPI) bool { - input, ok := t.NextRune() - if ok && input == expected { + input, err := t.NextRune() + if err == nil && input == expected { t.Accept() return true } @@ -73,8 +272,8 @@ func MatchRune(expected rune) TokenHandler { func MatchRunes(expected ...rune) TokenHandler { s := string(expected) return func(t *TokenAPI) bool { - input, ok := t.NextRune() - if ok { + input, err := t.NextRune() + if err == nil { if strings.ContainsRune(s, input) { t.Accept() return true @@ -97,8 +296,8 @@ func MatchRuneRange(start rune, end rune) TokenHandler { panic(fmt.Sprintf("TokenHandler bug: MatchRuneRange definition error: start %q must not be < end %q", start, end)) } return func(t *TokenAPI) bool { - input, ok := t.NextRune() - if ok && input >= start && input <= end { + input, err := t.NextRune() + if err == nil && input >= start && input <= end { t.Accept() return true } @@ -167,7 +366,8 @@ func MatchAny(handlers ...TokenHandler) TokenHandler { for _, handler := range handlers { child := t.Fork() if handler(child) { - return child.Merge() + child.Merge() + return true } } return false @@ -183,8 +383,8 @@ func MatchNot(handler TokenHandler) TokenHandler { if handler(probe) { return false } - _, ok := t.NextRune() - if ok { + _, err := t.NextRune() + if err == nil { t.Accept() return true } @@ -311,138 +511,24 @@ func MatchSigned(handler TokenHandler) TokenHandler { return MatchSeq(sign, handler) } -// A provides convenient access to a range of atoms that can be used to -// build TokenHandlers or parser rules. -// -// In parsekit, an atom is defined as a ready for use TokenHandler function. -// -// When using A in your own parser, then it is advised to create a variable -// to reference it: -// -// var a = parsekit.A -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
-var A = struct { - EndOfFile TokenHandler - AnyRune TokenHandler - Space TokenHandler - Tab TokenHandler - CR TokenHandler - LF TokenHandler - CRLF TokenHandler - Excl TokenHandler - DoubleQuote TokenHandler - Hash TokenHandler - Dollar TokenHandler - Percent TokenHandler - Amp TokenHandler - SingleQuote TokenHandler - RoundOpen TokenHandler - LeftParen TokenHandler - RoundClose TokenHandler - RightParen TokenHandler - Asterisk TokenHandler - Multiply TokenHandler - Plus TokenHandler - Add TokenHandler - Comma TokenHandler - Minus TokenHandler - Subtract TokenHandler - Dot TokenHandler - Slash TokenHandler - Divide TokenHandler - Colon TokenHandler - Semicolon TokenHandler - AngleOpen TokenHandler - LessThan TokenHandler - Equal TokenHandler - AngleClose TokenHandler - GreaterThan TokenHandler - Question TokenHandler - At TokenHandler - SquareOpen TokenHandler - Backslash TokenHandler - SquareClose TokenHandler - Caret TokenHandler - Underscore TokenHandler - Backquote TokenHandler - CurlyOpen TokenHandler - Pipe TokenHandler - CurlyClose TokenHandler - Tilde TokenHandler - Newline TokenHandler - Whitespace TokenHandler - WhitespaceAndNewlines TokenHandler - EndOfLine TokenHandler - Digit TokenHandler - DigitNotZero TokenHandler - Digits TokenHandler - Float TokenHandler - Integer TokenHandler - ASCII TokenHandler - ASCIILower TokenHandler - ASCIIUpper TokenHandler - HexDigit TokenHandler -}{ - EndOfFile: MatchEndOfFile(), - AnyRune: MatchAnyRune(), - Space: C.Rune(' '), - Tab: C.Rune('\t'), - CR: C.Rune('\r'), - LF: C.Rune('\n'), - CRLF: C.Str("\r\n"), - Excl: C.Rune('!'), - DoubleQuote: C.Rune('"'), - Hash: C.Rune('#'), - Dollar: C.Rune('$'), - Percent: C.Rune('%'), - Amp: C.Rune('&'), - SingleQuote: C.Rune('\''), - RoundOpen: C.Rune('('), - LeftParen: C.Rune('('), - RoundClose: C.Rune(')'), - RightParen: C.Rune(')'), - Asterisk: C.Rune('*'), - Multiply: C.Rune('*'), - Plus: C.Rune('+'), - Add: C.Rune('+'), - Comma: C.Rune(','), - Minus: C.Rune('-'), - Subtract: C.Rune('-'), - Dot: C.Rune('.'), - Slash: C.Rune('/'), - Divide: C.Rune('/'), - Colon: C.Rune(':'), - Semicolon: C.Rune(';'), - AngleOpen: C.Rune('<'), - LessThan: C.Rune('<'), - Equal: C.Rune('='), - AngleClose: C.Rune('>'), - GreaterThan: C.Rune('>'), - Question: C.Rune('?'), - At: C.Rune('@'), - SquareOpen: C.Rune('['), - Backslash: C.Rune('\\'), - SquareClose: C.Rune(']'), - Caret: C.Rune('^'), - Underscore: C.Rune('_'), - Backquote: C.Rune('`'), - CurlyOpen: C.Rune('{'), - Pipe: C.Rune('|'), - CurlyClose: C.Rune('}'), - Tilde: C.Rune('~'), - Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), - WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), - EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), - Digit: MatchDigit(), - DigitNotZero: MatchDigitNotZero(), - Digits: MatchDigits(), - Integer: MatchInteger(), - Float: MatchFloat(), - ASCII: C.RuneRange('\x00', '\x7F'), - ASCIILower: C.RuneRange('a', 'z'), - ASCIIUpper: C.RuneRange('A', 'Z'), - HexDigit: C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), +// MatchIntegerBetween creates a TokenHandler that checks for an integer +// value between the provided min and max boundaries (inclusive). +// It uses an int64 for checking internally, so you can check values +// ranging from -9223372036854775808 to 9223372036854775807. 
+func MatchIntegerBetween(min int64, max int64) TokenHandler { + digits := MatchSigned(MatchDigits()) + return func(t *TokenAPI) bool { + fork := t.Fork() + if !digits(fork) { + return false + } + value, _ := strconv.ParseInt(fork.Result().String(), 10, 64) + if value < min || value > max { + return false + } + fork.Merge() + return true + } } // MatchEndOfFile creates a TokenHandler that checks if the end of the input data @@ -451,8 +537,8 @@ var A = struct { func MatchEndOfFile() TokenHandler { return func(t *TokenAPI) bool { fork := t.Fork() - input, ok := fork.NextRune() - return !ok && input == eofRune + _, err := fork.NextRune() + return err == io.EOF } } @@ -461,8 +547,8 @@ func MatchEndOfFile() TokenHandler { // input has not yet been reached and the upcoming input is a valid UTF8 rune. func MatchAnyRune() TokenHandler { return func(t *TokenAPI) bool { - _, ok := t.NextRune() - if ok { + _, err := t.NextRune() + if err == nil { t.Accept() return true } @@ -494,7 +580,7 @@ func MatchDigitNotZero() TokenHandler { // hexadecimal. func MatchInteger() TokenHandler { justZero := MatchRune('0') - integer := C.Seq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit())) + integer := MatchSeq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit())) return MatchAny(integer, justZero) } @@ -506,6 +592,56 @@ func MatchFloat() TokenHandler { return MatchSeq(digits, MatchOpt(MatchSeq(MatchRune('.'), digits))) } +// MatchBoolean creates a TokenHandler function that checks if a valid boolean +// value can be read from the input. It supports the boolean values as understood +// by Go's strconv.ParseBool() function. +func MatchBoolean() TokenHandler { + trues := MatchAny(MatchStr("true"), MatchStr("TRUE"), MatchStr("True"), MatchRune('1'), MatchRune('t'), MatchRune('T')) + falses := MatchAny(MatchStr("false"), MatchStr("FALSE"), MatchStr("False"), MatchRune('0'), MatchRune('f'), MatchRune('F')) + return MatchAny(trues, falses) +} + +// MatchOctet creates a TokenHandler function that checks if a valid octet value +// can be read from the input (octet = byte value representation, with a value +// between 0 and 255 inclusive). It only looks at the first 1 to 3 upcoming +// digits, not if there's a non-digit after it, meaning that "123255" would be +// a valid sequence of two octets. +// +// When the normalize parameter is set to true, then leading zeroes will be +// stripped from the octet. +func MatchOctet(normalize bool) TokenHandler { + digits := MatchMinMax(1, 3, MatchDigit()) + return func(t *TokenAPI) bool { + fork := t.Fork() + if !digits(fork) { + return false + } + value, _ := strconv.ParseInt(fork.Result().String(), 10, 16) + if value <= 255 { + if normalize { + runes := fork.Result().Runes() + for len(runes) > 1 && runes[0] == '0' { + runes = runes[1:] + } + fork.Result().SetRunes(runes) + } + fork.Merge() + return true + } + return false + } +} + +// MatchIPv4 creates a TokenHandler function that checks if a valid IPv4 +// IP address value can be read from the input. +// It will normalize IP-addresses that look like "192.168.001.012" to +// "192.168.1.12". +func MatchIPv4() TokenHandler { + octet := MatchOctet(true) + dot := MatchRune('.') + return MatchSeq(octet, dot, octet, dot, octet, dot, octet) +} + // M provides convenient access to a range of modifiers (which in their nature are // parser/combinators) that can be used when creating TokenHandler functions. // @@ -520,25 +656,25 @@ func MatchFloat() TokenHandler { // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
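A hypothetical sketch of the modifiers listed below, in the style of the other examples in this file:

// Illustrative sketch, not part of this patch.
var c, a, m = parsekit.C, parsekit.A, parsekit.M

// assignment matches input like "  =  ", but drops the whitespace and the
// '=' sign from the output, so they do not end up in the parse result.
var assignment = m.Drop(c.Seq(c.Opt(a.Whitespace), a.Equal, c.Opt(a.Whitespace)))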
var M = struct { - Drop func(TokenHandler) TokenHandler - Trim func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? - TrimLeft func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? - TrimRight func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? - TrimSpace func(handler TokenHandler) TokenHandler - ToLower func(TokenHandler) TokenHandler - ToUpper func(TokenHandler) TokenHandler - Replace func(handler TokenHandler, replaceWith string) TokenHandler // TODO reverse arguments? - ModifyByCallback func(TokenHandler, func(string) string) TokenHandler + Drop func(TokenHandler) TokenHandler + Trim func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? + TrimLeft func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? + TrimRight func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? + TrimSpace func(handler TokenHandler) TokenHandler + ToLower func(TokenHandler) TokenHandler + ToUpper func(TokenHandler) TokenHandler + Replace func(handler TokenHandler, replaceWith string) TokenHandler // TODO reverse arguments? + ByCallback func(TokenHandler, func(string) string) TokenHandler }{ - Drop: ModifyDrop, - Trim: ModifyTrim, - TrimLeft: ModifyTrimLeft, - TrimRight: ModifyTrimRight, - TrimSpace: ModifyTrimSpace, - ToLower: ModifyToLower, - ToUpper: ModifyToUpper, - Replace: ModifyReplace, - ModifyByCallback: ModifyByCallback, + Drop: ModifyDrop, + Trim: ModifyTrim, + TrimLeft: ModifyTrimLeft, + TrimRight: ModifyTrimRight, + TrimSpace: ModifyTrimSpace, + ToLower: ModifyToLower, + ToUpper: ModifyToUpper, + Replace: ModifyReplace, + ByCallback: ModifyByCallback, } // ModifyDrop creates a TokenHandler that checks if the provided TokenHandler applies. 
@@ -635,11 +771,222 @@ func ModifyByCallback(handler TokenHandler, modfunc func(string) string) TokenHa return func(t *TokenAPI) bool { child := t.Fork() if handler(child) { - s := modfunc(string(child.output)) - child.output = []rune(s) + s := modfunc(child.Result().String()) + child.Result().SetRunes(s) child.Merge() return true } return false } } + +func MakeStrLiteralToken(toktype interface{}, handler TokenHandler) TokenHandler { + return MakeTokenByCallback(handler, func(t *TokenAPI) *Token { + literal := t.Result().String() + return &Token{Type: toktype, Runes: t.Result().Runes(), Value: literal} + }) +} + +func MakeStrInterpretedToken(toktype interface{}, handler TokenHandler) TokenHandler { + return MakeTokenByCallback(handler, func(t *TokenAPI) *Token { + // TODO ERROR HANDLING + interpreted, _ := interpretString(t.Result().String()) + return &Token{Type: toktype, Runes: t.Result().Runes(), Value: interpreted} + }) +} + +func MakeRuneToken(toktype interface{}, handler TokenHandler) TokenHandler { + return MakeTokenByCallback(handler, func(t *TokenAPI) *Token { + // TODO ERROR HANDLING --- not a 1 rune input + return &Token{Type: toktype, Runes: t.Result().Runes(), Value: t.Result().Rune(0)} + }) +} + +func MakeByteToken(toktype interface{}, handler TokenHandler) TokenHandler { + return MakeTokenByCallback(handler, func(t *TokenAPI) *Token { + // TODO ERROR HANDLING --- not a 1 byte input + return &Token{Type: toktype, Runes: t.Result().Runes(), Value: byte(t.Result().Rune(0))} + }) +} + +func interpretString(str string) (string, error) { + var sb strings.Builder + for len(str) > 0 { + r, _, remainder, err := strconv.UnquoteChar(str, '"') + if err != nil { + return sb.String(), err + } + str = remainder + sb.WriteRune(r) + } + return sb.String(), nil +} + +func MakeIntToken(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, func(s string) (interface{}, error) { + return strconv.Atoi(s) + }) +} + +// TODO allow other Go types for oct and hex too. +func MakeInt8Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseInt(s, 10, 8) + if err == nil { + return int8(value), err + } + return value, err + }) +} + +func MakeInt16Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseInt(s, 10, 16) + if err == nil { + return int16(value), err + } + return value, err + }) +} + +func MakeInt32Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseInt(s, 10, 32) + if err == nil { + return int32(value), err + } + return value, err + }) +} + +func MakeInt64Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseInt(s, 10, 64) + if err == nil { + return int64(value), err + } + return value, err + }) +} + +func MakeUintToken(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseUint(s, 10, 0) + if err == nil { + return uint(value), err + } + return value, err + }) +} + +// TODO allow other Go types for oct and hex too. 
+func MakeUint8Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseUint(s, 10, 8) + if err == nil { + return uint8(value), err + } + return value, err + }) +} + +func MakeUint16Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseUint(s, 10, 16) + if err == nil { + return uint16(value), err + } + return value, err + }) +} + +func MakeUint32Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseUint(s, 10, 32) + if err == nil { + return uint32(value), err + } + return value, err + }) +} + +func MakeUint64Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseUint(s, 10, 64) + if err == nil { + return uint64(value), err + } + return value, err + }) +} + +func MakeFloat32Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseFloat(s, 32) + if err == nil { + return float32(value), err + } + return value, err + }) +} + +func MakeFloat64Token(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseFloat(s, 64) + if err == nil { + return float64(value), err + } + return value, err + }) +} + +func MakeBooleanToken(toktype interface{}, handler TokenHandler) TokenHandler { + return makeStrconvToken(toktype, handler, + func(s string) (interface{}, error) { + value, err := strconv.ParseBool(s) + if err == nil { + return bool(value), err + } + return value, err + }) +} + +func makeStrconvToken(toktype interface{}, handler TokenHandler, convert func(s string) (interface{}, error)) TokenHandler { + pc, _, _, _ := runtime.Caller(1) + fullName := runtime.FuncForPC(pc).Name() + parts := strings.Split(fullName, ".") + name := parts[len(parts)-1] + return MakeTokenByCallback(handler, func(t *TokenAPI) *Token { + value, err := convert(t.Result().String()) + if err != nil { + panic(fmt.Sprintf( + "TokenHandler error: %s cannot handle input %q: %s "+ + "(only use a type conversion token maker, when the input has been "+ + "validated on beforehand)", name, t.Result().String(), err)) + } + return &Token{Type: toktype, Runes: t.Result().Runes(), Value: value} + }) +} + +func MakeTokenByCallback(handler TokenHandler, callback func(t *TokenAPI) *Token) TokenHandler { + return func(t *TokenAPI) bool { + fork := t.Fork() + if handler(fork) { + t.Result().AddToken(callback(fork)) + fork.Merge() + return true + } + return false + } +} diff --git a/tokenhandlers_builtin_test.go b/tokenhandlers_builtin_test.go index 4fc6cf6..27d08b8 100644 --- a/tokenhandlers_builtin_test.go +++ b/tokenhandlers_builtin_test.go @@ -9,72 +9,57 @@ import ( func TestCombinators(t *testing.T) { RunTokenHandlerTests(t, []TokenHandlerTest{ - {"xxx", c.Rune('x'), true, "x"}, - {"x ", c.Rune(' '), false, ""}, - {"aa", c.RuneRange('b', 'e'), false, ""}, - {"bb", c.RuneRange('b', 'e'), true, "b"}, - {"cc", c.RuneRange('b', 'e'), true, "c"}, - {"dd", c.RuneRange('b', 'e'), true, "d"}, - {"ee", c.RuneRange('b', 'e'), true, "e"}, - 
{"ff", c.RuneRange('b', 'e'), false, ""}, - {"Hello, world!", c.Str("Hello"), true, "Hello"}, - {"HellÖ, world!", c.StrNoCase("hellö"), true, "HellÖ"}, - {"+X", c.Runes('+', '-', '*', '/'), true, "+"}, - {"-X", c.Runes('+', '-', '*', '/'), true, "-"}, - {"*X", c.Runes('+', '-', '*', '/'), true, "*"}, - {"/X", c.Runes('+', '-', '*', '/'), true, "/"}, - {"!X", c.Runes('+', '-', '*', '/'), false, ""}, - {"abc", c.Not(c.Rune('b')), true, "a"}, - {"bcd", c.Not(c.Rune('b')), false, ""}, - {"bcd", c.Not(c.Rune('b')), false, ""}, - {"1010", c.Not(c.Seq(c.Rune('2'), c.Rune('0'))), true, "1"}, - {"2020", c.Not(c.Seq(c.Rune('2'), c.Rune('0'))), false, ""}, - {"abc", c.Any(c.Rune('a'), c.Rune('b')), true, "a"}, - {"bcd", c.Any(c.Rune('a'), c.Rune('b')), true, "b"}, - {"cde", c.Any(c.Rune('a'), c.Rune('b')), false, ""}, - {"ababc", c.Rep(4, c.Runes('a', 'b')), true, "abab"}, - {"ababc", c.Rep(5, c.Runes('a', 'b')), false, ""}, - {"", c.Min(0, c.Rune('a')), true, ""}, - {"a", c.Min(0, c.Rune('a')), true, "a"}, - {"aaaaa", c.Min(4, c.Rune('a')), true, "aaaaa"}, - {"aaaaa", c.Min(5, c.Rune('a')), true, "aaaaa"}, - {"aaaaa", c.Min(6, c.Rune('a')), false, ""}, - {"", c.Max(4, c.Rune('b')), true, ""}, - {"X", c.Max(4, c.Rune('b')), true, ""}, - {"bbbbbX", c.Max(4, c.Rune('b')), true, "bbbb"}, - {"bbbbbX", c.Max(5, c.Rune('b')), true, "bbbbb"}, - {"bbbbbX", c.Max(6, c.Rune('b')), true, "bbbbb"}, - {"", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"X", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"cccc", c.MinMax(0, 5, c.Rune('c')), true, "cccc"}, - {"ccccc", c.MinMax(0, 5, c.Rune('c')), true, "ccccc"}, - {"cccccc", c.MinMax(0, 5, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(0, 0, c.Rune('c')), true, ""}, - {"cccccX", c.MinMax(0, 1, c.Rune('c')), true, "c"}, - {"cccccX", c.MinMax(0, 5, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(0, 6, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(1, 1, c.Rune('c')), true, "c"}, - {"", c.MinMax(1, 1, c.Rune('c')), false, ""}, - {"X", c.MinMax(1, 1, c.Rune('c')), false, ""}, - {"cccccX", c.MinMax(1, 3, c.Rune('c')), true, "ccc"}, - {"cccccX", c.MinMax(1, 6, c.Rune('c')), true, "ccccc"}, - {"cccccX", c.MinMax(3, 4, c.Rune('c')), true, "cccc"}, - {"", c.OneOrMore(c.Rune('d')), false, ""}, - {"X", c.OneOrMore(c.Rune('d')), false, ""}, - {"dX", c.OneOrMore(c.Rune('d')), true, "d"}, - {"dddddX", c.OneOrMore(c.Rune('d')), true, "ddddd"}, - {"", c.ZeroOrMore(c.Rune('e')), true, ""}, - {"X", c.ZeroOrMore(c.Rune('e')), true, ""}, - {"eX", c.ZeroOrMore(c.Rune('e')), true, "e"}, - {"eeeeeX", c.ZeroOrMore(c.Rune('e')), true, "eeeee"}, - {"Hello, world!X", c.Seq(c.Str("Hello"), a.Comma, a.Space, c.Str("world"), a.Excl), true, "Hello, world!"}, - {"101010123", c.OneOrMore(c.Seq(c.Rune('1'), c.Rune('0'))), true, "101010"}, - {"", c.Opt(c.OneOrMore(c.Rune('f'))), true, ""}, - {"ghijkl", c.Opt(c.Rune('h')), true, ""}, - {"ghijkl", c.Opt(c.Rune('g')), true, "g"}, - {"fffffX", c.Opt(c.OneOrMore(c.Rune('f'))), true, "fffff"}, + {"abc", c.Not(a.Rune('b')), true, "a"}, + {"bcd", c.Not(a.Rune('b')), false, ""}, + {"bcd", c.Not(a.Rune('b')), false, ""}, + {"1010", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"}, + {"2020", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""}, + {"abc", c.Any(a.Rune('a'), a.Rune('b')), true, "a"}, + {"bcd", c.Any(a.Rune('a'), a.Rune('b')), true, "b"}, + {"cde", c.Any(a.Rune('a'), a.Rune('b')), false, ""}, + {"ababc", c.Rep(4, a.Runes('a', 'b')), true, "abab"}, + {"ababc", c.Rep(5, a.Runes('a', 'b')), false, ""}, + {"", c.Min(0, a.Rune('a')), true, ""}, 
+ {"a", c.Min(0, a.Rune('a')), true, "a"}, + {"aaaaa", c.Min(4, a.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(5, a.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(6, a.Rune('a')), false, ""}, + {"", c.Max(4, a.Rune('b')), true, ""}, + {"X", c.Max(4, a.Rune('b')), true, ""}, + {"bbbbbX", c.Max(4, a.Rune('b')), true, "bbbb"}, + {"bbbbbX", c.Max(5, a.Rune('b')), true, "bbbbb"}, + {"bbbbbX", c.Max(6, a.Rune('b')), true, "bbbbb"}, + {"", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"X", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"cccc", c.MinMax(0, 5, a.Rune('c')), true, "cccc"}, + {"ccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"cccccX", c.MinMax(0, 1, a.Rune('c')), true, "c"}, + {"cccccX", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(0, 6, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(1, 1, a.Rune('c')), true, "c"}, + {"", c.MinMax(1, 1, a.Rune('c')), false, ""}, + {"X", c.MinMax(1, 1, a.Rune('c')), false, ""}, + {"cccccX", c.MinMax(1, 3, a.Rune('c')), true, "ccc"}, + {"cccccX", c.MinMax(1, 6, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(3, 4, a.Rune('c')), true, "cccc"}, + {"", c.OneOrMore(a.Rune('d')), false, ""}, + {"X", c.OneOrMore(a.Rune('d')), false, ""}, + {"dX", c.OneOrMore(a.Rune('d')), true, "d"}, + {"dddddX", c.OneOrMore(a.Rune('d')), true, "ddddd"}, + {"", c.ZeroOrMore(a.Rune('e')), true, ""}, + {"X", c.ZeroOrMore(a.Rune('e')), true, ""}, + {"eX", c.ZeroOrMore(a.Rune('e')), true, "e"}, + {"eeeeeX", c.ZeroOrMore(a.Rune('e')), true, "eeeee"}, + {"Hello, world!X", c.Seq(a.Str("Hello"), a.Comma, a.Space, a.Str("world"), a.Excl), true, "Hello, world!"}, + {"101010123", c.OneOrMore(c.Seq(a.Rune('1'), a.Rune('0'))), true, "101010"}, + {"", c.Opt(c.OneOrMore(a.Rune('f'))), true, ""}, + {"ghijkl", c.Opt(a.Rune('h')), true, ""}, + {"ghijkl", c.Opt(a.Rune('g')), true, "g"}, + {"fffffX", c.Opt(c.OneOrMore(a.Rune('f'))), true, "fffff"}, {"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"}, - {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, c.Rune('x'), c.Rep(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, + {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Rep(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, {" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""}, {" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""}, {" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""}, @@ -83,27 +68,42 @@ func TestCombinators(t *testing.T) { func TestCombinatorPanics(t *testing.T) { RunPanicTests(t, []PanicTest{ - {func() { parsekit.C.RuneRange('z', 'a') }, + {func() { a.RuneRange('z', 'a') }, "TokenHandler bug: MatchRuneRange definition error: start 'z' must not be < end 'a'"}, - {func() { parsekit.C.MinMax(-1, 1, parsekit.A.Space) }, + {func() { c.MinMax(-1, 1, parsekit.A.Space) }, "TokenHandler bug: MatchMinMax definition error: min must be >= 0"}, - {func() { parsekit.C.MinMax(1, -1, parsekit.A.Space) }, + {func() { c.MinMax(1, -1, parsekit.A.Space) }, "TokenHandler bug: MatchMinMax definition error: max must be >= 0"}, - {func() { parsekit.C.MinMax(10, 5, parsekit.A.Space) }, + {func() { c.MinMax(10, 5, parsekit.A.Space) }, "TokenHandler bug: MatchMinMax definition error: max 5 must not be < min 10"}, - {func() { parsekit.C.Min(-10, parsekit.A.Space) }, + {func() { c.Min(-10, parsekit.A.Space) }, "TokenHandler bug: MatchMin definition error: min must be >= 0"}, - {func() { parsekit.C.Max(-42, 
parsekit.A.Space) }, + {func() { c.Max(-42, parsekit.A.Space) }, "TokenHandler bug: MatchMax definition error: max must be >= 0"}, }) } func TestAtoms(t *testing.T) { RunTokenHandlerTests(t, []TokenHandlerTest{ + {"dd", a.RuneRange('b', 'e'), true, "d"}, + {"ee", a.RuneRange('b', 'e'), true, "e"}, + {"ff", a.RuneRange('b', 'e'), false, ""}, + {"Hello, world!", a.Str("Hello"), true, "Hello"}, + {"HellÖ, world!", a.StrNoCase("hellö"), true, "HellÖ"}, + {"+X", a.Runes('+', '-', '*', '/'), true, "+"}, + {"-X", a.Runes('+', '-', '*', '/'), true, "-"}, + {"*X", a.Runes('+', '-', '*', '/'), true, "*"}, + {"/X", a.Runes('+', '-', '*', '/'), true, "/"}, + {"!X", a.Runes('+', '-', '*', '/'), false, ""}, + {"xxx", a.Rune('x'), true, "x"}, + {"x ", a.Rune(' '), false, ""}, + {"aa", a.RuneRange('b', 'e'), false, ""}, + {"bb", a.RuneRange('b', 'e'), true, "b"}, + {"cc", a.RuneRange('b', 'e'), true, "c"}, {"", a.EndOfFile, true, ""}, {"⌘", a.AnyRune, true, "⌘"}, - {"\xbc", a.AnyRune, false, ""}, // invalid UTF8 rune - {"", a.AnyRune, false, ""}, // end of file + {"\xbc", a.AnyRune, true, "�"}, // invalid UTF8 rune + {"", a.AnyRune, false, ""}, // false is for end of file {" ", a.Space, true, " "}, {"X", a.Space, false, ""}, {"\t", a.Tab, true, "\t"}, @@ -187,32 +187,128 @@ func TestAtoms(t *testing.T) { {"1", a.Integer, true, "1"}, {"-10X", a.Integer, false, ""}, {"+10X", a.Integer, false, ""}, - {"-10X", c.Signed(a.Integer), true, "-10"}, - {"+10X", c.Signed(a.Integer), true, "+10"}, - {"+10.1X", c.Signed(a.Integer), true, "+10"}, + {"-10X", a.Signed(a.Integer), true, "-10"}, + {"+10X", a.Signed(a.Integer), true, "+10"}, + {"+10.1X", a.Signed(a.Integer), true, "+10"}, {"0X", a.Float, true, "0"}, {"0X", a.Float, true, "0"}, {"1X", a.Float, true, "1"}, {"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up {"123.321X", a.Float, true, "123.321"}, {"-3.14X", a.Float, false, ""}, - {"-3.14X", c.Signed(a.Float), true, "-3.14"}, - {"-003.0014X", c.Signed(a.Float), true, "-003.0014"}, + {"-3.14X", a.Signed(a.Float), true, "-3.14"}, + {"-003.0014X", a.Signed(a.Float), true, "-003.0014"}, + {"0X", a.Octet, true, "0"}, + {"00X", a.Octet, true, "00"}, + {"000X", a.Octet, true, "000"}, + {"10X", a.Octet, true, "10"}, + {"010X", a.Octet, true, "010"}, + {"255123", a.Octet, true, "255"}, + {"256123", a.Octet, false, ""}, + {"300", a.Octet, false, ""}, + {"0.0.0.0", a.IPv4, true, "0.0.0.0"}, + {"10.20.30.40", a.IPv4, true, "10.20.30.40"}, + {"010.020.003.004", a.IPv4, true, "10.20.3.4"}, + {"255.255.255.255", a.IPv4, true, "255.255.255.255"}, + {"256.255.255.255", a.IPv4, false, ""}, + {"0", a.IPv4MaskBits, true, "0"}, + {"32", a.IPv4MaskBits, true, "32"}, + {"33", a.IPv4MaskBits, false, "0"}, + {"-11", a.IntegerBetween(-10, 10), false, "0"}, + {"-10", a.IntegerBetween(-10, 10), true, "-10"}, + {"0", a.IntegerBetween(-10, 10), true, "0"}, + {"10", a.IntegerBetween(-10, 10), true, "10"}, + {"11", a.IntegerBetween(0, 10), false, ""}, }) } func TestModifiers(t *testing.T) { RunTokenHandlerTests(t, []TokenHandlerTest{ - {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), c.Str("cool")), true, "cool"}, + {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"}, {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, {" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, {" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), 
true, " \t trim"}, {"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"}, - {"abcdefghijk", m.ModifyByCallback(c.Str("abc"), func(s string) string { return "X" }), true, "X"}, - {"NoTaLlUpPeR", m.ToUpper(c.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, - {"NoTaLlLoWeR", m.ToLower(c.StrNoCase("NOTALLlower")), true, "notalllower"}, + {"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"}, + {"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, + {"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"}, + }) +} + +// When a TokenMaker encounters an error, this is considered a programmer error. +// A TokenMaker should not be called, unless the input is already validated to +// follow the correct pattern. Therefore, tokenmakers will panic when the +// input cannot be processed successfully. +func TestTokenMakerErrorHandling(t *testing.T) { + invalid := tok.Boolean("BOOL", a.Str("no")) // not valid for strconv.ParseBool() + parser := parsekit.NewMatcher(invalid, "boolean") + RunPanicTest(t, PanicTest{ + func() { parser.Execute("no") }, + `TokenHandler error: MakeBooleanToken cannot handle input "no": strconv.ParseBool: parsing "no": ` + + `invalid syntax \(only use a type conversion token maker, when the input has been validated on beforehand\)`, + }) +} + +func TestTokenMakers(t *testing.T) { + RunTokenMakerTests(t, []TokenMakerTest{ + {`empty token`, tok.StrLiteral("A", c.ZeroOrMore(a.Digit)), + []parsekit.Token{{Type: "A", Runes: []rune(""), Value: ""}}}, + + {`Ѝюج literal \string`, tok.StrLiteral("B", c.OneOrMore(a.AnyRune)), + []parsekit.Token{{Type: "B", Runes: []rune(`Ѝюج literal \string`), Value: `Ѝюج literal \string`}}}, + + {`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)), + []parsekit.Token{{Type: "C", Runes: []rune(`Ѝюجinterpreted \n string \u2318`), Value: "Ѝюجinterpreted \n string ⌘"}}}, + + {"Ø*", tok.Byte("Q", a.AnyRune), []parsekit.Token{{Type: "Q", Runes: []rune("Ø"), Value: byte('Ø')}}}, + {"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []parsekit.Token{ + {Type: "bar", Runes: []rune("R"), Value: byte('R')}, + {Type: "bar", Runes: []rune("O"), Value: byte('O')}, + {Type: "bar", Runes: []rune("C"), Value: byte('C')}, + {Type: "bar", Runes: []rune("K"), Value: byte('K')}, + {Type: "bar", Runes: []rune("S"), Value: byte('S')}, + }}, + + {"Ø*", tok.Rune("P", a.AnyRune), []parsekit.Token{{Type: "P", Runes: []rune("Ø"), Value: rune('Ø')}}}, + + {`2147483647XYZ`, tok.Int("D", a.Integer), []parsekit.Token{{Type: "D", Runes: []rune("2147483647"), Value: int(2147483647)}}}, + {`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []parsekit.Token{{Type: "D", Runes: []rune("-2147483647"), Value: int(-2147483647)}}}, + {`127XYZ`, tok.Int8("E", a.Integer), []parsekit.Token{{Type: "E", Runes: []rune("127"), Value: int8(127)}}}, + {`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []parsekit.Token{{Type: "E", Runes: []rune("-127"), Value: int8(-127)}}}, + {`32767XYZ`, tok.Int16("F", a.Integer), []parsekit.Token{{Type: "F", Runes: []rune("32767"), Value: int16(32767)}}}, + {`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []parsekit.Token{{Type: "F", Runes: []rune("-32767"), Value: int16(-32767)}}}, + {`2147483647XYZ`, tok.Int32("G", a.Integer), []parsekit.Token{{Type: "G", Runes: []rune("2147483647"), Value: int32(2147483647)}}}, + {`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []parsekit.Token{{Type: "G", Runes: []rune("-2147483647"), Value: 
int32(-2147483647)}}}, + {`-9223372036854775807XYZ`, tok.Int64("H", a.Signed(a.Integer)), []parsekit.Token{{Type: "H", Runes: []rune("-9223372036854775807"), Value: int64(-9223372036854775807)}}}, + + {`4294967295`, tok.Uint("I", a.Integer), []parsekit.Token{{Type: "I", Runes: []rune("4294967295"), Value: uint(4294967295)}}}, + {`255XYZ`, tok.Uint8("J", a.Integer), []parsekit.Token{{Type: "J", Runes: []rune("255"), Value: uint8(255)}}}, + {`65535XYZ`, tok.Uint16("K", a.Integer), []parsekit.Token{{Type: "K", Runes: []rune("65535"), Value: uint16(65535)}}}, + {`4294967295XYZ`, tok.Uint32("L", a.Integer), []parsekit.Token{{Type: "L", Runes: []rune("4294967295"), Value: uint32(4294967295)}}}, + {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []parsekit.Token{{Type: "M", Runes: []rune("18446744073709551615"), Value: uint64(18446744073709551615)}}}, + + {`3.1415=PI`, tok.Float32("N", a.Float), []parsekit.Token{{Type: "N", Runes: []rune("3.1415"), Value: float32(3.1415)}}}, + {`24.19287=PI`, tok.Float64("O", a.Float), []parsekit.Token{{Type: "O", Runes: []rune("24.19287"), Value: float64(24.19287)}}}, + + {`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []parsekit.Token{ + {Type: "P", Runes: []rune("1"), Value: true}, + {Type: "P", Runes: []rune("t"), Value: true}, + {Type: "P", Runes: []rune("T"), Value: true}, + {Type: "P", Runes: []rune("true"), Value: true}, + {Type: "P", Runes: []rune("TRUE"), Value: true}, + {Type: "P", Runes: []rune("True"), Value: true}, + }}, + + {`0fFfalseFALSEFalse`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []parsekit.Token{ + {Type: "P", Runes: []rune("0"), Value: false}, + {Type: "P", Runes: []rune("f"), Value: false}, + {Type: "P", Runes: []rune("F"), Value: false}, + {Type: "P", Runes: []rune("false"), Value: false}, + {Type: "P", Runes: []rune("FALSE"), Value: false}, + {Type: "P", Runes: []rune("False"), Value: false}, + }}, }) } @@ -229,7 +325,7 @@ func TestSequenceOfRunes(t *testing.T) { parser := parsekit.NewParser(func(p *parsekit.ParseAPI) { p.Expects("Sequence of runes") if p.On(sequence).Accept() { - output = p.BufLiteral() + output = p.Result().String() p.Stop() } }) @@ -250,7 +346,7 @@ func TestCombination(t *testing.T) { c.Seq( c.Opt(a.Whitespace), c.Rep(3, a.AngleClose), - m.ModifyByCallback(c.OneOrMore(c.StrNoCase("hello")), func(s string) string { + m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string { return fmt.Sprintf("%d", len(s)) }), m.Replace(c.Separated(a.Comma, c.Opt(a.Whitespace)), ", "),