From c532af67ca2995480eb6e7aae3e04f4739613b6c Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Thu, 11 Jul 2019 12:43:57 +0000 Subject: [PATCH] Optimization round completed (for now :-) All tests successful. --- examples/example_basiccalculator1_test.go | 2 +- examples/example_basiccalculator2_test.go | 6 +- examples/example_helloManyStateParser_test.go | 2 +- .../example_helloSingleStateParser_test.go | 2 +- examples/examples_state_test.go | 2 +- parse/api.go | 61 +- parse/assertions_test.go | 38 - parse/parse_test.go | 37 +- tokenize/api.go | 361 ++-- tokenize/api_test.go | 256 ++- tokenize/assertions_test.go | 5 +- {tokenize2 => tokenize}/callerinfo_test.go | 2 +- tokenize/handler.go | 4 +- tokenize/handler_test.go | 2 +- tokenize/handlers_builtin.go | 227 +-- tokenize/handlers_builtin_test.go | 213 ++- tokenize/result.go | 155 -- tokenize/result_test.go | 58 - {tokenize2 => tokenize}/token.go | 2 +- {tokenize2 => tokenize}/token_test.go | 4 +- tokenize/tokenize.go | 6 +- tokenize/tokenizer_test.go | 103 +- tokenize/tokenizer_whitebox_test.go | 173 +- tokenize2/api.go | 374 ----- tokenize2/api_test.go | 330 ---- tokenize2/assertions_test.go | 118 -- tokenize2/callerinfo.go | 33 - tokenize2/cursor.go | 45 - tokenize2/cursor_test.go | 69 - tokenize2/handler.go | 53 - tokenize2/handler_test.go | 101 -- tokenize2/handlers_builtin.go | 1489 ----------------- tokenize2/handlers_builtin_test.go | 512 ------ tokenize2/tokenize.go | 41 - tokenize2/tokenizer_test.go | 223 --- tokenize2/tokenizer_whitebox_test.go | 131 -- 36 files changed, 905 insertions(+), 4335 deletions(-) rename {tokenize2 => tokenize}/callerinfo_test.go (97%) delete mode 100644 tokenize/result.go delete mode 100644 tokenize/result_test.go rename {tokenize2 => tokenize}/token.go (98%) rename {tokenize2 => tokenize}/token_test.go (89%) delete mode 100644 tokenize2/api.go delete mode 100644 tokenize2/api_test.go delete mode 100644 tokenize2/assertions_test.go delete mode 100644 tokenize2/callerinfo.go delete mode 100644 tokenize2/cursor.go delete mode 100644 tokenize2/cursor_test.go delete mode 100644 tokenize2/handler.go delete mode 100644 tokenize2/handler_test.go delete mode 100644 tokenize2/handlers_builtin.go delete mode 100644 tokenize2/handlers_builtin_test.go delete mode 100644 tokenize2/tokenize.go delete mode 100644 tokenize2/tokenizer_test.go delete mode 100644 tokenize2/tokenizer_whitebox_test.go diff --git a/examples/example_basiccalculator1_test.go b/examples/example_basiccalculator1_test.go index bcdef0a..f336d0b 100644 --- a/examples/example_basiccalculator1_test.go +++ b/examples/example_basiccalculator1_test.go @@ -77,7 +77,7 @@ var int64Token = tokenize.T.Int64(nil, bareInteger) func (c *simpleCalculator) number(p *parse.API) { if p.Accept(int64Token) { - c.Result += c.op * p.Result().Value(0).(int64) + c.Result += c.op * p.Result.Tokens[0].Value.(int64) p.Handle(c.operatorOrEndOfFile) } else { p.Expected("integer number") diff --git a/examples/example_basiccalculator2_test.go b/examples/example_basiccalculator2_test.go index 43c42c0..e404eba 100644 --- a/examples/example_basiccalculator2_test.go +++ b/examples/example_basiccalculator2_test.go @@ -98,7 +98,7 @@ func (calc *calculator) expr(p *parse.API) { var A = tokenize.A if p.Handle(calc.term) { for p.Accept(A.Add.Or(A.Subtract)) { - op := p.Result().Rune(0) + op := p.Result.Runes[0] if !p.Handle(calc.term) { return } @@ -116,7 +116,7 @@ func (calc *calculator) term(p *parse.API) { var A = tokenize.A if p.Handle(calc.factor) { for 
p.Accept(A.Multiply.Or(A.Divide)) { - op := p.Result().Rune(0) + op := p.Result.Runes[0] if !p.Handle(calc.factor) { return } @@ -134,7 +134,7 @@ func (calc *calculator) factor(p *parse.API) { p.Accept(A.Blanks) switch { case p.Accept(T.Float64(nil, A.Signed(A.Float))): - value := p.Result().Value(0).(float64) + value := p.Result.Tokens[0].Value.(float64) calc.interpreter.pushValue(value) case p.Accept(A.LeftParen): if !p.Handle(calc.expr) { diff --git a/examples/example_helloManyStateParser_test.go b/examples/example_helloManyStateParser_test.go index 21204c6..4eebfdd 100644 --- a/examples/example_helloManyStateParser_test.go +++ b/examples/example_helloManyStateParser_test.go @@ -116,7 +116,7 @@ func (h *helloparser1) name(p *parse.API) { case p.Peek(a.Excl): p.Handle(h.exclamation) case p.Accept(a.AnyRune): - h.greetee += p.Result().String() + h.greetee += p.Result.String() p.Handle(h.name) default: p.Expected("exclamation mark") diff --git a/examples/example_helloSingleStateParser_test.go b/examples/example_helloSingleStateParser_test.go index e862fe1..b9f6aa2 100644 --- a/examples/example_helloSingleStateParser_test.go +++ b/examples/example_helloSingleStateParser_test.go @@ -90,7 +90,7 @@ func (h *helloparser2) start(p *parse.API) { return } if p.Accept(m.TrimSpace(c.OneOrMore(a.AnyRune.Except(a.Excl)))) { - h.greetee = p.Result().String() + h.greetee = p.Result.String() if h.greetee == "" { p.Error("the name cannot be empty") return diff --git a/examples/examples_state_test.go b/examples/examples_state_test.go index 3825292..70678c4 100644 --- a/examples/examples_state_test.go +++ b/examples/examples_state_test.go @@ -22,7 +22,7 @@ func (l *Chunks) AddChopped(s string, chunkSize int) error { parseChunks := parse.New(func(p *parse.API) { for p.Accept(chunkOfRunes) { - *l = append(*l, p.Result().String()) + *l = append(*l, p.Result.String()) } }) return parseChunks(s) diff --git a/parse/api.go b/parse/api.go index 5cfcd9f..b0216be 100644 --- a/parse/api.go +++ b/parse/api.go @@ -16,14 +16,24 @@ import ( // // • call other parse.Handler functions, the core of recursive-descent parsing (Handle) type API struct { - tokenAPI tokenize.API // the tokenize.API, used for communicating with tokenize.Handler functions - result *tokenize.Result // last tokenize.Handler result as produced by Accept() or Peek() + tokenAPI *tokenize.API // the tokenize.API, used for communicating with tokenize.Handler functions + Result TokenizeResult // a struct, holding the results of the last Peek() or Accept() call sanityChecksEnabled bool // whether or not runtime sanity checks are enabled loopCheck map[uintptr]bool // used for parser loop detection err error // parse error, retrieved by Error(), using API methods is denied when set stopped bool // a boolean set to true by Stop() } +// TokenizeResult holds the results of the last Peek() or Accept() call. +type TokenizeResult struct { + Tokens []tokenize.Token // the resulting tokens from the last call to Peek() or Accept() + Runes []rune // the resulting runes from the last call to Peek() or Accept() +} + +func (result *TokenizeResult) String() string { + return string(result.Runes) +} + // DisableSanityChecks disables the built-in parser implementation sanity checks, // which detects parser implementation errors like loops and continuing parsing // after an error or invoking Stop(). @@ -40,16 +50,13 @@ func (p *API) DisableSanityChecks() { // If it does, then true will be returned, false otherwise. 
The read cursor // will be kept at the same position, so the next call to Peek() or Accept() // will start from the same cursor position. -// -// After calling this method, you can retrieve the produced tokenize.Result -// struct using the Result() method. func (p *API) Peek(tokenHandler tokenize.Handler) bool { - p.result = nil forkedAPI, ok := p.invokeHandler("Peek", tokenHandler) if ok { - p.result = forkedAPI.Result() - p.tokenAPI.Reset() + p.Result.Tokens = p.tokenAPI.Tokens() + p.Result.Runes = p.tokenAPI.Runes() } + p.tokenAPI.Dispose(forkedAPI) return ok } @@ -58,24 +65,31 @@ // forward to beyond the match that was found. Otherwise false will be returned // and the read cursor will stay at the same position. // -// After calling this method, you can retrieve the tokenize.Result -// using the Result() method. +// After calling this method, you can retrieve the results through the Result field. func (p *API) Accept(tokenHandler tokenize.Handler) bool { - p.result = nil forkedAPI, ok := p.invokeHandler("Accept", tokenHandler) if ok { - forkedAPI.Merge() - p.result = p.tokenAPI.Result() + // Keep track of the results. + p.Result.Tokens = p.tokenAPI.Tokens() + p.Result.Runes = p.tokenAPI.Runes() + + // Merge to the parent level. + p.tokenAPI.Merge(forkedAPI) + p.tokenAPI.Dispose(forkedAPI) + + // And flush the input reader buffer. if p.tokenAPI.FlushInput() { if p.sanityChecksEnabled { p.initLoopCheck() } } + } else { + p.tokenAPI.Dispose(forkedAPI) } return ok } -func (p *API) invokeHandler(name string, tokenHandler tokenize.Handler) (tokenize.API, bool) { +func (p *API) invokeHandler(name string, tokenHandler tokenize.Handler) (int, bool) { if p.sanityChecksEnabled { p.panicWhenStoppedOrInError(name) p.checkForLoops(name) @@ -84,10 +98,9 @@ } } - p.result = nil p.tokenAPI.Reset() child := p.tokenAPI.Fork() - ok := tokenHandler(child) + ok := tokenHandler(p.tokenAPI) return child, ok } @@ -138,20 +151,6 @@ func (p *API) checkForLoops(name string) { p.loopCheck[filepos] = true } -// Result returns the tokenize.Result struct, containing results as produced by the -// last Peek() or Accept() call. -// -// When Result() is called without first doing a Peek() or Accept(), then no -// result will be available and the method will panic. -func (p *API) Result() *tokenize.Result { - result := p.result - if p.result == nil { - callerPanic("Result", "parsekit.parse.API.{name}(): {name}() called "+ "at {caller} without calling API.Peek() or API.Accept() on beforehand") - } - return result -} - // Handle executes other parse.Handler functions from within the active // parse.Handler function. // @@ -215,7 +214,7 @@ func (p *API) Error(format string, data ...interface{}) { // No call to p.panicWhenStoppedOrInError(), to allow a parser to // set a different error message when needed. message := fmt.Sprintf(format, data...) - p.err = fmt.Errorf("%s at %s", message, p.tokenAPI.Result().Cursor()) + p.err = fmt.Errorf("%s at %s", message, p.tokenAPI.Cursor()) } // ExpectEndOfFile can be used to check if the input is at end of file.
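For illustration, a minimal sketch of the resulting calling convention in parse.Handler functions, now that results live in the exported Result field instead of behind the removed Result() method. This is not part of the patch; A.Digits and T.Int64 are assumed here from the builtin handler set:

package main

import (
	"fmt"

	"git.makaay.nl/mauricem/go-parsekit/parse"
	"git.makaay.nl/mauricem/go-parsekit/tokenize"
)

func main() {
	a, tok := tokenize.A, tokenize.T

	parser := parse.New(func(p *parse.API) {
		// On a successful match, Accept() moves the read cursor forward
		// and fills p.Result with the runes and tokens that were produced.
		if p.Accept(tok.Int64("NUMBER", a.Digits)) {
			fmt.Printf("matched runes: %q\n", p.Result.String())
			fmt.Printf("token value:   %d\n", p.Result.Tokens[0].Value.(int64))
		} else {
			p.Expected("an integer number")
		}
	})
	parser("42")
}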
diff --git a/parse/assertions_test.go b/parse/assertions_test.go index dcd8fc8..710ebc5 100644 --- a/parse/assertions_test.go +++ b/parse/assertions_test.go @@ -5,8 +5,6 @@ package parse import ( "regexp" "testing" - - "git.makaay.nl/mauricem/go-parsekit/tokenize" ) func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { @@ -56,39 +54,3 @@ func AssertPanic(t *testing.T, p PanicT) { }() p.Function() } - -type TokenMakerT struct { - Input string - Handler tokenize.Handler - Expected []tokenize.Token -} - -func AssertTokenMakers(t *testing.T, testSet []TokenMakerT) { - for _, test := range testSet { - AssertTokenMaker(t, test) - } -} - -func AssertTokenMaker(t *testing.T, test TokenMakerT) { - tokenizer := tokenize.New(test.Handler) - result, err := tokenizer(test.Input) - if err != nil { - t.Errorf("Test %q failed with error: %s", test.Input, err) - } else { - if len(result.Tokens()) != len(test.Expected) { - t.Errorf("Unexpected number of tokens in output:\nexpected: %d\nactual: %d", len(test.Expected), len(result.Tokens())) - } - for i, expected := range test.Expected { - actual := result.Token(i) - if expected.Type != actual.Type { - t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type) - } - if string(expected.Runes) != string(actual.Runes) { - t.Errorf("Unexpected Runes in result.Tokens[%d]:\nexpected: %q\nactual: %q", i, expected.Runes, actual.Runes) - } - if expected.Value != actual.Value { - t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value) - } - } - } -} diff --git a/parse/parse_test.go b/parse/parse_test.go index 6d3cca6..255c2ca 100644 --- a/parse/parse_test.go +++ b/parse/parse_test.go @@ -16,7 +16,7 @@ func ExampleNew_usingAcceptedRunes() { parser := parse.New(func(p *parse.API) { for p.Accept(a.AnyRune) { - matches = append(matches, p.Result().String()) + matches = append(matches, p.Result.String()) } p.ExpectEndOfFile() }) @@ -33,9 +33,9 @@ func ExampleNew_usingTokens() { parser := parse.New(func(p *parse.API) { if p.Accept(c.OneOrMore(tok.Rune("RUNE", a.AnyRune))) { - fmt.Printf("Runes accepted: %q\n", p.Result().String()) + fmt.Printf("Runes accepted: %q\n", p.Result.String()) fmt.Printf("Tokens:\n") - for i, token := range p.Result().Tokens() { + for i, token := range p.Result.Tokens { fmt.Printf("[%d] %s\n", i, token) } } @@ -46,10 +46,10 @@ func ExampleNew_usingTokens() { // Output: // Runes accepted: "¡ök!" // Tokens: - // [0] RUNE(161) - // [1] RUNE(246) - // [2] RUNE(107) - // [3] RUNE(33) + // [0] RUNE('¡') + // [1] RUNE('ö') + // [2] RUNE('k') + // [3] RUNE('!') } func ExampleAPI_Expected() { @@ -71,7 +71,7 @@ func ExampleAPI_Accept_inIfStatement() { if p.Accept(tokenize.A.StrNoCase("Yowza!")) { // Result.String() returns a string containing all // accepted runes that were matched against. 
- fmt.Println(p.Result().String()) + fmt.Println(p.Result.String()) } }) parser("YOWZA!") @@ -88,7 +88,7 @@ func ExampleAPI_Accept_inSwitchStatement() { case p.Accept(tokenize.A.Rune('X')): // NOOP, skip this rune case p.Accept(tokenize.A.AnyRune): - result += p.Result().String() + result += p.Result.String() default: loop = false } @@ -107,7 +107,7 @@ func ExampleAPI_Stop() { parser := parse.New(func(p *parse.API) { fmt.Printf("First word: ") for p.Accept(c.Not(a.Space)) { - fmt.Printf("%s", p.Result()) + fmt.Printf("%s", p.Result.String()) } p.Stop() }) @@ -123,7 +123,7 @@ func ExampleAPI_Stop_notCalledAndNoInputPending() { parser := parse.New(func(p *parse.API) { fmt.Printf("Word: ") for p.Accept(c.Not(a.Space)) { - fmt.Printf("%s", p.Result()) + fmt.Printf("%s", p.Result.String()) } fmt.Printf("\n") }) @@ -141,7 +141,7 @@ func ExampleAPI_Stop_notCalledButInputPending() { parser := parse.New(func(p *parse.API) { fmt.Printf("First word: ") for p.Accept(c.Not(a.Space)) { - fmt.Printf("%s", p.Result()) + fmt.Printf("%s", p.Result.String()) } fmt.Printf("\n") }) @@ -161,7 +161,7 @@ func ExampleAPI_Peek() { // This handler is able to handle serial numbers. serialnrHandler := func(p *parse.API) { if p.Accept(serialnr) { - fmt.Println(p.Result().String()) + fmt.Println(p.Result.String()) } } @@ -255,17 +255,6 @@ func TestGivenParserWithErrorSet_HandlePanics(t *testing.T) { `at /.*/parse_test\.go:\d+: no calls allowed after API\.Error\(\)`}) } -func TestGivenParserWithoutCallToPeekOrAccept_ResultPanics(t *testing.T) { - p := parse.New(func(p *parse.API) { - p.Result() - }) - parse.AssertPanic(t, parse.PanicT{ - Function: func() { p("") }, - Regexp: true, - Expect: `parsekit\.parse\.API\.Result\(\): Result\(\) called at ` + - `/.*/parse_test.go:\d+ without calling API.Peek\(\) or API.Accept\(\) on beforehand`}) -} - func TestGivenParserWhichIsNotStopped_WithNoMoreInput_FallbackExpectEndOfFileKicksIn(t *testing.T) { p := parse.New(func(p *parse.API) {}) err := p("") diff --git a/tokenize/api.go b/tokenize/api.go index f1d301b..e00162c 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -25,7 +25,7 @@ import ( // // By invoking NextRune() + Accept() multiple times, the result can be extended // with as many runes as needed. Runes collected this way can later on be -// retrieved using the method Result().Runes(). +// retrieved using the method Runes(). // // It is mandatory to call Accept() after retrieving a rune, before calling // NextRune() again. Failing to do so will result in a panic. @@ -74,39 +74,40 @@ type API struct { runeRead bool // whether or not a rune was read using NextRune() runes []rune // the rune stack tokens []Token // the token stack - runeStart int - runeEnd int - tokenStart int - tokenEnd int - stackLevel int // the stack level for this API object - state *apiState // shared API state data + stackFrames []stackFrame // the stack frames, containing stack level-specific data + stackLevel int // the current stack level + stackFrame *stackFrame // the current stack frame } -type apiState struct { - stack []Result // the stack, used for forking / merging the API. - top int // the index of the current top item in the stack +type stackFrame struct { + offset int // current rune offset relative to the Reader's sliding window + runeStart int + runeEnd int + tokenStart int + tokenEnd int + cursor Cursor + + // TODO + err error // can be used by a Handler to report a specific issue with the input } -// initialAPIstackDepth determines the initial stack depth for the API. 
-// When a parser requires a higher stack depth, then this is no problem. -// The API will automatically scale the stack when forking beyond this -// default number of stack levels. -const initialAPIstackDepth = 10 +const initialStackDepth = 10 +const initialTokenDepth = 10 +const initialRuneDepth = 10 // NewAPI initializes a new API struct, wrapped around the provided input. // For an overview of allowed inputs, take a look at the documentation // for parsekit.read.New(). -func NewAPI(input interface{}) API { - stack := make([]Result, 1, initialAPIstackDepth) - state := apiState{ - stack: stack, - } - return API{ - runes: make([]rune, initialAPIstackDepth), - tokens: make([]Token, initialAPIstackDepth), - reader: read.New(input), - state: &state, +func NewAPI(input interface{}) *API { + api := &API{ + reader: read.New(input), + runes: make([]rune, 0, initialRuneDepth), + tokens: make([]Token, 0, initialTokenDepth), + stackFrames: make([]stackFrame, 1, initialStackDepth), } + api.stackFrame = &api.stackFrames[0] + + return api } // NextRune returns the rune at the current read offset. @@ -120,25 +121,16 @@ func NewAPI(input interface{}) API { // without explicitly accepting, this method will panic. You can see this as a // built-in unit test, enforcing correct serialization of API method calls. func (i *API) NextRune() (rune, error) { - if i.stackLevel > i.state.top { - callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+ - "using a non-active API fork (a parent was read, forked or merged, "+ - "causing this fork to be invalidated)") - } - - result := &(i.state.stack[i.stackLevel]) if i.runeRead { callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+ "without a prior call to Accept()") } - readRune, err := i.reader.RuneAt(result.offset) + readRune, err := i.reader.RuneAt(i.stackFrame.offset) i.lastRune = readRune i.lastRuneErr = err i.runeRead = true - i.DisposeChilds() - return readRune, err } @@ -148,22 +140,31 @@ func (i *API) NextRune() (rune, error) { // It is not allowed to call Accept() when the previous call to NextRune() // returned an error. Calling Accept() in such case will result in a panic. func (i *API) Accept() { - if i.stackLevel > i.state.top { - callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+ - "using a non-active API fork (a parent was read, forked or merged, "+ - "causing this fork to be invalidated)") - } - - result := &(i.state.stack[i.stackLevel]) + // TODO can go after completing the code for performance. + //fmt.Println("STACK [", i.stackLevel, "] runes", len(i.runes), "/", cap(i.runes), "tokens", len(i.tokens), "/", cap(i.tokens)) if !i.runeRead { - callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} without first calling NextRune()") + callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+ + "without first calling NextRune()") } else if i.lastRuneErr != nil { - callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, but the prior call to NextRune() failed") + callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, "+ + "but the prior call to NextRune() failed") } - result.runes = append(result.runes, i.lastRune) - result.cursor.moveByRune(i.lastRune) - result.offset++ + newRuneEnd := i.stackFrame.runeEnd + 1 + + // Grow the runes capacity when needed. 
+ if cap(i.runes) < newRuneEnd { + newRunes := make([]rune, newRuneEnd, newRuneEnd*2) + copy(newRunes, i.runes) + i.runes = newRunes + } else { + i.runes = i.runes[0:newRuneEnd] + } + + i.runes[newRuneEnd-1] = i.lastRune + i.stackFrame.runeEnd++ + i.stackFrame.cursor.moveByRune(i.lastRune) + i.stackFrame.offset++ i.runeRead = false } @@ -184,44 +185,30 @@ func (i *API) Accept() { // Garbage collection will take care of this automatically. // The parent API was never modified, so it can safely be used after disposal // as if the lookahead never happened. -func (i *API) Fork() API { - if i.stackLevel > i.state.top { - callerPanic("Fork", "tokenize.API.{name}(): {name}() called at {caller} "+ - "using a non-active API fork (a parent was read, forked or merged, "+ - "causing this fork to be invalidated)") +func (i *API) Fork() int { + newStackLevel := i.stackLevel + 1 + newStackSize := newStackLevel + 1 + + // Grow the stack frames capacity when needed. + if cap(i.stackFrames) < newStackSize { + newFrames := make([]stackFrame, newStackSize, newStackSize*2) + copy(newFrames, i.stackFrames) + i.stackFrames = newFrames + } else { + i.stackFrames = i.stackFrames[0:newStackSize] } - i.DisposeChilds() - result := &(i.state.stack[i.stackLevel]) - - // Grow the stack storage when needed. - newStackSize := i.stackLevel + 2 - if cap(i.state.stack) < newStackSize { - newStack := make([]Result, newStackSize, newStackSize+initialAPIstackDepth) - copy(newStack, i.state.stack) - i.state.stack = newStack - } - i.state.stack = i.state.stack[0 : i.stackLevel+1] - - // Create the new fork. - child := API{ - state: i.state, - stackLevel: i.stackLevel + 1, - reader: i.reader, - } - childResult := Result{ - cursor: result.cursor, - offset: result.offset, - } - i.state.stack = append(i.state.stack, childResult) - //i.state.stack[i.stackLevel+1] = childResult - - // Invalidate parent's last read rune. + i.stackLevel++ i.runeRead = false - i.state.top = child.stackLevel + parent := i.stackFrame - return child + i.stackFrame = &i.stackFrames[i.stackLevel] + *i.stackFrame = *parent + i.stackFrame.runeStart = parent.runeEnd + i.stackFrame.tokenStart = parent.tokenEnd + + return i.stackLevel } // Merge appends the results of a forked child API (runes, tokens) to the @@ -232,56 +219,68 @@ func (i *API) Fork() API { // be reused for performing another match. This means that all Result data are // cleared, but the read cursor position is kept at its current position. // This allows a child to feed results in chunks to its parent. -func (i *API) Merge() { - if i.stackLevel == 0 { - callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} on the top-level API") - } - if i.stackLevel > i.state.top { +// +// Once the child is no longer needed, it can be disposed of by using the +// method Dispose(), which will return the tokenizer to the parent. 
+func (i *API) Merge(stackLevel int) { + if stackLevel == 0 { callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ - "using a non-active API fork (a parent was read, forked or merged, "+ - "causing this fork to be invalidated)") + "on the top-level API stack level 0") + } + if stackLevel != i.stackLevel { + callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on API stack level %d, but the current stack level is %d "+ + "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) } - result := &(i.state.stack[i.stackLevel]) - parentResult := &(i.state.stack[i.stackLevel-1]) + parent := &i.stackFrames[stackLevel-1] - // // Grow parent rune storage when needed. - // newRuneSize := len(parentResult.runes) + len(result.runes) - // if cap(parentResult.runes) < newRuneSize { - // newRunes := make([]rune, len(parentResult.runes), 2*newRuneSize) - // copy(newRunes, parentResult.runes) - // parentResult.runes = newRunes - // //fmt.Println("Beefed up runes", i.stackLevel-1, newRuneSize*2) - // } + // The end of the parent slice aligns with the start of the child slice. + // Because of this, to merge the parent slice can simply be expanded + // to include the child slice. + // parent : |----------| + // child: |------| + // After merge operation: + // parent: |-----------------| + // child: |---> continue reading from here + parent.runeEnd = i.stackFrame.runeEnd + i.stackFrame.runeStart = i.stackFrame.runeEnd - // // Grow parent token storage when needed. - // newTokenSize := len(parentResult.tokens) + len(result.tokens) - // if cap(parentResult.tokens) < newTokenSize { - // newTokens := make([]Token, len(parentResult.tokens), 2*newTokenSize) - // copy(newTokens, parentResult.tokens) - // parentResult.tokens = newTokens - // //fmt.Println("Beefed up tokens", i.stackLevel-1, newTokenSize*2) - // } + // The same logic applies to tokens. + parent.tokenEnd = i.stackFrame.tokenEnd + i.stackFrame.tokenStart = i.stackFrame.tokenEnd - parentResult.runes = append(parentResult.runes, result.runes...) - parentResult.tokens = append(parentResult.tokens, result.tokens...) - parentResult.offset = result.offset - parentResult.cursor = result.cursor - i.DisposeChilds() - i.Reset() + parent.offset = i.stackFrame.offset + parent.cursor = i.stackFrame.cursor + + i.stackFrame.err = nil + i.runeRead = false } -func (i *API) DisposeChilds() { - i.state.stack = i.state.stack[:i.stackLevel+1] - i.state.top = i.stackLevel +func (i *API) Dispose(stackLevel int) { + if stackLevel == 0 { + callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on the top-level API stack level 0") + } + if stackLevel != i.stackLevel { + callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on API stack level %d, but the current stack level is %d "+ + "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) + } + + i.runeRead = false + i.stackLevel = stackLevel - 1 + i.stackFrames = i.stackFrames[:stackLevel] + i.stackFrame = &i.stackFrames[stackLevel-1] + i.runes = i.runes[0:i.stackFrame.runeEnd] + i.tokens = i.tokens[0:i.stackFrame.tokenEnd] } func (i *API) Reset() { - result := &(i.state.stack[i.stackLevel]) i.runeRead = false - result.runes = result.runes[:0] - result.tokens = result.tokens[:0] - result.err = nil + i.stackFrame.runeEnd = i.stackFrame.runeStart + i.stackFrame.tokenEnd = i.stackFrame.tokenStart + i.stackFrame.err = nil } // FlushInput flushes processed input data from the read.Buffer. 
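As a hedged sketch of how a Handler drives this new stack-frame protocol: Fork() now returns an int stack level instead of a child API value, and Merge()/Dispose() are called on the API itself, passing that level back in. The handler below (matchTwoDigits) is hypothetical and not part of the patch; it only demonstrates the calling convention that the builtin handlers follow:

// matchTwoDigits speculatively reads two digit runes from the input.
func matchTwoDigits(t *tokenize.API) bool {
	child := t.Fork() // push a new stack frame for speculative reading
	for i := 0; i < 2; i++ {
		r, err := t.NextRune()
		if err != nil || r < '0' || r > '9' {
			t.Dispose(child) // drop the frame; the parent frame is untouched
			return false
		}
		t.Accept()
	}
	t.Merge(child)   // hand the accepted runes to the parent frame
	t.Dispose(child) // make the parent the active stack level again
	return true
}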
@@ -291,18 +290,126 @@ // Note: // When writing your own TokenHandler, you normally won't have to call this // method yourself. It is automatically called by parsekit when needed. -func (i API) FlushInput() bool { - result := &(i.state.stack[i.stackLevel]) - if result.offset > 0 { - i.reader.Flush(result.offset) - result.offset = 0 +func (i *API) FlushInput() bool { + if i.stackFrame.offset > 0 { + i.reader.Flush(i.stackFrame.offset) + i.stackFrame.offset = 0 return true } return false } -// Result returns the Result struct from the API. The returned struct -// can be used to retrieve and to modify result data. -func (i API) Result() *Result { - return &(i.state.stack[i.stackLevel]) +func (i *API) String() string { + return string(i.Runes()) +} + +func (i *API) Runes() []rune { + return i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd] +} + +func (i *API) Rune(offset int) rune { + return i.runes[i.stackFrame.runeStart+offset] +} + +func (i *API) ClearRunes() { + i.runes = i.runes[:i.stackFrame.runeStart] + i.stackFrame.runeEnd = i.stackFrame.runeStart +} + +func (i *API) SetRunes(runes ...rune) { + // Grow the runes capacity when needed. + newRuneEnd := i.stackFrame.runeStart + len(runes) + if cap(i.runes) < newRuneEnd { + newRunes := make([]rune, newRuneEnd, newRuneEnd*2) + copy(newRunes, i.runes) + i.runes = newRunes + } else { + i.runes = i.runes[0:newRuneEnd] + } + + for offset, r := range runes { + i.runes[i.stackFrame.runeStart+offset] = r + } + i.stackFrame.runeEnd = newRuneEnd +} + +func (i *API) AddRunes(runes ...rune) { + // Grow the runes capacity when needed. + newRuneEnd := i.stackFrame.runeEnd + len(runes) + if cap(i.runes) < newRuneEnd { + newRunes := make([]rune, newRuneEnd, newRuneEnd*2) + copy(newRunes, i.runes) + i.runes = newRunes + } else { + i.runes = i.runes[0:newRuneEnd] + } + + for offset, r := range runes { + i.runes[i.stackFrame.runeEnd+offset] = r + } + i.stackFrame.runeEnd = newRuneEnd +} + +func (i *API) AddString(s string) { + i.AddRunes([]rune(s)...) +} + +func (i *API) SetString(s string) { + i.SetRunes([]rune(s)...) +} + +func (i *API) Cursor() Cursor { + return i.stackFrame.cursor +} + +func (i *API) Tokens() []Token { + return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd] +} + +func (i *API) Token(offset int) Token { + return i.tokens[i.stackFrame.tokenStart+offset] +} + +func (i *API) TokenValue(offset int) interface{} { + return i.tokens[i.stackFrame.tokenStart+offset].Value +} + +func (i *API) ClearTokens() { + i.tokens = i.tokens[:i.stackFrame.tokenStart] + i.stackFrame.tokenEnd = i.stackFrame.tokenStart +} + +func (i *API) SetTokens(tokens ...Token) { + // Grow the tokens capacity when needed. + newTokenEnd := i.stackFrame.tokenStart + len(tokens) + if cap(i.tokens) < newTokenEnd { + newTokens := make([]Token, newTokenEnd, newTokenEnd*2) + copy(newTokens, i.tokens) + i.tokens = newTokens + } else { + i.tokens = i.tokens[0:newTokenEnd] + } + + for offset, t := range tokens { + i.tokens[i.stackFrame.tokenStart+offset] = t + } + i.stackFrame.tokenEnd = newTokenEnd +} + +func (i *API) AddTokens(tokens ...Token) { + // Grow the tokens capacity when needed.
+ newTokenEnd := i.stackFrame.tokenEnd + len(tokens) + if cap(i.tokens) < newTokenEnd { + newTokens := make([]Token, newTokenEnd, newTokenEnd*2) + copy(newTokens, i.tokens) + i.tokens = newTokens + } else { + i.tokens = i.tokens[0:newTokenEnd] + } + + for offset, t := range tokens { + i.tokens[i.stackFrame.tokenEnd+offset] = t + } + i.stackFrame.tokenEnd = newTokenEnd } diff --git a/tokenize/api_test.go b/tokenize/api_test.go index a570b92..d77830d 100644 --- a/tokenize/api_test.go +++ b/tokenize/api_test.go @@ -18,7 +18,7 @@ func ExampleAPI_NextRune() { r, err := api.NextRune() fmt.Printf("Rune read from input; %c\n", r) fmt.Printf("The error: %v\n", err) - fmt.Printf("API results: %q\n", api.Result().String()) + fmt.Printf("API results: %q\n", api.String()) // Output: // Rune read from input; T @@ -34,38 +34,38 @@ func ExampleAPI_Accept() { api.Accept() // adds 'h' to the API results api.NextRune() // reads 'e', but it is not added to the API results - fmt.Printf("API results: %q\n", api.Result().String()) + fmt.Printf("API results: %q\n", api.String()) // Output: // API results: "Th" } -func ExampleAPI_Result() { +func ExampleAPI_modifyingResults() { api := tokenize.NewAPI("") - result := api.Result() + api.AddString("Some runes") + api.AddRunes(' ', 'a', 'd', 'd', 'e', 'd') + api.AddRunes(' ', 'i', 'n', ' ') + api.AddString("various ways") + fmt.Printf("API result first 10 runes: %q\n", api.Runes()[0:10]) + fmt.Printf("API result runes as string: %q\n", api.String()) - result.AddRunes("Some runes") - result.AddRunes([]rune{' ', 'a', 'd', 'd', 'e', 'd'}) - result.AddRunes(' ', 'i', 'n', ' ', "various ways") - fmt.Printf("API result first 10 runes: %q\n", api.Result().Runes()[0:10]) - fmt.Printf("API result runes as string: %q\n", api.Result().String()) + api.SetString("new ") + api.AddString("set ") + api.AddString("of ") + api.AddRunes('r', 'u', 'n', 'e', 's') + fmt.Printf("API result runes as string: %q\n", api.String()) + fmt.Printf("API result runes: %q\n", api.Runes()) + fmt.Printf("API third rune: %q\n", api.Rune(2)) - result.SetRunes("new ", "set ", "of ", 'r', 'u', 'n', 'e', 's') - fmt.Printf("API result runes as string: %q\n", api.Result().String()) - fmt.Printf("API result runes: %q\n", api.Result().Runes()) - fmt.Printf("API third rune: %q\n", api.Result().Rune(2)) - - result.AddTokens(tokenize.Token{ - Runes: []rune("demo 1"), + api.AddTokens(tokenize.Token{ Type: 42, Value: "towel"}) - result.AddTokens(tokenize.Token{ - Runes: []rune("demo 2"), + api.AddTokens(tokenize.Token{ Type: 73, Value: "Zaphod"}) - fmt.Printf("API result tokens: %v\n", api.Result().Tokens()) - fmt.Printf("API second result token: %v\n", api.Result().Token(1)) + fmt.Printf("API result tokens: %v\n", api.Tokens()) + fmt.Printf("API second result token: %v\n", api.Token(1)) // Output: // API result first 10 runes: ['S' 'o' 'm' 'e' ' ' 'r' 'u' 'n' 'e' 's'] @@ -84,17 +84,17 @@ func ExampleAPI_Reset() { api.Accept() api.NextRune() api.Accept() - fmt.Printf("API results: %q at %s\n", api.Result().String(), api.Result().Cursor()) + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) // Reset clears the results, but keeps the cursor position. 
api.Reset() - fmt.Printf("API results: %q at %s\n", api.Result().String(), api.Result().Cursor()) + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) api.NextRune() api.Accept() api.NextRune() api.Accept() - fmt.Printf("API results: %q at %s\n", api.Result().String(), api.Result().Cursor()) + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) // Output: // API results: "Ve" at line 1, column 3 @@ -104,14 +104,16 @@ func ExampleAPI_Reset() { func ExampleAPI_Fork() { // This custom Handler checks for input 'a', 'b' or 'c'. - abcHandler := func(t tokenize.API) bool { + abcHandler := func(t *tokenize.API) bool { a := tokenize.A for _, r := range []rune{'a', 'b', 'c'} { child := t.Fork() // fork, so we won't change parent t - if a.Rune(r)(child) { - child.Merge() // accept results into parent t - return true // and report a successful match + if a.Rune(r)(t) { + t.Merge(child) // accept results into parent of child + t.Dispose(child) // return to the parent level + return true // and report a successful match } + t.Dispose(child) // return to the parent level } // If we get here, then no match was found. Return false to communicate // this to the caller. @@ -141,25 +143,27 @@ func ExampleAPI_Fork() { } func ExampleAPI_Merge() { - tokenHandler := func(t tokenize.API) bool { + tokenHandler := func(t *tokenize.API) bool { child1 := t.Fork() - child1.NextRune() // reads 'H' - child1.Accept() - child1.NextRune() // reads 'i' - child1.Accept() + t.NextRune() // reads 'H' + t.Accept() + t.NextRune() // reads 'i' + t.Accept() - child2 := child1.Fork() - child2.NextRune() // reads ' ' - child2.Accept() - child2.NextRune() // reads 'd' - child2.Accept() + child2 := t.Fork() + t.NextRune() // reads ' ' + t.Accept() + t.NextRune() // reads 'm' + t.Accept() + t.Dispose(child2) - child1.Merge() // We merge child1, which has read 'H' and 'i' only. + t.Merge(child1) // We merge child1, which has read 'H' and 'i' only. + t.Dispose(child1) // and clean up child1 to return to the parent return true } result, _ := tokenize.New(tokenHandler)("Hi mister X!") - fmt.Println(result) + fmt.Println(result.String()) // Output: // Hi @@ -170,75 +174,157 @@ func TestMultipleLevelsOfForksAndMerges(t *testing.T) { // Fork a few levels. child1 := api.Fork() - child2 := child1.Fork() - child3 := child2.Fork() - child4 := child3.Fork() + child2 := api.Fork() + child3 := api.Fork() + child4 := api.Fork() - // Read some data from child4. - r, _ := child4.NextRune() - child4.Accept() + // Read a rune 'a' from child4. + r, _ := api.NextRune() AssertEqual(t, 'a', r, "child4 rune 1") + api.Accept() + AssertEqual(t, "a", api.String(), "child4 runes after rune 1") - r, _ = child4.NextRune() - child4.Accept() + // Read another rune 'b' from child4. + r, _ = api.NextRune() AssertEqual(t, 'b', r, "child4 rune 2") + api.Accept() + AssertEqual(t, "ab", api.String(), "child4 runes after rune 2") - // Merge it to child3. - child4.Merge() + // Merge "ab" from child4 to child3. + api.Merge(child4) + AssertEqual(t, "", api.String(), "child4 runes after first merge") // Read some more from child4. 
- r, _ = child4.NextRune() - child4.Accept() + r, _ = api.NextRune() AssertEqual(t, 'c', r, "child4 rune 3") - AssertEqual(t, "line 1, column 4", child4.Result().Cursor().String(), "cursor child4 rune 3") + api.Accept() + AssertEqual(t, "c", api.String(), "child4 runes after rune 3") + AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child4 rune 3") - AssertEqual(t, "line 1, column 3", child3.Result().Cursor().String(), "cursor child3 rune 3, before merge of child 4") + // Merge "c" from child4 to child3. + api.Merge(child4) - // Again, merge it to child3. - child4.Merge() - AssertEqual(t, "line 1, column 4", child3.Result().Cursor().String(), "cursor child3 rune 3, after merge of child 4") + + // And dispose of child4, making child3 the active stack level. + api.Dispose(child4) + + // Child3 should now have the combined results "abc" from child4's work. + AssertEqual(t, "abc", api.String(), "child3 after merge of child4") + AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child3 rune 3, after merge of child4") // Now read some data from child3. - r, _ = child3.NextRune() - child3.Accept() - r, _ = child3.NextRune() - child3.Accept() - r, _ = child3.NextRune() - child3.Accept() - AssertEqual(t, 'f', r, "child3 rune 5") + r, _ = api.NextRune() + AssertEqual(t, 'd', r, "child3 rune 4") + api.Accept() + + r, _ = api.NextRune() + AssertEqual(t, 'e', r, "child3 rune 5") + api.Accept() + + r, _ = api.NextRune() + AssertEqual(t, 'f', r, "child3 rune 6") + api.Accept() + + AssertEqual(t, "abcdef", api.String(), "child3 total result after rune 6") // Temporarily create some new forks from here, but don't use their outcome. - child3sub1 := child3.Fork() - child3sub1.NextRune() - child3sub1.Accept() - child3sub1.NextRune() - child3sub1.Accept() - child3sub2 := child3sub1.Fork() - child3sub2.NextRune() - child3sub2.Accept() - child3sub2.Merge() + child3sub1 := api.Fork() + api.NextRune() + api.Accept() + api.NextRune() + api.Accept() + child3sub2 := api.Fork() + api.NextRune() + api.Accept() + api.Merge(child3sub2) // do merge sub2 down to sub1 + api.Dispose(child3sub2) // and dispose of sub2 + api.Dispose(child3sub1) // but dispose of sub1 without merging - // Instead merge the pre-forking results from child3 to child2. - child3.Merge() + // Instead, merge the results from before this forking segue from child3 to child2 + // and dispose of child3. + api.Merge(child3) + api.Dispose(child3) AssertEqual(t, "abcdef", api.String(), "child2 total result after merge of child3") AssertEqual(t, "line 1, column 7", api.Cursor().String(), "cursor child2 after merge child3") - // Merge child2 to child1. - child2.Merge() + // Merge child2 to child1 and dispose of it. + api.Merge(child2) + api.Dispose(child2) // Merge child1 a few times to the top level api. - child1.Merge() - child1.Merge() - child1.Merge() - child1.Merge() + api.Merge(child1) + api.Merge(child1) + api.Merge(child1) + api.Merge(child1) + + // And dispose of it. + api.Dispose(child1) // Read some data from the top level api.
r, _ = api.NextRune() api.Accept() - AssertEqual(t, "abcdefg", api.Result().String(), "api string end result") - AssertEqual(t, "line 1, column 8", api.Result().Cursor().String(), "api cursor end result") + AssertEqual(t, "abcdefg", api.String(), "api string end result") + AssertEqual(t, "line 1, column 8", api.Cursor().String(), "api cursor end result") +} + +func TestClearRunes(t *testing.T) { + api := tokenize.NewAPI("Laphroaig") + api.NextRune() // Read 'L' + api.Accept() // Add to runes + api.NextRune() // Read 'a' + api.Accept() // Add to runes + api.ClearRunes() // Clear the runes, giving us a fresh start. + api.NextRune() // Read 'p' + api.Accept() // Add to runes + api.NextRune() // Read 'r' + api.Accept() // Add to runes + + AssertEqual(t, "ph", api.String(), "api string end result") +} + +func TestMergeScenariosForTokens(t *testing.T) { + api := tokenize.NewAPI("") + + token1 := tokenize.Token{Value: 1} + token2 := tokenize.Token{Value: 2} + token3 := tokenize.Token{Value: 3} + token4 := tokenize.Token{Value: 4} + + api.SetTokens(token1) + tokens := api.Tokens() + AssertEqual(t, 1, len(tokens), "Tokens 1") + + child := api.Fork() + + tokens = api.Tokens() + AssertEqual(t, 0, len(tokens), "Tokens 2") + + api.AddTokens(token2) + + // Here we can merge by expanding the token slice on the parent, + // because the end of the parent slice and the start of the child + // slice align. + api.Merge(child) + api.Dispose(child) + + tokens = api.Tokens() + AssertEqual(t, 2, len(tokens), "Tokens 3") + + child = api.Fork() + api.AddTokens(token3) + api.Reset() + api.AddTokens(token4) + + // Here the merge means that token4 will be copied to the end of + // the token slice of the parent, since there's a gap at the place + // where token3 used to be. + api.Merge(child) + api.Dispose(child) + + tokens = api.Tokens() + AssertEqual(t, 3, len(tokens), "Tokens 4") + AssertEqual(t, 1, api.TokenValue(0).(int), "Tokens 4, value 0") + AssertEqual(t, 2, api.TokenValue(1).(int), "Tokens 4, value 1") + AssertEqual(t, 4, api.TokenValue(2).(int), "Tokens 4, value 2") } diff --git a/tokenize/assertions_test.go b/tokenize/assertions_test.go index 6c48ee5..6b11beb 100644 --- a/tokenize/assertions_test.go +++ b/tokenize/assertions_test.go @@ -6,7 +6,7 @@ import ( "regexp" "testing" - "git.makaay.nl/mauricem/go-parsekit/tokenize" + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize" ) func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { @@ -110,9 +110,6 @@ func AssertTokenMaker(t *testing.T, test TokenMakerT) { if expected.Type != actual.Type { t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type) } - if string(expected.Runes) != string(actual.Runes) { - t.Errorf("Unexpected Runes in result.Tokens[%d]:\nexpected: %q\nactual: %q", i, expected.Runes, actual.Runes) - } if expected.Value != actual.Value { t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value) } diff --git a/tokenize2/callerinfo_test.go b/tokenize/callerinfo_test.go similarity index 97% rename from tokenize2/callerinfo_test.go rename to tokenize/callerinfo_test.go index d0f1107..f287318 100644 --- a/tokenize2/callerinfo_test.go +++ b/tokenize/callerinfo_test.go @@ -1,4 +1,4 @@ -package tokenize2 +package tokenize import ( "strings" diff --git a/tokenize/handler.go b/tokenize/handler.go index 43ae975..cd9241a 100644 --- 
a/tokenize/handler.go +++ b/tokenize/handler.go @@ -7,11 +7,11 @@ package tokenize // A Handler function gets an API as its input and returns a boolean to // indicate whether or not it found a match on the input. The API is used // for retrieving input data to match against and for reporting back results. -type Handler func(t API) bool +type Handler func(t *API) bool // Match is syntactic sugar that allows you to write a construction like // NewTokenizer(handler).Execute(input) as handler.Match(input). -func (handler Handler) Match(input interface{}) (*Result, error) { +func (handler Handler) Match(input interface{}) (*API, error) { tokenizer := New(handler) return tokenizer(input) } diff --git a/tokenize/handler_test.go b/tokenize/handler_test.go index ccee843..6f929bc 100644 --- a/tokenize/handler_test.go +++ b/tokenize/handler_test.go @@ -4,7 +4,7 @@ import ( "fmt" "testing" - "git.makaay.nl/mauricem/go-parsekit/tokenize" + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize" ) func TestSyntacticSugar(t *testing.T) { diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index c1a856d..5fc3f13 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -230,7 +230,7 @@ var A = struct { Lower: MatchUnicodeLower(), Upper: MatchUnicodeUpper(), HexDigit: MatchHexDigit(), - Octet: MatchOctet(false), + Octet: MatchOctet(true), IPv4: MatchIPv4(true), IPv4CIDRMask: MatchIPv4CIDRMask(true), IPv4Netmask: MatchIPv4Netmask(true), @@ -306,7 +306,7 @@ var T = struct { Float64 func(interface{}, Handler) Handler Boolean func(interface{}, Handler) Handler ByValue func(toktype interface{}, handler Handler, value interface{}) Handler - ByCallback func(toktype interface{}, handler Handler, makeValue func(t API) interface{}) Handler + ByCallback func(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler Group func(interface{}, Handler) Handler }{ Str: MakeStrLiteralToken, @@ -405,9 +405,9 @@ func MatchUnicodeSpace() Handler { // Note that the callback function matches the signature of the unicode.Is* functions, // so those can be used. E.g. MatchRuneByCallback(unicode.IsLower). func MatchRuneByCallback(callback func(rune) bool) Handler { - return func(t API) bool { - input, err := t.NextRune() - if err == nil && callback(input) { + return func(t *API) bool { + r, err := t.NextRune() + if err == nil && callback(r) { t.Accept() return true } @@ -422,9 +422,9 @@ func MatchEndOfLine() Handler { // MatchStr creates a Handler that matches the input against the provided string. func MatchStr(expected string) Handler { - var handlers = []Handler{} - for _, r := range expected { - handlers = append(handlers, MatchRune(r)) + var handlers = make([]Handler, len(expected)) + for i, r := range expected { + handlers[i] = MatchRune(r) } return MatchSeq(handlers...) } @@ -453,16 +453,20 @@ func MatchOptional(handler Handler) Handler { // applied in their exact order. Only if all Handlers apply, the sequence // reports successful match. func MatchSeq(handlers ...Handler) Handler { - return func(t API) bool { + return func(t *API) bool { child := t.Fork() for _, handler := range handlers { - subchild := child.Fork() - if !handler(subchild) { + subchild := t.Fork() + if !handler(t) { + t.Dispose(subchild) + t.Dispose(child) return false } - subchild.Merge() + t.Merge(subchild) + t.Dispose(subchild) } - child.Merge() + t.Merge(child) + t.Dispose(child) return true } } @@ -471,14 +475,17 @@ func MatchSeq(handlers ...Handler) Handler { // can be applied. 
They are applied in their provided order. The first Handler // that applies is used for reporting back a match. func MatchAny(handlers ...Handler) Handler { - return func(t API) bool { + return func(t *API) bool { for _, handler := range handlers { child := t.Fork() - if handler(child) { - child.Merge() + if handler(t) { + t.Merge(child) + t.Dispose(child) return true } + t.Dispose(child) // TODO switch to Reset() and move forking outside the loop? } + return false } } @@ -487,10 +494,13 @@ // the current input. If it does, then a failed match will be reported. If it // does not, then the next rune from the input will be reported as a match. func MatchNot(handler Handler) Handler { - return func(t API) bool { - if handler(t.Fork()) { + return func(t *API) bool { + child := t.Fork() + if handler(t) { + t.Dispose(child) return false } + t.Dispose(child) _, err := t.NextRune() if err == nil { t.Accept() @@ -568,28 +578,30 @@ func matchMinMax(min int, max int, handler Handler, name string) Handler { if max >= 0 && min > max { callerPanic(name, "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min) } - return func(t API) bool { + return func(t *API) bool { total := 0 + // Check for the minimum required number of matches. + child := t.Fork() for total < min { total++ - child := t.Fork() - if !handler(child) { + if !handler(t) { + t.Dispose(child) return false } - child.Merge() } + // No specified max: include the rest of the available matches. // Specified max: include the rest of the available matches, up to the max. for max < 0 || total < max { total++ - child := t.Fork() - if !handler(child) { + if !handler(t) { break } - child.Merge() } + t.Merge(child) + t.Dispose(child) return true } } @@ -607,10 +619,13 @@ func MatchSeparated(separator Handler, separated Handler) Handler { // applied. If the handler applies, but the except Handler as well, then the match // as a whole will be treated as a mismatch. func MatchExcept(handler Handler, except Handler) Handler { - return func(t API) bool { - if except(t.Fork()) { + return func(t *API) bool { + child := t.Fork() + if except(t) { + t.Dispose(child) return false } + t.Dispose(child) return handler(t) } } // When both handlers match, the match for the handler is accepted and the match // for the lookAhead handler is ignored. func MatchFollowedBy(lookAhead Handler, handler Handler) Handler { - return func(t API) bool { - child := t.Fork() - if handler(child) && lookAhead(child.Fork()) { - child.Merge() - return true + return func(t *API) bool { + if handler(t) { + child := t.Fork() + result := lookAhead(t) + t.Dispose(child) + return result } return false } } // If the handler matches and the lookAhead handler doesn't, then the match for // the handler is accepted.
func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler { - return func(t API) bool { - child := t.Fork() - if handler(child) && !lookAhead(child.Fork()) { - child.Merge() - return true + return func(t *API) bool { + if handler(t) { + child := t.Fork() + result := !lookAhead(t) + t.Dispose(child) + return result } return false } @@ -654,14 +671,14 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler { // // Without flushing the input, the input reader will allocate memory // during the parsing process, eventually enough to hold the full input -// in memory. By wrapping Handlers with DoFlushInput, you can tell parsekit +// in memory. By wrapping Handlers with an input flusher, you can tell parsekit // that the accumulated input so far will no longer be needed, allowing // this input to be flushed from memory. // // Rule of thumb is: only use it when you have to actually fix a memory // hogging issue for your use case. func MakeInputFlusher(handler Handler) Handler { - return func(t API) bool { + return func(t *API) bool { if handler(t) { t.FlushInput() return true @@ -689,11 +706,12 @@ func MatchIntegerBetween(min int64, max int64) Handler { callerPanic("MatchIntegerBetween", "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min) } digits := MatchSigned(MatchDigits()) - return func(t API) bool { + + return func(t *API) bool { if !digits(t) { return false } - value, _ := strconv.ParseInt(t.Result().String(), 10, 64) + value, _ := strconv.ParseInt(t.String(), 10, 64) if value < min || value > max { return false } @@ -705,9 +723,10 @@ func MatchIntegerBetween(min int64, max int64) Handler { // has been reached. This Handler will never produce output. It only reports // a successful or a failing match through its boolean return value. func MatchEndOfFile() Handler { - return func(t API) bool { + return func(t *API) bool { child := t.Fork() - _, err := child.NextRune() + _, err := t.NextRune() + t.Dispose(child) return err == io.EOF } } @@ -723,7 +742,7 @@ func MatchUntilEndOfLine() Handler { // read from the input. Invalid runes on the input are replaced with the UTF8 // replacement rune \uFFFD (i.e. utf8.RuneError), which displays as �. func MatchAnyRune() Handler { - return func(t API) bool { + return func(t *API) bool { _, err := t.NextRune() if err == nil { t.Accept() @@ -736,7 +755,7 @@ func MatchAnyRune() Handler { // MatchValidRune creates a Handler function that checks if a valid // UTF8 rune can be read from the input. func MatchValidRune() Handler { - return func(t API) bool { + return func(t *API) bool { r, err := t.NextRune() if err == nil && r != utf8.RuneError { t.Accept() @@ -749,7 +768,7 @@ func MatchValidRune() Handler { // MatchInvalidRune creates a Handler function that checks if an invalid // UTF8 rune can be read from the input. func MatchInvalidRune() Handler { - return func(t API) bool { + return func(t *API) bool { r, err := t.NextRune() if err == nil && r == utf8.RuneError { t.Accept() @@ -860,20 +879,20 @@ func MatchHexDigit() Handler { // stripped from the octet. 
func MatchOctet(normalize bool) Handler { max3Digits := MatchMinMax(1, 3, MatchDigit()) - return func(t API) bool { + return func(t *API) bool { if !max3Digits(t) { return false } - value, _ := strconv.ParseInt(t.Result().String(), 10, 16) + value, _ := strconv.ParseInt(t.String(), 10, 16) if value > 255 { return false } if normalize { - runes := t.Result().Runes() + runes := t.Runes() for len(runes) > 1 && runes[0] == '0' { runes = runes[1:] } - t.Result().SetRunes(runes) + t.SetRunes(runes...) } return true } @@ -909,20 +928,19 @@ func MatchIPv4Netmask(normalize bool) Handler { dot := MatchRune('.') netmask := MatchSeq(octet, dot, octet, dot, octet, dot, octet) - return func(t API) bool { + return func(t *API) bool { if !netmask(t) { return false } - // Check if the mask is provided in canonical form (ones followed by zeroes). - r := t.Result() - mask := net.IPv4Mask(r.Value(0).(byte), r.Value(1).(byte), r.Value(2).(byte), r.Value(3).(byte)) + // Check if the mask is provided in canonical form (at the binary level, ones followed by zeroes). + mask := net.IPv4Mask(t.TokenValue(0).(byte), t.TokenValue(1).(byte), t.TokenValue(2).(byte), t.TokenValue(3).(byte)) ones, bits := mask.Size() if ones == 0 && bits == 0 { return false } - r.ClearTokens() + t.ClearTokens() return true } } @@ -942,7 +960,7 @@ func MatchIPv4Net(normalize bool) Handler { MakeUint8Token("cidr", MatchIPv4CIDRMask(normalize))) ipnet := MatchSeq(ip, slash, mask) - return func(t API) bool { + return func(t *API) bool { if !ipnet(t) { return false } @@ -951,19 +969,18 @@ func MatchIPv4Net(normalize bool) Handler { return true } - r := t.Result() - maskToken := r.Token(1) + maskToken := t.Token(1) if maskToken.Type == "cidr" { - r.SetRunes(fmt.Sprintf("%s/%d", r.Value(0), r.Value(1).(uint8))) + t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), t.TokenValue(1).(uint8))) } else { - o := strings.Split(r.Value(1).(string), ".") + o := strings.Split(t.TokenValue(1).(string), ".") b := func(idx int) byte { i, _ := strconv.Atoi(o[idx]); return byte(i) } mask := net.IPv4Mask(b(0), b(1), b(2), b(3)) bits, _ := mask.Size() - r.SetRunes(fmt.Sprintf("%s/%d", r.Value(0), bits)) + t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), bits)) } - r.ClearTokens() + t.ClearTokens() return true } } @@ -975,7 +992,7 @@ func MatchIPv6(normalize bool) Handler { colon := MatchRune(':') empty := MatchSeq(colon, colon) - return func(t API) bool { + return func(t *API) bool { nrOfHextets := 0 for nrOfHextets < 8 { if hextet(t) { @@ -992,13 +1009,13 @@ func MatchIPv6(normalize bool) Handler { } // Invalid IPv6, when net.ParseIP() cannot handle it. - parsed := net.ParseIP(t.Result().String()) + parsed := net.ParseIP(t.String()) if parsed == nil { return false } if normalize { - t.Result().SetRunes(parsed.String()) + t.SetString(parsed.String()) } return true } @@ -1017,13 +1034,12 @@ func matchCIDRMask(bits int64, normalize bool) Handler { return mask } - return func(t API) bool { + return func(t *API) bool { if !mask(t) { return false } - r := t.Result() - bits, _ := strconv.Atoi(r.String()) - t.Result().SetRunes(fmt.Sprintf("%d", bits)) + bits, _ := strconv.Atoi(t.String()) + t.SetString(fmt.Sprintf("%d", bits)) return true } } @@ -1057,13 +1073,15 @@ func MatchIPv6Net(normalize bool) Handler { // string "bork" would not match against the second form, but " bork" would. // In both cases, it would match the first form. 
func ModifyDrop(handler Handler) Handler { - return func(t API) bool { + return func(t *API) bool { child := t.Fork() - if handler(child) { - child.Reset() - child.Merge() + if handler(t) { + t.Reset() + t.Merge(child) + t.Dispose(child) return true } + t.Dispose(child) return false } } @@ -1137,14 +1155,16 @@ func ModifyReplace(handler Handler, replaceWith string) Handler { // modified string on output. The return value of the modfunc will replace the // resulting output. func ModifyByCallback(handler Handler, modfunc func(string) string) Handler { - return func(t API) bool { + return func(t *API) bool { child := t.Fork() - if handler(child) { - s := modfunc(child.Result().String()) - child.Result().SetRunes(s) - child.Merge() + if handler(t) { + s := modfunc(t.String()) + t.SetString(s) + t.Merge(child) + t.Dispose(child) return true } + t.Dispose(child) return false } } @@ -1155,8 +1175,8 @@ func ModifyByCallback(handler Handler, modfunc func(string) string) Handler { // escape sequence like "\n" is kept as-is (a backslash character, followed by // an 'n'-character). func MakeStrLiteralToken(toktype interface{}, handler Handler) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { - literal := t.Result().String() + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { + literal := t.String() return literal }) } @@ -1166,9 +1186,9 @@ func MakeStrLiteralToken(toktype interface{}, handler Handler) Handler { // representation of the read Runes. This string is interpreted, meaning that an // escape sequence like "\n" is translated to an actual newline control character func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { // TODO ERROR HANDLING - interpreted, _ := interpretString(t.Result().String()) + interpreted, _ := interpretString(t.String()) return interpreted }) } @@ -1190,9 +1210,9 @@ func interpretString(str string) (string, error) { // Result, for which the Token.Value is set to a Rune-representation // of the read Rune. func MakeRuneToken(toktype interface{}, handler Handler) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { // TODO ERROR HANDLING --- not a 1 rune input - return t.Result().Rune(0) + return t.Rune(0) }) } @@ -1200,9 +1220,9 @@ func MakeRuneToken(toktype interface{}, handler Handler) Handler { // Result, for which the Token.Value is set to a Byte-representation // of the read Rune. func MakeByteToken(toktype interface{}, handler Handler) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { // TODO ERROR HANDLING --- not a 1 byte input - return byte(t.Result().Rune(0)) + return byte(t.Rune(0)) }) } @@ -1406,8 +1426,8 @@ func MakeBooleanToken(toktype interface{}, handler Handler) Handler { } func makeStrconvToken(name string, toktype interface{}, handler Handler, convert func(s string) (interface{}, error)) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { - value, err := convert(t.Result().String()) + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { + value, err := convert(t.String()) if err != nil { // TODO meh, panic feels so bad here. Maybe just turn this case into "no match"? 
panic(fmt.Sprintf("%s token invalid (%s)", name, err)) @@ -1419,17 +1439,17 @@ func makeStrconvToken(name string, toktype interface{}, handler Handler, convert // MakeTokenByValue creates a Handler that will add a static Token value // to the Result. func MakeTokenByValue(toktype interface{}, handler Handler, value interface{}) Handler { - return MakeTokenByCallback(toktype, handler, func(t API) interface{} { return value }) + return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { return value }) } // MakeTokenByCallback creates a Handler that will add a Token to the // Result, for which the Token.Value is to be generated by the provided // makeValue() callback function. The function gets the current API as // its input and must return the token value. -func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t API) interface{}) Handler { - return func(t API) bool { +func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler { + return func(t *API) bool { child := t.Fork() - if handler(child) { + if handler(t) { // The token is not added to the child here. The child might have produced its own // tokens and we want those to come after the token for the current parsing level. // By adding the token to the input API and then merging the child tokens, the order @@ -1437,12 +1457,14 @@ func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t // e.g. when a parsing hierarchy looks like ("date" ("year", "month", "day")), the // tokens will end up in the order "date", "year", "month", "day". If we had added the // token to the child here, the order would have been "year", "month", "day", "date". - token := Token{Type: toktype, Runes: child.Result().Runes(), Value: makeValue(child)} - t.Result().AddTokens(token) - child.Merge() + token := Token{Type: toktype, Value: makeValue(t)} + t.AddTokens(token) + t.Merge(child) + t.Dispose(child) return true } + t.Dispose(child) return false } } @@ -1450,15 +1472,18 @@ func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t // MakeTokenGroup checks if the provided handler matches the input. If yes, then it will // take the tokens as produced by the handler and group them together in a single token.
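The ordering guarantee described in the comment above is easiest to see with a small example. The sketch below is hypothetical usage code, not part of this patch; it builds a token whose Value is computed by a callback from the runes the wrapped handler accepted, and the expected output is inferred from Token's String() formatting:

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize"
    )

    func main() {
        c, a := tokenize.C, tokenize.A
        digits := c.OneOrMore(a.Digit)

        // The callback runs while the API is still at the forked level,
        // so t.Runes() holds exactly the runes that digits accepted.
        counted := tokenize.MakeTokenByCallback("count", digits,
            func(t *tokenize.API) interface{} { return len(t.Runes()) })

        api, _ := tokenize.New(counted)("12345X")
        fmt.Println(api.Tokens()) // expected: [count((int)5)]
    }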
func MakeTokenGroup(toktype interface{}, handler Handler) Handler { - return func(t API) bool { + return func(t *API) bool { child := t.Fork() - if handler(child) { - result := child.Result() - token := Token{Type: toktype, Runes: result.Runes(), Value: result.Tokens()} - result.SetTokens(token) - child.Merge() + if handler(t) { + tokens := t.Tokens() + tokensCopy := make([]Token, len(tokens)) + copy(tokensCopy, tokens) + t.SetTokens(Token{Type: toktype, Value: tokensCopy}) + t.Merge(child) + t.Dispose(child) return true } + t.Dispose(child) return false } } diff --git a/tokenize/handlers_builtin_test.go b/tokenize/handlers_builtin_test.go index 5d8a867..af121b9 100644 --- a/tokenize/handlers_builtin_test.go +++ b/tokenize/handlers_builtin_test.go @@ -4,22 +4,32 @@ import ( "fmt" "testing" - "git.makaay.nl/mauricem/go-parsekit/tokenize" + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize" ) +func TestCombinatorsTempDebug(t *testing.T) { + var a = tokenize.A + AssertHandlers(t, []HandlerT{ + // {"024", a.IPv4CIDRMask, true, "24"}, + // {"024", a.Octet, true, "24"}, + {"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"}, + }) +} + func TestCombinators(t *testing.T) { var c, a, m = tokenize.C, tokenize.A, tokenize.M AssertHandlers(t, []HandlerT{ - {"abc", c.Not(a.Rune('b')), true, "a"}, - {"bcd", c.Not(a.Rune('b')), false, ""}, - {"bcd", c.Not(a.Rune('b')), false, ""}, - {"1010", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"}, - {"2020", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""}, - {"abc", c.Any(a.Rune('a'), a.Rune('b')), true, "a"}, - {"bcd", c.Any(a.Rune('a'), a.Rune('b')), true, "b"}, - {"cde", c.Any(a.Rune('a'), a.Rune('b')), false, ""}, - {"ababc", c.Repeated(4, a.Runes('a', 'b')), true, "abab"}, - {"ababc", c.Repeated(5, a.Runes('a', 'b')), false, ""}, + {"", c.Not(a.Rune('b')), false, ""}, + {"abc not", c.Not(a.Rune('b')), true, "a"}, + {"bcd not", c.Not(a.Rune('b')), false, ""}, + {"aaaxxxb", c.OneOrMore(c.Not(a.Rune('b'))), true, "aaaxxx"}, + {"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"}, + {"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""}, + {"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"}, + {"bcd any", c.Any(a.Rune('a'), a.Rune('b')), true, "b"}, + {"cde any", c.Any(a.Rune('a'), a.Rune('b')), false, ""}, + {"ababc repeated", c.Repeated(4, a.Runes('a', 'b')), true, "abab"}, + {"ababc repeated", c.Repeated(5, a.Runes('a', 'b')), false, ""}, {"", c.Min(0, a.Rune('a')), true, ""}, {"a", c.Min(0, a.Rune('a')), true, "a"}, {"aaaaa", c.Min(4, a.Rune('a')), true, "aaaaa"}, @@ -53,6 +63,7 @@ func TestCombinators(t *testing.T) { {"X", c.ZeroOrMore(a.Rune('e')), true, ""}, {"eX", c.ZeroOrMore(a.Rune('e')), true, "e"}, {"eeeeeX", c.ZeroOrMore(a.Rune('e')), true, "eeeee"}, + {"HI!", c.Seq(a.Rune('H'), a.Rune('I'), a.Rune('!')), true, "HI!"}, {"Hello, world!X", c.Seq(a.Str("Hello"), a.Comma, a.Space, a.Str("world"), a.Excl), true, "Hello, world!"}, {"101010123", c.OneOrMore(c.Seq(a.Rune('1'), a.Rune('0'))), true, "101010"}, {"", c.Optional(c.OneOrMore(a.Rune('f'))), true, ""}, @@ -62,8 +73,20 @@ func TestCombinators(t *testing.T) { {"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"}, {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, {" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""}, - {" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""}, - {" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" a", 
m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "a"}, + {"a ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "a"}, + {" a ", m.TrimSpace(c.OneOrMore(a.AnyRune)), true, "a"}, + {"ab", c.FollowedBy(a.Rune('b'), a.Rune('a')), true, "a"}, + {"ba", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""}, + {"aa", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""}, + {"aaabbbcccddd", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), true, "aaabbbccc"}, + {"aaabbbcccxxx", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), false, ""}, + {"xy", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"}, + {"yx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""}, + {"xx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"}, + {"xa", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""}, + {"xxxyyyzzzaaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), false, ""}, + {"xxxyyyzzzbaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), true, "xxxyyyzzz"}, }) } @@ -110,8 +133,10 @@ func TestAtoms(t *testing.T) { {"\xbc with AnyRune", a.AnyRune, true, "�"}, {"", a.AnyRune, false, ""}, {"⌘", a.ValidRune, true, "⌘"}, - {"\xbc with ValidRune", a.ValidRune, false, "�"}, + {"\xbc with ValidRune", a.ValidRune, false, ""}, {"", a.ValidRune, false, ""}, + {"\xbc with InvalidRune", a.InvalidRune, true, "�"}, + {"ok with InvalidRune", a.InvalidRune, false, ""}, {" ", a.Space, true, " "}, {"X", a.Space, false, ""}, {"\t", a.Tab, true, "\t"}, @@ -225,38 +250,73 @@ func TestAtoms(t *testing.T) { {"0", a.IntegerBetween(-10, 10), true, "0"}, {"10", a.IntegerBetween(-10, 10), true, "10"}, {"11", a.IntegerBetween(0, 10), false, ""}, + {"fifteen", a.IntegerBetween(0, 10), false, ""}, }) } func TestIPv4Atoms(t *testing.T) { var a = tokenize.A AssertHandlers(t, []HandlerT{ + // Not normalized octet. + {"0X", tokenize.MatchOctet(false), true, "0"}, + {"00X", tokenize.MatchOctet(false), true, "00"}, + {"000X", tokenize.MatchOctet(false), true, "000"}, + {"10X", tokenize.MatchOctet(false), true, "10"}, + {"010X", tokenize.MatchOctet(false), true, "010"}, + {"255123", tokenize.MatchOctet(false), true, "255"}, + {"256123", tokenize.MatchOctet(false), false, ""}, + {"300", tokenize.MatchOctet(false), false, ""}, + + // Octet. + {"0", tokenize.MatchOctet(false), true, "0"}, + {"02", tokenize.MatchOctet(false), true, "02"}, + {"003", tokenize.MatchOctet(false), true, "003"}, + {"256", tokenize.MatchOctet(false), false, ""}, {"0X", a.Octet, true, "0"}, - {"00X", a.Octet, true, "00"}, - {"000X", a.Octet, true, "000"}, + {"00X", a.Octet, true, "0"}, + {"000X", a.Octet, true, "0"}, {"10X", a.Octet, true, "10"}, - {"010X", a.Octet, true, "010"}, + {"010X", a.Octet, true, "10"}, {"255123", a.Octet, true, "255"}, {"256123", a.Octet, false, ""}, {"300", a.Octet, false, ""}, + + // IPv4 address. + {"0.0.0.0", tokenize.MatchIPv4(false), true, "0.0.0.0"}, + {"010.0.255.01", tokenize.MatchIPv4(false), true, "010.0.255.01"}, {"0.0.0.0", a.IPv4, true, "0.0.0.0"}, {"10.20.30.40", a.IPv4, true, "10.20.30.40"}, {"010.020.003.004", a.IPv4, true, "10.20.3.4"}, {"255.255.255.255", a.IPv4, true, "255.255.255.255"}, {"256.255.255.255", a.IPv4, false, ""}, + + // IPv4 CIDR netmask. 
+ {"0", tokenize.MatchIPv4CIDRMask(false), true, "0"}, + {"000", tokenize.MatchIPv4CIDRMask(false), true, "000"}, {"0", a.IPv4CIDRMask, true, "0"}, + {"00", a.IPv4CIDRMask, true, "0"}, + {"000", a.IPv4CIDRMask, true, "0"}, {"32", a.IPv4CIDRMask, true, "32"}, + {"032", a.IPv4CIDRMask, true, "32"}, {"33", a.IPv4CIDRMask, false, ""}, + + // IPv4 netmask in dotted quad format. + {"0.0.0.0", tokenize.MatchIPv4Netmask(false), true, "0.0.0.0"}, + {"255.128.000.000", tokenize.MatchIPv4Netmask(false), true, "255.128.000.000"}, {"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"}, {"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"}, {"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"}, {"255.255.132.0", a.IPv4Netmask, false, ""}, // not a canonical netmask (1-bits followed by 0-bits) + + // IPv4 address + CIDR or dotted quad netmask. {"192.168.6.123", a.IPv4Net, false, ""}, + {"192.168.6.123/24", tokenize.MatchIPv4Net(false), true, "192.168.6.123/24"}, + {"001.002.003.004/016", tokenize.MatchIPv4Net(false), true, "001.002.003.004/016"}, {"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"}, {"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"}, {"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"}, - {"10.0.0.10/193.0.0.0", a.IPv4Net, false, ""}, // invalid netmask and 193 is also invalid cidr - {"10.0.0.10/16.0.0.0", a.IPv4Net, true, "10.0.0.10/16"}, // invalid netmask, but 16 cidr is ok, remainder input = ".0.0.0" + {"10.0.0.10/193.0.0.0", a.IPv4Net, false, ""}, // invalid netmask and 193 is also invalid cidr + {"010.000.000.010/16.000.000.000", a.IPv4Net, true, "10.0.0.10/16"}, // invalid netmask, but 16 cidr is ok, remainder input = ".0.0.0" }) } @@ -292,7 +352,10 @@ func TestIPv6Atoms(t *testing.T) { func TestModifiers(t *testing.T) { var c, a, m = tokenize.C, tokenize.A, tokenize.M AssertHandlers(t, []HandlerT{ + {"missed me!", m.Drop(a.Rune('w')), false, ""}, + {"where are you?", m.Drop(a.Rune('w')), true, ""}, {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"}, + {"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"}, {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, @@ -300,6 +363,7 @@ func TestModifiers(t *testing.T) { {" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, {"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"}, {"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"}, + {"abcdefghijk", m.ByCallback(a.Str("xyz"), func(s string) string { return "X" }), false, ""}, {"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, {"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"}, }) @@ -323,64 +387,99 @@ func TestTokenMakers(t *testing.T) { var c, a, tok = tokenize.C, tokenize.A, tokenize.T AssertTokenMakers(t, []TokenMakerT{ {`empty token`, tok.Str("A", c.ZeroOrMore(a.Digit)), - []tokenize.Token{{Type: "A", Runes: []rune(""), Value: ""}}}, + []tokenize.Token{{Type: "A", Value: ""}}}, {`Ѝюج literal \string`, tok.Str("B", c.OneOrMore(a.AnyRune)), - []tokenize.Token{{Type: "B", Runes: []rune(`Ѝюج literal \string`), Value: `Ѝюج literal \string`}}}, + []tokenize.Token{{Type: "B", Value: `Ѝюج literal \string`}}}, {`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)), - []tokenize.Token{{Type: "C", 
Runes: []rune(`Ѝюجinterpreted \n string \u2318`), Value: "Ѝюجinterpreted \n string ⌘"}}}, + []tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}}, - {"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Runes: []rune("Ø"), Value: byte('Ø')}}}, + {`\uD801 invalid rune`, tok.StrInterpreted("D", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "D", Value: "� invalid rune"}}}, + + // I don't check the returned error here, but it's good enough to see that the parsing + // stopped after the illegal \g escape sequence. + {`invalid \g escape`, tok.StrInterpreted("E", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "E", Value: "invalid "}}}, + + {"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}}, {"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{ - {Type: "bar", Runes: []rune("R"), Value: byte('R')}, - {Type: "bar", Runes: []rune("O"), Value: byte('O')}, - {Type: "bar", Runes: []rune("C"), Value: byte('C')}, - {Type: "bar", Runes: []rune("K"), Value: byte('K')}, - {Type: "bar", Runes: []rune("S"), Value: byte('S')}, + {Type: "bar", Value: byte('R')}, + {Type: "bar", Value: byte('O')}, + {Type: "bar", Value: byte('C')}, + {Type: "bar", Value: byte('K')}, + {Type: "bar", Value: byte('S')}, }}, - {"Ø*", tok.Rune("P", a.AnyRune), []tokenize.Token{{Type: "P", Runes: []rune("Ø"), Value: rune('Ø')}}}, + {"Ø*", tok.Rune("P", a.AnyRune), []tokenize.Token{{Type: "P", Value: rune('Ø')}}}, - {`2147483647XYZ`, tok.Int("D", a.Integer), []tokenize.Token{{Type: "D", Runes: []rune("2147483647"), Value: int(2147483647)}}}, - {`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []tokenize.Token{{Type: "D", Runes: []rune("-2147483647"), Value: int(-2147483647)}}}, - {`127XYZ`, tok.Int8("E", a.Integer), []tokenize.Token{{Type: "E", Runes: []rune("127"), Value: int8(127)}}}, - {`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []tokenize.Token{{Type: "E", Runes: []rune("-127"), Value: int8(-127)}}}, - {`32767XYZ`, tok.Int16("F", a.Integer), []tokenize.Token{{Type: "F", Runes: []rune("32767"), Value: int16(32767)}}}, - {`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []tokenize.Token{{Type: "F", Runes: []rune("-32767"), Value: int16(-32767)}}}, - {`2147483647XYZ`, tok.Int32("G", a.Integer), []tokenize.Token{{Type: "G", Runes: []rune("2147483647"), Value: int32(2147483647)}}}, - {`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []tokenize.Token{{Type: "G", Runes: []rune("-2147483647"), Value: int32(-2147483647)}}}, - {`-9223372036854775807XYZ`, tok.Int64("H", a.Signed(a.Integer)), []tokenize.Token{{Type: "H", Runes: []rune("-9223372036854775807"), Value: int64(-9223372036854775807)}}}, + {`2147483647XYZ`, tok.Int("D", a.Integer), []tokenize.Token{{Type: "D", Value: int(2147483647)}}}, + {`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []tokenize.Token{{Type: "D", Value: int(-2147483647)}}}, + {`127XYZ`, tok.Int8("E", a.Integer), []tokenize.Token{{Type: "E", Value: int8(127)}}}, + {`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []tokenize.Token{{Type: "E", Value: int8(-127)}}}, + {`32767XYZ`, tok.Int16("F", a.Integer), []tokenize.Token{{Type: "F", Value: int16(32767)}}}, + {`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []tokenize.Token{{Type: "F", Value: int16(-32767)}}}, + {`2147483647XYZ`, tok.Int32("G", a.Integer), []tokenize.Token{{Type: "G", Value: int32(2147483647)}}}, + {`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []tokenize.Token{{Type: "G", Value: int32(-2147483647)}}}, + {`-9223372036854775807XYZ`, tok.Int64("H", 
a.Signed(a.Integer)), []tokenize.Token{{Type: "H", Value: int64(-9223372036854775807)}}}, - {`4294967295`, tok.Uint("I", a.Integer), []tokenize.Token{{Type: "I", Runes: []rune("4294967295"), Value: uint(4294967295)}}}, - {`255XYZ`, tok.Uint8("J", a.Integer), []tokenize.Token{{Type: "J", Runes: []rune("255"), Value: uint8(255)}}}, - {`65535XYZ`, tok.Uint16("K", a.Integer), []tokenize.Token{{Type: "K", Runes: []rune("65535"), Value: uint16(65535)}}}, - {`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Runes: []rune("4294967295"), Value: uint32(4294967295)}}}, - {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Runes: []rune("18446744073709551615"), Value: uint64(18446744073709551615)}}}, + {`4294967295`, tok.Uint("I", a.Integer), []tokenize.Token{{Type: "I", Value: uint(4294967295)}}}, + {`255XYZ`, tok.Uint8("J", a.Integer), []tokenize.Token{{Type: "J", Value: uint8(255)}}}, + {`65535XYZ`, tok.Uint16("K", a.Integer), []tokenize.Token{{Type: "K", Value: uint16(65535)}}}, + {`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Value: uint32(4294967295)}}}, + {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}}, - {`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Runes: []rune("3.1415"), Value: float32(3.1415)}}}, - {`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Runes: []rune("24.19287"), Value: float64(24.19287)}}}, + {`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}}, + {`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}}, {`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ - {Type: "P", Runes: []rune("1"), Value: true}, - {Type: "P", Runes: []rune("t"), Value: true}, - {Type: "P", Runes: []rune("T"), Value: true}, - {Type: "P", Runes: []rune("true"), Value: true}, - {Type: "P", Runes: []rune("TRUE"), Value: true}, - {Type: "P", Runes: []rune("True"), Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, }}, {`0fFfalseFALSEFalse`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ - {Type: "P", Runes: []rune("0"), Value: false}, - {Type: "P", Runes: []rune("f"), Value: false}, - {Type: "P", Runes: []rune("F"), Value: false}, - {Type: "P", Runes: []rune("false"), Value: false}, - {Type: "P", Runes: []rune("FALSE"), Value: false}, - {Type: "P", Runes: []rune("False"), Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, }}, + + {`anything`, tok.ByValue("Q", c.OneOrMore(a.AnyRune), "Kaboom!"), []tokenize.Token{{Type: "Q", Value: "Kaboom!"}}}, }) } +func TestTokenGroup_Match(t *testing.T) { + var c, a, tok = tokenize.C, tokenize.A, tokenize.T + tokenizer := tokenize.New(tok.Group("Group", + c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter)))) + + api, err := tokenizer("xxxxx") + AssertTrue(t, err == nil, "Tokenizer result") + tokens := api.Tokens() + AssertEqual(t, 1, len(tokens), "Length of tokens slice") + contained := tokens[0].Value.([]tokenize.Token) + AssertEqual(t, 3, len(contained), "Length of contained tokens") + AssertEqual(t, 1, contained[0].Type.(int), "Value of contained Token 1") 
+ AssertEqual(t, 2, contained[1].Type.(int), "Value of contained Token 2") + AssertEqual(t, 3, contained[2].Type.(int), "Value of contained Token 3") +} + +func TestTokenGroup_Mismatch(t *testing.T) { + var c, a, tok = tokenize.C, tokenize.A, tokenize.T + tokenizer := tokenize.New(tok.Group("Group", + c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))).Optional()) + + api, err := tokenizer("12345") + AssertTrue(t, err == nil, "Tokenizer result") + tokens := api.Tokens() + AssertEqual(t, 0, len(tokens), "Length of tokens slice") +} + // I know, this is hell, but that's the whole point for this test :-> func TestCombination(t *testing.T) { var c, a, m = tokenize.C, tokenize.A, tokenize.M diff --git a/tokenize/result.go b/tokenize/result.go deleted file mode 100644 index 467e6f1..0000000 --- a/tokenize/result.go +++ /dev/null @@ -1,155 +0,0 @@ -package tokenize - -import ( - "fmt" -) - -// Result is a struct that is used for holding tokenizer results as produced -// by a tokenize.Handler. It also provides the API that Handlers and Parsers -// can use to store and retrieve the results. -type Result struct { - runes []rune // runes as added to the result by tokenize.Handler functions - tokens []Token // Tokens as added to the result by tokenize.Handler functions - cursor Cursor // current read cursor position, relative to the start of the file - offset int // current rune offset relative to the Reader's sliding window - err error // can be used by a Handler to report a specific issue with the input -} - -// Token defines a lexical token as produced by tokenize.Handlers. -// -// The only mandatory data in a Token are the Runes. The Type and Value fields -// are optional fields that can be filled with data at will. -// -// The use of the Type field is to let a tokenizer communicate to -// the parser what type of token it's handling. -// -// The use of the Value field is to store any kind af data along with the token. -// One use of this can be found in the built-in token maker functions like -// MakeInt8Token(), which store an interpreted version of the input string -// in the Value field. -type Token struct { - Runes []rune // the runes that make up the token - Type interface{} // optional token type, can be any type that a parser author sees fit - Value interface{} // optional token value, of any type as well -} - -func (t Token) String() string { - tokenType := "" - if t.Type != nil { - tokenType = fmt.Sprintf("%v", t.Type) - } - - value := "" - if t.Value != nil { - switch t.Value.(type) { - case []*Token: - return fmt.Sprintf("%v%v", tokenType, t.Value) - case string: - value = fmt.Sprintf("%q", t.Value) - case rune: - value = fmt.Sprintf("%v", t.Value) - case bool: - value = fmt.Sprintf("%v", t.Value) - default: - value = fmt.Sprintf("(%T)%v", t.Value, t.Value) - } - } - - return fmt.Sprintf("%v(%s)", tokenType, value) -} - -// newResult initializes an empty Result struct. -func newResult() Result { - return Result{} -} - -// ClearRunes clears the runes in the Result. -func (r *Result) ClearRunes() { - r.runes = []rune{} -} - -// SetRunes replaces the Runes from the Result with the provided input. -func (r *Result) SetRunes(s ...interface{}) { - r.ClearRunes() - r.addRunes("SetRunes", s...) -} - -// AddRunes is used to add runes to the Result. -func (r *Result) AddRunes(set ...interface{}) { - r.addRunes("AddRunes", set...) 
-} - -func (r *Result) addRunes(name string, set ...interface{}) { - for _, s := range set { - switch s := s.(type) { - case string: - r.runes = append(r.runes, []rune(s)...) - case []rune: - r.runes = append(r.runes, s...) - case rune: - r.runes = append(r.runes, s) - default: - callerPanic(name, "tokenize.Result.{name}(): unsupported type '%T' used at {caller}", s) - } - } -} - -// Runes retrieves the Runes from the Result. -func (r *Result) Runes() []rune { - return r.runes -} - -// Rune retrieve a single rune from the Result at the specified index. -func (r *Result) Rune(idx int) rune { - return r.runes[idx] -} - -// String returns the Runes from the Result as a string. -func (r *Result) String() string { - return string(r.runes) -} - -// ClearTokens clears the tokens in the Result. -func (r *Result) ClearTokens() { - r.tokens = []Token{} -} - -// SetTokens replaces the Tokens from the Result with the provided tokens. -func (r *Result) SetTokens(tokens ...Token) { - r.tokens = tokens -} - -// AddTokens is used to add Tokens to the Result. -func (r *Result) AddTokens(tokens ...Token) { - r.tokens = append(r.tokens, tokens...) -} - -// Tokens retrieves the Tokens from the Result. -func (r *Result) Tokens() []Token { - return r.tokens -} - -// Token retrieves a single Token from the Result at the specified index. -func (r *Result) Token(idx int) Token { - return r.tokens[idx] -} - -// Values retrieves a slice containing only the Values for the Result Tokens. -func (r *Result) Values() []interface{} { - values := make([]interface{}, len(r.tokens)) - for i, tok := range r.tokens { - values[i] = tok.Value - } - return values -} - -// Value retrieves a single Value from the Result Token at the specified index. -func (r *Result) Value(idx int) interface{} { - return r.tokens[idx].Value -} - -// Cursor retrieves the read cursor from the Result. This is the first -// cursor position after the runes that were read and accepted by the Handler. 
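With result.go removed, the accessors below no longer live on a separate Result struct; they have been folded into the API itself (see tokenize/api.go in this patch). A rough migration map for handler code, using only method names that appear elsewhere in this patch:

    // Old (via the Result struct):      New (directly on *tokenize.API):
    //
    //   t.Result().Runes()              t.Runes()
    //   t.Result().Rune(0)              t.Rune(0)
    //   t.Result().String()             t.String()
    //   t.Result().SetRunes("...")      t.SetString("...") or t.SetRunes('x', 'y')
    //   t.Result().AddTokens(tok)       t.AddTokens(tok)
    //   t.Result().Tokens()             t.Tokens()
    //   t.Result().Token(0)             t.Token(0)
    //   t.Result().Value(0)             t.TokenValue(0)
    //   t.Result().ClearTokens()        t.ClearTokens()
    //   t.Result().Cursor()             t.Cursor()

Note that the old SetRunes() accepted mixed interface{} arguments (strings, runes, rune slices), while the new API splits this into the typed SetString()/SetRunes() pair.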
-func (r *Result) Cursor() Cursor { - return r.cursor -} diff --git a/tokenize/result_test.go b/tokenize/result_test.go deleted file mode 100644 index 6966da2..0000000 --- a/tokenize/result_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package tokenize_test - -import ( - "fmt" - "strings" - "testing" - - "git.makaay.nl/mauricem/go-parsekit/tokenize" -) - -func ExampleToken() { - t0 := tokenize.Token{} - - t1 := tokenize.Token{ - Type: "Number", - Value: 224, - } - - const TName = 1 - - t2 := tokenize.Token{ - Type: TName, - Value: "John", - } - - t3 := tokenize.Token{ - Value: 42, - } - - fmt.Printf("%s\n%s\n%s\n%s\n", t0, t1, t2, t3) - - // Result: [ip("0.0.0.0") mask((int8)0)] - // Result: [ip("192.168.0.1") mask((int8)24)] - // Result: [ip("255.255.255.255") mask((int8)32)] - // Error: mismatch at start of file - // Error: mismatch at start of file -} - -func TestSetResult_AcceptsVariousTypesAsInput(t *testing.T) { - i := tokenize.NewAPI(strings.NewReader("Testing")) - i.Result().SetRunes("string") - AssertEqual(t, "string", string(i.Result().String()), "i.Result() with string input") - i.Result().SetRunes([]rune("rune slice")) - AssertEqual(t, "rune slice", string(i.Result().String()), "i.Result() with rune slice input") - i.Result().SetRunes('X') - AssertEqual(t, "X", string(i.Result().String()), "i.Result() with rune input") -} - -func TestSetResult_PanicsOnUnhandledInput(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := tokenize.NewAPI(strings.NewReader("Testing")) - i.Result().SetRunes(1234567) - }, - Regexp: true, - Expect: `tokenize\.Result\.SetRunes\(\): unsupported type 'int' used at /.*/result_test.go:\d+`, - }) -} diff --git a/tokenize2/token.go b/tokenize/token.go similarity index 98% rename from tokenize2/token.go rename to tokenize/token.go index a55add4..bdf0e1b 100644 --- a/tokenize2/token.go +++ b/tokenize/token.go @@ -1,4 +1,4 @@ -package tokenize2 +package tokenize import ( "fmt" diff --git a/tokenize2/token_test.go b/tokenize/token_test.go similarity index 89% rename from tokenize2/token_test.go rename to tokenize/token_test.go index e82b7cf..c5117d0 100644 --- a/tokenize2/token_test.go +++ b/tokenize/token_test.go @@ -1,9 +1,9 @@ -package tokenize2_test +package tokenize_test import ( "fmt" - tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize" ) func ExampleToken_String() { diff --git a/tokenize/tokenize.go b/tokenize/tokenize.go index 50b5104..8a725ba 100644 --- a/tokenize/tokenize.go +++ b/tokenize/tokenize.go @@ -9,7 +9,7 @@ import ( // Func is the function signature as returned by New: a function that takes // any supported type of input, executes a tokenizer run and returns a // Result struct (possibly nil) and an error (possibly nil). -type Func func(input interface{}) (*Result, error) +type Func func(input interface{}) (*API, error) // New instantiates a new tokenizer. // @@ -28,7 +28,7 @@ type Func func(input interface{}) (*Result, error) // against the provided input data. For an overview of allowed inputs, take a // look at the documentation for parsekit.read.New(). 
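Since Func now returns the *API itself instead of a *Result, callers read the outcome through the same accessors that handlers use. A minimal sketch (hypothetical example program; A.Str is the matcher used in the test files of this patch):

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize"
    )

    func main() {
        greeting := tokenize.New(tokenize.A.Str("Hello"))
        api, err := greeting("Hello, world!")
        if err != nil {
            fmt.Println("no match:", err)
            return
        }
        fmt.Println(api.String()) // expected: Hello
    }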
func New(tokenHandler Handler) Func { - return func(input interface{}) (*Result, error) { + return func(input interface{}) (*API, error) { api := NewAPI(input) ok := tokenHandler(api) @@ -36,6 +36,6 @@ func New(tokenHandler Handler) Func { err := fmt.Errorf("mismatch at %s", Cursor{}) return nil, err } - return api.Result(), nil + return api, nil } } diff --git a/tokenize/tokenizer_test.go b/tokenize/tokenizer_test.go index ebc508d..bbe6f0b 100644 --- a/tokenize/tokenizer_test.go +++ b/tokenize/tokenizer_test.go @@ -7,7 +7,7 @@ import ( "testing" "unicode/utf8" - "git.makaay.nl/mauricem/go-parsekit/tokenize" + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize" ) // TODO For error handling, it would be really cool if for example the @@ -55,7 +55,7 @@ func ExampleNew() { func TestCallingNextRune_ReturnsNextRune(t *testing.T) { api := makeTokenizeAPI() - r, _ := (&api).NextRune() + r, _ := api.NextRune() AssertEqual(t, 'T', r, "first rune") } @@ -67,7 +67,7 @@ func TestInputCanAcceptRunesFromReader(t *testing.T) { i.Accept() i.NextRune() i.Accept() - AssertEqual(t, "Tes", i.Result().String(), "i.Result().String()") + AssertEqual(t, "Tes", i.String(), "i.String()") } func TestCallingNextRuneTwice_Panics(t *testing.T) { @@ -78,52 +78,92 @@ func TestCallingNextRuneTwice_Panics(t *testing.T) { i.NextRune() }, Regexp: true, - Expect: `tokenize\.API\.NextRune\(\): NextRune\(\) called at /.*/tokenizer_test\.go:\d+ without a prior call to Accept\(\)`, + Expect: `tokenize\.API\.NextRune\(\): NextRune\(\) called at /.*_test\.go:\d+ ` + + `without a prior call to Accept\(\)`, }) } func TestCallingAcceptWithoutCallingNextRune_Panics(t *testing.T) { - input := makeTokenizeAPI() + api := makeTokenizeAPI() AssertPanic(t, PanicT{ - Function: (&input).Accept, + Function: api.Accept, Regexp: true, - Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*/assertions_test\.go:\d+ without first calling NextRune()`, + Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*test\.go:\d+ ` + + `without first calling NextRune\(\)`, }) } -func TestCallingMergeOnNonForkedChild_Panics(t *testing.T) { +func TestCallingAcceptAfterReadError_Panics(t *testing.T) { + api := tokenize.NewAPI("") + AssertPanic(t, PanicT{ + Function: func() { + api.NextRune() + api.Accept() + }, + Regexp: true, + Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+` + + `, but the prior call to NextRune\(\) failed`, + }) +} + +func TestCallingMergeOnTopLevelAPI_Panics(t *testing.T) { AssertPanic(t, PanicT{ Function: func() { i := makeTokenizeAPI() - i.Merge() + i.Merge(0) }, Regexp: true, Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ on the top-level API`}) } -func TestCallingNextRuneOnForkedParent_DetachesForkedChild(t *testing.T) { +func TestCallingMergeOnForkParentAPI_Panics(t *testing.T) { AssertPanic(t, PanicT{ Function: func() { i := makeTokenizeAPI() - f := i.Fork() - i.NextRune() - f.Merge() + child := i.Fork() + i.Fork() + i.Merge(child) }, Regexp: true, - Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ using a non-active API fork.*`}) + Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + + `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) } -func TestCallingForkOnForkedParent_DetachesForkedChild(t *testing.T) { +func TestCallingDisposeOnTopLevelAPI_Panics(t *testing.T) { AssertPanic(t, PanicT{ Function: func() { i := makeTokenizeAPI() - f := i.Fork() - g := f.Fork() - i.Fork() - 
g.Merge() + i.Dispose(0) }, Regexp: true, - Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ using a non-active API fork.*`}) + Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ on the top-level API`}) +} + +func TestCallingDisposeOnForkParentAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + child := i.Fork() + i.Fork() + i.Dispose(child) + }, + Regexp: true, + Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ ` + + `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) +} + +func TestCallingForkOnForkedParentAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.Fork() + g := i.Fork() + i.Fork() + i.Merge(g) + }, + Regexp: true, + Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + + `on API stack level 2, but the current stack level is 3 \(forgot to Dispose\(\) a forked child\?\)`}) } func TestForkingInput_ClearsLastRune(t *testing.T) { @@ -135,26 +175,26 @@ func TestForkingInput_ClearsLastRune(t *testing.T) { i.Accept() }, Regexp: true, - Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /hom.*/tokenizer_test\.go:\d+ without first calling NextRune\(\)`, + Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+ without first calling NextRune\(\)`, }) } func TestAccept_UpdatesCursor(t *testing.T) { i := tokenize.NewAPI(strings.NewReader("input\r\nwith\r\nnewlines")) - AssertEqual(t, "start of file", i.Result().Cursor().String(), "cursor 1") + AssertEqual(t, "start of file", i.Cursor().String(), "cursor 1") for j := 0; j < 6; j++ { // read "input\r", cursor end up at "\n" i.NextRune() i.Accept() } - AssertEqual(t, "line 1, column 7", i.Result().Cursor().String(), "cursor 2") + AssertEqual(t, "line 1, column 7", i.Cursor().String(), "cursor 2") i.NextRune() // read "\n", cursor ends up at start of new line i.Accept() - AssertEqual(t, "line 2, column 1", i.Result().Cursor().String(), "cursor 3") + AssertEqual(t, "line 2, column 1", i.Cursor().String(), "cursor 3") for j := 0; j < 10; j++ { // read "with\r\nnewl", cursor end up at "i" i.NextRune() i.Accept() } - AssertEqual(t, "line 3, column 5", i.Result().Cursor().String(), "cursor 4") + AssertEqual(t, "line 3, column 5", i.Cursor().String(), "cursor 4") } func TestWhenCallingNextruneAtEndOfFile_EOFIsReturned(t *testing.T) { @@ -167,16 +207,17 @@ func TestWhenCallingNextruneAtEndOfFile_EOFIsReturned(t *testing.T) { } func TestAfterReadingruneAtEndOfFile_EarlierRunesCanStillBeAccessed(t *testing.T) { i := tokenize.NewAPI(strings.NewReader("X")) - f := i.Fork() - f.NextRune() - f.Accept() - r, err := f.NextRune() + child := i.Fork() + i.NextRune() + i.Accept() + r, err := i.NextRune() AssertEqual(t, true, r == utf8.RuneError, "returned rune from 2nd NextRune()") - r, err = i.NextRune() + i.Dispose(child) // brings the read offset back to the start + r, err = i.NextRune() // so here we should see the same rune AssertEqual(t, 'X', r, "returned rune from 2nd NextRune()") AssertEqual(t, true, err == nil, "returned error from 2nd NextRune()") } -func makeTokenizeAPI() tokenize.API { +func makeTokenizeAPI() *tokenize.API { return tokenize.NewAPI("Testing") } diff --git a/tokenize/tokenizer_whitebox_test.go b/tokenize/tokenizer_whitebox_test.go index f2fd27d..6f26b15 100644 --- a/tokenize/tokenizer_whitebox_test.go +++ b/tokenize/tokenizer_whitebox_test.go @@ -5,33 +5,33 @@ import ( ) func 
TestFork_CreatesForkOfInputAtSameCursorPosition(t *testing.T) { - // TODO FIXME Speed change // Create input, accept the first rune. i := NewAPI("Testing") i.NextRune() i.Accept() // T - AssertEqual(t, "T", i.Result().String(), "accepted rune in input") + AssertEqual(t, "T", i.String(), "accepted rune in input") // Fork - f := i.Fork() - AssertEqual(t, 1, i.state.stack[i.stackLevel].cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 1, i.state.stack[i.stackLevel].offset, "parent offset") - AssertEqual(t, 1, f.state.stack[f.stackLevel].cursor.Byte, "child cursor.Byte") - AssertEqual(t, 1, f.state.stack[f.stackLevel].offset, "child offset") + child := i.Fork() + AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte") + AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset") + AssertEqual(t, 1, i.stackFrame.cursor.Byte, "child cursor.Byte") + AssertEqual(t, 1, i.stackFrame.offset, "child offset") // Accept two runes via fork. - f.NextRune() - f.Accept() // e - f.NextRune() - f.Accept() // s - AssertEqual(t, "es", f.Result().String(), "result runes in fork") - AssertEqual(t, 1, i.state.stack[i.stackLevel].cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 1, i.state.stack[i.stackLevel].offset, "parent offset") - AssertEqual(t, 3, f.state.stack[f.stackLevel].cursor.Byte, "child cursor.Byte") - AssertEqual(t, 3, f.state.stack[f.stackLevel].offset, "child offset") + i.NextRune() + i.Accept() // e + i.NextRune() + i.Accept() // s + AssertEqual(t, "es", i.String(), "result runes in fork") + AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte") + AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset") + AssertEqual(t, 3, i.stackFrame.cursor.Byte, "child cursor.Byte") + AssertEqual(t, 3, i.stackFrame.offset, "child offset") // Merge fork back into parent - f.Merge() - AssertEqual(t, "Tes", i.Result().String(), "result runes in parent Input after Merge()") - AssertEqual(t, 3, i.state.stack[i.stackLevel].cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 3, i.state.stack[i.stackLevel].offset, "parent offset") + i.Merge(child) + i.Dispose(child) + AssertEqual(t, "Tes", i.String(), "result runes in parent Input after Merge()") + AssertEqual(t, 3, i.stackFrame.cursor.Byte, "parent cursor.Byte") + AssertEqual(t, 3, i.stackFrame.offset, "parent offset") } func TestGivenForkedChildWhichAcceptedRune_AfterMerging_RuneEndsUpInParentResult(t *testing.T) { @@ -39,86 +39,83 @@ i.NextRune() i.Accept() f1 := i.Fork() - f1.NextRune() - f1.Accept() - f2 := f1.Fork() - f2.NextRune() - f2.Accept() - // TODO FIXME Speed changes - // AssertEqual(t, "T", i.Result().String(), "i.Result().String()") - // AssertEqual(t, 1, i.result.offset, "i.offset A") - // AssertEqual(t, "e", f1.Result().String(), "f1.Result().String()") - // AssertEqual(t, 2, f1.result.offset, "f1.offset A") - // AssertEqual(t, "s", f2.Result().String(), "f2.Result().String()") - // AssertEqual(t, 3, f2.result.offset, "f2.offset A") - // f2.Merge() - // AssertEqual(t, "T", i.Result().String(), "i.Result().String()") - // AssertEqual(t, 1, i.result.offset, "i.offset B") - // AssertEqual(t, "es", f1.Result().String(), "f1.Result().String()") - // AssertEqual(t, 3, f1.result.offset, "f1.offset B") - // AssertEqual(t, "", f2.Result().String(), "f2.Result().String()") - // AssertEqual(t, 3, f2.result.offset, "f2.offset B") - // f1.Merge() - // AssertEqual(t, "Tes", i.Result().String(), "i.Result().String()") - //
AssertEqual(t, 3, i.result.offset, "i.offset C") - // AssertEqual(t, "", f1.Result().String(), "f1.Result().String()") - // AssertEqual(t, 3, f1.result.offset, "f1.offset C") - // AssertEqual(t, "", f2.Result().String(), "f2.Result().String()") - // AssertEqual(t, 3, f2.result.offset, "f2.offset C") -} - -func TestGivenMultipleLevelsOfForks_WhenReturningToRootInput_ForksAreDetached(t *testing.T) { - i := NewAPI("Testing") - f1 := i.Fork() - f2 := f1.Fork() - //f3 := f2.Fork() - f2.Fork() - f4 := f1.Fork() // secret subtest: this Fork() detaches both forks f2 and f3 - //f5 := f4.Fork() - f4.Fork() - // TODO FIXME Speed changes - // AssertEqual(t, true, i.parent == nil, "i.parent == nil") - // AssertEqual(t, true, i.child == &f1, "i.child == f1") - // AssertEqual(t, true, f1.parent == &i, "f1.parent == i") - // AssertEqual(t, true, f1.child == &f4, "f1.child == f4") - // AssertEqual(t, true, f2.child == nil, "f2.child == nil") - // AssertEqual(t, true, f2.parent == nil, "f2.parent == nil") - // AssertEqual(t, true, f3.child == nil, "f3.child == nil") - // AssertEqual(t, true, f3.parent == nil, "f3.parent == nil") - // AssertEqual(t, true, f4.parent == &f1, "f4.parent == f1") - // AssertEqual(t, true, f4.child == &f5, "f4.child == f5") - // AssertEqual(t, true, f5.parent == &f4, "f5.parent == f4") - // AssertEqual(t, true, f5.child == nil, "f5.child == nil") - i.NextRune() - - // AssertEqual(t, true, i.parent == nil, "i.parent == nil") - // AssertEqual(t, true, i.child == nil, "i.child == nil") - // AssertEqual(t, true, f1.parent == nil, "f1.parent == nil") - // AssertEqual(t, true, f1.child == nil, "f1.child == nil") - // AssertEqual(t, true, f2.child == nil, "f2.child == nil") - // AssertEqual(t, true, f2.parent == nil, "f2.parent == nil") - // AssertEqual(t, true, f3.child == nil, "f3.child == nil") - // AssertEqual(t, true, f3.parent == nil, "f3.parent == nil") - // AssertEqual(t, true, f4.parent == nil, "f4.parent == nil") - // AssertEqual(t, true, f4.child == nil, "f4.child == nil") - // AssertEqual(t, true, f5.parent == nil, "f5.parent == nil") - // AssertEqual(t, true, f5.child == nil, "f5.child == nil") + i.Accept() + f2 := i.Fork() + i.NextRune() + i.Accept() + AssertEqual(t, "s", i.String(), "f2 String()") + AssertEqual(t, 3, i.stackFrame.offset, "f2.offset A") + i.Merge(f2) + i.Dispose(f2) + AssertEqual(t, "es", i.String(), "f1 String()") + AssertEqual(t, 3, i.stackFrame.offset, "f1.offset A") + i.Merge(f1) + i.Dispose(f1) + AssertEqual(t, "Tes", i.String(), "top-level API String()") + AssertEqual(t, 3, i.stackFrame.offset, "f1.offset A") } func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *testing.T) { - // TODO FIXME Speed changes i := NewAPI("Testing") r, _ := i.NextRune() AssertEqual(t, 'T', r, "result from 1st call to NextRune()") - // AssertTrue(t, i.result.lastRune != nil, "API.result.lastRune after NextRune() is not nil") + AssertTrue(t, i.lastRune == 'T', "API.lastRune after NextRune() is not 'T'") + AssertTrue(t, i.runeRead, "API.runeRead after NextRune() is not true") i.Accept() - // AssertTrue(t, i.result.lastRune == nil, "API.result.lastRune after Accept() is nil") - // AssertEqual(t, 1, i.result.offset, "API.result.offset") + AssertTrue(t, i.runeRead == false, "API.runeRead after Accept() is not false") + AssertEqual(t, 1, i.stackFrame.offset, "API.stackFrame.offset") r, _ = i.NextRune() AssertEqual(t, 'e', r, "result from 2nd call to NextRune()") } +func TestFlushInput(t *testing.T) { + api := NewAPI("cool") + + // Flushing without any read 
data is okay. FlushInput() will return + false in this case, and nothing else happens. + AssertTrue(t, api.FlushInput() == false, "flush input at start") + + api.NextRune() + api.Accept() + api.NextRune() + api.Accept() + + AssertTrue(t, api.FlushInput() == true, "flush input after reading some data") + AssertEqual(t, 0, api.stackFrame.offset, "offset after flush input") + + AssertTrue(t, api.FlushInput() == false, "flush input after flush input") + + // Read offset is now zero, but reading should continue after "co". + api.NextRune() + api.Accept() + api.NextRune() + api.Accept() + + AssertEqual(t, "cool", api.String(), "end result") +} + +func TestInputFlusherWrapper(t *testing.T) { + runeA := A.Rune('a') + flushB := C.FlushInput(A.Rune('b')) + api := NewAPI("abaab") + runeA(api) + AssertEqual(t, 1, api.stackFrame.offset, "offset after 1 read") + AssertEqual(t, "a", api.String(), "runes after 1 read") + flushB(api) + AssertEqual(t, 0, api.stackFrame.offset, "offset after 2 reads + input flush") + AssertEqual(t, "ab", api.String(), "runes after 2 reads") + runeA(api) + AssertEqual(t, 1, api.stackFrame.offset, "offset after 3 reads") + AssertEqual(t, "aba", api.String(), "runes after 3 reads") + runeA(api) + AssertEqual(t, 2, api.stackFrame.offset, "offset after 4 reads") + AssertEqual(t, "abaa", api.String(), "runes after 4 reads") + flushB(api) + AssertEqual(t, 0, api.stackFrame.offset, "offset after 5 reads + input flush") + AssertEqual(t, "abaab", api.String(), "runes after 5 reads") +} + func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { if expected != actual { t.Errorf( diff --git a/tokenize2/api.go b/tokenize2/api.go deleted file mode 100644 index 2b0aa07..0000000 --- a/tokenize2/api.go +++ /dev/null @@ -1,374 +0,0 @@ -package tokenize2 - -import ( - "git.makaay.nl/mauricem/go-parsekit/read" -) - -// API holds the internal state of a tokenizer run and provides an API that -// tokenize.Handler functions can use to: -// -// • read and accept runes from the input (NextRune, Accept) -// -// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose) -// -// • flush already read input data when not needed anymore (FlushInput) -// -// • retrieve the tokenizer Result struct (Result) to read or modify the results -// -// BASIC OPERATION: -// -// To retrieve the next rune from the API, call the NextRune() method. -// -// When the rune is to be accepted as input, call the method Accept(). The rune -// is then added to the result runes of the API and the read cursor is moved -// forward. -// -// By invoking NextRune() + Accept() multiple times, the result can be extended -// with as many runes as needed. Runes collected this way can later on be -// retrieved using the method Result().Runes(). -// -// It is mandatory to call Accept() after retrieving a rune, before calling -// NextRune() again. Failing to do so will result in a panic. -// -// Next to adding runes to the result, it is also possible to modify the -// stored runes or to add lexical Tokens to the result. For all things -// concerning results, take a look at the Result struct, which -// can be accessed through the method Result(). -// -// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT: -// -// Sometimes, we must be able to perform a lookahead, which might either -// succeed or fail. In case of a failing lookahead, the state of the -// API must be brought back to the original state, so we can try -// a different route.
-// -// The way in which this is supported is by forking an API struct by -// calling method Fork(). This will return a forked child API, with -// empty result data, but using the same read cursor position as the -// forked parent. -// -// After forking, the same interface as described for BASIC OPERATION can be -// used to fill the results. When the lookahead was successful, then -// Merge() can be called on the forked child to append the child's results -// to the parent's results, and to move the read cursor position to that -// of the child. -// -// When the lookahead was unsuccessful, then the forked child API can -// be disposed by calling Dispose() on the forked child. This is not mandatory. -// Garbage collection will take care of this automatically. -// The parent API was never modified, so it can safely be used after disposal -// as if the lookahead never happened. -// -// Opinionated note: -// Many tokenizers/parsers take a different approach on lookaheads by using -// peeks and by moving the read cursor position back and forth, or by putting -// read input back on the input stream. That often leads to code that is -// efficient, however, in my opinion, not very intuitive to read. It can also -// be tedious to get the cursor position back at the correct position, which -// can lead to hard to track bugs. I much prefer this forking method, since -// no bookkeeping has to be implemented when implementing a parser. -type API struct { - reader *read.Buffer // the input data reader - lastRune rune // the rune as retrieved by the last NextRune() call - lastRuneErr error // the error for the last NextRune() call - runeRead bool // whether or not a rune was read using NextRune() - runes []rune // the rune stack - tokens []Token // the token stack - stackFrames []stackFrame // the stack frames, containing stack level-specific data - stackLevel int // the current stack level - stackFrame *stackFrame // the current stack frame -} - -type stackFrame struct { - offset int // current rune offset relative to the Reader's sliding window - runeStart int - runeEnd int - tokenStart int - tokenEnd int - cursor Cursor - - // TODO - err error // can be used by a Handler to report a specific issue with the input -} - -const initialStackDepth = 10 -const initialTokenDepth = 10 -const initialRuneDepth = 10 - -// NewAPI initializes a new API struct, wrapped around the provided input. -// For an overview of allowed inputs, take a look at the documentation -// for parsekit.read.New(). -func NewAPI(input interface{}) *API { - api := &API{ - reader: read.New(input), - runes: make([]rune, 0, initialRuneDepth), - tokens: make([]Token, 0, initialTokenDepth), - stackFrames: make([]stackFrame, 1, initialStackDepth), - } - api.stackFrame = &api.stackFrames[0] - - return api -} - -// NextRune returns the rune at the current read offset. -// -// When an invalid UTF8 rune is encountered on the input, it is replaced with -// the utf8.RuneError rune. It's up to the caller to handle this as an error -// when needed. -// -// After reading a rune it must be Accept()-ed to move the read cursor forward -// to the next rune. Doing so is mandatory. When doing a second call to NextRune() -// without explicitly accepting, this method will panic. You can see this as a -// built-in unit test, enforcing correct serialization of API method calls.
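The read/accept contract described above boils down to a simple loop on the caller's side. A minimal sketch (hypothetical example program; at the end of the input NextRune() returns an error, after which Accept() must not be called):

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize"
    )

    func main() {
        api := tokenize.NewAPI("Tokenize this")
        for {
            if _, err := api.NextRune(); err != nil {
                break // end of input; calling Accept() here would panic
            }
            api.Accept() // mandatory before the next NextRune() call
        }
        fmt.Println(api.String()) // expected: Tokenize this
    }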
-func (i *API) NextRune() (rune, error) { - if i.runeRead { - callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+ - "without a prior call to Accept()") - } - - readRune, err := i.reader.RuneAt(i.stackFrame.offset) - i.lastRune = readRune - i.lastRuneErr = err - i.runeRead = true - - return readRune, err -} - -// Accept the last rune as read by NextRune() into the Result runes and move -// the cursor forward. -// -// It is not allowed to call Accept() when the previous call to NextRune() -// returned an error. Calling Accept() in such case will result in a panic. -func (i *API) Accept() { - if !i.runeRead { - callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+ - "without first calling NextRune()") - } else if i.lastRuneErr != nil { - callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, "+ - "but the prior call to NextRune() failed") - } - - i.runes = append(i.runes, i.lastRune) - i.stackFrame.runeEnd++ - i.stackFrame.cursor.moveByRune(i.lastRune) - i.stackFrame.offset++ - i.runeRead = false -} - -// Fork forks off a child of the API struct. It will reuse the same -// read buffer and cursor position, but for the rest this is a fresh API. -// -// By forking an API, you can freely work with the forked child, without -// affecting the parent API. This is for example useful when you must perform -// some form of lookahead. -// -// When processing of the Handler was successful and you want to add the results -// to the parent API, you can call Merge() on the forked child. -// This will add the results to the results of the parent (runes, tokens). -// It also updates the read cursor position of the parent to that of the child. -// -// When the lookahead was unsuccessful, then the forked child API can -// be disposed by calling Dispose() on the forked child. This is not mandatory. -// Garbage collection will take care of this automatically. -// The parent API was never modified, so it can safely be used after disposal -// as if the lookahead never happened. -func (i *API) Fork() int { - newStackLevel := i.stackLevel + 1 - newStackSize := newStackLevel + 1 - - // Grow the stack frames capacity when needed. - if cap(i.stackFrames) < newStackSize { - newFrames := make([]stackFrame, newStackSize, newStackSize*2) - copy(newFrames, i.stackFrames) - i.stackFrames = newFrames - } else { - i.stackFrames = i.stackFrames[0:newStackSize] - } - - parent := i.stackFrame - i.stackLevel++ - i.stackFrame = &i.stackFrames[i.stackLevel] - *i.stackFrame = *parent - i.stackFrame.runeStart = parent.runeEnd - i.stackFrame.tokenStart = parent.tokenEnd - i.runeRead = false - - return i.stackLevel -} - -// Merge appends the results of a forked child API (runes, tokens) to the -// results of its parent. The read cursor of the parent is also updated -// to that of the forked child. -// -// After the merge operation, the child results are reset so the child can -// immediately be reused for performing another match. This means that all Result data are -// cleared, but the read cursor position is kept at its current position. -// This allows a child to feed results in chunks to its parent. -// -// Once the child is no longer needed, it can be disposed of by using the -// method Dispose(), which will return the tokenizer to the parent.
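Fork(), Merge() and Dispose() now work with integer stack levels instead of separate child API structs. The canonical lookahead pattern inside a Handler, mirroring the updated built-in handlers earlier in this patch (a sketch; the helper name tryLookahead is made up for illustration):

    package main

    import tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize"

    // tryLookahead runs h against a forked stack level. On a match the
    // child's runes, tokens and cursor are merged back into the parent;
    // either way the fork is disposed, returning the API to the parent level.
    func tryLookahead(t *tokenize.API, h tokenize.Handler) bool {
        child := t.Fork()
        if h(t) {
            t.Merge(child)
            t.Dispose(child)
            return true
        }
        t.Dispose(child)
        return false
    }

    func main() {
        api := tokenize.NewAPI("abc")
        if tryLookahead(api, tokenize.A.Rune('a')) {
            _ = api.String() // "a"; a failed lookahead would have left the API untouched
        }
    }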
-func (i *API) Merge(stackLevel int) { - if stackLevel == 0 { - callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ - "on the top-level API stack level 0") - } - if stackLevel != i.stackLevel { - callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ - "on API stack level %d, but the current stack level is %d "+ - "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) - } - - parent := &i.stackFrames[stackLevel-1] - - if parent.runeEnd == i.stackFrame.runeStart { - // The end of the parent slice aligns with the start of the child slice. - // Because of this, to merge the parent slice can simply be expanded - // to include the child slice. - // parent : |----------| - // child: |------| - // After merge operation: - // parent: |-----------------| - // child: |---> continue reading from here - parent.runeEnd = i.stackFrame.runeEnd - i.stackFrame.runeStart = i.stackFrame.runeEnd - } else { - // The end of the parent slice does not align with the start of the - // child slice. The child slice has to be copied onto the end of - // the parent slice. - // parent : |----------| - // child: |------| - // After merge operation: - // parent: |-----------------| - // child: |---> continue reading from here - i.runes = append(i.runes[:parent.runeEnd], i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd]...) - parent.runeEnd = len(i.runes) - i.stackFrame.runeStart = parent.runeEnd - i.stackFrame.runeEnd = parent.runeEnd - } - - // The same logic applies to tokens. - if parent.tokenEnd == i.stackFrame.tokenStart { - parent.tokenEnd = i.stackFrame.tokenEnd - i.stackFrame.tokenStart = i.stackFrame.tokenEnd - } else { - i.tokens = append(i.tokens[:parent.tokenEnd], i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]...) - parent.tokenEnd = len(i.tokens) - i.stackFrame.tokenStart = parent.tokenEnd - i.stackFrame.tokenEnd = parent.tokenEnd - } - - parent.offset = i.stackFrame.offset - parent.cursor = i.stackFrame.cursor - - i.stackFrame.err = nil - i.runeRead = false -} - -func (i *API) Dispose(stackLevel int) { - if stackLevel == 0 { - callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ - "on the top-level API stack level 0") - } - if stackLevel != i.stackLevel { - callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ - "on API stack level %d, but the current stack level is %d "+ - "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) - } - - i.runeRead = false - i.stackLevel = stackLevel - 1 - i.stackFrames = i.stackFrames[:stackLevel] - i.stackFrame = &i.stackFrames[stackLevel-1] - i.runes = i.runes[0:i.stackFrame.runeEnd] - i.tokens = i.tokens[0:i.stackFrame.tokenEnd] -} - -func (i *API) Reset() { - i.runeRead = false - i.stackFrame.runeStart = i.stackFrame.runeEnd - i.stackFrame.tokenStart = i.stackFrame.tokenEnd - i.stackFrame.err = nil -} - -// FlushInput flushes processed input data from the read.Buffer. -// In this context 'processed' means all runes that were read using NextRune() -// and that were added to the results using Accept(). -// -// Note: -// When writing your own TokenHandler, you normally won't have to call this -// method yourself. It is automatically called by parsekit when needed. 
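For completeness, a sketch of what FlushInput() does from the caller's perspective (hypothetical example program; the behavior is inferred from TestFlushInput above: collected runes survive a flush, only the reader's buffer and read offset are affected):

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize"
    )

    func main() {
        api := tokenize.NewAPI("cool")
        api.NextRune()
        api.Accept() // c
        api.NextRune()
        api.Accept() // o

        fmt.Println(api.FlushInput()) // true: "co" is dropped from the read buffer
        fmt.Println(api.FlushInput()) // false: nothing left to flush

        api.NextRune() // reading continues at the second 'o'
        api.Accept()
        fmt.Println(api.String()) // expected: coo
    }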
-func (i *API) FlushInput() bool { - // result := &(i.state.stack[i.stackLevel]) - if i.stackFrame.offset > 0 { - i.reader.Flush(i.stackFrame.offset) - i.stackFrame.offset = 0 - return true - } - return false -} - -func (i *API) String() string { - return string(i.Runes()) -} - -func (i *API) Runes() []rune { - return i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd] -} - -func (i *API) Rune(offset int) rune { - return i.runes[i.stackFrame.runeStart+offset] -} - -func (i *API) ClearRunes() { - i.runes = i.runes[:i.stackFrame.runeStart] - i.stackFrame.runeEnd = i.stackFrame.runeStart -} - -func (i *API) SetRunes(runes ...rune) { - i.runes = append(i.runes[:i.stackFrame.runeStart], runes...) - i.stackFrame.runeEnd = i.stackFrame.runeStart + len(runes) -} - -func (i *API) AddRunes(runes ...rune) { - i.runes = append(i.runes[:i.stackFrame.runeEnd], runes...) - i.stackFrame.runeEnd += len(runes) -} - -func (i *API) AddString(s string) { - i.AddRunes([]rune(s)...) -} - -func (i *API) SetString(s string) { - i.SetRunes([]rune(s)...) -} - -func (i *API) Cursor() Cursor { - return i.stackFrame.cursor -} - -func (i *API) Tokens() []Token { - return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd] -} - -func (i *API) Token(offset int) Token { - return i.tokens[i.stackFrame.tokenStart+offset] -} - -func (i *API) TokenValue(offset int) interface{} { - return i.tokens[i.stackFrame.tokenStart+offset].Value -} - -func (i *API) ClearTokens() { - i.tokens = i.tokens[:i.stackFrame.tokenStart] - i.stackFrame.tokenEnd = i.stackFrame.tokenStart -} - -func (i *API) SetTokens(tokens ...Token) { - i.tokens = append(i.tokens[:i.stackFrame.tokenStart], tokens...) - i.stackFrame.tokenEnd = i.stackFrame.tokenStart + len(tokens) -} - -func (i *API) AddTokens(tokens ...Token) { - i.tokens = append(i.tokens[:i.stackFrame.tokenEnd], tokens...) 
- i.stackFrame.tokenEnd += len(tokens) -} diff --git a/tokenize2/api_test.go b/tokenize2/api_test.go deleted file mode 100644 index e331cd0..0000000 --- a/tokenize2/api_test.go +++ /dev/null @@ -1,330 +0,0 @@ -package tokenize2_test - -import ( - "fmt" - "testing" - - tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" -) - -func ExampleNewAPI() { - tokenize.NewAPI("The input that the API will handle") - - // Output: -} - -func ExampleAPI_NextRune() { - api := tokenize.NewAPI("The input that the API will handle") - r, err := api.NextRune() - fmt.Printf("Rune read from input; %c\n", r) - fmt.Printf("The error: %v\n", err) - fmt.Printf("API results: %q\n", api.String()) - - // Output: - // Rune read from input; T - // The error: - // API results: "" -} - -func ExampleAPI_Accept() { - api := tokenize.NewAPI("The input that the API will handle") - api.NextRune() // reads 'T' - api.Accept() // adds 'T' to the API results - api.NextRune() // reads 'h' - api.Accept() // adds 'h' to the API results - api.NextRune() // reads 'e', but it is not added to the API results - - fmt.Printf("API results: %q\n", api.String()) - - // Output: - // API results: "Th" -} - -func ExampleAPI_modifyingResults() { - api := tokenize.NewAPI("") - - api.AddString("Some runes") - api.AddRunes(' ', 'a', 'd', 'd', 'e', 'd') - api.AddRunes(' ', 'i', 'n', ' ') - api.AddString("various ways") - fmt.Printf("API result first 10 runes: %q\n", api.Runes()[0:10]) - fmt.Printf("API result runes as string: %q\n", api.String()) - - api.SetString("new ") - api.AddString("set ") - api.AddString("of ") - api.AddRunes('r', 'u', 'n', 'e', 's') - fmt.Printf("API result runes as string: %q\n", api.String()) - fmt.Printf("API result runes: %q\n", api.Runes()) - fmt.Printf("API third rune: %q\n", api.Rune(2)) - - api.AddTokens(tokenize.Token{ - Type: 42, - Value: "towel"}) - api.AddTokens(tokenize.Token{ - Type: 73, - Value: "Zaphod"}) - fmt.Printf("API result tokens: %v\n", api.Tokens()) - fmt.Printf("API second result token: %v\n", api.Token(1)) - - // Output: - // API result first 10 runes: ['S' 'o' 'm' 'e' ' ' 'r' 'u' 'n' 'e' 's'] - // API result runes as string: "Some runes added in various ways" - // API result runes as string: "new set of runes" - // API result runes: ['n' 'e' 'w' ' ' 's' 'e' 't' ' ' 'o' 'f' ' ' 'r' 'u' 'n' 'e' 's'] - // API third rune: 'w' - // API result tokens: [42("towel") 73("Zaphod")] - // API second result token: 73("Zaphod") -} - -func ExampleAPI_Reset() { - api := tokenize.NewAPI("Very important input!") - - api.NextRune() - api.Accept() - api.NextRune() - api.Accept() - fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) - - // Reset clears the results, but keeps the cursor position. - api.Reset() - fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) - - api.NextRune() - api.Accept() - api.NextRune() - api.Accept() - fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) - - // Output: - // API results: "Ve" at line 1, column 3 - // API results: "" at line 1, column 3 - // API results: "ry" at line 1, column 5 -} - -func ExampleAPI_Fork() { - // This custom Handler checks for input 'a', 'b' or 'c'. 
-	abcHandler := func(t *tokenize.API) bool {
-		a := tokenize.A
-		for _, r := range []rune{'a', 'b', 'c'} {
-			child := t.Fork() // fork, so we won't change parent t
-			if a.Rune(r)(t) {
-				t.Merge(child)   // accept results into parent of child
-				t.Dispose(child) // return to the parent level
-				return true      // and report a successful match
-			}
-			t.Dispose(child) // return to the parent level
-		}
-		// If we get here, then no match was found. Return false to communicate
-		// this to the caller.
-		return false
-	}
-
-	// Note: a custom Handler is normally not what you need.
-	// You can make use of the parser/combinator tooling to make the
-	// implementation a lot simpler and to take care of forking at
-	// the appropriate places. The handler from above can be replaced with:
-	simpler := tokenize.A.RuneRange('a', 'c')
-
-	result, err := tokenize.New(abcHandler)("another test")
-	fmt.Println(result, err)
-	result, err = tokenize.New(simpler)("curious")
-	fmt.Println(result, err)
-	result, err = tokenize.New(abcHandler)("bang on!")
-	fmt.Println(result, err)
-	result, err = tokenize.New(abcHandler)("not a match")
-	fmt.Println(result, err)
-
-	// Output:
-	// a
-	// c
-	// b
-	// mismatch at start of file
-}
-
-func ExampleAPI_Merge() {
-	tokenHandler := func(t *tokenize.API) bool {
-		child1 := t.Fork()
-		t.NextRune() // reads 'H'
-		t.Accept()
-		t.NextRune() // reads 'i'
-		t.Accept()
-
-		child2 := t.Fork()
-		t.NextRune() // reads ' '
-		t.Accept()
-		t.NextRune() // reads 'm'
-		t.Accept()
-		t.Dispose(child2)
-
-		t.Merge(child1)   // We merge child1, which has read 'H' and 'i' only.
-		t.Dispose(child1) // and clean up child1 to return to the parent
-		return true
-	}
-
-	result, _ := tokenize.New(tokenHandler)("Hi mister X!")
-	fmt.Println(result.String())
-
-	// Output:
-	// Hi
-}
-
-func TestMultipleLevelsOfForksAndMerges(t *testing.T) {
-	api := tokenize.NewAPI("abcdefghijklmnopqrstuvwxyz")
-
-	// Fork a few levels.
-	child1 := api.Fork()
-	child2 := api.Fork()
-	child3 := api.Fork()
-	child4 := api.Fork()
-
-	// Read a rune 'a' from child4.
-	r, _ := api.NextRune()
-	AssertEqual(t, 'a', r, "child4 rune 1")
-	api.Accept()
-	AssertEqual(t, "a", api.String(), "child4 runes after rune 1")
-
-	// Read another rune 'b' from child4.
-	r, _ = api.NextRune()
-	AssertEqual(t, 'b', r, "child4 rune 2")
-	api.Accept()
-	AssertEqual(t, "ab", api.String(), "child4 runes after rune 2")
-
-	// Merge "ab" from child4 to child3.
-	api.Merge(child4)
-	AssertEqual(t, "", api.String(), "child4 runes after first merge")
-
-	// Read some more from child4.
-	r, _ = api.NextRune()
-	AssertEqual(t, 'c', r, "child4 rune 3")
-	api.Accept()
-	AssertEqual(t, "c", api.String(), "child4 runes after rune 3")
-	AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child4 rune 3")
-
-	// Merge "c" from child4 to child3.
-	api.Merge(child4)
-
-	// And dispose of child4, making child3 the active stack level.
-	api.Dispose(child4)
-
-	// Child3 should now have the combined results "abc" from child4's work.
-	AssertEqual(t, "abc", api.String(), "child3 after merge of child4")
-	AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child3 rune 3, after merge of child4")
-
-	// Now read some data from child3.
-	r, _ = api.NextRune()
-	AssertEqual(t, 'd', r, "child3 rune 4")
-	api.Accept()
-
-	r, _ = api.NextRune()
-	AssertEqual(t, 'e', r, "child3 rune 5")
-	api.Accept()
-
-	r, _ = api.NextRune()
-	AssertEqual(t, 'f', r, "child3 rune 6")
-	api.Accept()
-
-	AssertEqual(t, "abcdef", api.String(), "child3 total result after rune 6")
-
-	// Temporarily create some new forks from here, but don't use their outcome.
-	child3sub1 := api.Fork()
-	api.NextRune()
-	api.Accept()
-	api.NextRune()
-	api.Accept()
-	child3sub2 := api.Fork()
-	api.NextRune()
-	api.Accept()
-	api.Merge(child3sub2)   // do merge sub2 down to sub1
-	api.Dispose(child3sub2) // and dispose of sub2
-	api.Dispose(child3sub1) // but dispose of sub1 without merging
-
-	// Instead, merge the results from before this forking detour from child3
-	// to child2, and dispose of child3.
-	api.Merge(child3)
-	api.Dispose(child3)
-
-	AssertEqual(t, "abcdef", api.String(), "child2 total result after merge of child3")
-	AssertEqual(t, "line 1, column 7", api.Cursor().String(), "cursor child2 after merge child3")
-
-	// Merge child2 to child1 and dispose of it.
-	api.Merge(child2)
-	api.Dispose(child2)
-
-	// Merge child1 a few times to the top-level API.
-	api.Merge(child1)
-	api.Merge(child1)
-	api.Merge(child1)
-	api.Merge(child1)
-
-	// And dispose of it.
-	api.Dispose(child1)
-
-	// Read some data from the top-level API.
-	r, _ = api.NextRune()
-	api.Accept()
-
-	AssertEqual(t, "abcdefg", api.String(), "api string end result")
-	AssertEqual(t, "line 1, column 8", api.Cursor().String(), "api cursor end result")
-}
-
-func TestClearRunes(t *testing.T) {
-	api := tokenize.NewAPI("Laphroaig")
-	api.NextRune()   // Read 'L'
-	api.Accept()     // Add to runes
-	api.NextRune()   // Read 'a'
-	api.Accept()     // Add to runes
-	api.ClearRunes() // Clear the runes, giving us a fresh start.
-	api.NextRune()   // Read 'p'
-	api.Accept()     // Add to runes
-	api.NextRune()   // Read 'h'
-	api.Accept()     // Add to runes
-
-	AssertEqual(t, "ph", api.String(), "api string end result")
-}
-
-func TestMergeScenariosForTokens(t *testing.T) {
-	api := tokenize.NewAPI("")
-
-	token1 := tokenize.Token{Value: 1}
-	token2 := tokenize.Token{Value: 2}
-	token3 := tokenize.Token{Value: 3}
-	token4 := tokenize.Token{Value: 4}
-
-	api.SetTokens(token1)
-	tokens := api.Tokens()
-	AssertEqual(t, 1, len(tokens), "Tokens 1")
-
-	child := api.Fork()
-
-	tokens = api.Tokens()
-	AssertEqual(t, 0, len(tokens), "Tokens 2")
-
-	api.AddTokens(token2)
-
-	// Here we can merge by expanding the token slice on the parent,
-	// because the end of the parent slice and the start of the child
-	// slice align.
-	api.Merge(child)
-	api.Dispose(child)
-
-	tokens = api.Tokens()
-	AssertEqual(t, 2, len(tokens), "Tokens 3")
-
-	child = api.Fork()
-	api.AddTokens(token3)
-	api.Reset()
-	api.AddTokens(token4)
-
-	// Here the merge means that token4 will be copied to the end of
-	// the token slice of the parent, since there's a gap at the place
-	// where token3 used to be.
- api.Merge(child) - api.Dispose(child) - - tokens = api.Tokens() - AssertEqual(t, 3, len(tokens), "Tokens 4") - AssertEqual(t, 1, api.TokenValue(0).(int), "Tokens 4, value 0") - AssertEqual(t, 2, api.TokenValue(1).(int), "Tokens 4, value 1") - AssertEqual(t, 4, api.TokenValue(2).(int), "Tokens 4, value 2") -} diff --git a/tokenize2/assertions_test.go b/tokenize2/assertions_test.go deleted file mode 100644 index 7aa8831..0000000 --- a/tokenize2/assertions_test.go +++ /dev/null @@ -1,118 +0,0 @@ -package tokenize2_test - -// This file contains some tools that are used for writing tests. - -import ( - "regexp" - "testing" - - tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" -) - -func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { - if expected != actual { - t.Errorf( - "Unexpected value for %s:\nexpected: %q\nactual: %q", - forWhat, expected, actual) - } -} - -func AssertTrue(t *testing.T, b bool, assertion string) { - if !b { - t.Errorf("Assertion %s is false", assertion) - } -} - -type PanicT struct { - Function func() - Regexp bool - Expect string -} - -func AssertPanics(t *testing.T, testSet []PanicT) { - for _, test := range testSet { - AssertPanic(t, test) - } -} - -func AssertPanic(t *testing.T, p PanicT) { - defer func() { - if r := recover(); r != nil { - mismatch := false - if p.Regexp && !regexp.MustCompile(p.Expect).MatchString(r.(string)) { - mismatch = true - } - if !p.Regexp && p.Expect != r.(string) { - mismatch = true - } - if mismatch { - t.Errorf( - "Code did panic, but unexpected panic message received:\nexpected: %q\nactual: %q", - p.Expect, r) - } - } else { - t.Errorf("Function did not panic (expected panic message: %s)", p.Expect) - } - }() - p.Function() -} - -type HandlerT struct { - Input string - Handler tokenize.Handler - MustMatch bool - Expected string -} - -func AssertHandlers(t *testing.T, testSet []HandlerT) { - for _, test := range testSet { - AssertHandler(t, test) - } -} - -func AssertHandler(t *testing.T, test HandlerT) { - result, err := tokenize.New(test.Handler)(test.Input) - if test.MustMatch { - if err != nil { - t.Errorf("Test %q failed with error: %s", test.Input, err) - } else if output := result.String(); output != test.Expected { - t.Errorf("Test %q failed: not expected output:\nexpected: %q\nactual: %q\n", test.Input, test.Expected, output) - } - } else { - if err == nil { - t.Errorf("Test %q failed: should not match, but it did", test.Input) - } - } -} - -type TokenMakerT struct { - Input string - Handler tokenize.Handler - Expected []tokenize.Token -} - -func AssertTokenMakers(t *testing.T, testSet []TokenMakerT) { - for _, test := range testSet { - AssertTokenMaker(t, test) - } -} - -func AssertTokenMaker(t *testing.T, test TokenMakerT) { - result, err := tokenize.New(test.Handler)(test.Input) - if err != nil { - t.Errorf("Test %q failed with error: %s", test.Input, err) - } else { - if len(result.Tokens()) != len(test.Expected) { - t.Errorf("Unexpected number of tokens in output:\nexpected: %d\nactual: %d", len(test.Expected), len(result.Tokens())) - } - for i, expected := range test.Expected { - actual := result.Token(i) - if expected.Type != actual.Type { - t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type) - } - if expected.Value != actual.Value { - t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value) - 
} - } - } -} diff --git a/tokenize2/callerinfo.go b/tokenize2/callerinfo.go deleted file mode 100644 index dcb4f21..0000000 --- a/tokenize2/callerinfo.go +++ /dev/null @@ -1,33 +0,0 @@ -package tokenize2 - -import ( - "fmt" - "runtime" - "strings" -) - -func callerPanic(name, f string, data ...interface{}) { - filepos := callerBefore(name) - m := fmt.Sprintf(f, data...) - m = strings.Replace(m, "{caller}", filepos, -1) - m = strings.Replace(m, "{name}", name, -1) - panic(m) -} - -func callerBefore(name string) string { - found := false - for i := 1; ; i++ { - pc, file, line, ok := runtime.Caller(i) - if found { - return fmt.Sprintf("%s:%d", file, line) - } - if !ok { - return "unknown caller" - } - f := runtime.FuncForPC(pc) - - if strings.HasSuffix(f.Name(), "."+name) { - found = true - } - } -} diff --git a/tokenize2/cursor.go b/tokenize2/cursor.go deleted file mode 100644 index a5e8799..0000000 --- a/tokenize2/cursor.go +++ /dev/null @@ -1,45 +0,0 @@ -package tokenize2 - -import ( - "fmt" - "unicode/utf8" -) - -// Cursor represents the position of a cursor in various ways. -type Cursor struct { - Byte int // The cursor offset in bytes - Rune int // The cursor offset in UTF8 runes - Column int // The column at which the cursor is (0-indexed) - Line int // The line at which the cursor is (0-indexed) -} - -// String produces a string representation of the cursor position. -func (c Cursor) String() string { - if c.Line == 0 && c.Column == 0 { - return fmt.Sprintf("start of file") - } - return fmt.Sprintf("line %d, column %d", c.Line+1, c.Column+1) -} - -// move updates the position of the cursor, based on the provided input string. -// The input string represents the runes that the cursor must be moved over. -// This method will take newlines into account to keep track of line numbers and -// column positions automatically. 
-func (c *Cursor) move(input string) *Cursor {
-	for _, r := range input {
-		c.moveByRune(r)
-	}
-	return c
-}
-
-func (c *Cursor) moveByRune(r rune) *Cursor {
-	c.Byte += utf8.RuneLen(r)
-	c.Rune++
-	if r == '\n' {
-		c.Column = 0
-		c.Line++
-	} else {
-		c.Column++
-	}
-	return c
-}
diff --git a/tokenize2/cursor_test.go b/tokenize2/cursor_test.go
deleted file mode 100644
index 8569354..0000000
--- a/tokenize2/cursor_test.go
+++ /dev/null
@@ -1,69 +0,0 @@
-package tokenize2
-
-import (
-	"fmt"
-	"testing"
-)
-
-func ExampleCursor_move() {
-	c := Cursor{}
-	fmt.Printf("after initialization : %s\n", c)
-	fmt.Printf("after 'some words'   : %s\n", c.move("some words"))
-	fmt.Printf("after '\\n'           : %s\n", c.move("\n"))
-	fmt.Printf("after '\\r\\nskip\\nlines' : %s\n", c.move("\r\nskip\nlines"))
-
-	// Output:
-	// after initialization : start of file
-	// after 'some words'   : line 1, column 11
-	// after '\n'           : line 2, column 1
-	// after '\r\nskip\nlines' : line 4, column 6
-}
-
-func ExampleCursor_String() {
-	c := Cursor{}
-	fmt.Println(c.String())
-
-	c.move("\nfoobar")
-	fmt.Println(c.String())
-
-	// Output:
-	// start of file
-	// line 2, column 7
-}
-
-func TestGivenCursor_WhenMoving_CursorIsUpdated(t *testing.T) {
-	for _, test := range []struct {
-		name   string
-		input  []string
-		byte   int
-		rune   int
-		line   int
-		column int
-	}{
-		{"No input at all", []string{""}, 0, 0, 0, 0},
-		{"One ASCII char", []string{"a"}, 1, 1, 0, 1},
-		{"Multiple ASCII chars", []string{"abc"}, 3, 3, 0, 3},
-		{"One newline", []string{"\n"}, 1, 1, 1, 0},
-		{"Carriage return", []string{"\r\r\r"}, 3, 3, 0, 3},
-		{"One UTF8 3 byte char", []string{"⌘"}, 3, 1, 0, 1},
-		{"Mixture", []string{"Hello\n\npretty\nW⌘O⌘R⌘L⌘D"}, 31, 23, 3, 9},
-		{"Multiple calls", []string{"hello", "world"}, 10, 10, 0, 10},
-	} {
-		c := Cursor{}
-		for _, s := range test.input {
-			c.move(s)
-		}
-		if c.Byte != test.byte {
-			t.Errorf("[%s] Unexpected byte offset %d (expected %d)", test.name, c.Byte, test.byte)
-		}
-		if c.Rune != test.rune {
-			t.Errorf("[%s] Unexpected rune offset %d (expected %d)", test.name, c.Rune, test.rune)
-		}
-		if c.Line != test.line {
-			t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, c.Line, test.line)
-		}
-		if c.Column != test.column {
-			t.Errorf("[%s] Unexpected column offset %d (expected %d)", test.name, c.Column, test.column)
-		}
-	}
-}
diff --git a/tokenize2/handler.go b/tokenize2/handler.go
deleted file mode 100644
index a2c637b..0000000
--- a/tokenize2/handler.go
+++ /dev/null
@@ -1,53 +0,0 @@
-package tokenize2
-
-// Handler is the function type that is involved in turning a low-level
-// stream of UTF8 runes into lexical tokens. Its purpose is to check if input
-// data matches some kind of pattern and to report back the results.
-//
-// A Handler function gets an API as its input and returns a boolean to
-// indicate whether or not it found a match on the input. The API is used
-// for retrieving input data to match against and for reporting back results.
-type Handler func(t *API) bool
-
-// Match is syntactic sugar that allows you to write a construction like
-// New(handler)(input) as handler.Match(input).
-func (handler Handler) Match(input interface{}) (*API, error) {
-	tokenizer := New(handler)
-	return tokenizer(input)
-}
-
-// Or is syntactic sugar that allows you to write a construction like
-// MatchAny(tokenHandler1, tokenHandler2) as tokenHandler1.Or(tokenHandler2).
-func (handler Handler) Or(otherHandler Handler) Handler {
-	return MatchAny(handler, otherHandler)
-}
-
-// Times is syntactic sugar that allows you to write a construction like
-// MatchRep(3, handler) as handler.Times(3).
-func (handler Handler) Times(n int) Handler {
-	return MatchRep(n, handler)
-}
-
-// Then is syntactic sugar that allows you to write a construction like
-// MatchSeq(handler1, handler2, handler3) as handler1.Then(handler2).Then(handler3).
-func (handler Handler) Then(otherHandler Handler) Handler {
-	return MatchSeq(handler, otherHandler)
-}
-
-// SeparatedBy is syntactic sugar that allows you to write a construction like
-// MatchSeparated(separator, handler) as handler.SeparatedBy(separator).
-func (handler Handler) SeparatedBy(separator Handler) Handler {
-	return MatchSeparated(separator, handler)
-}
-
-// Optional is syntactic sugar that allows you to write a construction like
-// MatchOptional(handler) as handler.Optional().
-func (handler Handler) Optional() Handler {
-	return MatchOptional(handler)
-}
-
-// Except is syntactic sugar that allows you to write a construction like
-// MatchExcept(handler, exceptHandler) as handler.Except(exceptHandler).
-func (handler Handler) Except(exceptHandler Handler) Handler {
-	return MatchExcept(handler, exceptHandler)
-}
diff --git a/tokenize2/handler_test.go b/tokenize2/handler_test.go
deleted file mode 100644
index 7688d1e..0000000
--- a/tokenize2/handler_test.go
+++ /dev/null
@@ -1,101 +0,0 @@
-package tokenize2_test
-
-import (
-	"fmt"
-	"testing"
-
-	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
-)
-
-func TestSyntacticSugar(t *testing.T) {
-	var a = tokenize.A
-	AssertHandlers(t, []HandlerT{
-		{"aaaaaa", a.Rune('a').Times(4), true, "aaaa"},
-		{"ababab", a.Rune('a').Or(a.Rune('b')).Times(4), true, "abab"},
-		{"ababab", a.Rune('a').Then(a.Rune('b')), true, "ab"},
-		{"bababa", a.Rune('a').Then(a.Rune('b')), false, ""},
-		{"cccccc", a.Rune('c').Optional(), true, "c"},
-		{"dddddd", a.Rune('c').Optional(), true, ""},
-		{"a,b,c,d", a.ASCII.SeparatedBy(a.Comma), true, "a,b,c,d"},
-		{"a, b, c, d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space)), true, "a, b, c, d"},
-		{"a, b,c,d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space.Optional())), true, "a, b,c,d"},
-		{"a, b, c, d", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma.Then(a.Space.Optional()))), true, "a, b, c, d"},
-		{"a,b ,c, d|", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma).Then(a.Space.Optional())), true, "a,b ,c, d"},
-	})
-}
-
-func ExampleHandler_Times() {
-	c, a := tokenize.C, tokenize.A
-	phoneNumber := c.Seq(a.Rune('0'), a.Digit.Times(9))
-
-	fmt.Println(phoneNumber.Match("0201234567"))
-	// Output:
-	// 0201234567
-}
-
-func ExampleHandler_Then() {
-	c, a := tokenize.C, tokenize.A
-	phoneNumber := a.Rune('0').Then(c.Repeated(9, a.Digit))
-
-	fmt.Println(phoneNumber.Match("0208888888"))
-	// Output:
-	// 0208888888
-}
-
-func ExampleHandler_Or() {
-	c, a := tokenize.C, tokenize.A
-	phoneNumber := c.Seq(a.Str("00").Or(a.Plus), a.Str("31"), a.DigitNotZero, c.Repeated(8, a.Digit))
-
-	fmt.Println(phoneNumber.Match("+31209876543"))
-	fmt.Println(phoneNumber.Match("0031209876543"))
-	fmt.Println(phoneNumber.Match("0031020991234"))
-	fmt.Println(phoneNumber.Match("0031201234"))
-	// Output:
-	// +31209876543
-	// 0031209876543
-	// mismatch at start of file
-	// mismatch at start of file
-}
-
-func ExampleHandler_SeparatedBy() {
-	a, t := tokenize.A, tokenize.T
-	csv := t.Int("number", a.Digits).SeparatedBy(a.Comma)
-
-	r, _ := csv.Match("123,456,7,8,9")
-	for i, token := range
r.Tokens() { - fmt.Printf("[%d] %v\n", i, token) - } - // Output: - // [0] number((int)123) - // [1] number((int)456) - // [2] number((int)7) - // [3] number((int)8) - // [4] number((int)9) -} - -func ExampleHandler_Optional() { - c, a := tokenize.C, tokenize.A - - spanish := c.Seq( - a.Rune('¿').Optional(), - c.OneOrMore(a.AnyRune.Except(a.Question)), - a.Rune('?').Optional()) - - fmt.Println(spanish.Match("¿Habla español María?")) - fmt.Println(spanish.Match("Sí, María habla español.")) - // Output: - // ¿Habla español María? - // Sí, María habla español. -} - -func ExampleHandler_Match() { - r, err := tokenize.A.IPv4.Match("001.002.003.004") - fmt.Println(r, err) - - r, err = tokenize.A.IPv4.Match("1.2.3") - fmt.Println(r, err) - - // Output: - // 1.2.3.4 - // mismatch at start of file -} diff --git a/tokenize2/handlers_builtin.go b/tokenize2/handlers_builtin.go deleted file mode 100644 index 6821b71..0000000 --- a/tokenize2/handlers_builtin.go +++ /dev/null @@ -1,1489 +0,0 @@ -package tokenize2 - -import ( - "fmt" - "io" - "net" - "strconv" - "strings" - "unicode" - "unicode/utf8" -) - -// C provides convenient access to a range of parser/combinators that can be -// used to construct Handler functions. -// -// Parser/combinators are so called higher order functions that take in one -// or more other Handler functions and output a new Handler. They can be -// used to combine Handler functions in useful ways to create new more complex -// Handler functions. -// -// When using C in your own parser, then it is advised to create a variable -// to reference it, for example: -// -// c := tokenize.C -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. -var C = struct { - Any func(...Handler) Handler - Not func(Handler) Handler - Seq func(...Handler) Handler - Min func(min int, handler Handler) Handler - Max func(max int, handler Handler) Handler - Repeated func(times int, handler Handler) Handler - Optional func(Handler) Handler - ZeroOrMore func(Handler) Handler - OneOrMore func(Handler) Handler - MinMax func(min int, max int, handler Handler) Handler - Separated func(separator Handler, separated Handler) Handler - Except func(except Handler, handler Handler) Handler - FollowedBy func(lookAhead Handler, handler Handler) Handler - NotFollowedBy func(lookAhead Handler, handler Handler) Handler - FlushInput func(Handler) Handler -}{ - Any: MatchAny, - Not: MatchNot, - Seq: MatchSeq, - Min: MatchMin, - Max: MatchMax, - Repeated: MatchRep, - Optional: MatchOptional, - ZeroOrMore: MatchZeroOrMore, - OneOrMore: MatchOneOrMore, - MinMax: MatchMinMax, - Separated: MatchSeparated, - Except: MatchExcept, - FollowedBy: MatchFollowedBy, - NotFollowedBy: MatchNotFollowedBy, - FlushInput: MakeInputFlusher, -} - -// A provides convenient access to a range of atoms or functions to build atoms. -// -// When using A in your own parser, then it is advised to create a variable -// to reference it: -// -// a := tokenize.A -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
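// For example, a few of the atoms below can be combined into a compound
// Handler like this (an illustrative sketch; the input is made up):
//
//	a := tokenize.A
//	assignment := a.Letter.Then(a.Equal).Then(a.Digit)
//	result, _ := tokenize.New(assignment)("x=1")
//	// result.String() == "x=1"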
-var A = struct { - Rune func(rune) Handler - Runes func(...rune) Handler - RuneRange func(rune, rune) Handler - Str func(string) Handler - StrNoCase func(string) Handler - EndOfLine Handler - EndOfFile Handler - UntilEndOfLine Handler - AnyRune Handler - ValidRune Handler - InvalidRune Handler - Space Handler - Tab Handler - CR Handler - LF Handler - CRLF Handler - Excl Handler - DoubleQuote Handler - Hash Handler - Dollar Handler - Percent Handler - Amp Handler - SingleQuote Handler - RoundOpen Handler - LeftParen Handler - RoundClose Handler - RightParen Handler - Asterisk Handler - Multiply Handler - Plus Handler - Add Handler - Comma Handler - Minus Handler - Subtract Handler - Dot Handler - Slash Handler - Divide Handler - Colon Handler - Semicolon Handler - AngleOpen Handler - LessThan Handler - Equal Handler - AngleClose Handler - GreaterThan Handler - Question Handler - At Handler - SquareOpen Handler - Backslash Handler - SquareClose Handler - Caret Handler - Underscore Handler - Backquote Handler - CurlyOpen Handler - Pipe Handler - CurlyClose Handler - Tilde Handler - Newline Handler - Blank Handler - Blanks Handler - Whitespace Handler - UnicodeSpace Handler - Digit Handler - DigitNotZero Handler - Digits Handler - Zero Handler - Float Handler - Boolean Handler - Integer Handler - Signed func(Handler) Handler - IntegerBetween func(min int64, max int64) Handler - ASCII Handler - ASCIILower Handler - ASCIIUpper Handler - Letter Handler - Lower Handler - Upper Handler - HexDigit Handler - Octet Handler - IPv4 Handler - IPv4CIDRMask Handler - IPv4Netmask Handler - IPv4Net Handler - IPv6 Handler - IPv6CIDRMask Handler - IPv6Net Handler -}{ - Rune: MatchRune, - Runes: MatchRunes, - RuneRange: MatchRuneRange, - Str: MatchStr, - StrNoCase: MatchStrNoCase, - EndOfFile: MatchEndOfFile(), - EndOfLine: MatchEndOfLine(), - UntilEndOfLine: MatchUntilEndOfLine(), - AnyRune: MatchAnyRune(), - ValidRune: MatchValidRune(), - InvalidRune: MatchInvalidRune(), - Space: MatchRune(' '), - Tab: MatchRune('\t'), - CR: MatchRune('\r'), - LF: MatchRune('\n'), - CRLF: MatchStr("\r\n"), - Excl: MatchRune('!'), - DoubleQuote: MatchRune('"'), - Hash: MatchRune('#'), - Dollar: MatchRune('$'), - Percent: MatchRune('%'), - Amp: MatchRune('&'), - SingleQuote: MatchRune('\''), - RoundOpen: MatchRune('('), - LeftParen: MatchRune('('), - RoundClose: MatchRune(')'), - RightParen: MatchRune(')'), - Asterisk: MatchRune('*'), - Multiply: MatchRune('*'), - Plus: MatchRune('+'), - Add: MatchRune('+'), - Comma: MatchRune(','), - Minus: MatchRune('-'), - Subtract: MatchRune('-'), - Dot: MatchRune('.'), - Slash: MatchRune('/'), - Divide: MatchRune('/'), - Colon: MatchRune(':'), - Semicolon: MatchRune(';'), - AngleOpen: MatchRune('<'), - LessThan: MatchRune('<'), - Equal: MatchRune('='), - AngleClose: MatchRune('>'), - GreaterThan: MatchRune('>'), - Question: MatchRune('?'), - At: MatchRune('@'), - SquareOpen: MatchRune('['), - Backslash: MatchRune('\\'), - SquareClose: MatchRune(']'), - Caret: MatchRune('^'), - Underscore: MatchRune('_'), - Backquote: MatchRune('`'), - CurlyOpen: MatchRune('{'), - Pipe: MatchRune('|'), - CurlyClose: MatchRune('}'), - Tilde: MatchRune('~'), - Newline: MatchNewline(), - Blank: MatchBlank(), - Blanks: MatchBlanks(), - Whitespace: MatchWhitespace(), - UnicodeSpace: MatchUnicodeSpace(), - Digit: MatchDigit(), - DigitNotZero: MatchDigitNotZero(), - Digits: MatchDigits(), - Zero: MatchRune('0'), - Integer: MatchInteger(), - Signed: MatchSigned, - IntegerBetween: MatchIntegerBetween, - Float: 
MatchFloat(), - Boolean: MatchBoolean(), - ASCII: MatchASCII(), - ASCIILower: MatchASCIILower(), - ASCIIUpper: MatchASCIIUpper(), - Letter: MatchUnicodeLetter(), - Lower: MatchUnicodeLower(), - Upper: MatchUnicodeUpper(), - HexDigit: MatchHexDigit(), - Octet: MatchOctet(true), - IPv4: MatchIPv4(true), - IPv4CIDRMask: MatchIPv4CIDRMask(true), - IPv4Netmask: MatchIPv4Netmask(true), - IPv4Net: MatchIPv4Net(true), - IPv6: MatchIPv6(true), - IPv6CIDRMask: MatchIPv6CIDRMask(true), - IPv6Net: MatchIPv6Net(true), -} - -// M provides convenient access to a range of modifiers (which in their nature are -// parser/combinators) that can be used when creating Handler functions. -// -// In parsekit, a modifier is defined as a Handler function that modifies the -// resulting output of another Handler in some way. It does not do any matching -// against input of its own. -// -// When using M in your own parser, then it is advised to create a variable -// to reference it: -// -// m := tokenize.M -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. -var M = struct { - Drop func(Handler) Handler - Trim func(handler Handler, cutset string) Handler - TrimLeft func(handler Handler, cutset string) Handler - TrimRight func(handler Handler, cutset string) Handler - TrimSpace func(handler Handler) Handler - ToLower func(Handler) Handler - ToUpper func(Handler) Handler - Replace func(handler Handler, replaceWith string) Handler - ByCallback func(Handler, func(string) string) Handler -}{ - Drop: ModifyDrop, - Trim: ModifyTrim, - TrimLeft: ModifyTrimLeft, - TrimRight: ModifyTrimRight, - TrimSpace: ModifyTrimSpace, - ToLower: ModifyToLower, - ToUpper: ModifyToUpper, - Replace: ModifyReplace, - ByCallback: ModifyByCallback, -} - -// T provides convenient access to a range of Token producers (which in their -// nature are parser/combinators) that can be used when creating Handler -// functions. -// -// When using T in your own parser, then it is advised to create a variable -// to reference it: -// -// t := tokenize.T -// -// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
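// For example, a token producer that wraps an atom could look like this
// (an illustrative sketch; the "port" token type name is made up):
//
//	t, a := tokenize.T, tokenize.A
//	port := t.Int("port", a.Digits)
//	result, _ := tokenize.New(port)("8080")
//	// result.Token(0).Value now holds the int 8080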
-var T = struct {
-	Str            func(interface{}, Handler) Handler
-	StrInterpreted func(interface{}, Handler) Handler
-	Byte           func(interface{}, Handler) Handler
-	Rune           func(interface{}, Handler) Handler
-	Int            func(interface{}, Handler) Handler
-	Int8           func(interface{}, Handler) Handler
-	Int16          func(interface{}, Handler) Handler
-	Int32          func(interface{}, Handler) Handler
-	Int64          func(interface{}, Handler) Handler
-	Int64Base      func(interface{}, int, Handler) Handler
-	Uint           func(interface{}, Handler) Handler
-	Uint8          func(interface{}, Handler) Handler
-	Uint16         func(interface{}, Handler) Handler
-	Uint32         func(interface{}, Handler) Handler
-	Uint64         func(interface{}, Handler) Handler
-	Uint64Base     func(interface{}, int, Handler) Handler
-	Float32        func(interface{}, Handler) Handler
-	Float64        func(interface{}, Handler) Handler
-	Boolean        func(interface{}, Handler) Handler
-	ByValue        func(toktype interface{}, handler Handler, value interface{}) Handler
-	ByCallback     func(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler
-	Group          func(interface{}, Handler) Handler
-}{
-	Str:            MakeStrLiteralToken,
-	StrInterpreted: MakeStrInterpretedToken,
-	Byte:           MakeByteToken,
-	Rune:           MakeRuneToken,
-	Int:            MakeIntToken,
-	Int8:           MakeInt8Token,
-	Int16:          MakeInt16Token,
-	Int32:          MakeInt32Token,
-	Int64:          MakeInt64Token,
-	Int64Base:      MakeInt64BaseToken,
-	Uint:           MakeUintToken,
-	Uint8:          MakeUint8Token,
-	Uint16:         MakeUint16Token,
-	Uint32:         MakeUint32Token,
-	Uint64:         MakeUint64Token,
-	Uint64Base:     MakeUint64BaseToken,
-	Float32:        MakeFloat32Token,
-	Float64:        MakeFloat64Token,
-	Boolean:        MakeBooleanToken,
-	ByValue:        MakeTokenByValue,
-	ByCallback:     MakeTokenByCallback,
-	Group:          MakeTokenGroup,
-}
-
-// MatchRune creates a Handler function that matches against the provided rune.
-func MatchRune(expected rune) Handler {
-	return MatchRuneByCallback(func(r rune) bool { return r == expected })
-}
-
-// MatchRunes creates a Handler function that checks if the input matches
-// one of the provided runes. The first match counts.
-func MatchRunes(expected ...rune) Handler {
-	s := string(expected)
-	return MatchRuneByCallback(func(r rune) bool { return strings.ContainsRune(s, r) })
-}
-
-// MatchRuneRange creates a Handler function that checks if the input
-// matches the provided rune range. The rune range is defined by a start and
-// an end rune, inclusive, so:
-//
-//   MatchRuneRange('g', 'k')
-//
-// creates a Handler that will match any of 'g', 'h', 'i', 'j' or 'k'.
-func MatchRuneRange(start rune, end rune) Handler {
-	if end < start {
-		callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: end %q must not be < start %q", end, start)
-	}
-	return MatchRuneByCallback(func(r rune) bool { return r >= start && r <= end })
-}
-
-// MatchNewline creates a handler that matches a newline, which is either
-// a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n).
-func MatchNewline() Handler {
-	return MatchAny(MatchStr("\r\n"), MatchRune('\n'))
-}
-
-// MatchBlank creates a Handler that matches one rune from the input
-// against blank characters, meaning tabs and spaces.
-//
-// When you need whitespace matching, which also includes characters like
-// newlines, then take a look at MatchWhitespace().
-func MatchBlank() Handler {
-	return MatchRuneByCallback(func(r rune) bool { return r == ' ' || r == '\t' })
-}
-
-// MatchBlanks creates a Handler that matches the input against one
-// or more blank characters, meaning tabs and spaces.
-//
-// When you need whitespace matching, which also includes characters like
-// newlines, then make use of MatchWhitespace().
-// When you need unicode whitespace matching, which also includes characters
-// like a vertical tab, then make use of MatchUnicodeSpace().
-func MatchBlanks() Handler {
-	return MatchOneOrMore(MatchBlank())
-}
-
-// MatchWhitespace creates a Handler that matches the input against one or more
-// whitespace characters, defined as space ' ', tab '\t', newline '\n' (LF) and
-// carriage return '\r' followed by a newline '\n' (CRLF).
-func MatchWhitespace() Handler {
-	return MatchOneOrMore(MatchBlank().Or(MatchNewline()))
-}
-
-// MatchUnicodeSpace creates a Handler that matches the input against one or more
-// whitespace characters, as defined by unicode.
-func MatchUnicodeSpace() Handler {
-	return MatchOneOrMore(MatchRuneByCallback(unicode.IsSpace))
-}
-
-// MatchRuneByCallback creates a Handler that matches a single rune from the
-// input against the provided callback function. When the callback returns true,
-// it is considered a match.
-//
-// Note that the callback function matches the signature of the unicode.Is* functions,
-// so those can be used. E.g. MatchRuneByCallback(unicode.IsLower).
-func MatchRuneByCallback(callback func(rune) bool) Handler {
-	return func(t *API) bool {
-		r, err := t.NextRune()
-		if err == nil && callback(r) {
-			t.Accept()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF.
-func MatchEndOfLine() Handler {
-	return MatchAny(MatchNewline(), MatchEndOfFile())
-}
-
-// MatchStr creates a Handler that matches the input against the provided string.
-func MatchStr(expected string) Handler {
-	var handlers = []Handler{}
-	for _, r := range expected {
-		handlers = append(handlers, MatchRune(r))
-	}
-	return MatchSeq(handlers...)
-}
-
-// MatchStrNoCase creates a Handler that matches the input against the
-// provided string in a case-insensitive manner.
-func MatchStrNoCase(expected string) Handler {
-	var handlers = []Handler{}
-	for _, r := range expected {
-		u := unicode.ToUpper(r)
-		l := unicode.ToLower(r)
-		handlers = append(handlers, MatchRunes(u, l))
-	}
-	return MatchSeq(handlers...)
-}
-
-// MatchOptional creates a Handler that makes the provided Handler optional.
-// When the provided Handler applies, then its output is used, otherwise
-// no output is generated, but a successful match is still reported (with an
-// empty result).
-func MatchOptional(handler Handler) Handler {
-	return matchMinMax(0, 1, handler, "MatchOptional")
-}
-
-// MatchSeq creates a Handler that checks if the provided Handlers can be
-// applied in their exact order. Only if all Handlers apply does the sequence
-// report a successful match.
-func MatchSeq(handlers ...Handler) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		for _, handler := range handlers {
-			subchild := t.Fork()
-			if !handler(t) {
-				t.Dispose(subchild)
-				t.Dispose(child)
-				return false
-			}
-			t.Merge(subchild)
-			t.Dispose(subchild)
-		}
-		t.Merge(child)
-		t.Dispose(child)
-		return true
-	}
-}
-
-// MatchAny creates a Handler that checks if any of the provided Handlers
-// can be applied. They are applied in their provided order. The first Handler
-// that applies is used for reporting back a match.
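// For example (an illustrative sketch):
//
//	yesNo := MatchAny(MatchStr("yes"), MatchStr("no"))
//
// matches input "yes" as well as input "no".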
-func MatchAny(handlers ...Handler) Handler {
-	return func(t *API) bool {
-		for _, handler := range handlers {
-			child := t.Fork()
-			if handler(t) {
-				t.Merge(child)
-				t.Dispose(child)
-				return true
-			}
-			t.Dispose(child) // TODO switch to Reset() and move forking outside the loop?
-		}
-
-		return false
-	}
-}
-
-// MatchNot creates a Handler that checks if the provided Handler applies to
-// the current input. If it does, then a failed match will be reported. If it
-// does not, then the next rune from the input will be reported as a match.
-func MatchNot(handler Handler) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if handler(t) {
-			t.Dispose(child)
-			return false
-		}
-		t.Dispose(child)
-		_, err := t.NextRune()
-		if err == nil {
-			t.Accept()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchRep creates a Handler that checks if the provided Handler can be
-// applied exactly the provided amount of times.
-//
-// Note that the input can contain more than the provided number of matches, e.g.:
-//
-//   MatchRep(4, MatchRune('X'))
-//
-// will not match input "XXX", it will match input "XXXX", but also "XXXXXX".
-// In that last case, there will be a remainder "XX" on the input.
-//
-// Another way to use this method is by applying the following syntactic sugar:
-//
-//   MatchRune('X').Times(4)
-func MatchRep(times int, handler Handler) Handler {
-	return matchMinMax(times, times, handler, "MatchRep")
-}
-
-// MatchMin creates a Handler that checks if the provided Handler can be
-// applied at least the provided minimum number of times.
-// When more matches are possible, these will be included in the output.
-func MatchMin(min int, handler Handler) Handler {
-	if min < 0 {
-		callerPanic("MatchMin", "Handler: {name} definition error at {caller}: min must be >= 0")
-	}
-	return matchMinMax(min, -1, handler, "MatchMin")
-}
-
-// MatchMax creates a Handler that checks if the provided Handler can be
-// applied at most the provided maximum number of times.
-// When more matches are possible, these will be included in the output
-// (up to the provided maximum).
-// Zero matches are considered a successful match.
-func MatchMax(max int, handler Handler) Handler {
-	if max < 0 {
-		callerPanic("MatchMax", "Handler: {name} definition error at {caller}: max must be >= 0")
-	}
-	return matchMinMax(0, max, handler, "MatchMax")
-}
-
-// MatchZeroOrMore creates a Handler that checks if the provided Handler can
-// be applied zero or more times. All matches will be included in the output.
-// Zero matches are considered a successful match.
-func MatchZeroOrMore(handler Handler) Handler {
-	return matchMinMax(0, -1, handler, "MatchZeroOrMore")
-}
-
-// MatchOneOrMore creates a Handler that checks if the provided Handler can
-// be applied one or more times. All matches will be included in the output.
-func MatchOneOrMore(handler Handler) Handler {
-	return matchMinMax(1, -1, handler, "MatchOneOrMore")
-}
-
-// MatchMinMax creates a Handler that checks if the provided Handler can
-// be applied between the provided minimum and maximum number of times,
-// inclusive. All matches will be included in the output.
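// For example (an illustrative sketch):
//
//	MatchMinMax(2, 4, MatchDigit())
//
// matches "12", "123" and "1234", but not "1"; on input "12345" it
// matches "1234", leaving "5" on the input.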
-func MatchMinMax(min int, max int, handler Handler) Handler {
-	if max < 0 {
-		callerPanic("MatchMinMax", "Handler: {name} definition error at {caller}: max must be >= 0")
-	}
-	if min < 0 {
-		callerPanic("MatchMinMax", "Handler: {name} definition error at {caller}: min must be >= 0")
-	}
-	return matchMinMax(min, max, handler, "MatchMinMax")
-}
-
-func matchMinMax(min int, max int, handler Handler, name string) Handler {
-	if max >= 0 && min > max {
-		callerPanic(name, "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min)
-	}
-	return func(t *API) bool {
-		total := 0
-
-		// Check for the minimum required amount of matches.
-		child := t.Fork()
-		for total < min {
-			total++
-			if !handler(t) {
-				t.Dispose(child)
-				return false
-			}
-		}
-
-		// No specified max: include the rest of the available matches.
-		// Specified max: include the rest of the available matches, up to the max.
-		for max < 0 || total < max {
-			total++
-			if !handler(t) {
-				break
-			}
-		}
-		t.Merge(child)
-		t.Dispose(child)
-		return true
-	}
-}
-
-// MatchSeparated creates a Handler that checks for a pattern of one or more
-// Handlers of one type (the separated), separated by a Handler of another type
-// (the separator). All matches (separated + separator) are included in the
-// output.
-func MatchSeparated(separator Handler, separated Handler) Handler {
-	return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated)))
-}
-
-// MatchExcept creates a Handler that checks if the provided Handler can be
-// applied to the upcoming input. It also checks if the except Handler can be
-// applied. If the handler applies, but the except Handler does as well, then
-// the match as a whole will be treated as a mismatch.
-func MatchExcept(handler Handler, except Handler) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if except(t) {
-			t.Dispose(child)
-			return false
-		}
-		t.Dispose(child)
-		return handler(t)
-	}
-}
-
-// MatchFollowedBy creates a Handler that checks if the provided handler matches
-// and if the provided lookAhead handler matches after the handler.
-// When both handlers match, the match for the handler is accepted and the match
-// for the lookAhead handler is ignored.
-func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
-	return func(t *API) bool {
-		if handler(t) {
-			child := t.Fork()
-			result := lookAhead(t)
-			t.Dispose(child)
-			return result
-		}
-		return false
-	}
-}
-
-// MatchNotFollowedBy creates a Handler that checks if the provided handler matches
-// and if the provided lookAhead handler does not match after the handler.
-// If the handler matches and the lookAhead handler doesn't, then the match for
-// the handler is accepted.
-func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
-	return func(t *API) bool {
-		if handler(t) {
-			child := t.Fork()
-			result := !lookAhead(t)
-			t.Dispose(child)
-			return result
-		}
-		return false
-	}
-}
-
-// MakeInputFlusher creates a Handler that will flush the input buffer when the
-// provided handler matches.
-//
-// This is useful when constructing a grammar using only parsekit.tokenize
-// functionality (parsekit.parse will automatically flush the input for you)
-// that has to process large input data.
-//
-// Without flushing the input, the input reader will allocate memory
-// during the parsing process, eventually enough to hold the full input
-// in memory.
-// By wrapping Handlers with an input flusher, you can tell parsekit
-// that the accumulated input so far will no longer be needed, allowing
-// this input to be flushed from memory.
-//
-// Rule of thumb is: only use it when you have to actually fix a memory
-// hogging issue for your use case.
-func MakeInputFlusher(handler Handler) Handler {
-	return func(t *API) bool {
-		if handler(t) {
-			t.FlushInput()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchSigned creates a Handler that checks if the provided Handler is
-// prefixed by an optional '+' or '-' sign. This can be used to turn numeric
-// atoms into a signed version, e.g.
-//
-//   A.Signed(A.Integer)
-//
-func MatchSigned(handler Handler) Handler {
-	sign := MatchOptional(MatchAny(MatchRune('+'), MatchRune('-')))
-	return MatchSeq(sign, handler)
-}
-
-// MatchIntegerBetween creates a Handler that checks for an integer
-// value between the provided min and max boundaries (inclusive).
-// It uses an int64 for checking internally, so you can check values
-// ranging from -9223372036854775808 to 9223372036854775807.
-func MatchIntegerBetween(min int64, max int64) Handler {
-	if max < min {
-		callerPanic("MatchIntegerBetween", "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min)
-	}
-	digits := MatchSigned(MatchDigits())
-
-	return func(t *API) bool {
-		if !digits(t) {
-			return false
-		}
-		value, _ := strconv.ParseInt(t.String(), 10, 64)
-		if value < min || value > max {
-			return false
-		}
-		return true
-	}
-}
-
-// MatchEndOfFile creates a Handler that checks if the end of the input data
-// has been reached. This Handler will never produce output. It only reports
-// a successful or a failing match through its boolean return value.
-func MatchEndOfFile() Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		_, err := t.NextRune()
-		t.Dispose(child)
-		return err == io.EOF
-	}
-}
-
-// MatchUntilEndOfLine creates a Handler function that accepts one or
-// more runes until the end of the line (or file when that's the case).
-// The newline itself is not included in the match.
-func MatchUntilEndOfLine() Handler {
-	return MatchOneOrMore(MatchNot(MatchEndOfLine()))
-}
-
-// MatchAnyRune creates a Handler function that checks if a rune can be
-// read from the input. Invalid runes on the input are replaced with the UTF8
-// replacement rune \uFFFD (i.e. utf8.RuneError), which displays as �.
-func MatchAnyRune() Handler {
-	return func(t *API) bool {
-		_, err := t.NextRune()
-		if err == nil {
-			t.Accept()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchValidRune creates a Handler function that checks if a valid
-// UTF8 rune can be read from the input.
-func MatchValidRune() Handler {
-	return func(t *API) bool {
-		r, err := t.NextRune()
-		if err == nil && r != utf8.RuneError {
-			t.Accept()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchInvalidRune creates a Handler function that checks if an invalid
-// UTF8 rune can be read from the input.
-func MatchInvalidRune() Handler {
-	return func(t *API) bool {
-		r, err := t.NextRune()
-		if err == nil && r == utf8.RuneError {
-			t.Accept()
-			return true
-		}
-		return false
-	}
-}
-
-// MatchDigit creates a Handler that checks if a single digit can be read
-// from the input.
-func MatchDigit() Handler {
-	return MatchRuneRange('0', '9')
-}
-
-// MatchDigits creates a Handler that checks if one or more digits can be read
-// from the input.
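// Combined with MatchSigned() and MatchIntegerBetween() from above, this
// allows for constructions like (an illustrative sketch):
//
//	temperature := MatchSigned(MatchDigits())
//	percentage := MatchIntegerBetween(0, 100)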
-func MatchDigits() Handler {
-	return MatchOneOrMore(MatchDigit())
-}
-
-// MatchDigitNotZero creates a Handler that checks if a single digit not equal
-// to zero '0' can be read from the input.
-func MatchDigitNotZero() Handler {
-	return MatchRuneRange('1', '9')
-}
-
-// MatchInteger creates a Handler function that checks if a valid integer
-// can be read from the input. In line with Go, an integer cannot start with
-// a zero. Starting with a zero is used to indicate other bases, like octal or
-// hexadecimal.
-func MatchInteger() Handler {
-	justZero := MatchRune('0')
-	integer := MatchSeq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit()))
-	return MatchAny(integer, justZero)
-}
-
-// MatchFloat creates a Handler function that checks if a valid float value
-// can be read from the input. In case the fractional part is missing, this
-// Handler will report a match, so both "123" and "123.123" will match.
-func MatchFloat() Handler {
-	digits := MatchDigits()
-	return MatchSeq(digits, MatchOptional(MatchSeq(MatchRune('.'), digits)))
-}
-
-// MatchBoolean creates a Handler function that checks if a boolean
-// value can be read from the input. It supports the boolean values as understood
-// by Go's strconv.ParseBool() function.
-//
-// True values: true, TRUE, True, 1, t, T
-//
-// False values: false, FALSE, False, 0, f, F
-func MatchBoolean() Handler {
-	trues := MatchAny(MatchStr("true"), MatchStr("TRUE"), MatchStr("True"), MatchRune('1'), MatchRune('t'), MatchRune('T'))
-	falses := MatchAny(MatchStr("false"), MatchStr("FALSE"), MatchStr("False"), MatchRune('0'), MatchRune('f'), MatchRune('F'))
-	return MatchAny(trues, falses)
-}
-
-// MatchASCII creates a Handler function that matches against any
-// ASCII value on the input.
-func MatchASCII() Handler {
-	return MatchRuneRange('\x00', '\x7F')
-}
-
-// MatchASCIILower creates a Handler function that matches against any
-// lower case ASCII letter on the input (a - z).
-func MatchASCIILower() Handler {
-	return MatchRuneRange('a', 'z')
-}
-
-// MatchASCIIUpper creates a Handler function that matches against any
-// upper case ASCII letter on the input (A - Z).
-func MatchASCIIUpper() Handler {
-	return MatchRuneRange('A', 'Z')
-}
-
-// MatchUnicodeLetter creates a Handler function that matches against any
-// unicode letter on the input (see unicode.IsLetter(rune)).
-func MatchUnicodeLetter() Handler {
-	return MatchRuneByCallback(unicode.IsLetter)
-}
-
-// MatchUnicodeUpper creates a Handler function that matches against any
-// upper case unicode letter on the input (see unicode.IsUpper(rune)).
-func MatchUnicodeUpper() Handler {
-	return MatchRuneByCallback(unicode.IsUpper)
-}
-
-// MatchUnicodeLower creates a Handler function that matches against any
-// lower case unicode letter on the input (see unicode.IsLower(rune)).
-func MatchUnicodeLower() Handler {
-	return MatchRuneByCallback(unicode.IsLower)
-}
-
-// MatchHexDigit creates a Handler function that checks if a single hexadecimal
-// digit can be read from the input.
-func MatchHexDigit() Handler {
-	return MatchAny(MatchRuneRange('0', '9'), MatchRuneRange('a', 'f'), MatchRuneRange('A', 'F'))
-}
-
-// MatchOctet creates a Handler function that checks if a valid octet value
-// can be read from the input (octet = byte value representation, with a value
-// between 0 and 255 inclusive). It only looks at the first 1 to 3 upcoming
-// digits, not if there's a non-digit after it, meaning that "123255" would be
-// a valid sequence of two octets.
-//
-// When the normalize parameter is set to true, then leading zeroes will be
-// stripped from the octet.
-func MatchOctet(normalize bool) Handler {
-	max3Digits := MatchMinMax(1, 3, MatchDigit())
-	return func(t *API) bool {
-		if !max3Digits(t) {
-			return false
-		}
-		value, _ := strconv.ParseInt(t.String(), 10, 16)
-		if value > 255 {
-			return false
-		}
-		if normalize {
-			runes := t.Runes()
-			for len(runes) > 1 && runes[0] == '0' {
-				runes = runes[1:]
-			}
-			t.SetRunes(runes...)
-		}
-		return true
-	}
-}
-
-// MatchIPv4 creates a Handler function that checks if a valid IPv4
-// IP address value can be read from the input.
-//
-// When the normalize parameter is true, IP-addresses that look like
-// "192.168.001.012" will be normalized to "192.168.1.12".
-func MatchIPv4(normalize bool) Handler {
-	octet := MatchOctet(normalize)
-	dot := MatchRune('.')
-	return MatchSeq(octet, dot, octet, dot, octet, dot, octet)
-}
-
-// MatchIPv4CIDRMask creates a Handler function that checks if a
-// valid IPv4 CIDR mask (0 - 32) value can be read from the input.
-func MatchIPv4CIDRMask(normalize bool) Handler {
-	return matchCIDRMask(32, normalize)
-}
-
-// MatchIPv4Netmask creates a Handler function that checks if a valid
-// IPv4 netmask can be read from input (e.g. 255.255.255.0).
-// Only a netmask in canonical form is accepted (meaning that in binary form
-// it starts with zero or more 1-bits, followed by only 0-bits up to the
-// 32 bit length).
-//
-// When the normalize parameter is true, netmasks that look like
-// "255.255.192.000" will be normalized to "255.255.192.0".
-func MatchIPv4Netmask(normalize bool) Handler {
-	octet := MakeUint8Token(nil, MatchOctet(normalize))
-	dot := MatchRune('.')
-	netmask := MatchSeq(octet, dot, octet, dot, octet, dot, octet)
-
-	return func(t *API) bool {
-		if !netmask(t) {
-			return false
-		}
-
-		// Check if the mask is provided in canonical form (at the binary level, ones followed by zeroes).
-		mask := net.IPv4Mask(t.TokenValue(0).(byte), t.TokenValue(1).(byte), t.TokenValue(2).(byte), t.TokenValue(3).(byte))
-		ones, bits := mask.Size()
-		if ones == 0 && bits == 0 {
-			return false
-		}
-
-		t.ClearTokens()
-		return true
-	}
-}
-
-// MatchIPv4Net creates a Handler function that checks the input for an
-// IPv4 address + mask input. Both <ip>/<cidr> (e.g. 192.168.0.1/24) and
-// <ip>/<netmask> (e.g. 172.16.10.254/255.255.192.0) are acceptable.
-//
-// When the normalize parameter is true, then the IP address and the mask are
-// normalized. The mask will be normalized to CIDR notation, so the above
-// example would be normalized to 172.16.10.254/18.
-func MatchIPv4Net(normalize bool) Handler {
-	ip := MakeStrLiteralToken("ip", MatchIPv4(normalize))
-	slash := MatchRune('/')
-	mask := MatchAny(
-		MakeStrLiteralToken("mask", MatchIPv4Netmask(normalize)),
-		MakeUint8Token("cidr", MatchIPv4CIDRMask(normalize)))
-	ipnet := MatchSeq(ip, slash, mask)
-
-	return func(t *API) bool {
-		if !ipnet(t) {
-			return false
-		}
-
-		if !normalize {
-			return true
-		}
-
-		maskToken := t.Token(1)
-		if maskToken.Type == "cidr" {
-			t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), t.TokenValue(1).(uint8)))
-		} else {
-			o := strings.Split(t.TokenValue(1).(string), ".")
-			b := func(idx int) byte { i, _ := strconv.Atoi(o[idx]); return byte(i) }
-			mask := net.IPv4Mask(b(0), b(1), b(2), b(3))
-			bits, _ := mask.Size()
-			t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), bits))
-		}
-
-		t.ClearTokens()
-		return true
-	}
-}
-
-// MatchIPv6 creates a Handler function that checks if an IPv6 address
-// can be read from the input.
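// When the normalize parameter is true, the matched address is rewritten
// to its canonical form via net.ParseIP(), e.g. (an illustrative sketch):
//
//	result, _ := MatchIPv6(true).Match("fe80:0:0:0:0216:3eff:fe96:0002")
//	// result.String() == "fe80::216:3eff:fe96:2"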
-func MatchIPv6(normalize bool) Handler {
-	hextet := MatchMinMax(1, 4, MatchHexDigit())
-	colon := MatchRune(':')
-	empty := MatchSeq(colon, colon)
-
-	return func(t *API) bool {
-		nrOfHextets := 0
-		for nrOfHextets < 8 {
-			if hextet(t) {
-				nrOfHextets++
-			} else if empty(t) {
-				nrOfHextets += 2
-			} else if !colon(t) {
-				break
-			}
-		}
-		// No hextets or too many hextets (e.g. 1:1:1:1:1:1:1:: <-- since :: is 2 or more hextets).
-		if nrOfHextets == 0 || nrOfHextets > 8 {
-			return false
-		}
-
-		// Invalid IPv6, when net.ParseIP() cannot handle it.
-		parsed := net.ParseIP(t.String())
-		if parsed == nil {
-			return false
-		}
-
-		if normalize {
-			t.SetString(parsed.String())
-		}
-		return true
-	}
-}
-
-// MatchIPv6CIDRMask creates a Handler function that checks if a
-// valid IPv6 CIDR mask (0 - 128) value can be read from the input.
-func MatchIPv6CIDRMask(normalize bool) Handler {
-	return matchCIDRMask(128, normalize)
-}
-
-func matchCIDRMask(bits int64, normalize bool) Handler {
-	mask := MatchIntegerBetween(0, bits)
-
-	if !normalize {
-		return mask
-	}
-
-	return func(t *API) bool {
-		if !mask(t) {
-			return false
-		}
-		bits, _ := strconv.Atoi(t.String())
-		t.SetString(fmt.Sprintf("%d", bits))
-		return true
-	}
-}
-
-// MatchIPv6Net creates a Handler function that checks the input for an
-// IPv6 address + mask input, e.g. fe80:0:0:0:0216:3eff:fe96:0002/64.
-//
-// When the normalize parameter is true, then the IP address and the mask are
-// normalized. The above example would be normalized to fe80::216:3eff:fe96:2/64.
-func MatchIPv6Net(normalize bool) Handler {
-	ip := MatchIPv6(normalize)
-	slash := MatchRune('/')
-	mask := MatchIPv6CIDRMask(normalize)
-	return MatchSeq(ip, slash, mask)
-}
-
-// ModifyDrop creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is disposed of completely.
-//
-// Note that if the Handler does not apply, a mismatch will be reported back,
-// even though we would have dropped the output anyway. So if you would like
-// to drop optional blanks (spaces and tabs), then use something like:
-//
-//   M.Drop(C.Optional(A.Blanks))
-//
-// instead of:
-//
-//   M.Drop(A.Blanks)
-//
-// Since A.Blanks is defined as "1 or more spaces and/or tabs", the input
-// string "bork" would not match against the second form, but " bork" would.
-// In both cases, it would match the first form.
-func ModifyDrop(handler Handler) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if handler(t) {
-			t.Reset()
-			t.Merge(child)
-			t.Dispose(child)
-			return true
-		}
-		t.Dispose(child)
-		return false
-	}
-}
-
-// ModifyTrim creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and characters from the provided
-// cutset are trimmed from both the left and the right of the output.
-func ModifyTrim(handler Handler, cutset string) Handler {
-	return modifyTrim(handler, cutset, true, true)
-}
-
-// ModifyTrimLeft creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and characters from the provided
-// cutset are trimmed from the left of the output.
-func ModifyTrimLeft(handler Handler, cutset string) Handler {
-	return modifyTrim(handler, cutset, true, false)
-}
-
-// ModifyTrimRight creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and characters from the provided
-// cutset are trimmed from the right of the output.
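// For example (an illustrative sketch):
//
//	ModifyTrimRight(MatchDigits(), "0")
//
// turns matched input "12000" into the output "12".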
-
-// ModifyTrimRight creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and characters from the provided
-// cutset are trimmed from the right of the output.
-func ModifyTrimRight(handler Handler, cutset string) Handler {
-	return modifyTrim(handler, cutset, false, true)
-}
-
-func modifyTrim(handler Handler, cutset string, trimLeft bool, trimRight bool) Handler {
-	modfunc := func(s string) string {
-		if trimLeft {
-			s = strings.TrimLeft(s, cutset)
-		}
-		if trimRight {
-			s = strings.TrimRight(s, cutset)
-		}
-		return s
-	}
-	return ModifyByCallback(handler, modfunc)
-}
-
-// ModifyTrimSpace creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and all leading and trailing whitespace
-// characters, as defined by Unicode, are removed from it.
-func ModifyTrimSpace(handler Handler) Handler {
-	return ModifyByCallback(handler, strings.TrimSpace)
-}
-
-// ModifyToUpper creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is converted to upper case.
-func ModifyToUpper(handler Handler) Handler {
-	return ModifyByCallback(handler, strings.ToUpper)
-}
-
-// ModifyToLower creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is converted to lower case.
-func ModifyToLower(handler Handler) Handler {
-	return ModifyByCallback(handler, strings.ToLower)
-}
-
-// ModifyReplace creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is replaced by the provided string.
-func ModifyReplace(handler Handler, replaceWith string) Handler {
-	return ModifyByCallback(handler, func(string) string {
-		return replaceWith
-	})
-}
-
-// ModifyByCallback creates a Handler that checks if the provided Handler applies.
-// If it does, then its output is taken and fed to the provided modfunc: a simple
-// function that takes a string as input and returns a possibly modified string
-// as output. The return value of the modfunc replaces the resulting output.
-func ModifyByCallback(handler Handler, modfunc func(string) string) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if handler(t) {
-			s := modfunc(t.String())
-			t.SetString(s)
-			t.Merge(child)
-			t.Dispose(child)
-			return true
-		}
-		t.Dispose(child)
-		return false
-	}
-}
-
-// MakeStrLiteralToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a string-typed
-// representation of the read Runes. This string is literal, meaning that an
-// escape sequence like "\n" is kept as-is (a backslash character, followed by
-// an 'n'-character).
-func MakeStrLiteralToken(toktype interface{}, handler Handler) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
-		literal := t.String()
-		return literal
-	})
-}
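To make the literal versus interpreted distinction concrete, a sketch using the T.Str and T.StrInterpreted aliases from the test suite; the "lit" and "int" token types are made up for this example:

	// Sketch: the same input, tokenized literally and interpreted.
	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
	anyString := c.OneOrMore(a.AnyRune)
	lit, _ := tokenize.New(tok.Str("lit", anyString))(`a\tb`)
	fmt.Println(lit.Tokens()) // token value keeps the backslash escape as two characters
	interp, _ := tokenize.New(tok.StrInterpreted("int", anyString))(`a\tb`)
	fmt.Println(interp.Tokens()) // token value contains an actual tab between a and b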
-
-// MakeStrInterpretedToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a string-typed
-// representation of the read Runes. This string is interpreted, meaning that
-// an escape sequence like "\n" is translated to an actual newline control
-// character.
-func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
-		// TODO ERROR HANDLING
-		interpreted, _ := interpretString(t.String())
-		return interpreted
-	})
-}
-
-func interpretString(str string) (string, error) {
-	var sb strings.Builder
-	for len(str) > 0 {
-		r, _, remainder, err := strconv.UnquoteChar(str, '"')
-		if err != nil {
-			return sb.String(), err
-		}
-		str = remainder
-		sb.WriteRune(r)
-	}
-	return sb.String(), nil
-}
-
-// MakeRuneToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a rune-representation
-// of the read Rune.
-func MakeRuneToken(toktype interface{}, handler Handler) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
-		// TODO ERROR HANDLING --- not a 1 rune input
-		return t.Rune(0)
-	})
-}
-
-// MakeByteToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a byte-representation
-// of the read Rune.
-func MakeByteToken(toktype interface{}, handler Handler) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
-		// TODO ERROR HANDLING --- not a 1 byte input
-		return byte(t.Rune(0))
-	})
-}
-
-// MakeIntToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int-representation
-// of the read Runes.
-func MakeIntToken(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("int", toktype, handler, func(s string) (interface{}, error) {
-		return strconv.Atoi(s)
-	})
-}
-
-// MakeInt8Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int8-representation
-// of the read Runes.
-// TODO allow other Go types for oct and hex too.
-func MakeInt8Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("int8", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseInt(s, 10, 8)
-			if err == nil {
-				return int8(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeInt16Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int16-representation
-// of the read Runes.
-func MakeInt16Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("int16", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseInt(s, 10, 16)
-			if err == nil {
-				return int16(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeInt32Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int32-representation
-// of the read Runes.
-func MakeInt32Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("int32", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseInt(s, 10, 32)
-			if err == nil {
-				return int32(value), err
-			}
-			return value, err
-		})
-}
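A short sketch of these strconv-based token makers, following the pattern of the TestTokenMakers cases later in this patch; the "age" token type is made up, and the printed format is assumed from the ExampleNew output below:

	// Sketch: match digits and convert them into a typed Go value.
	var a, tok = tokenize.A, tokenize.T
	age := tok.Int8("age", a.Integer)
	result, err := tokenize.New(age)("127")
	if err == nil {
		fmt.Println(result.Tokens()) // prints: [age((int8)127)]
	}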
-
-// MakeInt64BaseToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int64-representation
-// of the read Runes, using the provided base (e.g. 2 = binary, 8 = octal,
-// 10 = decimal, 16 = hexadecimal).
-func MakeInt64BaseToken(toktype interface{}, base int, handler Handler) Handler {
-	return makeInt64BaseToken(toktype, base, handler)
-}
-
-func makeInt64BaseToken(toktype interface{}, base int, handler Handler) Handler {
-	return makeStrconvToken("int64", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseInt(s, base, 64)
-			if err == nil {
-				return int64(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeInt64Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to an int64-representation
-// of the read Runes.
-func MakeInt64Token(toktype interface{}, handler Handler) Handler {
-	return MakeInt64BaseToken(toktype, 10, handler)
-}
-
-// MakeUintToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint-representation
-// of the read Runes.
-func MakeUintToken(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("uint", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseUint(s, 10, 0)
-			if err == nil {
-				return uint(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeUint8Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint8-representation
-// of the read Runes.
-// TODO allow other Go types for oct and hex too.
-func MakeUint8Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("uint8", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseUint(s, 10, 8)
-			if err == nil {
-				return uint8(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeUint16Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint16-representation
-// of the read Runes.
-func MakeUint16Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("uint16", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseUint(s, 10, 16)
-			if err == nil {
-				return uint16(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeUint32Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint32-representation
-// of the read Runes.
-func MakeUint32Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("uint32", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseUint(s, 10, 32)
-			if err == nil {
-				return uint32(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeUint64BaseToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint64-representation
-// of the read Runes, using the provided base (e.g. 2 = binary, 8 = octal,
-// 10 = decimal, 16 = hexadecimal).
-func MakeUint64BaseToken(toktype interface{}, base int, handler Handler) Handler {
-	return makeStrconvToken("uint64", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseUint(s, base, 64)
-			if err == nil {
-				return uint64(value), err
-			}
-			return value, err
-		})
-}
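MakeUint64BaseToken accepts any base that strconv.ParseUint supports; a sketch for hexadecimal input, where the "hex" token type is made up and a.HexDigit is the atom alias used by the tests:

	// Sketch: parse a run of hex digits into a uint64 using base 16.
	var c, a = tokenize.C, tokenize.A
	hexValue := tokenize.MakeUint64BaseToken("hex", 16, c.OneOrMore(a.HexDigit))
	result, err := tokenize.New(hexValue)("cafe")
	if err == nil {
		fmt.Println(result.Tokens()) // token value: uint64(0xcafe), i.e. 51966
	}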
-
-// MakeUint64Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a uint64-representation
-// of the read Runes.
-func MakeUint64Token(toktype interface{}, handler Handler) Handler {
-	return MakeUint64BaseToken(toktype, 10, handler)
-}
-
-// MakeFloat32Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a float32-representation
-// of the read Runes.
-func MakeFloat32Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("float32", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseFloat(s, 32)
-			if err == nil {
-				return float32(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeFloat64Token creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a float64-representation
-// of the read Runes.
-func MakeFloat64Token(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("float64", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseFloat(s, 64)
-			if err == nil {
-				return float64(value), err
-			}
-			return value, err
-		})
-}
-
-// MakeBooleanToken creates a Handler that will add a Token to the
-// Result, for which the Token.Value is set to a bool-representation
-// of the read Runes.
-func MakeBooleanToken(toktype interface{}, handler Handler) Handler {
-	return makeStrconvToken("boolean", toktype, handler,
-		func(s string) (interface{}, error) {
-			value, err := strconv.ParseBool(s)
-			if err == nil {
-				return bool(value), err
-			}
-			return value, err
-		})
-}
-
-func makeStrconvToken(name string, toktype interface{}, handler Handler, convert func(s string) (interface{}, error)) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
-		value, err := convert(t.String())
-		if err != nil {
-			// TODO meh, panic feels so bad here. Maybe just turn this case into "no match"?
-			panic(fmt.Sprintf("%s token invalid (%s)", name, err))
-		}
-		return value
-	})
-}
-
-// MakeTokenByValue creates a Handler that will add a static Token value
-// to the Result.
-func MakeTokenByValue(toktype interface{}, handler Handler, value interface{}) Handler {
-	return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { return value })
-}
-
-// MakeTokenByCallback creates a Handler that will add a Token to the
-// Result, for which the Token.Value is to be generated by the provided
-// makeValue() callback function. The function gets the current API as
-// its input and must return the token value.
-func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if handler(t) {
-			// The token is not added to the child here. The child might have produced its own
-			// tokens and we want those to come after the token for the current parsing level.
-			// By adding the token to the input API and then merging the child tokens, the order
-			// of the tokens will match the expectations.
-			// e.g. when a parsing hierarchy looks like ("date" ("year", "month", "day")), the
-			// tokens will end up in the order "date", "year", "month", "day". Had we added the
-			// token to the child here, the order would have been "year", "month", "day", "date".
-			token := Token{Type: toktype, Value: makeValue(t)}
-			t.AddTokens(token)
-			t.Merge(child)
-			t.Dispose(child)
-
-			return true
-		}
-		t.Dispose(child)
-		return false
-	}
-}
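The token ordering described in the comment above can be made concrete with nested token makers; a sketch using the C/A/T aliases from the test suite, where the "date", "year", "month" and "day" token types are made up:

	// Sketch: the outer "date" token precedes the tokens produced by
	// the nested handlers in the final result.
	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
	year := tok.Str("year", c.Repeated(4, a.Digit))
	month := tok.Str("month", c.Repeated(2, a.Digit))
	day := tok.Str("day", c.Repeated(2, a.Digit))
	date := tok.Str("date", c.Seq(year, a.Minus, month, a.Minus, day))
	result, err := tokenize.New(date)("2019-07-11")
	if err == nil {
		fmt.Println(result.Tokens())
		// prints: [date("2019-07-11") year("2019") month("07") day("11")]
	}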
-
-// MakeTokenGroup checks if the provided handler matches the input. If it does,
-// then it takes the tokens as produced by the handler and groups them together
-// in a single token.
-func MakeTokenGroup(toktype interface{}, handler Handler) Handler {
-	return func(t *API) bool {
-		child := t.Fork()
-		if handler(t) {
-			tokens := t.Tokens()
-			tokensCopy := make([]Token, len(tokens))
-			copy(tokensCopy, tokens)
-			t.SetTokens(Token{Type: toktype, Value: tokensCopy})
-			t.Merge(child)
-			t.Dispose(child)
-			return true
-		}
-		t.Dispose(child)
-		return false
-	}
-}
diff --git a/tokenize2/handlers_builtin_test.go b/tokenize2/handlers_builtin_test.go
deleted file mode 100644
index a2a8897..0000000
--- a/tokenize2/handlers_builtin_test.go
+++ /dev/null
@@ -1,512 +0,0 @@
-package tokenize2_test
-
-import (
-	"fmt"
-	"testing"
-
-	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
-)
-
-func TestCombinatorsTempDebug(t *testing.T) {
-	var a = tokenize.A
-	AssertHandlers(t, []HandlerT{
-		// {"024", a.IPv4CIDRMask, true, "24"},
-		// {"024", a.Octet, true, "24"},
-		{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
-	})
-}
-
-func TestCombinators(t *testing.T) {
-	var c, a, m = tokenize.C, tokenize.A, tokenize.M
-	AssertHandlers(t, []HandlerT{
-		{"", c.Not(a.Rune('b')), false, ""},
-		{"abc not", c.Not(a.Rune('b')), true, "a"},
-		{"bcd not", c.Not(a.Rune('b')), false, ""},
-		{"aaaxxxb", c.OneOrMore(c.Not(a.Rune('b'))), true, "aaaxxx"},
-		{"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"},
-		{"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""},
-		{"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"},
-		{"bcd any", c.Any(a.Rune('a'), a.Rune('b')), true, "b"},
-		{"cde any", c.Any(a.Rune('a'), a.Rune('b')), false, ""},
-		{"ababc repeated", c.Repeated(4, a.Runes('a', 'b')), true, "abab"},
-		{"ababc repeated", c.Repeated(5, a.Runes('a', 'b')), false, ""},
-		{"", c.Min(0, a.Rune('a')), true, ""},
-		{"a", c.Min(0, a.Rune('a')), true, "a"},
-		{"aaaaa", c.Min(4, a.Rune('a')), true, "aaaaa"},
-		{"aaaaa", c.Min(5, a.Rune('a')), true, "aaaaa"},
-		{"aaaaa", c.Min(6, a.Rune('a')), false, ""},
-		{"", c.Max(4, a.Rune('b')), true, ""},
-		{"X", c.Max(4, a.Rune('b')), true, ""},
-		{"bbbbbX", c.Max(4, a.Rune('b')), true, "bbbb"},
-		{"bbbbbX", c.Max(5, a.Rune('b')), true, "bbbbb"},
-		{"bbbbbX", c.Max(6, a.Rune('b')), true, "bbbbb"},
-		{"", c.MinMax(0, 0, a.Rune('c')), true, ""},
-		{"X", c.MinMax(0, 0, a.Rune('c')), true, ""},
-		{"cccc", c.MinMax(0, 5, a.Rune('c')), true, "cccc"},
-		{"ccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
-		{"cccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
-		{"cccccX", c.MinMax(0, 0, a.Rune('c')), true, ""},
-		{"cccccX", c.MinMax(0, 1, a.Rune('c')), true, "c"},
-		{"cccccX", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
-		{"cccccX", c.MinMax(0, 6, a.Rune('c')), true, "ccccc"},
-		{"cccccX", c.MinMax(1, 1, a.Rune('c')), true, "c"},
-		{"", c.MinMax(1, 1, a.Rune('c')), false, ""},
-		{"X", c.MinMax(1, 1, a.Rune('c')), false, ""},
-		{"cccccX", c.MinMax(1, 3, a.Rune('c')), true, "ccc"},
-		{"cccccX", c.MinMax(1, 6, a.Rune('c')), true, "ccccc"},
-		{"cccccX", c.MinMax(3, 4, a.Rune('c')), true, "cccc"},
-		{"", c.OneOrMore(a.Rune('d')), false, ""},
-		{"X", c.OneOrMore(a.Rune('d')), false, ""},
-		{"dX", c.OneOrMore(a.Rune('d')), true, "d"},
-		{"dddddX", c.OneOrMore(a.Rune('d')), true, "ddddd"},
-		{"", c.ZeroOrMore(a.Rune('e')), true, ""},
-		{"X", c.ZeroOrMore(a.Rune('e')), true, ""},
-		{"eX", c.ZeroOrMore(a.Rune('e')), true, "e"},
-		{"eeeeeX", c.ZeroOrMore(a.Rune('e')), true, "eeeee"},
-		{"HI!", c.Seq(a.Rune('H'), a.Rune('I'), a.Rune('!')), true, "HI!"},
-		{"Hello, world!X", c.Seq(a.Str("Hello"), a.Comma, a.Space, a.Str("world"), a.Excl), true,
"Hello, world!"}, - {"101010123", c.OneOrMore(c.Seq(a.Rune('1'), a.Rune('0'))), true, "101010"}, - {"", c.Optional(c.OneOrMore(a.Rune('f'))), true, ""}, - {"ghijkl", c.Optional(a.Rune('h')), true, ""}, - {"ghijkl", c.Optional(a.Rune('g')), true, "g"}, - {"fffffX", c.Optional(c.OneOrMore(a.Rune('f'))), true, "fffff"}, - {"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"}, - {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, - {" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""}, - {" a", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "a"}, - {"a ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "a"}, - {" a ", m.TrimSpace(c.OneOrMore(a.AnyRune)), true, "a"}, - {"ab", c.FollowedBy(a.Rune('b'), a.Rune('a')), true, "a"}, - {"ba", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""}, - {"aa", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""}, - {"aaabbbcccddd", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), true, "aaabbbccc"}, - {"aaabbbcccxxx", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), false, ""}, - {"xy", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"}, - {"yx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""}, - {"xx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"}, - {"xa", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""}, - {"xxxyyyzzzaaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), false, ""}, - {"xxxyyyzzzbaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), true, "xxxyyyzzz"}, - }) -} - -func TestCombinatorPanics(t *testing.T) { - var c, a = tokenize.C, tokenize.A - AssertPanics(t, []PanicT{ - {func() { a.RuneRange('z', 'a') }, true, - `Handler: MatchRuneRange definition error at /.*/handlers_builtin_test\.go:\d+: start 'z' must not be < end 'a'`}, - {func() { c.MinMax(-1, 1, a.Space) }, true, - `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`}, - {func() { c.MinMax(1, -1, a.Space) }, true, - `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`}, - {func() { c.MinMax(10, 5, a.Space) }, true, - `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max 5 must not be < min 10`}, - {func() { c.Min(-10, a.Space) }, true, - `Handler: MatchMin definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`}, - {func() { c.Max(-42, a.Space) }, true, - `Handler: MatchMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`}, - {func() { a.IntegerBetween(10, -10) }, true, - `Handler: MatchIntegerBetween definition error at /.*/handlers_builtin_test.go:\d+: max -10 must not be < min 10`}, - }) -} - -func TestAtoms(t *testing.T) { - var a = tokenize.A - AssertHandlers(t, []HandlerT{ - {"dd", a.RuneRange('b', 'e'), true, "d"}, - {"ee", a.RuneRange('b', 'e'), true, "e"}, - {"ff", a.RuneRange('b', 'e'), false, ""}, - {"Hello, world!", a.Str("Hello"), true, "Hello"}, - {"HellÖ, world!", a.StrNoCase("hellö"), true, "HellÖ"}, - {"+X", a.Runes('+', '-', '*', '/'), true, "+"}, - {"-X", a.Runes('+', '-', '*', '/'), true, "-"}, - {"*X", a.Runes('+', '-', '*', '/'), true, "*"}, - {"/X", a.Runes('+', '-', '*', '/'), true, "/"}, - {"!X", a.Runes('+', '-', '*', '/'), false, ""}, - {"xxx", a.Rune('x'), true, "x"}, - {"x ", a.Rune(' '), false, ""}, - {"aa", a.RuneRange('b', 'e'), false, ""}, - {"bb", 
a.RuneRange('b', 'e'), true, "b"}, - {"cc", a.RuneRange('b', 'e'), true, "c"}, - {"", a.EndOfFile, true, ""}, - {"⌘", a.AnyRune, true, "⌘"}, - {"\xbc with AnyRune", a.AnyRune, true, "�"}, - {"", a.AnyRune, false, ""}, - {"⌘", a.ValidRune, true, "⌘"}, - {"\xbc with ValidRune", a.ValidRune, false, ""}, - {"", a.ValidRune, false, ""}, - {"\xbc with InvalidRune", a.InvalidRune, true, "�"}, - {"ok with InvalidRune", a.InvalidRune, false, ""}, - {" ", a.Space, true, " "}, - {"X", a.Space, false, ""}, - {"\t", a.Tab, true, "\t"}, - {"\r", a.CR, true, "\r"}, - {"\n", a.LF, true, "\n"}, - {"!", a.Excl, true, "!"}, - {"\"", a.DoubleQuote, true, "\""}, - {"#", a.Hash, true, "#"}, - {"$", a.Dollar, true, "$"}, - {"%", a.Percent, true, "%"}, - {"&", a.Amp, true, "&"}, - {"'", a.SingleQuote, true, "'"}, - {"(", a.LeftParen, true, "("}, - {"(", a.RoundOpen, true, "("}, - {")", a.RightParen, true, ")"}, - {")", a.RoundClose, true, ")"}, - {"*", a.Asterisk, true, "*"}, - {"*", a.Multiply, true, "*"}, - {"+", a.Plus, true, "+"}, - {"+", a.Add, true, "+"}, - {",", a.Comma, true, ","}, - {"-", a.Minus, true, "-"}, - {"-", a.Subtract, true, "-"}, - {".", a.Dot, true, "."}, - {"/", a.Slash, true, "/"}, - {"/", a.Divide, true, "/"}, - {":", a.Colon, true, ":"}, - {";", a.Semicolon, true, ";"}, - {"<", a.AngleOpen, true, "<"}, - {"<", a.LessThan, true, "<"}, - {"=", a.Equal, true, "="}, - {">", a.AngleClose, true, ">"}, - {">", a.GreaterThan, true, ">"}, - {"?", a.Question, true, "?"}, - {"@", a.At, true, "@"}, - {"[", a.SquareOpen, true, "["}, - {"\\", a.Backslash, true, "\\"}, - {"]", a.SquareClose, true, "]"}, - {"^", a.Caret, true, "^"}, - {"_", a.Underscore, true, "_"}, - {"`", a.Backquote, true, "`"}, - {"{", a.CurlyOpen, true, "{"}, - {"|", a.Pipe, true, "|"}, - {"}", a.CurlyClose, true, "}"}, - {"~", a.Tilde, true, "~"}, - {"\t \t \r\n", a.Blank, true, "\t"}, - {" \t \t \r\n", a.Blanks, true, " \t \t "}, - {"xxx", a.Whitespace, false, ""}, - {" ", a.Whitespace, true, " "}, - {"\t", a.Whitespace, true, "\t"}, - {"\n", a.Whitespace, true, "\n"}, - {"\r\n", a.Whitespace, true, "\r\n"}, - {" \t\r\n \n \t\t\r\n ", a.Whitespace, true, " \t\r\n \n \t\t\r\n "}, - {"xxx", a.UnicodeSpace, false, ""}, - {" \t\r\n \r\v\f ", a.UnicodeSpace, true, " \t\r\n \r\v\f "}, - {"", a.EndOfLine, true, ""}, - {"\r\n", a.EndOfLine, true, "\r\n"}, - {"\n", a.EndOfLine, true, "\n"}, - {"0", a.Digit, true, "0"}, - {"1", a.Digit, true, "1"}, - {"2", a.Digit, true, "2"}, - {"3", a.Digit, true, "3"}, - {"4", a.Digit, true, "4"}, - {"5", a.Digit, true, "5"}, - {"6", a.Digit, true, "6"}, - {"7", a.Digit, true, "7"}, - {"8", a.Digit, true, "8"}, - {"9", a.Digit, true, "9"}, - {"X", a.Digit, false, ""}, - {"a", a.ASCIILower, true, "a"}, - {"z", a.ASCIILower, true, "z"}, - {"A", a.ASCIILower, false, ""}, - {"Z", a.ASCIILower, false, ""}, - {"A", a.ASCIIUpper, true, "A"}, - {"Z", a.ASCIIUpper, true, "Z"}, - {"a", a.ASCIIUpper, false, ""}, - {"z", a.ASCIIUpper, false, ""}, - {"1", a.Letter, false, ""}, - {"a", a.Letter, true, "a"}, - {"Ø", a.Letter, true, "Ø"}, - {"Ë", a.Lower, false, ""}, - {"ë", a.Lower, true, "ë"}, - {"ä", a.Upper, false, "ä"}, - {"Ä", a.Upper, true, "Ä"}, - {"0", a.HexDigit, true, "0"}, - {"9", a.HexDigit, true, "9"}, - {"a", a.HexDigit, true, "a"}, - {"f", a.HexDigit, true, "f"}, - {"A", a.HexDigit, true, "A"}, - {"F", a.HexDigit, true, "F"}, - {"g", a.HexDigit, false, "g"}, - {"G", a.HexDigit, false, "G"}, - {"0", a.Integer, true, "0"}, - {"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is 
valid for the integer - {"1", a.Integer, true, "1"}, - {"-10X", a.Integer, false, ""}, - {"+10X", a.Integer, false, ""}, - {"-10X", a.Signed(a.Integer), true, "-10"}, - {"+10X", a.Signed(a.Integer), true, "+10"}, - {"+10.1X", a.Signed(a.Integer), true, "+10"}, - {"0X", a.Float, true, "0"}, - {"0X", a.Float, true, "0"}, - {"1X", a.Float, true, "1"}, - {"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up - {"123.321X", a.Float, true, "123.321"}, - {"-3.14X", a.Float, false, ""}, - {"-3.14X", a.Signed(a.Float), true, "-3.14"}, - {"-003.0014X", a.Signed(a.Float), true, "-003.0014"}, - {"-11", a.IntegerBetween(-10, 10), false, "0"}, - {"-10", a.IntegerBetween(-10, 10), true, "-10"}, - {"0", a.IntegerBetween(-10, 10), true, "0"}, - {"10", a.IntegerBetween(-10, 10), true, "10"}, - {"11", a.IntegerBetween(0, 10), false, ""}, - {"fifteen", a.IntegerBetween(0, 10), false, ""}, - }) -} - -func TestIPv4Atoms(t *testing.T) { - var a = tokenize.A - AssertHandlers(t, []HandlerT{ - // Not normalized octet. - {"0X", tokenize.MatchOctet(false), true, "0"}, - {"00X", tokenize.MatchOctet(false), true, "00"}, - {"000X", tokenize.MatchOctet(false), true, "000"}, - {"10X", tokenize.MatchOctet(false), true, "10"}, - {"010X", tokenize.MatchOctet(false), true, "010"}, - {"255123", tokenize.MatchOctet(false), true, "255"}, - {"256123", tokenize.MatchOctet(false), false, ""}, - {"300", tokenize.MatchOctet(false), false, ""}, - - // Octet. - {"0", tokenize.MatchOctet(false), true, "0"}, - {"02", tokenize.MatchOctet(false), true, "02"}, - {"003", tokenize.MatchOctet(false), true, "003"}, - {"256", tokenize.MatchOctet(false), false, ""}, - {"0X", a.Octet, true, "0"}, - {"00X", a.Octet, true, "0"}, - {"000X", a.Octet, true, "0"}, - {"10X", a.Octet, true, "10"}, - {"010X", a.Octet, true, "10"}, - {"255123", a.Octet, true, "255"}, - {"256123", a.Octet, false, ""}, - {"300", a.Octet, false, ""}, - - // IPv4 address. - {"0.0.0.0", tokenize.MatchIPv4(false), true, "0.0.0.0"}, - {"010.0.255.01", tokenize.MatchIPv4(false), true, "010.0.255.01"}, - {"0.0.0.0", a.IPv4, true, "0.0.0.0"}, - {"10.20.30.40", a.IPv4, true, "10.20.30.40"}, - {"010.020.003.004", a.IPv4, true, "10.20.3.4"}, - {"255.255.255.255", a.IPv4, true, "255.255.255.255"}, - {"256.255.255.255", a.IPv4, false, ""}, - - // IPv4 CIDR netmask. - {"0", tokenize.MatchIPv4CIDRMask(false), true, "0"}, - {"000", tokenize.MatchIPv4CIDRMask(false), true, "000"}, - {"0", a.IPv4CIDRMask, true, "0"}, - {"00", a.IPv4CIDRMask, true, "0"}, - {"000", a.IPv4CIDRMask, true, "0"}, - {"32", a.IPv4CIDRMask, true, "32"}, - {"032", a.IPv4CIDRMask, true, "32"}, - {"33", a.IPv4CIDRMask, false, ""}, - - // IPv4 netmask in dotted quad format. - {"0.0.0.0", tokenize.MatchIPv4Netmask(false), true, "0.0.0.0"}, - {"255.128.000.000", tokenize.MatchIPv4Netmask(false), true, "255.128.000.000"}, - {"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"}, - {"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"}, - {"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"}, - {"255.255.132.0", a.IPv4Netmask, false, ""}, // not a canonical netmask (1-bits followed by 0-bits) - - // IPv4 address + CIDR or dotted quad netmask. 
- {"192.168.6.123", a.IPv4Net, false, ""}, - {"192.168.6.123/24", tokenize.MatchIPv4Net(false), true, "192.168.6.123/24"}, - {"001.002.003.004/016", tokenize.MatchIPv4Net(false), true, "001.002.003.004/016"}, - {"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"}, - {"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"}, - {"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"}, - {"10.0.0.10/193.0.0.0", a.IPv4Net, false, ""}, // invalid netmask and 193 is also invalid cidr - {"010.000.000.010/16.000.000.000", a.IPv4Net, true, "10.0.0.10/16"}, // invalid netmask, but 16 cidr is ok, remainder input = ".0.0.0" - }) -} - -func TestIPv6Atoms(t *testing.T) { - var a = tokenize.A - AssertHandlers(t, []HandlerT{ - {"", a.IPv6, false, ""}, - {"::", a.IPv6, true, "::"}, - {"1::", a.IPv6, true, "1::"}, - {"1::1", a.IPv6, true, "1::1"}, - {"::1", a.IPv6, true, "::1"}, - {"1:2:3:4:5:6:7::", a.IPv6, false, ""}, - {"::1:2:3:4:5:6:7:8:9", a.IPv6, true, "::1:2:3:4:5:6"}, - {"1:2:3:4::5:6:7:8:9", a.IPv6, true, "1:2:3:4::5:6"}, - {"a:b::ffff:0:1111", a.IPv6, true, "a:b::ffff:0:1111"}, - {"000a:000b:0000:000:00:ffff:0000:1111", a.IPv6, true, "a:b::ffff:0:1111"}, - {"000a:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "a::1:0:0:ffff:1111"}, - {"0000:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "::1:0:0:ffff:1111"}, - {"aaaa:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, true, "aaaa:bbbb:cccc:dddd:eeee:ffff:0:1111"}, - {"gggg:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, false, ""}, - {"ffff::gggg:eeee:ffff:0000:1111", a.IPv6, true, "ffff::"}, - {"0", a.IPv6CIDRMask, true, "0"}, - {"128", a.IPv6CIDRMask, true, "128"}, - {"129", a.IPv6CIDRMask, false, ""}, - {"::1/128", a.IPv6Net, true, "::1/128"}, - {"::1/129", a.IPv6Net, false, ""}, - {"1.1.1.1/24", a.IPv6Net, false, ""}, - {"ffff:0:0:0::1010/0", a.IPv6Net, true, "ffff::1010/0"}, - {"fe80:0:0:0:0216:3eff:fe96:0002/64", a.IPv6Net, true, "fe80::216:3eff:fe96:2/64"}, - }) -} - -func TestModifiers(t *testing.T) { - var c, a, m = tokenize.C, tokenize.A, tokenize.M - AssertHandlers(t, []HandlerT{ - {"missed me!", m.Drop(a.Rune('w')), false, ""}, - {"where are you?", m.Drop(a.Rune('w')), true, ""}, - {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"}, - {"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"}, - {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, - {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, - {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, - {" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, - {" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, - {"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"}, - {"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"}, - {"abcdefghijk", m.ByCallback(a.Str("xyz"), func(s string) string { return "X" }), false, ""}, - {"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, - {"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"}, - }) -} - -// When a TokenMaker encounters an error, this is considered a programmer error. -// A TokenMaker should not be called, unless the input is already validated to -// follow the correct pattern. Therefore, tokenmakers will panic when the -// input cannot be processed successfully. 
-func TestTokenMakerErrorHandling(t *testing.T) { - var a, tok = tokenize.A, tokenize.T - invalid := tok.Boolean("BOOL", a.Str("no")) // not valid for strconv.ParseBool() - tokenizer := tokenize.New(invalid) - AssertPanic(t, PanicT{ - func() { tokenizer("no") }, false, - `boolean token invalid (strconv.ParseBool: parsing "no": invalid syntax)`, - }) -} - -func TestTokenMakers(t *testing.T) { - var c, a, tok = tokenize.C, tokenize.A, tokenize.T - AssertTokenMakers(t, []TokenMakerT{ - {`empty token`, tok.Str("A", c.ZeroOrMore(a.Digit)), - []tokenize.Token{{Type: "A", Value: ""}}}, - - {`Ѝюج literal \string`, tok.Str("B", c.OneOrMore(a.AnyRune)), - []tokenize.Token{{Type: "B", Value: `Ѝюج literal \string`}}}, - - {`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)), - []tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}}, - - {`\uD801 invalid rune`, tok.StrInterpreted("D", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "D", Value: "� invalid rune"}}}, - - // I don't check the returned error here, but it's good enough to see that the parsing - // stopped after the illegal \g escape sequence. - {`invalid \g escape`, tok.StrInterpreted("E", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "E", Value: "invalid "}}}, - - {"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}}, - {"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{ - {Type: "bar", Value: byte('R')}, - {Type: "bar", Value: byte('O')}, - {Type: "bar", Value: byte('C')}, - {Type: "bar", Value: byte('K')}, - {Type: "bar", Value: byte('S')}, - }}, - - {"Ø*", tok.Rune("P", a.AnyRune), []tokenize.Token{{Type: "P", Value: rune('Ø')}}}, - - {`2147483647XYZ`, tok.Int("D", a.Integer), []tokenize.Token{{Type: "D", Value: int(2147483647)}}}, - {`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []tokenize.Token{{Type: "D", Value: int(-2147483647)}}}, - {`127XYZ`, tok.Int8("E", a.Integer), []tokenize.Token{{Type: "E", Value: int8(127)}}}, - {`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []tokenize.Token{{Type: "E", Value: int8(-127)}}}, - {`32767XYZ`, tok.Int16("F", a.Integer), []tokenize.Token{{Type: "F", Value: int16(32767)}}}, - {`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []tokenize.Token{{Type: "F", Value: int16(-32767)}}}, - {`2147483647XYZ`, tok.Int32("G", a.Integer), []tokenize.Token{{Type: "G", Value: int32(2147483647)}}}, - {`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []tokenize.Token{{Type: "G", Value: int32(-2147483647)}}}, - {`-9223372036854775807XYZ`, tok.Int64("H", a.Signed(a.Integer)), []tokenize.Token{{Type: "H", Value: int64(-9223372036854775807)}}}, - - {`4294967295`, tok.Uint("I", a.Integer), []tokenize.Token{{Type: "I", Value: uint(4294967295)}}}, - {`255XYZ`, tok.Uint8("J", a.Integer), []tokenize.Token{{Type: "J", Value: uint8(255)}}}, - {`65535XYZ`, tok.Uint16("K", a.Integer), []tokenize.Token{{Type: "K", Value: uint16(65535)}}}, - {`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Value: uint32(4294967295)}}}, - {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}}, - - {`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}}, - {`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}}, - - {`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ - {Type: "P", Value: true}, - {Type: "P", Value: true}, - {Type: "P", Value: 
true}, - {Type: "P", Value: true}, - {Type: "P", Value: true}, - {Type: "P", Value: true}, - }}, - - {`0fFfalseFALSEFalse`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ - {Type: "P", Value: false}, - {Type: "P", Value: false}, - {Type: "P", Value: false}, - {Type: "P", Value: false}, - {Type: "P", Value: false}, - {Type: "P", Value: false}, - }}, - - {`anything`, tok.ByValue("Q", c.OneOrMore(a.AnyRune), "Kaboom!"), []tokenize.Token{{Type: "Q", Value: "Kaboom!"}}}, - }) -} - -func TestTokenGroup_Match(t *testing.T) { - var c, a, tok = tokenize.C, tokenize.A, tokenize.T - tokenizer := tokenize.New(tok.Group("Group", - c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter)))) - - api, err := tokenizer("xxxxx") - AssertTrue(t, err == nil, "Tokenizer result") - tokens := api.Tokens() - AssertEqual(t, 1, len(tokens), "Length of tokens slice") - contained := tokens[0].Value.([]tokenize.Token) - AssertEqual(t, 3, len(contained), "Length of contained tokens") - AssertEqual(t, 1, contained[0].Type.(int), "Value of contained Token 1") - AssertEqual(t, 2, contained[1].Type.(int), "Value of contained Token 2") - AssertEqual(t, 3, contained[2].Type.(int), "Value of contained Token 3") -} - -func TestTokenGroup_Mismatch(t *testing.T) { - var c, a, tok = tokenize.C, tokenize.A, tokenize.T - tokenizer := tokenize.New(tok.Group("Group", - c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))).Optional()) - - api, err := tokenizer("12345") - AssertTrue(t, err == nil, "Tokenizer result") - tokens := api.Tokens() - AssertEqual(t, 0, len(tokens), "Length of tokens slice") -} - -// I know, this is hell, but that's the whole point for this test :-> -func TestCombination(t *testing.T) { - var c, a, m = tokenize.C, tokenize.A, tokenize.M - demonic := c.Seq( - c.Optional(a.SquareOpen), - m.Trim( - c.Seq( - c.Optional(a.Blanks), - c.Repeated(3, a.AngleClose), - m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string { - return fmt.Sprintf("%d", len(s)) - }), - m.Replace(c.Separated(a.Comma, c.Optional(a.Blanks)), ", "), - m.ToUpper(c.Min(1, a.ASCIILower)), - m.Drop(a.Excl), - c.Repeated(3, a.AngleOpen), - c.Optional(a.Blanks), - ), - " \t", - ), - c.Optional(a.SquareClose), - ) - - AssertHandlers(t, []HandlerT{ - {"[ \t >>>Hello, world!<<< ]", demonic, true, "[>>>5, WORLD<<<]"}, - {"[ \t >>>Hello, world!<<< ", demonic, true, "[>>>5, WORLD<<<"}, - {">>>HellohellO, world!<<< ]", demonic, true, ">>>10, WORLD<<<]"}, - {"[ \t >>>HellohellO , , , world!<<< ", demonic, true, "[>>>10, WORLD<<<"}, - }) -} diff --git a/tokenize2/tokenize.go b/tokenize2/tokenize.go deleted file mode 100644 index 1fc35c3..0000000 --- a/tokenize2/tokenize.go +++ /dev/null @@ -1,41 +0,0 @@ -// Package tokenize provides tooling to build a tokenizer in -// parser/combinator-style, used to feed data to the parser. -package tokenize2 - -import ( - "fmt" -) - -// Func is the function signature as returned by New: a function that takes -// any supported type of input, executes a tokenizer run and returns a -// Result struct (possibly nil) and an error (possibly nil). -type Func func(input interface{}) (*API, error) - -// New instantiates a new tokenizer. -// -// The tokenizer is a tokenizing state machine, in which tokenize.Handler -// functions are used to move the state machine forward during tokenizing. -// Using the New function, you can wrap a tokenize.Handler in a simple way, -// making it possible to feed some input to the handler and retrieve the -// tokenizing results. 
-//
-// The tokenHandler argument points the tokenizer to the tokenize.Handler function
-// that must be executed at the start of the tokenizing process. From there on,
-// other tokenize.Handler functions can be invoked recursively to implement the
-// tokenizing process.
-//
-// This function returns a function that can be invoked to run the tokenizer
-// against the provided input data. For an overview of allowed inputs, take a
-// look at the documentation for parsekit.read.New().
-func New(tokenHandler Handler) Func {
-	return func(input interface{}) (*API, error) {
-		api := NewAPI(input)
-		ok := tokenHandler(api)
-
-		if !ok {
-			err := fmt.Errorf("mismatch at %s", Cursor{})
-			return nil, err
-		}
-		return api, nil
-	}
-}
diff --git a/tokenize2/tokenizer_test.go b/tokenize2/tokenizer_test.go
deleted file mode 100644
index 55fe905..0000000
--- a/tokenize2/tokenizer_test.go
+++ /dev/null
@@ -1,223 +0,0 @@
-package tokenize2_test
-
-import (
-	"fmt"
-	"io"
-	"strings"
-	"testing"
-	"unicode/utf8"
-
-	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
-)
-
-// TODO For error handling, it would be really cool if for example the
-// 10.0.300.1/24 case would return an actual error stating that
-// 300 is not a valid octet for an IPv4 address.
-// The biggest thing to take care of here is that errors should not stop
-// a Parser flow (since we might be trying to match different cases in
-// sequence), but a Parser flow should optionally be able to make use
-// of the actual error.
-// The same goes for a Tokenizer, since those can also make use of
-// optional matching using tokenize.C.Any(...) for example. If matching
-// for Any(IPv4, Digits), the example case should simply end up with 10
-// after the IPv4 mismatch.
-func ExampleNew() {
-	// Build the tokenizer for ip/mask.
-	var c, a, t = tokenize.C, tokenize.A, tokenize.T
-	ip := t.Str("ip", a.IPv4)
-	mask := t.Int8("mask", a.IPv4CIDRMask)
-	cidr := c.Seq(ip, a.Slash, mask)
-	tokenizer := tokenize.New(cidr)
-
-	for _, input := range []string{
-		"000.000.000.000/000",
-		"192.168.0.1/24",
-		"255.255.255.255/32",
-		"10.0.300.1/24",
-		"not an IPv4 CIDR",
-	} {
-		// The tokenizer returns a result API and an error, which is nil on success.
- result, err := tokenizer(input) - - if err == nil { - fmt.Printf("Result: %s\n", result.Tokens()) - } else { - fmt.Printf("Error: %s\n", err) - } - } - // Output: - // Result: [ip("0.0.0.0") mask((int8)0)] - // Result: [ip("192.168.0.1") mask((int8)24)] - // Result: [ip("255.255.255.255") mask((int8)32)] - // Error: mismatch at start of file - // Error: mismatch at start of file -} - -func TestCallingNextRune_ReturnsNextRune(t *testing.T) { - api := makeTokenizeAPI() - r, _ := api.NextRune() - AssertEqual(t, 'T', r, "first rune") -} - -func TestInputCanAcceptRunesFromReader(t *testing.T) { - i := makeTokenizeAPI() - i.NextRune() - i.Accept() - i.NextRune() - i.Accept() - i.NextRune() - i.Accept() - AssertEqual(t, "Tes", i.String(), "i.String()") -} - -func TestCallingNextRuneTwice_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - i.NextRune() - i.NextRune() - }, - Regexp: true, - Expect: `tokenize\.API\.NextRune\(\): NextRune\(\) called at /.*_test\.go:\d+ ` + - `without a prior call to Accept\(\)`, - }) -} - -func TestCallingAcceptWithoutCallingNextRune_Panics(t *testing.T) { - api := makeTokenizeAPI() - AssertPanic(t, PanicT{ - Function: api.Accept, - Regexp: true, - Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*test\.go:\d+ ` + - `without first calling NextRune\(\)`, - }) -} - -func TestCallingAcceptAfterReadError_Panics(t *testing.T) { - api := tokenize.NewAPI("") - AssertPanic(t, PanicT{ - Function: func() { - api.NextRune() - api.Accept() - }, - Regexp: true, - Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+` + - `, but the prior call to NextRune\(\) failed`, - }) -} - -func TestCallingMergeOnTopLevelAPI_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - i.Merge(0) - }, - Regexp: true, - Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ on the top-level API`}) -} - -func TestCallingMergeOnForkParentAPI_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - child := i.Fork() - i.Fork() - i.Merge(child) - }, - Regexp: true, - Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + - `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) -} - -func TestCallingDisposeOnTopLevelAPI_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - i.Dispose(0) - }, - Regexp: true, - Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ on the top-level API`}) -} - -func TestCallingDisposeOnForkParentAPI_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - child := i.Fork() - i.Fork() - i.Dispose(child) - }, - Regexp: true, - Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ ` + - `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) -} - -func TestCallingForkOnForkedParentAPI_Panics(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - i.Fork() - g := i.Fork() - i.Fork() - i.Merge(g) - }, - Regexp: true, - Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + - `on API stack level 2, but the current stack level is 3 \(forgot to Dispose\(\) a forked child\?\)`}) -} - -func TestForkingInput_ClearsLastRune(t *testing.T) { - AssertPanic(t, PanicT{ - Function: func() { - i := makeTokenizeAPI() - i.NextRune() - i.Fork() 
- i.Accept() - }, - Regexp: true, - Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+ without first calling NextRune\(\)`, - }) -} - -func TestAccept_UpdatesCursor(t *testing.T) { - i := tokenize.NewAPI(strings.NewReader("input\r\nwith\r\nnewlines")) - AssertEqual(t, "start of file", i.Cursor().String(), "cursor 1") - for j := 0; j < 6; j++ { // read "input\r", cursor end up at "\n" - i.NextRune() - i.Accept() - } - AssertEqual(t, "line 1, column 7", i.Cursor().String(), "cursor 2") - i.NextRune() // read "\n", cursor ends up at start of new line - i.Accept() - AssertEqual(t, "line 2, column 1", i.Cursor().String(), "cursor 3") - for j := 0; j < 10; j++ { // read "with\r\nnewl", cursor end up at "i" - i.NextRune() - i.Accept() - } - AssertEqual(t, "line 3, column 5", i.Cursor().String(), "cursor 4") -} - -func TestWhenCallingNextruneAtEndOfFile_EOFIsReturned(t *testing.T) { - i := tokenize.NewAPI(strings.NewReader("X")) - i.NextRune() - i.Accept() - r, err := i.NextRune() - AssertEqual(t, true, r == utf8.RuneError, "returned rune from NextRune()") - AssertEqual(t, true, err == io.EOF, "returned error from NextRune()") -} -func TestAfterReadingruneAtEndOfFile_EarlierRunesCanStillBeAccessed(t *testing.T) { - i := tokenize.NewAPI(strings.NewReader("X")) - child := i.Fork() - i.NextRune() - i.Accept() - r, err := i.NextRune() - AssertEqual(t, true, r == utf8.RuneError, "returned rune from 2nd NextRune()") - i.Dispose(child) // brings the read offset back to the start - r, err = i.NextRune() // so here we should see the same rune - AssertEqual(t, 'X', r, "returned rune from 2nd NextRune()") - AssertEqual(t, true, err == nil, "returned error from 2nd NextRune()") -} - -func makeTokenizeAPI() *tokenize.API { - return tokenize.NewAPI("Testing") -} diff --git a/tokenize2/tokenizer_whitebox_test.go b/tokenize2/tokenizer_whitebox_test.go deleted file mode 100644 index 5c6556c..0000000 --- a/tokenize2/tokenizer_whitebox_test.go +++ /dev/null @@ -1,131 +0,0 @@ -package tokenize2 - -import ( - "testing" -) - -func TestFork_CreatesForkOfInputAtSameCursorPosition(t *testing.T) { - // Create input, accept the first rune. - i := NewAPI("Testing") - i.NextRune() - i.Accept() // T - AssertEqual(t, "T", i.String(), "accepted rune in input") - // Fork - child := i.Fork() - AssertEqual(t, 1, i.stackFrame.cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 1, i.stackFrame.offset, "parent offset") - AssertEqual(t, 1, i.stackFrame.cursor.Byte, "child cursor.Byte") - AssertEqual(t, 1, i.stackFrame.offset, "child offset") - // Accept two runes via fork. 
- i.NextRune() - i.Accept() // e - i.NextRune() - i.Accept() // s - AssertEqual(t, "es", i.String(), "result runes in fork") - AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset") - AssertEqual(t, 3, i.stackFrame.cursor.Byte, "child cursor.Byte") - AssertEqual(t, 3, i.stackFrame.offset, "child offset") - // Merge fork back into parent - i.Merge(child) - i.Dispose(child) - AssertEqual(t, "Tes", i.String(), "result runes in parent Input after Merge()") - AssertEqual(t, 3, i.stackFrame.cursor.Byte, "parent cursor.Byte") - AssertEqual(t, 3, i.stackFrame.offset, "parent offset") -} - -func TestGivenForkedChildWhichAcceptedRune_AfterMerging_RuneEndsUpInParentResult(t *testing.T) { - i := NewAPI("Testing") - i.NextRune() - i.Accept() - f1 := i.Fork() - i.NextRune() - i.Accept() - f2 := i.Fork() - i.NextRune() - i.Accept() - AssertEqual(t, "s", i.String(), "f2 String()") - AssertEqual(t, 3, i.stackFrame.offset, "f2.offset A") - i.Merge(f2) - i.Dispose(f2) - AssertEqual(t, "es", i.String(), "f1 String()") - AssertEqual(t, 3, i.stackFrame.offset, "f1.offset A") - i.Merge(f1) - i.Dispose(f1) - AssertEqual(t, "Tes", i.String(), "top-level API String()") - AssertEqual(t, 3, i.stackFrame.offset, "f1.offset A") -} - -func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *testing.T) { - i := NewAPI("Testing") - r, _ := i.NextRune() - AssertEqual(t, 'T', r, "result from 1st call to NextRune()") - AssertTrue(t, i.lastRune == 'T', "API.lastRune after NextRune() is not 'T'") - AssertTrue(t, i.runeRead, "API.runeRead after NextRune() is not true") - i.Accept() - AssertTrue(t, i.runeRead == false, "API.runeRead after Accept() is not false") - AssertEqual(t, 1, i.stackFrame.offset, "API.stackFrame.offset") - r, _ = i.NextRune() - AssertEqual(t, 'e', r, "result from 2nd call to NextRune()") -} - -func TestFlushInput(t *testing.T) { - api := NewAPI("cool") - - // Flushing without any read data is okay. FlushInput() will return - // false in this case, and nothing else happens. - AssertTrue(t, api.FlushInput() == false, "flush input at start") - - api.NextRune() - api.Accept() - api.NextRune() - api.Accept() - - AssertTrue(t, api.FlushInput() == true, "flush input after reading some data") - AssertEqual(t, 0, api.stackFrame.offset, "offset after flush input") - - AssertTrue(t, api.FlushInput() == false, "flush input after flush input") - - // Read offset is now zero, but reading should continue after "co". 
- api.NextRune() - api.Accept() - api.NextRune() - api.Accept() - - AssertEqual(t, "cool", api.String(), "end result") -} - -func TestInputFlusherWrapper(t *testing.T) { - runeA := A.Rune('a') - flushB := C.FlushInput(A.Rune('b')) - api := NewAPI("abaab") - runeA(api) - AssertEqual(t, 1, api.stackFrame.offset, "offset after 1 read") - AssertEqual(t, "a", api.String(), "runes after 1 read") - flushB(api) - AssertEqual(t, 0, api.stackFrame.offset, "offset after 2 reads + input flush") - AssertEqual(t, "ab", api.String(), "runes after 2 reads") - runeA(api) - AssertEqual(t, 1, api.stackFrame.offset, "offset after 3 reads") - AssertEqual(t, "aba", api.String(), "runes after 3 reads") - runeA(api) - AssertEqual(t, 2, api.stackFrame.offset, "offset after 4 reads") - AssertEqual(t, "abaa", api.String(), "runes after 4 reads") - flushB(api) - AssertEqual(t, 0, api.stackFrame.offset, "offset after 5 reads + input flush") - AssertEqual(t, "abaab", api.String(), "runes after 5 reads") -} - -func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { - if expected != actual { - t.Errorf( - "Unexpected value for %s:\nexpected: %q\nactual: %q", - forWhat, expected, actual) - } -} - -func AssertTrue(t *testing.T, b bool, assertion string) { - if !b { - t.Errorf("Assertion %s is false", assertion) - } -}