diff --git a/examples/example_helloManyStateParser_test.go b/examples/example_helloManyStateParser_test.go
index 4eebfdd..6b08742 100644
--- a/examples/example_helloManyStateParser_test.go
+++ b/examples/example_helloManyStateParser_test.go
@@ -144,7 +144,7 @@ func (h *helloparser1) end(p *parse.API) {
 
 	h.greetee = strings.TrimSpace(h.greetee)
 	if h.greetee == "" {
-		p.Error("The name cannot be empty")
+		p.SetError("The name cannot be empty")
 	} else {
 		p.Stop()
 	}
diff --git a/examples/example_helloSingleStateParser_test.go b/examples/example_helloSingleStateParser_test.go
index b9f6aa2..233015c 100644
--- a/examples/example_helloSingleStateParser_test.go
+++ b/examples/example_helloSingleStateParser_test.go
@@ -82,27 +82,27 @@ func (h *helloparser2) Parse(input string) (string, error) {
 func (h *helloparser2) start(p *parse.API) {
 	c, a, m := tokenize.C, tokenize.A, tokenize.M
 	if !p.Accept(a.StrNoCase("hello")) {
-		p.Error("the greeting is not being friendly")
+		p.SetError("the greeting is not being friendly")
 		return
 	}
 	if !p.Accept(c.Seq(c.Optional(a.Blanks), a.Comma, c.Optional(a.Blanks))) {
-		p.Error("the greeting is not properly separated")
+		p.SetError("the greeting is not properly separated")
 		return
 	}
 	if p.Accept(m.TrimSpace(c.OneOrMore(a.AnyRune.Except(a.Excl)))) {
 		h.greetee = p.Result.String()
 		if h.greetee == "" {
-			p.Error("the name cannot be empty")
+			p.SetError("the name cannot be empty")
 			return
 		}
 	} else {
-		p.Error("the greeting is targeted at thin air")
+		p.SetError("the greeting is targeted at thin air")
 		return
 	}
 	if !p.Accept(a.Excl) {
-		p.Error("the greeting is not loud enough")
+		p.SetError("the greeting is not loud enough")
 	} else if !p.Accept(a.EndOfFile) {
-		p.Error("too much stuff going on after the closing '!'")
+		p.SetError("too much stuff going on after the closing '!'")
 	} else {
 		p.Stop()
 	}
diff --git a/parse/api.go b/parse/api.go
index 38219c2..64fcc22 100644
--- a/parse/api.go
+++ b/parse/api.go
@@ -30,13 +30,14 @@ type API struct {
 // On a successful peek, the results (data + tokens) are returned by the peek.
 // They are availablel (as with Accept()) through parse.API.Result.
 func (parseAPI *API) PeekWithResult(tokenHandler tokenize.Handler) bool {
-	child, ok := parseAPI.invokeHandler("Peek", tokenHandler)
+	_, ok := parseAPI.invokeTokenizeHandler("Peek", tokenHandler)
 	tokenAPI := parseAPI.tokenAPI
 	if ok {
 		parseAPI.Result.Tokens = tokenAPI.Output.Tokens()
 		parseAPI.Result.Runes = tokenAPI.Output.Runes()
 	}
-	tokenAPI.Dispose(child)
+	tokenAPI.Input.Reset()
+	tokenAPI.Output.Reset()
 	return ok
 }
 
@@ -48,11 +49,12 @@
 // No results (data + tokens) are returned by Peek(). If want access to the data
 // through parse.API.Result, make use of PeekWithResult() instead.
 func (parseAPI *API) Peek(tokenHandler tokenize.Handler) bool {
-	child, ok := parseAPI.invokeHandler("Peek", tokenHandler)
-	t := parseAPI.tokenAPI
+	_, ok := parseAPI.invokeTokenizeHandler("Peek", tokenHandler)
+	tokenAPI := parseAPI.tokenAPI
 	parseAPI.Result.Tokens = nil
 	parseAPI.Result.Runes = nil
-	t.Dispose(child)
+	tokenAPI.Input.Reset()
+	tokenAPI.Output.Reset()
 	return ok
 }
 
@@ -61,37 +63,42 @@
 // forward to beyond the match that was found. Otherwise false will be
 // and the read cursor will stay at the same position.
 //
-// After calling this method, you can retrieve the results using the Result() method.
+// After calling this method, you can retrieve the results through the API.Result field.
 func (parseAPI *API) Accept(tokenHandler tokenize.Handler) bool {
-	t := parseAPI.tokenAPI
-	child, ok := parseAPI.invokeHandler("Accept", tokenHandler)
+	tokenAPI := parseAPI.tokenAPI
+	_, ok := parseAPI.invokeTokenizeHandler("Accept", tokenHandler)
 	if ok {
 		// Keep track of the results as produced by this child.
-		parseAPI.Result.Tokens = t.Output.Tokens()
-		parseAPI.Result.Runes = t.Output.Runes()
+		parseAPI.Result.Tokens = tokenAPI.Output.Tokens()
+		parseAPI.Result.Runes = tokenAPI.Output.Runes()
 
-		// Merge to the parent level.
-		t.Merge(child)
-		t.Dispose(child)
+		// Now the results are stored, we can reset the results for the next handler.
+		tokenAPI.Output.Reset()
 
 		// And flush the input reader buffer.
-		t.Input.Flush()
+		tokenAPI.Input.Flush()
 	} else {
-		t.Dispose(child)
+		// No match, so reset the tokenize.API for the next handler.
+		// This moves back the read cursor to the start and clears all results.
+		tokenAPI.Input.Reset()
+		tokenAPI.Output.Reset()
 	}
 	return ok
 }
 
-func (parseAPI *API) invokeHandler(name string, tokenHandler tokenize.Handler) (int, bool) {
+// invokeTokenizeHandler invokes the provided tokenize.Handler directly against
+// the tokenize.API. It centralizes the sanity checks that are shared by the
+// parse.API methods that drive tokenize.Handler functions.
+func (parseAPI *API) invokeTokenizeHandler(name string, tokenHandler tokenize.Handler) (int, bool) {
 	parseAPI.panicWhenStoppedOrInError(name)
 
 	if tokenHandler == nil {
 		callerPanic(name, "parsekit.parse.API.{name}(): {name}() called with nil tokenHandler argument at {caller}")
 	}
 
-	child := parseAPI.tokenAPI.Fork()
+	// The handler runs against the tokenize.API directly; forking is no longer needed here.
 	ok := tokenHandler(parseAPI.tokenAPI)
-	return child, ok
+	return 0, ok
 }
 
 // panicWhenStoppedOrInError will panic when the parser has produced an error
@@ -165,19 +172,18 @@ func (parseAPI *API) Stop() {
 	parseAPI.stopped = true
 }
 
-// Error sets the error message in the API.
+// SetError sets the error message in the API.
 //
 // After setting an error, no more calls to API methods are allowed.
 // Calling a method in this state will result in a panic.
-// TODO ... wait how do I read the error? I don't I guess, I just return it. Is Error() a good name or SetError() better for example?
-func (parseAPI *API) Error(format string, data ...interface{}) {
-	// No call to p.panicWhenStoppedOrInError(), to allow a parser to
-	// set a different error message when needed.
+// You can still call SetError() though, to set a different error message
+// if you feel the need to do so.
+func (parseAPI *API) SetError(format string, data ...interface{}) {
 	message := fmt.Sprintf(format, data...)
 	parseAPI.err = fmt.Errorf("%s at %s", message, parseAPI.tokenAPI.Input.Cursor())
 }
 
-// ExpectEndOfFile can be used to check if the input is at end of file.
+// ExpectEndOfFile checks if the end of the input file has been reached.
 //
 // When it finds that the end of the file was indeed reached, then the parser
 // will be stopped through Stop(). Otherwise, the unexpected input is reported
@@ -209,11 +215,11 @@ func (parseAPI *API) Expected(expected string) {
 	_, err := parseAPI.tokenAPI.Byte.Peek(0)
 	switch {
 	case err == nil:
-		parseAPI.Error("unexpected input%s", fmtExpects(expected))
+		parseAPI.SetError("unexpected input%s", fmtExpects(expected))
 	case err == io.EOF:
-		parseAPI.Error("unexpected end of file%s", fmtExpects(expected))
+		parseAPI.SetError("unexpected end of file%s", fmtExpects(expected))
 	default:
-		parseAPI.Error("unexpected error '%s'%s", err, fmtExpects(expected))
+		parseAPI.SetError("unexpected error '%s'%s", err, fmtExpects(expected))
 	}
 }
 
diff --git a/parse/parse.go b/parse/parse.go
index 0a2bbf8..cc8da66 100644
--- a/parse/parse.go
+++ b/parse/parse.go
@@ -41,6 +41,7 @@ func New(startHandler Handler) Func {
 			// and try to make the best of it.
 			api.ExpectEndOfFile()
 		}
+		return api.err
 	}
 }
 
diff --git a/parse/parse_test.go b/parse/parse_test.go
index 693c669..5864b47 100644
--- a/parse/parse_test.go
+++ b/parse/parse_test.go
@@ -245,7 +245,7 @@ func TestGivenParserWithErrorSet_HandlePanics(t *testing.T) {
 		panic("This is not the handler you're looking for")
 	}
 	p := parse.New(func(p *parse.API) {
-		p.Error("It ends here")
+		p.SetError("It ends here")
 		p.Handle(otherHandler)
 	})
 	parse.AssertPanic(t, parse.PanicT{
diff --git a/read/read.go b/read/read.go
index ab5132f..a4e6ea9 100644
--- a/read/read.go
+++ b/read/read.go
@@ -213,6 +213,7 @@ func (buf *Buffer) fill(minBytes int) {
 }
 
 const defaultBufferSize = 1024
+const runeCacheSize = 128
 
 // ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
 var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
diff --git a/tokenize/api.go b/tokenize/api.go
index 7b084fb..4fc734d 100644
--- a/tokenize/api.go
+++ b/tokenize/api.go
@@ -86,13 +86,14 @@ type API struct {
 }
 
 type stackFrame struct {
-	offset     int // the read offset (relative to the start of the reader buffer) for this stack frame
-	column     int // the column at which the cursor is (0-indexed, relative to the start of the stack frame)
-	line       int // the line at which the cursor is (0-indexed, relative to the start of the stack frame)
-	bytesStart int // the starting point in the API.bytes slice for runes produced by this stack level
-	bytesEnd   int // the end point in the API.bytes slice for runes produced by this stack level
-	tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level
-	tokenEnd   int // the end point in the API.tokens slice for tokens produced by this stack level
+	offsetLocal int // the read offset, relative to the start of this stack frame
+	offset      int // the read offset, relative to the start of the reader buffer
+	column      int // the column at which the cursor is (0-indexed, relative to the start of the stack frame)
+	line        int // the line at which the cursor is (0-indexed, relative to the start of the stack frame)
+	bytesStart  int // the starting point in the API.bytes slice for runes produced by this stack level
+	bytesEnd    int // the end point in the API.bytes slice for runes produced by this stack level
+	tokenStart  int // the starting point in the API.tokens slice for tokens produced by this stack level
+	tokenEnd    int // the end point in the API.tokens slice for tokens produced by this stack level
 
 	// TODO
 	err error // can be used by a Handler to report a specific issue with the input
@@ -177,16 +178,7 @@ func (tokenAPI *API) Fork() int {
 // Once the child is no longer needed, it can be disposed of by using the
 // method Dispose(), which will return the tokenizer to the parent.
 func (tokenAPI *API) Merge(stackLevel int) {
-	if stackLevel == 0 {
-		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
-			"on the top-level API stack level 0")
-	}
-	if stackLevel != tokenAPI.stackLevel {
-		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
-			"on API stack level %d, but the current stack level is %d "+
-			"(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel)
-	}
-
+	tokenAPI.checkStackLevelForMethod("Merge", stackLevel)
 	parent := &tokenAPI.stackFrames[stackLevel-1]
 	f := tokenAPI.stackFrame
 
@@ -206,6 +198,7 @@
 	f.tokenStart = f.tokenEnd
 
 	// Update the parent read offset.
+	parent.offsetLocal = parent.offsetLocal + (f.offset - parent.offset)
 	parent.offset = f.offset
 
 	// Update the parent cursor position.
@@ -221,24 +214,20 @@
 	f.err = nil
 }
 
-// Reset moves the read cursor back to the beginning for the currently active API child.
-// Aditionally, all output (bytes and tokens) that was emitted from the API child is
-// cleared as well.
-func (tokenAPI *API) Reset() {
-	f := tokenAPI.stackFrame
-	f.bytesEnd = f.bytesStart
-	f.tokenEnd = f.tokenStart
-	f.column = 0
-	f.line = 0
-	if tokenAPI.stackLevel == 0 {
-		f.offset = 0
-	} else {
-		f.offset = tokenAPI.stackFrames[tokenAPI.stackLevel-1].offset
-	}
-	f.err = nil
-}
-
 func (tokenAPI *API) Dispose(stackLevel int) {
+	tokenAPI.checkStackLevelForMethod("Dispose", stackLevel)
 	tokenAPI.stackLevel = stackLevel - 1
 	tokenAPI.stackFrame = &tokenAPI.stackFrames[stackLevel-1]
 }
+
+func (tokenAPI *API) checkStackLevelForMethod(name string, stackLevel int) {
+	if stackLevel == 0 {
+		callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
+			"on the top-level API stack level 0")
+	}
+	if stackLevel != tokenAPI.stackLevel {
+		callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
+			"on API stack level %d, but the current stack level is %d "+
+			"(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel)
+	}
+}
diff --git a/tokenize/api_bytemode.go b/tokenize/api_bytemode.go
index 4f50849..87bf65d 100644
--- a/tokenize/api_bytemode.go
+++ b/tokenize/api_bytemode.go
@@ -30,7 +30,7 @@ func (byteMode InputByteMode) Accept(b byte) {
 	byteMode.MoveCursor(b)
 }
 
-// AcceptMulti is used to accept one or more bytes that were read from the input.
+// AcceptMulti accepts one or more bytes that were read from the input.
 // This tells the tokenizer: "I've seen these bytes. I want to make use of them
 // for the final output, so please remember them for me. I will now continue
 // reading after these bytes."
@@ -62,6 +62,7 @@ func (byteMode InputByteMode) MoveCursor(b byte) {
 	}
 
 	f.offset++
+	f.offsetLocal++
 }
 
 // MoveCursorMulti updates the position of the read cursor, based on the provided bytes.
diff --git a/tokenize/api_input.go b/tokenize/api_input.go
index 49e3978..7001963 100644
--- a/tokenize/api_input.go
+++ b/tokenize/api_input.go
@@ -30,6 +30,16 @@ func (i Input) Cursor() string {
 	return fmt.Sprintf("line %d, column %d", line+1, column+1)
 }
 
+func (i Input) Reset() {
+	f := i.api.stackFrame
+	if f.offsetLocal > 0 {
+		f.column = 0
+		f.line = 0
+		f.offset -= f.offsetLocal
+		f.offsetLocal = 0
+	}
+}
+
 // Flush flushes input data from the read buffer up to the current
 // read cursor position of the tokenizer.
 //
@@ -41,6 +51,7 @@
 	if f.offset > 0 {
 		i.reader.Flush(f.offset)
 		f.offset = 0
+		f.offsetLocal = 0
 		return true
 	}
 	return false
diff --git a/tokenize/api_output.go b/tokenize/api_output.go
index e2c2d27..786029d 100644
--- a/tokenize/api_output.go
+++ b/tokenize/api_output.go
@@ -26,6 +26,13 @@ func (o Output) Rune(offset int) rune {
 	return r
 }
 
+func (o Output) Reset() {
+	f := o.api.stackFrame
+	f.bytesEnd = f.bytesStart
+	f.tokenEnd = f.tokenStart
+	f.err = nil
+}
+
 func (o Output) ClearData() {
 	f := o.api.stackFrame
 	f.bytesEnd = f.bytesStart
diff --git a/tokenize/api_runemode.go b/tokenize/api_runemode.go
index 188fb92..c9f7193 100644
--- a/tokenize/api_runemode.go
+++ b/tokenize/api_runemode.go
@@ -97,6 +97,7 @@ func (runeMode InputRuneMode) MoveCursor(r rune) int {
 
 	width := utf8.RuneLen(r)
 	f.offset += width
+	f.offsetLocal += width
 	return width
 }
 
diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go
index 5261c42..b4b9576 100644
--- a/tokenize/handlers_builtin.go
+++ b/tokenize/handlers_builtin.go
@@ -663,23 +663,23 @@ func MatchStrNoCase(expected string) Handler {
 
 	return func(tokenAPI *API) bool {
 		matches := make([]rune, l)
-		width := 0
+		offset := 0
 		i := 0
 		for _, e := range expected {
 			if e <= '\x7F' {
-				b, err := tokenAPI.Byte.Peek(width)
+				b, err := tokenAPI.Byte.Peek(offset)
 				if err != nil || (b != byte(e) && unicode.ToUpper(rune(b)) != unicode.ToUpper(e)) {
 					return false
 				}
 				matches[i] = rune(b)
-				width++
+				offset++
 			} else {
-				r, w, err := tokenAPI.Rune.Peek(width)
+				r, w, err := tokenAPI.Rune.Peek(offset)
 				if err != nil || (r != e && unicode.ToUpper(r) != unicode.ToUpper(e)) {
 					return false
 				}
 				matches[i] = r
-				width += w
+				offset += w
 			}
 			i++
 		}
@@ -737,7 +737,8 @@ func MatchAny(handlers ...Handler) Handler {
 				tokenAPI.Dispose(child)
 				return true
 			}
-			tokenAPI.Reset()
+			tokenAPI.Input.Reset()
+			tokenAPI.Output.Reset()
 		}
 
 		tokenAPI.Dispose(child)
diff --git a/tokenize/tokenize.go b/tokenize/tokenize.go
index df24951..d31eff8 100644
--- a/tokenize/tokenize.go
+++ b/tokenize/tokenize.go
@@ -34,7 +34,7 @@ func (result *Result) String() string {
 // other tokenize.Handler functions can be invoked recursively to implement the
 // tokenizing process.
 //
-// THis function returns a function that can be invoked to run the tokenizer
+// This function returns a function that can be invoked to run the tokenizer
 // against the provided input data. For an overview of allowed inputs, take a
 // look at the documentation for parsekit.read.New().
 func New(tokenHandler Handler) Func {