From 4c94374107fb3120a4b66cea3d94784324db0784 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Fri, 26 Jul 2019 12:14:15 +0000 Subject: [PATCH] Getting rid of forking, the new system delivers more performance. --- read/read.go | 2 +- tokenize/api.go | 174 +++-------------------------------- tokenize/api_bytemode.go | 16 ++-- tokenize/api_input.go | 33 ++----- tokenize/api_output.go | 77 +++++++--------- tokenize/api_runemode.go | 27 +++--- tokenize/handlers_builtin.go | 38 +++----- 7 files changed, 87 insertions(+), 280 deletions(-) diff --git a/read/read.go b/read/read.go index a4e6ea9..d48ce6f 100644 --- a/read/read.go +++ b/read/read.go @@ -93,7 +93,7 @@ func makeBufioReader(input interface{}) *bufio.Reader { // To minimize memory use, it is also possible to flush the read buffer when there is // no more need to go back to previously read data. // -// This parserkit.reader.Reader is used internally by tokenize.API. +// This buffer is used internally by tokenize.API. type Buffer struct { bufio *bufio.Reader // used for ReadRune() buffer []byte // input buffer, holding bytes that were read from input diff --git a/tokenize/api.go b/tokenize/api.go index 82ed102..50bb6fe 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -71,20 +71,14 @@ import ( // can lead to hard to track bugs. I much prefer this forking method, since // no bookkeeping has to be implemented when implementing a parser. type API struct { - stackFrames []stackFrame // the stack frames, containing stack level-specific data - stackLevel int // the current stack level - stackFrame *stackFrame // the current stack frame - - reader *read.Buffer // the buffered input reader - Input Input // provides input-related functionality - Byte InputByteMode // access to a set of byte-based input methods - Rune InputRuneMode // access to a set of rune-based input methods - - Output Output // provides output-related functionality - outputTokens []Token // accepted tokens - outputBytes []byte // accepted bytes - - snapshot [9]int // storage for the Snapshot() / RestoreSnapshot() feature + reader *read.Buffer // the buffered input reader + pointers stackFrame // various pointers for keeping track of input, output, cursor. + Input Input // access to a set of general input-related methods + Byte InputByteMode // access to a set of byte-based input methods + Rune InputRuneMode // access to a set of rune-based input methods + Output Output // access to a set of output-related functionality + outputTokens []Token // storage for accepted tokens + outputBytes []byte // storage for accepted bytes } type stackFrame struct { @@ -96,14 +90,10 @@ type stackFrame struct { bytesEnd int // the end point in the API.bytes slice for runes produced by this stack level tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level tokenEnd int // the end point in the API.tokens slice for tokens produced by this stack level - - // TODO - err error // can be used by a Handler to report a specific issue with the input } -const initialStackDepth = 64 const initialTokenStoreLength = 64 -const initialByteStoreLength = 1024 +const initialByteStoreLength = 128 // NewAPI initializes a new API struct, wrapped around the provided input. // For an overview of allowed inputs, take a look at the documentation @@ -111,7 +101,6 @@ const initialByteStoreLength = 1024 func NewAPI(input interface{}) *API { reader := read.New(input) tokenAPI := &API{ - stackFrames: make([]stackFrame, initialStackDepth), outputBytes: make([]byte, initialByteStoreLength), outputTokens: make([]Token, initialTokenStoreLength), reader: reader, @@ -120,154 +109,15 @@ func NewAPI(input interface{}) *API { tokenAPI.Byte = InputByteMode{api: tokenAPI, reader: reader} tokenAPI.Rune = InputRuneMode{api: tokenAPI, reader: reader} tokenAPI.Output = Output{api: tokenAPI} - tokenAPI.stackFrame = &tokenAPI.stackFrames[0] - tokenAPI.snapshot[0] = -1 - return tokenAPI } -// Fork forks off a child of the API struct. It will reuse the same -// read buffer and cursor position, but for the rest this can be considered -// a fresh API. -// -// By forking an API, you can freely work with the forked child, without -// affecting the parent API. This is for example useful when you must perform -// some form of lookahead. -// -// When processing of the Handler was successful and you want to add the results -// to the parent API, you can call Merge() on the forked child. -// This will add the results to the results of the parent (runes, tokens). -// It also updates the read cursor position of the parent to that of the child. -// -// When the lookahead was unsuccessful, then the forked child API can -// disposed by calling Dispose() on the forked child. This is not mandatory. -// Garbage collection will take care of this automatically. -// The parent API was never modified, so it can safely be used after disposal -// as if the lookahead never happened. -func (tokenAPI *API) Fork() int { - tokenAPI.stackLevel++ - newStackLevel := tokenAPI.stackLevel - - // Grow the stack frames capacity when needed. - frames := tokenAPI.stackFrames - if cap(frames) < (newStackLevel + 1) { - newFrames := make([]stackFrame, cap(frames)*2) - copy(newFrames, frames) - tokenAPI.stackFrames = newFrames - } - - parent := tokenAPI.stackFrame - tokenAPI.stackFrames[newStackLevel] = stackFrame{ - offset: parent.offset, - bytesStart: parent.bytesEnd, - bytesEnd: parent.bytesEnd, - tokenStart: parent.tokenEnd, - tokenEnd: parent.tokenEnd, - } - tokenAPI.stackFrame = &tokenAPI.stackFrames[newStackLevel] - - return newStackLevel -} - -// Merge appends the results of a forked child API (runes, tokens) to the -// results of its parent. The read cursor of the parent is also updated -// to that of the forked child. -// -// After the merge operation, the child results are reset so it can immediately -// be reused for performing another match. This means that all Result data are -// cleared, but the read cursor position is kept at its current position. -// This allows a child to feed results in chunks to its parent. -// -// Once the child is no longer needed, it can be disposed of by using the -// method Dispose(), which will return the tokenizer to the parent. -func (tokenAPI *API) Merge(stackLevel int) { - tokenAPI.checkStackLevelForMethod("Merge", stackLevel) - parent := &tokenAPI.stackFrames[stackLevel-1] - f := tokenAPI.stackFrame - - // The end of the parent slice aligns with the start of the child slice. - // Because of this, to merge the parent slice can simply be expanded - // to include the child slice. - // parent : |----------| - // child: |------| - // After merge operation: - // parent: |-----------------| - // child: |---> continue reading from here - parent.bytesEnd = f.bytesEnd - f.bytesStart = f.bytesEnd - - // The same logic applies to tokens. - parent.tokenEnd = f.tokenEnd - f.tokenStart = f.tokenEnd - - // Update the parent read offset. - parent.offsetLocal = parent.offsetLocal + (f.offset - parent.offset) - parent.offset = f.offset - - // Update the parent cursor position. - if f.line > parent.line { - parent.line += f.line - parent.column = f.column - } else { - parent.column += f.column - } - - f.line = 0 - f.column = 0 - f.err = nil -} - -func (tokenAPI *API) Dispose(stackLevel int) { - tokenAPI.checkStackLevelForMethod("Dispose", stackLevel) - tokenAPI.stackLevel = stackLevel - 1 - tokenAPI.stackFrame = &tokenAPI.stackFrames[stackLevel-1] -} - -func (tokenAPI *API) checkStackLevelForMethod(name string, stackLevel int) { - if stackLevel == 0 { - callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+ - "on the top-level API stack level 0") - } - if stackLevel != tokenAPI.stackLevel { - callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+ - "on API stack level %d, but the current stack level is %d "+ - "(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel) - } -} - -type Snapshot [9]int +type Snapshot stackFrame func (tokenAPI *API) MakeSnapshot() Snapshot { - f := tokenAPI.stackFrame - - return Snapshot{ - tokenAPI.stackLevel, - f.bytesStart, - f.bytesEnd, - f.tokenStart, - f.tokenEnd, - f.offset, - f.offsetLocal, - f.line, - f.column, - } + return Snapshot(tokenAPI.pointers) } func (tokenAPI *API) RestoreSnapshot(snap Snapshot) { - f := tokenAPI.stackFrame - - if snap[0] != tokenAPI.stackLevel { - callerPanic("RestoreSnapshot", "tokenize.API.{name}(): {name}() called at {caller} "+ - "on API stack level %d, but the provided snapshot was created for stack level %d", - tokenAPI.stackLevel, snap[0]) - } - - f.bytesStart = snap[1] - f.bytesEnd = snap[2] - f.tokenStart = snap[3] - f.tokenEnd = snap[4] - f.offset = snap[5] - f.offsetLocal = snap[6] - f.line = snap[7] - f.column = snap[8] + tokenAPI.pointers = stackFrame(snap) } diff --git a/tokenize/api_bytemode.go b/tokenize/api_bytemode.go index 87bf65d..6decf46 100644 --- a/tokenize/api_bytemode.go +++ b/tokenize/api_bytemode.go @@ -14,7 +14,7 @@ type InputByteMode struct { // When an offset is requested that is beyond the length of the available input // data, then the error will be io.EOF. func (byteMode InputByteMode) Peek(offset int) (byte, error) { - return byteMode.reader.ByteAt(byteMode.api.stackFrame.offset + offset) + return byteMode.reader.ByteAt(byteMode.api.pointers.offset + offset) } // PeekMulti returns at max the provided maximum number of bytes at the provided @@ -22,7 +22,7 @@ func (byteMode InputByteMode) Peek(offset int) (byte, error) { // error as such. The returned error can in such case be set to io.EOF to indicate // that the end of the input was reached though. func (byteMode InputByteMode) PeekMulti(offset int, count int) ([]byte, error) { - return byteMode.reader.BytesAt(byteMode.api.stackFrame.offset+offset, count) + return byteMode.reader.BytesAt(byteMode.api.pointers.offset+offset, count) } func (byteMode InputByteMode) Accept(b byte) { @@ -53,16 +53,16 @@ func (byteMode InputByteMode) AcceptMulti(bytes ...byte) { // After the call, byte offset 0 for Peek() and PeekMulti() will point at // the first byte at the new cursor position. func (byteMode InputByteMode) MoveCursor(b byte) { - f := byteMode.api.stackFrame + a := byteMode.api if b == '\n' { - f.column = 0 - f.line++ + a.pointers.column = 0 + a.pointers.line++ } else { - f.column++ + a.pointers.column++ } - f.offset++ - f.offsetLocal++ + a.pointers.offset++ + a.pointers.offsetLocal++ } // MoveCursorMulti updates the position of the read cursor, based on the provided bytes. diff --git a/tokenize/api_input.go b/tokenize/api_input.go index 7001963..839dfd9 100644 --- a/tokenize/api_input.go +++ b/tokenize/api_input.go @@ -15,29 +15,10 @@ type Input struct { // Cursor returns a string that describes the current read cursor position. func (i Input) Cursor() string { - column, line := 0, 0 - for _, f := range i.api.stackFrames[:i.api.stackLevel+1] { - if f.line > 0 { - column = f.column - line += f.line - } else { - column += f.column - } - } - if line == 0 && column == 0 { + if i.api.pointers.line == 0 && i.api.pointers.column == 0 { return fmt.Sprintf("start of file") } - return fmt.Sprintf("line %d, column %d", line+1, column+1) -} - -func (i Input) Reset() { - f := i.api.stackFrame - if f.offsetLocal > 0 { - f.column = 0 - f.line = 0 - f.offset -= f.offsetLocal - f.offsetLocal = 0 - } + return fmt.Sprintf("line %d, column %d", i.api.pointers.line+1, i.api.pointers.column+1) } // Flush flushes input data from the read buffer up to the current @@ -47,11 +28,11 @@ func (i Input) Reset() { // Parsekit will call this method at points where it knows it is a // safe thing to do. func (i Input) Flush() bool { - f := i.api.stackFrame - if f.offset > 0 { - i.reader.Flush(f.offset) - f.offset = 0 - f.offsetLocal = 0 + a := i.api + if a.pointers.offset > 0 { + i.reader.Flush(a.pointers.offset) + a.pointers.offset = 0 + a.pointers.offsetLocal = 0 return true } return false diff --git a/tokenize/api_output.go b/tokenize/api_output.go index 1b315dc..595f097 100644 --- a/tokenize/api_output.go +++ b/tokenize/api_output.go @@ -11,8 +11,7 @@ type Output struct { func (o Output) String() string { a := o.api - f := a.stackFrame - bytes := a.outputBytes[f.bytesStart:f.bytesEnd] + bytes := a.outputBytes[a.pointers.bytesStart:a.pointers.bytesEnd] return string(bytes) } @@ -22,36 +21,35 @@ func (o Output) Runes() []rune { func (o Output) Rune(offset int) rune { a := o.api - r, _ := utf8.DecodeRune(a.outputBytes[a.stackFrame.bytesStart+offset:]) + r, _ := utf8.DecodeRune(a.outputBytes[a.pointers.bytesStart+offset:]) return r } type Split [2]int func (o Output) Split() Split { - f := o.api.stackFrame - split := Split{f.bytesStart, f.tokenStart} - f.bytesStart = f.bytesEnd - f.tokenStart = f.tokenEnd + a := o.api + split := Split{a.pointers.bytesStart, a.pointers.tokenStart} + a.pointers.bytesStart = a.pointers.bytesEnd + a.pointers.tokenStart = a.pointers.tokenEnd return split } func (o Output) MergeSplit(split Split) { - f := o.api.stackFrame - f.bytesStart = split[0] - f.tokenStart = split[1] + a := o.api + a.pointers.bytesStart = split[0] + a.pointers.tokenStart = split[1] } func (o Output) Reset() { - f := o.api.stackFrame - f.bytesEnd = f.bytesStart - f.tokenEnd = f.tokenStart - f.err = nil + a := o.api + a.pointers.bytesEnd = a.pointers.bytesStart + a.pointers.tokenEnd = a.pointers.tokenStart } func (o Output) ClearData() { - f := o.api.stackFrame - f.bytesEnd = f.bytesStart + a := o.api + a.pointers.bytesEnd = a.pointers.bytesStart } func (o Output) SetBytes(bytes ...byte) { @@ -61,11 +59,10 @@ func (o Output) SetBytes(bytes ...byte) { func (o Output) AddByte(b byte) { a := o.api - f := a.stackFrame - curBytesEnd := f.bytesEnd + curBytesEnd := a.pointers.bytesEnd a.growOutputData(curBytesEnd + 1) a.outputBytes[curBytesEnd] = b - f.bytesEnd++ + a.pointers.bytesEnd++ } func (o Output) SetRunes(runes ...rune) { @@ -75,22 +72,20 @@ func (o Output) SetRunes(runes ...rune) { func (o Output) AddBytes(bytes ...byte) { a := o.api - f := a.stackFrame - curBytesEnd := f.bytesEnd + curBytesEnd := a.pointers.bytesEnd newBytesEnd := curBytesEnd + len(bytes) a.growOutputData(newBytesEnd) copy(a.outputBytes[curBytesEnd:], bytes) - f.bytesEnd = newBytesEnd + a.pointers.bytesEnd = newBytesEnd } func (o Output) AddRunes(runes ...rune) { a := o.api - f := a.stackFrame runesAsString := string(runes) - newBytesEnd := f.bytesEnd + len(runesAsString) + newBytesEnd := a.pointers.bytesEnd + len(runesAsString) a.growOutputData(newBytesEnd) - copy(a.outputBytes[f.bytesEnd:], runesAsString) - f.bytesEnd = newBytesEnd + copy(a.outputBytes[a.pointers.bytesEnd:], runesAsString) + a.pointers.bytesEnd = newBytesEnd } func (o Output) AddString(s string) { @@ -104,23 +99,22 @@ func (o Output) SetString(s string) { func (o Output) Tokens() []Token { a := o.api - f := a.stackFrame - return a.outputTokens[f.tokenStart:f.tokenEnd] + return a.outputTokens[a.pointers.tokenStart:a.pointers.tokenEnd] } func (o Output) Token(offset int) Token { a := o.api - return a.outputTokens[a.stackFrame.tokenStart+offset] + return a.outputTokens[a.pointers.tokenStart+offset] } func (o Output) TokenValue(offset int) interface{} { a := o.api - return a.outputTokens[a.stackFrame.tokenStart+offset].Value + return a.outputTokens[a.pointers.tokenStart+offset].Value } func (o Output) ClearTokens() { - f := o.api.stackFrame - f.tokenEnd = f.tokenStart + a := o.api + a.pointers.tokenEnd = a.pointers.tokenStart } func (o Output) SetTokens(tokens ...Token) { @@ -130,18 +124,16 @@ func (o Output) SetTokens(tokens ...Token) { func (o Output) AddToken(token Token) { a := o.api - f := a.stackFrame - tokenEnd := f.tokenEnd + tokenEnd := a.pointers.tokenEnd a.growOutputTokens(tokenEnd + 1) a.outputTokens[tokenEnd] = token - f.tokenEnd++ + a.pointers.tokenEnd++ } func (o Output) InsertTokenAtStart(token Token) { a := o.api - f := a.stackFrame - tokenEnd := f.tokenEnd - tokenStart := f.tokenStart + tokenEnd := a.pointers.tokenEnd + tokenStart := a.pointers.tokenStart a.growOutputTokens(tokenEnd + 1) if tokenStart == tokenEnd { a.outputTokens[tokenEnd] = token @@ -149,16 +141,15 @@ func (o Output) InsertTokenAtStart(token Token) { copy(a.outputTokens[tokenStart+1:], a.outputTokens[tokenStart:tokenEnd]) a.outputTokens[tokenStart] = token } - f.tokenEnd++ + a.pointers.tokenEnd++ } func (o Output) AddTokens(tokens ...Token) { a := o.api - f := a.stackFrame - a.growOutputTokens(f.tokenEnd + len(tokens)) + a.growOutputTokens(a.pointers.tokenEnd + len(tokens)) for _, t := range tokens { - a.outputTokens[f.tokenEnd] = t - f.tokenEnd++ + a.outputTokens[a.pointers.tokenEnd] = t + a.pointers.tokenEnd++ } } diff --git a/tokenize/api_runemode.go b/tokenize/api_runemode.go index c9f7193..8d444c8 100644 --- a/tokenize/api_runemode.go +++ b/tokenize/api_runemode.go @@ -26,7 +26,7 @@ type InputRuneMode struct { // When an offset is requested that is beyond the length of the available input // data, then the error will be io.EOF. func (runeMode InputRuneMode) Peek(offset int) (rune, int, error) { - return runeMode.reader.RuneAt(runeMode.api.stackFrame.offset + offset) + return runeMode.reader.RuneAt(runeMode.api.pointers.offset + offset) } // Accept is used to accept a single rune that was read from the input. @@ -42,14 +42,11 @@ func (runeMode InputRuneMode) Peek(offset int) (rune, int, error) { // the first byte after the accepted rune. func (runeMode InputRuneMode) Accept(r rune) { a := runeMode.api - f := a.stackFrame - - curBytesEnd := f.bytesEnd + curBytesEnd := a.pointers.bytesEnd maxRequiredBytes := curBytesEnd + utf8.UTFMax a.growOutputData(maxRequiredBytes) w := utf8.EncodeRune(a.outputBytes[curBytesEnd:], r) - f.bytesEnd += w - + a.pointers.bytesEnd += w runeMode.MoveCursor(r) } @@ -66,9 +63,7 @@ func (runeMode InputRuneMode) Accept(r rune) { // the first byte after the accepted runes. func (runeMode InputRuneMode) AcceptMulti(runes ...rune) { a := runeMode.api - f := a.stackFrame - - curBytesEnd := f.bytesEnd + curBytesEnd := a.pointers.bytesEnd maxBytes := curBytesEnd + len(runes)*utf8.UTFMax a.growOutputData(maxBytes) @@ -77,7 +72,7 @@ func (runeMode InputRuneMode) AcceptMulti(runes ...rune) { curBytesEnd += w runeMode.MoveCursor(r) } - f.bytesEnd = curBytesEnd + a.pointers.bytesEnd = curBytesEnd } // MoveCursor updates the position of the read cursor, based on the provided rune. @@ -87,17 +82,17 @@ func (runeMode InputRuneMode) AcceptMulti(runes ...rune) { // After the call, byte offset 0 for Peek() and PeekMulti() will point at // the first rune at the new cursor position. func (runeMode InputRuneMode) MoveCursor(r rune) int { - f := runeMode.api.stackFrame + a := runeMode.api if r == '\n' { - f.column = 0 - f.line++ + a.pointers.column = 0 + a.pointers.line++ } else { - f.column++ + a.pointers.column++ } width := utf8.RuneLen(r) - f.offset += width - f.offsetLocal += width + a.pointers.offset += width + a.pointers.offsetLocal += width return width } diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index 069d721..ade50fe 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -707,27 +707,15 @@ func MatchOptional(handler Handler) Handler { // reports successful match. func MatchSeq(handlers ...Handler) Handler { return func(tokenAPI *API) bool { - f := tokenAPI.stackFrame snap := tokenAPI.MakeSnapshot() for _, handler := range handlers { - tokenAPI.Output.Split() - // Move forward the output pointers, so the handler that we're about - // to call will make use of a fresh output buffer. - f.bytesStart = f.bytesEnd - f.tokenStart = f.tokenEnd - + split := tokenAPI.Output.Split() if !handler(tokenAPI) { tokenAPI.RestoreSnapshot(snap) return false } + tokenAPI.Output.MergeSplit(split) } - - // Move back the output pointers to where they were originally. This - // stiches together all the pieces of output that were genarated by - // the individual handlers in the sequence. - f.bytesStart = snap[1] - f.tokenStart = snap[3] - return true } } @@ -842,7 +830,10 @@ func matchMinMax(min int, max int, handler Handler, name string) Handler { snap := tokenAPI.MakeSnapshot() for total < min { total++ - if !handler(tokenAPI) { + split := tokenAPI.Output.Split() + ok := handler(tokenAPI) + tokenAPI.Output.MergeSplit(split) + if !ok { tokenAPI.RestoreSnapshot(snap) return false } @@ -853,7 +844,10 @@ func matchMinMax(min int, max int, handler Handler, name string) Handler { //child.Merge() for max < 0 || total < max { total++ - if !handler(tokenAPI) { + split := tokenAPI.Output.Split() + ok := handler(tokenAPI) + tokenAPI.Output.MergeSplit(split) + if !ok { break } } @@ -1522,13 +1516,13 @@ func MatchIPv6Net(normalize bool) Handler { // In both cases, it would match the first form. func ModifyDrop(handler Handler) Handler { return func(tokenAPI *API) bool { - runeEnd := tokenAPI.stackFrame.bytesEnd - tokenEnd := tokenAPI.stackFrame.tokenEnd + runeEnd := tokenAPI.pointers.bytesEnd + tokenEnd := tokenAPI.pointers.tokenEnd if handler(tokenAPI) { // We keep offset and cursor updates, but rollback any runes / tokens // that were added by the handler. - tokenAPI.stackFrame.bytesEnd = runeEnd - tokenAPI.stackFrame.tokenEnd = tokenEnd + tokenAPI.pointers.bytesEnd = runeEnd + tokenAPI.pointers.tokenEnd = tokenEnd return true } return false @@ -1921,8 +1915,6 @@ func MakeTokenByValue(toktype interface{}, handler Handler, value interface{}) H // its input and must return the token value. func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(tokenAPI *API) interface{}) Handler { return func(tokenAPI *API) bool { - snap := tokenAPI.MakeSnapshot() - split := tokenAPI.Output.Split() if handler(tokenAPI) { // When a parsing hierarchy looks like ("date" ("year", "month" "day")), the // tokens must end up in the order "date", "year", "month", "day" and not @@ -1931,11 +1923,9 @@ func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(to // that were already created by the handler call. token := Token{Type: toktype, Value: makeValue(tokenAPI)} tokenAPI.Output.InsertTokenAtStart(token) - tokenAPI.Output.MergeSplit(split) return true } - tokenAPI.RestoreSnapshot(snap) return false } }