By getting rid of forking, the new system delivers more performance.

This commit is contained in:
Maurice Makaay 2019-07-26 12:14:15 +00:00
parent 87cdadae78
commit 4c94374107
7 changed files with 87 additions and 280 deletions

View File

@ -93,7 +93,7 @@ func makeBufioReader(input interface{}) *bufio.Reader {
// To minimize memory use, it is also possible to flush the read buffer when there is
// no more need to go back to previously read data.
//
// This parserkit.reader.Reader is used internally by tokenize.API.
// This buffer is used internally by tokenize.API.
type Buffer struct {
bufio *bufio.Reader // used for ReadRune()
buffer []byte // input buffer, holding bytes that were read from input

View File

@ -71,20 +71,14 @@ import (
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
reader *read.Buffer // the buffered input reader
Input Input // provides input-related functionality
Byte InputByteMode // access to a set of byte-based input methods
Rune InputRuneMode // access to a set of rune-based input methods
Output Output // provides output-related functionality
outputTokens []Token // accepted tokens
outputBytes []byte // accepted bytes
snapshot [9]int // storage for the Snapshot() / RestoreSnapshot() feature
reader *read.Buffer // the buffered input reader
pointers stackFrame // various pointers for keeping track of input, output, cursor.
Input Input // access to a set of general input-related methods
Byte InputByteMode // access to a set of byte-based input methods
Rune InputRuneMode // access to a set of rune-based input methods
Output Output // access to a set of output-related functionality
outputTokens []Token // storage for accepted tokens
outputBytes []byte // storage for accepted bytes
}
type stackFrame struct {
@ -96,14 +90,10 @@ type stackFrame struct {
bytesEnd int // the end point in the API.bytes slice for runes produced by this stack level
tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level
tokenEnd int // the end point in the API.tokens slice for tokens produced by this stack level
// TODO
err error // can be used by a Handler to report a specific issue with the input
}
const initialStackDepth = 64
const initialTokenStoreLength = 64
const initialByteStoreLength = 1024
const initialByteStoreLength = 128
// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
@ -111,7 +101,6 @@ const initialByteStoreLength = 1024
func NewAPI(input interface{}) *API {
reader := read.New(input)
tokenAPI := &API{
stackFrames: make([]stackFrame, initialStackDepth),
outputBytes: make([]byte, initialByteStoreLength),
outputTokens: make([]Token, initialTokenStoreLength),
reader: reader,
@ -120,154 +109,15 @@ func NewAPI(input interface{}) *API {
tokenAPI.Byte = InputByteMode{api: tokenAPI, reader: reader}
tokenAPI.Rune = InputRuneMode{api: tokenAPI, reader: reader}
tokenAPI.Output = Output{api: tokenAPI}
tokenAPI.stackFrame = &tokenAPI.stackFrames[0]
tokenAPI.snapshot[0] = -1
return tokenAPI
}
// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but for the rest this can be considered
// a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, then the forked child API can
// be disposed of by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
func (tokenAPI *API) Fork() int {
tokenAPI.stackLevel++
newStackLevel := tokenAPI.stackLevel
// Grow the stack frames capacity when needed.
frames := tokenAPI.stackFrames
if cap(frames) < (newStackLevel + 1) {
newFrames := make([]stackFrame, cap(frames)*2)
copy(newFrames, frames)
tokenAPI.stackFrames = newFrames
}
parent := tokenAPI.stackFrame
tokenAPI.stackFrames[newStackLevel] = stackFrame{
offset: parent.offset,
bytesStart: parent.bytesEnd,
bytesEnd: parent.bytesEnd,
tokenStart: parent.tokenEnd,
tokenEnd: parent.tokenEnd,
}
tokenAPI.stackFrame = &tokenAPI.stackFrames[newStackLevel]
return newStackLevel
}
// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so it can immediately
// be reused for performing another match. This means that all Result data are
// cleared, but the read cursor position is kept at its current position.
// This allows a child to feed results in chunks to its parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which will return the tokenizer to the parent.
func (tokenAPI *API) Merge(stackLevel int) {
tokenAPI.checkStackLevelForMethod("Merge", stackLevel)
parent := &tokenAPI.stackFrames[stackLevel-1]
f := tokenAPI.stackFrame
// The end of the parent slice aligns with the start of the child slice.
// Because of this, to merge the parent slice can simply be expanded
// to include the child slice.
// parent : |----------|
// child: |------|
// After merge operation:
// parent: |-----------------|
// child: |---> continue reading from here
parent.bytesEnd = f.bytesEnd
f.bytesStart = f.bytesEnd
// The same logic applies to tokens.
parent.tokenEnd = f.tokenEnd
f.tokenStart = f.tokenEnd
// Update the parent read offset.
parent.offsetLocal = parent.offsetLocal + (f.offset - parent.offset)
parent.offset = f.offset
// Update the parent cursor position.
if f.line > parent.line {
parent.line += f.line
parent.column = f.column
} else {
parent.column += f.column
}
f.line = 0
f.column = 0
f.err = nil
}
func (tokenAPI *API) Dispose(stackLevel int) {
tokenAPI.checkStackLevelForMethod("Dispose", stackLevel)
tokenAPI.stackLevel = stackLevel - 1
tokenAPI.stackFrame = &tokenAPI.stackFrames[stackLevel-1]
}
func (tokenAPI *API) checkStackLevelForMethod(name string, stackLevel int) {
if stackLevel == 0 {
callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
"on the top-level API stack level 0")
}
if stackLevel != tokenAPI.stackLevel {
callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the current stack level is %d "+
"(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel)
}
}
type Snapshot [9]int
type Snapshot stackFrame
func (tokenAPI *API) MakeSnapshot() Snapshot {
f := tokenAPI.stackFrame
return Snapshot{
tokenAPI.stackLevel,
f.bytesStart,
f.bytesEnd,
f.tokenStart,
f.tokenEnd,
f.offset,
f.offsetLocal,
f.line,
f.column,
}
return Snapshot(tokenAPI.pointers)
}
func (tokenAPI *API) RestoreSnapshot(snap Snapshot) {
f := tokenAPI.stackFrame
if snap[0] != tokenAPI.stackLevel {
callerPanic("RestoreSnapshot", "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the provided snapshot was created for stack level %d",
tokenAPI.stackLevel, snap[0])
}
f.bytesStart = snap[1]
f.bytesEnd = snap[2]
f.tokenStart = snap[3]
f.tokenEnd = snap[4]
f.offset = snap[5]
f.offsetLocal = snap[6]
f.line = snap[7]
f.column = snap[8]
tokenAPI.pointers = stackFrame(snap)
}

View File

@ -14,7 +14,7 @@ type InputByteMode struct {
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (byteMode InputByteMode) Peek(offset int) (byte, error) {
return byteMode.reader.ByteAt(byteMode.api.stackFrame.offset + offset)
return byteMode.reader.ByteAt(byteMode.api.pointers.offset + offset)
}
// PeekMulti returns at max the provided maximum number of bytes at the provided
@ -22,7 +22,7 @@ func (byteMode InputByteMode) Peek(offset int) (byte, error) {
// error as such. The returned error can in such case be set to io.EOF to indicate
// that the end of the input was reached though.
func (byteMode InputByteMode) PeekMulti(offset int, count int) ([]byte, error) {
return byteMode.reader.BytesAt(byteMode.api.stackFrame.offset+offset, count)
return byteMode.reader.BytesAt(byteMode.api.pointers.offset+offset, count)
}
func (byteMode InputByteMode) Accept(b byte) {
@ -53,16 +53,16 @@ func (byteMode InputByteMode) AcceptMulti(bytes ...byte) {
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first byte at the new cursor position.
func (byteMode InputByteMode) MoveCursor(b byte) {
f := byteMode.api.stackFrame
a := byteMode.api
if b == '\n' {
f.column = 0
f.line++
a.pointers.column = 0
a.pointers.line++
} else {
f.column++
a.pointers.column++
}
f.offset++
f.offsetLocal++
a.pointers.offset++
a.pointers.offsetLocal++
}
// MoveCursorMulti updates the position of the read cursor, based on the provided bytes.

View File

@ -15,29 +15,10 @@ type Input struct {
// Cursor returns a string that describes the current read cursor position.
func (i Input) Cursor() string {
column, line := 0, 0
for _, f := range i.api.stackFrames[:i.api.stackLevel+1] {
if f.line > 0 {
column = f.column
line += f.line
} else {
column += f.column
}
}
if line == 0 && column == 0 {
if i.api.pointers.line == 0 && i.api.pointers.column == 0 {
return fmt.Sprintf("start of file")
}
return fmt.Sprintf("line %d, column %d", line+1, column+1)
}
func (i Input) Reset() {
f := i.api.stackFrame
if f.offsetLocal > 0 {
f.column = 0
f.line = 0
f.offset -= f.offsetLocal
f.offsetLocal = 0
}
return fmt.Sprintf("line %d, column %d", i.api.pointers.line+1, i.api.pointers.column+1)
}
// Flush flushes input data from the read buffer up to the current
@ -47,11 +28,11 @@ func (i Input) Reset() {
// Parsekit will call this method at points where it knows it is a
// safe thing to do.
func (i Input) Flush() bool {
f := i.api.stackFrame
if f.offset > 0 {
i.reader.Flush(f.offset)
f.offset = 0
f.offsetLocal = 0
a := i.api
if a.pointers.offset > 0 {
i.reader.Flush(a.pointers.offset)
a.pointers.offset = 0
a.pointers.offsetLocal = 0
return true
}
return false

View File

@ -11,8 +11,7 @@ type Output struct {
func (o Output) String() string {
a := o.api
f := a.stackFrame
bytes := a.outputBytes[f.bytesStart:f.bytesEnd]
bytes := a.outputBytes[a.pointers.bytesStart:a.pointers.bytesEnd]
return string(bytes)
}
@ -22,36 +21,35 @@ func (o Output) Runes() []rune {
func (o Output) Rune(offset int) rune {
a := o.api
r, _ := utf8.DecodeRune(a.outputBytes[a.stackFrame.bytesStart+offset:])
r, _ := utf8.DecodeRune(a.outputBytes[a.pointers.bytesStart+offset:])
return r
}
type Split [2]int
func (o Output) Split() Split {
f := o.api.stackFrame
split := Split{f.bytesStart, f.tokenStart}
f.bytesStart = f.bytesEnd
f.tokenStart = f.tokenEnd
a := o.api
split := Split{a.pointers.bytesStart, a.pointers.tokenStart}
a.pointers.bytesStart = a.pointers.bytesEnd
a.pointers.tokenStart = a.pointers.tokenEnd
return split
}
func (o Output) MergeSplit(split Split) {
f := o.api.stackFrame
f.bytesStart = split[0]
f.tokenStart = split[1]
a := o.api
a.pointers.bytesStart = split[0]
a.pointers.tokenStart = split[1]
}
func (o Output) Reset() {
f := o.api.stackFrame
f.bytesEnd = f.bytesStart
f.tokenEnd = f.tokenStart
f.err = nil
a := o.api
a.pointers.bytesEnd = a.pointers.bytesStart
a.pointers.tokenEnd = a.pointers.tokenStart
}
func (o Output) ClearData() {
f := o.api.stackFrame
f.bytesEnd = f.bytesStart
a := o.api
a.pointers.bytesEnd = a.pointers.bytesStart
}
func (o Output) SetBytes(bytes ...byte) {
@ -61,11 +59,10 @@ func (o Output) SetBytes(bytes ...byte) {
func (o Output) AddByte(b byte) {
a := o.api
f := a.stackFrame
curBytesEnd := f.bytesEnd
curBytesEnd := a.pointers.bytesEnd
a.growOutputData(curBytesEnd + 1)
a.outputBytes[curBytesEnd] = b
f.bytesEnd++
a.pointers.bytesEnd++
}
func (o Output) SetRunes(runes ...rune) {
@ -75,22 +72,20 @@ func (o Output) SetRunes(runes ...rune) {
func (o Output) AddBytes(bytes ...byte) {
a := o.api
f := a.stackFrame
curBytesEnd := f.bytesEnd
curBytesEnd := a.pointers.bytesEnd
newBytesEnd := curBytesEnd + len(bytes)
a.growOutputData(newBytesEnd)
copy(a.outputBytes[curBytesEnd:], bytes)
f.bytesEnd = newBytesEnd
a.pointers.bytesEnd = newBytesEnd
}
func (o Output) AddRunes(runes ...rune) {
a := o.api
f := a.stackFrame
runesAsString := string(runes)
newBytesEnd := f.bytesEnd + len(runesAsString)
newBytesEnd := a.pointers.bytesEnd + len(runesAsString)
a.growOutputData(newBytesEnd)
copy(a.outputBytes[f.bytesEnd:], runesAsString)
f.bytesEnd = newBytesEnd
copy(a.outputBytes[a.pointers.bytesEnd:], runesAsString)
a.pointers.bytesEnd = newBytesEnd
}
func (o Output) AddString(s string) {
@ -104,23 +99,22 @@ func (o Output) SetString(s string) {
func (o Output) Tokens() []Token {
a := o.api
f := a.stackFrame
return a.outputTokens[f.tokenStart:f.tokenEnd]
return a.outputTokens[a.pointers.tokenStart:a.pointers.tokenEnd]
}
func (o Output) Token(offset int) Token {
a := o.api
return a.outputTokens[a.stackFrame.tokenStart+offset]
return a.outputTokens[a.pointers.tokenStart+offset]
}
func (o Output) TokenValue(offset int) interface{} {
a := o.api
return a.outputTokens[a.stackFrame.tokenStart+offset].Value
return a.outputTokens[a.pointers.tokenStart+offset].Value
}
func (o Output) ClearTokens() {
f := o.api.stackFrame
f.tokenEnd = f.tokenStart
a := o.api
a.pointers.tokenEnd = a.pointers.tokenStart
}
func (o Output) SetTokens(tokens ...Token) {
@ -130,18 +124,16 @@ func (o Output) SetTokens(tokens ...Token) {
func (o Output) AddToken(token Token) {
a := o.api
f := a.stackFrame
tokenEnd := f.tokenEnd
tokenEnd := a.pointers.tokenEnd
a.growOutputTokens(tokenEnd + 1)
a.outputTokens[tokenEnd] = token
f.tokenEnd++
a.pointers.tokenEnd++
}
func (o Output) InsertTokenAtStart(token Token) {
a := o.api
f := a.stackFrame
tokenEnd := f.tokenEnd
tokenStart := f.tokenStart
tokenEnd := a.pointers.tokenEnd
tokenStart := a.pointers.tokenStart
a.growOutputTokens(tokenEnd + 1)
if tokenStart == tokenEnd {
a.outputTokens[tokenEnd] = token
@ -149,16 +141,15 @@ func (o Output) InsertTokenAtStart(token Token) {
copy(a.outputTokens[tokenStart+1:], a.outputTokens[tokenStart:tokenEnd])
a.outputTokens[tokenStart] = token
}
f.tokenEnd++
a.pointers.tokenEnd++
}
func (o Output) AddTokens(tokens ...Token) {
a := o.api
f := a.stackFrame
a.growOutputTokens(f.tokenEnd + len(tokens))
a.growOutputTokens(a.pointers.tokenEnd + len(tokens))
for _, t := range tokens {
a.outputTokens[f.tokenEnd] = t
f.tokenEnd++
a.outputTokens[a.pointers.tokenEnd] = t
a.pointers.tokenEnd++
}
}

View File

@ -26,7 +26,7 @@ type InputRuneMode struct {
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (runeMode InputRuneMode) Peek(offset int) (rune, int, error) {
return runeMode.reader.RuneAt(runeMode.api.stackFrame.offset + offset)
return runeMode.reader.RuneAt(runeMode.api.pointers.offset + offset)
}
// Accept is used to accept a single rune that was read from the input.
@ -42,14 +42,11 @@ func (runeMode InputRuneMode) Peek(offset int) (rune, int, error) {
// the first byte after the accepted rune.
func (runeMode InputRuneMode) Accept(r rune) {
a := runeMode.api
f := a.stackFrame
curBytesEnd := f.bytesEnd
curBytesEnd := a.pointers.bytesEnd
maxRequiredBytes := curBytesEnd + utf8.UTFMax
a.growOutputData(maxRequiredBytes)
w := utf8.EncodeRune(a.outputBytes[curBytesEnd:], r)
f.bytesEnd += w
a.pointers.bytesEnd += w
runeMode.MoveCursor(r)
}
@ -66,9 +63,7 @@ func (runeMode InputRuneMode) Accept(r rune) {
// the first byte after the accepted runes.
func (runeMode InputRuneMode) AcceptMulti(runes ...rune) {
a := runeMode.api
f := a.stackFrame
curBytesEnd := f.bytesEnd
curBytesEnd := a.pointers.bytesEnd
maxBytes := curBytesEnd + len(runes)*utf8.UTFMax
a.growOutputData(maxBytes)
@ -77,7 +72,7 @@ func (runeMode InputRuneMode) AcceptMulti(runes ...rune) {
curBytesEnd += w
runeMode.MoveCursor(r)
}
f.bytesEnd = curBytesEnd
a.pointers.bytesEnd = curBytesEnd
}
// MoveCursor updates the position of the read cursor, based on the provided rune.
@ -87,17 +82,17 @@ func (runeMode InputRuneMode) AcceptMulti(runes ...rune) {
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first rune at the new cursor position.
func (runeMode InputRuneMode) MoveCursor(r rune) int {
f := runeMode.api.stackFrame
a := runeMode.api
if r == '\n' {
f.column = 0
f.line++
a.pointers.column = 0
a.pointers.line++
} else {
f.column++
a.pointers.column++
}
width := utf8.RuneLen(r)
f.offset += width
f.offsetLocal += width
a.pointers.offset += width
a.pointers.offsetLocal += width
return width
}

View File

@ -707,27 +707,15 @@ func MatchOptional(handler Handler) Handler {
// reports successful match.
func MatchSeq(handlers ...Handler) Handler {
return func(tokenAPI *API) bool {
f := tokenAPI.stackFrame
snap := tokenAPI.MakeSnapshot()
for _, handler := range handlers {
tokenAPI.Output.Split()
// Move forward the output pointers, so the handler that we're about
// to call will make use of a fresh output buffer.
f.bytesStart = f.bytesEnd
f.tokenStart = f.tokenEnd
split := tokenAPI.Output.Split()
if !handler(tokenAPI) {
tokenAPI.RestoreSnapshot(snap)
return false
}
tokenAPI.Output.MergeSplit(split)
}
// Move back the output pointers to where they were originally. This
// stitches together all the pieces of output that were generated by
// the individual handlers in the sequence.
f.bytesStart = snap[1]
f.tokenStart = snap[3]
return true
}
}
@ -842,7 +830,10 @@ func matchMinMax(min int, max int, handler Handler, name string) Handler {
snap := tokenAPI.MakeSnapshot()
for total < min {
total++
if !handler(tokenAPI) {
split := tokenAPI.Output.Split()
ok := handler(tokenAPI)
tokenAPI.Output.MergeSplit(split)
if !ok {
tokenAPI.RestoreSnapshot(snap)
return false
}
@ -853,7 +844,10 @@ func matchMinMax(min int, max int, handler Handler, name string) Handler {
//child.Merge()
for max < 0 || total < max {
total++
if !handler(tokenAPI) {
split := tokenAPI.Output.Split()
ok := handler(tokenAPI)
tokenAPI.Output.MergeSplit(split)
if !ok {
break
}
}
@ -1522,13 +1516,13 @@ func MatchIPv6Net(normalize bool) Handler {
// In both cases, it would match the first form.
func ModifyDrop(handler Handler) Handler {
return func(tokenAPI *API) bool {
runeEnd := tokenAPI.stackFrame.bytesEnd
tokenEnd := tokenAPI.stackFrame.tokenEnd
runeEnd := tokenAPI.pointers.bytesEnd
tokenEnd := tokenAPI.pointers.tokenEnd
if handler(tokenAPI) {
// We keep offset and cursor updates, but rollback any runes / tokens
// that were added by the handler.
tokenAPI.stackFrame.bytesEnd = runeEnd
tokenAPI.stackFrame.tokenEnd = tokenEnd
tokenAPI.pointers.bytesEnd = runeEnd
tokenAPI.pointers.tokenEnd = tokenEnd
return true
}
return false
@ -1921,8 +1915,6 @@ func MakeTokenByValue(toktype interface{}, handler Handler, value interface{}) H
// its input and must return the token value.
func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(tokenAPI *API) interface{}) Handler {
return func(tokenAPI *API) bool {
snap := tokenAPI.MakeSnapshot()
split := tokenAPI.Output.Split()
if handler(tokenAPI) {
// When a parsing hierarchy looks like ("date" ("year", "month" "day")), the
// tokens must end up in the order "date", "year", "month", "day" and not
@ -1931,11 +1923,9 @@ func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(to
// that were already created by the handler call.
token := Token{Type: toktype, Value: makeValue(tokenAPI)}
tokenAPI.Output.InsertTokenAtStart(token)
tokenAPI.Output.MergeSplit(split)
return true
}
tokenAPI.RestoreSnapshot(snap)
return false
}
}