From b9eeac3480b1459437c2552eb659dfe29bac90bc Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Thu, 18 Jul 2019 08:06:26 +0000 Subject: [PATCH] Work in progress on switching to byte stack. Committing to do some performance checks against master. --- tokenize/api.go | 93 +++++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/tokenize/api.go b/tokenize/api.go index e768ee3..74eca7c 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -2,6 +2,7 @@ package tokenize import ( "fmt" + "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/read" ) @@ -75,7 +76,7 @@ type API struct { lastRuneWidth int // the width in bytes of the last read rune lastRuneErr error // the error for the last NextRune() call runeRead bool // whether or not a rune was read using NextRune() - runes []rune // accepted runes + bytes []byte // accepted bytes tokens []Token // accepted tokens stackFrames []stackFrame // the stack frames, containing stack level-specific data stackLevel int // the current stack level @@ -86,8 +87,8 @@ type stackFrame struct { offset int // current rune read offset relative to the Reader's sliding window column int // The column at which the cursor is (0-indexed) line int // The line at which the cursor is (0-indexed) - runeStart int // the starting point in the API.runes slice for runes produced by this stack level - runeEnd int // the end point in the API.runes slice for runes produced by this stack level + runeStart int // the starting point in the API.bytes slice for runes produced by this stack level + runeEnd int // the end point in the API.bytes slice for runes produced by this stack level tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level tokenEnd int // the end point in the API.tokens slice for tokens produced by this stack level @@ -95,9 +96,9 @@ type stackFrame struct { err error // can be used by a Handler to report a specific issue with the input } 
-const initialStackDepth = 64 +const initialStackDepth = 32 const initialTokenStoreLength = 32 -const initialRuneStoreLength = 128 +const initialByteStoreLength = 256 // NewAPI initializes a new API struct, wrapped around the provided input. // For an overview of allowed inputs, take a look at the documentation @@ -105,7 +106,7 @@ const initialRuneStoreLength = 128 func NewAPI(input interface{}) *API { api := &API{ reader: read.New(input), - runes: make([]rune, initialRuneStoreLength), + bytes: make([]byte, initialByteStoreLength), tokens: make([]Token, initialTokenStoreLength), stackFrames: make([]stackFrame, initialStackDepth), } @@ -179,21 +180,21 @@ func (i *API) skipBytes(bytes ...byte) { } func (i *API) acceptBytes(bytes ...byte) { - curRuneEnd := i.stackFrame.runeEnd - newRuneEnd := curRuneEnd + len(bytes) + curBytesEnd := i.stackFrame.runeEnd + newBytesEnd := curBytesEnd + len(bytes) - // Grow the runes capacity when needed. - if cap(i.runes) < newRuneEnd { - newRunes := make([]rune, newRuneEnd*2) - copy(newRunes, i.runes) - i.runes = newRunes + // Grow the bytes capacity when needed. + if cap(i.bytes) < newBytesEnd { + newBytes := make([]byte, newBytesEnd*2) + copy(newBytes, i.bytes) + i.bytes = newBytes } for offset, b := range bytes { - i.runes[curRuneEnd+offset] = rune(b) + i.bytes[curBytesEnd+offset] = b i.stackFrame.moveCursorByByte(b) } - i.stackFrame.runeEnd = newRuneEnd + i.stackFrame.runeEnd = newBytesEnd i.stackFrame.offset += len(bytes) i.runeRead = false } @@ -207,21 +208,23 @@ func (i *API) skipRunes(width int, runes ...rune) { } func (i *API) acceptRunes(width int, runes ...rune) { - curRuneEnd := i.stackFrame.runeEnd - newRuneEnd := curRuneEnd + len(runes) + runesAsString := string(runes) + curBytesEnd := i.stackFrame.runeEnd + newBytesEnd := curBytesEnd + len(runesAsString) // Grow the runes capacity when needed. 
- if cap(i.runes) < newRuneEnd { - newRunes := make([]rune, newRuneEnd*2) - copy(newRunes, i.runes) - i.runes = newRunes + if cap(i.bytes) < newBytesEnd { + newBytes := make([]byte, newBytesEnd*2) + copy(newBytes, i.bytes) + i.bytes = newBytes } - for offset, r := range runes { - i.runes[curRuneEnd+offset] = r + for _, r := range runes { i.stackFrame.moveCursorByRune(r) } - i.stackFrame.runeEnd = newRuneEnd + copy(i.bytes[curBytesEnd:], runesAsString) + + i.stackFrame.runeEnd = newBytesEnd i.stackFrame.offset += width i.runeRead = false } @@ -362,7 +365,6 @@ func (i *API) Reset() { // When writing your own TokenHandler, you normally won't have to call this // method yourself. It is automatically called by parsekit when needed. func (i *API) FlushInput() bool { - // result := &(i.state.stack[i.stackLevel]) if i.stackFrame.offset > 0 { i.reader.Flush(i.stackFrame.offset) i.stackFrame.offset = 0 @@ -372,15 +374,16 @@ func (i *API) FlushInput() bool { } func (i *API) String() string { - return string(i.Runes()) + return string(i.bytes[i.stackFrame.runeStart:i.stackFrame.runeEnd]) } func (i *API) Runes() []rune { - return i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd] + return []rune(string(i.bytes[i.stackFrame.runeStart:i.stackFrame.runeEnd])) } func (i *API) Rune(offset int) rune { - return i.runes[i.stackFrame.runeStart+offset] + r, _ := utf8.DecodeRune(i.bytes[i.stackFrame.runeStart+offset:]) + return r } func (i *API) ClearRunes() { @@ -389,32 +392,30 @@ func (i *API) ClearRunes() { func (i *API) SetRunes(runes ...rune) { // Grow the runes capacity when needed. 
- newRuneEnd := i.stackFrame.runeStart + len(runes) - if cap(i.runes) < newRuneEnd { - newRunes := make([]rune, newRuneEnd*2) - copy(newRunes, i.runes) - i.runes = newRunes + runesAsString := string(runes) + newBytesEnd := i.stackFrame.runeStart + len(runesAsString) + if cap(i.bytes) < newBytesEnd { + newBytes := make([]byte, newBytesEnd*2) + copy(newBytes, i.bytes) + i.bytes = newBytes } - for offset, r := range runes { - i.runes[i.stackFrame.runeStart+offset] = r - } - i.stackFrame.runeEnd = newRuneEnd + copy(i.bytes[i.stackFrame.runeStart:], runesAsString) + i.stackFrame.runeEnd = newBytesEnd } func (i *API) AddRunes(runes ...rune) { // Grow the runes capacity when needed. - newRuneEnd := i.stackFrame.runeEnd + len(runes) - if cap(i.runes) < newRuneEnd { - newRunes := make([]rune, newRuneEnd*2) - copy(newRunes, i.runes) - i.runes = newRunes + runesAsString := string(runes) + newBytesEnd := i.stackFrame.runeEnd + len(runesAsString) + if cap(i.bytes) < newBytesEnd { + newBytes := make([]byte, newBytesEnd*2) + copy(newBytes, i.bytes) + i.bytes = newBytes } - for offset, r := range runes { - i.runes[i.stackFrame.runeEnd+offset] = r - } - i.stackFrame.runeEnd = newRuneEnd + copy(i.bytes[i.stackFrame.runeEnd:], runesAsString) + i.stackFrame.runeEnd = newBytesEnd } func (i *API) AddString(s string) {