Backup of work on performance tuning.

This commit is contained in:
Maurice Makaay 2019-07-05 08:08:42 +00:00
parent 583197c37a
commit 5e9879326a
3 changed files with 76 additions and 16 deletions

View File

@ -67,10 +67,12 @@ func (p *API) Accept(tokenHandler tokenize.Handler) bool {
forkedAPI.Merge() forkedAPI.Merge()
p.result = p.tokenAPI.Result() p.result = p.tokenAPI.Result()
forkedAPI.Dispose() forkedAPI.Dispose()
if p.sanityChecksEnabled && p.tokenAPI.FlushInput() { if p.tokenAPI.FlushInput() {
if p.sanityChecksEnabled {
p.initLoopCheck() p.initLoopCheck()
} }
} }
}
return ok return ok
} }

View File

@ -90,6 +90,7 @@ func makeBufioReader(input interface{}) *bufio.Reader {
// The parserkit.reader.Reader is used internally by tokenize.API. // The parserkit.reader.Reader is used internally by tokenize.API.
type Buffer struct { type Buffer struct {
bufio *bufio.Reader // used for ReadRune() bufio *bufio.Reader // used for ReadRune()
store []rune // buffer store, the buffer field is a slice on top of this one
buffer []rune // input buffer, holding runes that were read from input buffer []rune // input buffer, holding runes that were read from input
err error // a read error, if one occurred err error // a read error, if one occurred
errOffset int // the offset in the buffer at which the read error was encountered errOffset int // the offset in the buffer at which the read error was encountered
@ -121,7 +122,11 @@ func (r *Buffer) RuneAt(offset int) (rune, error) {
// Rune at provided offset is not yet available in the input buffer. // Rune at provided offset is not yet available in the input buffer.
// Read runes until we have enough runes to satisfy the offset. // Read runes until we have enough runes to satisfy the offset.
l := len(r.buffer) l := len(r.buffer)
n := offset - l + 1 // nr of runes to add to the buffer to get to offset
// Number of runes to add to the buffer to have enough space to store
// the rune at the offset
n := offset - l + 1
if n > 0 { if n > 0 {
r.grow(n) r.grow(n)
for writeAt := l; writeAt <= offset; writeAt++ { for writeAt := l; writeAt <= offset; writeAt++ {
@ -162,26 +167,35 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
// It extends the buffer by n runes; it does not return a write index. // It extends the buffer by n runes; it does not return a write index.
// If the buffer can't grow it will panic with ErrTooLarge. // If the buffer can't grow it will panic with ErrTooLarge.
func (r *Buffer) grow(n int) { func (r *Buffer) grow(n int) {
// Instantiate new buffer. // Instantiate new buffer store
if r.buffer == nil { if r.store == nil {
b := smallBufferSize b := smallBufferSize
if b < n { if b < n {
b = n b = n
} }
r.buffer = make([]rune, n, b) r.store = make([]rune, n, b)
r.buffer = r.store
return return
} }
l := len(r.buffer) l := len(r.buffer)
c := cap(r.buffer) c := cap(r.buffer)
// Grow the buffer by reslicing within the available capacity. // Grow the buffer store by reslicing within the available capacity.
if n <= c-l { if n <= c-l {
r.buffer = r.buffer[:l+n] r.buffer = r.buffer[:l+n]
return return
} }
// Grow the buffer by allocating a new one and copying the data. // Grow the buffer by moving the data to the start of the store.
buf := makeSlice(2*c + n) if cap(r.store)-l-n > 0 {
copy(r.store, r.buffer)
r.buffer = r.store[:l+n]
return
}
// Grow the buffer store by allocating a new one and copying the data.
buf := makeSlice(2*cap(r.store) + n)
fmt.Printf("ALLOC %d\n", 2*cap(r.store)+n)
copy(buf, r.buffer) copy(buf, r.buffer)
r.buffer = buf[:l+n] r.store = buf
r.buffer = r.store[:l+n]
} }
// makeSlice allocates a slice of size n. If the allocation fails, it panics // makeSlice allocates a slice of size n. If the allocation fails, it panics
@ -201,13 +215,22 @@ func makeSlice(n int) []rune {
// the rune that comes after the runes that were flushed. // the rune that comes after the runes that were flushed.
// So what this basically does, is turn the Buffer into a sliding window. // So what this basically does, is turn the Buffer into a sliding window.
func (r *Buffer) Flush(numberOfRunes int) { func (r *Buffer) Flush(numberOfRunes int) {
if numberOfRunes > len(r.buffer) { l := len(r.buffer)
if numberOfRunes > l {
panic(fmt.Sprintf( panic(fmt.Sprintf(
"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+ "parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
"exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer))) "exceeds size of the buffer (%d)", numberOfRunes, l))
}
if numberOfRunes == 0 {
return
}
if l == numberOfRunes {
r.buffer = r.store[:0]
r.errOffset = 0
return
} }
r.buffer = r.buffer[numberOfRunes:] r.buffer = r.buffer[numberOfRunes:]
if r.err != nil { if r.err != nil {
r.errOffset -= numberOfRunes r.errOffset = r.errOffset - numberOfRunes
} }
} }

View File

@ -39,6 +39,7 @@ var C = struct {
Except func(except Handler, handler Handler) Handler Except func(except Handler, handler Handler) Handler
FollowedBy func(lookAhead Handler, handler Handler) Handler FollowedBy func(lookAhead Handler, handler Handler) Handler
NotFollowedBy func(lookAhead Handler, handler Handler) Handler NotFollowedBy func(lookAhead Handler, handler Handler) Handler
FlushInput func(Handler) Handler
}{ }{
Any: MatchAny, Any: MatchAny,
Not: MatchNot, Not: MatchNot,
@ -54,6 +55,7 @@ var C = struct {
Except: MatchExcept, Except: MatchExcept,
FollowedBy: MatchFollowedBy, FollowedBy: MatchFollowedBy,
NotFollowedBy: MatchNotFollowedBy, NotFollowedBy: MatchNotFollowedBy,
FlushInput: MakeInputFlusher,
} }
// A provides convenient access to a range of atoms or functions to build atoms. // A provides convenient access to a range of atoms or functions to build atoms.
@ -70,7 +72,9 @@ var A = struct {
RuneRange func(rune, rune) Handler RuneRange func(rune, rune) Handler
Str func(string) Handler Str func(string) Handler
StrNoCase func(string) Handler StrNoCase func(string) Handler
EndOfLine Handler
EndOfFile Handler EndOfFile Handler
UntilEndOfLine Handler
AnyRune Handler AnyRune Handler
ValidRune Handler ValidRune Handler
InvalidRune Handler InvalidRune Handler
@ -124,7 +128,6 @@ var A = struct {
Blanks Handler Blanks Handler
Whitespace Handler Whitespace Handler
UnicodeSpace Handler UnicodeSpace Handler
EndOfLine Handler
Digit Handler Digit Handler
DigitNotZero Handler DigitNotZero Handler
Digits Handler Digits Handler
@ -156,6 +159,8 @@ var A = struct {
Str: MatchStr, Str: MatchStr,
StrNoCase: MatchStrNoCase, StrNoCase: MatchStrNoCase,
EndOfFile: MatchEndOfFile(), EndOfFile: MatchEndOfFile(),
EndOfLine: MatchEndOfLine(),
UntilEndOfLine: MatchUntilEndOfLine(),
AnyRune: MatchAnyRune(), AnyRune: MatchAnyRune(),
ValidRune: MatchValidRune(), ValidRune: MatchValidRune(),
InvalidRune: MatchInvalidRune(), InvalidRune: MatchInvalidRune(),
@ -209,7 +214,6 @@ var A = struct {
Blanks: MatchBlanks(), Blanks: MatchBlanks(),
Whitespace: MatchWhitespace(), Whitespace: MatchWhitespace(),
UnicodeSpace: MatchUnicodeSpace(), UnicodeSpace: MatchUnicodeSpace(),
EndOfLine: MatchEndOfLine(),
Digit: MatchDigit(), Digit: MatchDigit(),
DigitNotZero: MatchDigitNotZero(), DigitNotZero: MatchDigitNotZero(),
Digits: MatchDigits(), Digits: MatchDigits(),
@ -641,6 +645,31 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
} }
} }
// MakeInputFlusher creates a Handler that runs the provided handler and,
// when that handler matches, flushes the input buffer of the API.
//
// It is intended for grammars that are built using only parsekit.tokenize
// functionality and that have to process large input data (parsekit.parse
// already flushes the input automatically).
//
// When the input is never flushed, the input reader keeps allocating memory
// while parsing, eventually enough to hold the full input. Wrapping a Handler
// with MakeInputFlusher (also available as C.FlushInput) tells parsekit that
// the input accumulated so far is no longer needed, so it can be released.
//
// Rule of thumb: only use it when you actually have to fix a memory hogging
// issue for your use case.
func MakeInputFlusher(handler Handler) Handler {
	return func(t *API) bool {
		matched := handler(t)
		if matched {
			// The wrapped handler accepted its input, so everything read
			// up to this point may be dropped from the buffer.
			t.FlushInput()
		}
		return matched
	}
}
// MatchSigned creates a Handler that checks if the provided Handler is // MatchSigned creates a Handler that checks if the provided Handler is
// prefixed by an optional '+' or '-' sign. This can be used to turn numeric // prefixed by an optional '+' or '-' sign. This can be used to turn numeric
// atoms into a signed version, e.g. // atoms into a signed version, e.g.
@ -683,6 +712,13 @@ func MatchEndOfFile() Handler {
} }
} }
// MatchUntilEndOfLine creates a Handler function that accepts every rune
// up to the end of the line (or up to the end of the file, when there is
// no further newline). The newline itself is not part of the match.
func MatchUntilEndOfLine() Handler {
	// Anything that does not start an end-of-line sequence is accepted.
	notEndOfLine := MatchNot(MatchEndOfLine())
	return MatchZeroOrMore(notEndOfLine)
}
// MatchAnyRune creates a Handler function that checks if a rune can be // MatchAnyRune creates a Handler function that checks if a rune can be
// read from the input. Invalid runes on the input are replaced with the UTF8 // read from the input. Invalid runes on the input are replaced with the UTF8
// replacement rune \uFFFD (i.e. utf8.RuneError), which displays as <20>. // replacement rune \uFFFD (i.e. utf8.RuneError), which displays as <20>.
@ -1421,7 +1457,6 @@ func MakeTokenGroup(toktype interface{}, handler Handler) Handler {
token := &Token{Type: toktype, Runes: result.Runes(), Value: result.Tokens()} token := &Token{Type: toktype, Runes: result.Runes(), Value: result.Tokens()}
result.SetTokens(token) result.SetTokens(token)
child.Merge() child.Merge()
return true return true
} }
return false return false