From 5e9879326a26c43aa1a4321b95375d266e88bc78 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Fri, 5 Jul 2019 08:08:42 +0000 Subject: [PATCH] Backup work to performance tuning. --- parse/api.go | 6 +++-- read/read.go | 45 +++++++++++++++++++++++++++--------- tokenize/handlers_builtin.go | 41 +++++++++++++++++++++++++++++--- 3 files changed, 76 insertions(+), 16 deletions(-) diff --git a/parse/api.go b/parse/api.go index 8a64599..9865dcb 100644 --- a/parse/api.go +++ b/parse/api.go @@ -67,8 +67,10 @@ func (p *API) Accept(tokenHandler tokenize.Handler) bool { forkedAPI.Merge() p.result = p.tokenAPI.Result() forkedAPI.Dispose() - if p.sanityChecksEnabled && p.tokenAPI.FlushInput() { - p.initLoopCheck() + if p.tokenAPI.FlushInput() { + if p.sanityChecksEnabled { + p.initLoopCheck() + } } } return ok diff --git a/read/read.go b/read/read.go index bd9c27a..08aebb7 100644 --- a/read/read.go +++ b/read/read.go @@ -90,6 +90,7 @@ func makeBufioReader(input interface{}) *bufio.Reader { // The parserkit.reader.Reader is used internally by tokenize.API. type Buffer struct { bufio *bufio.Reader // used for ReadRune() + store []rune // buffer store, the buffer field is a slice on top of this one buffer []rune // input buffer, holding runes that were read from input err error // a read error, if one occurred errOffset int // the offset in the buffer at which the read error was encountered @@ -121,7 +122,11 @@ func (r *Buffer) RuneAt(offset int) (rune, error) { // Rune at provided offset is not yet available in the input buffer. // Read runes until we have enough runes to satisfy the offset. 
l := len(r.buffer) - n := offset - l + 1 // nr of runes to add to the buffer to get to offset + + // Number of runes to add to the buffer to have enough space to store + // the rune at the offset + n := offset - l + 1 + if n > 0 { r.grow(n) for writeAt := l; writeAt <= offset; writeAt++ { @@ -162,26 +167,35 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large") // It returns the index where bytes should be written. // If the buffer can't grow it will panic with ErrTooLarge. func (r *Buffer) grow(n int) { - // Instantiate new buffer. - if r.buffer == nil { + // Instantiate new buffer store + if r.store == nil { b := smallBufferSize if b < n { b = n } - r.buffer = make([]rune, n, b) + r.store = make([]rune, n, b) + r.buffer = r.store return } l := len(r.buffer) c := cap(r.buffer) - // Grow the buffer by reslicing within the available capacity. + // Grow the buffer store by reslicing within the available capacity. if n <= c-l { r.buffer = r.buffer[:l+n] return } - // Grow the buffer by allocating a new one and copying the data. - buf := makeSlice(2*c + n) + // Grow the buffer by moving the data to the start of the store. + if cap(r.store)-l-n > 0 { + copy(r.store, r.buffer) + r.buffer = r.store[:l+n] + return + } + // Grow the buffer store by allocating a new one and copying the data. + buf := makeSlice(2*cap(r.store) + n) + fmt.Printf("ALLOC %d\n", 2*cap(r.store)+n) copy(buf, r.buffer) - r.buffer = buf[:l+n] + r.store = buf + r.buffer = r.store[:l+n] } // makeSlice allocates a slice of size n. If the allocation fails, it panics @@ -201,13 +215,22 @@ func makeSlice(n int) []rune { // the rune that comes after the runes that were flushed. // So what this basically does, is turn the Buffer into a sliding window. 
func (r *Buffer) Flush(numberOfRunes int) { - if numberOfRunes > len(r.buffer) { + l := len(r.buffer) + if numberOfRunes > l { panic(fmt.Sprintf( "parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+ - "exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer))) + "exceeds size of the buffer (%d)", numberOfRunes, l)) + } + if numberOfRunes == 0 { + return + } + if l == numberOfRunes { + r.buffer = r.store[:0] + r.errOffset = 0 + return } r.buffer = r.buffer[numberOfRunes:] if r.err != nil { - r.errOffset -= numberOfRunes + r.errOffset = r.errOffset - numberOfRunes } } diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index 40481e9..632d0b1 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -39,6 +39,7 @@ var C = struct { Except func(except Handler, handler Handler) Handler FollowedBy func(lookAhead Handler, handler Handler) Handler NotFollowedBy func(lookAhead Handler, handler Handler) Handler + FlushInput func(Handler) Handler }{ Any: MatchAny, Not: MatchNot, @@ -54,6 +55,7 @@ var C = struct { Except: MatchExcept, FollowedBy: MatchFollowedBy, NotFollowedBy: MatchNotFollowedBy, + FlushInput: MakeInputFlusher, } // A provides convenient access to a range of atoms or functions to build atoms. 
@@ -70,7 +72,9 @@ var A = struct { RuneRange func(rune, rune) Handler Str func(string) Handler StrNoCase func(string) Handler + EndOfLine Handler EndOfFile Handler + UntilEndOfLine Handler AnyRune Handler ValidRune Handler InvalidRune Handler @@ -124,7 +128,6 @@ var A = struct { Blanks Handler Whitespace Handler UnicodeSpace Handler - EndOfLine Handler Digit Handler DigitNotZero Handler Digits Handler @@ -156,6 +159,8 @@ var A = struct { Str: MatchStr, StrNoCase: MatchStrNoCase, EndOfFile: MatchEndOfFile(), + EndOfLine: MatchEndOfLine(), + UntilEndOfLine: MatchUntilEndOfLine(), AnyRune: MatchAnyRune(), ValidRune: MatchValidRune(), InvalidRune: MatchInvalidRune(), @@ -209,7 +214,6 @@ var A = struct { Blanks: MatchBlanks(), Whitespace: MatchWhitespace(), UnicodeSpace: MatchUnicodeSpace(), - EndOfLine: MatchEndOfLine(), Digit: MatchDigit(), DigitNotZero: MatchDigitNotZero(), Digits: MatchDigits(), @@ -641,6 +645,31 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler { } } +// MakeInputFlusher creates a Handler that will flush the input buffer when the +// provided handler matches. +// +// This is useful when constructing a grammar using only parsekit.tokenize +// functionality (parsekit.parse will automatically flush the input for you) +// that has to process large input data. +// +// Without flushing the input, the input reader will allocate memory +// during the parsing process, eventually enough to hold the full input +// in memory. By wrapping Handlers with MakeInputFlusher, you can tell parsekit +// that the accumulated input so far will no longer be needed, allowing +// this input to be flushed from memory. +// +// Rule of thumb is: only use it when you have to actually fix a memory +// hogging issue for your use case. 
+func MakeInputFlusher(handler Handler) Handler { + return func(t *API) bool { + if handler(t) { + t.FlushInput() + return true + } + return false + } +} + // MatchSigned creates a Handler that checks if the provided Handler is // prefixed by an optional '+' or '-' sign. This can be used to turn numeric // atoms into a signed version, e.g. @@ -683,6 +712,13 @@ func MatchEndOfFile() Handler { } } +// MatchUntilEndOfLine creates a Handler function that accepts any rune +// until the end of the line (or file when that's the case). +// The newline itself is not included in the match. +func MatchUntilEndOfLine() Handler { + return MatchZeroOrMore(MatchNot(MatchEndOfLine())) +} + // MatchAnyRune creates a Handler function that checks if a rune can be // read from the input. Invalid runes on the input are replaced with the UTF8 // replacement rune \uFFFD (i.e. utf8.RuneError), which displays as �. @@ -1421,7 +1457,6 @@ func MakeTokenGroup(toktype interface{}, handler Handler) Handler { token := &Token{Type: toktype, Runes: result.Runes(), Value: result.Tokens()} result.SetTokens(token) child.Merge() - return true } return false