Backup of work on performance tuning.

2019-07-05 08:08:42 +00:00 · 2019-07-05 08:08:42 +00:00 · 5e9879326a
parent 583197c37a
commit 5e9879326a
3 changed files with 76 additions and 16 deletions
--- a/parse/api.go
+++ b/parse/api.go
@ -67,8 +67,10 @@ func (p *API) Accept(tokenHandler tokenize.Handler) bool {
 		forkedAPI.Merge()
 		p.result = p.tokenAPI.Result()
 		forkedAPI.Dispose()
-		if p.sanityChecksEnabled && p.tokenAPI.FlushInput() {
-			p.initLoopCheck()
+		if p.tokenAPI.FlushInput() {
+			if p.sanityChecksEnabled {
+				p.initLoopCheck()
+			}
 		}
 	}
 	return ok
--- a/read/read.go
+++ b/read/read.go
@ -90,6 +90,7 @@ func makeBufioReader(input interface{}) *bufio.Reader {
 // The parserkit.reader.Reader is used internally by tokenize.API.
 type Buffer struct {
 	bufio         *bufio.Reader // used for ReadRune()
+	store         []rune        // buffer store, the buffer field is a slice on top of this one
 	buffer        []rune        // input buffer, holding runes that were read from input
 	err           error         // a read error, if one occurred
 	errOffset     int           // the offset in the buffer at which the read error was encountered
@ -121,7 +122,11 @@ func (r *Buffer) RuneAt(offset int) (rune, error) {
 	// Rune at provided offset is not yet available in the input buffer.
 	// Read runes until we have enough runes to satisfy the offset.
 	l := len(r.buffer)
-	n := offset - l + 1 // nr of runes to add to the buffer to get to offset
+
+	// Number of runes to add to the buffer to have enough space to store
+	// the rune at the offset
+	n := offset - l + 1
+
 	if n > 0 {
 		r.grow(n)
 		for writeAt := l; writeAt <= offset; writeAt++ {
@ -162,26 +167,35 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
 // It returns the index where bytes should be written.
 // If the buffer can't grow it will panic with ErrTooLarge.
 func (r *Buffer) grow(n int) {
-	// Instantiate new buffer.
-	if r.buffer == nil {
+	// Instantiate new buffer store
+	if r.store == nil {
 		b := smallBufferSize
 		if b < n {
 			b = n
 		}
-		r.buffer = make([]rune, n, b)
+		r.store = make([]rune, n, b)
+		r.buffer = r.store
 		return
 	}
 	l := len(r.buffer)
 	c := cap(r.buffer)
-	// Grow the buffer by reslicing within the available capacity.
+	// Grow the buffer by reslicing within its available capacity.
 	if n <= c-l {
 		r.buffer = r.buffer[:l+n]
 		return
 	}
-	// Grow the buffer by allocating a new one and copying the data.
-	buf := makeSlice(2*c + n)
+	// Grow the buffer by moving the data to the start of the store.
+	if cap(r.store)-l-n > 0 {
+		copy(r.store, r.buffer)
+		r.buffer = r.store[:l+n]
+		return
+	}
+	// Grow the buffer store by allocating a new one and copying the data.
+	buf := makeSlice(2*cap(r.store) + n)
+	fmt.Printf("ALLOC %d\n", 2*cap(r.store)+n)
 	copy(buf, r.buffer)
-	r.buffer = buf[:l+n]
+	r.store = buf
+	r.buffer = r.store[:l+n]
 }

 // makeSlice allocates a slice of size n. If the allocation fails, it panics
@ -201,13 +215,22 @@ func makeSlice(n int) []rune {
 // the rune that comes after the runes that were flushed.
 // So what this basically does, is turn the Buffer into a sliding window.
 func (r *Buffer) Flush(numberOfRunes int) {
-	if numberOfRunes > len(r.buffer) {
+	l := len(r.buffer)
+	if numberOfRunes > l {
 		panic(fmt.Sprintf(
 			"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
-				"exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer)))
+				"exceeds size of the buffer (%d)", numberOfRunes, l))
+	}
+	if numberOfRunes == 0 {
+		return
+	}
+	if l == numberOfRunes {
+		r.buffer = r.store[:0]
+		r.errOffset = 0
+		return
 	}
 	r.buffer = r.buffer[numberOfRunes:]
 	if r.err != nil {
-		r.errOffset -= numberOfRunes
+		r.errOffset = r.errOffset - numberOfRunes
 	}
 }
--- a/tokenize/handlers_builtin.go
+++ b/tokenize/handlers_builtin.go
@ -39,6 +39,7 @@ var C = struct {
 	Except        func(except Handler, handler Handler) Handler
 	FollowedBy    func(lookAhead Handler, handler Handler) Handler
 	NotFollowedBy func(lookAhead Handler, handler Handler) Handler
+	FlushInput    func(Handler) Handler
 }{
 	Any:           MatchAny,
 	Not:           MatchNot,
@ -54,6 +55,7 @@ var C = struct {
 	Except:        MatchExcept,
 	FollowedBy:    MatchFollowedBy,
 	NotFollowedBy: MatchNotFollowedBy,
+	FlushInput:    MakeInputFlusher,
 }

 // A provides convenient access to a range of atoms or functions to build atoms.
@ -70,7 +72,9 @@ var A = struct {
 	RuneRange      func(rune, rune) Handler
 	Str            func(string) Handler
 	StrNoCase      func(string) Handler
+	EndOfLine      Handler
 	EndOfFile      Handler
+	UntilEndOfLine Handler
 	AnyRune        Handler
 	ValidRune      Handler
 	InvalidRune    Handler
@ -124,7 +128,6 @@ var A = struct {
 	Blanks         Handler
 	Whitespace     Handler
 	UnicodeSpace   Handler
-	EndOfLine      Handler
 	Digit          Handler
 	DigitNotZero   Handler
 	Digits         Handler
@ -156,6 +159,8 @@ var A = struct {
 	Str:            MatchStr,
 	StrNoCase:      MatchStrNoCase,
 	EndOfFile:      MatchEndOfFile(),
+	EndOfLine:      MatchEndOfLine(),
+	UntilEndOfLine: MatchUntilEndOfLine(),
 	AnyRune:        MatchAnyRune(),
 	ValidRune:      MatchValidRune(),
 	InvalidRune:    MatchInvalidRune(),
@ -209,7 +214,6 @@ var A = struct {
 	Blanks:         MatchBlanks(),
 	Whitespace:     MatchWhitespace(),
 	UnicodeSpace:   MatchUnicodeSpace(),
-	EndOfLine:      MatchEndOfLine(),
 	Digit:          MatchDigit(),
 	DigitNotZero:   MatchDigitNotZero(),
 	Digits:         MatchDigits(),
@ -641,6 +645,31 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 	}
 }

+// MakeInputFlusher creates a Handler that will flush the input buffer when the
+// provided handler matches.
+//
+// This is useful when constructing a grammar using only parsekit.tokenize
+// functionality (parsekit.parse will automatically flush the input for you)
+// that has to process large input data.
+//
+// Without flushing the input, the input reader will allocate memory
+// during the parsing process, eventually enough to hold the full input
+// in memory. By wrapping Handlers with MakeInputFlusher, you can tell
+// parsekit that the accumulated input so far will no longer be needed,
+// allowing this input to be flushed from memory.
+//
+// Rule of thumb is: only use it when you have to actually fix a memory
+// hogging issue for your use case.
+func MakeInputFlusher(handler Handler) Handler {
+	return func(t *API) bool {
+		if handler(t) {
+			t.FlushInput()
+			return true
+		}
+		return false
+	}
+}
+
 // MatchSigned creates a Handler that checks if the provided Handler is
 // prefixed by an optional '+' or '-' sign. This can be used to turn numeric
 // atoms into a signed version, e.g.
@ -683,6 +712,13 @@ func MatchEndOfFile() Handler {
 	}
 }

+// MatchUntilEndOfLine creates a Handler function that accepts any rune
+// until the end of the line (or file when that's the case).
+// The newline itself is not included in the match.
+func MatchUntilEndOfLine() Handler {
+	return MatchZeroOrMore(MatchNot(MatchEndOfLine()))
+}
+
 // MatchAnyRune creates a Handler function that checks if a rune can be
 // read from the input. Invalid runes on the input are replaced with the UTF8
 // replacement rune \uFFFD (i.e. utf8.RuneError), which displays as <20>.
@ -1421,7 +1457,6 @@ func MakeTokenGroup(toktype interface{}, handler Handler) Handler {
 			token := &Token{Type: toktype, Runes: result.Runes(), Value: result.Tokens()}
 			result.SetTokens(token)
 			child.Merge()
-
 			return true
 		}
 		return false