From 65895ac50297611b875d351ae47435c60368e8ea Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Sun, 9 Jun 2019 21:55:01 +0000 Subject: [PATCH] Making parsekit.reader both simpler and more complex (more complex by adopting some buffer allocation logic from the built-in bytes package, to not be copying memory all the time during the read operations). --- reader/reader.go | 109 +++++++++++++++++++++++++++++++----------- reader/reader_test.go | 42 ++++++++++++++++ 2 files changed, 123 insertions(+), 28 deletions(-) diff --git a/reader/reader.go b/reader/reader.go index ea3af91..104d08e 100644 --- a/reader/reader.go +++ b/reader/reader.go @@ -41,6 +41,7 @@ package reader import ( "bufio" + "errors" "fmt" "io" "strings" @@ -58,12 +59,11 @@ import ( // // The parserkit.reader.Reader is used internally by parsekit.TokenAPI. type Reader struct { - bufio *bufio.Reader // Used for ReadRune() - buffer []rune // Input buffer, holding runes that were read from input - bufferOffset int // The offset of the buffer, relative to the start of the input - bufferLen int // Input size, the number of runes in the buffer - err error // A read error, if one occurred - errOffset int // The offset in the buffer at which the read error was encountered + bufio *bufio.Reader // Used for ReadRune() + buffer []rune // Input buffer, holding runes that were read from input + err error // A read error, if one occurred + errOffset int // The offset in the buffer at which the read error was encountered + firstReadDone bool // Whether or not the first read was done } // New initializes a new reader struct, wrapped around the provided input. @@ -74,8 +74,7 @@ type Reader struct { // - bufio.Reader func New(input interface{}) *Reader { return &Reader{ - bufio: makeBufioReader(input), - buffer: []rune{}, + bufio: makeBufioReader(input), } } @@ -108,6 +107,8 @@ func makeBufioReader(input interface{}) *bufio.Reader { // When reading failed, the rune will be utf8.RuneError and the error will // be not nil. 
One special read fail is actually a normal situation: end // of file reached. In that case, the returned error wille be io.EOF. +// Once a read error is encountered, that same read error is guaranteed +// to be returned on every subsequent read at or beyond the offset at which it occurred. func (r *Reader) RuneAt(offset int) (rune, error) { // Re-issue a previously seen read error. if r.err != nil && offset >= r.errOffset { @@ -116,40 +117,92 @@ func (r *Reader) RuneAt(offset int) (rune, error) { // Rune at provided offset is not yet available in the input buffer. // Read runes until we have enough runes to satisfy the offset. - for r.bufferLen <= offset { - readRune, _, err := r.bufio.ReadRune() + l := len(r.buffer) + n := offset - l + 1 // nr of runes to add to the buffer to get to offset + if n > 0 { + r.grow(n) + for writeAt := l; writeAt <= offset; writeAt++ { + readRune, _, err := r.bufio.ReadRune() - // Handle errors. - if err != nil { - r.err = err - r.errOffset = r.bufferLen - return utf8.RuneError, err + // Skip BOM. + if !r.firstReadDone { + r.firstReadDone = true + if readRune == '\uFEFF' { + writeAt-- + continue + } + } + + // Handle errors. + if err != nil { + r.err = err + r.errOffset = writeAt + return utf8.RuneError, err + } + + r.buffer[writeAt] = readRune } - - // Skip BOM. - if readRune == '\uFEFF' && r.bufferOffset == 0 { - r.bufferOffset++ - continue - } - - r.buffer = append(r.buffer, readRune) - r.bufferLen++ } + return r.buffer[offset], nil } +// The upcoming code was inspired heavily by the Go built-in 'bytes' package. + +// smallBufferSize is an initial allocation minimal capacity. +const smallBufferSize = 64 + +// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer. +var ErrTooLarge = errors.New("parsekit.reader: too large") + +// grow grows the buffer to guarantee space for n more runes. +// It extends len(r.buffer) by n; the caller writes the new runes in place. +// If the buffer can't grow it will panic with ErrTooLarge. 
+func (r *Reader) grow(n int) { + // Instantiate new buffer. + if r.buffer == nil { + b := smallBufferSize + if b < n { + b = n + } + r.buffer = make([]rune, n, b) + return + } + l := len(r.buffer) + c := cap(r.buffer) + // Grow the buffer by reslicing within the available capacity. + if n <= c-l { + r.buffer = r.buffer[:l+n] + return + } + // Grow the buffer by allocating a new one and copying the data. + buf := makeSlice(2*c + n) + copy(buf, r.buffer) + r.buffer = buf[:l+n] +} + +// makeSlice allocates a slice of size n. If the allocation fails, it panics +// with ErrTooLarge. +func makeSlice(n int) []rune { + // If the make fails, give a known error. + defer func() { + if recover() != nil { + panic(ErrTooLarge) + } + }() + return make([]rune, n) +} + // Flush deletes the provided number of runes from the start of the // reader buffer. After flushing the buffer, offset 0 as used by RuneAt() // will point to the rune that comes after the flushed runes. // So what this basically does is turn the Reader into a sliding window. 
func (r *Reader) Flush(numberOfRunes int) { - if numberOfRunes > r.bufferLen { + if numberOfRunes > len(r.buffer) { panic(fmt.Sprintf( "parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+ - "exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen)) + "exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer))) } - r.bufferOffset += numberOfRunes - r.bufferLen -= numberOfRunes r.buffer = r.buffer[numberOfRunes:] if r.err != nil { r.errOffset -= numberOfRunes diff --git a/reader/reader_test.go b/reader/reader_test.go index 234dbfe..d32977a 100644 --- a/reader/reader_test.go +++ b/reader/reader_test.go @@ -231,6 +231,48 @@ func TestGivenErrorFromReader_ErrorIsCached(t *testing.T) { assert.Equal(t, io.EOF, err) } +func TestInputLargerThanDefaultBufSize64(t *testing.T) { + input, size := makeLargeStubReader() + r := reader.New(input) + + readRune, err := r.RuneAt(0) + assert.Equal(t, 'X', readRune) + readRune, err = r.RuneAt(size - 1) + assert.Equal(t, 'Y', readRune) + readRune, err = r.RuneAt(size) + assert.Equal(t, io.EOF, err) + readRune, err = r.RuneAt(10) + assert.Equal(t, 'X', readRune) +} + +func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *testing.T) { + input, size := makeLargeStubReader() + r := reader.New(input) + + readRune, _ := r.RuneAt(size - 200) + assert.Equal(t, 'X', readRune) + readRune, _ = r.RuneAt(size - 1) + assert.Equal(t, 'Y', readRune) +} + +func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) { + input, size := makeLargeStubReader() + r := reader.New(input) + + readRune, _ := r.RuneAt(size - 1) + assert.Equal(t, 'Y', readRune) +} + +func makeLargeStubReader() (*StubReader, int) { + size := utf8.UTFMax * 64 * 5 + bytes := make([]byte, size) + for i := range bytes { + bytes[i] = 'X' + } + bytes[size-1] = 'Y' + return &StubReader{bytes: bytes, errors: []error{io.EOF}}, size +} + type StubReader struct { bytes []byte errors []error