Making parsekit.reader both simpler and more complex (more complex by adopting some buffer allocation logic from the built-in bytes package, to avoid copying memory all the time during read operations).

This commit is contained in:
Maurice Makaay 2019-06-09 21:55:01 +00:00
parent 9656cd4449
commit 65895ac502
2 changed files with 123 additions and 28 deletions

View File

@ -41,6 +41,7 @@ package reader
import ( import (
"bufio" "bufio"
"errors"
"fmt" "fmt"
"io" "io"
"strings" "strings"
@ -58,12 +59,11 @@ import (
// //
// The parserkit.reader.Reader is used internally by parsekit.TokenAPI. // The parserkit.reader.Reader is used internally by parsekit.TokenAPI.
type Reader struct { type Reader struct {
bufio *bufio.Reader // Used for ReadRune() bufio *bufio.Reader // Used for ReadRune()
buffer []rune // Input buffer, holding runes that were read from input buffer []rune // Input buffer, holding runes that were read from input
bufferOffset int // The offset of the buffer, relative to the start of the input err error // A read error, if one occurred
bufferLen int // Input size, the number of runes in the buffer errOffset int // The offset in the buffer at which the read error was encountered
err error // A read error, if one occurred firstReadDone bool // Whether or not the first read was done
errOffset int // The offset in the buffer at which the read error was encountered
} }
// New initializes a new reader struct, wrapped around the provided input. // New initializes a new reader struct, wrapped around the provided input.
@ -74,8 +74,7 @@ type Reader struct {
// - bufio.Reader // - bufio.Reader
func New(input interface{}) *Reader { func New(input interface{}) *Reader {
return &Reader{ return &Reader{
bufio: makeBufioReader(input), bufio: makeBufioReader(input),
buffer: []rune{},
} }
} }
@ -108,6 +107,8 @@ func makeBufioReader(input interface{}) *bufio.Reader {
// When reading failed, the rune will be utf8.RuneError and the error will // When reading failed, the rune will be utf8.RuneError and the error will
// be not nil. One special read fail is actually a normal situation: end // be not nil. One special read fail is actually a normal situation: end
// of file reached. In that case, the returned error wille be io.EOF. // of file reached. In that case, the returned error wille be io.EOF.
// Once a read error is encountered, that same read error is guaranteed to
// be returned on every subsequent read at or beyond the offset at which
// the error occurred.
func (r *Reader) RuneAt(offset int) (rune, error) { func (r *Reader) RuneAt(offset int) (rune, error) {
// Re-issue a previously seen read error. // Re-issue a previously seen read error.
if r.err != nil && offset >= r.errOffset { if r.err != nil && offset >= r.errOffset {
@ -116,40 +117,92 @@ func (r *Reader) RuneAt(offset int) (rune, error) {
// Rune at provided offset is not yet available in the input buffer. // Rune at provided offset is not yet available in the input buffer.
// Read runes until we have enough runes to satisfy the offset. // Read runes until we have enough runes to satisfy the offset.
for r.bufferLen <= offset { l := len(r.buffer)
readRune, _, err := r.bufio.ReadRune() n := offset - l + 1 // nr of runes to add to the buffer to get to offset
if n > 0 {
r.grow(n)
for writeAt := l; writeAt <= offset; writeAt++ {
readRune, _, err := r.bufio.ReadRune()
// Handle errors. // Skip BOM.
if err != nil { if !r.firstReadDone {
r.err = err r.firstReadDone = true
r.errOffset = r.bufferLen if readRune == '\uFEFF' {
return utf8.RuneError, err writeAt--
continue
}
}
// Handle errors.
if err != nil {
r.err = err
r.errOffset = writeAt
return utf8.RuneError, err
}
r.buffer[writeAt] = readRune
} }
// Skip BOM.
if readRune == '\uFEFF' && r.bufferOffset == 0 {
r.bufferOffset++
continue
}
r.buffer = append(r.buffer, readRune)
r.bufferLen++
} }
return r.buffer[offset], nil return r.buffer[offset], nil
} }
// The code below is heavily inspired by the buffer growth logic of the
// Go built-in 'bytes' package.

// smallBufferSize is the minimal capacity, expressed in runes, that is used
// for the initial allocation of the reader buffer.
const smallBufferSize = 64

// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
var ErrTooLarge = errors.New("parsekit.reader: too large")
// grow extends the length of the rune buffer by n elements, so the caller
// can store n more runes at the indexes directly following the runes that
// were already buffered. It does not return a value: the Reader's buffer
// is updated in place.
//
// When the buffer has enough spare capacity, it is simply resliced.
// Otherwise a new backing array is allocated and the buffered runes are
// copied into it. Every new allocation has a capacity of at least
// smallBufferSize. This also holds for a non-nil buffer whose capacity was
// shrunk by reslicing, which would otherwise result in a series of tiny
// reallocations.
//
// If the buffer can't grow, it will panic with ErrTooLarge.
func (r *Reader) grow(n int) {
	l := len(r.buffer)
	c := cap(r.buffer)

	// Grow the buffer by reslicing within the available capacity.
	if n <= c-l {
		r.buffer = r.buffer[:l+n]
		return
	}

	// Grow the buffer by allocating a new one and copying the data.
	// A nil buffer follows this same path (l and c are both 0 then).
	newCap := 2*c + n
	if newCap < 0 {
		// Integer overflow: the requested size cannot be represented.
		panic(ErrTooLarge)
	}
	if newCap < smallBufferSize {
		newCap = smallBufferSize
	}
	buf := makeSlice(newCap)
	copy(buf, r.buffer)
	r.buffer = buf[:l+n]
}
// makeSlice returns a newly allocated rune slice with both length and
// capacity n. Any panic that is raised while allocating the slice is
// replaced by a panic that carries the well-known ErrTooLarge value.
func makeSlice(n int) []rune {
	defer func() {
		if cause := recover(); cause == nil {
			// The allocation succeeded, nothing to translate.
			return
		}
		panic(ErrTooLarge)
	}()

	allocated := make([]rune, n)
	return allocated
}
// Flush deletes the provided number of runes from the start of the // Flush deletes the provided number of runes from the start of the
// reader buffer. After flushing the buffer, offset 0 as used by RuneAt() // reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
// will point to the rune that comes after the flushed runes. // will point to the rune that comes after the flushed runes.
// So what this basically does is turn the Reader into a sliding window. // So what this basically does is turn the Reader into a sliding window.
func (r *Reader) Flush(numberOfRunes int) { func (r *Reader) Flush(numberOfRunes int) {
if numberOfRunes > r.bufferLen { if numberOfRunes > len(r.buffer) {
panic(fmt.Sprintf( panic(fmt.Sprintf(
"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+ "parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen)) "exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer)))
} }
r.bufferOffset += numberOfRunes
r.bufferLen -= numberOfRunes
r.buffer = r.buffer[numberOfRunes:] r.buffer = r.buffer[numberOfRunes:]
if r.err != nil { if r.err != nil {
r.errOffset -= numberOfRunes r.errOffset -= numberOfRunes

View File

@ -231,6 +231,48 @@ func TestGivenErrorFromReader_ErrorIsCached(t *testing.T) {
assert.Equal(t, io.EOF, err) assert.Equal(t, io.EOF, err)
} }
// TestInputLargerThanDefaultBufSize64 reads from the large stub input at
// increasing offsets, hits the end of the input and then goes back to an
// earlier offset. Every successful read must return no error; the read
// one position past the last rune must return utf8.RuneError and io.EOF.
func TestInputLargerThanDefaultBufSize64(t *testing.T) {
	input, size := makeLargeStubReader()
	r := reader.New(input)

	readRune, err := r.RuneAt(0)
	assert.NoError(t, err)
	assert.Equal(t, 'X', readRune)

	readRune, err = r.RuneAt(size - 1)
	assert.NoError(t, err)
	assert.Equal(t, 'Y', readRune)

	// Offset size is one past the last rune of the input.
	readRune, err = r.RuneAt(size)
	assert.Equal(t, utf8.RuneError, readRune)
	assert.Equal(t, io.EOF, err)

	// Runes before the error offset must still be readable after the
	// read error was encountered.
	readRune, err = r.RuneAt(10)
	assert.NoError(t, err)
	assert.Equal(t, 'X', readRune)
}
// TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64
// checks the case where the very first read already asks for an offset
// far into the large stub input, followed by a read of the last rune.
// Both reads must succeed without an error.
func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *testing.T) {
	input, size := makeLargeStubReader()
	r := reader.New(input)

	readRune, err := r.RuneAt(size - 200)
	assert.NoError(t, err)
	assert.Equal(t, 'X', readRune)

	readRune, err = r.RuneAt(size - 1)
	assert.NoError(t, err)
	assert.Equal(t, 'Y', readRune)
}
// TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte checks the
// case where the very first read asks for the last rune of the large stub
// input. The read must succeed without an error.
func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
	input, size := makeLargeStubReader()
	r := reader.New(input)

	readRune, err := r.RuneAt(size - 1)
	assert.NoError(t, err)
	assert.Equal(t, 'Y', readRune)
}
// makeLargeStubReader builds a StubReader whose input consists of
// utf8.UTFMax * 64 * 5 bytes: all 'X', except for the very last byte,
// which is 'Y'. The stub is configured with io.EOF as its only error.
// The function returns the stub together with the input size.
func makeLargeStubReader() (*StubReader, int) {
	size := utf8.UTFMax * 64 * 5
	last := size - 1

	data := make([]byte, size)
	for i := 0; i < last; i++ {
		data[i] = 'X'
	}
	data[last] = 'Y'

	stub := &StubReader{bytes: data, errors: []error{io.EOF}}
	return stub, size
}
type StubReader struct { type StubReader struct {
bytes []byte bytes []byte
errors []error errors []error