Making parsekit.reader both simpler and more complex (more complex by adopting some buffer allocation logic from the built-in bytes package, so that memory is not copied all the time during read operations).

This commit is contained in:
Maurice Makaay 2019-06-09 21:55:01 +00:00
parent 9656cd4449
commit 65895ac502
2 changed files with 123 additions and 28 deletions

View File

@ -41,6 +41,7 @@ package reader
import (
"bufio"
"errors"
"fmt"
"io"
"strings"
@ -58,12 +59,11 @@ import (
//
// The parsekit.reader.Reader is used internally by parsekit.TokenAPI.
type Reader struct {
bufio *bufio.Reader // Used for ReadRune()
buffer []rune // Input buffer, holding runes that were read from input
bufferOffset int // The offset of the buffer, relative to the start of the input
bufferLen int // Input size, the number of runes in the buffer
err error // A read error, if one occurred
errOffset int // The offset in the buffer at which the read error was encountered
bufio *bufio.Reader // Used for ReadRune()
buffer []rune // Input buffer, holding runes that were read from input
err error // A read error, if one occurred
errOffset int // The offset in the buffer at which the read error was encountered
firstReadDone bool // Whether or not the first read was done
}
// New initializes a new reader struct, wrapped around the provided input.
@ -74,8 +74,7 @@ type Reader struct {
// - bufio.Reader
func New(input interface{}) *Reader {
return &Reader{
bufio: makeBufioReader(input),
buffer: []rune{},
bufio: makeBufioReader(input),
}
}
@ -108,6 +107,8 @@ func makeBufioReader(input interface{}) *bufio.Reader {
// When reading failed, the rune will be utf8.RuneError and the error will
// be not nil. One special read fail is actually a normal situation: end
// of file reached. In that case, the returned error will be io.EOF.
// Once a read error is encountered, that same read error is guaranteed to
// be returned on every subsequent read at or beyond the offset at which the
// error occurred.
func (r *Reader) RuneAt(offset int) (rune, error) {
// Re-issue a previously seen read error.
if r.err != nil && offset >= r.errOffset {
@ -116,40 +117,92 @@ func (r *Reader) RuneAt(offset int) (rune, error) {
// Rune at provided offset is not yet available in the input buffer.
// Read runes until we have enough runes to satisfy the offset.
for r.bufferLen <= offset {
readRune, _, err := r.bufio.ReadRune()
l := len(r.buffer)
n := offset - l + 1 // nr of runes to add to the buffer to get to offset
if n > 0 {
r.grow(n)
for writeAt := l; writeAt <= offset; writeAt++ {
readRune, _, err := r.bufio.ReadRune()
// Handle errors.
if err != nil {
r.err = err
r.errOffset = r.bufferLen
return utf8.RuneError, err
// Skip BOM.
if !r.firstReadDone {
r.firstReadDone = true
if readRune == '\uFEFF' {
writeAt--
continue
}
}
// Handle errors.
if err != nil {
r.err = err
r.errOffset = writeAt
return utf8.RuneError, err
}
r.buffer[writeAt] = readRune
}
// Skip BOM.
if readRune == '\uFEFF' && r.bufferOffset == 0 {
r.bufferOffset++
continue
}
r.buffer = append(r.buffer, readRune)
r.bufferLen++
}
return r.buffer[offset], nil
}
// The upcoming code was inspired heavily by the Go built-in 'bytes' package.
// smallBufferSize is the minimal capacity used for the initial buffer allocation.
const smallBufferSize = 64
// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
var ErrTooLarge = errors.New("parsekit.reader: too large")
// grow extends the length of the rune buffer by n, guaranteeing room for
// n more runes. It does not return anything: after the call, the n newly
// added slots are the last n elements of r.buffer, and the caller is
// expected to fill them.
//
// Three strategies are used, in this order:
//   - when there is no buffer yet, a new one is allocated with a capacity
//     of at least smallBufferSize;
//   - when the existing capacity suffices, the buffer is only resliced;
//   - otherwise a larger buffer is allocated and the existing runes are
//     copied into it.
//
// If the buffer can't grow it will panic with ErrTooLarge.
func (r *Reader) grow(n int) {
	// Instantiate new buffer.
	if r.buffer == nil {
		b := smallBufferSize
		if b < n {
			b = n
		}
		r.buffer = make([]rune, n, b)
		return
	}

	l := len(r.buffer)
	c := cap(r.buffer)

	// Grow the buffer by reslicing within the available capacity.
	if n <= c-l {
		r.buffer = r.buffer[:l+n]
		return
	}

	// Guard against integer overflow in the computation of the new size
	// (2*c + n). Without this check, an overflowed size could wrap around
	// to a small or negative number instead of failing cleanly.
	const maxInt = int(^uint(0) >> 1)
	if c > maxInt-c-n {
		panic(ErrTooLarge)
	}

	// Grow the buffer by allocating a new one and copying the data.
	// The capacity is more than doubled, so repeated growth is amortized.
	buf := makeSlice(2*c + n)
	copy(buf, r.buffer)
	r.buffer = buf[:l+n]
}
// makeSlice returns a freshly allocated rune slice with both length and
// capacity n. Any panic raised while allocating is replaced by a panic
// with ErrTooLarge, so callers get one well-known failure value.
func makeSlice(n int) (runes []rune) {
	defer func() {
		// Translate whatever went wrong during make() into a known error.
		if cause := recover(); cause != nil {
			panic(ErrTooLarge)
		}
	}()
	runes = make([]rune, n)
	return runes
}
// Flush deletes the provided number of runes from the start of the
// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
// will point to the rune that comes after the flushed runes.
// So what this basically does is turn the Reader into a sliding window.
func (r *Reader) Flush(numberOfRunes int) {
if numberOfRunes > r.bufferLen {
if numberOfRunes > len(r.buffer) {
panic(fmt.Sprintf(
"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
"exceeds size of the buffer (%d)", numberOfRunes, len(r.buffer)))
}
r.bufferOffset += numberOfRunes
r.bufferLen -= numberOfRunes
r.buffer = r.buffer[numberOfRunes:]
if r.err != nil {
r.errOffset -= numberOfRunes

View File

@ -231,6 +231,48 @@ func TestGivenErrorFromReader_ErrorIsCached(t *testing.T) {
assert.Equal(t, io.EOF, err)
}
// TestInputLargerThanDefaultBufSize64 reads from the large stub input at
// the first offset, the last offset, one past the end (expecting io.EOF)
// and finally at an earlier offset again.
func TestInputLargerThanDefaultBufSize64(t *testing.T) {
	stub, total := makeLargeStubReader()
	rd := reader.New(stub)

	// Very first rune of the input.
	got, _ := rd.RuneAt(0)
	assert.Equal(t, 'X', got)

	// Very last rune of the input.
	got, _ = rd.RuneAt(total - 1)
	assert.Equal(t, 'Y', got)

	// One position beyond the input must report end of file.
	_, err := rd.RuneAt(total)
	assert.Equal(t, io.EOF, err)

	// An offset before the end must still be readable after the EOF.
	got, _ = rd.RuneAt(10)
	assert.Equal(t, 'X', got)
}
// TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64
// issues a first read at an offset 200 positions before the end of the
// large stub input, followed by a read of the final rune.
func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *testing.T) {
	stub, total := makeLargeStubReader()
	rd := reader.New(stub)

	// First read lands well inside the input.
	got, _ := rd.RuneAt(total - 200)
	assert.Equal(t, 'X', got)

	// Second read reaches the final rune.
	got, _ = rd.RuneAt(total - 1)
	assert.Equal(t, 'Y', got)
}
// TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte issues a
// single read that goes straight to the final rune of the large stub input.
func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
	stub, total := makeLargeStubReader()
	last, _ := reader.New(stub).RuneAt(total - 1)
	assert.Equal(t, 'Y', last)
}
// makeLargeStubReader builds a StubReader holding utf8.UTFMax * 64 * 5
// bytes: every byte is 'X', except for the final one, which is 'Y'.
// The stub is configured with io.EOF as its only error. The function
// returns the stub together with the number of bytes it holds.
func makeLargeStubReader() (*StubReader, int) {
	const total = utf8.UTFMax * 64 * 5

	data := make([]byte, total)
	for i := 0; i < total-1; i++ {
		data[i] = 'X'
	}
	data[total-1] = 'Y'

	stub := &StubReader{
		bytes:  data,
		errors: []error{io.EOF},
	}
	return stub, total
}
type StubReader struct {
bytes []byte
errors []error