diff --git a/read/assertions_test.go b/read/assertions_test.go
index 47fed22..d5a5843 100644
--- a/read/assertions_test.go
+++ b/read/assertions_test.go
@@ -44,12 +44,15 @@ func assertPanic(t *testing.T, code func(), expected string) {
 	code()
 }
 
-func assertCache(t *testing.T, name string, r *Buffer, code func(), bufLen, bufCap int) {
+func assertBuffer(t *testing.T, name string, r *Buffer, code func(), capacity, start, length int) {
 	code()
-	if bufLen != len(r.buffer) {
-		t.Errorf("[%s] Unexpected buffer len (expected %d, got %d)", name, bufLen, len(r.buffer))
+	if capacity != r.cap {
+		t.Errorf("[%s] Unexpected buffer cap (expected %d, got %d)", name, capacity, r.cap)
 	}
-	if bufCap != cap(r.buffer) {
-		t.Errorf("[%s] Unexpected buffer cap (expected %d, got %d)", name, bufCap, cap(r.buffer))
+	if start != r.start {
+		t.Errorf("[%s] Unexpected data starting point (expected %d, got %d)", name, start, r.start)
+	}
+	if length != r.len {
+		t.Errorf("[%s] Unexpected buffer len (expected %d, got %d)", name, length, r.len)
 	}
 }
diff --git a/read/read.go b/read/read.go
index 7aafc05..8ff6c72 100644
--- a/read/read.go
+++ b/read/read.go
@@ -93,7 +93,9 @@ func makeBufioReader(input interface{}) *bufio.Reader {
 type Buffer struct {
 	bufio     *bufio.Reader // used for ReadRune()
 	buffer    []byte        // input buffer, holding runes that were read from input
-	bufOffset int           // the offset in the buffer at which the sliding data window starts
+	cap       int           // the full buffer capacity
+	start     int           // the offset from where to read buffered data in the buffer
+	len       int           // the length of the buffered data
 	err       error         // a read error, if one occurred
 	errOffset int           // the offset in the buffer at which the read error was encountered
 }
@@ -117,11 +119,13 @@ type Buffer struct {
 // Once a read error is encountered, that same read error will guaranteed
 // be return on every subsequent read at or beyond the provided offset.
 func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
-	buf.fill(offset + utf8.UTFMax)
+	if buf.len < offset+utf8.UTFMax && buf.err == nil {
+		buf.fill(offset + utf8.UTFMax)
+	}
 	if buf.err != nil && offset >= buf.errOffset {
 		return utf8.RuneError, 0, buf.err
 	}
-	r, w := utf8.DecodeRune(buf.buffer[buf.bufOffset+offset:])
+	r, w := utf8.DecodeRune(buf.buffer[buf.start+offset:])
 	return r, w, nil
 }
@@ -143,50 +147,37 @@ func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
 // Once a read error is encountered, that same read error will guaranteed
 // be return on every subsequent read at or beyond the provided offset.
 func (buf *Buffer) ByteAt(offset int) (byte, error) {
-	buf.fill(offset + 1)
+	if buf.len < offset+1 && buf.err == nil {
+		buf.fill(offset + 1)
+	}
 	if buf.err != nil && offset >= buf.errOffset {
 		return 0, buf.err
 	}
-	return buf.buffer[buf.bufOffset+offset], nil
+	return buf.buffer[buf.start+offset], nil
 }
 
 func (buf *Buffer) fill(minBytes int) {
-	// Check the current length of the buffer data.
-	bufLen := len(buf.buffer[buf.bufOffset:])
-
-	// If the required amount of bytes fits in the available data, or when
-	// an error was encountered previously, then no action is needed.
-	if minBytes <= bufLen || buf.err != nil {
-		return
-	}
-	// Grow the buffer so it can contain at least the number of requested bytes.
-	// The return value is the actual capacity of the buffer after growing it.
-	//
-	// Note:
-	// The grow() method will always arrange the data to be at the start of the
-	// buffer, getting rid of the leading unused space that might exist due to
-	// calls to Flush(). This means that buf.bufOffset will be 0 from here on,
-	// so there's no need to accomodate for this offset in the following code.
-	bufLen, bufCap := buf.grow(minBytes)
+	if minBytes > buf.cap-buf.start {
+		buf.grow(minBytes)
+	}
 
 	// Now we try to fill the buffer completely with data from our source.
 	// This is more efficient than only filling the data up to the point where
 	// we can read the data at the 'minBytes' position. Ideally, the buffer is
 	// filled completely with data to work with.
-	for bufLen < bufCap {
+	for buf.len < buf.cap {
 		// Read bytes from our source, and append them to the end of the
 		// current buffer data.
-		n, err := buf.bufio.Read(buf.buffer[bufLen:bufCap])
-		bufLen += n
+		n, err := buf.bufio.Read(buf.buffer[buf.len:buf.cap])
+		buf.len += n
 		if err != nil {
 			buf.err = err
-			buf.errOffset = bufLen
+			buf.errOffset = buf.len
 			break
 		}
 	}
-	buf.buffer = buf.buffer[:bufLen] // TODO work with a separate bufLen field in the buffer stuct, that might be simpler to work with and maybe faster.
 }
 
 const bufferBlockSize = 1024
@@ -196,20 +187,12 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
 // grow grows the buffer to guarantee space for at least the requested amount
 // of bytes, either shifting data around or reallocating the buffer.
-func (buf *Buffer) grow(minBytes int) (int, int) {
-	if buf.err != nil {
-		panic("Cannot grow buffer, there was an error earlier on!")
-	}
-
+func (buf *Buffer) grow(minBytes int) {
 	// When possible, grow the buffer by moving the data to the start of
 	// the buffer, freeing up extra capacity at the end.
-	bufLen := len(buf.buffer) - buf.bufOffset
-	bufCap := cap(buf.buffer)
-	if buf.bufOffset > 0 && minBytes <= bufCap {
-		copy(buf.buffer, buf.buffer[buf.bufOffset:])
-		buf.buffer = buf.buffer[:bufLen]
-		buf.bufOffset = 0
-		return bufLen, bufCap
+	if buf.start > 0 && minBytes <= buf.cap {
+		copy(buf.buffer, buf.buffer[buf.start:buf.start+buf.len])
+		buf.start = 0
 	}
 
 	// Grow the buffer store by allocating a new one and copying the data.
@@ -217,23 +200,23 @@ func (buf *Buffer) grow(minBytes int) (int, int) {
 	if minBytes%bufferBlockSize > 0 {
 		newbufCap += bufferBlockSize
 	}
-	newStore := makeSlice(minBytes, newbufCap)
-	copy(newStore, buf.buffer[buf.bufOffset:])
+	newStore := makeSlice(newbufCap)
+	copy(newStore, buf.buffer[buf.start:buf.start+buf.len])
 	buf.buffer = newStore
-	buf.bufOffset = 0
-	return bufLen, newbufCap
+	buf.start = 0
+	buf.cap = newbufCap
 }
 
 // makeSlice allocates a slice of size n. If the allocation fails, it panics
 // with ErrTooLarge.
-func makeSlice(l int, c int) []byte {
+func makeSlice(c int) []byte {
 	// If the make fails, give a known error.
 	defer func() {
 		if recover() != nil {
 			panic(ErrTooLarge)
 		}
 	}()
-	return make([]byte, l, c)
+	return make([]byte, c)
 }
 
 // Flush deletes the provided number of bytes from the start of the Buffer.
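Reviewer note, not part of the patch: the new cap/start/len fields implement a sliding window over the backing slice, which is what fill() and grow() above, and Flush() in the next hunk, manipulate. The standalone sketch below shows the same bookkeeping in isolation; the window type and its push/drop methods are illustrative names only, and the sketch assumes the pushed data always fits after shifting (it never reallocates, unlike grow()).

package main

import "fmt"

// window mimics the Buffer bookkeeping: buffered data lives in
// store[start : start+length], and len(store) plays the role of buf.cap.
type window struct {
	store  []byte
	start  int // offset at which the buffered data begins
	length int // number of buffered bytes
}

// push copies p into the free space at the end of the window, shifting the
// existing data to the front first when that frees up enough room.
func (w *window) push(p []byte) {
	if w.start+w.length+len(p) > len(w.store) && w.start > 0 {
		copy(w.store, w.store[w.start:w.start+w.length]) // move data to offset 0
		w.start = 0
	}
	w.length += copy(w.store[w.start+w.length:], p)
}

// drop discards n bytes from the front by moving the start offset forward,
// the same way Flush() advances buf.start and shrinks buf.len.
func (w *window) drop(n int) {
	w.start += n
	w.length -= n
}

func main() {
	w := &window{store: make([]byte, 8)}
	w.push([]byte("abcdef"))
	w.drop(4)              // keep "ef"
	w.push([]byte("ghij")) // forces a shift to the front, no reallocation
	fmt.Printf("%q\n", w.store[w.start:w.start+w.length]) // "efghij"
}

The real grow() differs only in that it falls back to allocating a larger store, rounded up to a multiple of bufferBlockSize, when shifting alone cannot make enough room.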
@@ -245,17 +228,15 @@ func (buf *Buffer) Flush(numberOfBytes int) {
 		return
 	}
 
-	bufLen := len(buf.buffer)
-	dataLen := bufLen - buf.bufOffset
-	if numberOfBytes > dataLen {
+	if numberOfBytes > buf.len {
 		panic(fmt.Sprintf(
 			"parsekit.read.Buffer.Flush(): number of bytes to flush (%d) "+
-				"exceeds size of the buffered data (%d)", numberOfBytes, dataLen))
+				"exceeds size of the buffered data (%d)", numberOfBytes, buf.len))
 	}
 
-	if dataLen == numberOfBytes {
-		buf.buffer = buf.buffer[:0]
-		buf.bufOffset = 0
+	if buf.len == numberOfBytes {
+		buf.len = 0
+		buf.start = 0
 		buf.errOffset = 0
 		return
 	}
@@ -263,5 +244,6 @@ func (buf *Buffer) Flush(numberOfBytes int) {
 	if buf.err != nil {
 		buf.errOffset -= numberOfBytes
 	}
-	buf.bufOffset += numberOfBytes
+	buf.start += numberOfBytes
+	buf.len -= numberOfBytes
 }
diff --git a/read/read_test.go b/read/read_test.go
index ea44936..a4b4512 100644
--- a/read/read_test.go
+++ b/read/read_test.go
@@ -329,74 +329,113 @@ func TestAllocationPatterns(t *testing.T) {
 	r := New(input)
 
 	// The first read will create the standard buffer and fill it with data.
+	// The first rune is requested, but there's more input data available,
+	// so the buffer is filled up completely.
 	// buffer |xxxx1024xxxxx|
-	assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 1024, 1024)
+	assertBuffer(t, "read 1", r, func() { r.RuneAt(0) }, 1024, 0, 1024)
 	rn, _, _ := r.RuneAt(0)
 	assertEqual(t, 'A', rn)
 
 	// The first 1024 bytes will fit in the standard buffer.
 	// buffer |xxxx1024xxxxx|
-	assertCache(t, "read fill cache", r, func() { r.ByteAt(1023) }, 1024, 1024)
+	assertBuffer(t, "read fill cache", r, func() { r.ByteAt(1023) }, 1024, 0, 1024)
 
 	// Flushing zero input keeps everything as-is.
 	// buffer |xxxx1024xxxxx|
-	assertCache(t, "flush zero", r, func() { r.Flush(0) }, 1024, 1024)
+	assertBuffer(t, "flush zero", r, func() { r.Flush(0) }, 1024, 0, 1024)
 
-	// Flushing all cached input truncates the cache.
+	// Flushing all cached input truncates the buffer.
 	// buffer | 1024 |
-	assertCache(t, "flush full cache", r, func() { r.Flush(1024) }, 0, 1024)
+	assertBuffer(t, "flush full cache", r, func() { r.Flush(1024) }, 1024, 0, 0)
 
 	// Reading 1025 chars will allocate a new store of 2 * 1024 and fill it with data.
+	// Offset 1024 is requested, but there's more input data available,
+	// so the buffer is filled up completely.
 	// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
-	assertCache(t, "read cap + 1", r, func() { r.ByteAt(1024) }, 2048, 2048)
+	runeBefore, _, _ := r.RuneAt(0)
+	assertBuffer(t, "read cap + 1", r, func() { r.ByteAt(1024) }, 2048, 0, 2048)
+	runeAfter, _, _ := r.RuneAt(0)
 
 	// The bytes that we had before must be copied to the newly allocated store.
-	rn, _, _ = r.RuneAt(0)
-	assertEqual(t, 'K', rn)
+	assertEqual(t, runeBefore, runeAfter)
 
 	// A partial flush moves the buffer offset, but the stored data stay the same.
 	// buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx|
-	assertCache(t, "flush partial", r, func() { r.Flush(25) }, 2048, 2048)
+	assertBuffer(t, "flush partial", r, func() { r.Flush(25) }, 2048, 25, 2023)
 
 	// The capacity for the usable part of the buffer is now 2023
 	// This number of runes can be read, without triggering a re-allocation.
 	// buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx|
-	assertCache(t, "read fill cache after partial flush", r, func() { r.ByteAt(2022) }, 2048, 2048)
+	assertBuffer(t, "read fill cache after partial flush", r, func() { r.ByteAt(2022) }, 2048, 25, 2023)
 
 	// Flush the full input.
 	// store | 2048 |
 	// buffer | 2048 |
-	assertCache(t, "flush full cache after partial flush", r, func() { r.Flush(2023) }, 0, 2048)
+	assertBuffer(t, "flush full cache after partial flush", r, func() { r.Flush(2023) }, 2048, 0, 0)
 
 	// Fill up the store again.
+	// Offset 1234 is requested, but there's more input data available,
+	// so the buffer is filled up completely.
 	// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
-	assertCache(t, "fill up the store again", r, func() { r.ByteAt(1234) }, 2048, 2048)
+	assertBuffer(t, "fill up the store again", r, func() { r.ByteAt(1234) }, 2048, 0, 2048)
 
 	// Then flush almost all input.
 	// buffer 2047 |x1x|
-	assertCache(t, "flush almost all input", r, func() { r.Flush(2047) }, 2048, 2048)
+	assertBuffer(t, "flush almost all input", r, func() { r.Flush(2047) }, 2048, 2047, 1)
 
 	// Read some data beyond the single byte. This moves the single byte at the end to
 	// the start and fills up the rest of the buffer, without a reallocation.
 	// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
-	assertCache(t, "read the remaining size, triggering a move", r, func() { r.ByteAt(1234) }, 2048, 2048)
+	assertBuffer(t, "read the remaining size, triggering a move", r, func() { r.ByteAt(1234) }, 2048, 0, 2048)
 
 	// Now flush only one rune from the cache.
 	// buffer 1 |xxxxxxxxx2047xxxxxxxxxxxxxx|
-	assertCache(t, "flush 1", r, func() { r.Flush(1) }, 2048, 2048)
+	assertBuffer(t, "flush 1", r, func() { r.Flush(1) }, 2048, 1, 2047)
 
 	// Now read the full available capacity. This will not fit, so
 	// space has to be made. Since there's 1 free space at the start of the store,
 	// the data are moved to the start and no reallocation is needed.
 	// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxx|
-	assertCache(t, "read full capacity with 1 free byte at start", r, func() { r.ByteAt(2047) }, 2048, 2048)
+	assertBuffer(t, "read full capacity with 1 free byte at start", r, func() { r.ByteAt(2047) }, 2048, 0, 2048)
 
 	// Now read in the whole rest of the buffer, asking for an offset that is way out of range.
 	// It does allocate enough memory to store 10000 bytes (bringing us to 10240), but while reading it is
 	// detected that there are not enough bytes to fill it. That puts a limit on the amount of data in
-	// the buffer (5120 instead of the full 10240 buffer size).
+	// the buffer, so the buffer is not completely filled.
 	// buffer |xxxxxxxxxxxxxxx5120xxxxxxxxxxxxxxxxxxxx 10240-5120 |
-	assertCache(t, "over-ask", r, func() { r.ByteAt(10000) }, 5120, 10240)
+	remaining := input.remaining
+	assertBuffer(t, "over-ask", r, func() { r.ByteAt(10000) }, 10240, 0, 2048+remaining)
+}
+
+func makeLargeStubReader() (*StubReader, int) {
+	size := 8192
+	bytes := make([]byte, size)
+	for i := range bytes {
+		bytes[i] = 'A' + byte(i%26)
+	}
+	return &StubReader{bytes: bytes, errors: []error{io.EOF}, remaining: size}, size
+}
+
+type StubReader struct {
+	bytes     []byte
+	errors    []error
+	remaining int
+}
+
+func (r *StubReader) Read(p []byte) (n int, err error) {
+	if len(r.bytes) > 0 {
+		head, tail := r.bytes[0], r.bytes[1:]
+		r.bytes = tail
+		p[0] = head
+		r.remaining--
+		return 1, nil
+	}
+	if len(r.errors) > 0 {
+		head, tail := r.errors[0], r.errors[1:]
+		r.errors = tail
+		return 0, head
+	}
+	panic("StubReader is all out of bytes and errors")
 }
 
 func Benchmark0BytesInputFile(b *testing.B) {
@@ -461,32 +500,3 @@ func processInputFile(b *testing.B, testSize int) {
 		}
 	}
 }
-
-func makeLargeStubReader() (*StubReader, int) {
-	size := 8192
-	bytes := make([]byte, size)
-	for i := range bytes {
-		bytes[i] = 'A' + byte(i%26)
-	}
-	return &StubReader{bytes: bytes, errors: []error{io.EOF}}, size
-}
-
-type StubReader struct {
-	bytes  []byte
-	errors []error
-}
-
-func (r *StubReader) Read(p []byte) (n int, err error) {
-	if len(r.bytes) > 0 {
-		head, tail := r.bytes[0], r.bytes[1:]
-		r.bytes = tail
-		p[0] = head
-		return 1, nil
-	}
-	if len(r.errors) > 0 {
-		head, tail := r.errors[0], r.errors[1:]
-		r.errors = tail
-		return 0, head
-	}
-	panic("StubReader is all out of bytes and errors")
-}
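Reviewer note, not part of the patch: the StubReader above deliberately hands out a single byte per Read call, which is exactly what forces the loop in fill() to keep calling bufio.Read() until the buffer is full or an error such as io.EOF surfaces. The self-contained sketch below illustrates that interaction; the oneByteReader type and the fill loop in main() are illustrative stand-ins, not the package's API.

package main

import (
	"fmt"
	"io"
)

// oneByteReader behaves like the test's StubReader: it returns at most one
// byte per Read call, then io.EOF once its data is exhausted.
type oneByteReader struct{ data []byte }

func (r *oneByteReader) Read(p []byte) (int, error) {
	if len(r.data) == 0 {
		return 0, io.EOF
	}
	p[0] = r.data[0]
	r.data = r.data[1:]
	return 1, nil
}

func main() {
	src := &oneByteReader{data: []byte("hello")}
	buf := make([]byte, 8)
	length := 0

	// Like Buffer.fill(): keep reading until the buffer is full or the
	// reader reports an error; short reads simply extend length.
	var err error
	for length < len(buf) && err == nil {
		var n int
		n, err = src.Read(buf[length:])
		length += n
	}
	fmt.Printf("buffered %d bytes %q, err: %v\n", length, buf[:length], err)
}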
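For anyone reviewing the test without knowing the Buffer API it drives, here is a minimal usage sketch. It is written as an extra test in the same read package; the test name and the input string are made up for illustration, and it assumes New() accepts a plain io.Reader, as it does for the StubReader above.

package read

import (
	"strings"
	"testing"
)

// TestUsageSketch is illustrative only: it walks the same RuneAt / ByteAt /
// Flush cycle as TestAllocationPatterns, but on a tiny multi-byte-rune input.
func TestUsageSketch(t *testing.T) {
	buf := New(strings.NewReader("héllo"))

	// RuneAt and ByteAt peek at an offset without consuming anything.
	r, w, err := buf.RuneAt(0)
	if err != nil || r != 'h' || w != 1 {
		t.Fatalf("unexpected first rune %q (width %d, err %v)", r, w, err)
	}
	if b, _ := buf.ByteAt(1); b != 0xc3 { // first byte of the two-byte 'é'
		t.Fatalf("unexpected byte 0x%x at offset 1", b)
	}

	// Flush consumes bytes from the front; offsets are then relative to the new start.
	buf.Flush(1)
	if r, _, _ := buf.RuneAt(0); r != 'é' {
		t.Fatalf("expected 'é' after Flush(1), got %q", r)
	}
}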