298 lines
10 KiB
Go
298 lines
10 KiB
Go
// Package read provides a buffered input reader that is used to feed data to the tokenizer.
|
|
//
|
|
// Functionally, it provides an input buffer in the form of a sliding window.
|
|
// Let's say we've got the following input coming up in the io.Reader that is
|
|
// wrapped by the Reader:
|
|
//
|
|
// |H|e|l|l|o|,| |w|o|r|l|d|!| <-- bytes
|
|
// 0 6 12 <-- byte offset
|
|
//
|
|
// The Reader can now be used to retrieve data from the input, based on their
|
|
// byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data
|
|
// will be retrieved in sequence by the user of this code, but that is not a
|
|
// requirement. Let's say we right away ask to retrieve the byte with offset 6
|
|
// from the input (the 'w'). The Reader buffer will then be filled with at
|
|
// least 6 bytes and the bytes at offset 6 will be returned.
|
|
//
|
|
// Note: the actual Reader would not stop after reading the byte at offset 6.
|
|
// For performance reasons, it would read as much data into the available buffer
|
|
// space as possible (but no more than the available space).
|
|
//
|
|
// |H|e|l|l|o| |w|
|
|
// 0 6
|
|
//
|
|
// This means that you can retrieve data for arbitrary offsets. If you request
|
|
// offsets that are already in the Reader buffer, then the buffered data are
|
|
// returned. If you request an offset that is not available in the buffer, then
|
|
// the buffer will be expanded.
|
|
//
|
|
// To make this into a sliding window (which preserves memory space while scanning
|
|
// the input data), the Reader provides the method Flush(numberOfBytes).
|
|
// This method will drop the provided number of bytes from the Reader buffer.
|
|
// So when we'd do a Flush(3) on the example buffer from above, then the Reader
|
|
// buffer would become:
|
|
//
|
|
// |l|o| |w|
|
|
// 0 3
|
|
//
|
|
// Note that the offset for the first rune 'l' in the buffer is now 0.
|
|
// You can consider the complete input to be changed in a similar way:
|
|
//
|
|
// |l|o|,| |w|o|r|l|d|!|
|
|
// 0 6 9
|
|
//
|
|
// So after a flush, the first upcoming rune after the flushed runes
|
|
// will always be at offset 0.
|
|
package read
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// New initializes a new Buffer struct, wrapped around the provided input.
|
|
//
|
|
// The input can be any one of the following types:
|
|
//
|
|
// • string
|
|
//
|
|
// • a type implementing io.Reader
|
|
//
|
|
// • bufio.Reader
|
|
func New(input interface{}) Buffer {
|
|
return Buffer{
|
|
bufio: makeBufioReader(input),
|
|
}
|
|
}
|
|
|
|
func makeBufioReader(input interface{}) *bufio.Reader {
|
|
switch input := input.(type) {
|
|
case bufio.Reader:
|
|
return &input
|
|
case *bufio.Reader:
|
|
return input
|
|
case io.Reader:
|
|
return bufio.NewReader(input)
|
|
case string:
|
|
return bufio.NewReader(strings.NewReader(input))
|
|
default:
|
|
panic(fmt.Sprintf("parsekit.read.New(): no support for input of type %T", input))
|
|
}
|
|
}
|
|
|
|
// Buffer wraps around a bufio.Reader and provides an additional layer of
|
|
// buffering that allows us to read the same data over and over again.
|
|
// This is useful for implementing a parser that must be able to do lookahead
|
|
// on the input, returning to the original input position after finishing
|
|
// that lookahead).
|
|
//
|
|
// To minimize memory use, it is also possible to flush the read buffer when there is
|
|
// no more need to go back to previously read data.
|
|
//
|
|
// This buffer is used internally by tokenize.API.
|
|
type Buffer struct {
|
|
bufio *bufio.Reader // used for ReadRune()
|
|
buffer []byte // input buffer, holding bytes that were read from input
|
|
cap int // the full buffer capacity
|
|
start int // the offset from where on to read buffered data in the buffer
|
|
len int // the length of the buffered data
|
|
err error // a read error, if one occurred
|
|
errOffset int // the offset in the buffer at which the read error was encountered
|
|
}
|
|
|
|
// RuneAt reads the rune at the provided byte offset.
|
|
//
|
|
// The byte offset is relative to the current starting position of the Buffer.
|
|
// When starting reading, offset 0 will point at the start of the input.
|
|
// After flushing some bytes, offset 0 will point at the input up to where
|
|
// the flush was done.
|
|
//
|
|
// When reading was successful, the rune and the width of the rune in bytes
|
|
// will be returned. The returned error will be nil.
|
|
// When an invalid UTF8 rune is encountered on the input, the error will be nil,
|
|
// but the rune will be utf8.RuneError
|
|
//
|
|
// When reading failed, the rune will be utf8.RuneError and the error will
|
|
// be not nil. One special read fail is actually a normal situation: end
|
|
// of file reached. In that case, the returned error wille be io.EOF.
|
|
//
|
|
// Once a read error is encountered, that same read error will guaranteed
|
|
// be return on every subsequent read at or beyond the provided offset.
|
|
func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
|
|
if buf.len < offset+utf8.MaxRune && buf.err == nil {
|
|
buf.fill(offset + utf8.UTFMax)
|
|
}
|
|
if buf.err != nil && offset >= buf.errOffset {
|
|
return utf8.RuneError, 0, buf.err
|
|
}
|
|
r, w := utf8.DecodeRune(buf.buffer[buf.start+offset:])
|
|
return r, w, nil
|
|
}
|
|
|
|
// ByteAt reads the byte at the provided byte offset.
|
|
//
|
|
// The byte offset is relative to the current starting position of the Buffer.
|
|
// When starting reading, offset 0 will point at the start of the input.
|
|
// After flushing, offset 0 will point at the input up to where the flush
|
|
// was done.
|
|
//
|
|
// When reading was successful, the byte will be returned. The returned
|
|
// error will be nil.
|
|
//
|
|
// When reading failed, the byte will be 0x00 and the error will
|
|
// not be nil. One special read fail is actually a normal situation: end
|
|
// of file reached. In that case, the returned error wille be io.EOF.
|
|
//
|
|
// Once a read error is encountered, that same read error will guaranteed
|
|
// be return on every subsequent read at or beyond the provided offset.
|
|
func (buf *Buffer) ByteAt(offset int) (byte, error) {
|
|
if buf.len < offset+1 && buf.err == nil {
|
|
buf.fill(offset + 1)
|
|
}
|
|
if buf.err != nil && offset >= buf.errOffset {
|
|
return 0, buf.err
|
|
}
|
|
return buf.buffer[buf.start+offset], nil
|
|
}
|
|
|
|
// BytesAt reads at max the provided number of bytes at the provided byte offset.
|
|
//
|
|
// The byte offset is relative to the current starting position of the Buffer.
|
|
// When starting reading, offset 0 will point at the start of the input.
|
|
// After flushing, offset 0 will point at the input up to where the flush
|
|
// was done.
|
|
//
|
|
// When reading was successful, the byte will be returned. The returned
|
|
// error will be nil.
|
|
//
|
|
// When reading failed, the returned byte slice might be empty, or it might
|
|
// contain a part of the requsted bytes. The error will not be nil.
|
|
// One special read fail is actually a normal situation: end
|
|
// of file reached. In that case, the returned error wille be io.EOF.
|
|
//
|
|
// Once a read error is encountered, that same read error will guaranteed
|
|
// be return on every subsequent read at or beyond the provided offset.
|
|
func (buf *Buffer) BytesAt(offset int, count int) ([]byte, error) {
|
|
if buf.len < offset+count && buf.err == nil {
|
|
buf.fill(offset + count)
|
|
}
|
|
|
|
if buf.err != nil && offset+count > buf.errOffset {
|
|
return buf.buffer[buf.start+offset : buf.start+buf.errOffset], buf.err
|
|
}
|
|
return buf.buffer[buf.start+offset : buf.start+offset+count], nil
|
|
}
|
|
|
|
func (buf *Buffer) BufferedBytesAt(offset int) ([]byte, error) {
|
|
if buf.len < offset+1 && buf.err == nil {
|
|
buf.fill(offset + 1)
|
|
}
|
|
|
|
if buf.err != nil {
|
|
return buf.buffer[buf.start+offset : buf.start+buf.errOffset], buf.err
|
|
}
|
|
return buf.buffer[buf.start+offset : buf.start+buf.len], nil
|
|
}
|
|
|
|
func (buf *Buffer) fill(minBytes int) {
|
|
// Grow the buffer so it can contain at least the number of requested bytes.
|
|
if minBytes > buf.cap-buf.start {
|
|
buf.grow(minBytes)
|
|
}
|
|
|
|
// Try to fill the buffer completely with data from our source.
|
|
// This is more efficient than only filling the data up to the point where
|
|
// we can read the data at the 'minBytes' position. Ideally, the buffer is
|
|
// filled completely with data to work with.
|
|
availableLen := buf.cap - buf.start
|
|
for buf.len < availableLen {
|
|
// Read bytes from our source, and append them to the end of the
|
|
// current buffer data.
|
|
n, err := buf.bufio.Read(buf.buffer[buf.len:buf.cap])
|
|
buf.len += n
|
|
|
|
if err != nil {
|
|
buf.err = err
|
|
buf.errOffset = buf.len
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
const defaultBufferSize = 1024
|
|
const runeCacheSize = 128
|
|
|
|
// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
|
|
var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
|
|
|
|
// grow grows the buffer to guarantee space for at least the requested amount
|
|
// of bytes, either shifting data around or reallocating the buffer.
|
|
// When reallocating, the new buffer size will always be a multitude of the
|
|
// default buffer size.
|
|
func (buf *Buffer) grow(minBytes int) {
|
|
// When possible, grow the buffer by moving the data to the start of
|
|
// the buffer, freeing up extra capacity at the end.
|
|
if buf.start > 0 && minBytes <= buf.cap {
|
|
copy(buf.buffer, buf.buffer[buf.start:buf.start+buf.len])
|
|
buf.start = 0
|
|
return
|
|
}
|
|
|
|
// Grow the buffer store by allocating a new one and copying the data.
|
|
newbufCap := (minBytes / defaultBufferSize) * defaultBufferSize
|
|
if minBytes%defaultBufferSize > 0 {
|
|
newbufCap += defaultBufferSize
|
|
}
|
|
newStore := makeSlice(newbufCap)
|
|
|
|
copy(newStore, buf.buffer[buf.start:buf.start+buf.len])
|
|
buf.buffer = newStore
|
|
buf.start = 0
|
|
buf.cap = newbufCap
|
|
return
|
|
}
|
|
|
|
// makeSlice allocates a slice of size n. If the allocation fails, it panics
|
|
// with ErrTooLarge.
|
|
func makeSlice(c int) []byte {
|
|
// If the make fails, give a known error.
|
|
defer func() {
|
|
if recover() != nil {
|
|
panic(ErrTooLarge)
|
|
}
|
|
}()
|
|
return make([]byte, c)
|
|
}
|
|
|
|
// Flush deletes the provided number of bytes from the start of the Buffer.
|
|
// After flushing the Buffer, offset 0 as used by RuneAt() and ByteAt() will
|
|
// point to the first byte that came after the bytes that were flushed.
|
|
func (buf *Buffer) Flush(numberOfBytes int) {
|
|
if numberOfBytes == 0 {
|
|
return
|
|
}
|
|
|
|
if numberOfBytes > buf.len {
|
|
panic(fmt.Sprintf(
|
|
"parsekit.read.Buffer.Flush(): number of bytes to flush (%d) "+
|
|
"exceeds size of the buffered data (%d)", numberOfBytes, buf.len))
|
|
}
|
|
|
|
if buf.len == numberOfBytes {
|
|
buf.len = 0
|
|
buf.start = 0
|
|
buf.errOffset = 0
|
|
return
|
|
}
|
|
|
|
if buf.err != nil {
|
|
buf.errOffset -= numberOfBytes
|
|
}
|
|
buf.start += numberOfBytes
|
|
buf.len -= numberOfBytes
|
|
}
|