go-parsekit/read/read.go

// Package read provides a buffered input reader that is used to feed data to the tokenizer.
//
// Functionally, it provides an input buffer in the form of a sliding window.
// Let's say we've got the following input coming up in the io.Reader that is
// wrapped by the Reader:
//
//     |H|e|l|l|o|,| |w|o|r|l|d|!|  <-- bytes
//      0           6           12  <-- byte offset
//
// The Reader can now be used to retrieve data from the input, based on their
// byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data
// will be retrieved in sequence by the user of this code, but that is not a
// requirement. Let's say we retrieve the byte with offset 7 from the input
// (the 'w'), then the Reader buffer will be filled with bytes from the
// io.Reader until there are enough bytes available to return the byte for
// offset 7:
//
//     |H|e|l|l|o|,| |w|
//      0             7
//
// This means that you can retrieve data for arbitrary offsets. If you request
// an offset that is already in the Reader buffer, then the buffered data are
// returned. If you request one that is not in the buffer, then the buffer will
// be expanded.
//
// To make this into a sliding window (preserving memory space while scanning
// the input data), the Reader provides the method Flush(numberOfBytes).
// This method will drop the provided number of bytes from the Reader buffer.
// So when we'd do a Flush(3) on the example buffer from above, then the Reader
// buffer would become:
//
//     |l|o|,| |w|
//      0       4
//
// Note that the offset for the first rune 'l' in the buffer is now 0.
// You can consider the input to be changed in a similar way:
//
//     |l|o|,| |w|o|r|l|d|!|
//      0           6     9
//
// So after a flush, the first upcoming rune after the flushed runes
// will always be at offset 0.
package read

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// New creates a Buffer that takes its data from the provided input.
//
// The following input types are supported:
//
// • string
//
// • a type implementing io.Reader
//
// • bufio.Reader or *bufio.Reader
//
// For an input of any other type, New panics.
func New(input interface{}) *Buffer {
	buf := new(Buffer)
	buf.bufio = makeBufioReader(input)
	return buf
}

// makeBufioReader turns the supported input types into a *bufio.Reader.
//
// A bufio.Reader value is copied and a pointer to that copy is returned,
// a *bufio.Reader is returned as-is, and an io.Reader or a string is wrapped
// in a new bufio.Reader. For any other input type, the function panics.
func makeBufioReader(input interface{}) *bufio.Reader {
	// The order of these checks matters: a *bufio.Reader is an io.Reader
	// as well, but it must not be wrapped a second time.
	if value, ok := input.(bufio.Reader); ok {
		return &value
	}
	if pointer, ok := input.(*bufio.Reader); ok {
		return pointer
	}
	if reader, ok := input.(io.Reader); ok {
		return bufio.NewReader(reader)
	}
	if text, ok := input.(string); ok {
		return bufio.NewReader(strings.NewReader(text))
	}
	panic(fmt.Sprintf("parsekit.read.New(): no support for input of type %T", input))
}

// Buffer wraps around a bufio.Reader and provides an additional layer of
// buffering that allows us to read the same data over and over again.
// This is useful for implementing a parser that must be able to do lookahead
// on the input, returning to the original input position after finishing
// that lookahead.
//
// To minimize memory use, it is also possible to flush the read buffer when there is
// no more need to go back to previously read data.
//
// This read.Buffer is used internally by tokenize.API.
type Buffer struct {
	bufio     *bufio.Reader // the wrapped input, read using ReadByte()
	store     []byte        // buffer store, the buffer field is a slice on top of this one
	buffer    []byte        // input buffer, holding bytes that were read from input
	err       error         // a read error, if one occurred
	errOffset int           // the offset in the buffer at which the read error was encountered
}

// RuneAt returns the rune that starts at the provided byte offset.
//
// The offset is relative to the current start of the Buffer. Before any
// flush, offset 0 is the first byte of the input. After a flush, offset 0
// is the first byte that was not flushed.
//
// On success, the rune and its width in bytes are returned, together with
// a nil error. Bytes that do not form a valid UTF8 rune are not treated as
// a read error: the rune is utf8.RuneError and the error is nil.
//
// When the offset cannot be read, the rune is utf8.RuneError, the width
// is 0 and the error is not nil. Reaching the end of the input is reported
// in this way as well, using io.EOF as the error.
//
// A read error is remembered: every later read at or beyond the offset at
// which the error occurred returns that same error.
func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
	// An earlier read error already covers this offset: report it again.
	if buf.err != nil && offset >= buf.errOffset {
		return utf8.RuneError, 0, buf.err
	}

	// Try to have utf8.UTFMax bytes available from the offset onward, so a
	// complete rune can be decoded. Once a read error has been stored, the
	// input is not read any further.
	available := len(buf.buffer)
	wanted := offset + utf8.UTFMax
	if buf.err == nil && available < wanted {
		buf.grow(wanted)
		for pos := available; pos < wanted; pos++ {
			next, readErr := buf.bufio.ReadByte()
			if readErr != nil {
				// Remember the error and cut off the part of the
				// buffer that could not be filled.
				buf.err, buf.errOffset = readErr, pos
				buf.buffer = buf.buffer[:pos]
				break
			}
			buf.buffer[pos] = next
		}

		// Reading may have failed at or before the requested offset.
		if buf.err != nil && offset >= buf.errOffset {
			return utf8.RuneError, 0, buf.err
		}
	}

	decoded, width := utf8.DecodeRune(buf.buffer[offset:])
	return decoded, width, nil
}

// ByteAt returns the byte at the provided byte offset.
//
// The offset is relative to the current start of the Buffer. Before any
// flush, offset 0 is the first byte of the input. After a flush, offset 0
// is the first byte that was not flushed.
//
// On success, the byte is returned together with a nil error.
//
// When the offset cannot be read, the byte is 0x00 and the error is not nil.
// Reaching the end of the input is reported in this way as well, using
// io.EOF as the error.
//
// A read error is remembered: every later read at or beyond the offset at
// which the error occurred returns that same error.
func (buf *Buffer) ByteAt(offset int) (byte, error) {
	// An earlier read error already covers this offset: report it again.
	if buf.err != nil && offset >= buf.errOffset {
		return 0, buf.err
	}

	// Read more input when the buffer does not yet reach the offset. Once a
	// read error has been stored, the input is not read any further.
	if have, need := len(buf.buffer), offset+1; buf.err == nil && have < need {
		buf.grow(need)
		for pos := have; pos < need; pos++ {
			next, readErr := buf.bufio.ReadByte()
			if readErr != nil {
				// Remember the error and cut off the part of the buffer
				// that could not be filled. The failing position can
				// never lie beyond the requested offset here, so the
				// error applies to this call too.
				buf.err, buf.errOffset = readErr, pos
				buf.buffer = buf.buffer[:pos]
				return 0, readErr
			}
			buf.buffer[pos] = next
		}
	}

	return buf.buffer[offset], nil
}

// The upcoming code was inspired heavily by the Go built-in 'bytes' package.

// smallBufferSize is the minimal capacity that is used for the initial
// allocation of a buffer store.
const smallBufferSize = 64

// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")

// grow sets the length of the buffer to requiredSize bytes, while keeping
// the bytes that are already buffered at the start of it. The caller is
// expected to fill the bytes beyond the original buffer length.
//
// Space is obtained in one of the following ways, tried in this order:
//
// • a first store is allocated when there is none yet
//
// • the buffer is resliced when its capacity suffices
//
// • the buffered data are moved to the start of the store, when the store
// has unused space in front of the buffer and is large enough
//
// • a new, larger store is allocated and the buffered data are copied into it
//
// If a store cannot be allocated, grow panics with ErrTooLarge.
func (buf *Buffer) grow(requiredSize int) {
	// Instantiate a new buffer store.
	if buf.store == nil {
		b := smallBufferSize
		if b < requiredSize {
			b = requiredSize
		}
		buf.store = makeSlice(b)
		buf.buffer = buf.store[:requiredSize]
		return
	}

	capBuffer := cap(buf.buffer)

	// Grow the buffer by reslicing within the available capacity.
	if capBuffer >= requiredSize {
		buf.buffer = buf.buffer[:requiredSize]
		return
	}

	capStore := cap(buf.store)
	freeAtStartOfStore := capStore - capBuffer

	// Grow the buffer by moving the data to the start of the store.
	// Note: according to the spec, overlapping slices are allowed with copy().
	if freeAtStartOfStore > 0 && requiredSize <= capStore {
		moved := buf.store[:requiredSize]
		copy(moved, buf.buffer)
		buf.buffer = moved
		return
	}

	// Grow the buffer by allocating a new store and copying the data.
	// The copy must target a slice that has a length: copy() transfers
	// min(len(dst), len(src)) bytes and makeSlice returns a slice of
	// length 0, so copying into that slice directly would drop all
	// buffered data.
	newStore := makeSlice(2*capStore + requiredSize)
	newBuffer := newStore[:requiredSize]
	copy(newBuffer, buf.buffer)
	buf.store = newStore[:0]
	buf.buffer = newBuffer
}

// makeSlice allocates a byte slice with length 0 and capacity n.
// When the allocation panics, that panic is replaced by a panic
// with ErrTooLarge.
func makeSlice(n int) (allocated []byte) {
	defer func() {
		// Only a recoverable panic from make() ends up here; translate
		// it into the error that is known to users of this package.
		if failure := recover(); failure == nil {
			return
		}
		panic(ErrTooLarge)
	}()
	allocated = make([]byte, 0, n)
	return allocated
}

// Flush drops the provided number of bytes from the start of the Buffer.
// After the flush, offset 0 as used by RuneAt() and ByteAt() points at the
// first byte that follows the flushed bytes, which is what turns the Buffer
// into a sliding window over the input.
//
// Flushing 0 bytes does nothing. Flushing more bytes than there are in the
// buffer results in a panic.
func (buf *Buffer) Flush(numberOfBytes int) {
	buffered := len(buf.buffer)

	switch {
	case numberOfBytes == 0:
		return

	case numberOfBytes > buffered:
		panic(fmt.Sprintf(
			"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
				"exceeds size of the buffer (%d)", numberOfBytes, buffered))

	case numberOfBytes == buffered:
		// Everything is flushed: restart at the beginning of the store,
		// so its full capacity is available again.
		buf.buffer = buf.store[:0]
		buf.errOffset = 0

	default:
		buf.buffer = buf.buffer[numberOfBytes:]
		// Keep a stored read error at the same position in the input.
		if buf.err != nil {
			buf.errOffset -= numberOfBytes
		}
	}
}