158 lines
5.4 KiB
Go
158 lines
5.4 KiB
Go
// Package reader provides a buffered Reader that wraps around an io.Reader.
|
|
//
|
|
// Functionally, it provides an input buffer in the form of a sliding window.
|
|
// Let's say we've got the following input coming up in the io.Reader that is
|
|
// wrapped by the Reader:
|
|
//
|
|
// |H|e|l|l|o|,| |w|o|r|l|d|!| <-- runes
|
|
// 0 6 12 <-- rune offset
|
|
//
|
|
// The Reader can now be used to retrieve runes from the input, based on their
|
|
// offset, using RuneAt(offset). Normally these runes will be retrieved in
|
|
// sequence, but that is not a requirement. Let's say we retrieve the rune with
|
|
// offset 6 from the input (the 'w'), then the Reader buffer will be filled with runes
|
|
// from the io.Reader until there are enough runes available to return the rune
|
|
// for offset 6:
|
|
//
|
|
// |H|e|l|l|o| |w|
|
|
// 0 6
|
|
//
|
|
// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
|
|
// in the Reader buffer, then the buffered rune is returned. If you request one
|
|
// that is not in the buffer, then the buffer will be expanded.
|
|
//
|
|
// To make this into a sliding window, the Reader provides the method
|
|
// Flush(numberOfRunes). This method will drop the provided number of runes from
|
|
// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
|
|
// then the Reader buffer would become:
|
|
//
|
|
// |l|o| |w|
|
|
// 0 3
|
|
//
|
|
// Note that the offset for the first rune 'l' in the buffer is now 0.
|
|
// You can consider the input to be changed in a similar way:
|
|
//
|
|
// |l|o|,| |w|o|r|l|d|!|
|
|
// 0 6 9
|
|
//
|
|
// So after a flush, the first upcoming rune after the flushed runes
|
|
// will always have index 0.
|
|
package reader
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// Reader wraps around a bufio.Reader and provides an additional layer of
// buffering that allows us to read the same runes over and over again.
// This is useful for implementing a parser that must be able to do lookahead
// on the input, returning to the original input position after finishing
// that lookahead.
//
// To minimize memory use, it is also possible to flush the read buffer when
// there is no more need to go back to previously read runes.
//
// The parsekit.reader.Reader is used internally by parsekit.TokenAPI.
type Reader struct {
	// bufio is the underlying buffered reader; runes are pulled from it
	// using ReadRune().
	bufio *bufio.Reader

	// buffer is the input buffer, holding the runes that were read from
	// the input and that have not been flushed yet.
	buffer []rune

	// bufferOffset is the offset of the buffer, relative to the start of
	// the input. bufferLen is the number of runes currently in the buffer.
	bufferOffset, bufferLen int

	// err is the read error, if one occurred.
	err error

	// errOffset is the offset in the buffer at which the read error was
	// encountered.
	errOffset int
}
|
|
|
|
// New initializes a new reader struct, wrapped around the provided input.
|
|
//
|
|
// The input can be any one of the following types:
|
|
// - string
|
|
// - type implementing io.Reader
|
|
// - bufio.Reader
|
|
func New(input interface{}) *Reader {
|
|
return &Reader{
|
|
bufio: makeBufioReader(input),
|
|
buffer: []rune{},
|
|
}
|
|
}
|
|
|
|
// makeBufioReader turns the provided input into a *bufio.Reader.
//
// Supported inputs are:
//   - string: wrapped in a strings.Reader, which is then buffered
//   - *bufio.Reader: returned as-is, without an extra buffering layer
//   - bufio.Reader: a pointer to the (copied) value is returned
//   - io.Reader: wrapped in a new bufio.Reader
//
// Any other input type results in a panic.
func makeBufioReader(input interface{}) *bufio.Reader {
	switch src := input.(type) {
	case string:
		text := strings.NewReader(src)
		return bufio.NewReader(text)
	case *bufio.Reader:
		// Must be matched before io.Reader, since *bufio.Reader
		// implements that interface as well.
		return src
	case bufio.Reader:
		return &src
	case io.Reader:
		return bufio.NewReader(src)
	}
	panic(fmt.Sprintf("parsekit.reader.New(): no support for input of type %T", input))
}
|
|
|
|
// RuneAt reads the rune at the provided rune offset.
|
|
//
|
|
// This offset is relative to the current starting position of the buffer in
|
|
// the reader. When starting reading, offset 0 will point at the start of the
|
|
// input. After flushing, offset 0 will point at the input up to where
|
|
// the flush was done.
|
|
//
|
|
// The error return value will be nil when reading was successful.
|
|
// When an invalid rune is encountered on the input, the error will be nil,
|
|
// but the rune will be utf8.RuneError
|
|
//
|
|
// When reading failed, the rune will be utf8.RuneError and the error will
|
|
// be not nil. One special read fail is actually a normal situation: end
|
|
// of file reached. In that case, the returned error wille be io.EOF.
|
|
func (r *Reader) RuneAt(offset int) (rune, error) {
|
|
// Re-issue a previously seen read error.
|
|
if r.err != nil && offset >= r.errOffset {
|
|
return utf8.RuneError, r.err
|
|
}
|
|
|
|
// Rune at provided offset is not yet available in the input buffer.
|
|
// Read runes until we have enough runes to satisfy the offset.
|
|
for r.bufferLen <= offset {
|
|
readRune, _, err := r.bufio.ReadRune()
|
|
|
|
// Handle errors.
|
|
if err != nil {
|
|
r.err = err
|
|
r.errOffset = r.bufferLen
|
|
return utf8.RuneError, err
|
|
}
|
|
|
|
// Skip BOM.
|
|
if readRune == '\uFEFF' && r.bufferOffset == 0 {
|
|
r.bufferOffset++
|
|
continue
|
|
}
|
|
|
|
r.buffer = append(r.buffer, readRune)
|
|
r.bufferLen++
|
|
}
|
|
return r.buffer[offset], nil
|
|
}
|
|
|
|
// Flush deletes the provided number of runes from the start of the
|
|
// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
|
|
// will point to the rune that comes after the flushed runes.
|
|
// So what this basically does is turn the Reader into a sliding window.
|
|
func (r *Reader) Flush(numberOfRunes int) {
|
|
if numberOfRunes > r.bufferLen {
|
|
panic(fmt.Sprintf(
|
|
"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
|
|
"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
|
|
}
|
|
r.bufferOffset += numberOfRunes
|
|
r.bufferLen -= numberOfRunes
|
|
r.buffer = r.buffer[numberOfRunes:]
|
|
if r.err != nil {
|
|
r.errOffset -= numberOfRunes
|
|
}
|
|
}
|