// Package read provides a buffered input reader that is used to feed data to the tokenizer.
//
// Functionally, it provides an input buffer in the form of a sliding window.
// Let's say we've got the following input coming up in the io.Reader that is
// wrapped by the Reader:
//
//    |H|e|l|l|o|,| |w|o|r|l|d|!|   <-- runes
//     0           6           12   <-- rune offset
//
// The Reader can now be used to retrieve runes from the input, based on their
// offset, using RuneAt(offset). Normally these runes will be retrieved in
// sequence, but that is not a requirement. Let's say we retrieve the rune with
// offset 6 from the input (the 'w'), then the Reader buffer will be filled with
// runes from the io.Reader until there are enough runes available to return the
// rune for offset 6:
//
//    |H|e|l|l|o| |w|
//     0           6
//
// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
// in the Reader buffer, then the buffered rune is returned. If you request one
// that is not in the buffer, then the buffer will be expanded.
//
// To make this into a sliding window, the Reader provides the method
// Flush(numberOfRunes). This method will drop the provided number of runes from
// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
// then the Reader buffer would become:
//
//    |l|o| |w|
//     0     3
//
// Note that the offset for the first rune 'l' in the buffer is now 0.
// You can consider the input to be changed in a similar way:
//
//    |l|o|,| |w|o|r|l|d|!|
//     0           6     9
//
// So after a flush, the first upcoming rune after the flushed runes
// will always be at offset 0.
package read

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strings"
	"unicode/utf8"
)

// New initializes a new Buffer struct, wrapped around the provided input.
//
// The input can be any one of the following types:
//
//   - string
//   - a type implementing io.Reader
//   - bufio.Reader or *bufio.Reader
//
// New panics when the input is of any other type.
func New(input interface{}) *Buffer {
	return &Buffer{
		bufio: makeBufioReader(input),
	}
}

// makeBufioReader turns the provided input into a *bufio.Reader.
//
// A *bufio.Reader is used as-is, a bufio.Reader value is used through a
// pointer to the copy held by the type switch, and an io.Reader or string is
// wrapped in a new bufio.Reader. Any other input type results in a panic.
func makeBufioReader(input interface{}) *bufio.Reader {
	switch input := input.(type) {
	case bufio.Reader:
		return &input
	case *bufio.Reader:
		return input
	case io.Reader:
		return bufio.NewReader(input)
	case string:
		return bufio.NewReader(strings.NewReader(input))
	default:
		panic(fmt.Sprintf("parsekit.read.New(): no support for input of type %T", input))
	}
}

// Buffer wraps around a bufio.Reader and provides an additional layer of
// buffering that allows us to read the same runes over and over again.
// This is useful for implementing a parser that must be able to do lookahead
// on the input, returning to the original input position after finishing
// that lookahead.
//
// To minimize memory use, it is also possible to flush the read buffer when
// there is no more need to go back to previously read runes.
//
// The Buffer is used internally by tokenize.API.
type Buffer struct {
	bufio         *bufio.Reader // used for ReadRune()
	store         []rune        // buffer store, the buffer field is a slice on top of this one
	buffer        []rune        // input buffer, holding runes that were read from input
	err           error         // a read error, if one occurred
	errOffset     int           // the offset in the buffer at which the read error was encountered
	firstReadDone bool          // whether or not the first read was done
}

// RuneAt reads the rune at the provided rune offset.
//
// This offset is relative to the current starting position of the Buffer.
// When starting reading, offset 0 will point at the start of the input.
// After flushing, offset 0 will point at the first rune that follows the
// flushed runes.
//
// The error return value will be nil when reading was successful.
// When an invalid rune is encountered on the input, the error will be nil,
// but the rune will be utf8.RuneError.
//
// When reading failed, the rune will be utf8.RuneError and the error will
// be non-nil. One special read failure is actually a normal situation: end
// of file reached. In that case, the returned error will be io.EOF.
//
// Once a read error is encountered, that same read error is guaranteed to
// be returned on every subsequent read at or beyond the offset at which the
// error occurred.
//
// A byte order mark (U+FEFF) that is the very first rune of the input is
// skipped and does not take up an offset.
func (r *Buffer) RuneAt(offset int) (rune, error) {
	// Re-issue a previously seen read error.
	if r.err != nil && offset >= r.errOffset {
		return utf8.RuneError, r.err
	}

	// Number of runes that must be added to the buffer to make the rune at
	// the requested offset available.
	l := len(r.buffer)
	n := offset - l + 1
	if n <= 0 {
		// The rune is already buffered.
		return r.buffer[offset], nil
	}

	// Rune at provided offset is not yet available in the input buffer.
	// Read runes until we have enough runes to satisfy the offset.
	r.grow(n)
	var readRune rune
	var err error
	for writeAt := l; writeAt <= offset; writeAt++ {
		readRune, _, err = r.bufio.ReadRune()

		// Skip BOM. Only the very first rune of the input is inspected;
		// the write position is rewound so the next rune takes its slot.
		if !r.firstReadDone {
			r.firstReadDone = true
			if readRune == '\uFEFF' {
				writeAt--
				continue
			}
		}

		// Handle errors. The error and its offset are remembered, so reads
		// at or beyond this offset keep returning the same error. The buffer
		// slots from writeAt on stay unfilled, but they are shielded by the
		// errOffset check at the top of this method.
		if err != nil {
			r.err = err
			r.errOffset = writeAt
			return utf8.RuneError, err
		}

		r.buffer[writeAt] = readRune
	}
	return readRune, nil
}

// The upcoming code was inspired heavily by the Go built-in 'bytes' package.

// smallBufferSize is an initial allocation minimal capacity.
const smallBufferSize = 64

// ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer.
var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")

// grow extends the buffer by n runes, so that len(r.buffer) increases by n.
// The runes that are already in the buffer are preserved; the contents of the
// n added slots are unspecified and must be written by the caller.
// If the buffer can't grow it will panic with ErrTooLarge.
func (r *Buffer) grow(n int) {
	// Instantiate new buffer store.
	if r.store == nil {
		b := smallBufferSize
		if b < n {
			b = n
		}
		r.store = make([]rune, 0, b)
		r.buffer = r.store[:n]
		return
	}

	lenBuffer := len(r.buffer)
	capBuffer := cap(r.buffer)
	freeBuffer := capBuffer - lenBuffer
	newSize := lenBuffer + n

	// Grow the buffer by reslicing within the available capacity.
	if freeBuffer >= n {
		r.buffer = r.buffer[:newSize]
		return
	}

	capStore := cap(r.store)
	freeAtStartOfStore := capStore - capBuffer

	// Grow the buffer by moving the data to the start of the store. Space at
	// the start of the store becomes available when Flush() slices runes off
	// the front of the buffer.
	// Note: according to the spec, overlapping slices are allowed with copy().
	if freeAtStartOfStore > 0 && newSize <= capStore {
		r.store = r.store[0:newSize]
		copy(r.store, r.buffer)
		r.buffer = r.store[:newSize]
		r.store = r.store[:0]
		return
	}

	// Grow the buffer store by allocating a new one and copying the data.
	// makeSlice returns a zero-length slice, and copy() transfers no more
	// than len(dst) elements, so the destination must be resliced to the
	// number of buffered runes; otherwise nothing would be copied and the
	// buffered runes would be lost.
	buf := makeSlice(2*capStore + n)
	copy(buf[:lenBuffer], r.buffer)
	r.store = buf
	r.buffer = r.store[:newSize]
}

// makeSlice allocates an empty rune slice (length 0) with capacity n.
// If the allocation fails, it panics with ErrTooLarge.
func makeSlice(n int) []rune {
	// If the make fails, give a known error.
	defer func() {
		if recover() != nil {
			panic(ErrTooLarge)
		}
	}()
	return make([]rune, 0, n)
}

// Flush deletes the provided number of runes from the start of the Buffer.
// After flushing the Buffer, offset 0 as used by RuneAt() will point to
// the rune that comes after the runes that were flushed.
// So what this basically does, is turn the Buffer into a sliding window.
//
// Flush panics when numberOfRunes exceeds the number of runes in the buffer.
func (r *Buffer) Flush(numberOfRunes int) {
	l := len(r.buffer)
	if numberOfRunes > l {
		panic(fmt.Sprintf(
			"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
				"exceeds size of the buffer (%d)", numberOfRunes, l))
	}
	if numberOfRunes == 0 {
		return
	}

	// Everything is flushed: restart at the beginning of the store, so its
	// full capacity can be reused.
	if l == numberOfRunes {
		r.buffer = r.store[:0]
		r.errOffset = 0
		return
	}

	// Partial flush: slide the window forward and keep the offset of a
	// stored read error relative to the new start of the buffer.
	r.buffer = r.buffer[numberOfRunes:]
	if r.err != nil {
		r.errOffset = r.errOffset - numberOfRunes
	}
}