// Package read provides a buffered input reader that is used to feed data to the tokenizer. // // Functionally, it provides an input buffer in the form of a sliding window. // Let's say we've got the following input coming up in the io.Reader that is // wrapped by the Reader: // // |H|e|l|l|o|,| |w|o|r|l|d|!| <-- bytes // 0 6 12 <-- byte offset // // The Reader can now be used to retrieve data from the input, based on their // byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data // will be retrieved in sequence by the user of this code, but that is not a // requirement. Let's say we right away ask to retrieve the byte with offset 6 // from the input (the 'w'). The Reader buffer will then be filled with at // least 6 bytes and the bytes at offset 6 will be returned. // // Note: the actual Reader would not stop after reading the byte at offset 6. // For performance reasons, it would read as much data into the available buffer // space as possible (but no more than the available space). // // |H|e|l|l|o| |w| // 0 6 // // This means that you can retrieve data for arbitrary offsets. If you request // offsets that are already in the Reader buffer, then the buffered data are // returned. If you request an offset that is not available in the buffer, then // the buffer will be expanded. // // To make this into a sliding window (which preserves memory space while scanning // the input data), the Reader provides the method Flush(numberOfBytes). // This method will drop the provided number of bytes from the Reader buffer. // So when we'd do a Flush(3) on the example buffer from above, then the Reader // buffer would become: // // |l|o| |w| // 0 3 // // Note that the offset for the first rune 'l' in the buffer is now 0. // You can consider the complete input to be changed in a similar way: // // |l|o|,| |w|o|r|l|d|!| // 0 6 9 // // So after a flush, the first upcoming rune after the flushed runes // will always be at offset 0. package read import ( "bufio" "errors" "fmt" "io" "strings" "unicode/utf8" ) // New initializes a new Buffer struct, wrapped around the provided input. // // The input can be any one of the following types: // // • string // // • a type implementing io.Reader // // • bufio.Reader func New(input interface{}) Buffer { return Buffer{ bufio: makeBufioReader(input), } } func makeBufioReader(input interface{}) *bufio.Reader { switch input := input.(type) { case bufio.Reader: return &input case *bufio.Reader: return input case io.Reader: return bufio.NewReader(input) case string: return bufio.NewReader(strings.NewReader(input)) default: panic(fmt.Sprintf("parsekit.read.New(): no support for input of type %T", input)) } } // Buffer wraps around a bufio.Reader and provides an additional layer of // buffering that allows us to read the same data over and over again. // This is useful for implementing a parser that must be able to do lookahead // on the input, returning to the original input position after finishing // that lookahead). // // To minimize memory use, it is also possible to flush the read buffer when there is // no more need to go back to previously read data. // // This buffer is used internally by tokenize.API. type Buffer struct { bufio *bufio.Reader // used for ReadRune() buffer []byte // input buffer, holding bytes that were read from input cap int // the full buffer capacity start int // the offset from where on to read buffered data in the buffer len int // the length of the buffered data err error // a read error, if one occurred errOffset int // the offset in the buffer at which the read error was encountered } // RuneAt reads the rune at the provided byte offset. // // The byte offset is relative to the current starting position of the Buffer. // When starting reading, offset 0 will point at the start of the input. // After flushing some bytes, offset 0 will point at the input up to where // the flush was done. // // When reading was successful, the rune and the width of the rune in bytes // will be returned. The returned error will be nil. // When an invalid UTF8 rune is encountered on the input, the error will be nil, // but the rune will be utf8.RuneError // // When reading failed, the rune will be utf8.RuneError and the error will // be not nil. One special read fail is actually a normal situation: end // of file reached. In that case, the returned error wille be io.EOF. // // Once a read error is encountered, that same read error will guaranteed // be return on every subsequent read at or beyond the provided offset. func (buf *Buffer) RuneAt(offset int) (rune, int, error) { if buf.len < offset+utf8.MaxRune && buf.err == nil { buf.fill(offset + utf8.UTFMax) } if buf.err != nil && offset >= buf.errOffset { return utf8.RuneError, 0, buf.err } r, w := utf8.DecodeRune(buf.buffer[buf.start+offset:]) return r, w, nil } // ByteAt reads the byte at the provided byte offset. // // The byte offset is relative to the current starting position of the Buffer. // When starting reading, offset 0 will point at the start of the input. // After flushing, offset 0 will point at the input up to where the flush // was done. // // When reading was successful, the byte will be returned. The returned // error will be nil. // // When reading failed, the byte will be 0x00 and the error will // not be nil. One special read fail is actually a normal situation: end // of file reached. In that case, the returned error wille be io.EOF. // // Once a read error is encountered, that same read error will guaranteed // be return on every subsequent read at or beyond the provided offset. func (buf *Buffer) ByteAt(offset int) (byte, error) { if buf.len < offset+1 && buf.err == nil { buf.fill(offset + 1) } if buf.err != nil && offset >= buf.errOffset { return 0, buf.err } return buf.buffer[buf.start+offset], nil } // BytesAt reads at max the provided number of bytes at the provided byte offset. // // The byte offset is relative to the current starting position of the Buffer. // When starting reading, offset 0 will point at the start of the input. // After flushing, offset 0 will point at the input up to where the flush // was done. // // When reading was successful, the byte will be returned. The returned // error will be nil. // // When reading failed, the returned byte slice might be empty, or it might // contain a part of the requsted bytes. The error will not be nil. // One special read fail is actually a normal situation: end // of file reached. In that case, the returned error wille be io.EOF. // // Once a read error is encountered, that same read error will guaranteed // be return on every subsequent read at or beyond the provided offset. func (buf *Buffer) BytesAt(offset int, count int) ([]byte, error) { if buf.len < offset+count && buf.err == nil { buf.fill(offset + count) } if buf.err != nil && offset+count > buf.errOffset { return buf.buffer[buf.start+offset : buf.start+buf.errOffset], buf.err } return buf.buffer[buf.start+offset : buf.start+offset+count], nil } func (buf *Buffer) BufferedBytesAt(offset int) ([]byte, error) { if buf.len < offset+1 && buf.err == nil { buf.fill(offset + 1) } if buf.err != nil { return buf.buffer[buf.start+offset : buf.start+buf.errOffset], buf.err } return buf.buffer[buf.start+offset : buf.start+buf.len], nil } func (buf *Buffer) fill(minBytes int) { // Grow the buffer so it can contain at least the number of requested bytes. if minBytes > buf.cap-buf.start { buf.grow(minBytes) } // Try to fill the buffer completely with data from our source. // This is more efficient than only filling the data up to the point where // we can read the data at the 'minBytes' position. Ideally, the buffer is // filled completely with data to work with. availableLen := buf.cap - buf.start for buf.len < availableLen { // Read bytes from our source, and append them to the end of the // current buffer data. n, err := buf.bufio.Read(buf.buffer[buf.len:buf.cap]) buf.len += n if err != nil { buf.err = err buf.errOffset = buf.len break } } } const defaultBufferSize = 1024 const runeCacheSize = 128 // ErrTooLarge is passed to panic if memory cannot be allocated to store data in a buffer. var ErrTooLarge = errors.New("parsekit.read.Buffer: too large") // grow grows the buffer to guarantee space for at least the requested amount // of bytes, either shifting data around or reallocating the buffer. // When reallocating, the new buffer size will always be a multitude of the // default buffer size. func (buf *Buffer) grow(minBytes int) { // When possible, grow the buffer by moving the data to the start of // the buffer, freeing up extra capacity at the end. if buf.start > 0 && minBytes <= buf.cap { copy(buf.buffer, buf.buffer[buf.start:buf.start+buf.len]) buf.start = 0 return } // Grow the buffer store by allocating a new one and copying the data. newbufCap := (minBytes / defaultBufferSize) * defaultBufferSize if minBytes%defaultBufferSize > 0 { newbufCap += defaultBufferSize } newStore := makeSlice(newbufCap) copy(newStore, buf.buffer[buf.start:buf.start+buf.len]) buf.buffer = newStore buf.start = 0 buf.cap = newbufCap return } // makeSlice allocates a slice of size n. If the allocation fails, it panics // with ErrTooLarge. func makeSlice(c int) []byte { // If the make fails, give a known error. defer func() { if recover() != nil { panic(ErrTooLarge) } }() return make([]byte, c) } // Flush deletes the provided number of bytes from the start of the Buffer. // After flushing the Buffer, offset 0 as used by RuneAt() and ByteAt() will // point to the first byte that came after the bytes that were flushed. func (buf *Buffer) Flush(numberOfBytes int) { if numberOfBytes == 0 { return } if numberOfBytes > buf.len { panic(fmt.Sprintf( "parsekit.read.Buffer.Flush(): number of bytes to flush (%d) "+ "exceeds size of the buffered data (%d)", numberOfBytes, buf.len)) } if buf.len == numberOfBytes { buf.len = 0 buf.start = 0 buf.errOffset = 0 return } if buf.err != nil { buf.errOffset -= numberOfBytes } buf.start += numberOfBytes buf.len -= numberOfBytes }