Bytes reader working, now carry on switching to byte reading in the tokenizer code.

This commit is contained in:
Maurice Makaay 2019-07-15 20:03:05 +00:00
parent 17935b7534
commit d4492e4f0a
5 changed files with 331 additions and 236 deletions

View File

@ -4,27 +4,29 @@
// Let's say we've got the following input coming up in the io.Reader that is
// wrapped by the Reader:
//
// |H|e|l|l|o|,| |w|o|r|l|d|!| <-- runes
// 0 6 12 <-- rune offset
// |H|e|l|l|o|,| |w|o|r|l|d|!| <-- bytes
// 0 6 12 <-- byte offset
//
// The Reader can now be used to retrieve runes from the input, based on their
// offset, using RuneAt(offset). Normally these runes will be retrieved in
// sequence, but that is not a requirement. Let's say we retrieve the rune with
// offset 6 from the input (the 'w'), then the Reader buffer be filled with runes
// from the io.Reader until there are enough runes available to return the rune
// for offset 6:
// The Reader can now be used to retrieve data from the input, based on their
// byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data
// will be retrieved in sequence by the user of this code, but that is not a
// requirement. Let's say we retrieve the byte with offset 6 from the input
// (the 'w'), then the Reader buffer will be filled with bytes from the
// io.Reader until there are enough bytes available to return the byte for
// offset 6:
//
// |H|e|l|l|o| |w|
// 0 6
//
// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
// in the Reader buffer, then the buffered rune is returned. If you request one
// that is not in the buffer, then the buffer will be expanded.
// This means that you can retrieve data for arbitrary offsets. If you request
// an offset that is already in the Reader buffer, then the buffered data are
// returned. If you request one that is not in the buffer, then the buffer will
// be expanded.
//
// To make this into a sliding window, the Reader provides the method
// Flush(numberOfRunes). This method will drop the provided number of runes from
// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
// then the Reader buffer would become:
// To make this into a sliding window (preserving memory space while scanning
// the input data), the Reader provides the method Flush(numberOfBytes).
// This method will drop the provided number of bytes from the Reader buffer.
// So when we'd do a Flush(3) on the example buffer from above, then the Reader
// buffer would become:
//
// |l|o| |w|
// 0 3
@ -79,32 +81,33 @@ func makeBufioReader(input interface{}) *bufio.Reader {
}
// Buffer wraps around a bufio.Reader and provides an additional layer of
// buffering that allows us to read the same runes over and over again.
// buffering that allows us to read the same data over and over again.
// This is useful for implementing a parser that must be able to do lookahead
// on the input, returning to the original input position after finishing
// that lookahead.
//
// To minimze memory use, it is also possible to flush the read buffer when there is
// no more need to go back to previously read runes.
// To minimize memory use, it is also possible to flush the read buffer when there is
// no more need to go back to previously read data.
//
// The parserkit.reader.Reader is used internally by tokenize.API.
// This parserkit.reader.Reader is used internally by tokenize.API.
type Buffer struct {
bufio *bufio.Reader // used for ReadRune()
store []rune // buffer store, the buffer field is a slice on top of this one
buffer []rune // input buffer, holding runes that were read from input
err error // a read error, if one occurred
errOffset int // the offset in the buffer at which the read error was encountered
firstReadDone bool // whether or not the first read was done
bufio *bufio.Reader // used for ReadRune()
store []byte // buffer store, the buffer field is a slice on top of this one
buffer []byte // input buffer, holding runes that were read from input
err error // a read error, if one occurred
errOffset int // the offset in the buffer at which the read error was encountered
}
// RuneAt reads the rune at the provided rune offset.
// RuneAt reads the rune at the provided byte offset.
//
// This offset is relative to the current starting position of the Buffer.
// The offset is relative to the current starting position of the Buffer.
// When starting reading, offset 0 will point at the start of the input.
// After flushing, offset 0 will point at the input up to where the flush was done.
// After flushing, offset 0 will point at the input up to where the flush
// was done.
//
// The error return value will be nil when reading was successful.
// When an invalid rune is encountered on the input, the error will be nil,
// When reading was successful, the rune and the width of the rune in bytes
// will be returned. The returned error will be nil.
// When an invalid UTF8 rune is encountered on the input, the error will be nil,
// but the rune will be utf8.RuneError
//
// When reading failed, the rune will be utf8.RuneError and the error will
@ -113,48 +116,83 @@ type Buffer struct {
//
// Once a read error is encountered, that same read error is guaranteed
// to be returned on every subsequent read at or beyond the provided offset.
func (r *Buffer) RuneAt(offset int) (rune, error) {
// Re-issue a previously seen read error.
if r.err != nil && offset >= r.errOffset {
return utf8.RuneError, r.err
func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
// Shortcut: re-issue a previously seen read error.
if buf.err != nil && offset >= buf.errOffset {
return utf8.RuneError, 0, buf.err
}
// Rune at provided offset is not yet available in the input buffer.
// Read runes until we have enough runes to satisfy the offset.
l := len(r.buffer)
// Number of runes to add to the buffer to have enough space to store
// the rune at the offset
n := offset - l + 1
if n > 0 {
r.grow(n)
var readRune rune
var err error
for writeAt := l; writeAt <= offset; writeAt++ {
readRune, _, err = r.bufio.ReadRune()
// Skip BOM.
if !r.firstReadDone {
r.firstReadDone = true
if readRune == '\uFEFF' {
writeAt--
continue
}
}
// Handle errors.
// Compute the number of bytes that we need in the buffer to be able
// to return the rune at the provided byte offset.
bufferLen := len(buf.buffer)
requiredLen := offset + utf8.UTFMax
if requiredLen > bufferLen && buf.err == nil {
buf.grow(requiredLen)
for writeAt := bufferLen; writeAt < requiredLen; writeAt++ {
b, err := buf.bufio.ReadByte()
if err != nil {
r.err = err
r.errOffset = writeAt
return utf8.RuneError, err
buf.err = err
buf.errOffset = writeAt
buf.buffer = buf.buffer[:writeAt]
break
}
r.buffer[writeAt] = readRune
buf.buffer[writeAt] = b
}
return readRune, nil
}
return r.buffer[offset], nil
if buf.err != nil && offset >= buf.errOffset {
return utf8.RuneError, 0, buf.err
}
r, w := utf8.DecodeRune(buf.buffer[offset:])
return r, w, nil
}
// ByteAt returns the byte that is found at the given byte offset.
//
// Offsets are counted from the current start of the Buffer. Before any
// flush, offset 0 is the first byte of the input. After a flush, offset 0
// is the first byte that was not flushed.
//
// On success, the byte is returned together with a nil error.
//
// On failure, the returned byte is 0x00 and the error is non-nil. Reaching
// the end of the input is a normal situation that is reported in the same
// way: in that case the returned error will be io.EOF.
//
// A read error is remembered, together with the offset at which it was
// encountered. Every subsequent read at or beyond that offset is guaranteed
// to return that same error.
func (buf *Buffer) ByteAt(offset int) (byte, error) {
	// A previously recorded read error also applies to this offset.
	if buf.err != nil && offset >= buf.errOffset {
		return 0, buf.err
	}

	// Make sure the buffer holds all bytes up to and including the one at
	// the requested offset. No more reads are attempted once a read error
	// has been recorded.
	have, need := len(buf.buffer), offset+1
	if buf.err == nil && need > have {
		buf.grow(need)
		for pos := have; pos < need; pos++ {
			b, err := buf.bufio.ReadByte()
			if err != nil {
				// Remember the error and drop the part of the buffer
				// that could not be filled with input data.
				buf.err, buf.errOffset = err, pos
				buf.buffer = buf.buffer[:pos]
				return 0, err
			}
			buf.buffer[pos] = b
		}
	}

	return buf.buffer[offset], nil
}
// The upcoming code was inspired heavily by the Go built-in 'bytes' package.
@ -168,82 +206,80 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
// grow grows the buffer to guarantee space for n more bytes.
// It returns the index where bytes should be written.
// If the buffer can't grow it will panic with ErrTooLarge.
func (r *Buffer) grow(n int) {
func (buf *Buffer) grow(requiredSize int) {
// Instantiate new buffer store
if r.store == nil {
if buf.store == nil {
b := smallBufferSize
if b < n {
b = n
if b < requiredSize {
b = requiredSize
}
r.store = make([]rune, 0, b)
r.buffer = r.store[:n]
buf.store = make([]byte, 0, b)
buf.buffer = buf.store[:requiredSize]
return
}
lenBuffer := len(r.buffer)
capBuffer := cap(r.buffer)
freeBuffer := capBuffer - lenBuffer
newSize := lenBuffer + n
capBuffer := cap(buf.buffer)
// Grow the buffer store by reslicing within the available capacity.
if freeBuffer >= n {
r.buffer = r.buffer[:newSize]
if capBuffer >= requiredSize {
buf.buffer = buf.buffer[:requiredSize]
return
}
capStore := cap(r.store)
capStore := cap(buf.store)
freeAtStartOfStore := capStore - capBuffer
// Grow the buffer by moving the data to the start of the store.
// Note: according to the spec, overlapping slices are allowed with copy().
if freeAtStartOfStore > 0 && newSize <= capStore {
r.store = r.store[0:newSize]
copy(r.store, r.buffer)
r.buffer = r.store[:newSize]
r.store = r.store[:0]
if freeAtStartOfStore > 0 && requiredSize <= capStore {
buf.store = buf.store[0:requiredSize]
copy(buf.store, buf.buffer)
buf.buffer = buf.store[:requiredSize]
buf.store = buf.store[:0]
return
}
// Grow the buffer store by allocating a new one and copying the data.
buf := makeSlice(2*capStore + n)
copy(buf, r.buffer)
r.store = buf
r.buffer = r.store[:newSize]
newStore := makeSlice(2*capStore + requiredSize)
copy(newStore, buf.buffer)
buf.store = newStore
buf.buffer = buf.store[:requiredSize]
}
// makeSlice allocates a slice of size n. If the allocation fails, it panics
// with ErrTooLarge.
func makeSlice(n int) []rune {
func makeSlice(n int) []byte {
// If the make fails, give a known error.
defer func() {
if recover() != nil {
panic(ErrTooLarge)
}
}()
return make([]rune, 0, n)
return make([]byte, 0, n)
}
// Flush deletes the provided number of runes from the start of the Buffer.
// Flush deletes the provided number of bytes from the start of the Buffer.
// After flushing the Buffer, offset 0 as used by RuneAt() and ByteAt() will
// point to the byte that comes after the bytes that were flushed.
// So what this basically does, is turn the Buffer into a sliding window.
func (r *Buffer) Flush(numberOfRunes int) {
l := len(r.buffer)
if numberOfRunes > l {
func (buf *Buffer) Flush(numberOfBytes int) {
if numberOfBytes == 0 {
return
}
bufferLen := len(buf.buffer)
if numberOfBytes > bufferLen {
panic(fmt.Sprintf(
"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
"exceeds size of the buffer (%d)", numberOfRunes, l))
"exceeds size of the buffer (%d)", numberOfBytes, bufferLen))
}
if numberOfRunes == 0 {
if bufferLen == numberOfBytes {
buf.buffer = buf.store[:0]
buf.errOffset = 0
return
}
if l == numberOfRunes {
r.buffer = r.store[:0]
r.errOffset = 0
return
}
r.buffer = r.buffer[numberOfRunes:]
if r.err != nil {
r.errOffset = r.errOffset - numberOfRunes
buf.buffer = buf.buffer[numberOfBytes:]
if buf.err != nil {
buf.errOffset = buf.errOffset - numberOfBytes
}
}

View File

@ -12,8 +12,8 @@ import (
func ExampleNew() {
printFirstRuneOf := func(input interface{}) {
r := New(input)
c, _ := r.RuneAt(0)
fmt.Printf("%q\n", c)
c, w, _ := r.RuneAt(0)
fmt.Printf("rune %q, width %d\n", c, w)
}
simpleString := "Hello, world!"
@ -25,14 +25,14 @@ func ExampleNew() {
bufioReaderPointer := bufio.NewReader(strings.NewReader("Where do we go, world?"))
printFirstRuneOf(bufioReaderPointer)
bufioReaderValue := *(bufio.NewReader(strings.NewReader("Where do we go, world?")))
bufioReaderValue := *(bufio.NewReader(strings.NewReader("Ɍead the manual!")))
printFirstRuneOf(bufioReaderValue)
// Output:
// 'H'
// 'G'
// 'W'
// 'W'
// rune 'H', width 1
// rune 'G', width 1
// rune 'W', width 1
// rune 'Ɍ', width 2
}
func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
@ -46,13 +46,13 @@ func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
{"bufio.Reader", *(bufio.NewReader(strings.NewReader("Hello, world!")))},
} {
r := New(test.input)
firstRune, _ := r.RuneAt(0)
firstRune, _, _ := r.RuneAt(0)
if firstRune != 'H' {
t.Errorf("[%s] first rune not 'H'", test.name)
}
lastRune, _ := r.RuneAt(12)
lastRune, _, _ := r.RuneAt(12)
if lastRune != '!' {
t.Errorf("[%s] last rune not '!'", test.name)
t.Errorf("[%s] last rune not '!', but %q", test.name, lastRune)
}
}
}
@ -63,41 +63,63 @@ func TestNew_UnhandledInputType_Panics(t *testing.T) {
"parsekit.read.New(): no support for input of type int")
}
func TestBuffer_RuneAt(t *testing.T) {
func TestBuffer_ByteAt(t *testing.T) {
r := New(strings.NewReader("Hello, world!"))
at := func(i int) rune { r, _ := r.RuneAt(i); return r }
at := func(i int) byte { b, _ := r.ByteAt(i); return b }
// It is possible to go back and forth while reading the input.
result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
assertEqual(t, "H!wH", result)
}
func TestBuffer_RuneAt(t *testing.T) {
r := New(strings.NewReader("¡pןɹoʍ 'oןןǝH"))
at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
// It is possible to go back and forth while reading the input.
result := fmt.Sprintf("%c%c%c%c", at(0), at(5), at(8), at(0))
assertEqual(t, "¡ɹʍ¡", result)
}
// TestBuffer_ByteAt_endOfFile checks that reading at the first offset past
// the end of the input, as well as reading further beyond it, yields a 0x00
// byte together with io.EOF.
func TestBuffer_ByteAt_endOfFile(t *testing.T) {
	reader := New(strings.NewReader("Hello, world!"))

	// Offset 13 is directly after the last byte, offset 20 is well beyond it.
	for _, offset := range []int{13, 20} {
		b, err := reader.ByteAt(offset)
		result := fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
		assertEqual(t, "'\\x00' EOF true", result)
	}
}
func TestBuffer_RuneAt_endOfFile(t *testing.T) {
r := New(strings.NewReader("Hello, world!"))
rn, err := r.RuneAt(13)
rn, _, err := r.RuneAt(13)
result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
assertEqual(t, "'<27>' EOF true", result)
rn, err = r.RuneAt(20)
rn, _, err = r.RuneAt(20)
result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
assertEqual(t, "'<27>' EOF true", result)
}
func TestBuffer_RuneAt_invalidRune(t *testing.T) {
r := New(strings.NewReader("Hello, \xcdworld!"))
at := func(i int) rune { r, _ := r.RuneAt(i); return r }
at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
assertEqual(t, " <20>wo", result)
}
func ExampleBuffer_RuneAt() {
func ExampleBuffer_ByteAt() {
reader := New(strings.NewReader("Hello, world!"))
fmt.Printf("Runes: ")
for i := 0; ; i++ {
r, err := reader.RuneAt(i)
offset := 0
for {
r, err := reader.ByteAt(offset)
offset++
if err != nil {
fmt.Printf("\nErr: %s\n", err)
break
@ -110,18 +132,39 @@ func ExampleBuffer_RuneAt() {
// Err: EOF
}
func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
r := New(strings.NewReader("\uFEFFBommetje!"))
b, _ := r.RuneAt(0)
o, _ := r.RuneAt(1)
m, _ := r.RuneAt(2)
bom := fmt.Sprintf("%c%c%c", b, o, m)
assertEqual(t, "Bom", bom)
func ExampleBuffer_RuneAt() {
reader := New(strings.NewReader("Hello, pןɹoʍ!"))
fmt.Printf("Runes: ")
offset := 0
for {
r, w, err := reader.RuneAt(offset)
offset += w
if err != nil {
fmt.Printf("\nErr: %s\n", err)
break
}
fmt.Printf("%c", r)
}
// Output:
// Runes: Hello, pןɹoʍ!
// Err: EOF
}
// TODO reimplement somewhere, maybe a separate call in the reader or should it be part of a parser?
// func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
// r := New(strings.NewReader("\uFEFFBommetje!"))
// b, _, _ := r.RuneAt(0)
// o, _, _ := r.RuneAt(1)
// m, _, _ := r.RuneAt(2)
// bom := fmt.Sprintf("%c%c%c", b, o, m)
// assertEqual(t, "Bom", bom)
// }
func TestBuffer_Flush(t *testing.T) {
r := New(strings.NewReader("Hello, world!"))
at := func(i int) rune { r, _ := r.RuneAt(i); return r }
at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
// Fills the buffer with the first 8 runes on the input: "Hello, w"
result := fmt.Sprintf("%c", at(7))
@ -138,7 +181,7 @@ func TestBuffer_Flush(t *testing.T) {
func ExampleBuffer_Flush() {
r := New(strings.NewReader("dog eat dog!"))
at := func(offset int) rune { c, _ := r.RuneAt(offset); return c }
at := func(offset int) rune { c, _, _ := r.RuneAt(offset); return c }
// Read from the first 4 runes of the input.
fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3))
@ -162,30 +205,31 @@ func ExampleBuffer_Flush() {
func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
r := New(strings.NewReader("Hello, world!"))
// Fill buffer with "Hello, worl", the first 11 runes.
r.RuneAt(10)
// Fill buffer with "Hello, world!", the first 13 runes.
rn, _, _ := r.RuneAt(12)
assertEqual(t, '!', rn)
// However, we flush 12 runes, which exceeds the buffer size.
// However, we flush 14 runes, which exceeds the buffer size.
assertPanic(t,
func() { r.Flush(12) },
func() { r.Flush(14) },
"parsekit.read.Buffer.Flush(): number of runes to flush "+
"(12) exceeds size of the buffer (11)")
"(14) exceeds size of the buffer (13)")
}
func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) {
r := New(strings.NewReader("Hello, world!"))
_, err := r.RuneAt(13)
_, _, err := r.RuneAt(13)
assertEqual(t, err.Error(), "EOF")
_, err = r.RuneAt(13)
_, _, err = r.RuneAt(13)
assertEqual(t, err.Error(), "EOF")
_, err = r.RuneAt(14)
_, _, err = r.RuneAt(14)
assertEqual(t, err.Error(), "EOF")
r.Flush(13)
_, err = r.RuneAt(0)
_, _, err = r.RuneAt(0)
assertEqual(t, err.Error(), "EOF")
_, err = r.RuneAt(1)
_, _, err = r.RuneAt(1)
assertEqual(t, err.Error(), "EOF")
_, err = r.RuneAt(2)
_, _, err = r.RuneAt(2)
assertEqual(t, err.Error(), "EOF")
}
@ -203,43 +247,43 @@ func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) {
r := New(input)
// Read the last available rune.
readRune, _ := r.RuneAt(3)
readRune, _, _ := r.RuneAt(3)
assertEqual(t, 'd', readRune)
// Reading the next offset must result in the io.EOF error from the stub.
readRune, err := r.RuneAt(4)
readRune, _, err := r.RuneAt(4)
assertEqual(t, utf8.RuneError, readRune)
assertEqual(t, io.EOF, err)
// Reading even further should yield the same io.EOF error.
readRune, err = r.RuneAt(5)
readRune, _, err = r.RuneAt(5)
assertEqual(t, utf8.RuneError, readRune)
assertEqual(t, io.EOF, err)
// After an error, we must still be able to read the last rune.
readRune, _ = r.RuneAt(3)
readRune, _, _ = r.RuneAt(3)
assertEqual(t, 'd', readRune)
// Flushing updates the error index too.
r.Flush(3)
// The last rune is now at offset 0.
readRune, _ = r.RuneAt(0)
readRune, _, _ = r.RuneAt(0)
assertEqual(t, 'd', readRune)
// The io.EOF is now at offset 1.
_, err = r.RuneAt(1)
_, _, err = r.RuneAt(1)
assertEqual(t, io.EOF, err)
// Let's flush that last rune too.
r.Flush(1)
// The io.EOF is now at offset 0.
_, err = r.RuneAt(0)
_, _, err = r.RuneAt(0)
assertEqual(t, io.EOF, err)
// And reading beyond that offset also yields io.EOF.
_, err = r.RuneAt(1)
_, _, err = r.RuneAt(1)
assertEqual(t, io.EOF, err)
}
@ -247,13 +291,13 @@ func TestInputLargerThanDefaultBufSize64(t *testing.T) {
input, size := makeLargeStubReader()
r := New(input)
readRune, err := r.RuneAt(0)
readRune, _, err := r.RuneAt(0)
assertEqual(t, 'X', readRune)
readRune, err = r.RuneAt(size - 1)
readRune, _, err = r.RuneAt(size - 1)
assertEqual(t, 'Y', readRune)
readRune, err = r.RuneAt(size)
readRune, _, err = r.RuneAt(size)
assertEqual(t, io.EOF, err)
readRune, err = r.RuneAt(10)
readRune, _, err = r.RuneAt(10)
assertEqual(t, 'X', readRune)
}
@ -261,9 +305,9 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *tes
input, size := makeLargeStubReader()
r := New(input)
readRune, _ := r.RuneAt(size - 200)
readRune, _, _ := r.RuneAt(size - 200)
assertEqual(t, 'X', readRune)
readRune, _ = r.RuneAt(size - 1)
readRune, _, _ = r.RuneAt(size - 1)
assertEqual(t, 'Y', readRune)
}
@ -271,7 +315,7 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
input, size := makeLargeStubReader()
r := New(input)
readRune, _ := r.RuneAt(size - 1)
readRune, _, _ := r.RuneAt(size - 1)
assertEqual(t, 'Y', readRune)
}
@ -282,12 +326,17 @@ func TestAllocationPatterns(t *testing.T) {
// The first read will create the standard cache.
// store |x 64 |
// buffer |x 64 |
assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 1, 64)
assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 4, 64)
// The first 64 reads will fit in the standard cache.
// The first 64 bytes will fit in the standard cache.
// store |xxxx64xxxxx|
// buffer |xxxx64xxxxx|
assertCache(t, "read fill cache", r, func() { r.RuneAt(63) }, 0, 64, 64, 64)
//
// Note: in the test offset 60 is used instead of offset 63, because
// RuneAt() will fill the buffer with 4 bytes to accommodate the
// longest UTF8 character encodings. In all upcoming tests, the same
// logic applies to the RuneAt() calls.
assertCache(t, "read fill cache", r, func() { r.RuneAt(60) }, 0, 64, 64, 64)
// Flushing zero input keeps everything as-is.
// store |xxxx64xxxxx|
@ -302,7 +351,7 @@ func TestAllocationPatterns(t *testing.T) {
// Reading 65 chars will allocate a new store of 2 * size + n.
// store |xxxxx65xxxxx 128 |
// buffer |xxxxx65xxxxx 128 |
assertCache(t, "read cap + 1", r, func() { r.RuneAt(64) }, 0, 65+128, 65, 65+128)
assertCache(t, "read cap + 1", r, func() { r.RuneAt(61) }, 0, 65+128, 65, 65+128)
// A partial flush frees the start of the store and moves
// the buffer slice.
@ -315,7 +364,7 @@ func TestAllocationPatterns(t *testing.T) {
// without a new allocation.
// store | 50 xxxxxxxxx143xxxxxxxx|
// buffer |xxxxxxxxx143xxxxxxxx|
assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(142) }, 0, 50+143, 143, 143)
assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(139) }, 0, 50+143, 143, 143)
// Flush the full input.
// store | 193 |
@ -325,7 +374,7 @@ func TestAllocationPatterns(t *testing.T) {
// Read a bit more than half the capacity.
// store |xxxxxx101xxxxxxxx 92 |
// buffer |xxxxxx101xxxxxxxx 92 |
assertCache(t, "read more than half the cap", r, func() { r.RuneAt(100) }, 0, 193, 101, 193)
assertCache(t, "read more than half the cap", r, func() { r.RuneAt(97) }, 0, 193, 101, 193)
// Then flush almost all input.
// store | 100 x1x 92 |
@ -337,7 +386,7 @@ func TestAllocationPatterns(t *testing.T) {
// store (where it fits), space is freed up for the read operation.
// store |xxxxx100xxxxxx 93 |
// buffer |xxxxx100xxxxxx 93 |
assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(99) }, 0, 193, 100, 193)
assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(96) }, 0, 193, 100, 193)
// Now flush only one rune from the cache.
// store |1 xxxx99xxxxx 93 |
@ -349,7 +398,7 @@ func TestAllocationPatterns(t *testing.T) {
// the data is moved to the start and no reallocation is needed.
// store |1 xxxx99xxxxx 93 |
// buffer |xxxx99xxxxx 93 |
assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(192) }, 0, 193, 193, 193)
assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(189) }, 0, 193, 193, 193)
}
func makeLargeStubReader() (*StubReader, int) {

View File

@ -70,15 +70,16 @@ import (
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
reader *read.Buffer // the input data reader
lastRune rune // the rune as retrieved by the last NextRune() calll
lastRuneErr error // the error for the last NextRune() call
runeRead bool // whether or not a rune was read using NextRune()
runes []rune // the rune stack
tokens []Token // the token stack
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
reader *read.Buffer // the input data reader
lastRune rune // the rune as retrieved by the last NextRune() call
lastRuneWidth int // the width in bytes of the last read rune
lastRuneErr error // the error for the last NextRune() call
runeRead bool // whether or not a rune was read using NextRune()
runes []rune // the rune stack
tokens []Token // the token stack
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
}
type stackFrame struct {
@ -129,8 +130,9 @@ func (i *API) NextRune() (rune, error) {
"without a prior call to Accept()")
}
readRune, err := i.reader.RuneAt(i.stackFrame.offset)
readRune, runeWidth, err := i.reader.RuneAt(i.stackFrame.offset)
i.lastRune = readRune
i.lastRuneWidth = runeWidth
i.lastRuneErr = err
i.runeRead = true
@ -140,7 +142,7 @@ func (i *API) NextRune() (rune, error) {
// PeekRune returns the rune at the provided offset.
//
// The read cursor and current read offset are not updated by this operation.
func (i *API) PeekRune(offset int) (rune, error) {
func (i *API) PeekRune(offset int) (rune, int, error) {
return i.reader.RuneAt(i.stackFrame.offset + offset)
}
@ -158,10 +160,10 @@ func (i *API) Accept() {
"but the prior call to NextRune() failed")
}
i.accept(i.lastRune)
i.acceptRunes(i.lastRuneWidth, i.lastRune)
}
func (i *API) accept(runes ...rune) {
func (i *API) acceptRunes(width int, runes ...rune) {
curRuneEnd := i.stackFrame.runeEnd
newRuneEnd := curRuneEnd + len(runes)
@ -179,7 +181,7 @@ func (i *API) accept(runes ...rune) {
i.stackFrame.moveCursorByRune(r)
}
i.stackFrame.runeEnd = newRuneEnd
i.stackFrame.offset += len(runes)
i.stackFrame.offset += width
i.runeRead = false
}
@ -216,6 +218,8 @@ func (i *API) Fork() int {
i.stackLevel++
i.runeRead = false
// TODO do some good benchmarking on these two options. The explicit version might be
// the faster one, but I am not sure of that right now.
// A
// i.stackFrames[i.stackLevel] = *i.stackFrame
// i.stackFrame = &i.stackFrames[i.stackLevel]

View File

@ -29,10 +29,10 @@ func ExampleAPI_NextRune() {
func ExampleAPI_PeekRune() {
api := tokenize.NewAPI("The input that the API will handle")
r1, err := api.PeekRune(19) // 'A'
r2, err := api.PeekRune(20) // 'P'
r3, err := api.PeekRune(21) // 'I'
_, err = api.PeekRune(100) // EOF
r1, _, err := api.PeekRune(19) // 'A'
r2, _, err := api.PeekRune(20) // 'P'
r3, _, err := api.PeekRune(21) // 'I'
_, _, err = api.PeekRune(100) // EOF
fmt.Printf("%c%c%c %s\n", r1, r2, r3, err)

View File

@ -336,9 +336,9 @@ var T = struct {
// MatchRune creates a Handler function that matches against the provided rune.
func MatchRune(expected rune) Handler {
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err == nil && r == expected {
t.accept(r)
t.acceptRunes(w, r)
return true
}
return false
@ -349,13 +349,13 @@ func MatchRune(expected rune) Handler {
// one of the provided runes. The first match counts.
func MatchRunes(expected ...rune) Handler {
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err != nil {
return false
}
for _, e := range expected {
if r == e {
t.accept(r)
t.acceptRunes(w, r)
return true
}
}
@ -375,9 +375,9 @@ func MatchRuneRange(start rune, end rune) Handler {
callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end)
}
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err == nil && r >= start && r <= end {
t.accept(r)
t.acceptRunes(w, r)
return true
}
return false
@ -388,18 +388,18 @@ func MatchRuneRange(start rune, end rune) Handler {
// a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n).
func MatchNewline() Handler {
return func(t *API) bool {
r1, err := t.PeekRune(0)
r1, _, err := t.PeekRune(0)
if err != nil {
return false
}
if r1 == '\n' {
t.accept(r1)
t.acceptRunes(1, r1)
return true
}
if r1 == '\r' {
r2, err := t.PeekRune(1)
r2, _, err := t.PeekRune(1)
if err == nil && r2 == '\n' {
t.accept(r1, r2)
t.acceptRunes(2, r1, r2)
return true
}
}
@ -433,19 +433,20 @@ func MatchBlank() Handler {
func MatchBlanks() Handler {
return func(t *API) bool {
// Match the first blank.
r, err := t.PeekRune(0)
r, _, err := t.PeekRune(0)
if err != nil || (r != ' ' && r != '\t') {
return false
}
t.acceptRunes(1, r)
// Now match any number of followup blanks. We've already got
// a successful match at this point, so we'll always return true at the end.
for {
r, err := t.PeekRune(0)
r, _, err := t.PeekRune(0)
if err != nil || (r != ' ' && r != '\t') {
return true
}
t.accept(r)
t.acceptRunes(1, r)
}
}
}
@ -456,35 +457,35 @@ func MatchBlanks() Handler {
func MatchWhitespace() Handler {
return func(t *API) bool {
// Match the first whitespace.
r1, err := t.PeekRune(0)
r1, _, err := t.PeekRune(0)
if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
return false
}
if r1 == '\r' {
r2, err := t.PeekRune(1)
r2, _, err := t.PeekRune(1)
if err != nil || r2 != '\n' {
return false
}
t.accept(r1, r2)
t.acceptRunes(2, r1, r2)
} else {
t.accept(r1)
t.acceptRunes(1, r1)
}
// Now match any number of followup whitespace. We've already got
// a successful match at this point, so we'll always return true at the end.
for {
r1, err := t.PeekRune(0)
r1, _, err := t.PeekRune(0)
if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
return true
}
if r1 == '\r' {
r2, err := t.PeekRune(1)
r2, _, err := t.PeekRune(1)
if err != nil || r2 != '\n' {
return true
}
t.accept(r1, r2)
t.acceptRunes(2, r1, r2)
} else {
t.accept(r1)
t.acceptRunes(1, r1)
}
}
}
@ -504,9 +505,9 @@ func MatchUnicodeSpace() Handler {
// so those can be used. E.g. MatchRuneByCallback(unicode.IsLower).
func MatchRuneByCallback(callback func(rune) bool) Handler {
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err == nil && callback(r) {
t.accept(r)
t.acceptRunes(w, r)
return true
}
return false
@ -516,18 +517,18 @@ func MatchRuneByCallback(callback func(rune) bool) Handler {
// MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF.
func MatchEndOfLine() Handler {
return func(t *API) bool {
r1, err := t.PeekRune(0)
r1, _, err := t.PeekRune(0)
if err != nil {
return err == io.EOF
}
if r1 == '\n' {
t.accept(r1)
t.acceptRunes(1, r1)
return true
}
if r1 == '\r' {
r2, _ := t.PeekRune(1)
r2, _, _ := t.PeekRune(1)
if r2 == '\n' {
t.accept(r1, r2)
t.acceptRunes(2, r1, r2)
return true
}
}
@ -537,14 +538,17 @@ func MatchEndOfLine() Handler {
// MatchStr creates a Handler that matches the input against the provided string.
func MatchStr(expected string) Handler {
expectedRunes := []rune(expected)
width := len(expected)
return func(t *API) bool {
for i, e := range expected {
r, err := t.PeekRune(i)
for i, e := range expectedRunes {
r, _, err := t.PeekRune(i)
if err != nil || e != r {
return false
}
}
t.accept([]rune(expected)...)
t.acceptRunes(width, expectedRunes...)
return true
}
}
@ -553,16 +557,18 @@ func MatchStr(expected string) Handler {
// provided string in a case-insensitive manner.
func MatchStrNoCase(expected string) Handler {
l := len([]rune(expected))
matches := make([]rune, l)
return func(t *API) bool {
matches := make([]rune, l)
width := 0
for i, e := range expected {
r, err := t.PeekRune(i)
r, w, err := t.PeekRune(i)
if err != nil || unicode.ToUpper(e) != unicode.ToUpper(r) {
return false
}
matches[i] = r
width += w
}
t.accept(matches...)
t.acceptRunes(width, matches...)
return true
}
}
@ -882,9 +888,9 @@ func MatchAnyRune() Handler {
// UTF8 rune can be read from the input.
func MatchValidRune() Handler {
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err == nil && r != utf8.RuneError {
t.accept(r)
t.acceptRunes(w, r)
return true
}
return false
@ -895,9 +901,9 @@ func MatchValidRune() Handler {
// UTF8 rune can be read from the input.
func MatchInvalidRune() Handler {
return func(t *API) bool {
r, err := t.PeekRune(0)
r, w, err := t.PeekRune(0)
if err == nil && r == utf8.RuneError {
t.accept(r)
t.acceptRunes(w, r)
return true
}
return false
@ -949,45 +955,45 @@ func MatchFloat() Handler {
// False values: false, FALSE, False, 0, f, F
func MatchBoolean() Handler {
return func(t *API) bool {
r1, err := t.PeekRune(0)
r1, _, err := t.PeekRune(0)
if err != nil {
return false
}
if r1 == '1' || r1 == '0' {
t.accept(r1)
t.acceptRunes(1, r1)
return true
}
if r1 == 't' || r1 == 'T' {
r2, _ := t.PeekRune(1)
r3, _ := t.PeekRune(2)
r4, err := t.PeekRune(3)
r2, _, _ := t.PeekRune(1)
r3, _, _ := t.PeekRune(2)
r4, _, err := t.PeekRune(3)
if err == nil && r2 == 'r' && r3 == 'u' && r4 == 'e' {
t.accept(r1, r2, r3, r4)
t.acceptRunes(4, r1, r2, r3, r4)
return true
}
if err == nil && r1 == 'T' && r2 == 'R' && r3 == 'U' && r4 == 'E' {
t.accept(r1, r2, r3, r4)
t.acceptRunes(4, r1, r2, r3, r4)
return true
}
t.accept(r1)
t.acceptRunes(1, r1)
return true
}
if r1 == 'f' || r1 == 'F' {
r2, _ := t.PeekRune(1)
r3, _ := t.PeekRune(2)
r4, _ := t.PeekRune(3)
r5, err := t.PeekRune(4)
r2, _, _ := t.PeekRune(1)
r3, _, _ := t.PeekRune(2)
r4, _, _ := t.PeekRune(3)
r5, _, err := t.PeekRune(4)
if err == nil && r2 == 'a' && r3 == 'l' && r4 == 's' && r5 == 'e' {
t.accept(r1, r2, r3, r4, r5)
t.acceptRunes(5, r1, r2, r3, r4, r5)
return true
}
if err == nil && r1 == 'F' && r2 == 'A' && r3 == 'L' && r4 == 'S' && r5 == 'E' {
t.accept(r1, r2, r3, r4, r5)
t.acceptRunes(5, r1, r2, r3, r4, r5)
return true
}
t.accept(r1)
t.acceptRunes(1, r1)
return true
}
return false