Bytes reader working, now carry on switching to byte reading in the tokenizer code.

2019-07-15 20:03:05 +00:00 · 2019-07-15 20:03:05 +00:00 · d4492e4f0a
parent 17935b7534
commit d4492e4f0a
5 changed files with 331 additions and 236 deletions
--- a/read/read.go
+++ b/read/read.go
@ -4,27 +4,29 @@
 // Let's say we've got the following input coming up in the io.Reader that is
 // wrapped by the Reader:
 //
-//     |H|e|l|l|o|,| |w|o|r|l|d|!|  <-- runes
-//      0           6           12  <-- rune offset
+//     |H|e|l|l|o|,| |w|o|r|l|d|!|  <-- bytes
+//      0           6           12  <-- byte offset
 //
-// The Reader can now be used to retrieve runes from the input, based on their
-// offset, using RuneAt(offset). Normally these runes will be retrieved in
-// sequence, but that is not a requirement. Let's say we retrieve the rune with
-// offset 6 from the input (the 'w'), then the Reader buffer be filled with runes
-// from the io.Reader until there are enough runes available to return the rune
-// for offset 6:
+// The Reader can now be used to retrieve data from the input, based on their
+// byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data
+// will be retrieved in sequence by the user of this code, but that is not a
+// requirement. Let's say we retrieve the byte with offset 6 from the input
+// (the 'w'), then the Reader buffer will be filled with bytes from the
+// io.Reader until there are enough bytes available to serve offset 6:
 //
 //     |H|e|l|l|o| |w|
 //      0           6
 //
-// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
-// in the Reader buffer, then the buffered rune is returned. If you request one
-// that is not in the buffer, then the buffer will be expanded.
+// This means that you can retrieve data for arbitrary offsets. If you request
+// an offset that is already in the Reader buffer, then the buffered data are
+// returned. If you request one that is not in the buffer, then the buffer will
+// be expanded.
 //
-// To make this into a sliding window, the Reader provides the method
-// Flush(numberOfRunes). This method will drop the provided number of runes from
-// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
-// then the Reader buffer would become:
+// To make this into a sliding window (preserving memory space while scanning
+// the input data), the Reader provides the method Flush(numberOfBytes).
+// This method will drop the provided number of bytes from the Reader buffer.
+// So when we'd do a Flush(3) on the example buffer from above, then the Reader
+// buffer would become:
 //
 //     |l|o| |w|
 //      0     3
@ -79,32 +81,33 @@ func makeBufioReader(input interface{}) *bufio.Reader {
 }

 // Buffer wraps around a bufio.Reader and provides an additional layer of
-// buffering that allows us to read the same runes over and over again.
+// buffering that allows us to read the same data over and over again.
 // This is useful for implementing a parser that must be able to do lookahead
 // on the input, returning to the original input position after finishing
 // that lookahead.
 //
-// To minimze memory use, it is also possible to flush the read buffer when there is
-// no more need to go back to previously read runes.
+// To minimize memory use, it is also possible to flush the read buffer when there is
+// no more need to go back to previously read data.
 //
-// The parserkit.reader.Reader is used internally by tokenize.API.
+// This parsekit.read.Buffer is used internally by tokenize.API.
 type Buffer struct {
-	bufio         *bufio.Reader // used for ReadRune()
-	store         []rune        // buffer store, the buffer field is a slice on top of this one
-	buffer        []rune        // input buffer, holding runes that were read from input
-	err           error         // a read error, if one occurred
-	errOffset     int           // the offset in the buffer at which the read error was encountered
-	firstReadDone bool          // whether or not the first read was done
+	bufio     *bufio.Reader // used for ReadRune()
+	store     []byte        // buffer store, the buffer field is a slice on top of this one
+	buffer    []byte        // input buffer, holding runes that were read from input
+	err       error         // a read error, if one occurred
+	errOffset int           // the offset in the buffer at which the read error was encountered
 }

-// RuneAt reads the rune at the provided rune offset.
+// RuneAt reads the rune at the provided byte offset.
 //
-// This offset is relative to the current starting position of the Buffer.
+// The offset is relative to the current starting position of the Buffer.
 // When starting reading, offset 0 will point at the start of the input.
-// After flushing, offset 0 will point at the input up to where the flush was done.
+// After flushing, offset 0 will point at the input up to where the flush
+// was done.
 //
-// The error return value will be nil when reading was successful.
-// When an invalid rune is encountered on the input, the error will be nil,
+// When reading was successful, the rune and the width of the rune in bytes
+// will be returned. The returned error will be nil.
+// When an invalid UTF8 rune is encountered on the input, the error will be nil,
 // but the rune will be utf8.RuneError
 //
 // When reading failed, the rune will be utf8.RuneError and the error will
@ -113,48 +116,83 @@ type Buffer struct {
 //
 // Once a read error is encountered, that same read error is guaranteed to
 // be returned on every subsequent read at or beyond the provided offset.
-func (r *Buffer) RuneAt(offset int) (rune, error) {
-	// Re-issue a previously seen read error.
-	if r.err != nil && offset >= r.errOffset {
-		return utf8.RuneError, r.err
+func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
+	// Shortcut: re-issue a previously seen read error.
+	if buf.err != nil && offset >= buf.errOffset {
+		return utf8.RuneError, 0, buf.err
 	}

-	// Rune at provided offset is not yet available in the input buffer.
-	// Read runes until we have enough runes to satisfy the offset.
-	l := len(r.buffer)
-
-	// Number of runes to add to the buffer to have enough space to store
-	// the rune at the offset
-	n := offset - l + 1
-
-	if n > 0 {
-		r.grow(n)
-		var readRune rune
-		var err error
-		for writeAt := l; writeAt <= offset; writeAt++ {
-			readRune, _, err = r.bufio.ReadRune()
-
-			// Skip BOM.
-			if !r.firstReadDone {
-				r.firstReadDone = true
-				if readRune == '\uFEFF' {
-					writeAt--
-					continue
-				}
-			}
-
-			// Handle errors.
+	// Compute the number of bytes that we need in the buffer to be able
+	// to return the rune at the provided byte offset.
+	bufferLen := len(buf.buffer)
+	requiredLen := offset + utf8.UTFMax
+	if requiredLen > bufferLen && buf.err == nil {
+		buf.grow(requiredLen)
+		for writeAt := bufferLen; writeAt < requiredLen; writeAt++ {
+			b, err := buf.bufio.ReadByte()
 			if err != nil {
-				r.err = err
-				r.errOffset = writeAt
-				return utf8.RuneError, err
+				buf.err = err
+				buf.errOffset = writeAt
+				buf.buffer = buf.buffer[:writeAt]
+				break
 			}
-
-			r.buffer[writeAt] = readRune
+			buf.buffer[writeAt] = b
 		}
-		return readRune, nil
 	}
-	return r.buffer[offset], nil
+
+	if buf.err != nil && offset >= buf.errOffset {
+		return utf8.RuneError, 0, buf.err
+	}
+
+	r, w := utf8.DecodeRune(buf.buffer[offset:])
+	return r, w, nil
+}
+
+// ByteAt reads the byte at the provided byte offset.
+//
+// The offset is relative to the current starting position of the Buffer.
+// When starting reading, offset 0 will point at the start of the input.
+// After flushing, offset 0 will point at the input up to where the flush
+// was done.
+//
+// When reading was successful, the byte will be returned. The returned
+// error will be nil.
+//
+// When reading failed, the byte will be 0x00 and the error will not be
+// nil. One special read failure is actually a normal situation: end of
+// file reached. In that case, the returned error will be io.EOF.
+//
+// Once a read error is encountered, that same read error is guaranteed to
+// be returned on every subsequent read at or beyond the provided offset.
+func (buf *Buffer) ByteAt(offset int) (byte, error) {
+	// Shortcut: re-issue a previously seen read error.
+	if buf.err != nil && offset >= buf.errOffset {
+		return 0, buf.err
+	}
+
+	// Compute the number of bytes that we need in the buffer to be able
+	// to return the byte at the provided byte offset.
+	bufferLen := len(buf.buffer)
+	requiredLen := offset + 1
+	if requiredLen > bufferLen && buf.err == nil {
+		buf.grow(requiredLen)
+		for writeAt := bufferLen; writeAt < requiredLen; writeAt++ {
+			b, err := buf.bufio.ReadByte()
+			if err != nil {
+				buf.err = err
+				buf.errOffset = writeAt
+				buf.buffer = buf.buffer[:writeAt]
+				break
+			}
+			buf.buffer[writeAt] = b
+		}
+	}
+
+	if buf.err != nil && offset >= buf.errOffset {
+		return 0, buf.err
+	}
+
+	return buf.buffer[offset], nil
 }

 // The upcoming code was inspired heavily by the Go built-in 'bytes' package.
@ -168,82 +206,80 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
 // grow resizes the buffer to a length of requiredSize bytes.
 // It does not return a value; the caller fills the newly added bytes.
 // If the buffer can't grow it will panic with ErrTooLarge.
-func (r *Buffer) grow(n int) {
+func (buf *Buffer) grow(requiredSize int) {
 	// Instantiate new buffer store
-	if r.store == nil {
+	if buf.store == nil {
 		b := smallBufferSize
-		if b < n {
-			b = n
+		if b < requiredSize {
+			b = requiredSize
 		}
-		r.store = make([]rune, 0, b)
-		r.buffer = r.store[:n]
+		buf.store = make([]byte, 0, b)
+		buf.buffer = buf.store[:requiredSize]
 		return
 	}

-	lenBuffer := len(r.buffer)
-	capBuffer := cap(r.buffer)
-	freeBuffer := capBuffer - lenBuffer
-	newSize := lenBuffer + n
+	capBuffer := cap(buf.buffer)

 	// Grow the buffer store by reslicing within the available capacity.
-	if freeBuffer >= n {
-		r.buffer = r.buffer[:newSize]
+	if capBuffer >= requiredSize {
+		buf.buffer = buf.buffer[:requiredSize]
 		return
 	}

-	capStore := cap(r.store)
+	capStore := cap(buf.store)
 	freeAtStartOfStore := capStore - capBuffer

 	// Grow the buffer by moving the data to the start of the store.
 	// Note: according to the spec, overlapping slices are allowed with copy().
-	if freeAtStartOfStore > 0 && newSize <= capStore {
-		r.store = r.store[0:newSize]
-		copy(r.store, r.buffer)
-		r.buffer = r.store[:newSize]
-		r.store = r.store[:0]
+	if freeAtStartOfStore > 0 && requiredSize <= capStore {
+		buf.store = buf.store[0:requiredSize]
+		copy(buf.store, buf.buffer)
+		buf.buffer = buf.store[:requiredSize]
+		buf.store = buf.store[:0]
 		return
 	}

 	// Grow the buffer store by allocating a new one and copying the data.
-	buf := makeSlice(2*capStore + n)
-	copy(buf, r.buffer)
-	r.store = buf
-	r.buffer = r.store[:newSize]
+	newStore := makeSlice(2*capStore + requiredSize)
+	copy(newStore, buf.buffer)
+	buf.store = newStore
+	buf.buffer = buf.store[:requiredSize]
 }

 // makeSlice allocates a slice of size n. If the allocation fails, it panics
 // with ErrTooLarge.
-func makeSlice(n int) []rune {
+func makeSlice(n int) []byte {
 	// If the make fails, give a known error.
 	defer func() {
 		if recover() != nil {
 			panic(ErrTooLarge)
 		}
 	}()
-	return make([]rune, 0, n)
+	return make([]byte, 0, n)
 }

-// Flush deletes the provided number of runes from the start of the Buffer.
+// Flush deletes the provided number of bytes from the start of the Buffer.
 // After flushing the Buffer, offset 0 as used by RuneAt() and ByteAt() will
 // point to the byte that comes after the bytes that were flushed.
 // So what this basically does, is turn the Buffer into a sliding window.
-func (r *Buffer) Flush(numberOfRunes int) {
-	l := len(r.buffer)
-	if numberOfRunes > l {
+func (buf *Buffer) Flush(numberOfBytes int) {
+	if numberOfBytes == 0 {
+		return
+	}
+
+	bufferLen := len(buf.buffer)
+	if numberOfBytes > bufferLen {
 		panic(fmt.Sprintf(
 			"parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
-				"exceeds size of the buffer (%d)", numberOfRunes, l))
+				"exceeds size of the buffer (%d)", numberOfBytes, bufferLen))
 	}
-	if numberOfRunes == 0 {
+	if bufferLen == numberOfBytes {
+		buf.buffer = buf.store[:0]
+		buf.errOffset = 0
 		return
 	}
-	if l == numberOfRunes {
-		r.buffer = r.store[:0]
-		r.errOffset = 0
-		return
-	}
-	r.buffer = r.buffer[numberOfRunes:]
-	if r.err != nil {
-		r.errOffset = r.errOffset - numberOfRunes
+	buf.buffer = buf.buffer[numberOfBytes:]
+	if buf.err != nil {
+		buf.errOffset = buf.errOffset - numberOfBytes
 	}
 }
--- a/read/read_test.go
+++ b/read/read_test.go
@ -12,8 +12,8 @@ import (
 func ExampleNew() {
 	printFirstRuneOf := func(input interface{}) {
 		r := New(input)
-		c, _ := r.RuneAt(0)
-		fmt.Printf("%q\n", c)
+		c, w, _ := r.RuneAt(0)
+		fmt.Printf("rune %q, width %d\n", c, w)
 	}

 	simpleString := "Hello, world!"
@ -25,14 +25,14 @@ func ExampleNew() {
 	bufioReaderPointer := bufio.NewReader(strings.NewReader("Where do we go, world?"))
 	printFirstRuneOf(bufioReaderPointer)

-	bufioReaderValue := *(bufio.NewReader(strings.NewReader("Where do we go, world?")))
+	bufioReaderValue := *(bufio.NewReader(strings.NewReader("Ɍead the manual!")))
 	printFirstRuneOf(bufioReaderValue)

 	// Output:
-	// 'H'
-	// 'G'
-	// 'W'
-	// 'W'
+	// rune 'H', width 1
+	// rune 'G', width 1
+	// rune 'W', width 1
+	// rune 'Ɍ', width 2
 }

 func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
@ -46,13 +46,13 @@ func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
 		{"bufio.Reader", *(bufio.NewReader(strings.NewReader("Hello, world!")))},
 	} {
 		r := New(test.input)
-		firstRune, _ := r.RuneAt(0)
+		firstRune, _, _ := r.RuneAt(0)
 		if firstRune != 'H' {
 			t.Errorf("[%s] first rune not 'H'", test.name)
 		}
-		lastRune, _ := r.RuneAt(12)
+		lastRune, _, _ := r.RuneAt(12)
 		if lastRune != '!' {
-			t.Errorf("[%s] last rune not '!'", test.name)
+			t.Errorf("[%s] last rune not '!', but %q", test.name, lastRune)
 		}
 	}
 }
@ -63,41 +63,63 @@ func TestNew_UnhandledInputType_Panics(t *testing.T) {
 		"parsekit.read.New(): no support for input of type int")
 }

-func TestBuffer_RuneAt(t *testing.T) {
+func TestBuffer_ByteAt(t *testing.T) {
 	r := New(strings.NewReader("Hello, world!"))
-	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+	at := func(i int) byte { b, _ := r.ByteAt(i); return b }

-	// It is possible to go back and forth while reading the input.
 	result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
 	assertEqual(t, "H!wH", result)
 }

+func TestBuffer_RuneAt(t *testing.T) {
+	r := New(strings.NewReader("¡pןɹoʍ 'oןןǝH"))
+	at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
+
+	// It is possible to go back and forth while reading the input.
+	result := fmt.Sprintf("%c%c%c%c", at(0), at(5), at(8), at(0))
+	assertEqual(t, "¡ɹʍ¡", result)
+}
+
+func TestBuffer_ByteAt_endOfFile(t *testing.T) {
+	r := New(strings.NewReader("Hello, world!"))
+
+	b, err := r.ByteAt(13)
+	result := fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
+	assertEqual(t, "'\\x00' EOF true", result)
+
+	b, err = r.ByteAt(20)
+	result = fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
+	assertEqual(t, "'\\x00' EOF true", result)
+}
+
 func TestBuffer_RuneAt_endOfFile(t *testing.T) {
 	r := New(strings.NewReader("Hello, world!"))

-	rn, err := r.RuneAt(13)
+	rn, _, err := r.RuneAt(13)
 	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	assertEqual(t, "'<27>' EOF true", result)

-	rn, err = r.RuneAt(20)
+	rn, _, err = r.RuneAt(20)
 	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	assertEqual(t, "'<27>' EOF true", result)
 }

 func TestBuffer_RuneAt_invalidRune(t *testing.T) {
 	r := New(strings.NewReader("Hello, \xcdworld!"))
-	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+	at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }

 	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
 	assertEqual(t, " <20>wo", result)
 }

-func ExampleBuffer_RuneAt() {
+func ExampleBuffer_ByteAt() {
 	reader := New(strings.NewReader("Hello, world!"))

 	fmt.Printf("Runes: ")
-	for i := 0; ; i++ {
-		r, err := reader.RuneAt(i)
+	offset := 0
+	for {
+		r, err := reader.ByteAt(offset)
+		offset++
 		if err != nil {
 			fmt.Printf("\nErr: %s\n", err)
 			break
@ -110,18 +132,39 @@ func ExampleBuffer_RuneAt() {
 	// Err: EOF
 }

-func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
-	r := New(strings.NewReader("\uFEFFBommetje!"))
-	b, _ := r.RuneAt(0)
-	o, _ := r.RuneAt(1)
-	m, _ := r.RuneAt(2)
-	bom := fmt.Sprintf("%c%c%c", b, o, m)
-	assertEqual(t, "Bom", bom)
+func ExampleBuffer_RuneAt() {
+	reader := New(strings.NewReader("Hello, pןɹoʍ!"))
+
+	fmt.Printf("Runes: ")
+	offset := 0
+	for {
+		r, w, err := reader.RuneAt(offset)
+		offset += w
+		if err != nil {
+			fmt.Printf("\nErr: %s\n", err)
+			break
+		}
+		fmt.Printf("%c", r)
+	}
+
+	// Output:
+	// Runes: Hello, pןɹoʍ!
+	// Err: EOF
 }

+// TODO reimplement somewhere, maybe a separate call in the reader or should it be part of a parser?
+// func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
+// 	r := New(strings.NewReader("\uFEFFBommetje!"))
+// 	b, _, _ := r.RuneAt(0)
+// 	o, _, _ := r.RuneAt(1)
+// 	m, _, _ := r.RuneAt(2)
+// 	bom := fmt.Sprintf("%c%c%c", b, o, m)
+// 	assertEqual(t, "Bom", bom)
+// }
+
 func TestBuffer_Flush(t *testing.T) {
 	r := New(strings.NewReader("Hello, world!"))
-	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+	at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }

 	// Fills the buffer with the first 8 runes on the input: "Hello, w"
 	result := fmt.Sprintf("%c", at(7))
@ -138,7 +181,7 @@ func TestBuffer_Flush(t *testing.T) {

 func ExampleBuffer_Flush() {
 	r := New(strings.NewReader("dog eat dog!"))
-	at := func(offset int) rune { c, _ := r.RuneAt(offset); return c }
+	at := func(offset int) rune { c, _, _ := r.RuneAt(offset); return c }

 	// Read from the first 4 runes of the input.
 	fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3))
@ -162,30 +205,31 @@ func ExampleBuffer_Flush() {
 func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
 	r := New(strings.NewReader("Hello, world!"))

-	// Fill buffer with "Hello, worl", the first 11 runes.
-	r.RuneAt(10)
+	// Fill buffer with "Hello, world!", the first 13 runes.
+	rn, _, _ := r.RuneAt(12)
+	assertEqual(t, '!', rn)

-	// However, we flush 12 runes, which exceeds the buffer size.
+	// However, we flush 14 runes, which exceeds the buffer size.
 	assertPanic(t,
-		func() { r.Flush(12) },
+		func() { r.Flush(14) },
 		"parsekit.read.Buffer.Flush(): number of runes to flush "+
-			"(12) exceeds size of the buffer (11)")
+			"(14) exceeds size of the buffer (13)")
 }

 func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) {
 	r := New(strings.NewReader("Hello, world!"))
-	_, err := r.RuneAt(13)
+	_, _, err := r.RuneAt(13)
 	assertEqual(t, err.Error(), "EOF")
-	_, err = r.RuneAt(13)
+	_, _, err = r.RuneAt(13)
 	assertEqual(t, err.Error(), "EOF")
-	_, err = r.RuneAt(14)
+	_, _, err = r.RuneAt(14)
 	assertEqual(t, err.Error(), "EOF")
 	r.Flush(13)
-	_, err = r.RuneAt(0)
+	_, _, err = r.RuneAt(0)
 	assertEqual(t, err.Error(), "EOF")
-	_, err = r.RuneAt(1)
+	_, _, err = r.RuneAt(1)
 	assertEqual(t, err.Error(), "EOF")
-	_, err = r.RuneAt(2)
+	_, _, err = r.RuneAt(2)
 	assertEqual(t, err.Error(), "EOF")
 }

@ -203,43 +247,43 @@ func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) {
 	r := New(input)

 	// Read the last available rune.
-	readRune, _ := r.RuneAt(3)
+	readRune, _, _ := r.RuneAt(3)
 	assertEqual(t, 'd', readRune)

 	// Reading the next offset must result in the io.EOF error from the stub.
-	readRune, err := r.RuneAt(4)
+	readRune, _, err := r.RuneAt(4)
 	assertEqual(t, utf8.RuneError, readRune)
 	assertEqual(t, io.EOF, err)

 	// Reading even further should yield the same io.EOF error.
-	readRune, err = r.RuneAt(5)
+	readRune, _, err = r.RuneAt(5)
 	assertEqual(t, utf8.RuneError, readRune)
 	assertEqual(t, io.EOF, err)

 	// After an error, we must still be able to read the last rune.
-	readRune, _ = r.RuneAt(3)
+	readRune, _, _ = r.RuneAt(3)
 	assertEqual(t, 'd', readRune)

 	// Flushing updates the error index too.
 	r.Flush(3)

 	// The last rune is now at offset 0.
-	readRune, _ = r.RuneAt(0)
+	readRune, _, _ = r.RuneAt(0)
 	assertEqual(t, 'd', readRune)

 	// The io.EOF is now at offset 1.
-	_, err = r.RuneAt(1)
+	_, _, err = r.RuneAt(1)
 	assertEqual(t, io.EOF, err)

 	// Let's flush that last rune too.
 	r.Flush(1)

 	// The io.EOF is now at offset 0.
-	_, err = r.RuneAt(0)
+	_, _, err = r.RuneAt(0)
 	assertEqual(t, io.EOF, err)

 	// And reading beyond that offset also yields io.EOF.
-	_, err = r.RuneAt(1)
+	_, _, err = r.RuneAt(1)
 	assertEqual(t, io.EOF, err)
 }

@ -247,13 +291,13 @@ func TestInputLargerThanDefaultBufSize64(t *testing.T) {
 	input, size := makeLargeStubReader()
 	r := New(input)

-	readRune, err := r.RuneAt(0)
+	readRune, _, err := r.RuneAt(0)
 	assertEqual(t, 'X', readRune)
-	readRune, err = r.RuneAt(size - 1)
+	readRune, _, err = r.RuneAt(size - 1)
 	assertEqual(t, 'Y', readRune)
-	readRune, err = r.RuneAt(size)
+	readRune, _, err = r.RuneAt(size)
 	assertEqual(t, io.EOF, err)
-	readRune, err = r.RuneAt(10)
+	readRune, _, err = r.RuneAt(10)
 	assertEqual(t, 'X', readRune)
 }

@ -261,9 +305,9 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *tes
 	input, size := makeLargeStubReader()
 	r := New(input)

-	readRune, _ := r.RuneAt(size - 200)
+	readRune, _, _ := r.RuneAt(size - 200)
 	assertEqual(t, 'X', readRune)
-	readRune, _ = r.RuneAt(size - 1)
+	readRune, _, _ = r.RuneAt(size - 1)
 	assertEqual(t, 'Y', readRune)
 }

@ -271,7 +315,7 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
 	input, size := makeLargeStubReader()
 	r := New(input)

-	readRune, _ := r.RuneAt(size - 1)
+	readRune, _, _ := r.RuneAt(size - 1)
 	assertEqual(t, 'Y', readRune)
 }

@ -282,12 +326,17 @@ func TestAllocationPatterns(t *testing.T) {
 	// The first read will create the standard cache.
 	// store  |x   64     |
 	// buffer |x   64     |
-	assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 1, 64)
+	assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 4, 64)

-	// The first 64 reads will fit in the standard cache.
+	// The first 64 bytes will fit in the standard cache.
 	// store  |xxxx64xxxxx|
 	// buffer |xxxx64xxxxx|
-	assertCache(t, "read fill cache", r, func() { r.RuneAt(63) }, 0, 64, 64, 64)
+	//
+	// Note: in the test offset 60 is used instead of offset 63, because
+	// RuneAt() will fill the buffer with 4 bytes to accommodate the
+	// longest UTF8 character encodings. In all upcoming tests, the same
+	// logic applies to the RuneAt() calls.
+	assertCache(t, "read fill cache", r, func() { r.RuneAt(60) }, 0, 64, 64, 64)

 	// Flushing zero input keeps everything as-is.
 	// store  |xxxx64xxxxx|
@ -302,7 +351,7 @@ func TestAllocationPatterns(t *testing.T) {
 	// Reading 65 chars will allocate a new store of 2 * size + n.
 	// store  |xxxxx65xxxxx     128        |
 	// buffer |xxxxx65xxxxx     128        |
-	assertCache(t, "read cap + 1", r, func() { r.RuneAt(64) }, 0, 65+128, 65, 65+128)
+	assertCache(t, "read cap + 1", r, func() { r.RuneAt(61) }, 0, 65+128, 65, 65+128)

 	// A partial flush frees the start of the store and moves
 	// the buffer slice.
@ -315,7 +364,7 @@ func TestAllocationPatterns(t *testing.T) {
 	// without a new allocation.
 	// store  |   50   xxxxxxxxx143xxxxxxxx|
 	// buffer         |xxxxxxxxx143xxxxxxxx|
-	assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(142) }, 0, 50+143, 143, 143)
+	assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(139) }, 0, 50+143, 143, 143)

 	// Flush the full input.
 	// store  |            193             |
@ -325,7 +374,7 @@ func TestAllocationPatterns(t *testing.T) {
 	// Read a bit more than half the capacity.
 	// store  |xxxxxx101xxxxxxxx    92     |
 	// buffer |xxxxxx101xxxxxxxx    92     |
-	assertCache(t, "read more than half the cap", r, func() { r.RuneAt(100) }, 0, 193, 101, 193)
+	assertCache(t, "read more than half the cap", r, func() { r.RuneAt(97) }, 0, 193, 101, 193)

 	// Then flush almost all input.
 	// store  |      100     x1x    92     |
@ -337,7 +386,7 @@ func TestAllocationPatterns(t *testing.T) {
 	// store (where it fits), space is freed up for the read operation.
 	// store  |xxxxx100xxxxxx      93      |
 	// buffer |xxxxx100xxxxxx      93      |
-	assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(99) }, 0, 193, 100, 193)
+	assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(96) }, 0, 193, 100, 193)

 	// Now flush only one rune from the cache.
 	// store  |1 xxxx99xxxxx      93      |
@ -349,7 +398,7 @@ func TestAllocationPatterns(t *testing.T) {
 	// the data is moved to the start and no reallocation is needed.
 	// store  |1 xxxx99xxxxx      93      |
 	// buffer   |xxxx99xxxxx      93      |
-	assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(192) }, 0, 193, 193, 193)
+	assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(189) }, 0, 193, 193, 193)
 }

 func makeLargeStubReader() (*StubReader, int) {
--- a/tokenize/api.go
+++ b/tokenize/api.go
@ -70,15 +70,16 @@ import (
 // can lead to hard to track bugs. I much prefer this forking method, since
 // no bookkeeping has to be implemented when implementing a parser.
 type API struct {
-	reader      *read.Buffer // the input data reader
-	lastRune    rune         // the rune as retrieved by the last NextRune() calll
-	lastRuneErr error        // the error for the last NextRune() call
-	runeRead    bool         // whether or not a rune was read using NextRune()
-	runes       []rune       // the rune stack
-	tokens      []Token      // the token stack
-	stackFrames []stackFrame // the stack frames, containing stack level-specific data
-	stackLevel  int          // the current stack level
-	stackFrame  *stackFrame  // the current stack frame
+	reader        *read.Buffer // the input data reader
+	lastRune      rune         // the rune as retrieved by the last NextRune() call
+	lastRuneWidth int          // the width in bytes of the last read rune
+	lastRuneErr   error        // the error for the last NextRune() call
+	runeRead      bool         // whether or not a rune was read using NextRune()
+	runes         []rune       // the rune stack
+	tokens        []Token      // the token stack
+	stackFrames   []stackFrame // the stack frames, containing stack level-specific data
+	stackLevel    int          // the current stack level
+	stackFrame    *stackFrame  // the current stack frame
 }

 type stackFrame struct {
@ -129,8 +130,9 @@ func (i *API) NextRune() (rune, error) {
 			"without a prior call to Accept()")
 	}

-	readRune, err := i.reader.RuneAt(i.stackFrame.offset)
+	readRune, runeWidth, err := i.reader.RuneAt(i.stackFrame.offset)
 	i.lastRune = readRune
+	i.lastRuneWidth = runeWidth
 	i.lastRuneErr = err
 	i.runeRead = true

@ -140,7 +142,7 @@ func (i *API) NextRune() (rune, error) {
 // PeekRune returns the rune at the provided offset.
 //
 // The read cursor and current read offset are not updated by this operation.
-func (i *API) PeekRune(offset int) (rune, error) {
+func (i *API) PeekRune(offset int) (rune, int, error) {
 	return i.reader.RuneAt(i.stackFrame.offset + offset)
 }

@ -158,10 +160,10 @@ func (i *API) Accept() {
 			"but the prior call to NextRune() failed")
 	}

-	i.accept(i.lastRune)
+	i.acceptRunes(i.lastRuneWidth, i.lastRune)
 }

-func (i *API) accept(runes ...rune) {
+func (i *API) acceptRunes(width int, runes ...rune) {
 	curRuneEnd := i.stackFrame.runeEnd
 	newRuneEnd := curRuneEnd + len(runes)

@ -179,7 +181,7 @@ func (i *API) accept(runes ...rune) {
 		i.stackFrame.moveCursorByRune(r)
 	}
 	i.stackFrame.runeEnd = newRuneEnd
-	i.stackFrame.offset += len(runes)
+	i.stackFrame.offset += width
 	i.runeRead = false
 }

@ -216,6 +218,8 @@ func (i *API) Fork() int {
 	i.stackLevel++
 	i.runeRead = false

+	// TODO do some good benchmarking on these two options. The explicit version might be
+	// the faster one, but I am not sure of that right now.
 	// A
 	// i.stackFrames[i.stackLevel] = *i.stackFrame
 	// i.stackFrame = &i.stackFrames[i.stackLevel]
--- a/tokenize/api_test.go
+++ b/tokenize/api_test.go
@ -29,10 +29,10 @@ func ExampleAPI_NextRune() {
 func ExampleAPI_PeekRune() {
 	api := tokenize.NewAPI("The input that the API will handle")

-	r1, err := api.PeekRune(19) // 'A'
-	r2, err := api.PeekRune(20) // 'P'
-	r3, err := api.PeekRune(21) // 'I'
-	_, err = api.PeekRune(100)  // EOF
+	r1, _, err := api.PeekRune(19) // 'A'
+	r2, _, err := api.PeekRune(20) // 'P'
+	r3, _, err := api.PeekRune(21) // 'I'
+	_, _, err = api.PeekRune(100)  // EOF

 	fmt.Printf("%c%c%c %s\n", r1, r2, r3, err)

--- a/tokenize/handlers_builtin.go
+++ b/tokenize/handlers_builtin.go
@ -336,9 +336,9 @@ var T = struct {
 // MatchRune creates a Handler function that matches against the provided rune.
 func MatchRune(expected rune) Handler {
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err == nil && r == expected {
-			t.accept(r)
+			t.acceptRunes(w, r)
 			return true
 		}
 		return false
@ -349,13 +349,13 @@ func MatchRune(expected rune) Handler {
 // one of the provided runes. The first match counts.
 func MatchRunes(expected ...rune) Handler {
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err != nil {
 			return false
 		}
 		for _, e := range expected {
 			if r == e {
-				t.accept(r)
+				t.acceptRunes(w, r)
 				return true
 			}
 		}
@ -375,9 +375,9 @@ func MatchRuneRange(start rune, end rune) Handler {
 		callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end)
 	}
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err == nil && r >= start && r <= end {
-			t.accept(r)
+			t.acceptRunes(w, r)
 			return true
 		}
 		return false
@ -388,18 +388,18 @@ func MatchRuneRange(start rune, end rune) Handler {
 // a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n).
 func MatchNewline() Handler {
 	return func(t *API) bool {
-		r1, err := t.PeekRune(0)
+		r1, _, err := t.PeekRune(0)
 		if err != nil {
 			return false
 		}
 		if r1 == '\n' {
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 			return true
 		}
 		if r1 == '\r' {
-			r2, err := t.PeekRune(1)
+			r2, _, err := t.PeekRune(1)
 			if err == nil && r2 == '\n' {
-				t.accept(r1, r2)
+				t.acceptRunes(2, r1, r2)
 				return true
 			}
 		}
@ -433,19 +433,20 @@ func MatchBlank() Handler {
 func MatchBlanks() Handler {
 	return func(t *API) bool {
 		// Match the first blank.
-		r, err := t.PeekRune(0)
+		r, _, err := t.PeekRune(0)
 		if err != nil || (r != ' ' && r != '\t') {
 			return false
 		}
+		t.acceptRunes(1, r)

 		// Now match any number of followup blanks. We've already got
 		// a successful match at this point, so we'll always return true at the end.
 		for {
-			r, err := t.PeekRune(0)
+			r, _, err := t.PeekRune(0)
 			if err != nil || (r != ' ' && r != '\t') {
 				return true
 			}
-			t.accept(r)
+			t.acceptRunes(1, r)
 		}
 	}
 }
@ -456,35 +457,35 @@ func MatchBlanks() Handler {
 func MatchWhitespace() Handler {
 	return func(t *API) bool {
 		// Match the first whitespace.
-		r1, err := t.PeekRune(0)
+		r1, _, err := t.PeekRune(0)
 		if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
 			return false
 		}
 		if r1 == '\r' {
-			r2, err := t.PeekRune(1)
+			r2, _, err := t.PeekRune(1)
 			if err != nil || r2 != '\n' {
 				return false
 			}
-			t.accept(r1, r2)
+			t.acceptRunes(2, r1, r2)
 		} else {
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 		}

 		// Now match any number of followup whitespace. We've already got
 		// a successful match at this point, so we'll always return true at the end.
 		for {
-			r1, err := t.PeekRune(0)
+			r1, _, err := t.PeekRune(0)
 			if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
 				return true
 			}
 			if r1 == '\r' {
-				r2, err := t.PeekRune(1)
+				r2, _, err := t.PeekRune(1)
 				if err != nil || r2 != '\n' {
 					return true
 				}
-				t.accept(r1, r2)
+				t.acceptRunes(2, r1, r2)
 			} else {
-				t.accept(r1)
+				t.acceptRunes(1, r1)
 			}
 		}
 	}
@ -504,9 +505,9 @@ func MatchUnicodeSpace() Handler {
 // so those can be used. E.g. MatchRuneByCallback(unicode.IsLower).
 func MatchRuneByCallback(callback func(rune) bool) Handler {
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err == nil && callback(r) {
-			t.accept(r)
+			t.acceptRunes(w, r)
 			return true
 		}
 		return false
@ -516,18 +517,18 @@ func MatchRuneByCallback(callback func(rune) bool) Handler {
 // MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF.
 func MatchEndOfLine() Handler {
 	return func(t *API) bool {
-		r1, err := t.PeekRune(0)
+		r1, _, err := t.PeekRune(0)
 		if err != nil {
 			return err == io.EOF
 		}
 		if r1 == '\n' {
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 			return true
 		}
 		if r1 == '\r' {
-			r2, _ := t.PeekRune(1)
+			r2, _, _ := t.PeekRune(1)
 			if r2 == '\n' {
-				t.accept(r1, r2)
+				t.acceptRunes(2, r1, r2)
 				return true
 			}
 		}
@ -537,14 +538,17 @@ func MatchEndOfLine() Handler {

 // MatchStr creates a Handler that matches the input against the provided string.
 func MatchStr(expected string) Handler {
+	expectedRunes := []rune(expected)
+	width := len(expected)
+
 	return func(t *API) bool {
-		for i, e := range expected {
-			r, err := t.PeekRune(i)
+		for i, e := range expectedRunes {
+			r, _, err := t.PeekRune(i)
 			if err != nil || e != r {
 				return false
 			}
 		}
-		t.accept([]rune(expected)...)
+		t.acceptRunes(width, expectedRunes...)
 		return true
 	}
 }
@ -553,16 +557,18 @@ func MatchStr(expected string) Handler {
 // provided string in a case-insensitive manner.
 func MatchStrNoCase(expected string) Handler {
 	l := len([]rune(expected))
-	matches := make([]rune, l)
 	return func(t *API) bool {
+		matches := make([]rune, l)
+		width := 0
 		for i, e := range expected {
-			r, err := t.PeekRune(i)
+			r, w, err := t.PeekRune(i)
 			if err != nil || unicode.ToUpper(e) != unicode.ToUpper(r) {
 				return false
 			}
 			matches[i] = r
+			width += w
 		}
-		t.accept(matches...)
+		t.acceptRunes(width, matches...)
 		return true
 	}
 }
@ -882,9 +888,9 @@ func MatchAnyRune() Handler {
 // UTF8 rune can be read from the input.
 func MatchValidRune() Handler {
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err == nil && r != utf8.RuneError {
-			t.accept(r)
+			t.acceptRunes(w, r)
 			return true
 		}
 		return false
@ -895,9 +901,9 @@ func MatchValidRune() Handler {
 // UTF8 rune can be read from the input.
 func MatchInvalidRune() Handler {
 	return func(t *API) bool {
-		r, err := t.PeekRune(0)
+		r, w, err := t.PeekRune(0)
 		if err == nil && r == utf8.RuneError {
-			t.accept(r)
+			t.acceptRunes(w, r)
 			return true
 		}
 		return false
@ -949,45 +955,45 @@ func MatchFloat() Handler {
 // False falues: false, FALSE, False, 0, f, F
 func MatchBoolean() Handler {
 	return func(t *API) bool {
-		r1, err := t.PeekRune(0)
+		r1, _, err := t.PeekRune(0)
 		if err != nil {
 			return false
 		}
 		if r1 == '1' || r1 == '0' {
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 			return true
 		}
 		if r1 == 't' || r1 == 'T' {
-			r2, _ := t.PeekRune(1)
-			r3, _ := t.PeekRune(2)
-			r4, err := t.PeekRune(3)
+			r2, _, _ := t.PeekRune(1)
+			r3, _, _ := t.PeekRune(2)
+			r4, _, err := t.PeekRune(3)
 			if err == nil && r2 == 'r' && r3 == 'u' && r4 == 'e' {
-				t.accept(r1, r2, r3, r4)
+				t.acceptRunes(4, r1, r2, r3, r4)
 				return true
 			}
 			if err == nil && r1 == 'T' && r2 == 'R' && r3 == 'U' && r4 == 'E' {
-				t.accept(r1, r2, r3, r4)
+				t.acceptRunes(4, r1, r2, r3, r4)
 				return true
 			}
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 			return true
 		}

 		if r1 == 'f' || r1 == 'F' {
-			r2, _ := t.PeekRune(1)
-			r3, _ := t.PeekRune(2)
-			r4, _ := t.PeekRune(3)
-			r5, err := t.PeekRune(4)
+			r2, _, _ := t.PeekRune(1)
+			r3, _, _ := t.PeekRune(2)
+			r4, _, _ := t.PeekRune(3)
+			r5, _, err := t.PeekRune(4)

 			if err == nil && r2 == 'a' && r3 == 'l' && r4 == 's' && r5 == 'e' {
-				t.accept(r1, r2, r3, r4, r5)
+				t.acceptRunes(5, r1, r2, r3, r4, r5)
 				return true
 			}
 			if err == nil && r1 == 'F' && r2 == 'A' && r3 == 'L' && r4 == 'S' && r5 == 'E' {
-				t.accept(r1, r2, r3, r4, r5)
+				t.acceptRunes(5, r1, r2, r3, r4, r5)
 				return true
 			}
-			t.accept(r1)
+			t.acceptRunes(1, r1)
 			return true
 		}
 		return false