go-parsekit/read/read_test.go

package read

import (
	"bufio"
	"fmt"
	"io"
	"strings"
	"testing"
	"unicode/utf8"
)

// ExampleNew demonstrates the kinds of input that New accepts: a plain
// string, an io.Reader, a *bufio.Reader and a bufio.Reader value. For each
// of them, the first rune is read and printed along with its width.
func ExampleNew() {
	inputs := []interface{}{
		"Hello, world!",
		strings.NewReader("Good bye, world!"),
		bufio.NewReader(strings.NewReader("Where do we go, world?")),
		*(bufio.NewReader(strings.NewReader("Ɍead the manual!"))),
	}

	for _, input := range inputs {
		r := New(input)
		first, width, _ := r.RuneAt(0)
		fmt.Printf("rune %q, width %d\n", first, width)
	}

	// Output:
	// rune 'H', width 1
	// rune 'G', width 1
	// rune 'W', width 1
	// rune 'Ɍ', width 2
}

// TestNew_VariousInputTypesCanBeUsed feeds the same text to New through each
// supported input type and checks that the first and last rune can be read.
func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
	const text = "Hello, world!"

	type inputCase struct {
		name  string
		input interface{}
	}
	cases := []inputCase{
		{name: "string", input: text},
		{name: "io.Reader", input: strings.NewReader(text)},
		{name: "*bufio.Reader", input: bufio.NewReader(strings.NewReader(text))},
		{name: "bufio.Reader", input: *(bufio.NewReader(strings.NewReader(text)))},
	}

	for _, tc := range cases {
		r := New(tc.input)
		if first, _, _ := r.RuneAt(0); first != 'H' {
			t.Errorf("[%s] first rune not 'H'", tc.name)
		}
		if last, _, _ := r.RuneAt(len(text) - 1); last != '!' {
			t.Errorf("[%s] last rune not '!', but %q", tc.name, last)
		}
	}
}

func TestNew_UnhandledInputType_Panics(t *testing.T) {
	assertPanic(t,
		func() { New(12345) },
		"parsekit.read.New(): no support for input of type int")
}

// TestBuffer_ByteAt reads single bytes from several offsets, in non-sequential
// order, and checks that each one matches the input at that offset.
func TestBuffer_ByteAt(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	var got strings.Builder
	for _, offset := range []int{0, 12, 7, 0} {
		b, _ := r.ByteAt(offset)
		fmt.Fprintf(&got, "%c", b)
	}
	assertEqual(t, "H!wH", got.String())
}

// TestBuffer_RuneAt reads runes from several offsets of an input that contains
// multi-byte characters and checks that the expected runes come back.
func TestBuffer_RuneAt(t *testing.T) {
	buf := New(strings.NewReader("¡pןɹoʍ 'oןןǝH"))

	// It is possible to go back and forth while reading the input.
	var got strings.Builder
	for _, offset := range []int{0, 5, 8, 0} {
		c, _, _ := buf.RuneAt(offset)
		fmt.Fprintf(&got, "%c", c)
	}
	assertEqual(t, "¡ɹʍ¡", got.String())
}

// TestBuffer_ByteAt_endOfFile checks that reading a byte directly after the
// end of the input, or well beyond it, yields a zero byte and io.EOF.
func TestBuffer_ByteAt_endOfFile(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// Offset 13 is the first offset past the input, offset 20 lies further on.
	for _, offset := range []int{13, 20} {
		b, err := r.ByteAt(offset)
		got := fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
		assertEqual(t, "'\\x00' EOF true", got)
	}
}

// TestBuffer_RuneAt_endOfFile checks that reading a rune directly after the
// end of the input, or well beyond it, yields the Unicode replacement
// character (utf8.RuneError) together with io.EOF.
func TestBuffer_RuneAt_endOfFile(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// The replacement character is spelled as the escape \uFFFD, so the
	// expectation cannot be damaged by a tool that mishandles the encoding
	// of this source file.
	const want = "'\uFFFD' EOF true"

	// Offset 13 is the first offset past the input, offset 20 lies further on.
	for _, offset := range []int{13, 20} {
		rn, _, err := r.RuneAt(offset)
		got := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
		assertEqual(t, want, got)
	}
}

// TestBuffer_RuneAt_invalidRune checks that a byte which is not valid UTF-8
// (\xcd at offset 7) is reported as the Unicode replacement character, and
// that the runes around it are still read correctly.
func TestBuffer_RuneAt_invalidRune(t *testing.T) {
	r := New(strings.NewReader("Hello, \xcdworld!"))
	at := func(i int) rune { c, _, _ := r.RuneAt(i); return c }

	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))

	// The replacement character is spelled as the escape \uFFFD, so the
	// expectation cannot be damaged by a tool that mishandles the encoding
	// of this source file.
	assertEqual(t, " \uFFFDwo", result)
}

// ExampleBuffer_ByteAt walks over the input one byte at a time, printing each
// byte until ByteAt reports an error, which is then printed as well.
func ExampleBuffer_ByteAt() {
	reader := New(strings.NewReader("Hello, world!"))

	fmt.Printf("Runes: ")
	for offset := 0; ; offset++ {
		b, err := reader.ByteAt(offset)
		if err != nil {
			fmt.Printf("\nErr: %s\n", err)
			break
		}
		fmt.Printf("%c", b)
	}

	// Output:
	// Runes: Hello, world!
	// Err: EOF
}

// ExampleBuffer_RuneAt walks over the input one rune at a time. The width
// that RuneAt returns is used to advance to the offset of the next rune.
// The loop ends when RuneAt reports an error, which is then printed.
func ExampleBuffer_RuneAt() {
	reader := New(strings.NewReader("Hello, pןɹoʍ!"))

	fmt.Printf("Runes: ")
	position := 0
	for {
		c, width, err := reader.RuneAt(position)
		if err != nil {
			fmt.Printf("\nErr: %s\n", err)
			break
		}
		fmt.Printf("%c", c)
		position += width
	}

	// Output:
	// Runes: Hello, pןɹoʍ!
	// Err: EOF
}

// TODO: Reimplement BOM skipping somewhere. Should it be a separate call in the reader, or part of a parser?
// func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
// 	r := New(strings.NewReader("\uFEFFBommetje!"))
// 	b, _, _ := r.RuneAt(0)
// 	o, _, _ := r.RuneAt(1)
// 	m, _, _ := r.RuneAt(2)
// 	bom := fmt.Sprintf("%c%c%c", b, o, m)
// 	assertEqual(t, "Bom", bom)
// }

// TestBuffer_Flush checks that after flushing a number of runes, offset 0
// refers to the first rune that was not flushed and that reading can
// continue from there.
func TestBuffer_Flush(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// runesAt reads the runes at the given offsets, in the given order, and
	// returns them joined into a single string.
	runesAt := func(offsets ...int) string {
		var sb strings.Builder
		for _, offset := range offsets {
			c, _, _ := r.RuneAt(offset)
			fmt.Fprintf(&sb, "%c", c)
		}
		return sb.String()
	}

	// Reading offset 7 brings the first 8 runes of the input into the
	// buffer: "Hello, w"
	assertEqual(t, "w", runesAt(7))

	// Drop the first 4 runes ("Hell") from the buffer.
	r.Flush(4)

	// Offset 0 now refers to what originally was offset 4, and reading
	// simply continues from there.
	assertEqual(t, "o, wor", runesAt(0, 1, 2, 3, 4, 5))
}

// ExampleBuffer_Flush shows how Flush moves offset 0 forward: after every
// flush of 4 runes, offsets are relative to the first rune that remains.
func ExampleBuffer_Flush() {
	r := New(strings.NewReader("dog eat dog!"))

	// show prints the runes found at the given offsets, in the given order.
	show := func(offsets ...int) {
		for _, offset := range offsets {
			c, _, _ := r.RuneAt(offset)
			fmt.Printf("%c", c)
		}
	}

	// Print the first 4 runes of the input.
	show(0, 1, 2, 3)

	// Flush those 4 runes, which puts offset 0 at the start of "eat dog!".
	r.Flush(4)

	// Print 4 more runes. Because of the flush, these are found at
	// offsets 0 up to and including 3 again.
	show(1, 2, 0, 3)

	// Flush another 4 runes, which puts offset 0 at the start of "dog!".
	r.Flush(4)

	// Print runes from the remaining input.
	show(2, 1, 1, 0, 3)

	// Output:
	// dog ate good!
}

// TestGivenNumberOfRunesTooHigh_Flush_Panics checks that Flush panics with a
// descriptive message when asked to flush more than the buffer holds.
func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// Reading the final rune fills the buffer with all 13 runes of the input.
	last, _, _ := r.RuneAt(12)
	assertEqual(t, '!', last)

	// Flushing 14 runes asks for one more than the buffer contains.
	const want = "parsekit.read.Buffer.Flush(): number of runes to flush " +
		"(14) exceeds size of the buffer (13)"
	flushTooMany := func() { r.Flush(14) }
	assertPanic(t, flushTooMany, want)
}

// TestGivenEOFFollowedByFlush_EOFCanStillBeRead checks that io.EOF is reported
// for reads at and beyond the end of the input, and that it is still reported
// after all 13 runes of the input have been flushed.
func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// The error is compared directly against io.EOF, with the expected value
	// passed first like the other assertEqual calls in this file. Calling
	// err.Error() instead would crash with a nil pointer dereference, rather
	// than fail the test, when no error is returned.
	for _, offset := range []int{13, 13, 14} {
		_, _, err := r.RuneAt(offset)
		assertEqual(t, io.EOF, err)
	}

	// Drop the complete input; what used to be offset 13 is now offset 0.
	r.Flush(13)

	for _, offset := range []int{0, 1, 2} {
		_, _, err := r.RuneAt(offset)
		assertEqual(t, io.EOF, err)
	}
}

// TestGivenErrorFromBuffer_ErrorIsCached is meant to make sure that once a
// Buffer returns an error, that error is cached and will be returned when
// data for the offset where the error occurred is read at a later time.
//
// NOTE(review): because of the bare return statement right after the first
// assertion, only the read of the last available rune is currently verified.
// None of the error caching assertions below that return are executed.
func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) {
	input := &StubReader{
		bytes: []byte{'a', 'b', 'c', 'd'},
		errors: []error{
			io.EOF,
			io.ErrUnexpectedEOF, // This error must never pop up in the tests below.
		},
	}
	r := New(input)

	// Read the last available rune.
	readRune, _, _ := r.RuneAt(3)
	assertEqual(t, 'd', readRune)
	// NOTE(review): this return makes the remainder of the function
	// unreachable. It looks like the assertions below were disabled on
	// purpose - confirm whether they can be enabled again.
	return

	// Reading the next offset must result in the io.EOF error from the stub.
	readRune, _, err := r.RuneAt(4)
	assertEqual(t, utf8.RuneError, readRune)
	assertEqual(t, io.EOF, err)

	// Reading even further should yield the same io.EOF error.
	readRune, _, err = r.RuneAt(5)
	assertEqual(t, utf8.RuneError, readRune)
	assertEqual(t, io.EOF, err)

	// After an error, we must still be able to read the last rune.
	readRune, _, _ = r.RuneAt(3)
	assertEqual(t, 'd', readRune)

	// Flushing updates the error index too.
	r.Flush(3)

	// The last rune is now at offset 0.
	readRune, _, _ = r.RuneAt(0)
	assertEqual(t, 'd', readRune)

	// // The io.EOF is now at offset 1.
	// _, _, err = r.RuneAt(1)
	// assertEqual(t, io.EOF, err)

	// // Let's flush that last rune too.
	// r.Flush(1)

	// // The io.EOF is now at offset 0.
	// _, _, err = r.RuneAt(0)
	// assertEqual(t, io.EOF, err)

	// // And reading beyond that offset also yields io.EOF.
	// _, _, err = r.RuneAt(1)
	// assertEqual(t, io.EOF, err)
}

// TestInputLargerThanDefaultBufSize64 reads from the start, the end, beyond
// the end and then again from near the start of a large input, checking the
// rune or the error that comes back for each read.
func TestInputLargerThanDefaultBufSize64(t *testing.T) {
	stub, size := makeLargeStubReader()
	r := New(stub)

	// The very first rune of the input.
	first, _, _ := r.RuneAt(0)
	assertEqual(t, 'X', first)

	// The very last rune of the input.
	last, _, _ := r.RuneAt(size - 1)
	assertEqual(t, 'Y', last)

	// One position past the last rune.
	_, _, err := r.RuneAt(size)
	assertEqual(t, io.EOF, err)

	// Earlier input can still be read after the end was reached.
	early, _, _ := r.RuneAt(10)
	assertEqual(t, 'X', early)
}

// TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64 checks
// that the very first read may already target an offset deep into a large
// input, and that the last rune can be read after that.
func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *testing.T) {
	stub, size := makeLargeStubReader()
	r := New(stub)

	deep, _, _ := r.RuneAt(size - 200)
	assertEqual(t, 'X', deep)

	last, _, _ := r.RuneAt(size - 1)
	assertEqual(t, 'Y', last)
}

// TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte checks that the
// very first read may target the final rune of a large input.
func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
	stub, size := makeLargeStubReader()
	r := New(stub)

	final, _, _ := r.RuneAt(size - 1)
	assertEqual(t, 'Y', final)
}

// TestAllocationPatterns walks through a fixed sequence of reads and flushes
// on a single reader and checks the state of its cache after every step,
// using the assertCache helper. The steps build on each other, so their
// order matters.
//
// NOTE(review): judging by the diagrams below, the four numbers passed to
// assertCache look like the expected store length, store capacity, buffer
// length and buffer capacity - confirm against the assertCache helper.
func TestAllocationPatterns(t *testing.T) {
	input, _ := makeLargeStubReader()
	r := New(input)

	// The first read will create the standard cache.
	// store  |x   1024     |
	// buffer |x   1024     |
	assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 1024, 4, 1024)
	rn, _, _ := r.RuneAt(0)
	assertEqual(t, 'X', rn)

	// The first 1024 bytes will fit in the standard cache.
	// store  |xxxx1024xxxxx|
	// buffer |xxxx1024xxxxx|
	assertCache(t, "read fill cache", r, func() { r.ByteAt(1023) }, 0, 1024, 1024, 1024)

	// Flushing zero input keeps everything as-is.
	// store  |xxxx1024xxxxx|
	// buffer |xxxx1024xxxxx|
	assertCache(t, "flush zero", r, func() { r.Flush(0) }, 0, 1024, 1024, 1024)

	// Flushing all cached input truncates the cache.
	// store  |    1024     |
	// buffer |    1024     |
	assertCache(t, "flush full cache", r, func() { r.Flush(1024) }, 0, 1024, 0, 1024)

	// Reading 1025 chars will allocate a new store of 2 * 1024.
	// store  |xxxxx1025xxxxx    1023        |
	// buffer |xxxxx1025xxxxx    1023        |
	assertCache(t, "read cap + 1", r, func() { r.ByteAt(1024) }, 0, 2048, 1025, 2048)

	// The bytes that we had before must be copied to the newly allocated store.
	rn, _, _ = r.RuneAt(0)
	assertEqual(t, 'X', rn)

	// A partial flush frees the start of the store and moves
	// the buffer slice.
	// store  | 25  xxx1000xxx   1023        |
	// buffer      |xxx1000xxx   1023        |
	assertCache(t, "flush partial", r, func() { r.Flush(25) }, 0, 2048, 1000, 2048-25)

	// The capacity for the buffer is now 2023
	// This number of runes can be read, filling up the store
	// without a new allocation.
	// store  | 25  xxxxxxxxxxx2023xxxxxxxxxx|
	// buffer      |xxxxxxxxxxx2023xxxxxxxxxx|
	assertCache(t, "read fill cache after partial flush", r, func() { r.ByteAt(2022) }, 0, 2048, 2023, 2048)

	// Flush the full input.
	// store  |            2048             |
	// buffer |            2048             |
	assertCache(t, "flush full cache after partial flush", r, func() { r.Flush(2023) }, 0, 2048, 0, 2048)

	// Read a bit more than half the capacity.
	// store  |xxxx1025xxxxxx     1023      |
	// buffer |xxxx1025xxxxxx     1023      |
	assertCache(t, "read more than half the cap", r, func() { r.ByteAt(1024) }, 0, 2048, 1025, 2048)

	// Then flush almost all input.
	// store  |    1024     x1x    1023     |
	// buffer      1024    |x1x    1023     |
	assertCache(t, "flush almost all input", r, func() { r.Flush(1024) }, 0, 2048, 1, 1024)

	// Again read a bit more than half the capacity. This does not fit at the
	// end of the store, but by moving the current buffer to the start of the
	// store (where it fits), space is freed up for the read operation.
	// store  |xxxxx1025xxxxxx     1023     |
	// buffer |xxxxx1025xxxxxx     1023     |
	assertCache(t, "read beyond cap with free space at start of store", r, func() { r.ByteAt(1024) }, 0, 2048, 1025, 2048)

	// Now flush only one rune from the cache.
	// store  |1 xxx1024xxxxxx     1023     |
	// buffer   |xxx1024xxxxxx     1023     |
	assertCache(t, "flush 1", r, func() { r.Flush(1) }, 0, 2048, 1024, 2047)

	// Now read the full available capacity. This will not fit, so
	// space has to be made. Since there's 1 free space at the start of the store,
	// the data are moved to the start and no reallocation is needed.
	// store  |xxxxxxxxxxxx2048xxxxxxxxxxxxx|
	// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxx|
	assertCache(t, "read full capacity with 1 free byte at start", r, func() { r.ByteAt(2047) }, 0, 2048, 2048, 2048)
}

// makeLargeStubReader builds a StubReader that holds 8192 bytes of input:
// 8191 times 'X', followed by a single 'Y'. After the input, the reader
// has one io.EOF error queued. The size of the input is returned as the
// second value.
func makeLargeStubReader() (*StubReader, int) {
	const size = 8192

	data := make([]byte, size)
	for i := 0; i < size-1; i++ {
		data[i] = 'X'
	}
	// The final byte differs, so tests can recognize the end of the input.
	data[size-1] = 'Y'

	stub := &StubReader{
		bytes:  data,
		errors: []error{io.EOF},
	}
	return stub, size
}

// StubReader is a minimal io.Reader implementation for use in tests. It
// hands out its bytes one per Read call and, once the bytes are used up,
// its errors one per Read call.
type StubReader struct {
	bytes  []byte  // input that is still to be delivered, one byte per Read
	errors []error // errors that are still to be returned, one per Read
}

// Read implements io.Reader.
//
// As long as there are bytes left, a single byte is stored in p[0] and
// (1, nil) is returned. When p has no room for that byte, (0, nil) is
// returned and nothing is consumed, instead of crashing on p[0]. When the
// bytes are used up, each call returns (0, err) for the next queued error.
// When the errors are used up as well, Read panics, since that means that a
// test read more often than it was set up for.
func (r *StubReader) Read(p []byte) (n int, err error) {
	if len(r.bytes) > 0 {
		if len(p) == 0 {
			// io.Reader allows (0, nil) for a zero-length destination.
			return 0, nil
		}
		p[0] = r.bytes[0]
		r.bytes = r.bytes[1:]
		return 1, nil
	}
	if len(r.errors) > 0 {
		next := r.errors[0]
		r.errors = r.errors[1:]
		return 0, next
	}
	panic("StubReader is all out of bytes and errors")
}