go-parsekit/read/read_test.go

522 lines
15 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package read
import (
"bufio"
"fmt"
"io"
"os"
"runtime"
"strings"
"testing"
"unicode/utf8"
)
// ExampleNew passes four kinds of input to New (a string, a strings.Reader,
// a *bufio.Reader and a bufio.Reader value) and prints the first rune of each
// together with its width in bytes.
func ExampleNew() {
	show := func(src interface{}) {
		buf := New(src)
		// The error is ignored here: every input below starts with a rune.
		first, width, _ := buf.RuneAt(0)
		fmt.Printf("rune %q, width in bytes = %d\n", first, width)
	}

	show("Ƕello, world!")
	show(strings.NewReader("Good bye, world!"))
	show(bufio.NewReader(strings.NewReader("Where do we go, world?")))
	show(*(bufio.NewReader(strings.NewReader("Ɍead the manual!"))))

	// Output:
	// rune 'Ƕ', width in bytes = 2
	// rune 'G', width in bytes = 1
	// rune 'W', width in bytes = 1
	// rune 'Ɍ', width in bytes = 2
}
// TestNew_VariousInputTypesCanBeUsed creates a buffer from each of the listed
// input types and checks that the first rune (offset 0) and the last rune
// (offset 12) of "Hello, world!" can be read from it.
func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
	const text = "Hello, world!"
	cases := []struct {
		name  string
		input interface{}
	}{
		{"string", text},
		{"io.Reader", strings.NewReader(text)},
		{"*bufio.Reader", bufio.NewReader(strings.NewReader(text))},
		{"bufio.Reader", *(bufio.NewReader(strings.NewReader(text)))},
	}
	for _, tc := range cases {
		buf := New(tc.input)
		if first, _, _ := buf.RuneAt(0); first != 'H' {
			t.Errorf("[%s] first rune not 'H'", tc.name)
		}
		if last, _, _ := buf.RuneAt(12); last != '!' {
			t.Errorf("[%s] last rune not '!', but %q", tc.name, last)
		}
	}
}
func TestNew_UnhandledInputType_Panics(t *testing.T) {
assertPanic(t,
func() { New(12345) },
"parsekit.read.New(): no support for input of type int")
}
// TestBuffer_ByteAt reads single bytes at several offsets, in non-sequential
// order, and checks the bytes that come back.
func TestBuffer_ByteAt(t *testing.T) {
	buf := New(strings.NewReader("Hello, world!"))
	byteAt := func(offset int) byte {
		b, _ := buf.ByteAt(offset)
		return b
	}

	got := fmt.Sprintf("%c%c%c%c", byteAt(0), byteAt(12), byteAt(7), byteAt(0))
	assertEqual(t, "H!wH", got)
}
// TestBuffer_RuneAt reads runes from input that contains multi-byte
// characters, jumping back and forth between offsets.
func TestBuffer_RuneAt(t *testing.T) {
	buf := New(strings.NewReader("¡pןɹoʍ 'oןןǝH"))
	runeAt := func(offset int) rune {
		c, _, _ := buf.RuneAt(offset)
		return c
	}

	// Offsets may be visited in any order, including going back to 0.
	got := fmt.Sprintf("%c%c%c%c", runeAt(0), runeAt(5), runeAt(8), runeAt(0))
	assertEqual(t, "¡ɹʍ¡", got)
}
// TestBuffer_ByteAt_endOfFile checks that ByteAt yields a zero byte and
// io.EOF both directly behind the 13 bytes of input and further beyond it.
func TestBuffer_ByteAt_endOfFile(t *testing.T) {
	buf := New(strings.NewReader("Hello, world!"))
	for _, offset := range []int{13, 20} {
		b, err := buf.ByteAt(offset)
		got := fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
		assertEqual(t, "'\\x00' EOF true", got)
	}
}
// TestBuffer_RuneAt_endOfFile checks that RuneAt yields the replacement rune
// and io.EOF both directly behind the 13 bytes of input and further beyond it.
func TestBuffer_RuneAt_endOfFile(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// At end of input the returned rune is utf8.RuneError (U+FFFD), which %q
	// prints as the quoted replacement character. The escape sequence is used
	// so that the expectation survives tools that mangle the raw character.
	const want = "'\uFFFD' EOF true"

	for _, offset := range []int{13, 20} {
		rn, _, err := r.RuneAt(offset)
		result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
		assertEqual(t, want, result)
	}
}
// TestBuffer_RuneAt_invalidRune reads across a byte (\xcd at offset 7) that
// does not start a valid UTF-8 sequence in this input and checks the runes
// that RuneAt returns around it.
func TestBuffer_RuneAt_invalidRune(t *testing.T) {
	r := New(strings.NewReader("Hello, \xcdworld!"))
	at := func(i int) rune {
		c, _, _ := r.RuneAt(i)
		return c
	}

	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))

	// The invalid byte is expected to come back as the replacement rune
	// U+FFFD. The escape sequence is used so that the expectation survives
	// tools that mangle the raw character.
	assertEqual(t, " \uFFFDwo", result)
}
// ExampleBuffer_ByteAt prints the input one byte at a time, asking ByteAt for
// ever increasing offsets until it reports an error.
func ExampleBuffer_ByteAt() {
	buf := New(strings.NewReader("Hello, world!"))

	fmt.Printf("Runes: ")
	for pos := 0; ; pos++ {
		b, err := buf.ByteAt(pos)
		if err != nil {
			fmt.Printf("\nErr: %s\n", err)
			break
		}
		fmt.Printf("%c", b)
	}

	// Output:
	// Runes: Hello, world!
	// Err: EOF
}
// ExampleBuffer_BytesAt reads three ranges of bytes from the input. The second
// range asks for more bytes than the input has left, which yields the
// available bytes together with an EOF error.
func ExampleBuffer_BytesAt() {
	buf := New(strings.NewReader("Hello, world!"))
	report := func(data interface{}, err error) {
		fmt.Printf("%s err=%v\n", data, err)
	}

	report(buf.BytesAt(0, 5))
	report(buf.BytesAt(7, 10))
	report(buf.BytesAt(7, 5))

	// Output:
	// Hello err=<nil>
	// world! err=EOF
	// world err=<nil>
}
// ExampleBuffer_RuneAt prints the input rune by rune. After every rune the
// offset is advanced by the encoded length of that rune, so multi-byte
// characters are stepped over as a whole.
func ExampleBuffer_RuneAt() {
	buf := New(strings.NewReader("Hello, pןɹoʍ!"))

	fmt.Printf("Runes: ")
	pos := 0
	for {
		c, _, err := buf.RuneAt(pos)
		if err != nil {
			fmt.Printf("\nErr: %s\n", err)
			break
		}
		fmt.Printf("%c", c)
		pos += utf8.RuneLen(c)
	}

	// Output:
	// Runes: Hello, pןɹoʍ!
	// Err: EOF
}
// TODO reimplement somewhere, maybe a separate call in the reader or should it be part of a parser?
// func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
// r := New(strings.NewReader("\uFEFFBommetje!"))
// b, _, _ := r.RuneAt(0)
// o, _, _ := r.RuneAt(1)
// m, _, _ := r.RuneAt(2)
// bom := fmt.Sprintf("%c%c%c", b, o, m)
// assertEqual(t, "Bom", bom)
// }
// TestBuffer_Flush checks that, after flushing, offset 0 refers to the first
// rune that was not flushed and that reading can continue from there.
func TestBuffer_Flush(t *testing.T) {
	buf := New(strings.NewReader("Hello, world!"))
	runeAt := func(offset int) rune {
		c, _, _ := buf.RuneAt(offset)
		return c
	}

	// Read at offset 7, so that at least "Hello, w" has gone through the buffer.
	assertEqual(t, "w", fmt.Sprintf("%c", runeAt(7)))

	// Drop "Hell" from the front of the buffer.
	buf.Flush(4)

	// Offset 0 now points at what used to be offset 4.
	var got strings.Builder
	for offset := 0; offset <= 5; offset++ {
		fmt.Fprintf(&got, "%c", runeAt(offset))
	}
	assertEqual(t, "o, wor", got.String())
}
// ExampleBuffer_Flush reads runes in a shuffled order from three consecutive
// four-rune windows of the input, flushing the buffer between the windows.
func ExampleBuffer_Flush() {
	buf := New(strings.NewReader("dog eat dog!"))
	show := func(offsets ...int) {
		for _, offset := range offsets {
			c, _, _ := buf.RuneAt(offset)
			fmt.Printf("%c", c)
		}
	}

	// The first window holds "dog ".
	show(0, 1, 2, 3)

	// After flushing 4 runes, offset 0 is the start of "eat dog!".
	buf.Flush(4)
	show(1, 2, 0, 3)

	// Reading ahead of the window that gets flushed next is no problem.
	buf.RuneAt(4)
	buf.RuneAt(5)

	// After flushing another 4 runes, offset 0 is the start of "dog!".
	buf.Flush(4)
	show(2, 1, 1, 0, 3)

	// Output:
	// dog ate good!
}
// TestGivenNumberOfRunesTooHigh_Flush_Panics checks that flushing more than
// the amount of buffered data results in a panic with the expected message.
func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
	buf := New(strings.NewReader("Hello, world!"))

	// Reading the last rune brings all 13 runes of the input into the buffer.
	last, _, _ := buf.RuneAt(12)
	assertEqual(t, '!', last)

	// Flushing 14 is one more than what was buffered.
	flushTooMuch := func() { buf.Flush(14) }
	assertPanic(t, flushTooMuch,
		"parsekit.read.Buffer.Flush(): number of bytes to flush (14) exceeds size of the buffered data (13)")
}
// TestGivenEOFFollowedByFlush_EOFCanStillBeRead checks that io.EOF is
// reported at and beyond the end of the 13-byte input, and that it is still
// reported after all of that input has been flushed from the buffer.
//
// The error is compared against io.EOF itself instead of calling err.Error():
// a nil error then fails the assertion rather than crashing the test with a
// nil dereference.
func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) {
	r := New(strings.NewReader("Hello, world!"))

	// Before flushing: the same offset twice, then one offset further.
	for _, offset := range []int{13, 13, 14} {
		_, _, err := r.RuneAt(offset)
		assertEqual(t, io.EOF, err)
	}

	r.Flush(13)

	// After flushing, the end of the input has moved to offset 0.
	for _, offset := range []int{0, 1, 2} {
		_, _, err := r.RuneAt(offset)
		assertEqual(t, io.EOF, err)
	}
}
// TestGivenErrorFromBuffer_ErrorIsCached makes sure that an error returned by
// the input is remembered: reading the offset where the error occurred (or
// any offset behind it) at a later time must return that same error again,
// also after data have been flushed from the buffer.
func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) {
	stub := &StubReader{
		bytes: []byte{'a', 'b', 'c', 'd'},
		errors: []error{
			io.EOF,
			io.ErrUnexpectedEOF, // This error must never pop up in the checks below.
		},
	}
	buf := New(stub)

	// The last available rune sits at offset 3.
	got, _, _ := buf.RuneAt(3)
	assertEqual(t, 'd', got)

	// One offset further, the io.EOF from the stub must be reported.
	got, _, err := buf.RuneAt(4)
	assertEqual(t, utf8.RuneError, got)
	assertEqual(t, io.EOF, err)

	// Going even further must report that very same io.EOF.
	got, _, err = buf.RuneAt(5)
	assertEqual(t, utf8.RuneError, got)
	assertEqual(t, io.EOF, err)

	// The error does not get in the way of reading the last rune again.
	got, _, _ = buf.RuneAt(3)
	assertEqual(t, 'd', got)

	// Flushing shifts the offset of the error along with the data.
	buf.Flush(3)

	// The last rune has moved to offset 0 ...
	got, _, _ = buf.RuneAt(0)
	assertEqual(t, 'd', got)

	// ... and the io.EOF to offset 1.
	_, _, err = buf.RuneAt(1)
	assertEqual(t, io.EOF, err)

	// Flush the last rune as well.
	buf.Flush(1)

	// That leaves the io.EOF at offset 0.
	_, _, err = buf.RuneAt(0)
	assertEqual(t, io.EOF, err)

	// Offsets behind it yield io.EOF too.
	_, _, err = buf.RuneAt(1)
	assertEqual(t, io.EOF, err)
}
// TestInputLargerThanDefaultBufSize reads the first and the last rune of the
// large stub input, checks for io.EOF directly behind it and then goes back
// to an offset near the start.
func TestInputLargerThanDefaultBufSize(t *testing.T) {
	stub, size := makeLargeStubReader()
	buf := New(stub)

	first, _, _ := buf.RuneAt(0)
	assertEqual(t, 'A', first)

	last, _, _ := buf.RuneAt(size - 1)
	assertEqual(t, 'B', last)

	_, _, err := buf.RuneAt(size)
	assertEqual(t, io.EOF, err)

	early, _, _ := buf.RuneAt(10)
	assertEqual(t, 'K', early)
}
// TestInputLargerThanDefaultBufSize_WithFirstReadLargerThanBufSize starts
// with a read 200 bytes before the end of the large stub input and follows
// it up with a read of the very last rune.
func TestInputLargerThanDefaultBufSize_WithFirstReadLargerThanBufSize(t *testing.T) {
	stub, size := makeLargeStubReader()
	buf := New(stub)

	nearEnd, _, _ := buf.RuneAt(size - 200)
	assertEqual(t, 'K', nearEnd)

	last, _, _ := buf.RuneAt(size - 1)
	assertEqual(t, 'B', last)
}
// TestInputLargerThanDefaultBufSize_WithFirstReadToLastByte makes the very
// first read on the large stub input a read of its last rune.
func TestInputLargerThanDefaultBufSize_WithFirstReadToLastByte(t *testing.T) {
	stub, size := makeLargeStubReader()
	buf := New(stub)

	last, _, _ := buf.RuneAt(size - 1)
	assertEqual(t, 'B', last)
}
// TestAllocationPatterns walks a buffer through a fixed sequence of reads and
// flushes on the large stub input. After every step, assertBuffer is handed
// three expected numbers.
// NOTE(review): judging by the sketches in the comments below, these numbers
// are the capacity of the store, the start offset of the buffered data and
// the length of the buffered data - confirm against assertBuffer.
func TestAllocationPatterns(t *testing.T) {
input, _ := makeLargeStubReader()
buf := New(input)
r := &buf
// The first read will create the standard buffer and fill it with data.
// The first rune is requested, but there's more input data available,
// so the cache is filled up completely.
// buffer |xxxx1024xxxxx|
assertBuffer(t, "read 1", r, func() { r.RuneAt(0) }, 1024, 0, 1024)
rn, _, _ := r.RuneAt(0)
assertEqual(t, 'A', rn)
// The first 1024 bytes will fit in the standard buffer.
// buffer |xxxx1024xxxxx|
assertBuffer(t, "read fill cache", r, func() { r.ByteAt(1023) }, 1024, 0, 1024)
// Flushing zero input keeps everything as-is.
// buffer |xxxx1024xxxxx|
assertBuffer(t, "flush zero", r, func() { r.Flush(0) }, 1024, 0, 1024)
// Flushing all cached input truncates the buffer.
// buffer | 1024 |
assertBuffer(t, "flush full cache", r, func() { r.Flush(1024) }, 1024, 0, 0)
// Reading 1025 chars will allocate a new store of 2 * 1024 and fill it with data.
// Offset 1024 is requested, but there's more input data available,
// so the cache is filled up completely.
// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
runeBefore, _, _ := r.RuneAt(0)
assertBuffer(t, "read cap + 1", r, func() { r.ByteAt(1024) }, 2048, 0, 2048)
runeAfter, _, _ := r.RuneAt(0)
// The bytes that we had before must be copied to the newly allocated store.
assertEqual(t, runeBefore, runeAfter)
// A partial flush moves the buffer offset, but the stored data stay the same.
// buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx|
assertBuffer(t, "flush partial", r, func() { r.Flush(25) }, 2048, 25, 2023)
// The capacity for the usable part of the buffer is now 2023.
// This number of runes can be read, without triggering a re-allocation.
// buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx|
assertBuffer(t, "read fill cache after partial flush", r, func() { r.ByteAt(2022) }, 2048, 25, 2023)
// Flush the full input.
// store | 2048 |
// buffer | 2048 |
assertBuffer(t, "flush full cache after partial flush", r, func() { r.Flush(2023) }, 2048, 0, 0)
// Fill up the store again.
// Offset 1234 is requested, but there's more input data available,
// so the cache is filled up completely.
// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
assertBuffer(t, "fill up the store again", r, func() { r.ByteAt(1234) }, 2048, 0, 2048)
// Then flush almost all input.
// buffer 2047 |x1x|
assertBuffer(t, "flush almost all input", r, func() { r.Flush(2047) }, 2048, 2047, 1)
// Read some data beyond the single byte. This moves the single byte at the end to
// the start and fills up the rest of the buffer, without a reallocation.
// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx|
assertBuffer(t, "read the remaining size, triggering a move", r, func() { r.ByteAt(1234) }, 2048, 0, 2048)
// Now flush only one rune from the cache.
// buffer 1 |xxxxxxxxx2047xxxxxxxxxxxxxx|
assertBuffer(t, "flush 1", r, func() { r.Flush(1) }, 2048, 1, 2047)
// Now read the full available capacity. This will not fit, so
// space has to be made. Since there's 1 free space at the start of the store,
// the data are moved to the start and no reallocation is needed.
// buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxx|
assertBuffer(t, "read full capacity with 1 free byte at start", r, func() { r.ByteAt(2047) }, 2048, 0, 2048)
// Now read in the whole rest of the buffer, asking for an offset that is way out of range.
// It does allocate enough memory to store 10000 bytes (bringing us to 10240), but while reading it is
// detected that there are not enough bytes to fill it. That puts a limit on the amount of data in
// the buffer, so the buffer is not completely filled.
// NOTE(review): the 5120 in the sketch below does not obviously match the asserted
// length of 2048+remaining; it may stem from a different stub input size - confirm.
// buffer |xxxxxxxxxxxxxxx5120xxxxxxxxxxxxxxxxxxxx 10240-5120 |
// The number of bytes the stub has left is captured before the read, because the read drains the stub.
remaining := input.remaining
assertBuffer(t, "over-ask", r, func() { r.ByteAt(10000) }, 10240, 0, 2048+remaining)
}
// makeLargeStubReader returns a StubReader holding 8192 bytes that cycle
// through the letters 'A' to 'Z', followed by a single io.EOF error. The
// second return value is the number of bytes in the stub.
func makeLargeStubReader() (*StubReader, int) {
	const size = 8192

	data := make([]byte, 0, size)
	for len(data) < size {
		data = append(data, 'A'+byte(len(data)%26))
	}

	stub := &StubReader{
		bytes:     data,
		errors:    []error{io.EOF},
		remaining: size,
	}
	return stub, size
}
// StubReader is a scripted reader for tests. It hands out the bytes in its
// bytes field one per Read call and, once those are used up, the errors in
// its errors field, also one per call.
type StubReader struct {
	bytes     []byte  // Bytes that have not been handed out yet.
	errors    []error // Errors to return, in order, once bytes is empty.
	remaining int     // Decremented by one for every byte handed out.
}

// Read stores at most one byte in p.
//
// As long as bytes are left, the next byte is written to p[0] and (1, nil) is
// returned. After that, every call consumes and returns the next scripted
// error with a byte count of 0. A call with an empty p returns (0, nil)
// without consuming anything. Read panics when it is called with a non-empty
// p after both the bytes and the errors have run out.
func (r *StubReader) Read(p []byte) (n int, err error) {
	// There is no room to store a byte in an empty p; indexing p[0] would
	// panic. Leave the scripted bytes and errors untouched in that case.
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.bytes) > 0 {
		p[0] = r.bytes[0]
		r.bytes = r.bytes[1:]
		r.remaining--
		return 1, nil
	}
	if len(r.errors) > 0 {
		next := r.errors[0]
		r.errors = r.errors[1:]
		return 0, next
	}
	panic("StubReader is all out of bytes and errors")
}
// Benchmark0BytesInputFile runs processInputFile with a test size of 0 bytes.
func Benchmark0BytesInputFile(b *testing.B) {
processInputFile(b, 0)
}
// Benchmark100BytesInputFile runs processInputFile with a test size of 100 bytes.
func Benchmark100BytesInputFile(b *testing.B) {
processInputFile(b, 100)
}
// Benchmark1024BytesInputFile runs processInputFile with a test size of 1024 bytes.
func Benchmark1024BytesInputFile(b *testing.B) {
processInputFile(b, 1024)
}
// Benchmark2048BytesInputFile runs processInputFile with a test size of 2048 bytes.
func Benchmark2048BytesInputFile(b *testing.B) {
processInputFile(b, 2048)
}
// Benchmark2000000BytesInputFile runs processInputFile with a test size of 2000000 bytes.
func Benchmark2000000BytesInputFile(b *testing.B) {
processInputFile(b, 2000000)
}
// processInputFile benchmarks reading the fixture file
// testfiles/<testSize>bytes.txt, located next to this source file, byte by
// byte through a buffer created with New.
//
// The fixture is read b.N times. The benchmark fails when the fixture cannot
// be opened or when the number of bytes read from it differs from testSize.
func processInputFile(b *testing.B, testSize int) {
	// The fixture path is the same for every iteration, so it is resolved
	// once, outside of the benchmark loop.
	_, filename, _, ok := runtime.Caller(0)
	if !ok {
		b.Fatal("Cannot determine the location of the test source file")
	}
	path := strings.Replace(filename, "read_test.go", fmt.Sprintf("testfiles/%dbytes.txt", testSize), 1)

	for x := 0; x < b.N; x++ {
		processInputFileOnce(b, path, testSize)
	}
}

// processInputFileOnce performs a single benchmark iteration: it opens the
// file at path, reads it to the end with ByteAt while flushing the buffer at
// alternating intervals of 1024 and 1000 bytes, and verifies that exactly
// testSize bytes were read.
func processInputFileOnce(b *testing.B, path string, testSize int) {
	input, err := os.Open(path)
	if err != nil {
		b.Fatalf("Cannot open file for test (%v): %s", path, err)
	}
	// Close the file when the iteration is done. Without this, every
	// iteration leaks an open file, which adds up quickly for a large b.N.
	defer input.Close()

	buf := New(input)
	offset := 0
	readSize := 0
	flushAt := 1024
	for {
		// Any error (normally io.EOF) ends the read loop; the size check
		// below catches input that ended early.
		if _, err := buf.ByteAt(offset); err != nil {
			break
		}
		offset++
		readSize++
		if offset == flushAt {
			buf.Flush(offset)
			offset = 0
			// So we flush full buffer sizes and partial buffer sizes to
			// get more test coverage.
			if flushAt == 1000 {
				flushAt = 1024
			} else {
				flushAt = 1000
			}
		}
		if readSize > testSize {
			b.Fatalf("Test input is %d bytes, but read %d bytes so far!", testSize, readSize)
		}
	}
	if readSize != testSize {
		b.Fatalf("Expected to read %d bytes, but read %d bytes instead", testSize, readSize)
	}
}