package read import ( "bufio" "fmt" "io" "os" "runtime" "strings" "testing" "unicode/utf8" ) func ExampleNew() { printFirstRuneOf := func(input interface{}) { r := New(input) c, w, _ := r.RuneAt(0) fmt.Printf("rune %q, width in bytes = %d\n", c, w) } simpleString := "Ƕello, world!" printFirstRuneOf(simpleString) ioReaderImplementation := strings.NewReader("Good bye, world!") printFirstRuneOf(ioReaderImplementation) bufioReaderPointer := bufio.NewReader(strings.NewReader("Where do we go, world?")) printFirstRuneOf(bufioReaderPointer) bufioReaderValue := *(bufio.NewReader(strings.NewReader("Ɍead the manual!"))) printFirstRuneOf(bufioReaderValue) // Output: // rune 'Ƕ', width in bytes = 2 // rune 'G', width in bytes = 1 // rune 'W', width in bytes = 1 // rune 'Ɍ', width in bytes = 2 } func TestNew_VariousInputTypesCanBeUsed(t *testing.T) { for _, test := range []struct { name string input interface{} }{ {"string", "Hello, world!"}, {"io.Reader", strings.NewReader("Hello, world!")}, {"*bufio.Reader", bufio.NewReader(strings.NewReader("Hello, world!"))}, {"bufio.Reader", *(bufio.NewReader(strings.NewReader("Hello, world!")))}, } { r := New(test.input) firstRune, _, _ := r.RuneAt(0) if firstRune != 'H' { t.Errorf("[%s] first rune not 'H'", test.name) } lastRune, _, _ := r.RuneAt(12) if lastRune != '!' { t.Errorf("[%s] last rune not '!', but %q", test.name, lastRune) } } } func TestNew_UnhandledInputType_Panics(t *testing.T) { assertPanic(t, func() { New(12345) }, "parsekit.read.New(): no support for input of type int") } func TestBuffer_ByteAt(t *testing.T) { r := New(strings.NewReader("Hello, world!")) at := func(i int) byte { b, _ := r.ByteAt(i); return b } result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0)) assertEqual(t, "H!wH", result) } func TestBuffer_RuneAt(t *testing.T) { r := New(strings.NewReader("¡pןɹoʍ 'oןןǝH")) at := func(i int) rune { r, _, _ := r.RuneAt(i); return r } // It is possible to go back and forth while reading the input. result := fmt.Sprintf("%c%c%c%c", at(0), at(5), at(8), at(0)) assertEqual(t, "¡ɹʍ¡", result) } func TestBuffer_ByteAt_endOfFile(t *testing.T) { r := New(strings.NewReader("Hello, world!")) b, err := r.ByteAt(13) result := fmt.Sprintf("%q %s %t", b, err, err == io.EOF) assertEqual(t, "'\\x00' EOF true", result) b, err = r.ByteAt(20) result = fmt.Sprintf("%q %s %t", b, err, err == io.EOF) assertEqual(t, "'\\x00' EOF true", result) } func TestBuffer_RuneAt_endOfFile(t *testing.T) { r := New(strings.NewReader("Hello, world!")) rn, _, err := r.RuneAt(13) result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF) assertEqual(t, "'�' EOF true", result) rn, _, err = r.RuneAt(20) result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF) assertEqual(t, "'�' EOF true", result) } func TestBuffer_RuneAt_invalidRune(t *testing.T) { r := New(strings.NewReader("Hello, \xcdworld!")) at := func(i int) rune { r, _, _ := r.RuneAt(i); return r } result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9)) assertEqual(t, " �wo", result) } func ExampleBuffer_ByteAt() { reader := New(strings.NewReader("Hello, world!")) fmt.Printf("Runes: ") offset := 0 for { r, err := reader.ByteAt(offset) offset++ if err != nil { fmt.Printf("\nErr: %s\n", err) break } fmt.Printf("%c", r) } // Output: // Runes: Hello, world! // Err: EOF } func ExampleBuffer_BytesAt() { reader := New(strings.NewReader("Hello, world!")) b, err := reader.BytesAt(0, 5) fmt.Printf("%s err=%v\n", b, err) b, err = reader.BytesAt(7, 10) fmt.Printf("%s err=%v\n", b, err) b, err = reader.BytesAt(7, 5) fmt.Printf("%s err=%v\n", b, err) // Output: // Hello err= // world! err=EOF // world err= } func ExampleBuffer_RuneAt() { reader := New(strings.NewReader("Hello, pןɹoʍ!")) fmt.Printf("Runes: ") offset := 0 for { r, _, err := reader.RuneAt(offset) offset += utf8.RuneLen(r) if err != nil { fmt.Printf("\nErr: %s\n", err) break } fmt.Printf("%c", r) } // Output: // Runes: Hello, pןɹoʍ! // Err: EOF } // TODO reimplement somewhere, maybe a separate call in the reader or should it be part of a parser? // func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) { // r := New(strings.NewReader("\uFEFFBommetje!")) // b, _, _ := r.RuneAt(0) // o, _, _ := r.RuneAt(1) // m, _, _ := r.RuneAt(2) // bom := fmt.Sprintf("%c%c%c", b, o, m) // assertEqual(t, "Bom", bom) // } func TestBuffer_Flush(t *testing.T) { r := New(strings.NewReader("Hello, world!")) at := func(i int) rune { r, _, _ := r.RuneAt(i); return r } // Fills the buffer with the first 8 runes on the input: "Hello, w" result := fmt.Sprintf("%c", at(7)) assertEqual(t, "w", result) // Now flush the first 4 runes from the buffer (dropping "Hell" from it) r.Flush(4) // Rune 0 is now pointing at what originally was rune offset 4. // We can continue reading from there. result = fmt.Sprintf("%c%c%c%c%c%c", at(0), at(1), at(2), at(3), at(4), at(5)) assertEqual(t, "o, wor", result) } func ExampleBuffer_Flush() { r := New(strings.NewReader("dog eat dog!")) at := func(offset int) rune { c, _, _ := r.RuneAt(offset); return c } // Read from the first 4 runes of the input. fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3)) // Flush those 4 runes, bringing offset 0 to the start of "eat dog". r.Flush(4) // Read another 4 runes, because of the flushing, we start at offset 0. fmt.Printf("%c%c%c%c", at(1), at(2), at(0), at(3)) // We might even read some more runes. That is no problem. at(4) at(5) // Again, flush 4 runes, bringing offset 0 to the start of "dog!". r.Flush(4) // Read from the remainder runes. fmt.Printf("%c%c%c%c%c", at(2), at(1), at(1), at(0), at(3)) // Output: // dog ate good! } func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) { r := New(strings.NewReader("Hello, world!")) // Fill buffer with "Hello, world!", the first 13 runes. rn, _, _ := r.RuneAt(12) assertEqual(t, '!', rn) // However, we flush 14 runes, which exceeds the buffer size. assertPanic(t, func() { r.Flush(14) }, "parsekit.read.Buffer.Flush(): number of bytes to flush (14) exceeds size of the buffered data (13)") } func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) { r := New(strings.NewReader("Hello, world!")) _, _, err := r.RuneAt(13) assertEqual(t, err.Error(), "EOF") _, _, err = r.RuneAt(13) assertEqual(t, err.Error(), "EOF") _, _, err = r.RuneAt(14) assertEqual(t, err.Error(), "EOF") r.Flush(13) _, _, err = r.RuneAt(0) assertEqual(t, err.Error(), "EOF") _, _, err = r.RuneAt(1) assertEqual(t, err.Error(), "EOF") _, _, err = r.RuneAt(2) assertEqual(t, err.Error(), "EOF") } // In this test, I want to make sure that once a Buffer returns an error, // that error is cached and will be returned when data for the offset where // the error occurred is read at a later time. func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) { input := &StubReader{ bytes: []byte{'a', 'b', 'c', 'd'}, errors: []error{ io.EOF, io.ErrUnexpectedEOF, // This error must never popup in the tests below. }, } r := New(input) // Read the last availble rune. readRune, _, _ := r.RuneAt(3) assertEqual(t, 'd', readRune) // Reading the next offset must result in the io.EOF error from the stub. readRune, _, err := r.RuneAt(4) assertEqual(t, utf8.RuneError, readRune) assertEqual(t, io.EOF, err) // Reading even further should yield the same io.EOF error. readRune, _, err = r.RuneAt(5) assertEqual(t, utf8.RuneError, readRune) assertEqual(t, io.EOF, err) // After an error, we must still be able to read the last rune. readRune, _, _ = r.RuneAt(3) assertEqual(t, 'd', readRune) // Flushing updates the error index too. r.Flush(3) // The last rune is now at offset 0. readRune, _, _ = r.RuneAt(0) assertEqual(t, 'd', readRune) // The io.EOF is now at offset 1. _, _, err = r.RuneAt(1) assertEqual(t, io.EOF, err) // Let's flush that last rune too. r.Flush(1) // The io.EOF is now at offset 0. _, _, err = r.RuneAt(0) assertEqual(t, io.EOF, err) // And reading beyond that offset also yields io.EOF. _, _, err = r.RuneAt(1) assertEqual(t, io.EOF, err) } func TestInputLargerThanDefaultBufSize(t *testing.T) { input, size := makeLargeStubReader() r := New(input) readRune, _, err := r.RuneAt(0) assertEqual(t, 'A', readRune) readRune, _, err = r.RuneAt(size - 1) assertEqual(t, 'B', readRune) readRune, _, err = r.RuneAt(size) assertEqual(t, io.EOF, err) readRune, _, err = r.RuneAt(10) assertEqual(t, 'K', readRune) } func TestInputLargerThanDefaultBufSize_WithFirstReadLargerThanBufSize(t *testing.T) { input, size := makeLargeStubReader() r := New(input) readRune, _, _ := r.RuneAt(size - 200) assertEqual(t, 'K', readRune) readRune, _, _ = r.RuneAt(size - 1) assertEqual(t, 'B', readRune) } func TestInputLargerThanDefaultBufSize_WithFirstReadToLastByte(t *testing.T) { input, size := makeLargeStubReader() r := New(input) readRune, _, _ := r.RuneAt(size - 1) assertEqual(t, 'B', readRune) } func TestAllocationPatterns(t *testing.T) { input, _ := makeLargeStubReader() buf := New(input) r := &buf // The first read will create the standard buffer and fill it with data. // The first rune is requested, but there's more input data availble, // so the cache is filled up completely. // buffer |xxxx1024xxxxx| assertBuffer(t, "read 1", r, func() { r.RuneAt(0) }, 1024, 0, 1024) rn, _, _ := r.RuneAt(0) assertEqual(t, 'A', rn) // The first 1024 bytes will fit in the standard buffer. // buffer |xxxx1024xxxxx| assertBuffer(t, "read fill cache", r, func() { r.ByteAt(1023) }, 1024, 0, 1024) // Flushing zero input keeps everything as-is. // buffer |xxxx1024xxxxx| assertBuffer(t, "flush zero", r, func() { r.Flush(0) }, 1024, 0, 1024) // Flushing all cached input truncates the buffer. // buffer | 1024 | assertBuffer(t, "flush full cache", r, func() { r.Flush(1024) }, 1024, 0, 0) // Reading 1025 chars will allocate a new store of 2 * 1024 and fill it with data. // Offset 1024 is requested, but there's more input data availble, // so the cache is filled up completely. // buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx| runeBefore, _, _ := r.RuneAt(0) assertBuffer(t, "read cap + 1", r, func() { r.ByteAt(1024) }, 2048, 0, 2048) runeAfter, _, _ := r.RuneAt(0) // The bytes that we had before must be copied to the newly allocated store. assertEqual(t, runeBefore, runeAfter) // A partial flush moves the buffer offset, but the stored data stay the same. // buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx| assertBuffer(t, "flush partial", r, func() { r.Flush(25) }, 2048, 25, 2023) // The capacity for the usable part of the buffer is now 2023 // This number of runes can be read, without triggering a re-allocation. // buffer 25 |xxxxxxxxxxx2023xxxxxxxxxx| assertBuffer(t, "read fill cache after partial flush", r, func() { r.ByteAt(2022) }, 2048, 25, 2023) // Flush the full input. // store | 2048 | // buffer | 2048 | assertBuffer(t, "flush full cache after partial flush", r, func() { r.Flush(2023) }, 2048, 0, 0) // Fill up the store again. // Offset 1234 is requested, but there's more input data availble, // so the cache is filled up completely. // buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx| assertBuffer(t, "fill up the store again", r, func() { r.ByteAt(1234) }, 2048, 0, 2048) // Then flush almost all input. // buffer 2047 |x1x| assertBuffer(t, "flush almost all input", r, func() { r.Flush(2047) }, 2048, 2047, 1) // Read some data beyond the single byte. This moves the single byte at the end to // the start and fills up the rest of the buffer, without a reallocation. // buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxxx| assertBuffer(t, "read the remaining size, triggering a move", r, func() { r.ByteAt(1234) }, 2048, 0, 2048) // Now flush only one rune from the cache. // buffer 1 |xxxxxxxxx2047xxxxxxxxxxxxxx| assertBuffer(t, "flush 1", r, func() { r.Flush(1) }, 2048, 1, 2047) // Now read the full available capacity. This will not fit, so // space has to be made. Since there's 1 free space at the start of the store, // the data are moved to the start and no reallocation is needed. // buffer |xxxxxxxxxxxx2048xxxxxxxxxxxxx| assertBuffer(t, "read full capacity with 1 free byte at start", r, func() { r.ByteAt(2047) }, 2048, 0, 2048) // Now read in the whole rest of the buffer, asking for an offset that is way out of range. // It does allocate enough memory to store 10000 bytes (bringing us to 10240), but while reading it is // detected that there are not enough bytes to fill it. That puts a limit on the amount of data in // the buffer, so the buffer is not completely filled. // buffer |xxxxxxxxxxxxxxx5120xxxxxxxxxxxxxxxxxxxx 10240-5120 | remaining := input.remaining assertBuffer(t, "over-ask", r, func() { r.ByteAt(10000) }, 10240, 0, 2048+remaining) } func makeLargeStubReader() (*StubReader, int) { size := 8192 bytes := make([]byte, size) for i := range bytes { bytes[i] = 'A' + byte(i%26) } return &StubReader{bytes: bytes, errors: []error{io.EOF}, remaining: size}, size } type StubReader struct { bytes []byte errors []error remaining int } func (r *StubReader) Read(p []byte) (n int, err error) { if len(r.bytes) > 0 { head, tail := r.bytes[0], r.bytes[1:] r.bytes = tail p[0] = head r.remaining-- return 1, nil } if len(r.errors) > 0 { head, tail := r.errors[0], r.errors[1:] r.errors = tail return 0, head } panic("StubReader is all out of bytes and errors") } func Benchmark0BytesInputFile(b *testing.B) { processInputFile(b, 0) } func Benchmark100BytesInputFile(b *testing.B) { processInputFile(b, 100) } func Benchmark1024BytesInputFile(b *testing.B) { processInputFile(b, 1024) } func Benchmark2048BytesInputFile(b *testing.B) { processInputFile(b, 2048) } func Benchmark2000000BytesInputFile(b *testing.B) { processInputFile(b, 2000000) } func processInputFile(b *testing.B, testSize int) { for x := 0; x < b.N; x++ { _, filename, _, _ := runtime.Caller(0) path := strings.Replace(filename, "read_test.go", fmt.Sprintf("testfiles/%dbytes.txt", testSize), 1) input, err := os.Open(path) if err != nil { panic(fmt.Sprintf("Cannot open file for test (%v): %s", path, err)) } i := New(input) offset := 0 readSize := 0 flushAt := 1024 for { _, err := i.ByteAt(offset) if err != nil { break } offset++ readSize++ if offset == flushAt { i.Flush(offset) offset = 0 // So we flush full buffer sizes and partial buffer sizes to // get more test coverage. if flushAt == 1000 { flushAt = 1024 } else { flushAt = 1000 } } if readSize > testSize { b.Fatalf("Test input is %d bytes, but read %d bytes so far!", testSize, readSize) } } if readSize != testSize { b.Fatalf("Expected to read %d bytes, but read %d bytes instead", testSize, readSize) } } }