From d4492e4f0ad2a543ba03ca002bac9445b9965096 Mon Sep 17 00:00:00 2001
From: Maurice Makaay
Date: Mon, 15 Jul 2019 20:03:05 +0000
Subject: [PATCH] Bytes reader working, now carry on switching to byte reading in the tokenizer code.

---
 read/read.go                 | 244 ++++++++++++++++++++---------------
 read/read_test.go            | 173 ++++++++++++++++---------
 tokenize/api.go              |  32 +++--
 tokenize/api_test.go         |   8 +-
 tokenize/handlers_builtin.go | 110 ++++++++--------
 5 files changed, 331 insertions(+), 236 deletions(-)

diff --git a/read/read.go b/read/read.go
index 33f6875..e84fadf 100644
--- a/read/read.go
+++ b/read/read.go
@@ -4,27 +4,29 @@
 // Let's say we've got the following input coming up in the io.Reader that is
 // wrapped by the Reader:
 //
-//   |H|e|l|l|o|,| |w|o|r|l|d|!|   <-- runes
-//    0           6          12    <-- rune offset
+//   |H|e|l|l|o|,| |w|o|r|l|d|!|   <-- bytes
+//    0           6          12    <-- byte offset
 //
-// The Reader can now be used to retrieve runes from the input, based on their
-// offset, using RuneAt(offset). Normally these runes will be retrieved in
-// sequence, but that is not a requirement. Let's say we retrieve the rune with
-// offset 6 from the input (the 'w'), then the Reader buffer be filled with runes
-// from the io.Reader until there are enough runes available to return the rune
-// for offset 6:
+// The Reader can now be used to retrieve data from the input, based on their
+// byte offset, e.g. using RuneAt(offset) or ByteAt(offset). Normally these data
+// will be retrieved in sequence by the user of this code, but that is not a
+// requirement. Let's say we retrieve the byte with offset 6 from the input
+// (the 'w'), then the Reader buffer will be filled with bytes from the io.Reader
+// until there are enough bytes available to return the byte for offset 6:
 //
 //   |H|e|l|l|o| |w|
 //    0         6
 //
-// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
-// in the Reader buffer, then the buffered rune is returned. If you request one
-// that is not in the buffer, then the buffer will be expanded.
+// This means that you can retrieve data for arbitrary offsets. If you request
+// an offset that is already in the Reader buffer, then the buffered data are
+// returned. If you request one that is not in the buffer, then the buffer will
+// be expanded.
 //
-// To make this into a sliding window, the Reader provides the method
-// Flush(numberOfRunes). This method will drop the provided number of runes from
-// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
-// then the Reader buffer would become:
+// To make this into a sliding window (preserving memory space while scanning
+// the input data), the Reader provides the method Flush(numberOfBytes).
+// This method will drop the provided number of bytes from the Reader buffer.
+// So when we'd do a Flush(3) on the example buffer from above, then the Reader
+// buffer would become:
 //
 //   |l|o| |w|
 //    0     3
@@ -79,32 +81,33 @@ func makeBufioReader(input interface{}) *bufio.Reader {
 }
 
 // Buffer wraps around a bufio.Reader and provides an additional layer of
-// buffering that allows us to read the same runes over and over again.
+// buffering that allows us to read the same data over and over again.
 // This is useful for implementing a parser that must be able to do lookahead
 // on the input (returning to the original input position after finishing
 // that lookahead).
 //
-// To minimze memory use, it is also possible to flush the read buffer when there is
-// no more need to go back to previously read runes.
+// To minimize memory use, it is also possible to flush the read buffer when there is
+// no more need to go back to previously read data.
 //
-// The parserkit.reader.Reader is used internally by tokenize.API.
+// This parserkit.reader.Reader is used internally by tokenize.API.
 type Buffer struct {
-    bufio         *bufio.Reader // used for ReadRune()
-    store         []rune        // buffer store, the buffer field is a slice on top of this one
-    buffer        []rune        // input buffer, holding runes that were read from input
-    err           error         // a read error, if one occurred
-    errOffset     int           // the offset in the buffer at which the read error was encountered
-    firstReadDone bool          // whether or not the first read was done
+    bufio     *bufio.Reader // used for ReadRune()
+    store     []byte        // buffer store, the buffer field is a slice on top of this one
+    buffer    []byte        // input buffer, holding runes that were read from input
+    err       error         // a read error, if one occurred
+    errOffset int           // the offset in the buffer at which the read error was encountered
 }
 
-// RuneAt reads the rune at the provided rune offset.
+// RuneAt reads the rune at the provided byte offset.
 //
-// This offset is relative to the current starting position of the Buffer.
+// The offset is relative to the current starting position of the Buffer.
 // When starting reading, offset 0 will point at the start of the input.
-// After flushing, offset 0 will point at the input up to where the flush was done.
+// After flushing, offset 0 will point at the input up to where the flush
+// was done.
 //
-// The error return value will be nil when reading was successful.
-// When an invalid rune is encountered on the input, the error will be nil,
+// When reading was successful, the rune and the width of the rune in bytes
+// will be returned. The returned error will be nil.
+// When an invalid UTF8 rune is encountered on the input, the error will be nil,
 // but the rune will be utf8.RuneError
 //
 // When reading failed, the rune will be utf8.RuneError and the error will
@@ -113,48 +116,83 @@ type Buffer struct {
 //
 // Once a read error is encountered, that same read error is guaranteed to
 // be returned on every subsequent read at or beyond the provided offset.
-func (r *Buffer) RuneAt(offset int) (rune, error) {
-    // Re-issue a previously seen read error.
-    if r.err != nil && offset >= r.errOffset {
-        return utf8.RuneError, r.err
+func (buf *Buffer) RuneAt(offset int) (rune, int, error) {
+    // Shortcut: re-issue a previously seen read error.
+    if buf.err != nil && offset >= buf.errOffset {
+        return utf8.RuneError, 0, buf.err
     }
-    // Rune at provided offset is not yet available in the input buffer.
-    // Read runes until we have enough runes to satisfy the offset.
-    l := len(r.buffer)
-
-    // Number of runes to add to the buffer to have enough space to store
-    // the rune at the offset
-    n := offset - l + 1
-
-    if n > 0 {
-        r.grow(n)
-        var readRune rune
-        var err error
-        for writeAt := l; writeAt <= offset; writeAt++ {
-            readRune, _, err = r.bufio.ReadRune()
-
-            // Skip BOM.
-            if !r.firstReadDone {
-                r.firstReadDone = true
-                if readRune == '\uFEFF' {
-                    writeAt--
-                    continue
-                }
-            }
-
-            // Handle errors.
+    // Compute the number of bytes that we need in the buffer to be able
+    // to return the rune at the provided byte offset.
+    bufferLen := len(buf.buffer)
+    requiredLen := offset + utf8.UTFMax
+    if requiredLen > bufferLen && buf.err == nil {
+        buf.grow(requiredLen)
+        for writeAt := bufferLen; writeAt < requiredLen; writeAt++ {
+            b, err := buf.bufio.ReadByte()
             if err != nil {
-                r.err = err
-                r.errOffset = writeAt
-                return utf8.RuneError, err
+                buf.err = err
+                buf.errOffset = writeAt
+                buf.buffer = buf.buffer[:writeAt]
+                break
             }
-
-            r.buffer[writeAt] = readRune
+            buf.buffer[writeAt] = b
         }
-        return readRune, nil
     }
-    return r.buffer[offset], nil
+
+    if buf.err != nil && offset >= buf.errOffset {
+        return utf8.RuneError, 0, buf.err
+    }
+
+    r, w := utf8.DecodeRune(buf.buffer[offset:])
+    return r, w, nil
+}
+
+// ByteAt reads the byte at the provided byte offset.
+//
+// The offset is relative to the current starting position of the Buffer.
+// When starting reading, offset 0 will point at the start of the input.
+// After flushing, offset 0 will point at the input up to where the flush
+// was done.
+//
+// When reading was successful, the byte will be returned. The returned
+// error will be nil.
+//
+// When reading failed, the byte will be 0x00 and the error will
+// not be nil. One special read fail is actually a normal situation: end
+// of file reached. In that case, the returned error will be io.EOF.
+//
+// Once a read error is encountered, that same read error is guaranteed to
+// be returned on every subsequent read at or beyond the provided offset.
+func (buf *Buffer) ByteAt(offset int) (byte, error) {
+    // Shortcut: re-issue a previously seen read error.
+    if buf.err != nil && offset >= buf.errOffset {
+        return 0, buf.err
+    }
+
+    // Compute the number of bytes that we need in the buffer to be able
+    // to return the byte at the provided byte offset.
+    bufferLen := len(buf.buffer)
+    requiredLen := offset + 1
+    if requiredLen > bufferLen && buf.err == nil {
+        buf.grow(requiredLen)
+        for writeAt := bufferLen; writeAt < requiredLen; writeAt++ {
+            b, err := buf.bufio.ReadByte()
+            if err != nil {
+                buf.err = err
+                buf.errOffset = writeAt
+                buf.buffer = buf.buffer[:writeAt]
+                break
+            }
+            buf.buffer[writeAt] = b
+        }
+    }
+
+    if buf.err != nil && offset >= buf.errOffset {
+        return 0, buf.err
+    }
+
+    return buf.buffer[offset], nil
 }
 
 // The upcoming code was inspired heavily by the Go built-in 'bytes' package.
@@ -168,82 +206,80 @@ var ErrTooLarge = errors.New("parsekit.read.Buffer: too large")
 
 // grow grows the buffer to guarantee space for n more bytes.
 // It returns the index where bytes should be written.
 // If the buffer can't grow it will panic with ErrTooLarge.
-func (r *Buffer) grow(n int) {
+func (buf *Buffer) grow(requiredSize int) {
     // Instantiate new buffer store
-    if r.store == nil {
+    if buf.store == nil {
         b := smallBufferSize
-        if b < n {
-            b = n
+        if b < requiredSize {
+            b = requiredSize
         }
-        r.store = make([]rune, 0, b)
-        r.buffer = r.store[:n]
+        buf.store = make([]byte, 0, b)
+        buf.buffer = buf.store[:requiredSize]
         return
     }
-    lenBuffer := len(r.buffer)
-    capBuffer := cap(r.buffer)
-    freeBuffer := capBuffer - lenBuffer
-    newSize := lenBuffer + n
+    capBuffer := cap(buf.buffer)
 
     // Grow the buffer store by reslicing within the available capacity.
-    if freeBuffer >= n {
-        r.buffer = r.buffer[:newSize]
+    if capBuffer >= requiredSize {
+        buf.buffer = buf.buffer[:requiredSize]
         return
     }
 
-    capStore := cap(r.store)
+    capStore := cap(buf.store)
     freeAtStartOfStore := capStore - capBuffer
 
     // Grow the buffer by moving the data to the start of the store.
     // Note: according to the spec, overlapping slices are allowed with copy().
-    if freeAtStartOfStore > 0 && newSize <= capStore {
-        r.store = r.store[0:newSize]
-        copy(r.store, r.buffer)
-        r.buffer = r.store[:newSize]
-        r.store = r.store[:0]
+    if freeAtStartOfStore > 0 && requiredSize <= capStore {
+        buf.store = buf.store[0:requiredSize]
+        copy(buf.store, buf.buffer)
+        buf.buffer = buf.store[:requiredSize]
+        buf.store = buf.store[:0]
         return
     }
 
     // Grow the buffer store by allocating a new one and copying the data.
-    buf := makeSlice(2*capStore + n)
-    copy(buf, r.buffer)
-    r.store = buf
-    r.buffer = r.store[:newSize]
+    newStore := makeSlice(2*capStore + requiredSize)
+    copy(newStore, buf.buffer)
+    buf.store = newStore
+    buf.buffer = buf.store[:requiredSize]
 }
 
 // makeSlice allocates a slice of size n. If the allocation fails, it panics
 // with ErrTooLarge.
-func makeSlice(n int) []rune {
+func makeSlice(n int) []byte {
     // If the make fails, give a known error.
     defer func() {
         if recover() != nil {
             panic(ErrTooLarge)
         }
     }()
-    return make([]rune, 0, n)
+    return make([]byte, 0, n)
 }
 
-// Flush deletes the provided number of runes from the start of the Buffer.
+// Flush deletes the provided number of bytes from the start of the Buffer.
 // After flushing the Buffer, offset 0 as used by RuneAt() will point to
 // the rune that comes after the runes that were flushed.
 // So what this basically does, is turn the Buffer into a sliding window.
-func (r *Buffer) Flush(numberOfRunes int) {
-    l := len(r.buffer)
-    if numberOfRunes > l {
+func (buf *Buffer) Flush(numberOfBytes int) {
+    if numberOfBytes == 0 {
+        return
+    }
+
+    bufferLen := len(buf.buffer)
+    if numberOfBytes > bufferLen {
         panic(fmt.Sprintf(
             "parsekit.read.Buffer.Flush(): number of runes to flush (%d) "+
-                "exceeds size of the buffer (%d)", numberOfRunes, l))
+                "exceeds size of the buffer (%d)", numberOfBytes, bufferLen))
     }
-    if numberOfRunes == 0 {
+    if bufferLen == numberOfBytes {
+        buf.buffer = buf.store[:0]
+        buf.errOffset = 0
         return
     }
-    if l == numberOfRunes {
-        r.buffer = r.store[:0]
-        r.errOffset = 0
-        return
-    }
-    r.buffer = r.buffer[numberOfRunes:]
-    if r.err != nil {
-        r.errOffset = r.errOffset - numberOfRunes
+    buf.buffer = buf.buffer[numberOfBytes:]
+    if buf.err != nil {
+        buf.errOffset = buf.errOffset - numberOfBytes
     }
 }
 
diff --git a/read/read_test.go b/read/read_test.go
index 6f18d8c..5548b7d 100644
--- a/read/read_test.go
+++ b/read/read_test.go
@@ -12,8 +12,8 @@ import (
 func ExampleNew() {
     printFirstRuneOf := func(input interface{}) {
         r := New(input)
-        c, _ := r.RuneAt(0)
-        fmt.Printf("%q\n", c)
+        c, w, _ := r.RuneAt(0)
+        fmt.Printf("rune %q, width %d\n", c, w)
     }
 
     simpleString := "Hello, world!"
@@ -25,14 +25,14 @@ func ExampleNew() {
     bufioReaderPointer := bufio.NewReader(strings.NewReader("Where do we go, world?"))
     printFirstRuneOf(bufioReaderPointer)
 
-    bufioReaderValue := *(bufio.NewReader(strings.NewReader("Where do we go, world?")))
+    bufioReaderValue := *(bufio.NewReader(strings.NewReader("Ɍead the manual!")))
     printFirstRuneOf(bufioReaderValue)
 
     // Output:
-    // 'H'
-    // 'G'
-    // 'W'
-    // 'W'
+    // rune 'H', width 1
+    // rune 'G', width 1
+    // rune 'W', width 1
+    // rune 'Ɍ', width 2
 }
 
 func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
@@ -46,13 +46,13 @@ func TestNew_VariousInputTypesCanBeUsed(t *testing.T) {
         {"bufio.Reader", *(bufio.NewReader(strings.NewReader("Hello, world!")))},
     } {
         r := New(test.input)
-        firstRune, _ := r.RuneAt(0)
+        firstRune, _, _ := r.RuneAt(0)
         if firstRune != 'H' {
             t.Errorf("[%s] first rune not 'H'", test.name)
         }
-        lastRune, _ := r.RuneAt(12)
+        lastRune, _, _ := r.RuneAt(12)
         if lastRune != '!' {
-            t.Errorf("[%s] last rune not '!'", test.name)
+            t.Errorf("[%s] last rune not '!', but %q", test.name, lastRune)
         }
     }
 }
@@ -63,41 +63,63 @@ func TestNew_UnhandledInputType_Panics(t *testing.T) {
         "parsekit.read.New(): no support for input of type int")
 }
 
-func TestBuffer_RuneAt(t *testing.T) {
+func TestBuffer_ByteAt(t *testing.T) {
     r := New(strings.NewReader("Hello, world!"))
-    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+    at := func(i int) byte { b, _ := r.ByteAt(i); return b }
 
-    // It is possible to go back and forth while reading the input.
     result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
     assertEqual(t, "H!wH", result)
 }
 
+func TestBuffer_RuneAt(t *testing.T) {
+    r := New(strings.NewReader("¡pןɹoʍ 'oןןǝH"))
+    at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
+
+    // It is possible to go back and forth while reading the input.
+    result := fmt.Sprintf("%c%c%c%c", at(0), at(5), at(8), at(0))
+    assertEqual(t, "¡ɹʍ¡", result)
+}
+
+func TestBuffer_ByteAt_endOfFile(t *testing.T) {
+    r := New(strings.NewReader("Hello, world!"))
+
+    b, err := r.ByteAt(13)
+    result := fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
+    assertEqual(t, "'\\x00' EOF true", result)
+
+    b, err = r.ByteAt(20)
+    result = fmt.Sprintf("%q %s %t", b, err, err == io.EOF)
+    assertEqual(t, "'\\x00' EOF true", result)
+}
+
 func TestBuffer_RuneAt_endOfFile(t *testing.T) {
     r := New(strings.NewReader("Hello, world!"))
 
-    rn, err := r.RuneAt(13)
+    rn, _, err := r.RuneAt(13)
     result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
     assertEqual(t, "'�' EOF true", result)
 
-    rn, err = r.RuneAt(20)
+    rn, _, err = r.RuneAt(20)
     result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
     assertEqual(t, "'�' EOF true", result)
 }
 
 func TestBuffer_RuneAt_invalidRune(t *testing.T) {
     r := New(strings.NewReader("Hello, \xcdworld!"))
-    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+    at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
 
     result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
     assertEqual(t, " �wo", result)
 }
 
-func ExampleBuffer_RuneAt() {
+func ExampleBuffer_ByteAt() {
     reader := New(strings.NewReader("Hello, world!"))
 
     fmt.Printf("Runes: ")
-    for i := 0; ; i++ {
-        r, err := reader.RuneAt(i)
+    offset := 0
+    for {
+        r, err := reader.ByteAt(offset)
+        offset++
         if err != nil {
             fmt.Printf("\nErr: %s\n", err)
             break
@@ -110,18 +132,39 @@ func ExampleBuffer_RuneAt() {
     // Err: EOF
 }
 
-func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
-    r := New(strings.NewReader("\uFEFFBommetje!"))
-    b, _ := r.RuneAt(0)
-    o, _ := r.RuneAt(1)
-    m, _ := r.RuneAt(2)
-    bom := fmt.Sprintf("%c%c%c", b, o, m)
-    assertEqual(t, "Bom", bom)
+func ExampleBuffer_RuneAt() {
+    reader := New(strings.NewReader("Hello, pןɹoʍ!"))
+
+    fmt.Printf("Runes: ")
+    offset := 0
+    for {
+        r, w, err := reader.RuneAt(offset)
+        offset += w
+        if err != nil {
+            fmt.Printf("\nErr: %s\n", err)
+            break
+        }
+        fmt.Printf("%c", r)
+    }
+
+    // Output:
+    // Runes: Hello, pןɹoʍ!
+    // Err: EOF
 }
 
+// TODO reimplement somewhere, maybe a separate call in the reader or should it be part of a parser?
+// func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
+//     r := New(strings.NewReader("\uFEFFBommetje!"))
+//     b, _, _ := r.RuneAt(0)
+//     o, _, _ := r.RuneAt(1)
+//     m, _, _ := r.RuneAt(2)
+//     bom := fmt.Sprintf("%c%c%c", b, o, m)
+//     assertEqual(t, "Bom", bom)
+// }
+
 func TestBuffer_Flush(t *testing.T) {
     r := New(strings.NewReader("Hello, world!"))
-    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+    at := func(i int) rune { r, _, _ := r.RuneAt(i); return r }
 
     // Fills the buffer with the first 8 runes on the input: "Hello, w"
     result := fmt.Sprintf("%c", at(7))
@@ -138,7 +181,7 @@ func TestBuffer_Flush(t *testing.T) {
 
 func ExampleBuffer_Flush() {
     r := New(strings.NewReader("dog eat dog!"))
-    at := func(offset int) rune { c, _ := r.RuneAt(offset); return c }
+    at := func(offset int) rune { c, _, _ := r.RuneAt(offset); return c }
 
     // Read from the first 4 runes of the input.
     fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3))
@@ -162,30 +205,31 @@ func ExampleBuffer_Flush() {
 
 func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
     r := New(strings.NewReader("Hello, world!"))
 
-    // Fill buffer with "Hello, worl", the first 11 runes.
-    r.RuneAt(10)
+    // Fill buffer with "Hello, world!", the first 13 runes.
+    rn, _, _ := r.RuneAt(12)
+    assertEqual(t, '!', rn)
 
-    // However, we flush 12 runes, which exceeds the buffer size.
+    // However, we flush 14 runes, which exceeds the buffer size.
     assertPanic(t,
-        func() { r.Flush(12) },
+        func() { r.Flush(14) },
         "parsekit.read.Buffer.Flush(): number of runes to flush "+
-            "(12) exceeds size of the buffer (11)")
+            "(14) exceeds size of the buffer (13)")
 }
 
 func TestGivenEOFFollowedByFlush_EOFCanStillBeRead(t *testing.T) {
     r := New(strings.NewReader("Hello, world!"))
 
-    _, err := r.RuneAt(13)
+    _, _, err := r.RuneAt(13)
     assertEqual(t, err.Error(), "EOF")
-    _, err = r.RuneAt(13)
+    _, _, err = r.RuneAt(13)
     assertEqual(t, err.Error(), "EOF")
-    _, err = r.RuneAt(14)
+    _, _, err = r.RuneAt(14)
     assertEqual(t, err.Error(), "EOF")
     r.Flush(13)
-    _, err = r.RuneAt(0)
+    _, _, err = r.RuneAt(0)
     assertEqual(t, err.Error(), "EOF")
-    _, err = r.RuneAt(1)
+    _, _, err = r.RuneAt(1)
     assertEqual(t, err.Error(), "EOF")
-    _, err = r.RuneAt(2)
+    _, _, err = r.RuneAt(2)
     assertEqual(t, err.Error(), "EOF")
 }
 
@@ -203,43 +247,43 @@ func TestGivenErrorFromBuffer_ErrorIsCached(t *testing.T) {
     r := New(input)
 
     // Read the last available rune.
-    readRune, _ := r.RuneAt(3)
+    readRune, _, _ := r.RuneAt(3)
     assertEqual(t, 'd', readRune)
 
     // Reading the next offset must result in the io.EOF error from the stub.
-    readRune, err := r.RuneAt(4)
+    readRune, _, err := r.RuneAt(4)
     assertEqual(t, utf8.RuneError, readRune)
     assertEqual(t, io.EOF, err)
 
     // Reading even further should yield the same io.EOF error.
-    readRune, err = r.RuneAt(5)
+    readRune, _, err = r.RuneAt(5)
     assertEqual(t, utf8.RuneError, readRune)
     assertEqual(t, io.EOF, err)
 
     // After an error, we must still be able to read the last rune.
-    readRune, _ = r.RuneAt(3)
+    readRune, _, _ = r.RuneAt(3)
     assertEqual(t, 'd', readRune)
 
     // Flushing updates the error index too.
     r.Flush(3)
 
     // The last rune is now at offset 0.
-    readRune, _ = r.RuneAt(0)
+    readRune, _, _ = r.RuneAt(0)
     assertEqual(t, 'd', readRune)
 
     // The io.EOF is now at offset 1.
-    _, err = r.RuneAt(1)
+    _, _, err = r.RuneAt(1)
     assertEqual(t, io.EOF, err)
 
     // Let's flush that last rune too.
     r.Flush(1)
 
     // The io.EOF is now at offset 0.
-    _, err = r.RuneAt(0)
+    _, _, err = r.RuneAt(0)
     assertEqual(t, io.EOF, err)
 
     // And reading beyond that offset also yields io.EOF.
-    _, err = r.RuneAt(1)
+    _, _, err = r.RuneAt(1)
     assertEqual(t, io.EOF, err)
 }
 
@@ -247,13 +291,13 @@ func TestInputLargerThanDefaultBufSize64(t *testing.T) {
     input, size := makeLargeStubReader()
     r := New(input)
 
-    readRune, err := r.RuneAt(0)
+    readRune, _, err := r.RuneAt(0)
     assertEqual(t, 'X', readRune)
-    readRune, err = r.RuneAt(size - 1)
+    readRune, _, err = r.RuneAt(size - 1)
     assertEqual(t, 'Y', readRune)
-    readRune, err = r.RuneAt(size)
+    readRune, _, err = r.RuneAt(size)
     assertEqual(t, io.EOF, err)
-    readRune, err = r.RuneAt(10)
+    readRune, _, err = r.RuneAt(10)
     assertEqual(t, 'X', readRune)
 }
 
@@ -261,9 +305,9 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadLargerThanBufSize64(t *tes
     input, size := makeLargeStubReader()
     r := New(input)
 
-    readRune, _ := r.RuneAt(size - 200)
+    readRune, _, _ := r.RuneAt(size - 200)
     assertEqual(t, 'X', readRune)
-    readRune, _ = r.RuneAt(size - 1)
+    readRune, _, _ = r.RuneAt(size - 1)
     assertEqual(t, 'Y', readRune)
 }
 
@@ -271,7 +315,7 @@ func TestInputLargerThanDefaultBufSize64_WithFirstReadToLastByte(t *testing.T) {
     input, size := makeLargeStubReader()
     r := New(input)
 
-    readRune, _ := r.RuneAt(size - 1)
+    readRune, _, _ := r.RuneAt(size - 1)
     assertEqual(t, 'Y', readRune)
 }
 
@@ -282,12 +326,17 @@ func TestAllocationPatterns(t *testing.T) {
     // The first read will create the standard cache.
     // store  |x 64 |
     // buffer |x 64 |
-    assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 1, 64)
+    assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 4, 64)
 
-    // The first 64 reads will fit in the standard cache.
+    // The first 64 bytes will fit in the standard cache.
     // store  |xxxx64xxxxx|
     // buffer |xxxx64xxxxx|
-    assertCache(t, "read fill cache", r, func() { r.RuneAt(63) }, 0, 64, 64, 64)
+    //
+    // Note: in the test offset 60 is used instead of offset 63, because
+    // RuneAt() will fill the buffer with 4 bytes to accommodate the
+    // longest UTF8 character encodings. In all upcoming tests, the same
+    // logic applies to the RuneAt() calls.
+    assertCache(t, "read fill cache", r, func() { r.RuneAt(60) }, 0, 64, 64, 64)
 
     // Flushing zero input keeps everything as-is.
     // store  |xxxx64xxxxx|
@@ -302,7 +351,7 @@ func TestAllocationPatterns(t *testing.T) {
     // Reading 65 chars will allocate a new store of 2 * size + n.
     // store  |xxxxx65xxxxx 128 |
     // buffer |xxxxx65xxxxx 128 |
-    assertCache(t, "read cap + 1", r, func() { r.RuneAt(64) }, 0, 65+128, 65, 65+128)
+    assertCache(t, "read cap + 1", r, func() { r.RuneAt(61) }, 0, 65+128, 65, 65+128)
 
     // A partial flush frees the start of the store and moves
     // the buffer slice.
@@ -315,7 +364,7 @@ func TestAllocationPatterns(t *testing.T) {
     // without a new allocation.
     // store  | 50 xxxxxxxxx143xxxxxxxx|
    // buffer |xxxxxxxxx143xxxxxxxx|
-    assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(142) }, 0, 50+143, 143, 143)
+    assertCache(t, "read fill cache after partial flush", r, func() { r.RuneAt(139) }, 0, 50+143, 143, 143)
 
     // Flush the full input.
     // store  | 193 |
@@ -325,7 +374,7 @@ func TestAllocationPatterns(t *testing.T) {
     // Read a bit more than half the capacity.
     // store  |xxxxxx101xxxxxxxx 92 |
     // buffer |xxxxxx101xxxxxxxx 92 |
-    assertCache(t, "read more than half the cap", r, func() { r.RuneAt(100) }, 0, 193, 101, 193)
+    assertCache(t, "read more than half the cap", r, func() { r.RuneAt(97) }, 0, 193, 101, 193)
 
     // Then flush almost all input.
     // store  | 100 x1x 92 |
     // buffer |x1x 92 |
@@ -337,7 +386,7 @@ func TestAllocationPatterns(t *testing.T) {
     // store (where it fits), space is freed up for the read operation.
     // store  |xxxxx100xxxxxx 93 |
     // buffer |xxxxx100xxxxxx 93 |
-    assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(99) }, 0, 193, 100, 193)
+    assertCache(t, "read beyond cap with free space at start of store", r, func() { r.RuneAt(96) }, 0, 193, 100, 193)
 
     // Now flush only one rune from the cache.
     // store  |1 xxxx99xxxxx 93 |
@@ -349,7 +398,7 @@ func TestAllocationPatterns(t *testing.T) {
     // the data is moved to the start and no reallocation is needed.
     // store  |1 xxxx99xxxxx 93 |
     // buffer |xxxx99xxxxx 93 |
-    assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(192) }, 0, 193, 193, 193)
+    assertCache(t, "read 1 more than cap with 1 free at start", r, func() { r.RuneAt(189) }, 0, 193, 193, 193)
 }
 
 func makeLargeStubReader() (*StubReader, int) {
diff --git a/tokenize/api.go b/tokenize/api.go
index b9b23d1..35698a9 100644
--- a/tokenize/api.go
+++ b/tokenize/api.go
@@ -70,15 +70,16 @@ import (
 // can lead to hard to track bugs. I much prefer this forking method, since
 // no bookkeeping has to be implemented when implementing a parser.
 type API struct {
-    reader      *read.Buffer // the input data reader
-    lastRune    rune         // the rune as retrieved by the last NextRune() calll
-    lastRuneErr error        // the error for the last NextRune() call
-    runeRead    bool         // whether or not a rune was read using NextRune()
-    runes       []rune       // the rune stack
-    tokens      []Token      // the token stack
-    stackFrames []stackFrame // the stack frames, containing stack level-specific data
-    stackLevel  int          // the current stack level
-    stackFrame  *stackFrame  // the current stack frame
+    reader        *read.Buffer // the input data reader
+    lastRune      rune         // the rune as retrieved by the last NextRune() call
+    lastRuneWidth int          // the width in bytes of the last read rune
+    lastRuneErr   error        // the error for the last NextRune() call
+    runeRead      bool         // whether or not a rune was read using NextRune()
+    runes         []rune       // the rune stack
+    tokens        []Token      // the token stack
+    stackFrames   []stackFrame // the stack frames, containing stack level-specific data
+    stackLevel    int          // the current stack level
+    stackFrame    *stackFrame  // the current stack frame
 }
 
 type stackFrame struct {
@@ -129,8 +130,9 @@ func (i *API) NextRune() (rune, error) {
             "without a prior call to Accept()")
     }
 
-    readRune, err := i.reader.RuneAt(i.stackFrame.offset)
+    readRune, runeWidth, err := i.reader.RuneAt(i.stackFrame.offset)
     i.lastRune = readRune
+    i.lastRuneWidth = runeWidth
     i.lastRuneErr = err
     i.runeRead = true
 
@@ -140,7 +142,7 @@ func (i *API) NextRune() (rune, error) {
 // PeekRune returns the rune at the provided offset.
 //
 // The read cursor and current read offset are not updated by this operation.
-func (i *API) PeekRune(offset int) (rune, error) {
+func (i *API) PeekRune(offset int) (rune, int, error) {
     return i.reader.RuneAt(i.stackFrame.offset + offset)
 }
 
@@ -158,10 +160,10 @@ func (i *API) Accept() {
             "but the prior call to NextRune() failed")
     }
 
-    i.accept(i.lastRune)
+    i.acceptRunes(i.lastRuneWidth, i.lastRune)
 }
 
-func (i *API) accept(runes ...rune) {
+func (i *API) acceptRunes(width int, runes ...rune) {
     curRuneEnd := i.stackFrame.runeEnd
     newRuneEnd := curRuneEnd + len(runes)
 
@@ -179,7 +181,7 @@ func (i *API) accept(runes ...rune) {
         i.stackFrame.moveCursorByRune(r)
     }
     i.stackFrame.runeEnd = newRuneEnd
-    i.stackFrame.offset += len(runes)
+    i.stackFrame.offset += width
     i.runeRead = false
 }
 
@@ -216,6 +218,8 @@ func (i *API) Fork() int {
     i.stackLevel++
     i.runeRead = false
 
+    // TODO do some good benchmarking on these two options. The explicit version might be
+    // the faster one, but I am not sure of that right now.
     // A
     // i.stackFrames[i.stackLevel] = *i.stackFrame
     // i.stackFrame = &i.stackFrames[i.stackLevel]
diff --git a/tokenize/api_test.go b/tokenize/api_test.go
index 8c01d94..0944c35 100644
--- a/tokenize/api_test.go
+++ b/tokenize/api_test.go
@@ -29,10 +29,10 @@ func ExampleAPI_NextRune() {
 func ExampleAPI_PeekRune() {
     api := tokenize.NewAPI("The input that the API will handle")
 
-    r1, err := api.PeekRune(19) // 'A'
-    r2, err := api.PeekRune(20) // 'P'
-    r3, err := api.PeekRune(21) // 'I'
-    _, err = api.PeekRune(100) // EOF
+    r1, _, err := api.PeekRune(19) // 'A'
+    r2, _, err := api.PeekRune(20) // 'P'
+    r3, _, err := api.PeekRune(21) // 'I'
+    _, _, err = api.PeekRune(100) // EOF
 
     fmt.Printf("%c%c%c %s\n", r1, r2, r3, err)
 
diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go
index 3a1eefd..ff4c452 100644
--- a/tokenize/handlers_builtin.go
+++ b/tokenize/handlers_builtin.go
@@ -336,9 +336,9 @@ var T = struct {
 // MatchRune creates a Handler function that matches against the provided rune.
 func MatchRune(expected rune) Handler {
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err == nil && r == expected {
-            t.accept(r)
+            t.acceptRunes(w, r)
             return true
         }
         return false
@@ -349,13 +349,13 @@ func MatchRune(expected rune) Handler {
 // one of the provided runes. The first match counts.
 func MatchRunes(expected ...rune) Handler {
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err != nil {
             return false
         }
         for _, e := range expected {
             if r == e {
-                t.accept(r)
+                t.acceptRunes(w, r)
                 return true
             }
         }
@@ -375,9 +375,9 @@ func MatchRuneRange(start rune, end rune) Handler {
         callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end)
     }
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err == nil && r >= start && r <= end {
-            t.accept(r)
+            t.acceptRunes(w, r)
             return true
         }
         return false
@@ -388,18 +388,18 @@ func MatchRuneRange(start rune, end rune) Handler {
 // a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n).
 func MatchNewline() Handler {
     return func(t *API) bool {
-        r1, err := t.PeekRune(0)
+        r1, _, err := t.PeekRune(0)
         if err != nil {
             return false
         }
         if r1 == '\n' {
-            t.accept(r1)
+            t.acceptRunes(1, r1)
             return true
         }
         if r1 == '\r' {
-            r2, err := t.PeekRune(1)
+            r2, _, err := t.PeekRune(1)
             if err == nil && r2 == '\n' {
-                t.accept(r1, r2)
+                t.acceptRunes(2, r1, r2)
                 return true
             }
         }
@@ -433,19 +433,20 @@ func MatchBlank() Handler {
 func MatchBlanks() Handler {
     return func(t *API) bool {
         // Match the first blank.
-        r, err := t.PeekRune(0)
+        r, _, err := t.PeekRune(0)
         if err != nil || (r != ' ' && r != '\t') {
             return false
         }
+        t.acceptRunes(1, r)
 
         // Now match any number of followup blanks. We've already got
         // a successful match at this point, so we'll always return true at the end.
         for {
-            r, err := t.PeekRune(0)
+            r, _, err := t.PeekRune(0)
             if err != nil || (r != ' ' && r != '\t') {
                 return true
             }
-            t.accept(r)
+            t.acceptRunes(1, r)
         }
     }
 }
@@ -456,35 +457,35 @@ func MatchBlanks() Handler {
 func MatchWhitespace() Handler {
     return func(t *API) bool {
         // Match the first whitespace.
-        r1, err := t.PeekRune(0)
+        r1, _, err := t.PeekRune(0)
         if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
             return false
         }
         if r1 == '\r' {
-            r2, err := t.PeekRune(1)
+            r2, _, err := t.PeekRune(1)
             if err != nil || r2 != '\n' {
                 return false
             }
-            t.accept(r1, r2)
+            t.acceptRunes(2, r1, r2)
         } else {
-            t.accept(r1)
+            t.acceptRunes(1, r1)
         }
 
         // Now match any number of followup whitespace. We've already got
         // a successful match at this point, so we'll always return true at the end.
         for {
-            r1, err := t.PeekRune(0)
+            r1, _, err := t.PeekRune(0)
             if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') {
                 return true
             }
             if r1 == '\r' {
-                r2, err := t.PeekRune(1)
+                r2, _, err := t.PeekRune(1)
                 if err != nil || r2 != '\n' {
                     return true
                 }
-                t.accept(r1, r2)
+                t.acceptRunes(2, r1, r2)
             } else {
-                t.accept(r1)
+                t.acceptRunes(1, r1)
             }
         }
     }
@@ -504,9 +505,9 @@ func MatchUnicodeSpace() Handler {
 // so those can be used. E.g. MatchRuneByCallback(unicode.IsLower).
 func MatchRuneByCallback(callback func(rune) bool) Handler {
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err == nil && callback(r) {
-            t.accept(r)
+            t.acceptRunes(w, r)
             return true
         }
         return false
@@ -516,18 +517,18 @@ func MatchRuneByCallback(callback func(rune) bool) Handler {
 // MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF.
 func MatchEndOfLine() Handler {
     return func(t *API) bool {
-        r1, err := t.PeekRune(0)
+        r1, _, err := t.PeekRune(0)
         if err != nil {
             return err == io.EOF
         }
         if r1 == '\n' {
-            t.accept(r1)
+            t.acceptRunes(1, r1)
             return true
         }
         if r1 == '\r' {
-            r2, _ := t.PeekRune(1)
+            r2, _, _ := t.PeekRune(1)
             if r2 == '\n' {
-                t.accept(r1, r2)
+                t.acceptRunes(2, r1, r2)
                 return true
             }
         }
@@ -537,14 +538,17 @@ func MatchEndOfLine() Handler {
 // MatchStr creates a Handler that matches the input against the provided string.
 func MatchStr(expected string) Handler {
+    expectedRunes := []rune(expected)
+    width := len(expected)
+
     return func(t *API) bool {
-        for i, e := range expected {
-            r, err := t.PeekRune(i)
+        for i, e := range expectedRunes {
+            r, _, err := t.PeekRune(i)
             if err != nil || e != r {
                 return false
             }
         }
-        t.accept([]rune(expected)...)
+        t.acceptRunes(width, expectedRunes...)
         return true
     }
 }
@@ -553,16 +557,18 @@ func MatchStr(expected string) Handler {
 // provided string in a case-insensitive manner.
 func MatchStrNoCase(expected string) Handler {
     l := len([]rune(expected))
-    matches := make([]rune, l)
 
     return func(t *API) bool {
+        matches := make([]rune, l)
+        width := 0
         for i, e := range expected {
-            r, err := t.PeekRune(i)
+            r, w, err := t.PeekRune(i)
             if err != nil || unicode.ToUpper(e) != unicode.ToUpper(r) {
                 return false
             }
             matches[i] = r
+            width += w
         }
-        t.accept(matches...)
+        t.acceptRunes(width, matches...)
         return true
     }
 }
@@ -882,9 +888,9 @@ func MatchAnyRune() Handler {
 // UTF8 rune can be read from the input.
 func MatchValidRune() Handler {
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err == nil && r != utf8.RuneError {
-            t.accept(r)
+            t.acceptRunes(w, r)
             return true
         }
         return false
@@ -895,9 +901,9 @@ func MatchValidRune() Handler {
 // UTF8 rune can be read from the input.
 func MatchInvalidRune() Handler {
     return func(t *API) bool {
-        r, err := t.PeekRune(0)
+        r, w, err := t.PeekRune(0)
         if err == nil && r == utf8.RuneError {
-            t.accept(r)
+            t.acceptRunes(w, r)
             return true
         }
         return false
@@ -949,45 +955,45 @@ func MatchFloat() Handler {
 // False values: false, FALSE, False, 0, f, F
 func MatchBoolean() Handler {
     return func(t *API) bool {
-        r1, err := t.PeekRune(0)
+        r1, _, err := t.PeekRune(0)
         if err != nil {
             return false
         }
         if r1 == '1' || r1 == '0' {
-            t.accept(r1)
+            t.acceptRunes(1, r1)
             return true
         }
        if r1 == 't' || r1 == 'T' {
-            r2, _ := t.PeekRune(1)
-            r3, _ := t.PeekRune(2)
-            r4, err := t.PeekRune(3)
+            r2, _, _ := t.PeekRune(1)
+            r3, _, _ := t.PeekRune(2)
+            r4, _, err := t.PeekRune(3)
             if err == nil && r2 == 'r' && r3 == 'u' && r4 == 'e' {
-                t.accept(r1, r2, r3, r4)
+                t.acceptRunes(4, r1, r2, r3, r4)
                 return true
             }
             if err == nil && r1 == 'T' && r2 == 'R' && r3 == 'U' && r4 == 'E' {
-                t.accept(r1, r2, r3, r4)
+                t.acceptRunes(4, r1, r2, r3, r4)
                 return true
             }
-            t.accept(r1)
+            t.acceptRunes(1, r1)
             return true
         }
        if r1 == 'f' || r1 == 'F' {
-            r2, _ := t.PeekRune(1)
-            r3, _ := t.PeekRune(2)
-            r4, _ := t.PeekRune(3)
-            r5, err := t.PeekRune(4)
+            r2, _, _ := t.PeekRune(1)
+            r3, _, _ := t.PeekRune(2)
+            r4, _, _ := t.PeekRune(3)
+            r5, _, err := t.PeekRune(4)
             if err == nil && r2 == 'a' && r3 == 'l' && r4 == 's' && r5 == 'e' {
-                t.accept(r1, r2, r3, r4, r5)
+                t.acceptRunes(5, r1, r2, r3, r4, r5)
                 return true
             }
             if err == nil && r1 == 'F' && r2 == 'A' && r3 == 'L' && r4 == 'S' && r5 == 'E' {
-                t.accept(r1, r2, r3, r4, r5)
+                t.acceptRunes(5, r1, r2, r3, r4, r5)
                 return true
             }
-            t.accept(r1)
+            t.acceptRunes(1, r1)
             return true
        }
        return false
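
Usage sketch (not part of the patch): the small program below shows how a caller
could drive the new byte-oriented Buffer from read/read.go end to end, fetching
runes by byte offset, advancing by the width that RuneAt() reports, and calling
Flush() so the buffer stays a small sliding window. It relies only on the
New/RuneAt/Flush signatures introduced above; the module import path and the
"flush every 8 bytes" policy are assumptions made for illustration.

// Minimal sketch of a caller of the byte-oriented read.Buffer.
package main

import (
    "fmt"
    "io"
    "strings"

    // Assumed import path for the parsekit read package.
    "git.makaay.nl/mauricem/go-parsekit/read"
)

func main() {
    buf := read.New(strings.NewReader("héllo, wörld!"))

    offset := 0
    for {
        // RuneAt() takes a byte offset and reports the rune plus its
        // width in bytes, so multi-byte UTF8 input advances correctly.
        r, w, err := buf.RuneAt(offset)
        if err == io.EOF {
            break
        }
        if err != nil {
            fmt.Println("read error:", err)
            return
        }
        fmt.Printf("rune %q at byte offset %d, width %d\n", r, offset, w)
        offset += w

        // Flush the consumed bytes once in a while; offsets are relative
        // to the flushed position, so the read cursor resets to 0.
        if offset >= 8 {
            buf.Flush(offset)
            offset = 0
        }
    }
}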