From 0362763e83c8a7ef7937879f08f5500e44606ff0 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Mon, 15 Jul 2019 22:48:00 +0000 Subject: [PATCH] Switched to byte input for built-in tokenize.Handler functions. --- read/read.go | 8 +- read/read_test.go | 6 ++ tokenize/api.go | 29 ++++++++ tokenize/cursor.go | 12 ++- tokenize/cursor_test.go | 27 ++++++- tokenize/handlers_builtin.go | 139 ++++++++++++++++++++--------------- 6 files changed, 154 insertions(+), 67 deletions(-) diff --git a/read/read.go b/read/read.go index e84fadf..e55fc00 100644 --- a/read/read.go +++ b/read/read.go @@ -240,22 +240,22 @@ func (buf *Buffer) grow(requiredSize int) { } // Grow the buffer store by allocating a new one and copying the data. - newStore := makeSlice(2*capStore + requiredSize) + newStore := makeSlice(requiredSize, 2*capStore+requiredSize) copy(newStore, buf.buffer) - buf.store = newStore + buf.store = newStore[:0] buf.buffer = buf.store[:requiredSize] } // makeSlice allocates a slice of size n. If the allocation fails, it panics // with ErrTooLarge. -func makeSlice(n int) []byte { +func makeSlice(l int, c int) []byte { // If the make fails, give a known error. defer func() { if recover() != nil { panic(ErrTooLarge) } }() - return make([]byte, 0, n) + return make([]byte, l, c) } // Flush deletes the provided number of bytes from the start of the Buffer. diff --git a/read/read_test.go b/read/read_test.go index 5548b7d..a02299d 100644 --- a/read/read_test.go +++ b/read/read_test.go @@ -327,6 +327,8 @@ func TestAllocationPatterns(t *testing.T) { // store |x 64 | // buffer |x 64 | assertCache(t, "read 1", r, func() { r.RuneAt(0) }, 0, 64, 4, 64) + rn, _, _ := r.RuneAt(0) + assertEqual(t, 'X', rn) // The first 64 bytes will fit in the standard cache. // store |xxxx64xxxxx| @@ -353,6 +355,10 @@ func TestAllocationPatterns(t *testing.T) { // buffer |xxxxx65xxxxx 128 | assertCache(t, "read cap + 1", r, func() { r.RuneAt(61) }, 0, 65+128, 65, 65+128) + // The bytes that we had before must be copied to the newly allocated store. + rn, _, _ = r.RuneAt(0) + assertEqual(t, 'X', rn) + // A partial flush frees the start of the store and moves // the buffer slice. // store | 50 x15x 128 | diff --git a/tokenize/api.go b/tokenize/api.go index 35698a9..2503c1a 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -146,6 +146,13 @@ func (i *API) PeekRune(offset int) (rune, int, error) { return i.reader.RuneAt(i.stackFrame.offset + offset) } +// PeekByte returns the byte at the provided offset. +// +// The read cursor and current read offset are not updated by this operation. +func (i *API) PeekByte(offset int) (byte, error) { + return i.reader.ByteAt(i.stackFrame.offset + offset) +} + // Accept the last rune as read by NextRune() into the Result runes and move // the cursor forward. // @@ -163,6 +170,28 @@ func (i *API) Accept() { i.acceptRunes(i.lastRuneWidth, i.lastRune) } +func (i *API) acceptBytes(bytes ...byte) { + curRuneEnd := i.stackFrame.runeEnd + newRuneEnd := curRuneEnd + len(bytes) + + // Grow the runes capacity when needed. + if cap(i.runes) < newRuneEnd { + newRunes := make([]rune, newRuneEnd, newRuneEnd*2) + copy(newRunes, i.runes) + i.runes = newRunes + } else { + i.runes = i.runes[0:newRuneEnd] + } + + for offset, b := range bytes { + i.runes[curRuneEnd+offset] = rune(b) + i.stackFrame.moveCursorByByte(b) + } + i.stackFrame.runeEnd = newRuneEnd + i.stackFrame.offset += len(bytes) + i.runeRead = false +} + func (i *API) acceptRunes(width int, runes ...rune) { curRuneEnd := i.stackFrame.runeEnd newRuneEnd := curRuneEnd + len(runes) diff --git a/tokenize/cursor.go b/tokenize/cursor.go index 8043641..b88a430 100644 --- a/tokenize/cursor.go +++ b/tokenize/cursor.go @@ -11,12 +11,20 @@ func (f *stackFrame) moveCursor(input string) *stackFrame { return f } -func (f *stackFrame) moveCursorByRune(r rune) *stackFrame { +func (f *stackFrame) moveCursorByRune(r rune) { if r == '\n' { f.column = 0 f.line++ } else { f.column++ } - return f +} + +func (f *stackFrame) moveCursorByByte(b byte) { + if b == '\n' { + f.column = 0 + f.line++ + } else { + f.column++ + } } diff --git a/tokenize/cursor_test.go b/tokenize/cursor_test.go index f89508f..971fd98 100644 --- a/tokenize/cursor_test.go +++ b/tokenize/cursor_test.go @@ -4,7 +4,32 @@ import ( "testing" ) -func TestGivenCursor_WhenMoving_CursorIsUpdated(t *testing.T) { +func TestMoveCursorByBytes(t *testing.T) { + api := NewAPI("") + api.stackFrame.moveCursorByByte('a') + api.stackFrame.moveCursorByByte('b') + api.stackFrame.moveCursorByByte('c') + api.stackFrame.moveCursorByByte('\r') + api.stackFrame.moveCursorByByte('\n') + api.stackFrame.moveCursorByByte('a') + api.stackFrame.moveCursorByByte('b') + + AssertEqual(t, "line 2, column 3", api.Cursor(), "Cursor position after moving by byte") +} + +func TestMoveCursorByRunes(t *testing.T) { + api := NewAPI("") + api.stackFrame.moveCursorByRune('ɹ') + api.stackFrame.moveCursorByRune('n') + api.stackFrame.moveCursorByRune('u') + api.stackFrame.moveCursorByRune('\r') + api.stackFrame.moveCursorByRune('\n') + api.stackFrame.moveCursorByRune('ǝ') + + AssertEqual(t, "line 2, column 2", api.Cursor(), "Cursor position after moving by rune") +} + +func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) { for _, test := range []struct { name string input []string diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index ff4c452..b21ac69 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -374,6 +374,18 @@ func MatchRuneRange(start rune, end rune) Handler { if end < start { callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end) } + if end <= 127 { + byteStart := byte(start) + byteEnd := byte(end) + return func(t *API) bool { + r, err := t.PeekByte(0) + if err == nil && r >= byteStart && r <= byteEnd { + t.acceptBytes(r) + return true + } + return false + } + } return func(t *API) bool { r, w, err := t.PeekRune(0) if err == nil && r >= start && r <= end { @@ -388,18 +400,18 @@ func MatchRuneRange(start rune, end rune) Handler { // a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n). func MatchNewline() Handler { return func(t *API) bool { - r1, _, err := t.PeekRune(0) + b1, err := t.PeekByte(0) if err != nil { return false } - if r1 == '\n' { - t.acceptRunes(1, r1) + if b1 == '\n' { + t.acceptBytes(b1) return true } - if r1 == '\r' { - r2, _, err := t.PeekRune(1) - if err == nil && r2 == '\n' { - t.acceptRunes(2, r1, r2) + if b1 == '\r' { + b2, err := t.PeekByte(1) + if err == nil && b2 == '\n' { + t.acceptBytes(b1, b2) return true } } @@ -414,9 +426,9 @@ func MatchNewline() Handler { // newlines, then take a look at MatchWhitespace(). func MatchBlank() Handler { return func(t *API) bool { - r, err := t.NextRune() - if err == nil && (r == ' ' || r == '\t') { - t.Accept() + b, err := t.PeekByte(0) + if err == nil && (b == ' ' || b == '\t') { + t.acceptBytes(b) return true } return false @@ -433,20 +445,20 @@ func MatchBlank() Handler { func MatchBlanks() Handler { return func(t *API) bool { // Match the first blank. - r, _, err := t.PeekRune(0) - if err != nil || (r != ' ' && r != '\t') { + b, err := t.PeekByte(0) + if err != nil || (b != ' ' && b != '\t') { return false } - t.acceptRunes(1, r) + t.acceptBytes(b) // Now match any number of followup blanks. We've already got // a successful match at this point, so we'll always return true at the end. for { - r, _, err := t.PeekRune(0) - if err != nil || (r != ' ' && r != '\t') { + b, err := t.PeekByte(0) + if err != nil || (b != ' ' && b != '\t') { return true } - t.acceptRunes(1, r) + t.acceptBytes(b) } } } @@ -457,35 +469,35 @@ func MatchBlanks() Handler { func MatchWhitespace() Handler { return func(t *API) bool { // Match the first whitespace. - r1, _, err := t.PeekRune(0) - if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') { + b1, err := t.PeekByte(0) + if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') { return false } - if r1 == '\r' { - r2, _, err := t.PeekRune(1) - if err != nil || r2 != '\n' { + if b1 == '\r' { + b2, err := t.PeekByte(1) + if err != nil || b2 != '\n' { return false } - t.acceptRunes(2, r1, r2) + t.acceptBytes(b1, b2) } else { - t.acceptRunes(1, r1) + t.acceptBytes(b1) } // Now match any number of followup whitespace. We've already got // a successful match at this point, so we'll always return true at the end. for { - r1, _, err := t.PeekRune(0) - if err != nil || (r1 != ' ' && r1 != '\t' && r1 != '\n' && r1 != '\r') { + b1, err := t.PeekByte(0) + if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') { return true } - if r1 == '\r' { - r2, _, err := t.PeekRune(1) - if err != nil || r2 != '\n' { + if b1 == '\r' { + b2, err := t.PeekByte(1) + if err != nil || b2 != '\n' { return true } - t.acceptRunes(2, r1, r2) + t.acceptBytes(b1, b2) } else { - t.acceptRunes(1, r1) + t.acceptBytes(b1) } } } @@ -517,18 +529,18 @@ func MatchRuneByCallback(callback func(rune) bool) Handler { // MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF. func MatchEndOfLine() Handler { return func(t *API) bool { - r1, _, err := t.PeekRune(0) + b1, err := t.PeekByte(0) if err != nil { return err == io.EOF } - if r1 == '\n' { - t.acceptRunes(1, r1) + if b1 == '\n' { + t.acceptBytes(b1) return true } - if r1 == '\r' { - r2, _, _ := t.PeekRune(1) - if r2 == '\n' { - t.acceptRunes(2, r1, r2) + if b1 == '\r' { + b2, _ := t.PeekByte(1) + if b2 == '\n' { + t.acceptBytes(b1, b2) return true } } @@ -955,45 +967,45 @@ func MatchFloat() Handler { // False falues: false, FALSE, False, 0, f, F func MatchBoolean() Handler { return func(t *API) bool { - r1, _, err := t.PeekRune(0) + b1, err := t.PeekByte(0) if err != nil { return false } - if r1 == '1' || r1 == '0' { - t.acceptRunes(1, r1) + if b1 == '1' || b1 == '0' { + t.acceptBytes(b1) return true } - if r1 == 't' || r1 == 'T' { - r2, _, _ := t.PeekRune(1) - r3, _, _ := t.PeekRune(2) - r4, _, err := t.PeekRune(3) - if err == nil && r2 == 'r' && r3 == 'u' && r4 == 'e' { - t.acceptRunes(4, r1, r2, r3, r4) + if b1 == 't' || b1 == 'T' { + b2, _ := t.PeekByte(1) + b3, _ := t.PeekByte(2) + b4, err := t.PeekByte(3) + if err == nil && b2 == 'r' && b3 == 'u' && b4 == 'e' { + t.acceptBytes(b1, b2, b3, b4) return true } - if err == nil && r1 == 'T' && r2 == 'R' && r3 == 'U' && r4 == 'E' { - t.acceptRunes(4, r1, r2, r3, r4) + if err == nil && b1 == 'T' && b2 == 'R' && b3 == 'U' && b4 == 'E' { + t.acceptBytes(b1, b2, b3, b4) return true } - t.acceptRunes(1, r1) + t.acceptBytes(b1) return true } - if r1 == 'f' || r1 == 'F' { - r2, _, _ := t.PeekRune(1) - r3, _, _ := t.PeekRune(2) - r4, _, _ := t.PeekRune(3) - r5, _, err := t.PeekRune(4) + if b1 == 'f' || b1 == 'F' { + b2, _ := t.PeekByte(1) + b3, _ := t.PeekByte(2) + b4, _ := t.PeekByte(3) + b5, err := t.PeekByte(4) - if err == nil && r2 == 'a' && r3 == 'l' && r4 == 's' && r5 == 'e' { - t.acceptRunes(5, r1, r2, r3, r4, r5) + if err == nil && b2 == 'a' && b3 == 'l' && b4 == 's' && b5 == 'e' { + t.acceptBytes(b1, b2, b3, b4, b5) return true } - if err == nil && r1 == 'F' && r2 == 'A' && r3 == 'L' && r4 == 'S' && r5 == 'E' { - t.acceptRunes(5, r1, r2, r3, r4, r5) + if err == nil && b1 == 'F' && b2 == 'A' && b3 == 'L' && b4 == 'S' && b5 == 'E' { + t.acceptBytes(b1, b2, b3, b4, b5) return true } - t.acceptRunes(1, r1) + t.acceptBytes(b1) return true } return false @@ -1039,7 +1051,14 @@ func MatchUnicodeLower() Handler { // MatchHexDigit creates a Handler function that check if a single hexadecimal // digit can be read from the input. func MatchHexDigit() Handler { - return MatchAny(MatchRuneRange('0', '9'), MatchRuneRange('a', 'f'), MatchRuneRange('A', 'F')) + return func(t *API) bool { + b, err := t.PeekByte(0) + if err == nil && ((b >= '0' && b <= '9') || (b >= 'a' && b <= 'f') || (b >= 'A' && b <= 'F')) { + t.acceptBytes(b) + return true + } + return false + } } // MatchOctet creates a Handler function that checks if a valid octet value