diff --git a/tokenize/api.go b/tokenize/api.go index d0192cf..0509dc6 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -73,7 +73,7 @@ import ( // can lead to hard to track bugs. I much prefer this forking method, since // no bookkeeping has to be implemented when implementing a parser. type API struct { - stackFrames []stackFrame // the stack frames, containing stack level-specific data + stackFrames []stackFrame // the stack frames, containing stack level-specific data stackLevel int // the current stack level stackFrame *stackFrame // the current stack frame reader *read.Buffer // the buffered input reader @@ -105,12 +105,13 @@ const initialByteStoreLength = 1024 // For an overview of allowed inputs, take a look at the documentation // for parsekit.read.New(). func NewAPI(input interface{}) *API { + reader := read.New(input) api := &API{ - reader: read.New(input), stackFrames: make([]stackFrame, initialStackDepth), + reader: reader, } - api.Byte = ByteMode{api: api} - api.Rune = RuneMode{api: api} + api.Byte = ByteMode{api: api, reader: reader} + api.Rune = RuneMode{api: api, reader: reader} api.Output = Output{api: api} api.stackFrame = &api.stackFrames[0] @@ -210,8 +211,8 @@ func (tokenAPI *API) Merge(stackLevel int) { f.err = nil } -// Reset moves the input cursor back to the beginning for the currently active API child. -// Aditionally, any output (bytes and tokens) that was emitted from the API child are +// Reset moves the read cursor back to the beginning for the currently active API child. +// Additionally, all output (bytes and tokens) that was emitted from the API child is // cleared as well. 
func (api *API) Reset() { f := api.stackFrame diff --git a/tokenize/api_bytemode.go b/tokenize/api_bytemode.go index 7836984..3eb5fcf 100644 --- a/tokenize/api_bytemode.go +++ b/tokenize/api_bytemode.go @@ -1,8 +1,11 @@ package tokenize +import "git.makaay.nl/mauricem/go-parsekit/read" + // ByteMode provides byte-driven input/output functionality for the tokenize API. type ByteMode struct { - api *API + api *API + reader *read.Buffer // the buffered input reader } // Peek returns the byte at the provided byte offset. @@ -11,37 +14,12 @@ type ByteMode struct { // When an offset is requested that is beyond the length of the available input // data, then the error will be io.EOF. func (byteMode ByteMode) Peek(offset int) (byte, error) { - a := byteMode.api - return a.reader.ByteAt(a.stackFrame.offset + offset) -} - -// Skip is used to skip over one or more bytes that were read from the input. -// This tells the tokenizer: "I've seen these bytes. They are of no interest. -// I will now continue reading after these bytes." -// -// This will merely update the position of the cursor (which keeps track of what -// line and column we are on in the input data). The bytes are not added to -// the output. -// -// After the call, byte offset 0 for PeekByte() and PeekRune() will point at -// the first byte after the skipped bytes. -func (byteMode ByteMode) Skip(b byte) { - f := byteMode.api.stackFrame - f.moveCursorByByte(b) - f.offset++ -} - -func (byteMode ByteMode) SkipMulti(bytes ...byte) { - f := byteMode.api.stackFrame - for _, b := range bytes { - f.moveCursorByByte(b) - f.offset++ - } + return byteMode.reader.ByteAt(byteMode.api.stackFrame.offset + offset) } func (byteMode ByteMode) Accept(b byte) { byteMode.api.Output.AddByte(b) - byteMode.Skip(b) + byteMode.MoveCursor(b) } // AcceptMulti is used to accept one or more bytes that were read from the input. @@ -57,5 +35,35 @@ func (byteMode ByteMode) Accept(b byte) { // the first byte after the accepted bytes. 
func (byteMode ByteMode) AcceptMulti(bytes ...byte) { byteMode.api.Output.AddBytes(bytes...) - byteMode.SkipMulti(bytes...) + byteMode.MoveCursorMulti(bytes...) +} + +// MoveCursor updates the position of the read cursor, based on the provided byte. +// This method takes newlines into account to keep track of line numbers and +// column positions for the input cursor. +// +// After the call, byte offset 0 for Peek() and PeekMulti() will point at +// the first byte at the new cursor position. +func (byteMode ByteMode) MoveCursor(b byte) { + f := byteMode.api.stackFrame + if b == '\n' { + f.column = 0 + f.line++ + } else { + f.column++ + } + + f.offset++ +} + +// MoveCursorMulti updates the position of the read cursor, based on the provided bytes. +// This method takes newlines into account to keep track of line numbers and +// column positions for the input cursor. +// +// After the call, byte offset 0 for Peek() and PeekMulti() will point at +// the first byte at the new cursor position. +func (byteMode ByteMode) MoveCursorMulti(bytes ...byte) { + for _, b := range bytes { + byteMode.MoveCursor(b) + } } diff --git a/tokenize/api_runemode.go b/tokenize/api_runemode.go index 819e981..868603d 100644 --- a/tokenize/api_runemode.go +++ b/tokenize/api_runemode.go @@ -2,11 +2,14 @@ package tokenize import ( "unicode/utf8" + + "git.makaay.nl/mauricem/go-parsekit/read" ) // RuneMode provides (UTF8) rune-driven input/output functionality for the tokenize API. type RuneMode struct { - api *API + api *API + reader *read.Buffer // the buffered input reader } // Peek returns the UTF8 rune at the provided byte offset, including its byte width. @@ -23,42 +26,7 @@ type RuneMode struct { // When an offset is requested that is beyond the length of the available input // data, then the error will be io.EOF. 
func (runeMode RuneMode) Peek(offset int) (rune, int, error) { - a := runeMode.api - return a.reader.RuneAt(a.stackFrame.offset + offset) -} - -// Skip is used to skip over a single rune that was read from the input. -// This tells the tokenizer: "I've seen this rune. It is of no interest. -// I will now continue reading after this rune." -// -// This will merely update the position of the cursor (which keeps track of what -// line and column we are on in APIthe input data). The rune is not added to -// the output. -// -// After the call, byte offset 0 for PeekByte() and PeekRune() will point at -// the first byte after the skipped rune. -func (runeMode RuneMode) Skip(r rune) { - f := runeMode.api.stackFrame - f.moveCursorByRune(r) - f.offset += utf8.RuneLen(r) -} - -// SkipMulti is used to skip over one or more runes that were read from the input. -// This tells the tokenizer: "I've seen these runes. They are of no interest. -// I will now continue reading after these runes." -// -// This will merely update the position of the cursor (which keeps track of what -// line and column we are on in the input data). The runes are not added to -// the output. -// -// After the call, byte offset 0 for PeekByte() and PeekRune() will point at -// the first byte after the skipped runes. -func (runeMode RuneMode) SkipMulti(runes ...rune) { - f := runeMode.api.stackFrame - for _, r := range runes { - f.moveCursorByRune(r) - f.offset += utf8.RuneLen(r) - } + return runeMode.reader.RuneAt(runeMode.api.stackFrame.offset + offset) } // Accept is used to accept a single rune that was read from the input. @@ -67,7 +35,7 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) { // reading after this rune." // // This will update the position of the cursor (which keeps track of what line -// and column we are on in the input data) and add the rune to the tokenizer +// and column we are on in the input data) and add the rune to the tokenizer // output. 
// // After the call, byte offset 0 for PeekByte() and PeekRune() will point at @@ -75,15 +43,14 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) { func (runeMode RuneMode) Accept(r rune) { a := runeMode.api f := a.stackFrame + curBytesEnd := f.bytesEnd maxRequiredBytes := curBytesEnd + utf8.UTFMax - a.growOutputData(maxRequiredBytes) w := utf8.EncodeRune(a.outputData[curBytesEnd:], r) f.bytesEnd += w - f.offset += w - f.moveCursorByRune(r) + runeMode.MoveCursor(r) } // AcceptMulti is used to accept one or more runes that were read from the input. @@ -100,17 +67,47 @@ func (runeMode RuneMode) Accept(r rune) { func (runeMode RuneMode) AcceptMulti(runes ...rune) { a := runeMode.api f := a.stackFrame - runesAsString := string(runes) - byteLen := len(runesAsString) - curBytesEnd := f.bytesEnd - newBytesEnd := curBytesEnd + byteLen - a.growOutputData(newBytesEnd) - copy(a.outputData[curBytesEnd:], runesAsString) - f.bytesEnd = newBytesEnd - f.offset += byteLen + curBytesEnd := f.bytesEnd + maxBytes := curBytesEnd + len(runes)*utf8.UTFMax + a.growOutputData(maxBytes) for _, r := range runes { - f.moveCursorByRune(r) + w := utf8.EncodeRune(a.outputData[curBytesEnd:], r) + curBytesEnd += w + runeMode.MoveCursor(r) + } + f.bytesEnd = curBytesEnd +} + +// MoveCursor updates the position of the read cursor, based on the provided rune. +// This method takes newlines into account to keep track of line numbers and +// column positions for the input cursor. +// +// After the call, byte offset 0 for Peek() and PeekMulti() will point at +// the first rune at the new cursor position. +func (runeMode RuneMode) MoveCursor(r rune) int { + f := runeMode.api.stackFrame + if r == '\n' { + f.column = 0 + f.line++ + } else { + f.column++ + } + + width := utf8.RuneLen(r) + f.offset += width + return width +} + +// MoveCursorMulti updates the position of the read cursor, based on the provided runes. 
+// This method takes newlines into account to keep track of line numbers and +// column positions for the input cursor. +// +// After the call, byte offset 0 for Peek() and PeekMulti() will point at +// the first rune at the new cursor position. +func (runeMode RuneMode) MoveCursorMulti(runes ...rune) { + for _, r := range runes { + runeMode.MoveCursor(r) } } diff --git a/tokenize/api_test.go b/tokenize/api_test.go index de28a24..5f95163 100644 --- a/tokenize/api_test.go +++ b/tokenize/api_test.go @@ -94,7 +94,7 @@ func ExampleAPI_SkipRune() { if strings.ContainsRune("aeiouAEIOU", r) { api.Rune.Accept(r) } else { - api.Rune.Skip(r) + api.Rune.MoveCursor(r) } } diff --git a/tokenize/cursor.go b/tokenize/cursor.go deleted file mode 100644 index b88a430..0000000 --- a/tokenize/cursor.go +++ /dev/null @@ -1,30 +0,0 @@ -package tokenize - -// move updates the position of the cursor, based on the provided input string. -// The input string represents the runes that the cursor must be moved over. -// This method will take newlines into account to keep track of line numbers and -// column positions automatically. 
-func (f *stackFrame) moveCursor(input string) *stackFrame { - for _, r := range input { - f.moveCursorByRune(r) - } - return f -} - -func (f *stackFrame) moveCursorByRune(r rune) { - if r == '\n' { - f.column = 0 - f.line++ - } else { - f.column++ - } -} - -func (f *stackFrame) moveCursorByByte(b byte) { - if b == '\n' { - f.column = 0 - f.line++ - } else { - f.column++ - } -} diff --git a/tokenize/cursor_test.go b/tokenize/cursor_test.go index 971fd98..257c181 100644 --- a/tokenize/cursor_test.go +++ b/tokenize/cursor_test.go @@ -6,27 +6,29 @@ import ( func TestMoveCursorByBytes(t *testing.T) { api := NewAPI("") - api.stackFrame.moveCursorByByte('a') - api.stackFrame.moveCursorByByte('b') - api.stackFrame.moveCursorByByte('c') - api.stackFrame.moveCursorByByte('\r') - api.stackFrame.moveCursorByByte('\n') - api.stackFrame.moveCursorByByte('a') - api.stackFrame.moveCursorByByte('b') + api.Byte.MoveCursor('a') + api.Byte.MoveCursor('b') + api.Byte.MoveCursor('c') + api.Byte.MoveCursor('\r') + api.Byte.MoveCursor('\n') + api.Byte.MoveCursor('a') + api.Byte.MoveCursor('b') AssertEqual(t, "line 2, column 3", api.Cursor(), "Cursor position after moving by byte") + AssertEqual(t, 7, api.stackFrame.offset, "Offset after moving by byte") } func TestMoveCursorByRunes(t *testing.T) { api := NewAPI("") - api.stackFrame.moveCursorByRune('ɹ') - api.stackFrame.moveCursorByRune('n') - api.stackFrame.moveCursorByRune('u') - api.stackFrame.moveCursorByRune('\r') - api.stackFrame.moveCursorByRune('\n') - api.stackFrame.moveCursorByRune('ǝ') + api.Rune.MoveCursor('ɹ') + api.Rune.MoveCursor('n') + api.Rune.MoveCursor('u') + api.Rune.MoveCursor('\r') + api.Rune.MoveCursor('\n') + api.Rune.MoveCursor('ǝ') AssertEqual(t, "line 2, column 2", api.Cursor(), "Cursor position after moving by rune") + AssertEqual(t, 8, api.stackFrame.offset, "Offset after moving by rune") } func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) { @@ -49,7 +51,9 @@ func 
TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) { } { api := NewAPI("") for _, s := range test.input { - api.stackFrame.moveCursor(s) + for _, r := range s { + api.Rune.MoveCursor(r) + } } if api.stackFrame.line != test.line { t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, api.stackFrame.line, test.line) diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index 36e0fe0..cb99bb4 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -476,7 +476,7 @@ func MatchNewline() Handler { return false } if b1 == '\n' { - t.Byte.AcceptMulti(b1) + t.Byte.Accept(b1) return true } if b1 == '\r' { @@ -1121,7 +1121,7 @@ func MatchInteger(normalize bool) Handler { // The next character is a zero, skip the leading zero and check again. if err == nil && b2 == b { - t.Byte.Skip('0') + t.Byte.MoveCursor('0') continue } // The next character is not a zero, nor a digit at all. @@ -1131,7 +1131,7 @@ func MatchInteger(normalize bool) Handler { return true } // The next character is a digit. SKip the leading zero and go with the digit. - t.Byte.Skip('0') + t.Byte.MoveCursor('0') t.Byte.Accept(b2) break } @@ -1170,7 +1170,7 @@ func MatchDecimal(normalize bool) Handler { // The next character is a zero, skip the leading zero and check again. if err == nil && b2 == b { - t.Byte.Skip('0') + t.Byte.MoveCursor('0') continue } // The next character is a dot, go with the zero before the dot and @@ -1186,7 +1186,7 @@ func MatchDecimal(normalize bool) Handler { return true } // The next character is a digit. SKip the leading zero and go with the digit. - t.Byte.Skip('0') + t.Byte.MoveCursor('0') t.Byte.Accept(b2) break } @@ -1198,7 +1198,7 @@ func MatchDecimal(normalize bool) Handler { if err != nil || b < '0' || b > '9' { break } - t.Byte.AcceptMulti(b) + t.Byte.Accept(b) } // No dot or no digit after a dot? Then we're done. 
@@ -1266,6 +1266,7 @@ func MatchBoolean() Handler { t.Byte.Accept(b1) return true } + // TODO Multibyte peeks (also useful for strings) b3, _ := t.Byte.Peek(2) b4, _ := t.Byte.Peek(3) b5, err := t.Byte.Peek(4) @@ -1362,7 +1363,7 @@ func MatchOctet(normalize bool) Handler { if err != nil || b2 < '0' || b2 > '9' { // Output 2-digit octet. if normalize && b0 == '0' { - t.Byte.Skip(b0) + t.Byte.MoveCursor(b0) t.Byte.Accept(b1) } else { t.Byte.AcceptMulti(b0, b1) @@ -1377,9 +1378,9 @@ func MatchOctet(normalize bool) Handler { // Output 3-digit octet. if normalize && b0 == '0' { - t.Byte.Skip(b0) + t.Byte.MoveCursor(b0) if b1 == '0' { - t.Byte.Skip(b1) + t.Byte.MoveCursor(b1) } else { t.Byte.Accept(b1) } @@ -1598,7 +1599,7 @@ func ModifyDropUntilEndOfLine() Handler { if b == '\n' { return true } - t.Byte.Skip(b) + t.Byte.MoveCursor(b) } } }