Code cleanup: make the byte and rune inputs look as much alike as possible, and get rid of some unneeded functionality.

This commit is contained in:
Maurice Makaay 2019-07-23 08:03:16 +00:00
parent 93d2cfa6f1
commit a968f22d45
7 changed files with 121 additions and 140 deletions

View File

@ -73,7 +73,7 @@ import (
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
reader *read.Buffer // the buffered input reader
@ -105,12 +105,13 @@ const initialByteStoreLength = 1024
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) *API {
reader := read.New(input)
api := &API{
reader: read.New(input),
stackFrames: make([]stackFrame, initialStackDepth),
reader: reader,
}
api.Byte = ByteMode{api: api}
api.Rune = RuneMode{api: api}
api.Byte = ByteMode{api: api, reader: reader}
api.Rune = RuneMode{api: api, reader: reader}
api.Output = Output{api: api}
api.stackFrame = &api.stackFrames[0]
@ -210,8 +211,8 @@ func (tokenAPI *API) Merge(stackLevel int) {
f.err = nil
}
// Reset moves the input cursor back to the beginning for the currently active API child.
// Additionally, any output (bytes and tokens) that was emitted from the API child is
// Reset moves the read cursor back to the beginning for the currently active API child.
// Additionally, all output (bytes and tokens) that was emitted from the API child is
// cleared as well.
func (api *API) Reset() {
f := api.stackFrame

View File

@ -1,8 +1,11 @@
package tokenize
import "git.makaay.nl/mauricem/go-parsekit/read"
// ByteMode provides byte-driven input/output functionality for the tokenize API.
type ByteMode struct {
api *API
api *API
reader *read.Buffer // the buffered input reader
}
// Peek returns the byte at the provided byte offset.
@ -11,37 +14,12 @@ type ByteMode struct {
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (byteMode ByteMode) Peek(offset int) (byte, error) {
a := byteMode.api
return a.reader.ByteAt(a.stackFrame.offset + offset)
}
// Skip is used to skip over one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. They are of no interest.
// I will now continue reading after these bytes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The bytes are not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped bytes.
func (byteMode ByteMode) Skip(b byte) {
f := byteMode.api.stackFrame
f.moveCursorByByte(b)
f.offset++
}
func (byteMode ByteMode) SkipMulti(bytes ...byte) {
f := byteMode.api.stackFrame
for _, b := range bytes {
f.moveCursorByByte(b)
f.offset++
}
return byteMode.reader.ByteAt(byteMode.api.stackFrame.offset + offset)
}
// Accept is used to accept a single byte that was read from the input.
// The byte is added to the tokenizer output, after which the read cursor
// is moved past that byte.
func (byteMode ByteMode) Accept(b byte) {
byteMode.api.Output.AddByte(b)
byteMode.Skip(b)
byteMode.MoveCursor(b)
}
// AcceptMulti is used to accept one or more bytes that were read from the input.
@ -57,5 +35,35 @@ func (byteMode ByteMode) Accept(b byte) {
// the first byte after the accepted bytes.
func (byteMode ByteMode) AcceptMulti(bytes ...byte) {
byteMode.api.Output.AddBytes(bytes...)
byteMode.SkipMulti(bytes...)
byteMode.MoveCursorMulti(bytes...)
}
// MoveCursor advances the read cursor over the single provided byte.
//
// The byte offset of the active stack frame always moves forward by one.
// A newline byte ('\n') starts a new line: the line number is incremented
// and the column is reset to 0. Any other byte moves the column one position
// to the right.
//
// After the call, byte offset 0 for Peek() will point at the first byte at
// the new cursor position.
func (byteMode ByteMode) MoveCursor(b byte) {
	frame := byteMode.api.stackFrame
	frame.offset++
	switch b {
	case '\n':
		frame.line++
		frame.column = 0
	default:
		frame.column++
	}
}
// MoveCursorMulti advances the read cursor over all of the provided bytes,
// in the order in which they are provided. Every byte is handled by
// MoveCursor(), so newlines are taken into account for keeping track of
// the line number and column position.
//
// After the call, byte offset 0 for Peek() will point at the first byte at
// the new cursor position.
func (byteMode ByteMode) MoveCursorMulti(bytes ...byte) {
	for i := 0; i < len(bytes); i++ {
		byteMode.MoveCursor(bytes[i])
	}
}

View File

@ -2,11 +2,14 @@ package tokenize
import (
"unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/read"
)
// RuneMode provides (UTF8) rune-driven input/output functionality for the tokenize API.
type RuneMode struct {
api *API
api *API
reader *read.Buffer // the buffered input reader
}
// Peek returns the UTF8 rune at the provided byte offset, including its byte width.
@ -23,42 +26,7 @@ type RuneMode struct {
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (runeMode RuneMode) Peek(offset int) (rune, int, error) {
a := runeMode.api
return a.reader.RuneAt(a.stackFrame.offset + offset)
}
// Skip is used to skip over a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. It is of no interest.
// I will now continue reading after this rune."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The rune is not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped rune.
func (runeMode RuneMode) Skip(r rune) {
f := runeMode.api.stackFrame
f.moveCursorByRune(r)
f.offset += utf8.RuneLen(r)
}
// SkipMulti is used to skip over one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. They are of no interest.
// I will now continue reading after these runes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The runes are not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped runes.
func (runeMode RuneMode) SkipMulti(runes ...rune) {
f := runeMode.api.stackFrame
for _, r := range runes {
f.moveCursorByRune(r)
f.offset += utf8.RuneLen(r)
}
return runeMode.reader.RuneAt(runeMode.api.stackFrame.offset + offset)
}
// Accept is used to accept a single rune that was read from the input.
@ -67,7 +35,7 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) {
// reading after this rune."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the rune to the tokenizer
// and column we are on in the input data) and add the rune to the tokenizer
// output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
@ -75,15 +43,14 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) {
func (runeMode RuneMode) Accept(r rune) {
a := runeMode.api
f := a.stackFrame
curBytesEnd := f.bytesEnd
maxRequiredBytes := curBytesEnd + utf8.UTFMax
a.growOutputData(maxRequiredBytes)
w := utf8.EncodeRune(a.outputData[curBytesEnd:], r)
f.bytesEnd += w
f.offset += w
f.moveCursorByRune(r)
runeMode.MoveCursor(r)
}
// AcceptMulti is used to accept one or more runes that were read from the input.
@ -100,17 +67,47 @@ func (runeMode RuneMode) Accept(r rune) {
func (runeMode RuneMode) AcceptMulti(runes ...rune) {
a := runeMode.api
f := a.stackFrame
runesAsString := string(runes)
byteLen := len(runesAsString)
curBytesEnd := f.bytesEnd
newBytesEnd := curBytesEnd + byteLen
a.growOutputData(newBytesEnd)
copy(a.outputData[curBytesEnd:], runesAsString)
f.bytesEnd = newBytesEnd
f.offset += byteLen
curBytesEnd := f.bytesEnd
maxBytes := curBytesEnd + len(runes)*utf8.UTFMax
a.growOutputData(maxBytes)
for _, r := range runes {
f.moveCursorByRune(r)
w := utf8.EncodeRune(a.outputData[curBytesEnd:], r)
curBytesEnd += w
runeMode.MoveCursor(r)
}
f.bytesEnd = curBytesEnd
}
// MoveCursor updates the position of the read cursor, based on the provided rune.
// A newline rune ('\n') starts a new line: the line number is incremented and
// the column is reset to 0. Any other rune moves the column one position to
// the right.
//
// The byte offset is advanced by the UTF8-encoded width of the rune, and that
// width is returned. For a rune that cannot be encoded as UTF8 (a surrogate
// half or a value beyond utf8.MaxRune), utf8.RuneLen() reports -1. Such a
// rune is counted with the width of utf8.RuneError instead, which is what
// utf8.EncodeRune() writes for it, so the offset can never move backwards.
//
// After the call, byte offset 0 for Peek() will point at the first rune at
// the new cursor position.
func (runeMode RuneMode) MoveCursor(r rune) int {
	f := runeMode.api.stackFrame
	if r == '\n' {
		f.column = 0
		f.line++
	} else {
		f.column++
	}

	width := utf8.RuneLen(r)
	if width < 0 {
		// Not a valid rune. Fall back to the width of the replacement
		// character, to stay in sync with utf8.EncodeRune().
		width = utf8.RuneLen(utf8.RuneError)
	}
	f.offset += width
	return width
}
// MoveCursorMulti advances the read cursor over all of the provided runes,
// in the order in which they are provided. Every rune is handled by
// MoveCursor(), so newlines are taken into account for keeping track of
// the line number and column position.
//
// After the call, byte offset 0 for Peek() will point at the first rune at
// the new cursor position.
func (runeMode RuneMode) MoveCursorMulti(runes ...rune) {
	for i := range runes {
		runeMode.MoveCursor(runes[i])
	}
}

View File

@ -94,7 +94,7 @@ func ExampleAPI_SkipRune() {
if strings.ContainsRune("aeiouAEIOU", r) {
api.Rune.Accept(r)
} else {
api.Rune.Skip(r)
api.Rune.MoveCursor(r)
}
}

View File

@ -1,30 +0,0 @@
package tokenize
// move updates the position of the cursor, based on the provided input string.
// The input string represents the runes that the cursor must be moved over.
// This method will take newlines into account to keep track of line numbers and
// column positions automatically.
func (f *stackFrame) moveCursor(input string) *stackFrame {
for _, r := range input {
f.moveCursorByRune(r)
}
return f
}
func (f *stackFrame) moveCursorByRune(r rune) {
if r == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
}
func (f *stackFrame) moveCursorByByte(b byte) {
if b == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
}

View File

@ -6,27 +6,29 @@ import (
func TestMoveCursorByBytes(t *testing.T) {
api := NewAPI("")
api.stackFrame.moveCursorByByte('a')
api.stackFrame.moveCursorByByte('b')
api.stackFrame.moveCursorByByte('c')
api.stackFrame.moveCursorByByte('\r')
api.stackFrame.moveCursorByByte('\n')
api.stackFrame.moveCursorByByte('a')
api.stackFrame.moveCursorByByte('b')
api.Byte.MoveCursor('a')
api.Byte.MoveCursor('b')
api.Byte.MoveCursor('c')
api.Byte.MoveCursor('\r')
api.Byte.MoveCursor('\n')
api.Byte.MoveCursor('a')
api.Byte.MoveCursor('b')
AssertEqual(t, "line 2, column 3", api.Cursor(), "Cursor position after moving by byte")
AssertEqual(t, 7, api.stackFrame.offset, "Offset after moving by byte")
}
func TestMoveCursorByRunes(t *testing.T) {
api := NewAPI("")
api.stackFrame.moveCursorByRune('ɹ')
api.stackFrame.moveCursorByRune('n')
api.stackFrame.moveCursorByRune('u')
api.stackFrame.moveCursorByRune('\r')
api.stackFrame.moveCursorByRune('\n')
api.stackFrame.moveCursorByRune('ǝ')
api.Rune.MoveCursor('ɹ')
api.Rune.MoveCursor('n')
api.Rune.MoveCursor('u')
api.Rune.MoveCursor('\r')
api.Rune.MoveCursor('\n')
api.Rune.MoveCursor('ǝ')
AssertEqual(t, "line 2, column 2", api.Cursor(), "Cursor position after moving by rune")
AssertEqual(t, 8, api.stackFrame.offset, "Offset after moving by rune")
}
func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) {
@ -49,7 +51,9 @@ func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) {
} {
api := NewAPI("")
for _, s := range test.input {
api.stackFrame.moveCursor(s)
for _, r := range s {
api.Rune.MoveCursor(r)
}
}
if api.stackFrame.line != test.line {
t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, api.stackFrame.line, test.line)

View File

@ -476,7 +476,7 @@ func MatchNewline() Handler {
return false
}
if b1 == '\n' {
t.Byte.AcceptMulti(b1)
t.Byte.Accept(b1)
return true
}
if b1 == '\r' {
@ -1121,7 +1121,7 @@ func MatchInteger(normalize bool) Handler {
// The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b {
t.Byte.Skip('0')
t.Byte.MoveCursor('0')
continue
}
// The next character is not a zero, nor a digit at all.
@ -1131,7 +1131,7 @@ func MatchInteger(normalize bool) Handler {
return true
}
// The next character is a digit. Skip the leading zero and go with the digit.
t.Byte.Skip('0')
t.Byte.MoveCursor('0')
t.Byte.Accept(b2)
break
}
@ -1170,7 +1170,7 @@ func MatchDecimal(normalize bool) Handler {
// The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b {
t.Byte.Skip('0')
t.Byte.MoveCursor('0')
continue
}
// The next character is a dot, go with the zero before the dot and
@ -1186,7 +1186,7 @@ func MatchDecimal(normalize bool) Handler {
return true
}
// The next character is a digit. Skip the leading zero and go with the digit.
t.Byte.Skip('0')
t.Byte.MoveCursor('0')
t.Byte.Accept(b2)
break
}
@ -1198,7 +1198,7 @@ func MatchDecimal(normalize bool) Handler {
if err != nil || b < '0' || b > '9' {
break
}
t.Byte.AcceptMulti(b)
t.Byte.Accept(b)
}
// No dot or no digit after a dot? Then we're done.
@ -1266,6 +1266,7 @@ func MatchBoolean() Handler {
t.Byte.Accept(b1)
return true
}
// TODO Multibyte peeks (also useful for strings)
b3, _ := t.Byte.Peek(2)
b4, _ := t.Byte.Peek(3)
b5, err := t.Byte.Peek(4)
@ -1362,7 +1363,7 @@ func MatchOctet(normalize bool) Handler {
if err != nil || b2 < '0' || b2 > '9' {
// Output 2-digit octet.
if normalize && b0 == '0' {
t.Byte.Skip(b0)
t.Byte.MoveCursor(b0)
t.Byte.Accept(b1)
} else {
t.Byte.AcceptMulti(b0, b1)
@ -1377,9 +1378,9 @@ func MatchOctet(normalize bool) Handler {
// Output 3-digit octet.
if normalize && b0 == '0' {
t.Byte.Skip(b0)
t.Byte.MoveCursor(b0)
if b1 == '0' {
t.Byte.Skip(b1)
t.Byte.MoveCursor(b1)
} else {
t.Byte.Accept(b1)
}
@ -1598,7 +1599,7 @@ func ModifyDropUntilEndOfLine() Handler {
if b == '\n' {
return true
}
t.Byte.Skip(b)
t.Byte.MoveCursor(b)
}
}
}