Code cleanup: make the byte and rune inputs look as similar as possible and get rid of some unneeded functionality.

This commit is contained in:
Maurice Makaay 2019-07-23 08:03:16 +00:00
parent 93d2cfa6f1
commit a968f22d45
7 changed files with 121 additions and 140 deletions

View File

@ -73,7 +73,7 @@ import (
// can lead to hard to track bugs. I much prefer this forking method, since // can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser. // no bookkeeping has to be implemented when implementing a parser.
type API struct { type API struct {
stackFrames []stackFrame // the stack frames, containing stack level-specific data stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame stackFrame *stackFrame // the current stack frame
reader *read.Buffer // the buffered input reader reader *read.Buffer // the buffered input reader
@ -105,12 +105,13 @@ const initialByteStoreLength = 1024
// For an overview of allowed inputs, take a look at the documentation // For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New(). // for parsekit.read.New().
func NewAPI(input interface{}) *API { func NewAPI(input interface{}) *API {
reader := read.New(input)
api := &API{ api := &API{
reader: read.New(input),
stackFrames: make([]stackFrame, initialStackDepth), stackFrames: make([]stackFrame, initialStackDepth),
reader: reader,
} }
api.Byte = ByteMode{api: api} api.Byte = ByteMode{api: api, reader: reader}
api.Rune = RuneMode{api: api} api.Rune = RuneMode{api: api, reader: reader}
api.Output = Output{api: api} api.Output = Output{api: api}
api.stackFrame = &api.stackFrames[0] api.stackFrame = &api.stackFrames[0]
@ -210,8 +211,8 @@ func (tokenAPI *API) Merge(stackLevel int) {
f.err = nil f.err = nil
} }
// Reset moves the input cursor back to the beginning for the currently active API child. // Reset moves the read cursor back to the beginning for the currently active API child.
// Additionally, any output (bytes and tokens) that was emitted from the API child are // Additionally, all output (bytes and tokens) that was emitted from the API child is
// cleared as well. // cleared as well.
func (api *API) Reset() { func (api *API) Reset() {
f := api.stackFrame f := api.stackFrame

View File

@ -1,8 +1,11 @@
package tokenize package tokenize
import "git.makaay.nl/mauricem/go-parsekit/read"
// ByteMode provides byte-driven input/output functionality for the tokenize API. // ByteMode provides byte-driven input/output functionality for the tokenize API.
type ByteMode struct { type ByteMode struct {
api *API api *API
reader *read.Buffer // the buffered input reader
} }
// Peek returns the byte at the provided byte offset. // Peek returns the byte at the provided byte offset.
@ -11,37 +14,12 @@ type ByteMode struct {
// When an offset is requested that is beyond the length of the available input // When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF. // data, then the error will be io.EOF.
func (byteMode ByteMode) Peek(offset int) (byte, error) { func (byteMode ByteMode) Peek(offset int) (byte, error) {
a := byteMode.api return byteMode.reader.ByteAt(byteMode.api.stackFrame.offset + offset)
return a.reader.ByteAt(a.stackFrame.offset + offset)
}
// Skip is used to skip over one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. They are of no interest.
// I will now continue reading after these bytes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The bytes are not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped bytes.
func (byteMode ByteMode) Skip(b byte) {
f := byteMode.api.stackFrame
f.moveCursorByByte(b)
f.offset++
}
func (byteMode ByteMode) SkipMulti(bytes ...byte) {
f := byteMode.api.stackFrame
for _, b := range bytes {
f.moveCursorByByte(b)
f.offset++
}
} }
func (byteMode ByteMode) Accept(b byte) { func (byteMode ByteMode) Accept(b byte) {
byteMode.api.Output.AddByte(b) byteMode.api.Output.AddByte(b)
byteMode.Skip(b) byteMode.MoveCursor(b)
} }
// AcceptMulti is used to accept one or more bytes that were read from the input. // AcceptMulti is used to accept one or more bytes that were read from the input.
@ -57,5 +35,35 @@ func (byteMode ByteMode) Accept(b byte) {
// the first byte after the accepted bytes. // the first byte after the accepted bytes.
func (byteMode ByteMode) AcceptMulti(bytes ...byte) { func (byteMode ByteMode) AcceptMulti(bytes ...byte) {
byteMode.api.Output.AddBytes(bytes...) byteMode.api.Output.AddBytes(bytes...)
byteMode.SkipMulti(bytes...) byteMode.MoveCursorMulti(bytes...)
}
// MoveCursor updates the position of the read cursor, based on the provided byte.
// This method takes newlines into account to keep track of line numbers and
// column positions for the input cursor.
//
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first byte at the new cursor position.
func (byteMode ByteMode) MoveCursor(b byte) {
f := byteMode.api.stackFrame
if b == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
f.offset++
}
// MoveCursorMulti updates the position of the read cursor, based on the provided bytes.
// This method takes newlines into account to keep track of line numbers and
// column positions for the input cursor.
//
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first byte at the new cursor position.
func (byteMode ByteMode) MoveCursorMulti(bytes ...byte) {
for _, b := range bytes {
byteMode.MoveCursor(b)
}
} }

View File

@ -2,11 +2,14 @@ package tokenize
import ( import (
"unicode/utf8" "unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/read"
) )
// RuneMode provides (UTF8) rune-driven input/output functionality for the tokenize API. // RuneMode provides (UTF8) rune-driven input/output functionality for the tokenize API.
type RuneMode struct { type RuneMode struct {
api *API api *API
reader *read.Buffer // the buffered input reader
} }
// Peek returns the UTF8 rune at the provided byte offset, including its byte width. // Peek returns the UTF8 rune at the provided byte offset, including its byte width.
@ -23,42 +26,7 @@ type RuneMode struct {
// When an offset is requested that is beyond the length of the available input // When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF. // data, then the error will be io.EOF.
func (runeMode RuneMode) Peek(offset int) (rune, int, error) { func (runeMode RuneMode) Peek(offset int) (rune, int, error) {
a := runeMode.api return runeMode.reader.RuneAt(runeMode.api.stackFrame.offset + offset)
return a.reader.RuneAt(a.stackFrame.offset + offset)
}
// Skip is used to skip over a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. It is of no interest.
// I will now continue reading after this rune."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The rune is not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped rune.
func (runeMode RuneMode) Skip(r rune) {
f := runeMode.api.stackFrame
f.moveCursorByRune(r)
f.offset += utf8.RuneLen(r)
}
// SkipMulti is used to skip over one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. They are of no interest.
// I will now continue reading after these runes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The runes are not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped runes.
func (runeMode RuneMode) SkipMulti(runes ...rune) {
f := runeMode.api.stackFrame
for _, r := range runes {
f.moveCursorByRune(r)
f.offset += utf8.RuneLen(r)
}
} }
// Accept is used to accept a single rune that was read from the input. // Accept is used to accept a single rune that was read from the input.
@ -67,7 +35,7 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) {
// reading after this rune." // reading after this rune."
// //
// This will update the position of the cursor (which keeps track of what line // This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the rune to the tokenizer // and column we are on in the input data) and add the rune to the tokenizer
// output. // output.
// //
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at // After the call, byte offset 0 for PeekByte() and PeekRune() will point at
@ -75,15 +43,14 @@ func (runeMode RuneMode) SkipMulti(runes ...rune) {
func (runeMode RuneMode) Accept(r rune) { func (runeMode RuneMode) Accept(r rune) {
a := runeMode.api a := runeMode.api
f := a.stackFrame f := a.stackFrame
curBytesEnd := f.bytesEnd curBytesEnd := f.bytesEnd
maxRequiredBytes := curBytesEnd + utf8.UTFMax maxRequiredBytes := curBytesEnd + utf8.UTFMax
a.growOutputData(maxRequiredBytes) a.growOutputData(maxRequiredBytes)
w := utf8.EncodeRune(a.outputData[curBytesEnd:], r) w := utf8.EncodeRune(a.outputData[curBytesEnd:], r)
f.bytesEnd += w f.bytesEnd += w
f.offset += w
f.moveCursorByRune(r) runeMode.MoveCursor(r)
} }
// AcceptMulti is used to accept one or more runes that were read from the input. // AcceptMulti is used to accept one or more runes that were read from the input.
@ -100,17 +67,47 @@ func (runeMode RuneMode) Accept(r rune) {
func (runeMode RuneMode) AcceptMulti(runes ...rune) { func (runeMode RuneMode) AcceptMulti(runes ...rune) {
a := runeMode.api a := runeMode.api
f := a.stackFrame f := a.stackFrame
runesAsString := string(runes)
byteLen := len(runesAsString)
curBytesEnd := f.bytesEnd
newBytesEnd := curBytesEnd + byteLen
a.growOutputData(newBytesEnd) curBytesEnd := f.bytesEnd
copy(a.outputData[curBytesEnd:], runesAsString) maxBytes := curBytesEnd + len(runes)*utf8.UTFMax
f.bytesEnd = newBytesEnd a.growOutputData(maxBytes)
f.offset += byteLen
for _, r := range runes { for _, r := range runes {
f.moveCursorByRune(r) w := utf8.EncodeRune(a.outputData[curBytesEnd:], r)
curBytesEnd += w
runeMode.MoveCursor(r)
}
f.bytesEnd = curBytesEnd
}
// MoveCursor updates the position of the read cursor, based on the provided rune.
// This method takes newlines into account to keep track of line numbers and
// column positions for the input cursor.
//
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first rune at the new cursor position.
func (runeMode RuneMode) MoveCursor(r rune) int {
f := runeMode.api.stackFrame
if r == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
width := utf8.RuneLen(r)
f.offset += width
return width
}
// MoveCursorMulti updates the position of the read cursor, based on the provided runes.
// This method takes newlines into account to keep track of line numbers and
// column positions for the input cursor.
//
// After the call, byte offset 0 for Peek() and PeekMulti() will point at
// the first rune at the new cursor position.
func (runeMode RuneMode) MoveCursorMulti(runes ...rune) {
for _, r := range runes {
runeMode.MoveCursor(r)
} }
} }

View File

@ -94,7 +94,7 @@ func ExampleAPI_SkipRune() {
if strings.ContainsRune("aeiouAEIOU", r) { if strings.ContainsRune("aeiouAEIOU", r) {
api.Rune.Accept(r) api.Rune.Accept(r)
} else { } else {
api.Rune.Skip(r) api.Rune.MoveCursor(r)
} }
} }

View File

@ -1,30 +0,0 @@
package tokenize
// move updates the position of the cursor, based on the provided input string.
// The input string represents the runes that the cursor must be moved over.
// This method will take newlines into account to keep track of line numbers and
// column positions automatically.
func (f *stackFrame) moveCursor(input string) *stackFrame {
for _, r := range input {
f.moveCursorByRune(r)
}
return f
}
func (f *stackFrame) moveCursorByRune(r rune) {
if r == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
}
func (f *stackFrame) moveCursorByByte(b byte) {
if b == '\n' {
f.column = 0
f.line++
} else {
f.column++
}
}

View File

@ -6,27 +6,29 @@ import (
func TestMoveCursorByBytes(t *testing.T) { func TestMoveCursorByBytes(t *testing.T) {
api := NewAPI("") api := NewAPI("")
api.stackFrame.moveCursorByByte('a') api.Byte.MoveCursor('a')
api.stackFrame.moveCursorByByte('b') api.Byte.MoveCursor('b')
api.stackFrame.moveCursorByByte('c') api.Byte.MoveCursor('c')
api.stackFrame.moveCursorByByte('\r') api.Byte.MoveCursor('\r')
api.stackFrame.moveCursorByByte('\n') api.Byte.MoveCursor('\n')
api.stackFrame.moveCursorByByte('a') api.Byte.MoveCursor('a')
api.stackFrame.moveCursorByByte('b') api.Byte.MoveCursor('b')
AssertEqual(t, "line 2, column 3", api.Cursor(), "Cursor position after moving by byte") AssertEqual(t, "line 2, column 3", api.Cursor(), "Cursor position after moving by byte")
AssertEqual(t, 7, api.stackFrame.offset, "Offset after moving by byte")
} }
func TestMoveCursorByRunes(t *testing.T) { func TestMoveCursorByRunes(t *testing.T) {
api := NewAPI("") api := NewAPI("")
api.stackFrame.moveCursorByRune('ɹ') api.Rune.MoveCursor('ɹ')
api.stackFrame.moveCursorByRune('n') api.Rune.MoveCursor('n')
api.stackFrame.moveCursorByRune('u') api.Rune.MoveCursor('u')
api.stackFrame.moveCursorByRune('\r') api.Rune.MoveCursor('\r')
api.stackFrame.moveCursorByRune('\n') api.Rune.MoveCursor('\n')
api.stackFrame.moveCursorByRune('ǝ') api.Rune.MoveCursor('ǝ')
AssertEqual(t, "line 2, column 2", api.Cursor(), "Cursor position after moving by rune") AssertEqual(t, "line 2, column 2", api.Cursor(), "Cursor position after moving by rune")
AssertEqual(t, 8, api.stackFrame.offset, "Offset after moving by rune")
} }
func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) { func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) {
@ -49,7 +51,9 @@ func TestWhenMovingCursor_CursorPositionIsUpdated(t *testing.T) {
} { } {
api := NewAPI("") api := NewAPI("")
for _, s := range test.input { for _, s := range test.input {
api.stackFrame.moveCursor(s) for _, r := range s {
api.Rune.MoveCursor(r)
}
} }
if api.stackFrame.line != test.line { if api.stackFrame.line != test.line {
t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, api.stackFrame.line, test.line) t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, api.stackFrame.line, test.line)

View File

@ -476,7 +476,7 @@ func MatchNewline() Handler {
return false return false
} }
if b1 == '\n' { if b1 == '\n' {
t.Byte.AcceptMulti(b1) t.Byte.Accept(b1)
return true return true
} }
if b1 == '\r' { if b1 == '\r' {
@ -1121,7 +1121,7 @@ func MatchInteger(normalize bool) Handler {
// The next character is a zero, skip the leading zero and check again. // The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b { if err == nil && b2 == b {
t.Byte.Skip('0') t.Byte.MoveCursor('0')
continue continue
} }
// The next character is not a zero, nor a digit at all. // The next character is not a zero, nor a digit at all.
@ -1131,7 +1131,7 @@ func MatchInteger(normalize bool) Handler {
return true return true
} }
// The next character is a digit. Skip the leading zero and go with the digit. // The next character is a digit. Skip the leading zero and go with the digit.
t.Byte.Skip('0') t.Byte.MoveCursor('0')
t.Byte.Accept(b2) t.Byte.Accept(b2)
break break
} }
@ -1170,7 +1170,7 @@ func MatchDecimal(normalize bool) Handler {
// The next character is a zero, skip the leading zero and check again. // The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b { if err == nil && b2 == b {
t.Byte.Skip('0') t.Byte.MoveCursor('0')
continue continue
} }
// The next character is a dot, go with the zero before the dot and // The next character is a dot, go with the zero before the dot and
@ -1186,7 +1186,7 @@ func MatchDecimal(normalize bool) Handler {
return true return true
} }
// The next character is a digit. Skip the leading zero and go with the digit. // The next character is a digit. Skip the leading zero and go with the digit.
t.Byte.Skip('0') t.Byte.MoveCursor('0')
t.Byte.Accept(b2) t.Byte.Accept(b2)
break break
} }
@ -1198,7 +1198,7 @@ func MatchDecimal(normalize bool) Handler {
if err != nil || b < '0' || b > '9' { if err != nil || b < '0' || b > '9' {
break break
} }
t.Byte.AcceptMulti(b) t.Byte.Accept(b)
} }
// No dot or no digit after a dot? Then we're done. // No dot or no digit after a dot? Then we're done.
@ -1266,6 +1266,7 @@ func MatchBoolean() Handler {
t.Byte.Accept(b1) t.Byte.Accept(b1)
return true return true
} }
// TODO Multibyte peeks (also useful for strings)
b3, _ := t.Byte.Peek(2) b3, _ := t.Byte.Peek(2)
b4, _ := t.Byte.Peek(3) b4, _ := t.Byte.Peek(3)
b5, err := t.Byte.Peek(4) b5, err := t.Byte.Peek(4)
@ -1362,7 +1363,7 @@ func MatchOctet(normalize bool) Handler {
if err != nil || b2 < '0' || b2 > '9' { if err != nil || b2 < '0' || b2 > '9' {
// Output 2-digit octet. // Output 2-digit octet.
if normalize && b0 == '0' { if normalize && b0 == '0' {
t.Byte.Skip(b0) t.Byte.MoveCursor(b0)
t.Byte.Accept(b1) t.Byte.Accept(b1)
} else { } else {
t.Byte.AcceptMulti(b0, b1) t.Byte.AcceptMulti(b0, b1)
@ -1377,9 +1378,9 @@ func MatchOctet(normalize bool) Handler {
// Output 3-digit octet. // Output 3-digit octet.
if normalize && b0 == '0' { if normalize && b0 == '0' {
t.Byte.Skip(b0) t.Byte.MoveCursor(b0)
if b1 == '0' { if b1 == '0' {
t.Byte.Skip(b1) t.Byte.MoveCursor(b1)
} else { } else {
t.Byte.Accept(b1) t.Byte.Accept(b1)
} }
@ -1598,7 +1599,7 @@ func ModifyDropUntilEndOfLine() Handler {
if b == '\n' { if b == '\n' {
return true return true
} }
t.Byte.Skip(b) t.Byte.MoveCursor(b)
} }
} }
} }