package tokenize

import (
	"fmt"
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/read"
)

// API holds the internal state of a tokenizer run and provides an API that
// tokenize.Handler functions can use to:
//
// • read and accept runes from the input (NextRune, Accept)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • flush already read input data when it is no longer needed (FlushInput)
//
// • retrieve the tokenizer Result struct (Result) to read or modify the results
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the Result struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking an API struct by
// calling method Fork(). This will return a forked child API, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
//
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, but in my opinion not very intuitive to read. It can also
// be tedious to get the cursor back to the correct position, which
// can lead to hard-to-track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
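//
// For illustration, a minimal lookahead sketch in the shape of a handler
// function (the digit check and the function itself are assumptions for
// illustration, not part of this package):
//
//	func matchDigits(t *API) bool {
//		child := t.Fork() // fork, so a failed match can be undone
//		for {
//			r, err := t.NextRune()
//			if err != nil || r < '0' || r > '9' {
//				break
//			}
//			t.Accept() // add the digit to the child's results
//		}
//		if len(t.Runes()) == 0 {
//			t.Dispose(child) // lookahead failed, the parent is untouched
//			return false
//		}
//		t.Merge(child)   // append the child's results to the parent
//		t.Dispose(child) // return to the parent API
//		return true
//	}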
type API struct {
	reader      *read.Buffer // the input data reader
	lastRune    rune         // the rune as retrieved by the last NextRune() call
	lastRuneErr error        // the error for the last NextRune() call
	runeRead    bool         // whether or not a rune was read using NextRune()
	bytes       []byte       // accepted bytes
	tokens      []Token      // accepted tokens
	stackFrames []stackFrame // the stack frames, containing stack level-specific data
	stackLevel  int          // the current stack level
	stackFrame  *stackFrame  // the current stack frame
}

type stackFrame struct {
	offset     int // current rune read offset relative to the Reader's sliding window
	column     int // the column at which the cursor is (0-indexed)
	line       int // the line at which the cursor is (0-indexed)
	bytesStart int // the starting point in the API.bytes slice for runes produced by this stack level
	bytesEnd   int // the end point in the API.bytes slice for runes produced by this stack level
	tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level
	tokenEnd   int // the end point in the API.tokens slice for tokens produced by this stack level

	// TODO
	err error // can be used by a Handler to report a specific issue with the input
}

const initialStackDepth = 64
const initialTokenStoreLength = 64
const initialByteStoreLength = 1024

// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
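//
// A minimal construction sketch (assuming a plain string is among the
// accepted inputs; see read.New() for the authoritative list):
//
//	api := NewAPI("some input data")
//	r, err := api.NextRune()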
func NewAPI(input interface{}) *API {
	api := &API{
		reader:      read.New(input),
		bytes:       make([]byte, initialByteStoreLength),
		tokens:      make([]Token, initialTokenStoreLength),
		stackFrames: make([]stackFrame, initialStackDepth),
	}
	api.stackFrame = &api.stackFrames[0]

	return api
}

// NextRune returns the rune at the current read offset.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf8.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// After reading a rune it must be Accept()-ed to move the read cursor forward
// to the next rune. Doing so is mandatory. When doing a second call to NextRune()
// without explicitly accepting, this method will panic. You can see this as a
// built-in unit test, enforcing correct serialization of API method calls.
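//
// A sketch of checking for the replacement rune explicitly (how to treat
// invalid UTF8 input is up to the caller; this is just one option):
//
//	r, err := api.NextRune()
//	if err == nil && r == utf8.RuneError {
//		// handle invalid UTF8 input here
//	}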
func (i *API) NextRune() (rune, error) {
	if i.runeRead {
		callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
			"without a prior call to Accept()")
	}

	readRune, _, err := i.reader.RuneAt(i.stackFrame.offset)
	i.lastRune = readRune
	i.lastRuneErr = err
	i.runeRead = true

	return readRune, err
}

// PeekRune returns the rune at the provided offset.
//
// The read cursor and current read offset are not updated by this operation.
func (i *API) PeekRune(offset int) (rune, int, error) {
	return i.reader.RuneAt(i.stackFrame.offset + offset)
}

// PeekByte returns the byte at the provided offset.
//
// The read cursor and current read offset are not updated by this operation.
func (i *API) PeekByte(offset int) (byte, error) {
	return i.reader.ByteAt(i.stackFrame.offset + offset)
}

// Accept adds the last rune as read by NextRune() to the Result runes and
// moves the read cursor forward.
//
// It is not allowed to call Accept() when the previous call to NextRune()
// returned an error. Calling Accept() in such a case will result in a panic.
func (i *API) Accept() {
	if !i.runeRead {
		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+
			"without first calling NextRune()")
	} else if i.lastRuneErr != nil {
		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, "+
			"but the prior call to NextRune() failed")
	}

	i.acceptRunes(i.lastRune)
}

func (i *API) skipBytes(bytes ...byte) {
	for _, b := range bytes {
		i.stackFrame.moveCursorByByte(b)
		i.stackFrame.offset++
	}
	i.runeRead = false
}

func (i *API) acceptBytes(bytes ...byte) {
	curBytesEnd := i.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + len(bytes)

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[curBytesEnd:], bytes)
	for _, b := range bytes {
		i.stackFrame.moveCursorByByte(b)
		i.stackFrame.offset++
	}
	i.stackFrame.bytesEnd = newBytesEnd
	i.runeRead = false
}

func (i *API) skipRunes(width int, runes ...rune) {
	for _, r := range runes {
		i.stackFrame.moveCursorByRune(r)
	}
	i.stackFrame.offset += width
	i.runeRead = false
}

func (i *API) acceptRunes(runes ...rune) {
	runesAsString := string(runes)
	curBytesEnd := i.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + len(runesAsString)

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	for _, r := range runes {
		i.stackFrame.moveCursorByRune(r)
	}
	copy(i.bytes[curBytesEnd:], runesAsString)

	i.stackFrame.bytesEnd = newBytesEnd
	i.stackFrame.offset += len(runesAsString)
	i.runeRead = false
}

// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but for the rest this is a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
func (i *API) Fork() int {
	newStackLevel := i.stackLevel + 1
	newStackSize := newStackLevel + 1

	// Grow the stack frames capacity when needed.
	if cap(i.stackFrames) < newStackSize {
		newFrames := make([]stackFrame, newStackSize*2)
		copy(newFrames, i.stackFrames)
		i.stackFrames = newFrames
	}

	i.stackLevel++
	i.runeRead = false

	// This can be written in a shorter way, but this turned out to
	// be the best way performance-wise.
	parent := i.stackFrame
	child := &i.stackFrames[i.stackLevel]
	child.offset = parent.offset
	child.column = parent.column
	child.line = parent.line
	child.bytesStart = parent.bytesEnd
	child.bytesEnd = parent.bytesEnd
	child.tokenStart = parent.tokenEnd
	child.tokenEnd = parent.tokenEnd
	i.stackFrame = child

	return i.stackLevel
}

// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so it can immediately
// be reused for performing another match. This means that all Result data are
// cleared, but the read cursor position is kept at its current position.
// This allows a child to feed results in chunks to its parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which will return the tokenizer to the parent.
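//
// A sketch of the chunked feeding pattern (matchNextChunk is a hypothetical
// helper, standing in for any matching logic):
//
//	child := api.Fork()
//	for matchNextChunk(api) {
//		api.Merge(child) // feed this chunk to the parent, keep reading
//	}
//	api.Dispose(child)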
func (i *API) Merge(stackLevel int) {
	if stackLevel == 0 {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on the top-level API stack level 0")
	}
	if stackLevel != i.stackLevel {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the current stack level is %d "+
			"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
	}

	parent := &i.stackFrames[stackLevel-1]

	// The end of the parent slice aligns with the start of the child slice.
	// Because of this, to merge, the parent slice can simply be expanded
	// to include the child slice.
	// parent: |-----------|
	// child:              |------|
	// After the merge operation:
	// parent: |------------------|
	// child:                     |---> continue reading from here
	parent.bytesEnd = i.stackFrame.bytesEnd
	i.stackFrame.bytesStart = i.stackFrame.bytesEnd

	// The same logic applies to tokens.
	parent.tokenEnd = i.stackFrame.tokenEnd
	i.stackFrame.tokenStart = i.stackFrame.tokenEnd

	parent.offset = i.stackFrame.offset
	parent.line = i.stackFrame.line
	parent.column = i.stackFrame.column

	i.stackFrame.err = nil
	i.runeRead = false
}

// Dispose drops the forked child API at the provided stack level and makes
// its parent the active API again. The parent was never modified by the
// child, so processing can continue as if the fork never happened.
func (i *API) Dispose(stackLevel int) {
	if stackLevel == 0 {
		callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on the top-level API stack level 0")
	}
	if stackLevel != i.stackLevel {
		callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the current stack level is %d "+
			"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
	}

	i.runeRead = false
	i.stackLevel = stackLevel - 1
	i.stackFrame = &i.stackFrames[stackLevel-1]
}

// Reset moves the read cursor back to the start of the current stack frame
// and clears its results, so a new match can be tried from that position.
func (i *API) Reset() {
	i.runeRead = false
	if i.stackLevel == 0 {
		i.stackFrame.column = 0
		i.stackFrame.line = 0
		i.stackFrame.offset = 0
	} else {
		parent := i.stackFrames[i.stackLevel-1]
		i.stackFrame.column = parent.column
		i.stackFrame.line = parent.line
		i.stackFrame.offset = parent.offset
	}
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
	i.stackFrame.tokenEnd = i.stackFrame.tokenStart
	i.stackFrame.err = nil
}

// FlushInput flushes input data from the read.Buffer up to the current
// read offset of the parser.
//
// Note:
// When writing your own TokenHandler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when possible.
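//
// A sketch of what this flushing amounts to for a long-running scan (the
// loop structure and scanNextRecord are hypothetical):
//
//	for scanNextRecord(api) {
//		api.FlushInput() // drop already-processed input, keeping memory bounded
//	}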
func (i *API) FlushInput() bool {
	if i.stackFrame.offset > 0 {
		i.reader.Flush(i.stackFrame.offset)
		i.stackFrame.offset = 0
		return true
	}
	return false
}

// String returns the accepted bytes for the current stack level as a string.
func (i *API) String() string {
	bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
	return string(bytes)
}

// Runes returns the accepted bytes for the current stack level as a slice of runes.
func (i *API) Runes() []rune {
	bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
	return []rune(string(bytes))
}

// Rune returns the rune that starts at the provided byte offset within the
// accepted bytes for the current stack level.
func (i *API) Rune(offset int) rune {
	r, _ := utf8.DecodeRune(i.bytes[i.stackFrame.bytesStart+offset:])
	return r
}

// ClearBytes removes all accepted bytes for the current stack level.
func (i *API) ClearBytes() {
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}

// SetBytes replaces the accepted bytes for the current stack level with the
// provided bytes.
func (i *API) SetBytes(bytes ...byte) {
	i.ClearBytes()
	i.AddBytes(bytes...)
}

// AddBytes appends the provided bytes to the accepted bytes for the current
// stack level.
func (i *API) AddBytes(bytes ...byte) {
	// Grow the bytes capacity when needed.
	newBytesEnd := i.stackFrame.bytesEnd + len(bytes)
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[i.stackFrame.bytesEnd:], bytes)
	i.stackFrame.bytesEnd = newBytesEnd
}

// ClearRunes removes all accepted runes for the current stack level.
// Since runes are stored as bytes, this is equivalent to ClearBytes().
func (i *API) ClearRunes() {
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}

// SetRunes replaces the accepted runes for the current stack level with the
// provided runes.
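//
// A sketch of normalizing matched input (assuming the handler has already
// accepted a "\r\n" sequence that should be stored as a plain "\n"):
//
//	api.SetRunes('\n')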
func (i *API) SetRunes(runes ...rune) {
	i.ClearRunes()
	i.AddRunes(runes...)
}

// AddRunes appends the provided runes to the accepted runes for the current
// stack level.
func (i *API) AddRunes(runes ...rune) {
	// Grow the bytes capacity when needed.
	runesAsString := string(runes)
	newBytesEnd := i.stackFrame.bytesEnd + len(runesAsString)
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[i.stackFrame.bytesEnd:], runesAsString)
	i.stackFrame.bytesEnd = newBytesEnd
}

// AddString appends the bytes of the provided string to the accepted bytes
// for the current stack level.
func (i *API) AddString(s string) {
	i.AddBytes([]byte(s)...)
}

// SetString replaces the accepted bytes for the current stack level with the
// bytes of the provided string.
func (i *API) SetString(s string) {
	i.SetBytes([]byte(s)...)
}

// Cursor returns a human-readable description of the current read cursor
// position.
func (i *API) Cursor() string {
	if i.stackFrame.line == 0 && i.stackFrame.column == 0 {
		return "start of file"
	}
	return fmt.Sprintf("line %d, column %d", i.stackFrame.line+1, i.stackFrame.column+1)
}

// Tokens returns the accepted tokens for the current stack level.
func (i *API) Tokens() []Token {
	return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]
}

// Token returns the accepted token at the provided offset for the current
// stack level.
func (i *API) Token(offset int) Token {
	return i.tokens[i.stackFrame.tokenStart+offset]
}

// TokenValue returns the value of the accepted token at the provided offset
// for the current stack level.
func (i *API) TokenValue(offset int) interface{} {
	return i.tokens[i.stackFrame.tokenStart+offset].Value
}

// ClearTokens removes all accepted tokens for the current stack level.
func (i *API) ClearTokens() {
	i.stackFrame.tokenEnd = i.stackFrame.tokenStart
}

// SetTokens replaces the accepted tokens for the current stack level with
// the provided tokens.
func (i *API) SetTokens(tokens ...Token) {
	i.ClearTokens()
	i.AddTokens(tokens...)
}

// AddTokens appends the provided tokens to the accepted tokens for the
// current stack level.
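//
// A sketch of attaching a token for matched input (only the Value field is
// shown; other Token fields are omitted here):
//
//	api.AddTokens(Token{Value: api.String()})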
func (i *API) AddTokens(tokens ...Token) {
	// Grow the tokens capacity when needed.
	newTokenEnd := i.stackFrame.tokenEnd + len(tokens)
	if cap(i.tokens) < newTokenEnd {
		newTokens := make([]Token, newTokenEnd*2)
		copy(newTokens, i.tokens)
		i.tokens = newTokens
	}

	for offset, t := range tokens {
		i.tokens[i.stackFrame.tokenEnd+offset] = t
	}
	i.stackFrame.tokenEnd = newTokenEnd
}