go-parsekit/tokenapi.go

212 lines
7.7 KiB
Go

package parsekit
import (
"fmt"
"git.makaay.nl/mauricem/go-parsekit/reader"
)
// TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from
// a parsekit.reader.Reader and to report back tokenizing results. For easy
// lookahead support, a forking strategy is provided.
//
// BASIC OPERATION:
//
// To retrieve the next rune from the TokenAPI, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the TokenAPI and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Result().Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the TokenHandlerResult struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// TokenAPI must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking a TokenAPI struct by
// calling method Fork(). This will return a forked child TokenAPI, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child TokenAPI can
// simply be discarded. The parent TokenAPI was never modified, so it can
// safely be used as if the lookahead never happened.
//
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, however, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor position back at the correct position, which
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type TokenAPI struct {
// reader is the input source that runes are read from. A forked child
// gets the same *reader.Reader as its parent (see Fork()).
reader *reader.Reader
// parent and child link a TokenAPI into a fork chain. Both are nil for a
// TokenAPI that has not been forked and is not a fork itself.
parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child
child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent
// result also carries the read offset and the last rune as read by
// NextRune(), which is pending until Accept() is called.
result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
}
// NewTokenAPI creates a new TokenAPI that reads from the provided input.
//
// The input is handed over unchanged to reader.New(), so the accepted input
// types are whatever that constructor supports (NOTE(review): earlier
// documentation mentioned io.Reader; confirm against the reader package).
// The returned TokenAPI has no fork parent or child and gets a freshly
// created TokenHandlerResult.
func NewTokenAPI(input interface{}) *TokenAPI {
	api := new(TokenAPI)
	api.reader = reader.New(input)
	api.result = newTokenHandlerResult()
	return api
}
// NextRune reads the rune at the current read offset and returns it, together
// with the error as reported by the underlying reader.
//
// The read rune is remembered as the "last rune" of the result. It has to be
// Accept()-ed before NextRune() may be called again: a second call without an
// Accept() in between is reported through callerPanic. Calling NextRune() also
// detaches any forked children of this TokenAPI.
//
// NOTE(review): according to the original documentation, an invalid UTF8 rune
// on the input is replaced with utf8.RuneError, and it is up to the caller to
// treat that as an error when needed. That replacement is done by the reader
// and is not visible in this file.
func (i *TokenAPI) NextRune() (rune, error) {
	res := i.result
	if res.lastRune != nil {
		callerPanic(1, "parsekit.TokenAPI.NextRune(): NextRune() called at {caller} without a prior call to Accept()")
	}

	// Reading on this TokenAPI invalidates any forks that were created from it.
	i.detachChilds()

	r, err := i.reader.RuneAt(res.offset)
	res.lastRune = &runeInfo{r: r, err: err}
	return r, err
}
// Accept adds the rune that was read by the preceding NextRune() call to the
// result runes and moves the read position one rune forward: the cursor is
// moved over the accepted rune and the read offset is incremented.
//
// Accept() may only be called after a successful NextRune(). Calling it
// without a preceding NextRune(), or after a NextRune() that returned an
// error, is reported through callerPanic.
func (i *TokenAPI) Accept() {
	res := i.result
	switch last := res.lastRune; {
	case last == nil:
		callerPanic(1, "parsekit.TokenAPI.Accept(): Accept() called at {caller} without first calling NextRune()")
	case last.err != nil:
		callerPanic(1, "parsekit.TokenAPI.Accept(): Accept() called at {caller}, but the prior call to NextRune() failed")
	}

	accepted := res.lastRune.r
	res.runes = append(res.runes, accepted)
	res.cursor.Move(fmt.Sprintf("%c", accepted))
	res.offset++

	// Clear the pending rune, so NextRune() can be called again.
	res.lastRune = nil
}
// Fork creates a child TokenAPI that shares the Reader of its parent and that
// starts at the parent's read offset and cursor position. Apart from that, the
// child is a fresh TokenAPI with its own, newly created TokenHandlerResult.
//
// Working with the child does not modify the parent, which makes forking
// useful for lookaheads:
//
//   - When the lookahead succeeds, call Merge() on the child to append its
//     results to the parent and to move the parent's read position to that
//     of the child.
//   - When the lookahead fails, simply discard the child and continue with
//     the parent as if nothing happened.
//
// Forking detaches any earlier forks of this TokenAPI and drops a rune that
// was read by NextRune() but not yet accepted.
func (i *TokenAPI) Fork() *TokenAPI {
	// Reset the forking and reading state of the parent.
	i.detachChilds()
	i.result.lastRune = nil

	fork := &TokenAPI{
		reader: i.reader,
		parent: i,
		result: newTokenHandlerResult(),
	}

	// Let the fork start reading where the parent currently is.
	fork.result.offset = i.result.offset
	*fork.result.cursor = *i.result.cursor

	i.child = fork
	return fork
}
// Merge hands over the results of a forked child TokenAPI to its parent: the
// child's runes and tokens are appended to those of the parent, and the
// parent's read offset and cursor are set to those of the child.
//
// Afterwards the child is reset for reuse: its runes, tokens, error and
// pending rune are cleared and its own forks are detached, but its read
// position is kept. This allows a child to feed results to its parent in
// chunks.
//
// Calling Merge() on a TokenAPI that has no parent is reported through
// callerPanic.
func (i *TokenAPI) Merge() {
	parent := i.parent
	if parent == nil {
		callerPanic(1, "parsekit.TokenAPI.Merge(): Merge() called at {caller} on a non-forked TokenAPI")
	}

	// Append the collected results to the parent.
	parent.result.runes = append(parent.result.runes, i.result.runes...)
	parent.result.tokens = append(parent.result.tokens, i.result.tokens...)

	// Move the parent's read position to that of the child.
	parent.result.offset = i.result.offset
	*parent.result.cursor = *i.result.cursor

	// Reset the child for reuse.
	i.clearResults()
	i.detachChilds()
}
// addResultsToParent appends the runes and tokens of this TokenAPI's result
// to the result of its parent. The caller must make sure that a parent exists.
func (i *TokenAPI) addResultsToParent() {
	dst, src := i.parent.result, i.result
	dst.runes = append(dst.runes, src.runes...)
	dst.tokens = append(dst.tokens, src.tokens...)
}
// syncCursorTo copies the read position of this TokenAPI (the read offset and
// the cursor value) into the result of the provided TokenAPI. The cursor is
// copied by value, so both TokenAPIs keep their own cursor struct.
func (i *TokenAPI) syncCursorTo(to *TokenAPI) {
	dst, src := to.result, i.result
	dst.offset = src.offset
	*dst.cursor = *src.cursor
}
// clearResults drops the pending rune, the collected runes and tokens and the
// error from the result. The read offset and cursor are left untouched.
func (i *TokenAPI) clearResults() {
	res := i.result
	res.lastRune = nil
	// Use empty, non-nil slices, so the old backing arrays are not reused.
	res.runes = make([]rune, 0)
	res.tokens = make([]*Token, 0)
	res.err = nil
}
// detachChilds cuts the chain of forks below this TokenAPI loose: all
// parent/child links in that chain are cleared and this TokenAPI ends up
// without a child. It is a no-op when there is no child.
func (i *TokenAPI) detachChilds() {
	child := i.child
	if child == nil {
		return
	}
	child.detachChildsRecurse()
	i.child = nil
}
// detachChildsRecurse clears the parent and child links of this TokenAPI and
// of every TokenAPI further down its chain of forks.
func (i *TokenAPI) detachChildsRecurse() {
	node := i
	for {
		// Remember the next fork in the chain before its link is cleared.
		next := node.child
		node.child = nil
		node.parent = nil
		if next == nil {
			return
		}
		node = next
	}
}
// flushReader calls Flush() on the reader with the current read offset and
// resets that offset to zero afterwards. It reports whether a flush was
// performed, which is only the case when the offset is greater than zero.
//
// NOTE(review): presumably Flush() discards the input up to the given offset;
// confirm against the reader package.
func (i *TokenAPI) flushReader() bool {
	consumed := i.result.offset
	if consumed <= 0 {
		return false
	}
	i.reader.Flush(consumed)
	i.result.offset = 0
	return true
}
// Result gives access to the TokenHandlerResult of the TokenAPI.
//
// The returned pointer refers to the live result struct, not to a copy, so it
// can be used both to inspect and to modify the result data.
func (i *TokenAPI) Result() *TokenHandlerResult {
	return i.result
}