280 lines
10 KiB
Go
280 lines
10 KiB
Go
package tokenize
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"git.makaay.nl/mauricem/go-parsekit/read"
|
|
)
|
|
|
|
// API holds the internal state of a tokenizer run and provides an API that
|
|
// tokenize.Handler functions can use to:
|
|
//
|
|
// • read and accept runes from the input (NextRune, Accept)
|
|
//
|
|
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
|
|
//
|
|
// • flush already read input data when not needed anymore (FlushInput)
|
|
//
|
|
// • retrieve the tokenizer Result struct (Result) to read or modify the results
|
|
//
|
|
// BASIC OPERATION:
|
|
//
|
|
// To retrieve the next rune from the API, call the NextRune() method.
|
|
//
|
|
// When the rune is to be accepted as input, call the method Accept(). The rune
|
|
// is then added to the result runes of the API and the read cursor is moved
|
|
// forward.
|
|
//
|
|
// By invoking NextRune() + Accept() multiple times, the result can be extended
|
|
// with as many runes as needed. Runes collected this way can later on be
|
|
// retrieved using the method Result().Runes().
|
|
//
|
|
// It is mandatory to call Accept() after retrieving a rune, before calling
|
|
// NextRune() again. Failing to do so will result in a panic.
|
|
//
|
|
// Next to adding runes to the result, it is also possible to modify the
|
|
// stored runes or to add lexical Tokens to the result. For all things
|
|
// concerning results, take a look at the Result struct, which
|
|
// can be accessed though the method Result().
|
|
//
|
|
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
|
|
//
|
|
// Sometimes, we must be able to perform a lookahead, which might either
|
|
// succeed or fail. In case of a failing lookahead, the state of the
|
|
// API must be brought back to the original state, so we can try
|
|
// a different route.
|
|
//
|
|
// The way in which this is supported, is by forking an API struct by
|
|
// calling method Fork(). This will return a forked child API, with
|
|
// empty result data, but using the same read cursor position as the
|
|
// forked parent.
|
|
//
|
|
// After forking, the same interface as described for BASIC OPERATION can be
|
|
// used to fill the results. When the lookahead was successful, then
|
|
// Merge() can be called on the forked child to append the child's results
|
|
// to the parent's results, and to move the read cursor position to that
|
|
// of the child.
|
|
//
|
|
// When the lookahead was unsuccessful, then the forked child API can
|
|
// be disposed by calling Dispose() on the forked child. This is not mandatory.
|
|
// Garbage collection will take care of this automatically.
|
|
// The parent API was never modified, so it can safely be used after disposal
|
|
// as if the lookahead never happened.
|
|
//
|
|
// Opinionated note:
|
|
// Many tokenizers/parsers take a different approach on lookaheads by using
|
|
// peeks and by moving the read cursor position back and forth, or by putting
|
|
// read input back on the input stream. That often leads to code that is
|
|
// efficient, however, in my opinion, not very intuitive to read. It can also
|
|
// be tedious to get the cursor position back at the correct position, which
|
|
// can lead to hard to track bugs. I much prefer this forking method, since
|
|
// no bookkeeping has to be implemented when implementing a parser.
|
|
type API struct {
	state      *apiState // shared API state data, common to this API and all its forks
	stackLevel int       // the stack level for this API object (0 = root, >0 = forked child)
}
|
|
|
|
// apiState holds the state that is shared between an API instance and
// all of its forked children: the input buffer and the Result stack.
type apiState struct {
	reader *read.Buffer // the buffered input from which runes are read
	stack  []Result     // the stack, used for forking / merging the API.
}
|
|
|
|
// initialAPIstackDepth determines the initial stack depth for the API.
// This value should work in most cases. When a parser requires a higher
// stack depth, then this is no problem. The API will automatically scale
// the stack when forking beyond this default number of stack levels.
const initialAPIstackDepth = 10
|
|
|
|
// NewAPI initializes a new API struct, wrapped around the provided input.
|
|
// For an overview of allowed inputs, take a look at the documentation
|
|
// for parsekit.read.New().
|
|
func NewAPI(input interface{}) API {
|
|
stack := make([]Result, 1, initialAPIstackDepth)
|
|
stack[0] = newResult()
|
|
state := apiState{
|
|
reader: read.New(input),
|
|
stack: stack,
|
|
}
|
|
api := API{
|
|
state: &state,
|
|
stackLevel: 0,
|
|
}
|
|
return api
|
|
}
|
|
|
|
// NextRune returns the rune at the current read offset.
|
|
//
|
|
// When an invalid UTF8 rune is encountered on the input, it is replaced with
|
|
// the utf.RuneError rune. It's up to the caller to handle this as an error
|
|
// when needed.
|
|
//
|
|
// After reading a rune it must be Accept()-ed to move the read cursor forward
|
|
// to the next rune. Doing so is mandatory. When doing a second call to NextRune()
|
|
// without explicitly accepting, this method will panic. You can see this as a
|
|
// built-in unit test, enforcing correct serialization of API method calls.
|
|
func (i *API) NextRune() (rune, error) {
|
|
if i.stackLevel > len(i.state.stack)-1 {
|
|
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
|
|
"using a non-active API fork (a parent was read or merged, causing this "+
|
|
"fork to be invalidated)")
|
|
}
|
|
result := &(i.state.stack[i.stackLevel])
|
|
if result.lastRune != nil {
|
|
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
|
|
"without a prior call to Accept()")
|
|
}
|
|
|
|
readRune, err := i.state.reader.RuneAt(result.offset)
|
|
result.lastRune = &runeInfo{r: readRune, err: err}
|
|
return readRune, err
|
|
}
|
|
|
|
// Accept the last rune as read by NextRune() into the Result runes and move
|
|
// the cursor forward.
|
|
//
|
|
// It is not allowed to call Accept() when the previous call to NextRune()
|
|
// returned an error. Calling Accept() in such case will result in a panic.
|
|
func (i *API) Accept() {
|
|
if i.stackLevel > len(i.state.stack)-1 {
|
|
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
|
|
"using a non-active API fork (a parent was read or merged, causing this "+
|
|
"fork to be invalidated)")
|
|
}
|
|
result := &(i.state.stack[i.stackLevel])
|
|
if result.lastRune == nil {
|
|
callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} without first calling NextRune()")
|
|
} else if result.lastRune.err != nil {
|
|
callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, but the prior call to NextRune() failed")
|
|
}
|
|
result.runes = append(result.runes, result.lastRune.r)
|
|
result.cursor.moveByRune(result.lastRune.r)
|
|
result.offset++
|
|
result.lastRune = nil
|
|
}
|
|
|
|
// Fork forks off a child of the API struct. It will reuse the same
|
|
// read buffer and cursor position, but for the rest this is a fresh API.
|
|
//
|
|
// By forking an API, you can freely work with the forked child, without
|
|
// affecting the parent API. This is for example useful when you must perform
|
|
// some form of lookahead.
|
|
//
|
|
// When processing of the Handler was successful and you want to add the results
|
|
// to the parent API, you can call Merge() on the forked child.
|
|
// This will add the results to the results of the parent (runes, tokens).
|
|
// It also updates the read cursor position of the parent to that of the child.
|
|
//
|
|
// When the lookahead was unsuccessful, then the forked child API can
|
|
// disposed by calling Dispose() on the forked child. This is not mandatory.
|
|
// Garbage collection will take care of this automatically.
|
|
// The parent API was never modified, so it can safely be used after disposal
|
|
// as if the lookahead never happened.
|
|
func (i *API) Fork() API {
|
|
if i.stackLevel > len(i.state.stack)-1 {
|
|
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
|
|
"using a non-active API fork (a parent was read or merged, causing this "+
|
|
"fork to be invalidated)")
|
|
}
|
|
result := &(i.state.stack[i.stackLevel])
|
|
|
|
// Grow the stack storage when needed.
|
|
newStackSize := i.stackLevel + 2
|
|
if cap(i.state.stack) < newStackSize {
|
|
newStack := make([]Result, newStackSize, 2*newStackSize)
|
|
copy(newStack, i.state.stack)
|
|
i.state.stack = newStack
|
|
|
|
}
|
|
|
|
// Create the new fork.
|
|
child := API{
|
|
state: i.state,
|
|
stackLevel: i.stackLevel + 1,
|
|
}
|
|
childResult := newResult()
|
|
childResult.cursor = result.cursor
|
|
childResult.offset = result.offset
|
|
i.state.stack = i.state.stack[:newStackSize] // todo use append() directly?
|
|
i.state.stack[child.stackLevel] = childResult
|
|
|
|
// Update the parent.
|
|
result.lastRune = nil
|
|
|
|
return child
|
|
}
|
|
|
|
// stackDump provides a dump of the currently active stack levels in the API.
|
|
// This is used for debugging purposes and is normally not part of the standard
|
|
// code flow.
|
|
func (i *API) stackDump() {
|
|
for i, r := range i.state.stack {
|
|
fmt.Printf("[%d] %s: %q\n", i, r.cursor, r.String())
|
|
}
|
|
}
|
|
|
|
// Merge appends the results of a forked child API (runes, tokens) to the
|
|
// results of its parent. The read cursor of the parent is also updated
|
|
// to that of the forked child.
|
|
//
|
|
// After the merge operation, the child results are reset so it can immediately
|
|
// be reused for performing another match. This means that all Result data are
|
|
// cleared, but the read cursor position is kept at its current position.
|
|
// This allows a child to feed results in chunks to its parent.
|
|
func (i *API) Merge() {
|
|
if i.stackLevel == 0 {
|
|
callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} on a non-forked API")
|
|
}
|
|
if i.stackLevel > len(i.state.stack)-1 {
|
|
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
|
|
"using a non-active API fork (a parent was read or merged, causing this "+
|
|
"fork to be invalidated)")
|
|
}
|
|
result := &(i.state.stack[i.stackLevel])
|
|
parentResult := &(i.state.stack[i.stackLevel-1])
|
|
parentResult.runes = append(parentResult.runes, result.runes...)
|
|
parentResult.tokens = append(parentResult.tokens, result.tokens...)
|
|
parentResult.offset = result.offset
|
|
parentResult.cursor = result.cursor
|
|
i.Reset()
|
|
i.DisposeChilds()
|
|
}
|
|
|
|
func (i *API) Dispose() {
|
|
i.state.stack = i.state.stack[:i.stackLevel]
|
|
}
|
|
|
|
// DisposeChilds invalidates all forked children of this API by shrinking
// the stack down to this API's own level. This API itself stays active.
func (i *API) DisposeChilds() {
	i.state.stack = i.state.stack[:i.stackLevel+1]
}
|
|
|
|
// Reset clears the Result data (pending rune, runes, tokens, error) for the
// currently active stack level. The read offset and cursor position are
// deliberately kept, so matching can continue from the current position.
func (i *API) Reset() {
	result := &(i.state.stack[i.stackLevel])
	result.lastRune = nil
	result.runes = result.runes[:0]
	result.tokens = result.tokens[:0]
	result.err = nil
}
|
|
|
|
// FlushInput flushes processed input data from the read.Buffer.
|
|
// In this context 'processed' means all runes that were read using NextRune()
|
|
// and that were added to the results using Accept().
|
|
//
|
|
// Note:
|
|
// When writing your own TokenHandler, you normally won't have to call this
|
|
// method yourself. It is automatically called by parsekit when needed.
|
|
func (i API) FlushInput() bool {
|
|
result := &(i.state.stack[i.stackLevel])
|
|
if result.offset > 0 {
|
|
i.state.reader.Flush(result.offset)
|
|
result.offset = 0
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Result returns the Result struct from the API. The returned struct
|
|
// can be used to retrieve and to modify result data.
|
|
func (i API) Result() *Result {
|
|
return &(i.state.stack[i.stackLevel])
|
|
}
|