// File: go-parsekit/tokenize/api.go

package tokenize
import (
"git.makaay.nl/mauricem/go-parsekit/read"
)
// API holds the internal state of a tokenizer run. A tokenizer run uses
// tokenize.Handler functions to move the tokenizer forward through the
// input and to provide tokenizer output.
//
// The methods as provided by the API are used by tokenize.Handler functions to:
//
// • access and process runes / bytes from the input data
//
// • flush processed input data that are not required anymore (FlushInput)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • emit tokens and/or bytes to be used by a parser
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the Result struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking an API struct by
// calling method Fork(). This will return a forked child API, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
//
// Opinionized note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, however, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor position back at the correct position, which
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
	stackFrames  []stackFrame  // the stack frames, containing stack level-specific data
	stackLevel   int           // the current stack level (0 = top level, +1 per active Fork)
	stackFrame   *stackFrame   // the current stack frame; points at stackFrames[stackLevel]
	reader       *read.Buffer  // the buffered input reader
	Input        Input         // provides input-related functionality
	Byte         InputByteMode // access to a set of byte-based input methods
	Rune         InputRuneMode // access to a set of rune-based input methods
	Output       Output        // provides output-related functionality
	outputTokens []Token       // accepted tokens
	outputBytes  []byte        // accepted bytes
	snapshot     [9]int        // storage for the Snapshot() / RestoreSnapshot() feature; NewAPI only sets index 0 to -1 (presumably "no snapshot taken" — confirm against snapshot users)
}
// stackFrame holds the state for one level of the API's fork stack.
// Level 0 is the top-level tokenizer state; each Fork() pushes one level.
// The offsetLocal, line and column fields are relative to the start of
// their own frame (they begin at zero when the frame is created).
type stackFrame struct {
	offsetLocal int // the read offset, relative to the start of this stack frame
	offset      int // the read offset, relative to the start of the reader buffer
	column      int // the column at which the cursor is (0-indexed, relative to the start of the stack frame)
	line        int // the line at which the cursor is (0-indexed, relative to the start of the stack frame)
	bytesStart  int // the starting point in the API.bytes slice for runes produced by this stack level
	bytesEnd    int // the end point in the API.bytes slice for runes produced by this stack level
	tokenStart  int // the starting point in the API.tokens slice for tokens produced by this stack level
	tokenEnd    int // the end point in the API.tokens slice for tokens produced by this stack level

	// TODO
	err error // can be used by a Handler to report a specific issue with the input
}
// Initial allocation sizes for the stores that NewAPI pre-allocates.
const (
	initialStackDepth       = 64   // stack frames
	initialTokenStoreLength = 64   // output tokens
	initialByteStoreLength  = 1024 // output bytes
)
// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) *API {
	buffer := read.New(input)

	api := &API{
		reader:       buffer,
		stackFrames:  make([]stackFrame, initialStackDepth),
		outputBytes:  make([]byte, initialByteStoreLength),
		outputTokens: make([]Token, initialTokenStoreLength),
	}

	// Wire up the input/output facades and point the current frame at the
	// top-level stack frame.
	api.Input = Input{api: api, reader: buffer}
	api.Byte = InputByteMode{api: api, reader: buffer}
	api.Rune = InputRuneMode{api: api, reader: buffer}
	api.Output = Output{api: api}
	api.stackFrame = &api.stackFrames[0]

	// -1 presumably marks the snapshot storage as unused (no valid snapshot
	// carries stack level -1) — confirm against the snapshot feature users.
	api.snapshot[0] = -1

	return api
}
// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but for the rest this can be considered
// a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
func (tokenAPI *API) Fork() int {
	tokenAPI.stackLevel++
	newStackLevel := tokenAPI.stackLevel
	// Grow the stack frames capacity when needed.
	frames := tokenAPI.stackFrames
	if cap(frames) < (newStackLevel + 1) {
		newFrames := make([]stackFrame, cap(frames)*2)
		copy(newFrames, frames)
		tokenAPI.stackFrames = newFrames
	}
	// NOTE: after a grow, this pointer still aims into the OLD backing array.
	// That is safe here, because it is only read from (the copy above made
	// old and new parent frames identical) and re-pointed below.
	parent := tokenAPI.stackFrame
	// The child inherits the parent's absolute read offset, and its result
	// ranges start (empty) where the parent's results end. All other fields
	// (offsetLocal, line, column, err) start at their zero values, i.e.
	// relative to the start of the child frame.
	tokenAPI.stackFrames[newStackLevel] = stackFrame{
		offset:     parent.offset,
		bytesStart: parent.bytesEnd,
		bytesEnd:   parent.bytesEnd,
		tokenStart: parent.tokenEnd,
		tokenEnd:   parent.tokenEnd,
	}
	tokenAPI.stackFrame = &tokenAPI.stackFrames[newStackLevel]
	return newStackLevel
}
// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so it can immediately
// be reused for performing another match. This means that all Result data are
// cleared, but the read cursor position is kept at its current position.
// This allows a child to feed results in chunks to its parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which will return the tokenizer to the parent.
func (tokenAPI *API) Merge(stackLevel int) {
	tokenAPI.checkStackLevelForMethod("Merge", stackLevel)
	parent := &tokenAPI.stackFrames[stackLevel-1]
	f := tokenAPI.stackFrame

	// The end of the parent slice aligns with the start of the child slice.
	// Because of this, to merge the parent slice can simply be expanded
	// to include the child slice.
	// parent : |----------|
	// child:              |------|
	// After merge operation:
	// parent: |-----------------|
	// child:                    |---> continue reading from here
	parent.bytesEnd = f.bytesEnd
	f.bytesStart = f.bytesEnd

	// The same logic applies to tokens.
	parent.tokenEnd = f.tokenEnd
	f.tokenStart = f.tokenEnd

	// Update the parent read offset.
	parent.offsetLocal = parent.offsetLocal + (f.offset - parent.offset)
	parent.offset = f.offset

	// Update the parent cursor position. The child's line/column are
	// relative to the start of the child frame (both are zero right after
	// Fork). When the child consumed one or more newlines, the parent moves
	// down that many lines and the column restarts at the child's column;
	// otherwise the cursor stayed on the same line and only the column
	// advances.
	//
	// BUG FIX: this previously tested `f.line > parent.line`, but those two
	// values live in different coordinate spaces (each is relative to its
	// own frame start), so line advances were silently dropped whenever the
	// parent's relative line count already exceeded the child's.
	if f.line > 0 {
		parent.line += f.line
		parent.column = f.column
	} else {
		parent.column += f.column
	}

	// Reset the child's frame-relative state, so the child can immediately
	// be reused to feed another chunk of results to the parent.
	f.line = 0
	f.column = 0
	f.err = nil
}
// Dispose drops the forked child identified by stackLevel and makes its
// parent the active stack frame again. The parent frame was never modified
// by the child, so after disposal it can be used as if the fork never
// happened.
func (tokenAPI *API) Dispose(stackLevel int) {
	tokenAPI.checkStackLevelForMethod("Dispose", stackLevel)
	parentLevel := stackLevel - 1
	tokenAPI.stackLevel = parentLevel
	tokenAPI.stackFrame = &tokenAPI.stackFrames[parentLevel]
}
// checkStackLevelForMethod guards Merge() and Dispose() against misuse:
// they may only be invoked for the currently active, non-top-level stack
// frame. On violation, callerPanic is invoked with a message template;
// the {name} and {caller} placeholders are presumably substituted by
// callerPanic itself — confirm against its implementation.
func (tokenAPI *API) checkStackLevelForMethod(name string, stackLevel int) {
	if stackLevel == 0 {
		callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
			"on the top-level API stack level 0")
	}
	if stackLevel != tokenAPI.stackLevel {
		callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the current stack level is %d "+
			"(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel)
	}
}
type Snapshot [9]int
// MakeSnapshot captures the state of the active stack frame (plus the
// current stack level) in a Snapshot, which can later be handed to
// RestoreSnapshot() to roll the frame back to this state.
func (tokenAPI *API) MakeSnapshot() Snapshot {
	frame := tokenAPI.stackFrame

	var snap Snapshot
	snap[0] = tokenAPI.stackLevel
	snap[1] = frame.bytesStart
	snap[2] = frame.bytesEnd
	snap[3] = frame.tokenStart
	snap[4] = frame.tokenEnd
	snap[5] = frame.offset
	snap[6] = frame.offsetLocal
	snap[7] = frame.line
	snap[8] = frame.column
	return snap
}
// RestoreSnapshot rolls the active stack frame back to the state that was
// captured by MakeSnapshot(). The snapshot must have been taken at the
// current stack level; otherwise a panic is raised via callerPanic.
func (tokenAPI *API) RestoreSnapshot(snap Snapshot) {
	if snap[0] != tokenAPI.stackLevel {
		callerPanic("RestoreSnapshot", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the provided snapshot was created for stack level %d",
			tokenAPI.stackLevel, snap[0])
	}

	frame := tokenAPI.stackFrame
	frame.bytesStart, frame.bytesEnd = snap[1], snap[2]
	frame.tokenStart, frame.tokenEnd = snap[3], snap[4]
	frame.offset, frame.offsetLocal = snap[5], snap[6]
	frame.line, frame.column = snap[7], snap[8]
}