// go-parsekit/tokenize/api.go

package tokenize
import (
"fmt"
"unicode/utf8"
"git.makaay.nl/mauricem/go-parsekit/read"
)
// API holds the internal state of a tokenizer run. A run uses tokenize.Handler
// functions to move the tokenizer forward through the input and to provide
// tokenizer output. The API is used by these tokenize.Handler functions to:
//
// • access and process runes / bytes from the input data
//
// • flush processed input data that are not required anymore (FlushInput)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • emit tokens and/or bytes to be used by a parser
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Besides adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the Result struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes we must be able to perform a lookahead, which might either
// succeed or fail. When the lookahead fails, the API must be brought back
// to its original state, so we can try a different route.
//
// This is supported by forking the API through the method Fork().
// Fork() returns a forked child API with empty result data, but with
// the same read cursor position as the forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, the forked child API can be
// disposed of by calling Dispose() on the forked child. This brings the
// API back to the parent stack level. The parent was never modified by
// the child, so after disposal it can safely be used as if the lookahead
// never happened.
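//
// A minimal sketch of this flow (assuming a Handler signature along the
// lines of func(t *tokenize.API) bool; the handler below is purely
// illustrative):
//
//   matchAB := func(t *tokenize.API) bool {
//       child := t.Fork()
//       for _, want := range "ab" {
//           r, _, err := t.PeekRune(0)
//           if err != nil || r != want {
//               t.Dispose(child) // lookahead failed; the parent is untouched
//               return false
//           }
//           t.AcceptRune(r)
//       }
//       t.Merge(child)   // lookahead succeeded; hand the results to the parent
//       t.Dispose(child) // return to the parent stack level
//       return true
//   }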
//
// Opinionated note:
// Many tokenizers/parsers take a different approach to lookaheads: they use
// peeks, move the read cursor position back and forth, or put read input
// back on the input stream. That often leads to code that is efficient but,
// in my opinion, not very intuitive to read. It can also be tedious to get
// the cursor back to the correct position, which can lead to hard-to-track
// bugs. I much prefer this forking method, since no such bookkeeping has to
// be implemented when writing a parser.
type API struct {
reader *read.Buffer // the input data reader
bytes []byte // accepted bytes
tokens []Token // accepted tokens
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
}
type stackFrame struct {
offset int // the read offset (relative to the start of the reader buffer) for this stack frame
column int // the column at which the cursor is (0-indexed)
line int // the line at which the cursor is (0-indexed)
bytesStart int // the starting point in the API.bytes slice for bytes produced by this stack level
bytesEnd int // the end point in the API.bytes slice for bytes produced by this stack level
tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level
tokenEnd int // the end point in the API.tokens slice for tokens produced by this stack level
// TODO
err error // can be used by a Handler to report a specific issue with the input
}
const initialStackDepth = 64
const initialTokenStoreLength = 64
const initialByteStoreLength = 1024
// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
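//
// A short sketch of wrapping a string input (illustrative only):
//
//   api := tokenize.NewAPI("dog")
//   b, _ := api.PeekByte(0) // b == 'd'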
func NewAPI(input interface{}) *API {
api := &API{
reader: read.New(input),
bytes: make([]byte, initialByteStoreLength),
tokens: make([]Token, initialTokenStoreLength),
stackFrames: make([]stackFrame, initialStackDepth),
}
api.stackFrame = &api.stackFrames[0]
return api
}
// PeekByte returns the byte at the provided byte offset.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (i *API) PeekByte(offset int) (byte, error) {
return i.reader.ByteAt(i.stackFrame.offset + offset)
}
// SkipByte is used to skip over a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. It is of no interest.
// I will now continue reading after this byte."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The byte is not added to
// the results.
func (i *API) SkipByte(b byte) {
i.stackFrame.moveCursorByByte(b)
i.stackFrame.offset++
}
// SkipBytes is used to skip over one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. They are of no interest.
// I will now continue reading after these bytes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The bytes are not added to
// the results.
func (i *API) SkipBytes(bytes ...byte) {
for _, b := range bytes {
i.stackFrame.moveCursorByByte(b)
i.stackFrame.offset++
}
}
// AcceptByte is used to accept a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this byte."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the byte to the tokenizer
// results.
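//
// For example, a handler could collect a run of ASCII digits like this
// (sketch only, with error handling reduced to a bare minimum):
//
//   for {
//       b, err := api.PeekByte(0)
//       if err != nil || b < '0' || b > '9' {
//           break
//       }
//       api.AcceptByte(b) // store the digit and move the cursor forward
//   }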
func (i *API) AcceptByte(b byte) {
curBytesEnd := i.stackFrame.bytesEnd
maxRequiredBytes := curBytesEnd + 1
// Grow the bytes capacity when needed.
if cap(i.bytes) < maxRequiredBytes {
newBytes := make([]byte, maxRequiredBytes*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
i.bytes[curBytesEnd] = b
i.stackFrame.moveCursorByByte(b)
i.stackFrame.bytesEnd++
i.stackFrame.offset++
}
// AcceptBytes is used to accept one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these bytes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the bytes to the tokenizer
// results.
func (i *API) AcceptBytes(bytes ...byte) {
curBytesEnd := i.stackFrame.bytesEnd
newBytesEnd := curBytesEnd + len(bytes)
// Grow the bytes capacity when needed.
if cap(i.bytes) < newBytesEnd {
newBytes := make([]byte, newBytesEnd*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
copy(i.bytes[curBytesEnd:], bytes)
for _, b := range bytes {
i.stackFrame.moveCursorByByte(b)
i.stackFrame.offset++
}
i.stackFrame.bytesEnd = newBytesEnd
}
// PeekRune returns the UTF8 rune at the provided byte offset, including its byte width.
//
// The byte width is useful to know what byte offset you'll have to use to peek
// the next byte or rune. Some UTF8 runes take up 4 bytes of data, so when the
// first rune starts at offset = 0, the second rune might start at offset = 4.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf8.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
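//
// A sketch of walking the input rune by rune, using the returned width to
// compute the next peek offset (illustrative only):
//
//   offset := 0
//   for {
//       r, w, err := api.PeekRune(offset)
//       if err != nil || r == utf8.RuneError {
//           break
//       }
//       offset += w // the next rune starts w bytes further along
//   }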
func (i *API) PeekRune(offset int) (rune, int, error) {
return i.reader.RuneAt(i.stackFrame.offset + offset)
}
// SkipRune is used to skip over a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. It is of no interest.
// I will now continue reading after this rune."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The rune is not added to
// the results.
func (i *API) SkipRune(r rune) {
i.stackFrame.moveCursorByRune(r)
i.stackFrame.offset += utf8.RuneLen(r)
}
// SkipRunes is used to skip over one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. They are of no interest.
// I will now continue reading after these runes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The runes are not added to
// the results.
func (i *API) SkipRunes(runes ...rune) {
for _, r := range runes {
i.stackFrame.moveCursorByRune(r)
i.stackFrame.offset += utf8.RuneLen(r)
}
}
// AcceptRune is used to accept a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this rune."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the rune to the tokenizer
// results.
func (i *API) AcceptRune(r rune) {
curBytesEnd := i.stackFrame.bytesEnd
maxRequiredBytes := curBytesEnd + utf8.UTFMax
// Grow the bytes capacity when needed.
if cap(i.bytes) < maxRequiredBytes {
newBytes := make([]byte, maxRequiredBytes*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
i.stackFrame.moveCursorByRune(r)
w := utf8.EncodeRune(i.bytes[curBytesEnd:], r)
i.stackFrame.bytesEnd += w
i.stackFrame.offset += w
}
// AcceptRunes is used to accept one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these runes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the runes to the tokenizer
// results.
func (i *API) AcceptRunes(runes ...rune) {
runesAsString := string(runes)
byteLen := len(runesAsString)
curBytesEnd := i.stackFrame.bytesEnd
newBytesEnd := curBytesEnd + byteLen
// Grow the bytes capacity when needed.
if cap(i.bytes) < newBytesEnd {
newBytes := make([]byte, newBytesEnd*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
for _, r := range runes {
i.stackFrame.moveCursorByRune(r)
}
copy(i.bytes[curBytesEnd:], runesAsString)
i.stackFrame.bytesEnd = newBytesEnd
i.stackFrame.offset += byteLen
}
// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but otherwise it can be considered
// a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, the forked child API can be
// disposed of by calling Dispose() on the forked child. This brings the
// API back to the parent stack level. The parent was never modified by
// the child, so after disposal it can safely be used as if the lookahead
// never happened.
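//
// Typical usage looks like this (sketch only):
//
//   child := api.Fork()
//   // ... try a match using the Peek/Skip/Accept methods, yielding ok ...
//   if ok {
//       api.Merge(child) // keep the child's results and cursor position
//   }
//   api.Dispose(child) // drop back to the parent stack level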
func (i *API) Fork() int {
newStackLevel := i.stackLevel + 1
newStackSize := newStackLevel + 1
// Grow the stack frames capacity when needed.
if cap(i.stackFrames) < newStackSize {
newFrames := make([]stackFrame, newStackSize*2)
copy(newFrames, i.stackFrames)
i.stackFrames = newFrames
}
i.stackLevel++
// This can be written in a shorter way, but this turned out to
// be the best way performance-wise.
parent := i.stackFrame
child := &i.stackFrames[i.stackLevel]
child.offset = parent.offset
child.column = parent.column
child.line = parent.line
child.bytesStart = parent.bytesEnd
child.bytesEnd = parent.bytesEnd
child.tokenStart = parent.tokenEnd
child.tokenEnd = parent.tokenEnd
i.stackFrame = child
return i.stackLevel
}
// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child's results are reset, so the child can
// immediately be reused for performing another match. This means that all
// Result data are cleared, but the read cursor position is kept at its
// current position. This allows a child to feed results in chunks to its
// parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which will return the tokenizer to the parent.
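//
// A sketch of feeding results to the parent in chunks (scanNextChunk is a
// hypothetical helper that fills the child's results):
//
//   child := api.Fork()
//   for scanNextChunk(api) {
//       api.Merge(child) // hand this chunk to the parent, then keep reading
//   }
//   api.Dispose(child)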
func (i *API) Merge(stackLevel int) {
if stackLevel == 0 {
callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
"on the top-level API stack level 0")
}
if stackLevel != i.stackLevel {
callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the current stack level is %d "+
"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
}
parent := &i.stackFrames[stackLevel-1]
// The end of the parent slice aligns with the start of the child slice.
// Because of this, merging is a simple matter of expanding the parent
// slice to include the child slice.
//
// parent: |----------|
// child:             |------|
//
// After the merge operation:
//
// parent: |-----------------|
// child:                    |---> continue reading from here
parent.bytesEnd = i.stackFrame.bytesEnd
i.stackFrame.bytesStart = i.stackFrame.bytesEnd
// The same logic applies to tokens.
parent.tokenEnd = i.stackFrame.tokenEnd
i.stackFrame.tokenStart = i.stackFrame.tokenEnd
parent.offset = i.stackFrame.offset
parent.line = i.stackFrame.line
parent.column = i.stackFrame.column
i.stackFrame.err = nil
}
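// Dispose disposes of the forked child API for the provided stack level and
// makes its parent the active API again. It panics when the provided stack
// level is not the currently active one, or when it is the top-level API.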
func (i *API) Dispose(stackLevel int) {
if stackLevel == 0 {
callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
"on the top-level API stack level 0")
}
if stackLevel != i.stackLevel {
callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the current stack level is %d "+
"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
}
i.stackLevel = stackLevel - 1
i.stackFrame = &i.stackFrames[stackLevel-1]
}
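// Reset clears the results for the current stack frame and moves its read
// cursor back to the cursor position of its parent (or to the start of the
// input for the top-level API).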
func (i *API) Reset() {
if i.stackLevel == 0 {
i.stackFrame.column = 0
i.stackFrame.line = 0
i.stackFrame.offset = 0
} else {
parent := i.stackFrames[i.stackLevel-1]
i.stackFrame.column = parent.column
i.stackFrame.line = parent.line
i.stackFrame.offset = parent.offset
}
i.stackFrame.bytesEnd = i.stackFrame.bytesStart
i.stackFrame.tokenEnd = i.stackFrame.tokenStart
i.stackFrame.err = nil
}
// FlushInput flushes input data from the read.Buffer up to the current
// read offset of the parser.
//
// Note:
// When writing your own Handler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when possible.
func (i *API) FlushInput() bool {
if i.stackFrame.offset > 0 {
i.reader.Flush(i.stackFrame.offset)
i.stackFrame.offset = 0
return true
}
return false
}
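// String returns the accepted bytes for the active stack level as a string.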
func (i *API) String() string {
bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
return string(bytes)
}
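// Runes returns the accepted bytes for the active stack level as a slice of runes.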
func (i *API) Runes() []rune {
bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
return []rune(string(bytes))
}
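// Rune returns the rune that starts at the provided byte offset within the
// accepted bytes for the active stack level.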
func (i *API) Rune(offset int) rune {
r, _ := utf8.DecodeRune(i.bytes[i.stackFrame.bytesStart+offset:])
return r
}
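// ClearBytes removes all accepted bytes for the active stack level.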
func (i *API) ClearBytes() {
i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}
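// SetBytes replaces the accepted bytes for the active stack level with the
// provided bytes.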
func (i *API) SetBytes(bytes ...byte) {
i.ClearBytes()
i.AddBytes(bytes...)
}
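// AddBytes appends the provided bytes to the accepted bytes for the active
// stack level.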
func (i *API) AddBytes(bytes ...byte) {
// Grow the bytes capacity when needed.
newBytesEnd := i.stackFrame.bytesEnd + len(bytes)
if cap(i.bytes) < newBytesEnd {
newBytes := make([]byte, newBytesEnd*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
copy(i.bytes[i.stackFrame.bytesEnd:], bytes)
i.stackFrame.bytesEnd = newBytesEnd
}
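// ClearRunes removes all accepted runes for the active stack level.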
func (i *API) ClearRunes() {
i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}
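// SetRunes replaces the accepted runes for the active stack level with the
// provided runes.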
func (i *API) SetRunes(runes ...rune) {
i.ClearRunes()
i.AddRunes(runes...)
}
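// AddRunes appends the provided runes to the accepted runes for the active
// stack level.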
func (i *API) AddRunes(runes ...rune) {
// Grow the bytes capacity when needed.
runesAsString := string(runes)
newBytesEnd := i.stackFrame.bytesEnd + len(runesAsString)
if cap(i.bytes) < newBytesEnd {
newBytes := make([]byte, newBytesEnd*2)
copy(newBytes, i.bytes)
i.bytes = newBytes
}
copy(i.bytes[i.stackFrame.bytesEnd:], runesAsString)
i.stackFrame.bytesEnd = newBytesEnd
}
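// AddString appends the bytes of the provided string to the accepted bytes
// for the active stack level.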
func (i *API) AddString(s string) {
i.AddBytes([]byte(s)...)
}
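// SetString replaces the accepted bytes for the active stack level with the
// bytes of the provided string.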
func (i *API) SetString(s string) {
i.SetBytes([]byte(s)...)
}
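// Cursor returns a human-readable description of the current read cursor
// position, e.g. "start of file" or "line 10, column 42".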
func (i *API) Cursor() string {
if i.stackFrame.line == 0 && i.stackFrame.column == 0 {
return "start of file"
}
return fmt.Sprintf("line %d, column %d", i.stackFrame.line+1, i.stackFrame.column+1)
}
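// Tokens returns the tokens that were accepted for the active stack level.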
func (i *API) Tokens() []Token {
return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]
}
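// Token returns the accepted token at the provided offset for the active
// stack level.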
func (i *API) Token(offset int) Token {
return i.tokens[i.stackFrame.tokenStart+offset]
}
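// TokenValue returns the value of the accepted token at the provided offset
// for the active stack level.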
func (i *API) TokenValue(offset int) interface{} {
return i.tokens[i.stackFrame.tokenStart+offset].Value
}
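// ClearTokens removes all accepted tokens for the active stack level.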
func (i *API) ClearTokens() {
i.stackFrame.tokenEnd = i.stackFrame.tokenStart
}
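// SetTokens replaces the accepted tokens for the active stack level with the
// provided tokens.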
func (i *API) SetTokens(tokens ...Token) {
i.ClearTokens()
i.AddTokens(tokens...)
}
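// AddTokens appends the provided tokens to the accepted tokens for the
// active stack level.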
func (i *API) AddTokens(tokens ...Token) {
// Grow the tokens capacity when needed.
newTokenEnd := i.stackFrame.tokenEnd + len(tokens)
if cap(i.tokens) < newTokenEnd {
newTokens := make([]Token, newTokenEnd*2)
copy(newTokens, i.tokens)
i.tokens = newTokens
}
for offset, t := range tokens {
i.tokens[i.stackFrame.tokenEnd+offset] = t
}
i.stackFrame.tokenEnd = newTokenEnd
}