package tokenize

import (
	"git.makaay.nl/mauricem/go-parsekit/read"
)

// API holds the internal state of a tokenizer run and provides an API that
// tokenize.Handler functions can use to:
//
// • read and accept runes from the input (NextRune, Accept)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • flush already read input data when not needed anymore (FlushInput)
//
// • retrieve the tokenizer Result struct (Result) to read or modify the results
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Result().Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the Result struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking an API struct by
// calling method Fork(). This will return a forked child API, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
//
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, however, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor position back at the correct position, which
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
	reader      *read.Buffer // the input data reader
	lastRune    rune         // the rune as retrieved by the last NextRune() call
	lastRuneErr error        // the error for the last NextRune() call
	runeRead    bool         // whether or not a rune was read using NextRune()
	runes       []rune       // the rune stack
	tokens      []Token      // the token stack
	// NOTE(review): runeStart/runeEnd/tokenStart/tokenEnd are never read or
	// written in this file — possibly work in progress; confirm before use.
	runeStart  int
	runeEnd    int
	tokenStart int
	tokenEnd   int
	stackLevel int       // the stack level for this API object
	state      *apiState // shared API state data
}

// apiState holds the data that is shared between a parent API and all of its
// forked children: a stack of Result structs (one per fork level) and the
// index of the currently active (top) fork. Methods use state.top to detect
// calls on invalidated forks and panic in that case.
type apiState struct {
	stack []Result // the stack, used for forking / merging the API.
	top   int      // the index of the current top item in the stack
}

// initialAPIstackDepth determines the initial stack depth for the API.
// When a parser requires a higher stack depth, then this is no problem.
// The API will automatically scale the stack when forking beyond this
// default number of stack levels.
const initialAPIstackDepth = 10

// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) API {
	// The shared state starts with a single Result on the stack: the
	// zero-value Result for stack level 0 (the top-level, unforked API).
	stack := make([]Result, 1, initialAPIstackDepth)
	state := apiState{
		stack: stack,
	}
	return API{
		// NOTE(review): initialAPIstackDepth is a stack-depth constant, yet it
		// is used here as the *length* of the rune/token slices (10 zero-valued
		// elements, not just capacity). These API-level runes/tokens fields are
		// not used anywhere in this file — confirm intent before changing.
		runes:  make([]rune, initialAPIstackDepth),
		tokens: make([]Token, initialAPIstackDepth),
		reader: read.New(input),
		state:  &state,
	}
}

// NextRune returns the rune at the current read offset.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// After reading a rune it must be Accept()-ed to move the read cursor forward
// to the next rune. Doing so is mandatory. When doing a second call to NextRune()
// without explicitly accepting, this method will panic. You can see this as a
// built-in unit test, enforcing correct serialization of API method calls.
func (i *API) NextRune() (rune, error) {
	// Guard: a fork is only usable while it is the top of the stack. When a
	// parent was read, forked or merged after this fork was created, this
	// fork's stackLevel lies above state.top and using it is a bug.
	if i.stackLevel > i.state.top {
		callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
			"using a non-active API fork (a parent was read, forked or merged, "+
			"causing this fork to be invalidated)")
	}
	result := &(i.state.stack[i.stackLevel])
	// Guard: enforce the NextRune() / Accept() call protocol — two reads in a
	// row without an Accept() in between is a programming error.
	if i.runeRead {
		callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
			"without a prior call to Accept()")
	}
	// Read the rune at this fork's current offset and remember both the rune
	// and the error, so Accept() can validate and apply them later.
	readRune, err := i.reader.RuneAt(result.offset)
	i.lastRune = readRune
	i.lastRuneErr = err
	i.runeRead = true
	// Reading from this level invalidates any forks created below it.
	i.DisposeChilds()
	return readRune, err
}

// Accept the last rune as read by NextRune() into the Result runes and move
// the cursor forward.
//
// It is not allowed to call Accept() when the previous call to NextRune()
// returned an error. Calling Accept() in such case will result in a panic.
func (i *API) Accept() {
	// Guard: reject calls on a fork that was invalidated by activity on a
	// parent (see NextRune for the same check).
	if i.stackLevel > i.state.top {
		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+
			"using a non-active API fork (a parent was read, forked or merged, "+
			"causing this fork to be invalidated)")
	}
	result := &(i.state.stack[i.stackLevel])
	// Guard: Accept() is only valid directly after a successful NextRune().
	if !i.runeRead {
		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} without first calling NextRune()")
	} else if i.lastRuneErr != nil {
		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, but the prior call to NextRune() failed")
	}
	// Store the rune in this fork's results and advance cursor + offset.
	result.runes = append(result.runes, i.lastRune)
	result.cursor.moveByRune(i.lastRune)
	result.offset++
	// Re-arm the protocol: the next NextRune() call is allowed again.
	i.runeRead = false
}

// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but for the rest this is a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
func (i *API) Fork() API {
	// Guard: only the currently active fork may be forked further.
	if i.stackLevel > i.state.top {
		callerPanic("Fork", "tokenize.API.{name}(): {name}() called at {caller} "+
			"using a non-active API fork (a parent was read, forked or merged, "+
			"causing this fork to be invalidated)")
	}
	// Forking from this level invalidates any existing deeper forks.
	i.DisposeChilds()
	result := &(i.state.stack[i.stackLevel])

	// Grow the stack storage when needed.
	newStackSize := i.stackLevel + 2
	if cap(i.state.stack) < newStackSize {
		newStack := make([]Result, newStackSize, newStackSize+initialAPIstackDepth)
		copy(newStack, i.state.stack)
		i.state.stack = newStack
	}
	// Truncate to this level so the append below lands at stackLevel+1.
	i.state.stack = i.state.stack[0 : i.stackLevel+1]

	// Create the new fork.
	child := API{
		state:      i.state,
		stackLevel: i.stackLevel + 1,
		reader:     i.reader,
	}
	// The child starts with empty results, but inherits the parent's read
	// cursor and offset, so it continues reading where the parent left off.
	childResult := Result{
		cursor: result.cursor,
		offset: result.offset,
	}
	i.state.stack = append(i.state.stack, childResult)
	//i.state.stack[i.stackLevel+1] = childResult

	// Invalidate parent's last read rune.
	i.runeRead = false

	// The child becomes the active fork.
	i.state.top = child.stackLevel
	return child
}

// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so it can immediately
// be reused for performing another match. This means that all Result data are
// cleared, but the read cursor position is kept at its current position.
// This allows a child to feed results in chunks to its parent.
func (i *API) Merge() {
	// Guard: the top-level API has no parent to merge into.
	if i.stackLevel == 0 {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} on the top-level API")
	}
	// Guard: reject calls on an invalidated fork (see NextRune).
	if i.stackLevel > i.state.top {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
			"using a non-active API fork (a parent was read, forked or merged, "+
			"causing this fork to be invalidated)")
	}
	result := &(i.state.stack[i.stackLevel])
	parentResult := &(i.state.stack[i.stackLevel-1])

	// NOTE(review): manual pre-growing of the parent's rune/token storage was
	// disabled; plain append below handles growth. Kept for reference.
	// // Grow parent rune storage when needed.
	// newRuneSize := len(parentResult.runes) + len(result.runes)
	// if cap(parentResult.runes) < newRuneSize {
	// 	newRunes := make([]rune, len(parentResult.runes), 2*newRuneSize)
	// 	copy(newRunes, parentResult.runes)
	// 	parentResult.runes = newRunes
	// 	//fmt.Println("Beefed up runes", i.stackLevel-1, newRuneSize*2)
	// }
	// // Grow parent token storage when needed.
	// newTokenSize := len(parentResult.tokens) + len(result.tokens)
	// if cap(parentResult.tokens) < newTokenSize {
	// 	newTokens := make([]Token, len(parentResult.tokens), 2*newTokenSize)
	// 	copy(newTokens, parentResult.tokens)
	// 	parentResult.tokens = newTokens
	// 	//fmt.Println("Beefed up tokens", i.stackLevel-1, newTokenSize*2)
	// }

	// Move the child's results and read position up into the parent.
	parentResult.runes = append(parentResult.runes, result.runes...)
	parentResult.tokens = append(parentResult.tokens, result.tokens...)
	parentResult.offset = result.offset
	parentResult.cursor = result.cursor

	// Drop any deeper forks and clear this child's results (cursor/offset are
	// kept), so the child can be reused to feed more chunks to the parent.
	i.DisposeChilds()
	i.Reset()
}

// DisposeChilds invalidates all forks that were created below this API level,
// by truncating the shared stack to this level and making this level the
// active top. Calls on a disposed fork will panic.
func (i *API) DisposeChilds() {
	i.state.stack = i.state.stack[:i.stackLevel+1]
	i.state.top = i.stackLevel
}

// Reset clears this API level's result data (runes, tokens, error) and the
// pending-rune flag, while keeping the read cursor and offset where they are.
func (i *API) Reset() {
	result := &(i.state.stack[i.stackLevel])
	i.runeRead = false
	// Truncate in place so the backing arrays are reused on the next match.
	result.runes = result.runes[:0]
	result.tokens = result.tokens[:0]
	result.err = nil
}

// FlushInput flushes processed input data from the read.Buffer.
// In this context 'processed' means all runes that were read using NextRune()
// and that were added to the results using Accept().
//
// Note:
// When writing your own TokenHandler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when needed.
//
// NOTE(review): this method uses a value receiver while most methods on API
// use a pointer receiver. It still works because all mutation goes through
// the shared *apiState and *read.Buffer pointers, but the inconsistency is
// worth confirming/unifying.
func (i API) FlushInput() bool {
	result := &(i.state.stack[i.stackLevel])
	if result.offset > 0 {
		i.reader.Flush(result.offset)
		// The buffer was flushed up to the current offset, so reading now
		// restarts at the beginning of the remaining buffered input.
		result.offset = 0
		return true
	}
	return false
}

// Result returns the Result struct from the API. The returned struct
// can be used to retrieve and to modify result data.
func (i API) Result() *Result {
	return &(i.state.stack[i.stackLevel])
}