package parsekit

import (
	"fmt"
	"io"

	"git.makaay.nl/mauricem/go-parsekit/reader"
)

// TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from
// the reader and to report back tokenizing results. For easy lookahead support,
// a forking strategy is provided.
//
// BASIC OPERATION:
//
// To retrieve the next rune from the TokenAPI, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the results of the TokenAPI and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved through the TokenHandlerResult that is returned by the method
// Result().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the TokenHandlerResult struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// TokenAPI must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking a TokenAPI struct by
// calling method Fork(). This will return a forked child TokenAPI, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
// // When the lookahead was unsuccessful, then the forked child TokenAPI can // simply be discarded. The parent TokenAPI was never modified, so it can // safely be used as if the lookahead never happened. // // Opinionized note: // Many tokenizers/parsers take a different approach on lookaheads by using // peeks and by moving the read cursor position back and forth, or by putting // read input back on the input stream. That often leads to code that is // efficient, however, in my opinion, not very intuitive to read. It can also // be tedious to get the cursor position back at the correct position, which // can lead to hard to track bugs. I much prefer this forking method, since // no bookkeeping has to be implemented when implementing a parser. type TokenAPI struct { reader *reader.Reader root *TokenAPI // the root TokenAPI parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position) } // NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader. func NewTokenAPI(r io.Reader) *TokenAPI { input := &TokenAPI{ reader: reader.New(r), result: newTokenHandlerResult(), } input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?". return input } // NextRune returns the rune at the current read offset. // // When an invalid UTF8 rune is encountered on the input, it is replaced with // the utf.RuneError rune. It's up to the caller to handle this as an error // when needed. // // After reading a rune it must be Accept()-ed to move the read cursor forward // to the next rune. Doing so is mandatory. When doing a second call to NextRune() // without explicitly accepting, this method will panic. 
func (i *TokenAPI) NextRune() (rune, error) { if i.result.lastRune != nil { callerPanic(1, "parsekit.TokenAPI.NextRune(): NextRune() called at {caller} "+ "without a prior call to Accept()") } i.detachChilds() readRune, err := i.reader.RuneAt(i.result.offset) i.result.lastRune = &runeInfo{r: readRune, err: err} return readRune, err } // Accept the last rune as read by NextRune() into the result buffer and move // the cursor forward. // // It is not allowed to call Accept() when the previous call to NextRune() // returned an error. Calling Accept() in such case will result in a panic. func (i *TokenAPI) Accept() { if i.result.lastRune == nil { callerPanic(1, "parsekit.TokenAPI.Accept(): Accept() called at {caller} without first calling NextRune()") } else if i.result.lastRune.err != nil { callerPanic(1, "parsekit.TokenAPI.Accept(): Accept() called at {caller}, but the prior call to NextRune() failed") } i.result.runes = append(i.result.runes, i.result.lastRune.r) i.result.cursor.Move(fmt.Sprintf("%c", i.result.lastRune.r)) i.result.offset++ i.result.lastRune = nil } // Fork forks off a child of the TokenAPI struct. It will reuse the same Reader and // read cursor position, but for the rest this is a fresh TokenAPI. // // By forking a TokenAPI, you can freely work with the forked child, without // affecting the parent TokenAPI. This is for example useful when you must perform // some form of lookahead. // // When processing of the TokenHandler was successful and you want to add the results // to the parent TokenAPI, you can call TokenAPIold.Merge() on the forked // child. This will add the runes in the result buffer to the result buffer of // the parent. It also updates the read cursor position of the parent to that // of the child. // // When processing failed, or you don't want to use the results as produced by that // lookahead, the forked child can simply be discarded. You can continue to work // with the parent TokenAPI as if nothing ever happened. 
func (i *TokenAPI) Fork() *TokenAPI { // Cleanup current forking / reading state. i.detachChilds() i.result.lastRune = nil // Create the new fork. child := &TokenAPI{ reader: i.reader, root: i.root, parent: i, } child.result = newTokenHandlerResult() i.syncCursorTo(child) i.child = child return child } // Merge appends the TokenHandlerResult of a forked child TokenAPI to the TokenHandlerResult // of its parent. The read cursor position of the parent is also updated to // that of the forked child. // // After the merge operation, the child is reset so it can immediately be // reused for performing another match. This means that all TokenHandlerResult data are // cleared, but the read cursor position is kept at its current position. // This allows a child to feed results in chunks to its parent. func (i *TokenAPI) Merge() { if i.parent == nil { callerPanic(1, "parsekit.TokenAPI.Merge(): Merge() called at {caller} on a non-forked TokenAPI") } i.addResultsToParent() i.syncCursorTo(i.parent) i.clearResults() i.detachChilds() } func (i *TokenAPI) addResultsToParent() { i.parent.result.runes = append(i.parent.result.runes, i.result.runes...) i.parent.result.tokens = append(i.parent.result.tokens, i.result.tokens...) } func (i *TokenAPI) syncCursorTo(to *TokenAPI) { to.result.offset = i.result.offset *to.result.cursor = *i.result.cursor } func (i *TokenAPI) clearResults() { i.result.lastRune = nil i.result.runes = []rune{} i.result.tokens = []*Token{} i.result.err = nil } func (i *TokenAPI) detachChilds() { if i.child != nil { i.child.detachChildsRecurse() i.child = nil } } func (i *TokenAPI) detachChildsRecurse() { if i.child != nil { i.child.detachChildsRecurse() } i.child = nil i.parent = nil } // Result returns the TokenHandlerResult data for the TokenAPI. The returned struct // can be used to retrieve and to modify result data. func (i *TokenAPI) Result() *TokenHandlerResult { return i.result }