package tokenize

import (
	"git.makaay.nl/mauricem/go-parsekit/read"
)

// API holds the internal state of a tokenizer run. A tokenizer run uses
// tokenize.Handler functions to move the tokenizer forward through the
// input and to provide tokenizer output.
//
// The methods as provided by the API are used by tokenize.Handler functions to:
//
// • access and process runes / bytes from the input data
//
// • flush processed input data that are not required anymore (FlushInput)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • emit tokens and/or bytes to be used by a parser
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the Result struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported, is by forking an API struct by
// calling method Fork(). This will return a forked child API, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This is not mandatory.
// Garbage collection will take care of this automatically.
// The parent API was never modified, so it can safely be used after disposal
// as if the lookahead never happened.
//
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, however, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor position back at the correct position, which
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type API struct {
	stackFrames []stackFrame  // the stack frames, containing stack level-specific data
	stackLevel  int           // the current stack level
	stackFrame  *stackFrame   // the current stack frame (points into stackFrames[stackLevel])
	reader      *read.Buffer  // the buffered input reader
	Input       Input         // provides input-related functionality
	Byte        InputByteMode // access to a set of byte-based input methods
	Rune        InputRuneMode // access to a set of rune-based input methods
	Output      Output        // provides output-related functionality
	outputTokens []Token      // accepted tokens
	outputBytes  []byte       // accepted bytes
	snapshot     [9]int       // storage for the MakeSnapshot() / RestoreSnapshot() feature
}

// stackFrame holds the state for a single fork level of the API.
// The line/column and offsetLocal fields are relative to the position
// at which the frame was created (see Fork), not absolute positions.
type stackFrame struct {
	offsetLocal int // the read offset, relative to the start of this stack frame
	offset      int // the read offset, relative to the start of the reader buffer
	column      int // the column at which the cursor is (0-indexed, relative to the start of the stack frame)
	line        int // the line at which the cursor is (0-indexed, relative to the start of the stack frame)
	bytesStart  int // the starting point in the API.outputBytes slice for runes produced by this stack level
	bytesEnd    int // the end point in the API.outputBytes slice for runes produced by this stack level
	tokenStart  int // the starting point in the API.outputTokens slice for tokens produced by this stack level
	tokenEnd    int // the end point in the API.outputTokens slice for tokens produced by this stack level

	// TODO
	err error // can be used by a Handler to report a specific issue with the input
}

// Initial sizes for the pre-allocated state stores; these grow on demand.
const initialStackDepth = 64
const initialTokenStoreLength = 64
const initialByteStoreLength = 1024

// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) *API { reader := read.New(input) tokenAPI := &API{ stackFrames: make([]stackFrame, initialStackDepth), outputBytes: make([]byte, initialByteStoreLength), outputTokens: make([]Token, initialTokenStoreLength), reader: reader, } tokenAPI.Input = Input{api: tokenAPI, reader: reader} tokenAPI.Byte = InputByteMode{api: tokenAPI, reader: reader} tokenAPI.Rune = InputRuneMode{api: tokenAPI, reader: reader} tokenAPI.Output = Output{api: tokenAPI} tokenAPI.stackFrame = &tokenAPI.stackFrames[0] tokenAPI.snapshot[0] = -1 return tokenAPI } // Fork forks off a child of the API struct. It will reuse the same // read buffer and cursor position, but for the rest this can be considered // a fresh API. // // By forking an API, you can freely work with the forked child, without // affecting the parent API. This is for example useful when you must perform // some form of lookahead. // // When processing of the Handler was successful and you want to add the results // to the parent API, you can call Merge() on the forked child. // This will add the results to the results of the parent (runes, tokens). // It also updates the read cursor position of the parent to that of the child. // // When the lookahead was unsuccessful, then the forked child API can // disposed by calling Dispose() on the forked child. This is not mandatory. // Garbage collection will take care of this automatically. // The parent API was never modified, so it can safely be used after disposal // as if the lookahead never happened. func (tokenAPI *API) Fork() int { tokenAPI.stackLevel++ newStackLevel := tokenAPI.stackLevel // Grow the stack frames capacity when needed. 
frames := tokenAPI.stackFrames if cap(frames) < (newStackLevel + 1) { newFrames := make([]stackFrame, cap(frames)*2) copy(newFrames, frames) tokenAPI.stackFrames = newFrames } parent := tokenAPI.stackFrame tokenAPI.stackFrames[newStackLevel] = stackFrame{ offset: parent.offset, bytesStart: parent.bytesEnd, bytesEnd: parent.bytesEnd, tokenStart: parent.tokenEnd, tokenEnd: parent.tokenEnd, } tokenAPI.stackFrame = &tokenAPI.stackFrames[newStackLevel] return newStackLevel } // Merge appends the results of a forked child API (runes, tokens) to the // results of its parent. The read cursor of the parent is also updated // to that of the forked child. // // After the merge operation, the child results are reset so it can immediately // be reused for performing another match. This means that all Result data are // cleared, but the read cursor position is kept at its current position. // This allows a child to feed results in chunks to its parent. // // Once the child is no longer needed, it can be disposed of by using the // method Dispose(), which will return the tokenizer to the parent. func (tokenAPI *API) Merge(stackLevel int) { tokenAPI.checkStackLevelForMethod("Merge", stackLevel) parent := &tokenAPI.stackFrames[stackLevel-1] f := tokenAPI.stackFrame // The end of the parent slice aligns with the start of the child slice. // Because of this, to merge the parent slice can simply be expanded // to include the child slice. // parent : |----------| // child: |------| // After merge operation: // parent: |-----------------| // child: |---> continue reading from here parent.bytesEnd = f.bytesEnd f.bytesStart = f.bytesEnd // The same logic applies to tokens. parent.tokenEnd = f.tokenEnd f.tokenStart = f.tokenEnd // Update the parent read offset. parent.offsetLocal = parent.offsetLocal + (f.offset - parent.offset) parent.offset = f.offset // Update the parent cursor position. 
if f.line > parent.line { parent.line += f.line parent.column = f.column } else { parent.column += f.column } f.line = 0 f.column = 0 f.err = nil } func (tokenAPI *API) Dispose(stackLevel int) { tokenAPI.checkStackLevelForMethod("Dispose", stackLevel) tokenAPI.stackLevel = stackLevel - 1 tokenAPI.stackFrame = &tokenAPI.stackFrames[stackLevel-1] } func (tokenAPI *API) checkStackLevelForMethod(name string, stackLevel int) { if stackLevel == 0 { callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+ "on the top-level API stack level 0") } if stackLevel != tokenAPI.stackLevel { callerPanic(name, "tokenize.API.{name}(): {name}() called at {caller} "+ "on API stack level %d, but the current stack level is %d "+ "(forgot to Dispose() a forked child?)", stackLevel, tokenAPI.stackLevel) } } type Snapshot [9]int func (tokenAPI *API) MakeSnapshot() Snapshot { f := tokenAPI.stackFrame return Snapshot{ tokenAPI.stackLevel, f.bytesStart, f.bytesEnd, f.tokenStart, f.tokenEnd, f.offset, f.offsetLocal, f.line, f.column, } } func (tokenAPI *API) RestoreSnapshot(snap Snapshot) { f := tokenAPI.stackFrame if snap[0] != tokenAPI.stackLevel { callerPanic("RestoreSnapshot", "tokenize.API.{name}(): {name}() called at {caller} "+ "on API stack level %d, but the provided snapshot was created for stack level %d", tokenAPI.stackLevel, snap[0]) } f.bytesStart = snap[1] f.bytesEnd = snap[2] f.tokenStart = snap[3] f.tokenEnd = snap[4] f.offset = snap[5] f.offsetLocal = snap[6] f.line = snap[7] f.column = snap[8] }