package tokenize

import (
	"fmt"
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/read"
)

// API holds the internal state of a tokenizer run. A run uses tokenize.Handler
// functions to move the tokenizer forward through the input and to provide
// tokenizer output. The API is used by these tokenize.Handler functions to:
//
// • access and process runes / bytes from the input data
//
// • flush processed input data that are no longer required (FlushInput)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • emit tokens and/or bytes to be used by a parser
//
// BASIC OPERATION:
//
// To retrieve the next rune from the input, call the PeekRune() method.
//
// When the rune is to be accepted as input, call the method AcceptRune(). The
// rune is then added to the results of the API and the read cursor is moved
// forward.
//
// By invoking PeekRune() + AcceptRune() multiple times, the results can be
// extended with as many runes as needed. Runes collected this way can later
// on be retrieved using the methods Runes() and String(). Bytes can be
// processed in the same manner, using PeekByte() and AcceptByte().
//
// Next to adding runes and bytes to the results, it is also possible to
// modify the stored results or to add lexical Tokens to them. For all things
// concerning results, take a look at the result-related methods below, such
// as SetRunes(), AddString(), Tokens() and AddTokens().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// The way in which this is supported is by forking an API struct by
// calling the method Fork(). This will return a forked child API, with
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This returns the
// tokenizer to the parent stack level. The parent API itself was never
// modified, so after disposal it can safely be used as if the lookahead
// never happened.
//
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, but in my opinion not very intuitive to read. It can also be
// tedious to get the cursor back at the correct position, which can lead to
// hard to track bugs. I much prefer this forking method, since no bookkeeping
// has to be implemented when implementing a parser.
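//
// EXAMPLE (a minimal sketch, not a definitive recipe):
//
// The handler below is hypothetical and not part of this package; it only
// illustrates the peek + accept flow described above, assuming the
// tokenize.Handler signature func(t *API) bool:
//
//   var handleDigits Handler = func(t *API) bool {
//       accepted := false
//       for {
//           b, err := t.PeekByte(0)
//           if err != nil || b < '0' || b > '9' {
//               break
//           }
//           t.AcceptByte(b)
//           accepted = true
//       }
//       return accepted
//   }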
type API struct {
	reader      *read.Buffer // the input data reader
	bytes       []byte       // accepted bytes
	tokens      []Token      // accepted tokens
	stackFrames []stackFrame // the stack frames, containing stack level-specific data
	stackLevel  int          // the current stack level
	stackFrame  *stackFrame  // the current stack frame
}

type stackFrame struct {
	offset     int // the read offset (relative to the start of the reader buffer) for this stack frame
	column     int // the column at which the cursor is (0-indexed)
	line       int // the line at which the cursor is (0-indexed)
	bytesStart int // the starting point in the API.bytes slice for runes produced by this stack level
	bytesEnd   int // the end point in the API.bytes slice for runes produced by this stack level
	tokenStart int // the starting point in the API.tokens slice for tokens produced by this stack level
	tokenEnd   int // the end point in the API.tokens slice for tokens produced by this stack level

	// TODO
	err error // can be used by a Handler to report a specific issue with the input
}

const initialStackDepth = 64
const initialTokenStoreLength = 64
const initialByteStoreLength = 1024

// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) *API {
	api := &API{
		reader:      read.New(input),
		bytes:       make([]byte, initialByteStoreLength),
		tokens:      make([]Token, initialTokenStoreLength),
		stackFrames: make([]stackFrame, initialStackDepth),
	}
	api.stackFrame = &api.stackFrames[0]

	return api
}

// PeekByte returns the byte at the provided byte offset.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (i *API) PeekByte(offset int) (byte, error) {
	return i.reader.ByteAt(i.stackFrame.offset + offset)
}

// SkipByte is used to skip over a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. It is of no interest.
// I will now continue reading after this byte."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The byte is not added to
// the results.
func (i *API) SkipByte(b byte) {
	i.stackFrame.moveCursorByByte(b)
	i.stackFrame.offset++
}

// SkipBytes is used to skip over one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. They are of no interest.
// I will now continue reading after these bytes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The bytes are not added to
// the results.
func (i *API) SkipBytes(bytes ...byte) {
	for _, b := range bytes {
		i.stackFrame.moveCursorByByte(b)
		i.stackFrame.offset++
	}
}

// AcceptByte is used to accept a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this byte."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the byte to the tokenizer
// results.
func (i *API) AcceptByte(b byte) {
	curBytesEnd := i.stackFrame.bytesEnd
	maxRequiredBytes := curBytesEnd + 1

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < maxRequiredBytes {
		newBytes := make([]byte, maxRequiredBytes*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	i.bytes[curBytesEnd] = b
	i.stackFrame.moveCursorByByte(b)
	i.stackFrame.bytesEnd++
	i.stackFrame.offset++
}
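
// A short sketch of how the Skip* and Accept* byte methods are typically
// combined inside a Handler (illustrative only, not part of this package):
// a space is skipped (the cursor moves, but nothing is stored), while any
// other byte is accepted into the results.
//
//   if b, err := t.PeekByte(0); err == nil {
//       if b == ' ' {
//           t.SkipByte(b)   // move the cursor, do not store the byte
//       } else {
//           t.AcceptByte(b) // move the cursor and store the byte
//       }
//   }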

// AcceptBytes is used to accept one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these bytes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the bytes to the tokenizer
// results.
func (i *API) AcceptBytes(bytes ...byte) {
	curBytesEnd := i.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + len(bytes)

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[curBytesEnd:], bytes)
	for _, b := range bytes {
		i.stackFrame.moveCursorByByte(b)
		i.stackFrame.offset++
	}
	i.stackFrame.bytesEnd = newBytesEnd
}

// PeekRune returns the UTF8 rune at the provided byte offset, including its byte width.
//
// The byte width is useful to know what byte offset you'll have to use to peek
// the next byte or rune. Some UTF8 runes take up 4 bytes of data, so when the
// first rune starts at offset = 0, the second rune might start at offset = 4.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf8.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (i *API) PeekRune(offset int) (rune, int, error) {
	return i.reader.RuneAt(i.stackFrame.offset + offset)
}

// SkipRune is used to skip over a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. It is of no interest.
// I will now continue reading after this rune."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The rune is not added to
// the results.
func (i *API) SkipRune(r rune) {
	i.stackFrame.moveCursorByRune(r)
	i.stackFrame.offset += utf8.RuneLen(r)
}

// SkipRunes is used to skip over one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. They are of no interest.
// I will now continue reading after these runes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The runes are not added to
// the results.
func (i *API) SkipRunes(runes ...rune) {
	for _, r := range runes {
		i.stackFrame.moveCursorByRune(r)
		i.stackFrame.offset += utf8.RuneLen(r)
	}
}

// AcceptRune is used to accept a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this rune."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the rune to the tokenizer
// results.
func (i *API) AcceptRune(r rune) {
	curBytesEnd := i.stackFrame.bytesEnd
	maxRequiredBytes := curBytesEnd + utf8.UTFMax

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < maxRequiredBytes {
		newBytes := make([]byte, maxRequiredBytes*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	i.stackFrame.moveCursorByRune(r)
	w := utf8.EncodeRune(i.bytes[curBytesEnd:], r)
	i.stackFrame.bytesEnd += w
	i.stackFrame.offset += w
}
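
// A small sketch of reading runes with PeekRune + AcceptRune (illustrative
// only; unicode.IsLetter from the standard library is used as an arbitrary
// example condition). Since AcceptRune() advances the read cursor by the byte
// width of the rune, peeking at offset 0 always looks at the next unread
// rune; the width returned by PeekRune() is only needed when peeking further
// ahead without accepting.
//
//   for {
//       r, _, err := t.PeekRune(0)
//       if err != nil || !unicode.IsLetter(r) {
//           break
//       }
//       t.AcceptRune(r)
//   }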

// AcceptRunes is used to accept one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these runes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the runes to the tokenizer
// results.
func (i *API) AcceptRunes(runes ...rune) {
	runesAsString := string(runes)
	byteLen := len(runesAsString)
	curBytesEnd := i.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + byteLen

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	for _, r := range runes {
		i.stackFrame.moveCursorByRune(r)
	}
	copy(i.bytes[curBytesEnd:], runesAsString)

	i.stackFrame.bytesEnd = newBytesEnd
	i.stackFrame.offset += byteLen
}

// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but for the rest this can be considered
// a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, then the forked child API can be
// disposed by calling Dispose() on the forked child. This returns the
// tokenizer to the parent stack level. The parent API itself was never
// modified, so after disposal it can safely be used as if the lookahead
// never happened.
func (i *API) Fork() int {
	newStackLevel := i.stackLevel + 1
	newStackSize := newStackLevel + 1

	// Grow the stack frames capacity when needed.
	if cap(i.stackFrames) < newStackSize {
		newFrames := make([]stackFrame, newStackSize*2)
		copy(newFrames, i.stackFrames)
		i.stackFrames = newFrames
	}

	i.stackLevel++

	// This can be written in a shorter way, but this turned out to
	// be the best way performance-wise.
	parent := i.stackFrame
	child := &i.stackFrames[i.stackLevel]
	child.offset = parent.offset
	child.column = parent.column
	child.line = parent.line
	child.bytesStart = parent.bytesEnd
	child.bytesEnd = parent.bytesEnd
	child.tokenStart = parent.tokenEnd
	child.tokenEnd = parent.tokenEnd
	i.stackFrame = child

	return i.stackLevel
}
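
// A minimal sketch of the lookahead pattern described above (the helper
// tryParseNumber is hypothetical and not part of this package):
//
//   child := t.Fork()
//   if tryParseNumber(t) {
//       t.Merge(child)   // keep the child's results, move the parent's cursor
//   }
//   t.Dispose(child)     // always return to the parent stack level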

// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so it can immediately
// be reused for performing another match. This means that all result data are
// cleared, but the read cursor position is kept at its current position.
// This allows a child to feed results in chunks to its parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which will return the tokenizer to the parent.
func (i *API) Merge(stackLevel int) {
	if stackLevel == 0 {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on the top-level API stack level 0")
	}
	if stackLevel != i.stackLevel {
		callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the current stack level is %d "+
			"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
	}

	parent := &i.stackFrames[stackLevel-1]

	// The end of the parent slice aligns with the start of the child slice.
	// Because of this, to merge the parent slice can simply be expanded
	// to include the child slice.
	//
	// parent: |----------|
	// child:             |------|
	//
	// After merge operation:
	//
	// parent: |-----------------|
	// child:                    |---> continue reading from here
	parent.bytesEnd = i.stackFrame.bytesEnd
	i.stackFrame.bytesStart = i.stackFrame.bytesEnd

	// The same logic applies to tokens.
	parent.tokenEnd = i.stackFrame.tokenEnd
	i.stackFrame.tokenStart = i.stackFrame.tokenEnd

	parent.offset = i.stackFrame.offset
	parent.line = i.stackFrame.line
	parent.column = i.stackFrame.column

	i.stackFrame.err = nil
}

// Dispose drops the forked child API at the provided stack level and returns
// the tokenizer to its parent stack level. The results and read cursor of the
// parent are left untouched.
func (i *API) Dispose(stackLevel int) {
	if stackLevel == 0 {
		callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on the top-level API stack level 0")
	}
	if stackLevel != i.stackLevel {
		callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
			"on API stack level %d, but the current stack level is %d "+
			"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
	}

	i.stackLevel = stackLevel - 1
	i.stackFrame = &i.stackFrames[stackLevel-1]
}

// Reset clears the results of the current stack level and moves its read
// cursor back to the read position of its parent (or, for the top-level API,
// back to the start of the input).
func (i *API) Reset() {
	if i.stackLevel == 0 {
		i.stackFrame.column = 0
		i.stackFrame.line = 0
		i.stackFrame.offset = 0
	} else {
		parent := i.stackFrames[i.stackLevel-1]
		i.stackFrame.column = parent.column
		i.stackFrame.line = parent.line
		i.stackFrame.offset = parent.offset
	}
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
	i.stackFrame.tokenEnd = i.stackFrame.tokenStart
	i.stackFrame.err = nil
}

// FlushInput flushes input data from the read.Buffer up to the current
// read offset of the parser.
//
// Note:
// When writing your own Handler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when possible.
func (i *API) FlushInput() bool {
	if i.stackFrame.offset > 0 {
		i.reader.Flush(i.stackFrame.offset)
		i.stackFrame.offset = 0
		return true
	}
	return false
}

// String returns the accepted bytes of the current stack level as a string.
func (i *API) String() string {
	bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
	return string(bytes)
}

// Runes returns the accepted bytes of the current stack level as a slice of runes.
func (i *API) Runes() []rune {
	bytes := i.bytes[i.stackFrame.bytesStart:i.stackFrame.bytesEnd]
	return []rune(string(bytes))
}

// Rune returns the rune that starts at the provided byte offset within the
// accepted bytes of the current stack level.
func (i *API) Rune(offset int) rune {
	r, _ := utf8.DecodeRune(i.bytes[i.stackFrame.bytesStart+offset:])
	return r
}

// ClearBytes removes all accepted bytes from the current stack level.
func (i *API) ClearBytes() {
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}

// SetBytes replaces the accepted bytes of the current stack level with the
// provided bytes.
func (i *API) SetBytes(bytes ...byte) {
	i.ClearBytes()
	i.AddBytes(bytes...)
}

// AddBytes appends the provided bytes to the accepted bytes of the current
// stack level.
func (i *API) AddBytes(bytes ...byte) {
	// Grow the bytes capacity when needed.
	newBytesEnd := i.stackFrame.bytesEnd + len(bytes)
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[i.stackFrame.bytesEnd:], bytes)
	i.stackFrame.bytesEnd = newBytesEnd
}

// ClearRunes removes all accepted runes from the current stack level.
func (i *API) ClearRunes() {
	i.stackFrame.bytesEnd = i.stackFrame.bytesStart
}

// SetRunes replaces the accepted runes of the current stack level with the
// provided runes.
func (i *API) SetRunes(runes ...rune) {
	i.ClearRunes()
	i.AddRunes(runes...)
}

// AddRunes appends the provided runes to the accepted runes of the current
// stack level.
func (i *API) AddRunes(runes ...rune) {
	runesAsString := string(runes)
	newBytesEnd := i.stackFrame.bytesEnd + len(runesAsString)

	// Grow the bytes capacity when needed.
	if cap(i.bytes) < newBytesEnd {
		newBytes := make([]byte, newBytesEnd*2)
		copy(newBytes, i.bytes)
		i.bytes = newBytes
	}

	copy(i.bytes[i.stackFrame.bytesEnd:], runesAsString)
	i.stackFrame.bytesEnd = newBytesEnd
}
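
// A small sketch of rewriting results that were added earlier, e.g. to
// normalize input (illustrative only; strings.ToLower is from the standard
// library strings package):
//
//   t.AddString("Foo")
//   t.SetString(strings.ToLower(t.String())) // the results now hold "foo"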

// AddString appends the bytes of the provided string to the accepted bytes
// of the current stack level.
func (i *API) AddString(s string) {
	i.AddBytes([]byte(s)...)
}

// SetString replaces the accepted bytes of the current stack level with the
// bytes of the provided string.
func (i *API) SetString(s string) {
	i.ClearBytes()
	i.SetBytes([]byte(s)...)
}

// Cursor returns a human-readable description of the current read cursor
// position, e.g. "start of file" or "line 10, column 42".
func (i *API) Cursor() string {
	if i.stackFrame.line == 0 && i.stackFrame.column == 0 {
		return "start of file"
	}
	return fmt.Sprintf("line %d, column %d", i.stackFrame.line+1, i.stackFrame.column+1)
}

// Tokens returns the Tokens that were emitted at the current stack level.
func (i *API) Tokens() []Token {
	return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]
}

// Token returns the Token at the provided offset within the Tokens of the
// current stack level.
func (i *API) Token(offset int) Token {
	return i.tokens[i.stackFrame.tokenStart+offset]
}

// TokenValue returns the value of the Token at the provided offset within
// the Tokens of the current stack level.
func (i *API) TokenValue(offset int) interface{} {
	return i.tokens[i.stackFrame.tokenStart+offset].Value
}

// ClearTokens removes all Tokens from the current stack level.
func (i *API) ClearTokens() {
	i.stackFrame.tokenEnd = i.stackFrame.tokenStart
}

// SetTokens replaces the Tokens of the current stack level with the
// provided Tokens.
func (i *API) SetTokens(tokens ...Token) {
	i.ClearTokens()
	i.AddTokens(tokens...)
}

// AddTokens appends the provided Tokens to the Tokens of the current
// stack level.
func (i *API) AddTokens(tokens ...Token) {
	// Grow the tokens capacity when needed.
	newTokenEnd := i.stackFrame.tokenEnd + len(tokens)
	if cap(i.tokens) < newTokenEnd {
		newTokens := make([]Token, newTokenEnd*2)
		copy(newTokens, i.tokens)
		i.tokens = newTokens
	}

	for offset, t := range tokens {
		i.tokens[i.stackFrame.tokenEnd+offset] = t
	}
	i.stackFrame.tokenEnd = newTokenEnd
}
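
// A short sketch of emitting and inspecting Tokens at the current stack level
// (illustrative only; the value 42 is an arbitrary placeholder, not something
// defined by this package):
//
//   t.AddTokens(Token{Value: 42})
//   v := t.TokenValue(0) // 42
//   n := len(t.Tokens()) // 1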