package tokenize import ( "unicode/utf8" "git.makaay.nl/mauricem/go-parsekit/read" ) // InputRuneMode provides (UTF8) rune-driven input/output functionality for the tokenize API. type InputRuneMode struct { api *API reader *read.Buffer // the buffered input reader } // Peek returns the UTF8 rune at the provided byte offset, including its byte width. // // The byte width is useful to know what byte offset you'll have to use to peek // the next byte or rune. Some UTF8 runes take up 4 bytes of data, so when the // first rune starts at offset = 0, the second rune might start at offset = 4. // // When an invalid UTF8 rune is encountered on the input, it is replaced with // the utf.RuneError rune. It's up to the caller to handle this as an error // when needed. // // When an error occurs during reading the input, an error will be returned. // When an offset is requested that is beyond the length of the available input // data, then the error will be io.EOF. func (runeMode InputRuneMode) Peek(offset int) (rune, int, error) { a := runeMode.api return a.reader.RuneAt(a.pointers.offset + offset) } // Accept is used to accept a single rune that was read from the input. // This tells the tokenizer: "I've seen this rune. I want to make use of it // for the final output, so please remember it for me. I will now continue // reading after this rune." // // This will update the position of the cursor (which keeps track of what line // and column we are on intin the input data) and add the rune to the tokenizer // output. // // After the call, byte offset 0 for PeekByte() and PeekRune() will point at // the first byte after the accepted rune. func (runeMode InputRuneMode) Accept(r rune) { a := runeMode.api if a.Output.suspended > 0 { runeMode.MoveCursor(r) return } curBytesEnd := a.pointers.bytesEnd maxRequiredBytes := curBytesEnd + utf8.UTFMax a.growOutputData(maxRequiredBytes) w := utf8.EncodeRune(a.outputBytes[curBytesEnd:], r) a.pointers.bytesEnd += w runeMode.MoveCursor(r) } // AcceptMulti is used to accept one or more runes that were read from the input. // This tells the tokenizer: "I've seen these runes. I want to make use of them // for the final output, so please remember them for me. I will now continue // reading after these runes." // // This will update the position of the cursor (which keeps track of what line // and column we are on in the input data) and add the runes to the tokenizer // output. // // After the call, byte offset 0 for PeekByte() and PeekRune() will point at // the first byte after the accepted runes. func (runeMode InputRuneMode) AcceptMulti(runes ...rune) { a := runeMode.api if a.Output.suspended > 0 { runeMode.MoveCursorMulti(runes...) return } curBytesEnd := a.pointers.bytesEnd maxBytes := curBytesEnd + len(runes)*utf8.UTFMax a.growOutputData(maxBytes) for _, r := range runes { w := utf8.EncodeRune(a.outputBytes[curBytesEnd:], r) curBytesEnd += w runeMode.MoveCursor(r) } a.pointers.bytesEnd = curBytesEnd } // MoveCursor updates the position of the read cursor, based on the provided rune. // This method takes newlines into account to keep track of line numbers and // column positions for the input cursor. // // After the call, byte offset 0 for Peek() and PeekMulti() will point at // the first rune at the new cursor position. func (runeMode InputRuneMode) MoveCursor(r rune) int { a := runeMode.api if r == '\n' { a.pointers.column = 0 a.pointers.line++ } else { a.pointers.column++ } width := utf8.RuneLen(r) a.pointers.offset += width return width } // MoveCursorMulti updates the position of the read cursor, based on the provided runes. // This method takes newlines into account to keep track of line numbers and // column positions for the input cursor. // // After the call, byte offset 0 for Peek() and PeekMulti() will point at // the first rune at the new cursor position. func (runeMode InputRuneMode) MoveCursorMulti(runes ...rune) { for _, r := range runes { runeMode.MoveCursor(r) } }