package tokenize

import (
	"fmt"
	"unicode/utf8"

	"git.makaay.nl/mauricem/go-parsekit/read"
)

// Input provides input-related functionality for the tokenize API.
type Input struct {
	api    *API
	reader *read.Buffer // the input data reader
}

// Reset moves the input cursor back to the beginning for the currently active API child.
// Additionally, any output (bytes and tokens) that was emitted from the API child is
// cleared as well, and the error stored on the stack frame is reset to nil.
func (i *Input) Reset() {
	if i.api.stackLevel == 0 {
		// There is no parent frame at the root level: rewind to the very start.
		i.api.stackFrame.column = 0
		i.api.stackFrame.line = 0
		i.api.stackFrame.offset = 0
	} else {
		// A child frame rewinds to the position recorded in its parent frame.
		parent := i.api.stackFrames[i.api.stackLevel-1]
		i.api.stackFrame.column = parent.column
		i.api.stackFrame.line = parent.line
		i.api.stackFrame.offset = parent.offset
	}
	// Drop the output of this frame by collapsing the end markers onto the start markers.
	i.api.stackFrame.bytesEnd = i.api.stackFrame.bytesStart
	i.api.stackFrame.tokenEnd = i.api.stackFrame.tokenStart
	i.api.stackFrame.err = nil
}

// Cursor returns a human-readable description of the current cursor position.
//
// When the cursor is at line 0, column 0, the string "start of file" is
// returned. Otherwise the result has the form "line <n>, column <m>", in
// which both numbers are 1-based (the internal counters are 0-based).
func (i *Input) Cursor() string {
	if i.api.stackFrame.line == 0 && i.api.stackFrame.column == 0 {
		return "start of file"
	}
	return fmt.Sprintf("line %d, column %d", i.api.stackFrame.line+1, i.api.stackFrame.column+1)
}

// PeekByte returns the byte at the provided byte offset.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (i *Input) PeekByte(offset int) (byte, error) {
	return i.reader.ByteAt(i.api.stackFrame.offset + offset)
}

// SkipByte is used to skip over a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. It is of no interest.
// I will now continue reading after this byte."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The byte is not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped byte.
func (i *Input) SkipByte(b byte) {
	i.api.stackFrame.moveCursorByByte(b)
	i.api.stackFrame.offset++
}

// SkipBytes is used to skip over one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. They are of no interest.
// I will now continue reading after these bytes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The bytes are not added to
// the output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped bytes.
func (i *Input) SkipBytes(bytes ...byte) {
	for _, b := range bytes {
		i.api.stackFrame.moveCursorByByte(b)
		i.api.stackFrame.offset++
	}
}

// AcceptByte is used to accept a single byte that was read from the input.
// This tells the tokenizer: "I've seen this byte. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this byte."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the byte to the tokenizer
// output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the accepted byte.
func (i *Input) AcceptByte(b byte) {
	curBytesEnd := i.api.stackFrame.bytesEnd

	// Grow the output store when needed. The check is on len (not cap),
	// because the store is written to by index below.
	if required := curBytesEnd + 1; len(i.api.Output.data) < required {
		i.growOutput(required)
	}

	i.api.Output.data[curBytesEnd] = b
	i.api.stackFrame.moveCursorByByte(b)
	i.api.stackFrame.bytesEnd++
	i.api.stackFrame.offset++
}

// AcceptBytes is used to accept one or more bytes that were read from the input.
// This tells the tokenizer: "I've seen these bytes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these bytes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the bytes to the tokenizer
// output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the accepted bytes.
func (i *Input) AcceptBytes(bytes ...byte) {
	curBytesEnd := i.api.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + len(bytes)

	// Grow the output store when needed. The check is on len (not cap),
	// because copy() below only writes within the length of the store.
	if len(i.api.Output.data) < newBytesEnd {
		i.growOutput(newBytesEnd)
	}

	copy(i.api.Output.data[curBytesEnd:], bytes)
	for _, b := range bytes {
		i.api.stackFrame.moveCursorByByte(b)
		i.api.stackFrame.offset++
	}
	i.api.stackFrame.bytesEnd = newBytesEnd
}

// PeekRune returns the UTF8 rune at the provided byte offset, including its byte width.
//
// The byte width is useful to know what byte offset you'll have to use to peek
// the next byte or rune. Some UTF8 runes take up 4 bytes of data, so when the
// first rune starts at offset = 0, the second rune might start at offset = 4.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf8.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// When an error occurs during reading the input, an error will be returned.
// When an offset is requested that is beyond the length of the available input
// data, then the error will be io.EOF.
func (i *Input) PeekRune(offset int) (rune, int, error) {
	return i.reader.RuneAt(i.api.stackFrame.offset + offset)
}

// SkipRune is used to skip over a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. It is of no interest.
// I will now continue reading after this rune."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The rune is not added to
// the output.
//
// The read offset is advanced by the UTF8-encoded width of the rune. A rune
// that cannot be encoded in UTF8 is counted as utf8.RuneError, which is what
// AcceptRune() stores for such a rune.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped rune.
func (i *Input) SkipRune(r rune) {
	i.api.stackFrame.moveCursorByRune(r)

	width := utf8.RuneLen(r)
	if width < 0 {
		// RuneLen reports -1 for runes that are not valid UTF8 values. Adding
		// that to the offset would move the read position backwards.
		width = utf8.RuneLen(utf8.RuneError)
	}
	i.api.stackFrame.offset += width
}

// SkipRunes is used to skip over one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. They are of no interest.
// I will now continue reading after these runes."
//
// This will merely update the position of the cursor (which keeps track of what
// line and column we are on in the input data). The runes are not added to
// the output.
//
// The read offset is advanced by the UTF8-encoded width of each rune. A rune
// that cannot be encoded in UTF8 is counted as utf8.RuneError, which is what
// AcceptRunes() stores for such a rune.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the skipped runes.
func (i *Input) SkipRunes(runes ...rune) {
	for _, r := range runes {
		i.api.stackFrame.moveCursorByRune(r)

		width := utf8.RuneLen(r)
		if width < 0 {
			// See SkipRune: never let an unencodable rune rewind the offset.
			width = utf8.RuneLen(utf8.RuneError)
		}
		i.api.stackFrame.offset += width
	}
}

// AcceptRune is used to accept a single rune that was read from the input.
// This tells the tokenizer: "I've seen this rune. I want to make use of it
// for the final output, so please remember it for me. I will now continue
// reading after this rune."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the rune to the tokenizer
// output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the accepted rune.
func (i *Input) AcceptRune(r rune) {
	curBytesEnd := i.api.stackFrame.bytesEnd

	// Grow the output store when needed. Room for the widest possible rune
	// is reserved, so the rune can be encoded directly into the store.
	if required := curBytesEnd + utf8.UTFMax; len(i.api.Output.data) < required {
		i.growOutput(required)
	}

	i.api.stackFrame.moveCursorByRune(r)
	w := utf8.EncodeRune(i.api.Output.data[curBytesEnd:], r)
	i.api.stackFrame.bytesEnd += w
	i.api.stackFrame.offset += w
}

// AcceptRunes is used to accept one or more runes that were read from the input.
// This tells the tokenizer: "I've seen these runes. I want to make use of them
// for the final output, so please remember them for me. I will now continue
// reading after these runes."
//
// This will update the position of the cursor (which keeps track of what line
// and column we are on in the input data) and add the runes to the tokenizer
// output.
//
// After the call, byte offset 0 for PeekByte() and PeekRune() will point at
// the first byte after the accepted runes.
func (i *Input) AcceptRunes(runes ...rune) {
	// Converting to a string yields the UTF8 encoding of all runes at once,
	// which also provides the total byte length that is needed.
	runesAsString := string(runes)
	byteLen := len(runesAsString)
	curBytesEnd := i.api.stackFrame.bytesEnd
	newBytesEnd := curBytesEnd + byteLen

	// Grow the output store when needed. The check is on len (not cap),
	// because copy() below only writes within the length of the store.
	if len(i.api.Output.data) < newBytesEnd {
		i.growOutput(newBytesEnd)
	}

	for _, r := range runes {
		i.api.stackFrame.moveCursorByRune(r)
	}
	copy(i.api.Output.data[curBytesEnd:], runesAsString)

	i.api.stackFrame.bytesEnd = newBytesEnd
	i.api.stackFrame.offset += byteLen
}

// Flush flushes input data from the read.Buffer up to the current
// read offset of the parser, and resets that read offset to 0.
//
// The return value is true when data was flushed, and false when the
// read offset was already 0 (in which case nothing is done).
//
// Note:
// When writing your own TokenHandler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when possible.
func (i *Input) Flush() bool {
	if i.api.stackFrame.offset > 0 {
		i.reader.Flush(i.api.stackFrame.offset)
		i.api.stackFrame.offset = 0
		return true
	}
	return false
}

// growOutput replaces the output byte store with a new one of twice the
// required size, copying over the existing contents. Over-allocating keeps
// the number of reallocations low when output is accepted in small pieces.
//
// This is the slow path of the Accept*() methods; they perform the cheap
// size check themselves and only call this method when growth is needed.
func (i *Input) growOutput(required int) {
	grown := make([]byte, required*2)
	copy(grown, i.api.Output.data)
	i.api.Output.data = grown
}