package lexer

import (
	"fmt"
	"strings"
	"unicode/utf8"

	"github.com/mmakaay/toml/parser"
)

// Lexer holds the state of the lexer.
type Lexer struct {
	input        string           // the scanned input
	state        parser.StateFn   // a function that handles the current state
	stack        []parser.StateFn // state function stack, for nested parsing
	len          int              // the total length of the input in bytes
	pos          int              // current byte scanning position in the input
	newline      bool             // keep track of when we have scanned a newline
	cursorRow    int              // current row number in the input
	cursorColumn int              // current column position in the input
	buffer       StringBuffer     // an efficient buffer, used to build string values
	items        chan parser.Item // channel of resulting lexer items
	item         parser.Item      // the current item as reached by Next() and retrieved by Get()
	err          *Error           // an error when lexing failed, retrieved by Error()
}

// Error is used as the error type when lexing errors occur.
// The error includes some extra meta information to allow for useful
// error messages to the user.
type Error struct {
	Message string
	Row     int
	Column  int
}

func (err *Error) Error() string {
	return err.Message
}

// New takes an input string and initializes the lexer for it.
func New(input string) *Lexer {
	return &Lexer{
		input: input,
		len:   len(input),
		state: stateKeyValuePair,
		items: make(chan parser.Item, 2),
	}
}

// Next advances to the next lexer item in the input string.
// When a valid item was found, then the boolean return parameter will be true.
// On error or when reaching the end of the input, false is returned.
// When an error occurred, it will be set in the error return value, nil otherwise.
func (l *Lexer) Next() (parser.Item, *Error, bool) {
	for {
		select {
		case i := <-l.items:
			switch {
			case i.Type == ItemEOF:
				return i, nil, false
			case i.Type == ItemError:
				l.err = &Error{i.Value, l.cursorRow, l.cursorColumn}
				return i, l.err, false
			default:
				l.item = i
				return i, nil, true
			}
		default:
			l.state = l.state(l)
		}
	}
}

// ToArray returns lexer items as an array (mainly intended for testing purposes).
// When an error occurs during scanning, a partial result will be
// returned, accompanied by the error that occurred.
func (l *Lexer) ToArray() ([]parser.Item, *Error) {
	var items []parser.Item
	for {
		item, err, more := l.Next()
		if !more {
			return items, err
		}
		items = append(items, item)
	}
}
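// A minimal usage sketch (illustrative only, not code used by this package):
// a client creates a Lexer with New and calls Next repeatedly until it
// reports that no more items are available. The TOML input string and the
// way items and errors are handled below are assumptions for the example.
//
//	l := lexer.New(`key = "value"`)
//	for {
//		item, err, more := l.Next()
//		if !more {
//			if err != nil {
//				fmt.Printf("lexer error at row %d, column %d: %s\n", err.Row, err.Column, err)
//			}
//			break
//		}
//		fmt.Println(item.Type, item.Value)
//	}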
// pushState adds the state function to its stack.
// This is used for implementing nested parsing.
func (l *Lexer) pushState(state parser.StateFn) {
	l.stack = append(l.stack, state)
}

// popState pops the last pushed state from its stack.
func (l *Lexer) popState() parser.StateFn {
	last := len(l.stack) - 1
	head, tail := l.stack[:last], l.stack[last]
	l.stack = head
	return tail
}

// atEndOfFile returns true when there is no more data available in the input.
func (l *Lexer) atEndOfFile() bool {
	return l.pos >= l.len
}

// emit passes a lexer item back to the client, including the provided string.
func (l *Lexer) emit(t parser.ItemType, s string) {
	l.items <- parser.Item{Type: t, Value: s}
	l.buffer.Reset()
}

// emitLiteral passes a lexer item back to the client, including the accumulated
// string buffer data as a literal string.
func (l *Lexer) emitLiteral(t parser.ItemType) {
	l.emit(t, l.buffer.AsLiteralString())
}

// emitTrimmedLiteral passes a lexer item back to the client, including the
// accumulated string buffer data as a literal string with whitespace
// trimmed from it.
func (l *Lexer) emitTrimmedLiteral(t parser.ItemType) {
	l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
}

// emitInterpreted passes a lexer item back to the client, including the
// accumulated string buffer data as an interpreted string (handling escape
// codes like \n, \t, \uXXXX, etc.).
// This method might return an error, in case there is data in the
// string buffer that is not valid for string interpretation.
func (l *Lexer) emitInterpreted(t parser.ItemType) error {
	s, err := l.buffer.AsInterpretedString()
	if err != nil {
		return err
	}
	l.emit(t, s)
	return nil
}

// emitError emits a lexer error item back to the client.
func (l *Lexer) emitError(format string, args ...interface{}) parser.StateFn {
	message := fmt.Sprintf(format, args...)
	l.emit(ItemError, message)
	return nil
}

// peek returns, but does not advance to, the next rune in the input.
// Returns the rune, its width and a boolean. The boolean will be false in case
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
func (l *Lexer) peek() (rune, int, bool) {
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	return r, w, r != utf8.RuneError
}

// peekMulti takes a peek at multiple upcoming runes in the input.
// Returns a slice of runes, their total byte width and a boolean. The boolean
// will be false in case fewer upcoming runes can be peeked than the requested
// amount (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) {
	width := 0
	var peeked []rune
	for i := 0; i < amount; i++ {
		r, w := utf8.DecodeRuneInString(l.input[l.pos+width:])
		switch {
		case r == utf8.RuneError:
			return peeked, width, false
		default:
			width += w
			peeked = append(peeked, r)
		}
	}
	return peeked, width, true
}

// acceptAny adds the next rune from the input to the string buffer.
// If no rune could be read (end of file or invalid UTF8 data), then
// false is returned.
func (l *Lexer) acceptAny() bool {
	if r, ok := l.next(); ok {
		l.buffer.WriteRune(r)
		return true
	}
	return false
}

// accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(matches ...string) bool {
	return l.acceptPattern(matches...)
}

// acceptPattern adds the next runes to the string buffer, but only
// if the upcoming runes satisfy the provided pattern.
// When runes were added, true is returned, false otherwise.
func (l *Lexer) acceptPattern(pattern ...string) bool {
	return l.progress(func(r rune) { l.buffer.WriteRune(r) }, pattern...)
}

// progress checks the upcoming runes against the provided matches and, when
// they all match, consumes them, invoking the callback for each rune and
// advancing the cursor. It returns true when input was consumed.
func (l *Lexer) progress(callback func(rune), matches ...string) bool {
	if runes, w, ok := l.match(matches...); ok {
		l.pos += w
		for _, r := range runes {
			callback(r)
			l.advanceCursor(r)
		}
		return true
	}
	return false
}

// acceptConsecutive adds consecutive runes from the input to the string
// buffer when they match the rune match.
// If any runes were added, true is returned, false otherwise.
func (l *Lexer) acceptConsecutive(match string) bool {
	accepted := false
	for l.accept(match) {
		accepted = true
	}
	return accepted
}

// advanceCursor advances the rune cursor one position in the
// input data. While doing so, it keeps track of newlines,
// so we can report on row + column positions on error.
func (l *Lexer) advanceCursor(r rune) {
	if l.newline {
		l.cursorColumn = 0
		l.cursorRow++
	} else {
		l.cursorColumn++
	}
	l.newline = r == '\n'
}
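// The accept/emit helpers above (and the skip/match helpers below) are meant
// to be combined inside state functions. The following is a hypothetical
// sketch of such a state function; stateInteger and ItemInteger do not exist
// in this package and only serve to illustrate the flow:
//
//	func stateInteger(l *Lexer) parser.StateFn {
//		if !l.acceptConsecutive("0123456789") {
//			return l.unexpectedInputError("an integer value")
//		}
//		l.emitLiteral(ItemInteger) // ItemInteger is an assumed item type
//		return stateKeyValuePair
//	}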
// skipMatching skips runes, but only when all provided matches are satisfied.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipMatching(pattern ...string) bool {
	return l.progress(func(r rune) {}, pattern...)
}

// skipConsecutive skips consecutive runes from the provided match.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipConsecutive(pattern string) bool {
	didSkip := false
	for l.skipMatching(pattern) {
		didSkip = true
	}
	return didSkip
}

// upcoming checks if the upcoming runes satisfy the provided rune matches.
// This is a lot like the match method, with the difference that
// this one only returns the boolean value.
func (l *Lexer) upcoming(matches ...string) bool {
	_, _, ok := l.match(matches...)
	return ok
}

// next returns the next rune from the input and a boolean indicating whether
// reading the input was successful.
// When the end of the input is reached, or an invalid UTF8 character is
// read, then false is returned.
func (l *Lexer) next() (rune, bool) {
	r, w, ok := l.peek()
	if ok {
		l.pos += w
		l.advanceCursor(r)
		return r, true
	}
	if r == utf8.RuneError && w == 0 {
		l.emitError("unexpected end of file")
	} else {
		l.emitError("invalid UTF8 character")
	}
	return r, false
}

// match checks if the upcoming runes satisfy the provided rune matches.
// It returns a slice of runes that were found, their total byte width
// and a boolean indicating whether or not all provided matches matched
// the input data.
func (l *Lexer) match(matches ...string) ([]rune, int, bool) {
	peeked, width, ok := l.peekMulti(len(matches))
	if ok {
		for i, r := range matches {
			if strings.IndexRune(r, peeked[i]) < 0 {
				return peeked, width, false
			}
		}
		return peeked, width, true
	}
	return peeked, width, false
}

func (l *Lexer) unexpectedInputError(expected string) parser.StateFn {
	// next() takes care of emitting errors for ok == false.
	if r, ok := l.next(); ok {
		return l.emitError("unexpected character %q (expected %s)", r, expected)
	}
	return nil
}

func (l *Lexer) unexpectedEndOfFile(expected string) parser.StateFn {
	return l.emitError("unexpected end of file (expected %s)", expected)
}
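// Illustrative note on the pattern arguments accepted by match, accept,
// acceptPattern, skipMatching and upcoming: each string describes the set of
// runes that is allowed at one upcoming input position. The calls below are
// hypothetical examples, not code used by this package:
//
//	l.accept("abc")                // accept one rune: 'a', 'b' or 'c'
//	l.acceptPattern("#", " \t")    // accept a '#' followed by a space or a tab
//	l.skipConsecutive(" \t")       // skip a whole run of spaces and tabs
//	l.upcoming("\"", "\"", "\"")   // check for three upcoming double quotes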