package lexer import ( "errors" "fmt" "strings" "unicode/utf8" ) // Lexer holds the state of the lexer. type Lexer struct { input string // the scanned input string state stateFn // a function that handles the current state stack []stateFn // state function stack, for nested parsing pos int // current scanning position in the input width int // width of the last rune read, for supporting backup() buffer StringBuffer // an efficient buffer, used to build string values items chan Item // channel of resulting lexer items nextItem Item // the current item as reached by Next() and retrieved by Get() err error // an error message when lexing failed, retrieved by Error() } // Lex takes an input string and initializes the TOML lexer for it. // Usage: // // l := lexer.Lex("...inputstring...") // for l.Next() { // item := l.Get() // ... handle item ... // } // if e := l.Error(); e != nil { // ... handle error message ... // } func Lex(input string) *Lexer { return &Lexer{ input: input, state: stateKeyValuePair, items: make(chan Item, 2), } } // Next advances to the next lexer item in the input string. // When a next item was found, then true is returned. // On error or reaching the end of the input, false is returned. func (l *Lexer) Next() bool { if l.state == nil { panic("This should not happen: nil state reached, but entering Next()") } for { select { case i := <-l.items: if i.Type == ItemEOF { return false } if i.Type == ItemError { l.err = errors.New(i.Value) return false } l.nextItem = i return true default: l.state = l.state(l) } } } func (l *Lexer) Error() error { return l.err } // Get returns the next lexer item, as reached by Next() func (l *Lexer) Get() Item { return l.nextItem } // ToArray returns lexer items as an array. // When an error occurs during scanning, a partial result will be // returned, accompanied by the error that occurred. 
func (l *Lexer) ToArray() ([]Item, error) { var items []Item for l.Next() { items = append(items, l.Get()) } return items, l.Error() } // pushState adds the state function to its stack. // This is used for implementing nested parsing. func (l *Lexer) pushState(state stateFn) { l.stack = append(l.stack, state) } // popState pops the last pushed state from its stack. func (l *Lexer) popState() stateFn { last := len(l.stack) - 1 head, tail := l.stack[:last], l.stack[last] l.stack = head return tail } // atEndOfFile returns true when there is no more data available in the input. func (l *Lexer) atEndOfFile() bool { return l.pos >= len(l.input) } // emit passes a lexer item back to the client, including the provided string. func (l *Lexer) emit(t itemType, s string) { l.items <- Item{t, s} l.buffer.Reset() } // emitLiteral passes a lexer item back to the client, including the accumulated // string buffer data as a literal string. func (l *Lexer) emitLiteral(t itemType) { l.emit(t, l.buffer.AsLiteralString()) } // emitTrimmedLiteral passes a lexer item back to the client, including the // accumulated string buffer data as a literal string with whitespace // trimmed from it. func (l *Lexer) emitTrimmedLiteral(t itemType) { l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString())) } // emitInterpreted passes a lexer item back to the client, including the // accumulated string buffer data an interpreted string (handling escape // codes like \n, \t, \uXXXX, etc.) // This method might return an error, in case there is data in the // string buffer that is not valid for string interpretation. func (l *Lexer) emitInterpreted(t itemType) error { s, err := l.buffer.AsInterpretedString() if err != nil { return err } l.emit(t, s) return nil } // backup steps back one rune // Can be called only once per call of next. func (l *Lexer) backup() { l.pos -= l.width } // peek returns but does not advance to the next rune(s) in the input. // Returns the rune, its width and a boolean. 
The boolean will be false in case // no upcoming rune can be peeked (end of data or invalid UTF8 character). func (l *Lexer) peek() (rune, int, bool) { r, w := utf8.DecodeRuneInString(l.input[l.pos:]) switch { case r == utf8.RuneError: return utf8.RuneError, w, false default: return r, w, true } } // peekMulti takes a peek at multiple upcoming runes in the input. // Returns a slice of runes and a boolean. The boolean will be false in case // less upcoming runes can be peeked than the requested amount // (end of data or invalid UTF8 character). func (l *Lexer) peekMulti(amount int) ([]rune, bool) { offset := 0 var peeked []rune for i := 0; i < amount; i++ { r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:]) switch { case r == utf8.RuneError: return peeked, false default: offset += w peeked = append(peeked, r) } } return peeked, true } // acceptNext adds the next rune from the input to the string buffer. // If no rune could be read (end of file or invalid UTF8 data), // then false is returned. func (l *Lexer) acceptNext() bool { r := l.next() if r == endOfFile || r == utf8.RuneError { return false } l.buffer.WriteRune(r) return true } // acceptFrom adds the next rune from the input to the string buffer // when it matches in the provided runes. If the next rune does // not match, false is returned. func (l *Lexer) acceptFrom(runes string) bool { r := l.next() if strings.IndexRune(runes, r) >= 0 { l.buffer.WriteRune(r) return true } l.backup() return false } // acceptRun adds consecutive runes from the input to the string // buffer when they match the provided runes. If no runes were added // at all, false it returned. func (l *Lexer) acceptRun(runes string) bool { accepted := false for l.acceptFrom(runes) { accepted = true } return accepted } // TODO meh... ugly rune. var endOfFile rune = -1 // next returns the next rune from the input. 
func (l *Lexer) next() rune { l.width = 0 r, w := utf8.DecodeRuneInString(l.input[l.pos:]) switch { case r == utf8.RuneError && w == 0: return endOfFile case r == utf8.RuneError: return utf8.RuneError default: l.width = w l.pos += w return r } } // skip skips a rune from the set of accepted runes. // Returns true when a rune was skipped. func (l *Lexer) skip(runes string) bool { r, w, _ := l.peek() if strings.IndexRune(runes, r) >= 0 { l.pos += w return true } return false } // skipRun skips a run of runes from the set of accepted runes. // Returns true when one or more runes were skipped. func (l *Lexer) skipRun(runes string) bool { didSkip := false for l.skip(runes) { didSkip = true } return didSkip } // accept adds the next rune to the string buffer and returns true if it's // from the valid set of runes. Otherwise false is returned. func (l *Lexer) accept(runes string) bool { r := l.next() if strings.IndexRune(runes, r) >= 0 { return true } l.backup() return false } func (l *Lexer) upcoming(runes string) bool { if l.accept(runes) { l.backup() return true } return false } // TODO nog nodig met stringbuffer? // acceptNot consumes the next rune if it's not from the set of runes. func (l *Lexer) acceptNot(runes string) bool { r := l.next() if r == endOfFile { l.backup() return false } if strings.IndexRune(runes, r) < 0 { return true } l.backup() return false } // acceptUntil consumes a run of runes until ones from the // valid set is encountered. func (l *Lexer) acceptUntil(runes string) bool { accepted := false for l.acceptNot(runes) { accepted = true } return accepted } // acceptRun consumes a run of runes from the set of accepted runes. func (l *Lexer) acceptWhile(runes string) bool { accepted := false for l.accept(runes) { accepted = true } return accepted } // skipUntil skips a run of runes, until a rune from the set of // runes of EOF is reached. 
func (l *Lexer) skipUntil(runes string) {
	l.acceptUntil(runes)
}

// errorf emits an ItemError item, carrying the formatted error message,
// and terminates the scan by returning a nil state function.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- Item{
		ItemError,
		fmt.Sprintf(format, args...),
	}
	return nil
}

// unexpectedInputError emits an error item that describes what was found
// at the current input position instead of the expected input.
func (l *Lexer) unexpectedInputError(expected string) stateFn {
	var actual string
	switch {
	case l.atEndOfFile():
		// TODO maybe not hit anymore after refactoring?
		actual = "end of file"
	case !utf8.ValidString(l.input[l.pos:]):
		actual = "non-UTF8 data"
	default:
		// A valid rune is upcoming; show it in the error message.
		r, _, _ := l.peek()
		actual = fmt.Sprintf("token '%c'", r)
	}
	return l.errorf("Unexpected %s (expected %s)", actual, expected)
}

// unexpectedEndOfFile emits an error item stating that the end of the
// file was reached, where the provided expected input was required.
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
	return l.errorf("Unexpected end of file (expected %s)", expected)
}