go-toml/lexer/lexer.go

package lexer

import (
	"errors"
	"fmt"
	"strings"
	"unicode/utf8"
)

// Lexer holds the state of the lexer.
type Lexer struct {
	input    string       // the scanned input string
	state    stateFn      // a function that handles the current state
	stack    []stateFn    // state function stack, for nested parsing
	pos      int          // current scanning position in the input
	width    int          // width of the last rune read, for supporting backup()
	buffer   StringBuffer // an efficient buffer, used to build string values
	items    chan Item    // channel of resulting lexer items
	nextItem Item         // the current item as reached by Next() and retrieved by Get()
	err      error        // an error message when lexing failed, retrieved by Error()
}

// Lex takes an input string and initializes the TOML lexer for it.
// Usage:
//
//     l := lexer.Lex("...inputstring...")
//     for l.Next() {
//         item := l.Get()
//         ... handle item ...
//     }
//     if e := l.Error(); e != nil {
//         ... handle error message ...
//     }
func Lex(input string) *Lexer {
	return &Lexer{
		input: input,
		state: stateKeyValuePair,
		items: make(chan Item, 2),
	}
}

// Next advances to the next lexer item in the input string.
// When a next item was found, then true is returned.
// On error or reaching the end of the input, false is returned.
func (l *Lexer) Next() bool {
	if l.state == nil {
		panic("This should not happen: nil state reached, but entering Next()")
	}
	for {
		select {
		case i := <-l.items:
			if i.Type == ItemEOF {
				return false
			}
			if i.Type == ItemError {
				l.err = errors.New(i.Value)
				return false
			}
			l.nextItem = i
			return true
		default:
			l.state = l.state(l)
		}
	}
}

func (l *Lexer) Error() error {
	return l.err
}

// Get returns the next lexer item, as reached by Next()
func (l *Lexer) Get() Item {
	return l.nextItem
}

// ToArray returns lexer items as an array.
// When an error occurs during scanning, a partial result will be
// returned, accompanied by the error that occurred.
func (l *Lexer) ToArray() ([]Item, error) {
	var items []Item
	for l.Next() {
		items = append(items, l.Get())
	}
	return items, l.Error()
}

// pushState adds the state function to its stack.
// This is used for implementing nested parsing.
func (l *Lexer) pushState(state stateFn) {
	l.stack = append(l.stack, state)
}

// popState pops the last pushed state from its stack.
func (l *Lexer) popState() stateFn {
	last := len(l.stack) - 1
	head, tail := l.stack[:last], l.stack[last]
	l.stack = head
	return tail
}

// atEndOfFile returns true when there is no more data available in the input.
func (l *Lexer) atEndOfFile() bool {
	return l.pos >= len(l.input)
}

// emit passes a lexer item back to the client, including the provided string.
func (l *Lexer) emit(t itemType, s string) {
	l.items <- Item{t, s}
	l.buffer.Reset()
}

// emitLiteral passes a lexer item back to the client, including the accumulated
// string buffer data as a literal string.
func (l *Lexer) emitLiteral(t itemType) {
	l.emit(t, l.buffer.AsLiteralString())
}

// emitTrimmedLiteral passes a lexer item back to the client, including the
// accumulated string buffer data as a literal string with whitespace
// trimmed from it.
func (l *Lexer) emitTrimmedLiteral(t itemType) {
	l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
}

// emitInterpreted passes a lexer item back to the client, including the
// accumulated string buffer data an interpreted string (handling escape
// codes like \n, \t, \uXXXX, etc.)
// This method might return an error, in case there is data in the
// string buffer that is not valid for string interpretation.
func (l *Lexer) emitInterpreted(t itemType) error {
	s, err := l.buffer.AsInterpretedString()
	if err != nil {
		return err
	}
	l.emit(t, s)
	return nil
}

// backup steps back one rune
// Can be called only once per call of next.
func (l *Lexer) backup() {
	l.pos -= l.width
}

// peek returns but does not advance to the next rune(s) in the input.
// Returns the rune, its width and a boolean. The boolean will be false in case
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
func (l *Lexer) peek() (rune, int, bool) {
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	switch {
	case r == utf8.RuneError:
		return utf8.RuneError, w, false
	default:
		return r, w, true
	}
}

// peekMulti takes a peek at multiple upcoming runes in the input.
// Returns a slice of runes and a boolean. The boolean will be false in case
// less upcoming runes can be peeked than the requested amount
// (end of data or invalid UTF8 character).
func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
	offset := 0
	var peeked []rune
	for i := 0; i < amount; i++ {
		r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:])
		switch {
		case r == utf8.RuneError:
			return peeked, false
		default:
			offset += w
			peeked = append(peeked, r)
		}
	}
	return peeked, true
}

// acceptNext adds the specified amount of runes from the input to the string buffer.
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
func (l *Lexer) acceptNext(count int) bool {
	for i := 0; i < count; i++ {
		r := l.next()
		if r == endOfFile || r == utf8.RuneError {
			return false
		}
		l.buffer.WriteRune(r)
	}
	return true
}

// acceptFrom adds the next rune from the input to the string buffer
// when it matches in the provided runes. If the next rune does
// not match, false is returned.
func (l *Lexer) acceptFrom(runes string) bool {
	r := l.next()
	if strings.IndexRune(runes, r) >= 0 {
		l.buffer.WriteRune(r)
		return true
	}
	l.backup()
	return false
}

// acceptRun adds consecutive runes from the input to the string
// buffer when they match the provided runes. If no runes were added
// at all, false it returned.
func (l *Lexer) acceptRun(runes string) bool {
	accepted := false
	for l.acceptFrom(runes) {
		accepted = true
	}
	return accepted
}

// TODO meh... ugly rune.
var endOfFile rune = -1

// next returns the next rune from the input.
func (l *Lexer) next() rune {
	l.width = 0
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	switch {
	case r == utf8.RuneError && w == 0:
		return endOfFile
	case r == utf8.RuneError:
		return utf8.RuneError
	default:
		l.width = w
		l.pos += w
		return r
	}
}

// skip skips a rune from the set of accepted runes.
// Returns true when a rune was skipped.
func (l *Lexer) skip(runes string) bool {
	r, w, _ := l.peek()
	if strings.IndexRune(runes, r) >= 0 {
		l.pos += w
		return true
	}
	return false
}

// skipRun skips a run of runes from the set of accepted runes.
// Returns true when one or more runes were skipped.
func (l *Lexer) skipRun(runes string) bool {
	didSkip := false
	for l.skip(runes) {
		didSkip = true
	}
	return didSkip
}

// accept adds the next rune to the string buffer and returns true if it's
// from the valid set of runes. Otherwise false is returned.
func (l *Lexer) accept(runes string) bool {
	r := l.next()
	if strings.IndexRune(runes, r) >= 0 {
		return true
	}
	l.backup()
	return false
}

func (l *Lexer) upcoming(runes ...string) bool {
	if peeked, ok := l.peekMulti(len(runes)); ok {
		for i, r := range runes {
			if strings.IndexRune(r, peeked[i]) < 0 {
				return false
			}
		}
		return true
	}
	return false
}

// TODO nog nodig met stringbuffer?
// acceptNot consumes the next rune if it's not from the set of runes.
func (l *Lexer) acceptNot(runes string) bool {
	r := l.next()
	if r == endOfFile {
		l.backup()
		return false
	}
	if strings.IndexRune(runes, r) < 0 {
		return true
	}
	l.backup()
	return false
}

// acceptUntil consumes a run of runes until ones from the
// valid set is encountered.
func (l *Lexer) acceptUntil(runes string) bool {
	accepted := false
	for l.acceptNot(runes) {
		accepted = true
	}
	return accepted
}

// acceptRun consumes a run of runes from the set of accepted runes.
func (l *Lexer) acceptWhile(runes string) bool {
	accepted := false
	for l.accept(runes) {
		accepted = true
	}
	return accepted
}

// skipUntil skips a run of runes, until a rune from the set of
// runes of EOF is reached.
func (l *Lexer) skipUntil(runes string) {
	l.acceptUntil(runes)
}

// error returns an error token and terminates the scan
// by returning nil to l.run.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- Item{
		ItemError,
		fmt.Sprintf(format, args...),
	}
	return nil
}

func (l *Lexer) unexpectedInputError(expected string) stateFn {
	var actual string
	switch {
	case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring?
		actual = "end of file"
	case !utf8.ValidString(l.input[l.pos:]):
		actual = "non-UTF8 data"
	default:
		r, _, _ := l.peek()
		actual = fmt.Sprintf("token '%c'", r)
	}
	return l.errorf("Unexpected %s (expected %s)", actual, expected)
}

func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
	return l.errorf("Unexpected end of file (expected %s)", expected)
}