package lexer
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// Lexer holds the state of the lexer.
// It is driven by Next(), which repeatedly invokes the current state
// function until an Item arrives on the items channel.
type Lexer struct {
	input    string       // the scanned input string
	state    stateFn      // a function that handles the current state
	stack    []stateFn    // state function stack, for nested parsing
	pos      int          // current scanning position in the input (byte offset)
	width    int          // width of the last rune read, for supporting backup()
	buffer   StringBuffer // an efficient buffer, used to build string values
	items    chan Item    // channel of resulting lexer items
	nextItem Item         // the current item as reached by Next() and retrieved by Get()
	err      error        // an error message when lexing failed, retrieved by Error()
}
|
|
|
|
// Lex takes an input string and initializes the TOML lexer for it.
|
|
// Usage:
|
|
//
|
|
// l := lexer.Lex("...inputstring...")
|
|
// for l.Next() {
|
|
// item := l.Get()
|
|
// ... handle item ...
|
|
// }
|
|
// if e := l.Error(); e != nil {
|
|
// ... handle error message ...
|
|
// }
|
|
func Lex(input string) *Lexer {
|
|
return &Lexer{
|
|
input: input,
|
|
state: stateKeyValuePair,
|
|
items: make(chan Item, 2),
|
|
}
|
|
}
|
|
|
|
// Next advances to the next lexer item in the input string.
// When a next item was found, then true is returned.
// On error or reaching the end of the input, false is returned.
// After false was returned, a possible error is available via Error().
func (l *Lexer) Next() bool {
	if l.state == nil {
		// The state machine only reaches a nil state after emitting
		// ItemEOF or ItemError, both of which make Next() return false.
		// Calling Next() again after that is a caller bug.
		panic("This should not happen: nil state reached, but entering Next()")
	}
	for {
		select {
		case i := <-l.items:
			if i.Type == ItemEOF {
				return false
			}
			if i.Type == ItemError {
				l.err = errors.New(i.Value)
				return false
			}
			l.nextItem = i
			return true
		default:
			// No item buffered yet: run the current state function,
			// which emits items and returns the next state function.
			l.state = l.state(l)
		}
	}
}
|
|
|
|
// Error returns the error that made lexing fail, as stored by Next(),
// or nil when no error occurred.
func (l *Lexer) Error() error {
	return l.err
}
|
|
|
|
// Get returns the next lexer item, as reached by Next().
func (l *Lexer) Get() Item {
	return l.nextItem
}
|
|
|
|
// ToArray returns lexer items as an array.
|
|
// When an error occurs during scanning, a partial result will be
|
|
// returned, accompanied by the error that occurred.
|
|
func (l *Lexer) ToArray() ([]Item, error) {
|
|
var items []Item
|
|
for l.Next() {
|
|
items = append(items, l.Get())
|
|
}
|
|
return items, l.Error()
|
|
}
|
|
|
|
// pushState adds the state function to its stack.
// This is used for implementing nested parsing.
func (l *Lexer) pushState(state stateFn) {
	l.stack = append(l.stack, state)
}
|
|
|
|
// popState pops the last pushed state from its stack.
|
|
func (l *Lexer) popState() stateFn {
|
|
last := len(l.stack) - 1
|
|
head, tail := l.stack[:last], l.stack[last]
|
|
l.stack = head
|
|
return tail
|
|
}
|
|
|
|
// atEndOfFile returns true when there is no more data available in the input.
|
|
func (l *Lexer) atEndOfFile() bool {
|
|
return l.pos >= len(l.input)
|
|
}
|
|
|
|
// emit passes a lexer item back to the client, including the provided string.
// The string buffer is reset afterwards, ready for the next value.
// Note: the items channel has a small buffer; emit relies on Next()
// draining it between state-function invocations.
func (l *Lexer) emit(t itemType, s string) {
	l.items <- Item{t, s}
	l.buffer.Reset()
}
|
|
|
|
// emitLiteral passes a lexer item back to the client, including the accumulated
// string buffer data as a literal string (no escape-code interpretation).
func (l *Lexer) emitLiteral(t itemType) {
	l.emit(t, l.buffer.AsLiteralString())
}
|
|
|
|
// emitTrimmedLiteral passes a lexer item back to the client, including the
|
|
// accumulated string buffer data as a literal string with whitespace
|
|
// trimmed from it.
|
|
func (l *Lexer) emitTrimmedLiteral(t itemType) {
|
|
l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString()))
|
|
}
|
|
|
|
// emitInterpreted passes a lexer item back to the client, including the
// accumulated string buffer data as an interpreted string (handling escape
// codes like \n, \t, \uXXXX, etc.)
// This method might return an error, in case there is data in the
// string buffer that is not valid for string interpretation.
// On error, nothing is emitted and the buffer is left untouched.
func (l *Lexer) emitInterpreted(t itemType) error {
	s, err := l.buffer.AsInterpretedString()
	if err != nil {
		return err
	}
	l.emit(t, s)
	return nil
}
|
|
|
|
// backup steps back one rune
|
|
// Can be called only once per call of next.
|
|
func (l *Lexer) backup() {
|
|
l.pos -= l.width
|
|
}
|
|
|
|
// peek returns but does not advance to the next rune(s) in the input.
|
|
// Returns the rune, its width and a boolean. The boolean will be false in case
|
|
// no upcoming rune can be peeked (end of data or invalid UTF8 character).
|
|
func (l *Lexer) peek() (rune, int, bool) {
|
|
r, w := utf8.DecodeRuneInString(l.input[l.pos:])
|
|
switch {
|
|
case r == utf8.RuneError:
|
|
return utf8.RuneError, w, false
|
|
default:
|
|
return r, w, true
|
|
}
|
|
}
|
|
|
|
// peekMulti takes a peek at multiple upcoming runes in the input.
|
|
// Returns a slice of runes and a boolean. The boolean will be false in case
|
|
// less upcoming runes can be peeked than the requested amount
|
|
// (end of data or invalid UTF8 character).
|
|
func (l *Lexer) peekMulti(amount int) ([]rune, bool) {
|
|
offset := 0
|
|
var peeked []rune
|
|
for i := 0; i < amount; i++ {
|
|
r, w := utf8.DecodeRuneInString(l.input[l.pos+offset:])
|
|
switch {
|
|
case r == utf8.RuneError:
|
|
return peeked, false
|
|
default:
|
|
offset += w
|
|
peeked = append(peeked, r)
|
|
}
|
|
}
|
|
return peeked, true
|
|
}
|
|
|
|
// acceptNext adds the specified amount of runes from the input to the string buffer.
|
|
// If not enough runes could be read (end of file or invalid UTF8 data), then false is returned.
|
|
func (l *Lexer) acceptNext(count int) bool {
|
|
for i := 0; i < count; i++ {
|
|
r := l.next()
|
|
if r == endOfFile || r == utf8.RuneError {
|
|
return false
|
|
}
|
|
l.buffer.WriteRune(r)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// acceptFrom adds the next rune from the input to the string buffer
|
|
// when it matches in the provided runes. If the next rune does
|
|
// not match, false is returned.
|
|
func (l *Lexer) acceptFrom(runes string) bool {
|
|
r := l.next()
|
|
if strings.IndexRune(runes, r) >= 0 {
|
|
l.buffer.WriteRune(r)
|
|
return true
|
|
}
|
|
l.backup()
|
|
return false
|
|
}
|
|
|
|
// acceptRun adds consecutive runes from the input to the string
|
|
// buffer when they match the provided runes. If no runes were added
|
|
// at all, false it returned.
|
|
func (l *Lexer) acceptRun(runes string) bool {
|
|
accepted := false
|
|
for l.acceptFrom(runes) {
|
|
accepted = true
|
|
}
|
|
return accepted
|
|
}
|
|
|
|
// endOfFile is a sentinel returned by next() when no more input is
// available. -1 is never a valid rune, so it cannot collide with real
// input data.
// TODO(review): consider returning (rune, ok bool) from next() instead
// of this sentinel rune.
var endOfFile rune = -1
|
|
|
|
// next returns the next rune from the input and advances the scanning
// position past it. Returns endOfFile when the input is exhausted and
// utf8.RuneError on invalid UTF-8 data; in both cases the position is
// not advanced and width is zeroed, so a subsequent backup() is a no-op.
func (l *Lexer) next() rune {
	l.width = 0
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	switch {
	case r == utf8.RuneError && w == 0:
		// DecodeRuneInString yields width 0 only for an empty string,
		// i.e. the end of the input.
		return endOfFile
	case r == utf8.RuneError:
		// Width > 0 with RuneError means an invalid UTF-8 sequence.
		return utf8.RuneError
	default:
		l.width = w
		l.pos += w
		return r
	}
}
|
|
|
|
// skip skips a rune from the set of accepted runes.
|
|
// Returns true when a rune was skipped.
|
|
func (l *Lexer) skip(runes string) bool {
|
|
r, w, _ := l.peek()
|
|
if strings.IndexRune(runes, r) >= 0 {
|
|
l.pos += w
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// skipRun skips a run of runes from the set of accepted runes.
|
|
// Returns true when one or more runes were skipped.
|
|
func (l *Lexer) skipRun(runes string) bool {
|
|
didSkip := false
|
|
for l.skip(runes) {
|
|
didSkip = true
|
|
}
|
|
return didSkip
|
|
}
|
|
|
|
// accept consumes the next rune and returns true if it's from the valid
// set of runes. Otherwise the rune is unread and false is returned.
// NOTE(review): the previous comment claimed the rune is added to the
// string buffer, but the code does not touch l.buffer (unlike
// acceptFrom, which does) — confirm which behavior is intended.
func (l *Lexer) accept(runes string) bool {
	r := l.next()
	if strings.IndexRune(runes, r) >= 0 {
		return true
	}
	l.backup()
	return false
}
|
|
|
|
func (l *Lexer) upcoming(runes ...string) bool {
|
|
if peeked, ok := l.peekMulti(len(runes)); ok {
|
|
for i, r := range runes {
|
|
if strings.IndexRune(r, peeked[i]) < 0 {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// TODO(review): still needed now that there is a string buffer?
// acceptNot consumes the next rune if it's not from the set of runes.
// At end of file (or on a non-matching rune) the rune is unread and
// false is returned.
func (l *Lexer) acceptNot(runes string) bool {
	r := l.next()
	if r == endOfFile {
		// next() zeroes width at end of file, so this backup is a no-op;
		// kept for symmetry with the non-matching case below.
		l.backup()
		return false
	}
	if strings.IndexRune(runes, r) < 0 {
		return true
	}
	l.backup()
	return false
}
|
|
|
|
// acceptUntil consumes a run of runes until ones from the
|
|
// valid set is encountered.
|
|
func (l *Lexer) acceptUntil(runes string) bool {
|
|
accepted := false
|
|
for l.acceptNot(runes) {
|
|
accepted = true
|
|
}
|
|
return accepted
|
|
}
|
|
|
|
// acceptWhile consumes a run of runes from the set of accepted runes.
// Returns true when at least one rune was consumed.
// (The previous comment named this "acceptRun", which is a different
// method; note that acceptWhile uses accept() and therefore does not
// write to the string buffer.)
func (l *Lexer) acceptWhile(runes string) bool {
	accepted := false
	for l.accept(runes) {
		accepted = true
	}
	return accepted
}
|
|
|
|
// skipUntil skips a run of runes, until a rune from the set of
// runes or EOF is reached.
// NOTE(review): this delegates to acceptUntil, which writes the skipped
// runes into the string buffer via acceptNot — confirm that is intended
// for a "skip" operation.
func (l *Lexer) skipUntil(runes string) {
	l.acceptUntil(runes)
}
|
|
|
|
// errorf emits an error item, formatted per fmt.Sprintf, and terminates
// the scan by returning nil as the next state function.
// Next() turns the emitted ItemError into a false return plus a stored error.
func (l *Lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- Item{
		ItemError,
		fmt.Sprintf(format, args...),
	}
	return nil
}
|
|
|
|
// unexpectedInputError emits an error item describing what was found at
// the current position (end of file, invalid UTF-8 data, or the actual
// token) versus what the caller expected, and terminates the scan.
func (l *Lexer) unexpectedInputError(expected string) stateFn {
	var actual string
	switch {
	case l.atEndOfFile(): // TODO maybe not hit anymore after refactoring?
		actual = "end of file"
	case !utf8.ValidString(l.input[l.pos:]):
		actual = "non-UTF8 data"
	default:
		r, _, _ := l.peek()
		actual = fmt.Sprintf("token '%c'", r)
	}
	return l.errorf("Unexpected %s (expected %s)", actual, expected)
}
|
|
|
|
// unexpectedEndOfFile emits an error item reporting an unexpected end of
// file along with what the caller expected, and terminates the scan.
func (l *Lexer) unexpectedEndOfFile(expected string) stateFn {
	return l.errorf("Unexpected end of file (expected %s)", expected)
}
|