go-toml/lexer/states.go

package lexer

// stateFn represents the state of the scanner as a function
// that returns the next state. Every state in this file has this
// signature: it receives the *Lexer and hands back the state to run
// next. stateEndOfFile returns nil once ItemEOF has been emitted.
type stateFn func(*Lexer) stateFn

// Character sets used by the state functions. Each constant is a string
// listing the characters that belong to the set.
const (
	// Blank space and line endings.
	whitespace     string = " \t"
	carriageReturn string = "\r"
	newline        string = "\n"

	// Single-character punctuation.
	hash        string = "#"
	equal       string = "="
	dot         string = "."
	underscore  string = "_"
	dash        string = "-"
	backslash   string = "\\"
	singleQuote string = "'"
	doubleQuote string = "\""

	// Alphanumeric classes.
	lower  string = "abcdefghijklmnopqrstuvwxyz"
	upper  string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	digits string = "0123456789"

	// Composite sets assembled from the constants above.
	someQuote  string = singleQuote + doubleQuote
	bareKey    string = lower + upper + digits + underscore + dash
	startOfKey string = bareKey + someQuote
)

// stateKeyValuePair is the state the other states in this file return to
// after emitting a comment or a string value. It skips blanks and line
// endings, then dispatches on what comes next: a hash leads to
// stateComment, a character from startOfKey leads to stateKey, and
// anything else is handed to stateEndOfFile.
func stateKeyValuePair(l *Lexer) stateFn {
	l.skip(whitespace + carriageReturn + newline)
	switch {
	case l.upcoming(hash):
		return stateComment
	case l.upcoming(startOfKey):
		return stateKey
	default:
		return stateEndOfFile
	}
}

// A '#' hash symbol marks the rest of the line as a comment.
//
// stateComment collects input into the string builder until it reaches a
// newline or the end of the file, then emits the collected text as an
// ItemComment and returns to stateKeyValuePair. Carriage returns met
// along the way are accepted and ignored instead of being collected.
func stateComment(l *Lexer) stateFn {
	l.resetStringBuilder()
	for !l.atEndOfFile() && !l.accept(newline) {
		if l.accept(carriageReturn) {
			l.ignore()
			continue
		}
		l.addToString(l.next())
	}
	l.emit(ItemComment, l.getString())
	return stateKeyValuePair
}

// A key may be either bare, quoted or dotted.
//
// stateKey currently only recognises bare keys: when the upcoming input
// is not part of the bareKey set, an unexpected token error is returned.
// Dotted keys reach this state once per part (see stateEndOfKeyOrKeyDot).
func stateKey(l *Lexer) stateFn {
	if !l.upcoming(bareKey) {
		return l.unexpectedTokenError("a valid key name")
	}
	return stateBareKey
}

// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
//
// stateBareKey accepts a run of bareKey characters, emits the accepted
// text as an ItemKey and moves on to stateEndOfKeyOrKeyDot.
func stateBareKey(l *Lexer) stateFn {
	l.acceptWhile(bareKey)
	name := l.getAcceptedString()
	l.emit(ItemKey, name)
	return stateEndOfKeyOrKeyDot
}

// Dotted keys are a sequence of bare or quoted keys joined with a dot.
// This allows for grouping similar properties together.
//
// stateEndOfKeyOrKeyDot runs after a key part has been emitted. When a
// dot follows, an ItemKeyDot is emitted and the next key part is lexed
// by stateKey; otherwise the key is complete and stateKeyAssignment
// takes over.
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
	// Whitespace around dot-separated parts is ignored, however,
	// best practice is to not use any extraneous whitespace.
	l.skip(whitespace)
	if !l.accept(dot) {
		return stateKeyAssignment
	}
	l.emit(ItemKeyDot, ".")
	l.skip(whitespace)
	return stateKey
}

// Keys are on the left of the equals sign and values are on the right.
// Whitespace is ignored around key names and values. The key, equals
// sign, and value must be on the same line (though some values can
// be broken over multiple lines).
//
// stateKeyAssignment skips whitespace, requires an '=' and then skips
// whitespace again before handing over to stateValue. A missing '='
// yields an unexpected token error.
func stateKeyAssignment(l *Lexer) stateFn {
	l.skip(whitespace)
	if !l.accept(equal) {
		return l.unexpectedTokenError("an '=' value assignment")
	}
	l.skip(whitespace)
	return stateValue
}

// stateValue lexes the value on the right of an assignment. After
// skipping whitespace it only recognises values that start with a
// single or double quote, which it passes on to stateStringValue; any
// other input yields an unexpected token error.
func stateValue(l *Lexer) stateFn {
	l.skip(whitespace)
	if !l.upcoming(someQuote) {
		return l.unexpectedTokenError("a value")
	}
	return stateStringValue
}

// There are four ways to express strings: basic, multi-line basic, literal,
// and multi-line literal. All strings must contain only valid UTF-8 characters.
//
// stateStringValue only handles strings that open with a double quote:
// the quote is accepted and stateBasicStringValue decides between the
// basic forms. Any other input yields an unexpected token error.
func stateStringValue(l *Lexer) stateFn {
	if !l.accept(doubleQuote) {
		return l.unexpectedTokenError("a string value")
	}
	return stateBasicStringValue
}

// stateBasicStringValue runs right after the opening double quote of a
// string has been accepted and decides which kind of string follows:
//
//   - one more quote: the string is the empty string "", which is
//     emitted as an ItemString straight away;
//   - two more quotes: a """ multi-line string, handled by
//     stateMultiLineBasicString;
//   - anything else: a regular basic string, handled by stateBasicString.
func stateBasicStringValue(l *Lexer) stateFn {
	// The second accept is only attempted when the first one succeeded.
	secondQuote := l.accept(doubleQuote)
	thirdQuote := secondQuote && l.accept(doubleQuote)

	// Every path calls ignore once the quotes have been looked at;
	// presumably this drops the accepted quote characters so they do
	// not end up in a later token (confirm against Lexer.ignore).
	l.ignore()

	switch {
	case thirdQuote:
		return stateMultiLineBasicString
	case secondQuote:
		l.emit(ItemString, "")
		return stateKeyValuePair
	default:
		return stateBasicString
	}
}

// Basic strings are surrounded by quotation marks. Any Unicode character
// may be used except those that must be escaped: quotation mark, backslash,
// and the control characters (U+0000 to U+001F, U+007F).
//
// For convenience, some popular characters have a compact escape sequence:
//
//	\b         - backspace       (U+0008)
//	\t         - tab             (U+0009)
//	\n         - linefeed        (U+000A)
//	\f         - form feed       (U+000C)
//	\r         - carriage return (U+000D)
//	\"         - quote           (U+0022)
//	\\         - backslash       (U+005C)
//	\uXXXX     - unicode         (U+XXXX)
//	\UXXXXXXXX - unicode         (U+XXXXXXXX)
//
// Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms.
// The escape codes must be valid Unicode scalar values.
//
// All other escape sequences not listed above are reserved and,
// if used, TOML should produce an error.
//
// basicEscapes maps the character that follows the backslash of a compact
// escape sequence to the rune it stands for. The \uXXXX and \UXXXXXXXX
// forms have no entry in this table.
var basicEscapes = map[rune]rune{
	'b':  '\b',
	't':  '\t',
	'n':  '\n',
	'f':  '\f',
	'r':  '\r',
	'"':  '"',
	'\\': '\\',
}

// stateParseBasicString consumes the body of a basic string, up to and
// including the closing double quote, and adds its contents to the string
// builder. It does not emit anything itself: on the closing quote it
// returns the state obtained from l.popState(), so the caller decides
// what happens with the collected string (see stateBasicString).
//
// A backslash introduces an escape sequence. The character after it is
// looked up in basicEscapes and the mapped rune is added to the string;
// an unknown escape character yields an error. Reaching the end of the
// file before the closing quote, including directly after a backslash,
// is reported through l.unexpectedEndOfFile.
func stateParseBasicString(l *Lexer) stateFn {
	for {
		switch {
		case l.atEndOfFile():
			return l.unexpectedEndOfFile("basic string token")
		case l.accept(doubleQuote):
			return l.popState()
		case l.accept(backslash):
			// A backslash as the very last rune of the input is a
			// truncated string, not an invalid escape sequence: report
			// it the same way as any other unterminated string instead
			// of reading past the end of the input.
			if l.atEndOfFile() {
				return l.unexpectedEndOfFile("basic string token")
			}
			r := l.next()
			escaped, ok := basicEscapes[r]
			if !ok {
				return l.errorf("Invalid escape sequence \\%c in string value", r)
			}
			l.addToString(escaped)
		default:
			l.addToString(l.next())
		}
	}
}

// stateBasicString lexes a single-line basic string whose opening quote
// has already been consumed. It resets the string builder, pushes a
// continuation state that emits the collected text as an ItemString and
// returns to stateKeyValuePair, and then delegates the actual scanning
// to stateParseBasicString, which pops that continuation when it reaches
// the closing quote.
func stateBasicString(l *Lexer) stateFn {
	emitString := func(lx *Lexer) stateFn {
		lx.emit(ItemString, lx.getString())
		return stateKeyValuePair
	}
	l.resetStringBuilder()
	l.pushState(emitString)
	return stateParseBasicString
}

// stateMultiLineBasicString is entered from stateBasicStringValue after
// three consecutive double quotes have been accepted. Multi-line basic
// strings are not supported yet, so this state only reports an error
// through l.errorf.
func stateMultiLineBasicString(l *Lexer) stateFn {
	return l.errorf("Not yet implemented")
}

// stateEndOfFile is reached from stateKeyValuePair when the upcoming input
// starts neither a comment nor a key. If the next character is the end of
// the file, ItemEOF is emitted and nil is returned as the next state;
// any other input yields an unexpected token error.
func stateEndOfFile(l *Lexer) stateFn {
	if l.peek() != endOfFile {
		return l.unexpectedTokenError("end of file")
	}
	l.emit(ItemEOF, "EOF")
	return nil
}