go-toml/lexer/states.go

package lexer

// stateFn represents one state of the lexer as a function. A state function
// receives the Lexer, does its work on it, and returns the state function
// to continue with. stateEndOfFile returns nil, which presumably tells the
// driving loop (not part of this file) to stop.
type stateFn func(*Lexer) stateFn

// Character sets and single-character tokens used by the state functions in
// this file. Every constant is a plain string; the states pass them to the
// Lexer's matching helpers (upcoming, accept, acceptRun, skipMatching and
// skipConsecutive).
//
//   - whitespace, carriageReturn, newline: blanks and line breaks.
//   - hash, equal, dot: the comment marker, the key/value separator and the
//     dotted-key separator.
//   - lower, upper, digits, underscore, dash: combined into bareKeyChars,
//     the characters a bare key may consist of.
//   - hex: the digits used in the \uXXXX and \UXXXXXXXX escapes.
//   - singleQuote, doubleQuote: combined into quoteChars; startOfKey is
//     bareKeyChars plus quoteChars.
//   - backslash, escapeChars: the escape introducer and the characters that
//     complete a two-character escape (\b \t \n \f \r \" \\).
//   - shortUtf8Escape, longUtf8Escape: the letters that select the 4- and
//     8-hex-digit unicode escapes.
const (
	whitespace      string = " \t"
	carriageReturn  string = "\r"
	newline         string = "\n"
	hash            string = "#"
	equal           string = "="
	lower           string = "abcdefghijklmnopqrstuvwxyz"
	upper           string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	digits          string = "0123456789"
	hex             string = digits + "abcdefABCDEF"
	dot             string = "."
	underscore      string = "_"
	dash            string = "-"
	singleQuote     string = "'"
	doubleQuote     string = "\""
	backslash       string = "\\"
	quoteChars      string = singleQuote + doubleQuote
	bareKeyChars    string = lower + upper + digits + underscore + dash
	startOfKey      string = bareKeyChars + quoteChars
	escapeChars     string = `btnfr"\`
	shortUtf8Escape string = "u"
	longUtf8Escape  string = "U"
)

// stateKeyValuePair is the state at the start of a key/value pair. After
// skipping blanks and line breaks it dispatches on what comes next:
// a '#' (which is skipped) leads to stateComment, a character from
// startOfKey leads to stateKey, and anything else is handed to
// stateEndOfFile.
func stateKeyValuePair(l *Lexer) stateFn {
	l.skipConsecutive(whitespace + carriageReturn + newline)
	switch {
	case l.skipMatching(hash):
		return stateComment
	case l.upcoming(startOfKey):
		return stateKey
	default:
		return stateEndOfFile
	}
}

// stateComment lexes a comment. A '#' hash symbol marks the rest of the
// line as a comment; the hash itself has already been skipped by
// stateKeyValuePair. Input is accepted one step at a time until a newline
// is skipped or the end of the file is reached, after which the collected
// text is emitted as an ItemComment and lexing resumes in
// stateKeyValuePair. If input cannot be accepted, an unexpected-input
// error state is returned instead.
func stateComment(l *Lexer) stateFn {
	// The end-of-file check comes first so that skipMatching is only
	// attempted while there is input left.
	for !l.atEndOfFile() && !l.skipMatching(newline) {
		if !l.acceptNext(1) {
			return l.unexpectedInputError("comment")
		}
	}
	l.emitTrimmedLiteral(ItemComment)
	return stateKeyValuePair
}

// stateKey begins lexing a key. When the next character is a bare-key
// character it is accepted and lexing continues in statebareKeyChars;
// otherwise an unexpected-input error state is returned.
//
// NOTE(review): startOfKey also contains the quote characters, but only
// bare keys are handled here, so a key that starts with a quote ends up in
// the error path below. Quoted keys appear not to be implemented yet.
func stateKey(l *Lexer) stateFn {
	if !l.accept(bareKeyChars) {
		return l.unexpectedInputError("a valid key name")
	}
	return statebareKeyChars
}

// statebareKeyChars lexes the remainder of a bare key and emits it as an
// ItemKey, then continues with stateEndOfKeyOrKeyDot.
//
// Bare keys may only contain ASCII letters, ASCII digits,
// underscores, and dashes (A-Za-z0-9_-). Note that bare
// keys are allowed to be composed of only ASCII digits,
// e.g. 1234, but are always interpreted as strings.
func statebareKeyChars(l *Lexer) stateFn {
	// stateKey has already accepted the first character of the key.
	l.acceptRun(bareKeyChars)
	l.emitLiteral(ItemKey)
	return stateEndOfKeyOrKeyDot
}

// stateEndOfKeyOrKeyDot decides what follows a key part. Dotted keys are a
// sequence of keys joined with a dot, which allows similar properties to
// be grouped together. When a dot follows, an ItemKeyDot is emitted and
// the next key part is lexed by stateKey; otherwise the key is complete
// and stateKeyAssignment takes over.
func stateEndOfKeyOrKeyDot(l *Lexer) stateFn {
	// Whitespace around dot-separated parts is ignored, although best
	// practice is not to use any extraneous whitespace.
	l.skipConsecutive(whitespace)
	if !l.skipMatching(dot) {
		return stateKeyAssignment
	}
	l.emit(ItemKeyDot, "")
	l.skipConsecutive(whitespace)
	return stateKey
}

// stateKeyAssignment lexes the equals sign between a key and its value.
// Keys are on the left of the equals sign and values are on the right;
// whitespace around both is ignored. Only blanks are skipped here, not
// line breaks, because the key, equals sign and value must be on the same
// line (though some values can be broken over multiple lines).
//
// On success an ItemAssignment is emitted and stateValue is returned;
// without an equals sign an unexpected-input error state is returned.
func stateKeyAssignment(l *Lexer) stateFn {
	l.skipConsecutive(whitespace)
	if !l.skipMatching(equal) {
		return l.unexpectedInputError("a value assignment")
	}
	l.emit(ItemAssignment, "")
	l.skipConsecutive(whitespace)
	return stateValue
}

// stateValue dispatches on the first character of a value. TOML values
// must be one of String, Integer, Float, Boolean, Datetime, Array or
// Inline Table; unspecified values are invalid. Of these, only values
// that start with a quote character are recognised here (they continue in
// stateStringValue); any other input yields an unexpected-input error
// state.
func stateValue(l *Lexer) stateFn {
	l.skipConsecutive(whitespace)
	if !l.upcoming(quoteChars) {
		return l.unexpectedInputError("a value")
	}
	return stateStringValue
}

// stateStringValue selects the kind of string being lexed. TOML has four
// ways to express strings: basic, multi-line basic, literal, and
// multi-line literal; all of them must contain only valid UTF-8. Only the
// two double-quoted forms are dispatched here; anything else yields an
// unexpected-input error state.
func stateStringValue(l *Lexer) stateFn {
	// Multi-line basic strings are surrounded by three quotation marks on
	// each side. This must be tried before the single-quote case, which
	// would otherwise match the first of the three marks.
	if l.skipMatching(doubleQuote, doubleQuote, doubleQuote) {
		return stateMultiLineBasicString
	}
	// Basic strings are surrounded by quotation marks.
	if l.skipMatching(doubleQuote) {
		return stateBasicStringValue
	}
	return l.unexpectedInputError("a string value")
}

// stateBasicStringValue is entered after stateStringValue has skipped a
// single opening quotation mark. If two more quotation marks follow, it
// continues with stateMultiLineBasicString; otherwise the input is lexed
// as a basic string by stateBasicString.
//
// NOTE(review): stateStringValue already tests for three quotation marks
// before it tests for one, so the multi-line branch looks unreachable from
// that caller. Confirm before relying on it.
func stateBasicStringValue(l *Lexer) stateFn {
	if !l.upcoming(doubleQuote, doubleQuote) {
		return stateBasicString
	}
	return stateMultiLineBasicString
}

// invalidBasicStringCharacters lists the control characters U+0000 to
// U+001F plus U+007F. stateParseBasicString reports an error when one of
// them appears unescaped inside a basic string.
//
// NOTE(review): the set includes tab (U+0009). Later revisions of the TOML
// specification allow a literal tab in basic strings — confirm which
// version of the specification this lexer targets.
const invalidBasicStringCharacters string = "" +
	"\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" +
	"\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" +
	"\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" +
	"\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" +
	"\u007F"

// stateParseBasicString consumes the body of a basic string, up to and
// including the closing quotation mark, and then continues with the state
// returned by l.popState() (stateBasicString pushes that state before
// handing over to this one).
//
// The cases are order-sensitive: the closing quote is tested before
// anything else, the three valid escape forms are tested before the
// catch-all backslash case that rejects every other escape, and raw
// control characters are rejected before the default case accepts a
// character. Reaching the end of the file before the closing quote is an
// error.
func stateParseBasicString(l *Lexer) stateFn {
	for {
		switch {
		case l.atEndOfFile():
			return l.unexpectedEndOfFile("basic string token")
		case l.skipMatching(doubleQuote):
			return l.popState()
		case l.upcoming(backslash, escapeChars):
			// For convenience, some popular characters have a compact escape sequence.
			// \b         - backspace       (U+0008)
			// \t         - tab             (U+0009)
			// \n         - linefeed        (U+000A)
			// \f         - form feed       (U+000C)
			// \r         - carriage return (U+000D)
			// \"         - quote           (U+0022)
			// \\         - backslash       (U+005C)
			// The result of acceptNext is not checked here; l.upcoming
			// has just matched these two characters.
			l.acceptNext(2)
		case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex):
			// \uXXXX     - unicode         (U+XXXX)
			l.acceptNext(6)
		case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex):
			// \UXXXXXXXX - unicode         (U+XXXXXXXX)
			l.acceptNext(10)
		case l.upcoming(backslash):
			// All other escape sequences not listed above are reserved and,
			// if used, TOML should produce an error.
			return l.errorf("Invalid escape sequence in basic string")
		case l.upcoming(invalidBasicStringCharacters):
			// Any Unicode character may be used except those that must be escaped:
			// quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F).
			// The second result of next is ignored: l.upcoming has just
			// matched a character, so one is presumably available.
			r, _ := l.next()
			return l.errorf("Invalid character in basic string: %q", r)
		default:
			// Any other character is part of the string.
			if !l.acceptNext(1) {
				return l.unexpectedInputError("string value")
			}
		}
	}
}

// stateBasicString lexes a basic string. It pushes a continuation state
// and hands over to stateParseBasicString, which consumes the string body
// and calls l.popState() at the closing quotation mark (presumably
// returning the state pushed here).
//
// The continuation emits the collected text as an ItemString through
// l.emitInterpreted. If that fails, an error state is returned; otherwise
// lexing resumes in stateKeyValuePair.
func stateBasicString(l *Lexer) stateFn {
	emitString := func(lx *Lexer) stateFn {
		if err := lx.emitInterpreted(ItemString); err != nil {
			return lx.errorf("Invalid data in string: %s", err)
		}
		return stateKeyValuePair
	}
	l.pushState(emitString)
	return stateParseBasicString
}

// stateMultiLineBasicString is a placeholder for lexing multi-line basic
// strings (those surrounded by three quotation marks on each side). It is
// not implemented yet and always returns the error state produced by
// l.errorf.
func stateMultiLineBasicString(l *Lexer) stateFn {
	return l.errorf("Not yet implemented")
}

// stateEndOfFile is the final state of the lexer. It is entered from
// stateKeyValuePair when the upcoming input is neither a comment nor the
// start of a key.
//
// At the end of the input it emits ItemEOF and returns nil. Any remaining
// input is unexpected: in that case the state produced by
// l.unexpectedInputError is returned, as every other state in this file
// does, rather than being discarded.
func stateEndOfFile(l *Lexer) stateFn {
	if !l.atEndOfFile() {
		return l.unexpectedInputError("end of file")
	}
	l.emit(ItemEOF, "EOF")
	return nil
}