package lexer // stateFn represents the state of the lexer as a function // that returns the next state. type stateFn func(*Lexer) stateFn const ( whitespace string = " \t" carriageReturn string = "\r" newline string = "\n" hash string = "#" equal string = "=" lower string = "abcdefghijklmnopqrstuvwxyz" upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" digits string = "0123456789" hex string = digits + "abcdefABCDEF" dot string = "." underscore string = "_" dash string = "-" singleQuote string = "'" doubleQuote string = "\"" backslash string = "\\" quoteChars string = singleQuote + doubleQuote bareKeyChars string = lower + upper + digits + underscore + dash startOfKey string = bareKeyChars + quoteChars escapeChars string = `btnfr"\` shortUtf8Escape string = "u" longUtf8Escape string = "U" ) func stateKeyValuePair(l *Lexer) stateFn { l.skipRun(whitespace + carriageReturn + newline) if l.skip(hash) { return stateComment } if l.upcoming(startOfKey) { return stateKey } return stateEndOfFile } // A '#' hash symbol marks the rest of the line as a comment. func stateComment(l *Lexer) stateFn { for { switch { case l.atEndOfFile() || l.skip(newline): l.emitTrimmedLiteral(ItemComment) return stateKeyValuePair default: if !l.acceptNext(1) { return nil } } } } // A key may be either bare, quoted or dotted. func stateKey(l *Lexer) stateFn { if l.acceptFrom(bareKeyChars) { return statebareKeyChars } return l.unexpectedInputError("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. func statebareKeyChars(l *Lexer) stateFn { l.acceptRun(bareKeyChars) l.emitLiteral(ItemKey) return stateEndOfKeyOrKeyDot } // Dotted keys are a sequence of bare or quoted keys joined with a dot. // This allows for grouping similar properties together: func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. l.skipRun(whitespace) if l.skip(dot) { l.emit(ItemKeyDot, "") l.skipRun(whitespace) return stateKey } return stateKeyAssignment } // Keys are on the left of the equals sign and values are on the right. // Whitespace is ignored around key names and values. The key, equals // sign, and value must be on the same line (though some values can // be broken over multiple lines). func stateKeyAssignment(l *Lexer) stateFn { l.skipRun(whitespace) if l.skip(equal) { l.emit(ItemAssignment, "") l.skipRun(whitespace) return stateValue } return l.unexpectedInputError("a value assignment") } // Values must be of the following types: String, Integer, Float, Boolean, // Datetime, Array, or Inline Table. Unspecified values are invalid. func stateValue(l *Lexer) stateFn { l.skipRun(whitespace) if l.upcoming(quoteChars) { return stateStringValue } return l.unexpectedInputError("a value") } // There are four ways to express strings: basic, multi-line basic, literal, // and multi-line literal. All strings must contain only valid UTF-8 characters. func stateStringValue(l *Lexer) stateFn { // Basic strings are surrounded by quotation marks. if l.skip(doubleQuote) { return stateBasicStringValue } return l.unexpectedInputError("a string value") } func stateBasicStringValue(l *Lexer) stateFn { // Possibly a """ multi-line string start, // possibly the end of an "" empty string. if l.skip(doubleQuote) { // It's a """ multi-line string. if l.skip(doubleQuote) { return stateMultiLineBasicString } // It's an "" empty string. l.emit(ItemString, "") return stateKeyValuePair } return stateBasicString } const invalidBasicStringCharacters string = "" + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + "\u0008\u0009\u000A\u000B\u000C\u000D\u000E\u000F" + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + "\u007F" func stateParseBasicString(l *Lexer) stateFn { for { switch { case l.atEndOfFile(): return l.unexpectedEndOfFile("basic string token") case l.skip(doubleQuote): return l.popState() case l.upcoming(backslash, escapeChars): // For convenience, some popular characters have a compact escape sequence. // \b - backspace (U+0008) // \t - tab (U+0009) // \n - linefeed (U+000A) // \f - form feed (U+000C) // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) l.acceptNext(2) case l.upcoming(backslash, shortUtf8Escape, hex, hex, hex, hex): // \uXXXX - unicode (U+XXXX) l.acceptNext(6) case l.upcoming(backslash, longUtf8Escape, hex, hex, hex, hex, hex, hex, hex, hex): // \UXXXXXXXX - unicode (U+XXXXXXXX) l.acceptNext(10) case l.upcoming(backslash): // All other escape sequences not listed above are reserved and, // if used, TOML should produce an error. return l.errorf("Invalid escape sequence in basic string") case l.upcoming(invalidBasicStringCharacters): // Any Unicode character may be used except those that must be escaped: // quotation mark, backslash, and the control characters (U+0000 to U+001F, U+007F). return l.errorf("Invalid character in basic string: %q", l.next()) default: l.acceptNext(1) } } } func stateBasicString(l *Lexer) stateFn { l.pushState(func(l *Lexer) stateFn { err := l.emitInterpreted(ItemString) if err != nil { return l.errorf("Invalid data in string: %s", err) } return stateKeyValuePair }) return stateParseBasicString } func stateMultiLineBasicString(l *Lexer) stateFn { return l.errorf("Not yet implemented") } func stateEndOfFile(l *Lexer) stateFn { if l.atEndOfFile() { l.emit(ItemEOF, "EOF") return nil } return l.unexpectedInputError("end of file") }