package lexer // stateFn represents the state of the scanner as a function // that returns the next state. type stateFn func(*Lexer) stateFn const ( whitespace string = " \t" carriageReturn string = "\r" newline string = "\n" hash string = "#" equal string = "=" lower string = "abcdefghijklmnopqrstuvwxyz" upper string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" digits string = "0123456789" dot string = "." underscore string = "_" dash string = "-" singleQuote string = "'" doubleQuote string = "\"" backslash string = "\\" someQuote string = singleQuote + doubleQuote bareKey string = lower + upper + digits + underscore + dash startOfKey string = bareKey + someQuote ) func stateKeyValuePair(l *Lexer) stateFn { l.skip(whitespace + carriageReturn + newline) if l.upcoming(hash) { return stateComment } if l.upcoming(startOfKey) { return stateKey } return stateEndOfFile } // A '#' hash symbol marks the rest of the line as a comment. func stateComment(l *Lexer) stateFn { l.resetStringBuilder() for { switch { case l.atEndOfFile() || l.accept(newline): l.emit(ItemComment, l.getString()) return stateKeyValuePair case l.accept(carriageReturn): l.ignore() default: l.addToString(l.next()) } } } // A key may be either bare, quoted or dotted. func stateKey(l *Lexer) stateFn { if l.upcoming(bareKey) { return stateBareKey } return l.unexpectedTokenError("a valid key name") } // Bare keys may only contain ASCII letters, ASCII digits, // underscores, and dashes (A-Za-z0-9_-). Note that bare // keys are allowed to be composed of only ASCII digits, // e.g. 1234, but are always interpreted as strings. func stateBareKey(l *Lexer) stateFn { l.acceptWhile(bareKey) l.emit(ItemKey, l.getAcceptedString()) return stateEndOfKeyOrKeyDot } // Dotted keys are a sequence of bare or quoted keys joined with a dot. // This allows for grouping similar properties together: func stateEndOfKeyOrKeyDot(l *Lexer) stateFn { // Whitespace around dot-separated parts is ignored, however, // best practice is to not use any extraneous whitespace. l.skip(whitespace) if l.accept(dot) { l.emit(ItemKeyDot, ".") l.skip(whitespace) return stateKey } return stateKeyAssignment } // Keys are on the left of the equals sign and values are on the right. // Whitespace is ignored around key names and values. The key, equals // sign, and value must be on the same line (though some values can // be broken over multiple lines). func stateKeyAssignment(l *Lexer) stateFn { l.skip(whitespace) if l.accept(equal) { l.skip(whitespace) return stateValue } return l.unexpectedTokenError("an '=' value assignment") } func stateValue(l *Lexer) stateFn { l.skip(whitespace) if l.upcoming(someQuote) { return stateStringValue } return l.unexpectedTokenError("a value") } // There are four ways to express strings: basic, multi-line basic, literal, // and multi-line literal. All strings must contain only valid UTF-8 characters. func stateStringValue(l *Lexer) stateFn { if l.accept(doubleQuote) { return stateBasicStringValue } return l.unexpectedTokenError("a string value") } func stateBasicStringValue(l *Lexer) stateFn { // Possibly a """ multi-line string start, // possibly the end of an "" empty string. if l.accept(doubleQuote) { // A """ multi-line string. if l.accept(doubleQuote) { l.ignore() return stateMultiLineBasicString } // An "" empty string. l.ignore() l.emit(ItemString, "") return stateKeyValuePair } l.ignore() return stateBasicString } // Basic strings are surrounded by quotation marks. Any Unicode character // may be used except those that must be escaped: quotation mark, backslash, // and the control characters (U+0000 to U+001F, U+007F). // // For convenience, some popular characters have a compact escape sequence. // // \b - backspace (U+0008) // \t - tab (U+0009) // \n - linefeed (U+000A) // \f - form feed (U+000C) // \r - carriage return (U+000D) // \" - quote (U+0022) // \\ - backslash (U+005C) // \uXXXX - unicode (U+XXXX) // \UXXXXXXXX - unicode (U+XXXXXXXX) // // Any Unicode character may be escaped with the \uXXXX or \UXXXXXXXX forms. // The escape codes must be valid Unicode scalar values. // // All other escape sequences not listed above are reserved and, // if used, TOML should produce an error. var basicEscapes = map[rune]rune{ 'b': rune(8), 't': rune(9), 'n': rune(10), 'f': rune(12), 'r': rune(13), '"': rune(34), '\\': rune(92), } func stateParseBasicString(l *Lexer) stateFn { for { switch { case l.atEndOfFile(): return l.unexpectedEndOfFile("basic string token") case l.accept(doubleQuote): return l.popState() case l.accept(backslash): r := l.next() if escaped, ok := basicEscapes[r]; ok { l.addToString(escaped) } else { return l.errorf("Invalid escape sequence \\%c in string value", r) } default: l.addToString(l.next()) } } } func stateBasicString(l *Lexer) stateFn { l.resetStringBuilder() l.pushState(func(l *Lexer) stateFn { l.emit(ItemString, l.getString()) return stateKeyValuePair }) return stateParseBasicString } func stateMultiLineBasicString(l *Lexer) stateFn { return l.errorf("Not yet implemented") } func stateEndOfFile(l *Lexer) stateFn { i := l.peek() if i == endOfFile { l.emit(ItemEOF, "EOF") return nil } return l.unexpectedTokenError("end of file") }