From e1ef9df7ca061731032e5b4e43266c6256ae3099 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Mon, 17 Jun 2019 23:25:39 +0000 Subject: [PATCH] Implemented all string types and key types --- keyvaluepair.go | 29 +++--- keyvaluepair_test.go | 25 +++--- value_string.go | 205 ++++++++++++++++++++++++++++++++++++++----- value_string_test.go | 65 +++++++++++++- 4 files changed, 273 insertions(+), 51 deletions(-) diff --git a/keyvaluepair.go b/keyvaluepair.go index 7e22be0..5feb8d5 100644 --- a/keyvaluepair.go +++ b/keyvaluepair.go @@ -56,19 +56,24 @@ func (t *parser) startKeyValuePair(p *parse.API) { } func (t *parser) startKey(p *parse.API) { - if p.Peek(bareKeyRune) { - p.Handle(t.startBareKey) - } else { - p.Expected("a key name") - } -} - -func (t *parser) startBareKey(p *parse.API) { - if p.Accept(bareKey) { - t.emitCommand(cKey, p.Result().String()) + endFunc := func(str string) { + t.emitCommand(cKey, str) p.Handle(t.endOfKeyOrDot) - } else { - p.Expected("a bare key name") + } + + switch { + case p.Accept(bareKey): + endFunc(p.Result().String()) + case p.Peek(a.SingleQuote): + if str, ok := t.parseLiteralString("key", p); ok { + endFunc(str) + } + case p.Peek(a.DoubleQuote): + if str, ok := t.parseBasicString("key", p); ok { + endFunc(str) + } + default: + p.Expected("a key name") } } diff --git a/keyvaluepair_test.go b/keyvaluepair_test.go index ea32761..1187938 100644 --- a/keyvaluepair_test.go +++ b/keyvaluepair_test.go @@ -5,16 +5,7 @@ import "testing" func TestKey(t *testing.T) { for _, test := range []parseTest{ {"", []string{`Error: unexpected end of file (expected a key name) at start of file`}}, - {"barekey", []string{`key("barekey")`}}, - } { - p := &parser{} - testParseHandler(t, p, p.startKey, test) - } -} - -func TestBareKey(t *testing.T) { - for _, test := range []parseTest{ - {"", []string{`Error: unexpected end of file (expected a bare key name) at start of file`}}, + // Bare key tests {"barekey", []string{`key("barekey")`}}, {"1234567", 
[]string{`key("1234567")`}}, {"mix-12_34", []string{`key("mix-12_34")`}}, @@ -24,9 +15,21 @@ func TestBareKey(t *testing.T) { {"key1.key2", []string{`key("key1")`, `keydot()`, `key("key2")`}}, {"key . with . spaces", []string{`key("key")`, `keydot()`, `key("with")`, `keydot()`, `key("spaces")`}}, {"key \t . \twithtabs\t . \tandspaces", []string{`key("key")`, `keydot()`, `key("withtabs")`, `keydot()`, `key("andspaces")`}}, + // Single quoted key tests + {"''", []string{`key("")`}}, + {"'single quoted'", []string{`key("single quoted")`}}, + {`'escape\s are literal'`, []string{`key("escape\\s are literal")`}}, + {`'"using inner quotes"'`, []string{`key("\"using inner quotes\"")`}}, + // Double quoted key tests + {`""`, []string{`key("")`}}, + {`"double quoted"`, []string{`key("double quoted")`}}, + {`"escapes are in\terpreted"`, []string{`key("escapes are in\terpreted")`}}, + {`"using 'inner' \"quotes\""`, []string{`key("using 'inner' \"quotes\"")`}}, + // Mixed key types + {`this.'i\s'."madness\t".''`, []string{`key("this")`, `keydot()`, `key("i\\s")`, `keydot()`, `key("madness\t")`, `keydot()`, `key("")`}}, } { p := &parser{} - testParseHandler(t, p, p.startBareKey, test) + testParseHandler(t, p, p.startKey, test) } } diff --git a/value_string.go b/value_string.go index 4123a0a..43d39a3 100644 --- a/value_string.go +++ b/value_string.go @@ -7,17 +7,15 @@ import ( ) var ( - // There are four ways to express strings: basic, multi-line basic, - // literal, and multi-line literal. All strings must contain only valid - // UTF-8 characters. * Multi-line basic strings are surrounded by three - // quotation marks on each side. * Basic strings are surrounded by - // quotation marks. + // Multi-line basic strings are surrounded by three quotation marks on each + // side and allow newlines. 
doubleQuote3 = a.Str(`"""`) - // Any Unicode character may be used except those that must be escaped: - // quotation mark, backslash, and the control characters (U+0000 to - // U+001F, U+007F). - charThatMustBeEscaped = a.RuneRange('\u0000', '\u001F').Or(a.Rune('\u007F')) + // Multi-line literal strings are surrounded by three single quotes on each side and allow newlines. + singleQuote3 = a.Str(`'''`) + + // Control characters as defined by TOML (U+0000 to U+001F, U+007F) + controlCharacter = a.RuneRange('\u0000', '\u001F').Or(a.Rune('\u007F')) // For convenience, some popular characters have a compact escape sequence. // @@ -30,46 +28,170 @@ var ( // \\ - backslash (U+005C) // \uXXXX - unicode (U+XXXX) // \UXXXXXXXX - unicode (U+XXXXXXXX) - validEscapeChar = c.Any(a.Runes('b', 't', 'n', 'f', 'r'), a.DoubleQuote, a.Backslash) + validEscapeChar = a.Runes('b', 't', 'n', 'f', 'r', '"', '\\') shortEscape = c.Seq(a.Backslash, validEscapeChar) shortUTF8Escape = c.Seq(a.Backslash, a.Rune('u'), a.HexDigit.Times(4)) longUTF8Escape = c.Seq(a.Backslash, a.Rune('U'), a.HexDigit.Times(8)) validEscape = c.Any(shortEscape, shortUTF8Escape, longUTF8Escape) + + // For writing long strings without introducing extraneous whitespace, use a + // "line ending backslash". When the last non-whitespace character on a line is + // a \, it will be trimmed along with all whitespace (including newlines) up to + // the next non-whitespace character or closing delimiter. + lineEndingBackslash = a.Backslash. + Then(c.ZeroOrMore(a.Blanks)). + Then(a.Newline). + Then(c.ZeroOrMore(a.Whitespace)) ) +// There are four ways to express strings: basic, multi-line basic, literal and +// multi-line literal. All strings must contain only valid UTF-8 characters. 
func (t *parser) startString(p *parse.API) { switch { case p.Peek(doubleQuote3): p.Handle(t.startMultiLineBasicString) case p.Peek(a.DoubleQuote): p.Handle(t.startBasicString) + case p.Peek(singleQuote3): + p.Handle(t.startMultiLineLiteralString) + case p.Peek(a.SingleQuote): + p.Handle(t.startLiteralString) default: p.Expected("a string value") } } // Specific handling of input for basic strings. -// * A double quote ends the string -// * No additional \escape sequences are allowed. What the spec say about this: -// "All other escape sequences [..] are reserved and, if used, TOML should -// produce an error."" +// +// • Basic strings are surrounded by quotation marks. +// +// • Any Unicode character may be used except those that must be escaped: +// quotation mark, backslash, and the control characters (U+0000 to +// U+001F, U+007F). +// +// • No additional \escape sequences are allowed. What the spec says about this: +// "All other escape sequences [..] are reserved and, if used, TOML should +// produce an error." func (t *parser) startBasicString(p *parse.API) { + if str, ok := t.parseBasicString("basic string", p); ok { + t.emitCommand(csetStrVal, str) + } +} + +func (t *parser) parseBasicString(name string, p *parse.API) (string, bool) { if !p.Accept(a.DoubleQuote) { - p.Expected("a basic string") + p.Expected(`opening quotation marks`) + return "", false + } + sb := &strings.Builder{} + for { + switch { + case p.Peek(controlCharacter): + p.Error("invalid character in %s: %q (must be escaped)", name, p.Result().Rune(0)) + return sb.String(), false + case p.Accept(tok.StrInterpreted(nil, c.OneOrMore(validEscape))): + sb.WriteString(p.Result().Value(0).(string)) + case p.Peek(a.Backslash): + p.Error("invalid escape sequence") + return sb.String(), false + case p.Accept(m.Drop(a.DoubleQuote)): + return sb.String(), true + case p.Accept(a.ValidRune): + sb.WriteString(p.Result().String()) + case p.Peek(a.InvalidRune): + p.Error("invalid UTF8 rune") + return
sb.String(), false + default: + p.Expected(`closing quotation marks`) + return sb.String(), false + } + } +} + +// Specific handling of input for literal strings. +// +// • Literal strings are surrounded by single quotes. +// +// • Like basic strings, they must appear on a single line. +// +// • Control characters other than tab are not permitted in a literal string. +func (t *parser) startLiteralString(p *parse.API) { + if str, ok := t.parseLiteralString("literal string", p); ok { + t.emitCommand(csetStrVal, str) + } +} + +func (t *parser) parseLiteralString(name string, p *parse.API) (string, bool) { + if !p.Accept(a.SingleQuote) { + p.Expected("opening single quote") + return "", false + } + sb := &strings.Builder{} + for { + switch { + case p.Accept(m.Drop(a.SingleQuote)): + return sb.String(), true + case p.Accept(a.Tab): + sb.WriteString("\t") + case p.Peek(controlCharacter): + p.Error("invalid character in %s: %q (no control chars allowed, except for tab)", name, p.Result().Rune(0)) + return sb.String(), false + case p.Accept(a.ValidRune): + sb.WriteString(p.Result().String()) + case p.Peek(a.InvalidRune): + p.Error("invalid UTF8 rune") + return sb.String(), false + default: + p.Expected("closing single quote") + return sb.String(), false + } + } +} + +// Specific handling of input for multi-line basic strings. +// +// • Multi-line basic strings are surrounded by three quotation marks on +// each side and allow newlines. +// +// • A newline immediately following the opening delimiter will be trimmed. +// All other whitespace and newline characters remain intact. +// +// • TOML parsers should feel free to normalize newline to whatever makes +// sense for their platform. +// +// • All of the escape sequences that are valid for basic strings are also valid +// for multi-line basic strings. +// +// • Any Unicode character may be used except those that must be escaped: +// backslash and the control characters (U+0000 to U+001F, U+007F). 
Quotation +// marks need not be escaped unless their presence would create a premature +// closing delimiter. +// +// • For writing long strings without introducing extraneous whitespace, use a +// "line ending backslash". When the last non-whitespace character on a line is +// a \, it will be trimmed along with all whitespace (including newlines) up to +// the next non-whitespace character or closing delimiter. +func (t *parser) startMultiLineBasicString(p *parse.API) { + if !p.Accept(doubleQuote3.Then(a.Newline.Optional())) { + p.Expected("opening three quotation marks") return } sb := &strings.Builder{} for { switch { - case p.Peek(charThatMustBeEscaped): - p.Error("invalid character in basic string: %q (must be escaped)", p.Result().Rune(0)) + case p.Accept(a.Newline): + sb.WriteString("\n") + case p.Peek(controlCharacter): + p.Error("invalid character in multi-line basic string: %q (must be escaped)", p.Result().Rune(0)) return case p.Accept(tok.StrInterpreted(nil, c.OneOrMore(validEscape))): sb.WriteString(p.Result().Value(0).(string)) + case p.Accept(lineEndingBackslash): + // NOOP, the line-ending backslash sequence is skipped. case p.Peek(a.Backslash): p.Error("invalid escape sequence") return - case p.Accept(m.Drop(a.DoubleQuote)): + case p.Accept(m.Drop(doubleQuote3)): t.emitCommand(csetStrVal, sb.String()) return case p.Accept(a.ValidRune): @@ -78,16 +200,51 @@ func (t *parser) startBasicString(p *parse.API) { p.Error("invalid UTF8 rune") return default: - p.Expected("end of string") + p.Expected("closing three quotation marks") return } } } -func (t *parser) startMultiLineBasicString(p *parse.API) { - if p.Accept(doubleQuote3) { - p.Error("not yet implemented") - } else { - p.Expected("a multi-line basic string") +// Specific handling of input for multi-line literal strings. +// +// • Multi-line literal strings are surrounded by three single quotes on +// each side and allow newlines. 
+// +// • A newline immediately following the opening delimiter will be trimmed. +// +// • All other content between the delimiters is interpreted as-is without modification. +// +// • TOML parsers should feel free to normalize newline to whatever makes +// sense for their platform. +// +// • Control characters other than tab and newline are not permitted in a multi-line literal string. +func (t *parser) startMultiLineLiteralString(p *parse.API) { + if !p.Accept(singleQuote3.Then(a.Newline.Optional())) { + p.Expected("opening three single quotes") + return + } + sb := &strings.Builder{} + for { + switch { + case p.Accept(m.Drop(singleQuote3)): + t.emitCommand(csetStrVal, sb.String()) + return + case p.Accept(a.Tab): + sb.WriteString("\t") + case p.Accept(a.Newline): + sb.WriteString("\n") + case p.Peek(controlCharacter): + p.Error("invalid character in literal string: %q (no control chars allowed, except for tab and newline)", p.Result().Rune(0)) + return + case p.Accept(a.ValidRune): + sb.WriteString(p.Result().String()) + case p.Peek(a.InvalidRune): + p.Error("invalid UTF8 rune") + return + default: + p.Expected("closing three single quotes") + return + } } } diff --git a/value_string_test.go b/value_string_test.go index 7b4da94..e6b226f 100644 --- a/value_string_test.go +++ b/value_string_test.go @@ -9,7 +9,10 @@ func TestString(t *testing.T) { for _, test := range []parseTest{ {``, []string{`Error: unexpected end of file (expected a string value) at start of file`}}, {`no start quote"`, []string{`Error: unexpected input (expected a string value) at start of file`}}, - {`"simple string"`, []string{`string("simple string")`}}, + {`"basic s\tring"`, []string{`string("basic s\tring")`}}, + {"\"\"\"\n basic multi-line\n string value\n\"\"\"", []string{`string(" basic multi-line\n string value\n")`}}, + {`'literal s\tring'`, []string{`string("literal s\\tring")`}}, + {"'''\n literal multi-line\n string value\n'''", []string{`string(" literal multi-line\n string 
value\n")`}}, } { p := &parser{} testParseHandler(t, p, p.startString, test) @@ -18,9 +21,9 @@ func TestString(t *testing.T) { func TestBasicString(t *testing.T) { for _, test := range []parseTest{ - {``, []string{`Error: unexpected end of file (expected a basic string) at start of file`}}, - {`no start quote"`, []string{`Error: unexpected input (expected a basic string) at start of file`}}, - {`"no end quote`, []string{`Error: unexpected end of file (expected end of string) at line 1, column 14`}}, + {``, []string{`Error: unexpected end of file (expected opening quotation marks) at start of file`}}, + {`no start quote"`, []string{`Error: unexpected input (expected opening quotation marks) at start of file`}}, + {`"no end quote`, []string{`Error: unexpected end of file (expected closing quotation marks) at line 1, column 14`}}, {`""`, []string{`string("")`}}, {`"simple string"`, []string{`string("simple string")`}}, {`"with\tsome\r\nvalid escapes\b"`, []string{`string("with\tsome\r\nvalid escapes\b")`}}, @@ -37,6 +40,60 @@ func TestBasicString(t *testing.T) { } } +func TestMultiLineBasicString(t *testing.T) { + for _, test := range []parseTest{ + {``, []string{`Error: unexpected end of file (expected opening three quotation marks) at start of file`}}, + {`"""missing close quote""`, []string{`Error: unexpected end of file (expected closing three quotation marks) at line 1, column 25`}}, + {`""""""`, []string{`string("")`}}, + {"\"\"\"\n\"\"\"", []string{`string("")`}}, + {"\"\"\"\r\n\r\n\"\"\"", []string{`string("\n")`}}, + {`"""\"\"\"\""""`, []string{`string("\"\"\"\"")`}}, + {"\"\"\"\nThe quick brown \\\n\n\n \t fox jumps over \\\n\t the lazy dog.\\\n \"\"\"", []string{`string("The quick brown fox jumps over the lazy dog.")`}}, + {"\"\"\"No control chars \f allowed\"\"\"", []string{`Error: invalid character in multi-line basic string: '\f' (must be escaped) at line 1, column 21`}}, + {"\"\"\"Escaping control chars\\nis valid\"\"\"", []string{`string("Escaping 
control chars\nis valid")`}}, + {"\"\"\"Invalid escaping \\is not allowed\"\"\"", []string{`Error: invalid escape sequence at line 1, column 21`}}, + {"\"\"\"Invalid rune \xcd\"\"\"", []string{`Error: invalid UTF8 rune at line 1, column 17`}}, + } { + p := &parser{} + testParseHandler(t, p, p.startMultiLineBasicString, test) + } +} + +func TestLiteralString(t *testing.T) { + for _, test := range []parseTest{ + {``, []string{`Error: unexpected end of file (expected opening single quote) at start of file`}}, + {`'missing close quote`, []string{`Error: unexpected end of file (expected closing single quote) at line 1, column 21`}}, + {`''`, []string{`string("")`}}, + {`'simple'`, []string{`string("simple")`}}, + {`'C:\Users\nodejs\templates'`, []string{`string("C:\\Users\\nodejs\\templates")`}}, + {`'\\ServerX\admin$\system32\'`, []string{`string("\\\\ServerX\\admin$\\system32\\")`}}, + {`'Tom "Dubs" Preston-Werner'`, []string{`string("Tom \"Dubs\" Preston-Werner")`}}, + {`'<\i\c*\s*>'`, []string{`string("<\\i\\c*\\s*>")`}}, + {"'No cont\rol chars allowed'", []string{`Error: invalid character in literal string: '\r' (no control chars allowed, except for tab) at line 1, column 9`}}, + {"'Except\tfor\ttabs'", []string{`string("Except\tfor\ttabs")`}}, + {"'Invalid rune \xcd'", []string{`Error: invalid UTF8 rune at line 1, column 15`}}, + } { + p := &parser{} + testParseHandler(t, p, p.startLiteralString, test) + } +} + +func TestMultiLineLiteralString(t *testing.T) { + for _, test := range []parseTest{ + {``, []string{`Error: unexpected end of file (expected opening three single quotes) at start of file`}}, + {`'''missing close quote''`, []string{`Error: unexpected end of file (expected closing three single quotes) at line 1, column 25`}}, + {`''''''`, []string{`string("")`}}, + {"'''\n'''", []string{`string("")`}}, + {`'''I [dw]on't need \d{2} apples'''`, []string{`string("I [dw]on't need \\d{2} apples")`}}, + {"'''\nThere can\nbe newlines\r\nand \ttabs!\r\n'''", 
[]string{`string("There can\nbe newlines\nand \ttabs!\n")`}}, + {"'''No other \f control characters'''", []string{`Error: invalid character in literal string: '\f' (no control chars allowed, except for tab and newline) at line 1, column 13`}}, + {"'''No invalid runes allowed \xcd'''", []string{"Error: invalid UTF8 rune at line 1, column 29"}}, + } { + p := &parser{} + testParseHandler(t, p, p.startMultiLineLiteralString, test) + } +} + func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { // A quick check for almost all characters that must be escaped. // The missing one (\x7f) is covered in the previous test.