Yay! First version for which parsing long.toml drops below 100ms! Measured 93ms. Almost down to BurntSushi's speed level, while still being backed by a generic parser. Looking good!!

This commit is contained in:
Maurice Makaay 2019-07-16 23:34:01 +00:00
parent ddd0ed49f6
commit 5e3e4b0f0a
4 changed files with 284 additions and 103 deletions

View File

@ -133,7 +133,7 @@ func (calc *calculator) factor(p *parse.API) {
var A, T = tokenize.A, tokenize.T
p.Accept(A.Blanks)
switch {
case p.Accept(T.Float64(nil, A.Signed(A.Float))):
case p.Accept(T.Float64(nil, A.Signed(A.Decimal))):
value := p.Result.Tokens[0].Value.(float64)
calc.interpreter.pushValue(value)
case p.Accept(A.LeftParen):

View File

@ -170,6 +170,14 @@ func (i *API) Accept() {
i.acceptRunes(i.lastRuneWidth, i.lastRune)
}
// skipBytes moves the read position past the provided bytes: the cursor is
// advanced for every byte, the stack frame's offset grows by the number of
// bytes, and the runeRead flag is cleared.
//
// Unlike acceptBytes, nothing in here touches runeEnd, so the bytes are
// passed over rather than recorded (presumably to keep them out of the
// result; see the leading-zero stripping in MatchInteger).
func (i *API) skipBytes(bytes ...byte) {
	count := len(bytes)
	for idx := 0; idx < count; idx++ {
		i.stackFrame.moveCursorByByte(bytes[idx])
	}
	i.stackFrame.offset += count
	i.runeRead = false
}
func (i *API) acceptBytes(bytes ...byte) {
curRuneEnd := i.stackFrame.runeEnd
newRuneEnd := curRuneEnd + len(bytes)
@ -190,6 +198,14 @@ func (i *API) acceptBytes(bytes ...byte) {
i.runeRead = false
}
// skipRunes moves the read position past the provided runes: the cursor is
// advanced for every rune, the stack frame's offset grows by width, and the
// runeRead flag is cleared.
//
// width is added to the offset as-is; presumably it is the combined byte
// width of all provided runes (TODO confirm against the callers).
func (i *API) skipRunes(width int, runes ...rune) {
	for idx := 0; idx < len(runes); idx++ {
		i.stackFrame.moveCursorByRune(runes[idx])
	}
	i.stackFrame.offset += width
	i.runeRead = false
}
func (i *API) acceptRunes(width int, runes ...rune) {
curRuneEnd := i.stackFrame.runeEnd
newRuneEnd := curRuneEnd + len(runes)

View File

@ -70,9 +70,11 @@ var A = struct {
Byte func(byte) Handler
Bytes func(...byte) Handler
ByteRange func(byte, byte) Handler
ByteByCallback func(func(byte) bool) Handler
Rune func(rune) Handler
Runes func(...rune) Handler
RuneRange func(rune, rune) Handler
RuneByCallback func(func(rune) bool) Handler
Str func(string) Handler
StrNoCase func(string) Handler
EndOfLine Handler
@ -136,11 +138,11 @@ var A = struct {
DigitNotZero Handler
Digits Handler
Zero Handler
Float Handler
Boolean Handler
Integer Handler
Signed func(Handler) Handler
Integer Handler
IntegerBetween func(min int64, max int64) Handler
Decimal Handler
ASCII Handler
ASCIILower Handler
ASCIIUpper Handler
@ -160,9 +162,11 @@ var A = struct {
Byte: MatchByte,
Bytes: MatchBytes,
ByteRange: MatchByteRange,
ByteByCallback: MatchByteByCallback,
Rune: MatchRune,
Runes: MatchRunes,
RuneRange: MatchRuneRange,
RuneByCallback: MatchRuneByCallback,
Str: MatchStr,
StrNoCase: MatchStrNoCase,
EndOfFile: MatchEndOfFile(),
@ -172,51 +176,51 @@ var A = struct {
AnyRune: MatchAnyRune(),
ValidRune: MatchValidRune(),
InvalidRune: MatchInvalidRune(),
Space: MatchRune(' '),
Tab: MatchRune('\t'),
CR: MatchRune('\r'),
LF: MatchRune('\n'),
Space: MatchByte(' '),
Tab: MatchByte('\t'),
CR: MatchByte('\r'),
LF: MatchByte('\n'),
CRLF: MatchStr("\r\n"),
Excl: MatchRune('!'),
DoubleQuote: MatchRune('"'),
Hash: MatchRune('#'),
Dollar: MatchRune('$'),
Percent: MatchRune('%'),
Amp: MatchRune('&'),
SingleQuote: MatchRune('\''),
RoundOpen: MatchRune('('),
LeftParen: MatchRune('('),
RoundClose: MatchRune(')'),
RightParen: MatchRune(')'),
Asterisk: MatchRune('*'),
Multiply: MatchRune('*'),
Plus: MatchRune('+'),
Add: MatchRune('+'),
Comma: MatchRune(','),
Minus: MatchRune('-'),
Subtract: MatchRune('-'),
Dot: MatchRune('.'),
Slash: MatchRune('/'),
Divide: MatchRune('/'),
Colon: MatchRune(':'),
Semicolon: MatchRune(';'),
AngleOpen: MatchRune('<'),
LessThan: MatchRune('<'),
Equal: MatchRune('='),
AngleClose: MatchRune('>'),
GreaterThan: MatchRune('>'),
Question: MatchRune('?'),
At: MatchRune('@'),
SquareOpen: MatchRune('['),
Backslash: MatchRune('\\'),
SquareClose: MatchRune(']'),
Caret: MatchRune('^'),
Underscore: MatchRune('_'),
Backquote: MatchRune('`'),
CurlyOpen: MatchRune('{'),
Pipe: MatchRune('|'),
CurlyClose: MatchRune('}'),
Tilde: MatchRune('~'),
Excl: MatchByte('!'),
DoubleQuote: MatchByte('"'),
Hash: MatchByte('#'),
Dollar: MatchByte('$'),
Percent: MatchByte('%'),
Amp: MatchByte('&'),
SingleQuote: MatchByte('\''),
RoundOpen: MatchByte('('),
LeftParen: MatchByte('('),
RoundClose: MatchByte(')'),
RightParen: MatchByte(')'),
Asterisk: MatchByte('*'),
Multiply: MatchByte('*'),
Plus: MatchByte('+'),
Add: MatchByte('+'),
Comma: MatchByte(','),
Minus: MatchByte('-'),
Subtract: MatchByte('-'),
Dot: MatchByte('.'),
Slash: MatchByte('/'),
Divide: MatchByte('/'),
Colon: MatchByte(':'),
Semicolon: MatchByte(';'),
AngleOpen: MatchByte('<'),
LessThan: MatchByte('<'),
Equal: MatchByte('='),
AngleClose: MatchByte('>'),
GreaterThan: MatchByte('>'),
Question: MatchByte('?'),
At: MatchByte('@'),
SquareOpen: MatchByte('['),
Backslash: MatchByte('\\'),
SquareClose: MatchByte(']'),
Caret: MatchByte('^'),
Underscore: MatchByte('_'),
Backquote: MatchByte('`'),
CurlyOpen: MatchByte('{'),
Pipe: MatchByte('|'),
CurlyClose: MatchByte('}'),
Tilde: MatchByte('~'),
Newline: MatchNewline(),
Blank: MatchBlank(),
Blanks: MatchBlanks(),
@ -225,11 +229,11 @@ var A = struct {
Digit: MatchDigit(),
DigitNotZero: MatchDigitNotZero(),
Digits: MatchDigits(),
Zero: MatchRune('0'),
Integer: MatchInteger(),
Zero: MatchByte('0'),
Signed: MatchSigned,
Integer: MatchInteger(true),
IntegerBetween: MatchIntegerBetween,
Float: MatchFloat(),
Decimal: MatchDecimal(true),
Boolean: MatchBoolean(),
ASCII: MatchASCII(),
ASCIILower: MatchASCIILower(),
@ -355,7 +359,7 @@ func MatchByte(expected byte) Handler {
// MatchRune creates a Handler function that matches against the provided rune.
func MatchRune(expected rune) Handler {
if expected <= 127 {
if expected <= '\x7F' {
return MatchByte(byte(expected))
}
return func(t *API) bool {
@ -392,7 +396,7 @@ func MatchRunes(expected ...rune) Handler {
onlyBytes := true
expectedBytes := make([]byte, len(expected))
for i, r := range expected {
if r > 255 {
if r > '\x7F' {
onlyBytes = false
break
}
@ -448,7 +452,7 @@ func MatchRuneRange(start rune, end rune) Handler {
if end < start {
callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end)
}
if end <= 127 {
if end <= '\x7F' {
return MatchByteRange(byte(start), byte(end))
}
return func(t *API) bool {
@ -574,6 +578,23 @@ func MatchUnicodeSpace() Handler {
return MatchOneOrMore(MatchRuneByCallback(unicode.IsSpace))
}
// MatchByteByCallback creates a Handler that matches a single byte from the
// input against the provided callback function. When the callback returns true,
// it is considered a match and the byte is accepted. When no byte can be
// peeked, or the callback returns false, the Handler reports no match.
//
// Note that the callback takes a byte, not a rune, so the unicode.Is* functions
// cannot be passed in directly (use MatchRuneByCallback for those). A typical
// byte callback looks like:
//
//	MatchByteByCallback(func(b byte) bool { return b >= 'a' && b <= 'z' })
func MatchByteByCallback(callback func(byte) bool) Handler {
	return func(t *API) bool {
		b, err := t.PeekByte(0)
		if err != nil || !callback(b) {
			return false
		}
		t.acceptBytes(b)
		return true
	}
}
// MatchRuneByCallback creates a Handler that matches a single rune from the
// input against the provided callback function. When the callback returns true,
// it is considered a match.
@ -621,7 +642,7 @@ func MatchStr(expected string) Handler {
return func(t *API) bool {
offset := 0
for _, e := range expectedRunes {
if e <= 127 {
if e <= '\x7F' {
b, err := t.PeekByte(offset)
if err != nil || b != byte(e) {
return false
@ -650,7 +671,7 @@ func MatchStrNoCase(expected string) Handler {
width := 0
i := 0
for _, e := range expected {
if e <= 127 {
if e <= '\x7F' {
b, err := t.PeekByte(width)
if err != nil || (b != byte(e) && unicode.ToUpper(rune(b)) != unicode.ToUpper(e)) {
return false
@ -732,9 +753,9 @@ func MatchNot(handler Handler) Handler {
return false
}
t.Dispose(child)
_, err := t.NextRune()
r, w, err := t.PeekRune(0)
if err == nil {
t.Accept()
t.acceptRunes(w, r)
return true
}
return false
@ -924,8 +945,24 @@ func MakeInputFlusher(handler Handler) Handler {
//
// C.Signed(A.Integer)
func MatchSigned(handler Handler) Handler {
sign := MatchOptional(MatchAny(MatchRune('+'), MatchRune('-')))
return MatchSeq(sign, handler)
return func(t *API) bool {
child := t.Fork()
b, err := t.PeekByte(0)
if err != nil {
t.Dispose(child)
return false
}
if b == '-' || b == '+' {
t.acceptBytes(b)
}
if handler(t) {
t.Merge(child)
t.Dispose(child)
return true
}
t.Dispose(child)
return false
}
}
// MatchIntegerBetween creates a Handler that checks for an integer
@ -956,7 +993,7 @@ func MatchIntegerBetween(min int64, max int64) Handler {
func MatchEndOfFile() Handler {
return func(t *API) bool {
child := t.Fork()
_, err := t.NextRune()
_, err := t.PeekByte(0)
t.Dispose(child)
return err == io.EOF
}
@ -1024,37 +1061,157 @@ func MatchInvalidRune() Handler {
// MatchDigit creates a Handler that checks if a single digit can be read
// from the input.
func MatchDigit() Handler {
return MatchRuneRange('0', '9')
return MatchByteRange('0', '9')
}
// MatchDigits creates a Handler that checks if one or more digits can be read
// from the input.
func MatchDigits() Handler {
return MatchOneOrMore(MatchDigit())
return func(t *API) bool {
// Check if the first character is a digit.
b, err := t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
return false
}
t.acceptBytes(b)
// Continue accepting bytes as long as they are digits.
for {
b, err := t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
return true
}
t.acceptBytes(b)
}
}
}
// MatchDigitNotZero creates a Handler that checks if a single digit not equal
// to zero '0' can be read from the input.
func MatchDigitNotZero() Handler {
return MatchRuneRange('1', '9')
return MatchByteRange('1', '9')
}
// MatchInteger creates a Handler function that checks if a valid integer
// can be read from the input. In line with Go, an integer cannot start with
// a zero. Starting with a zero is used to indicate other bases, like octal or
// hexadecimal.
func MatchInteger() Handler {
justZero := MatchRune('0')
integer := MatchSeq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit()))
return MatchAny(integer, justZero)
// can be read from the input.
//
// Leading zeroes are allowed. When the normalize parameter is true, these
// will be stripped from the input.
func MatchInteger(normalize bool) Handler {
return func(t *API) bool {
// Check if the first character is a digit.
// The digit is only peeked at here, it is not accepted yet.
b, err := t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
return false
}
// When normalization is requested, drop leading zeroes.
if normalize && b == '0' {
for {
// The byte at offset 0 is a zero at this point; look at the
// byte that follows it to decide what to do with that zero.
b2, err := t.PeekByte(1)
// The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b {
t.skipBytes('0')
continue
}
// The next character is not a zero, nor a digit at all.
// We're looking at a zero on its own here.
// (A failed peek, e.g. at the end of the input, ends up here as well.)
if err != nil || b2 < '1' || b2 > '9' {
t.acceptBytes('0')
return true
}
// The next character is a digit. Skip the leading zero and go with the digit.
t.skipBytes('0')
t.acceptBytes(b2)
break
}
}
// Continue accepting bytes as long as they are digits.
// When no normalization took place, the digit that was peeked at the
// start is accepted here too, so returning true from this loop always
// means that at least one digit was accepted.
for {
b, err := t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
return true
}
t.acceptBytes(b)
}
}
}
// MatchFloat creates a Handler function that checks if a valid float value
// can be read from the input. In case the fractional part is missing, this
// Handler will report a match, so both "123" and "123.123" will match.
func MatchFloat() Handler {
digits := MatchDigits()
return MatchSeq(digits, MatchOptional(MatchSeq(MatchRune('.'), digits)))
// MatchDecimal creates a Handler function that checks if a valid decimal value
// can be read from the input. In case the fractional part is missing (which is
// a valid decimal number), this Handler will report a match, so both "123" and
// "123.123" will match.
//
// Leading zeroes are allowed. When the normalize parameter is true, these
// will be stripped from the input.
func MatchDecimal(normalize bool) Handler {
return func(t *API) bool {
// Check if the first character is a digit.
// The digit is only peeked at here, it is not accepted yet.
b, err := t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
return false
}
// When normalization is requested, drop leading zeroes.
if normalize && b == '0' {
for {
// The byte at offset 0 is a zero at this point; look at the
// byte that follows it to decide what to do with that zero.
b2, err := t.PeekByte(1)
// The next character is a zero, skip the leading zero and check again.
if err == nil && b2 == b {
t.skipBytes('0')
continue
}
// The next character is a dot, go with the zero before the dot and
// let the upcoming code handle the dot.
if err == nil && b2 == '.' {
t.acceptBytes('0')
break
}
// The next character is not a zero, nor a digit at all.
// We're looking at a zero on its own here.
// (A failed peek, e.g. at the end of the input, ends up here as well.)
if err != nil || b2 < '1' || b2 > '9' {
t.acceptBytes('0')
return true
}
// The next character is a digit. Skip the leading zero and go with the digit.
t.skipBytes('0')
t.acceptBytes(b2)
break
}
}
// Continue accepting bytes as long as they are digits.
// When no normalization took place, the digit that was peeked at the
// start is accepted here too.
for {
b, err = t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
break
}
t.acceptBytes(b)
}
// No dot or no digit after a dot? Then we're done.
// b is whatever the last PeekByte(0) call above returned.
// NOTE(review): when that call failed, this relies on the returned byte
// not being '.' (presumably the zero byte) - confirm against PeekByte.
if b != '.' {
return true
}
// Only take the dot when a digit follows it. Otherwise the dot is left on
// the input and only the integer part is matched (e.g. "1." matches "1").
b, err = t.PeekByte(1)
if err != nil || b < '0' || b > '9' {
return true
}
// Accept the dot together with the first fractional digit, then
// continue accepting bytes as long as they are digits.
t.acceptBytes('.', b)
for {
b, err = t.PeekByte(0)
if err != nil || b < '0' || b > '9' {
break
}
t.acceptBytes(b)
}
return true
}
}
// MatchBoolean creates a Handler function that checks if a boolean
@ -1075,7 +1232,11 @@ func MatchBoolean() Handler {
return true
}
if b1 == 't' || b1 == 'T' {
b2, _ := t.PeekByte(1)
b2, err := t.PeekByte(1)
if err != nil || (b2 != 'R' && b2 != 'r') {
t.acceptBytes(b1)
return true
}
b3, _ := t.PeekByte(2)
b4, err := t.PeekByte(3)
if err == nil && b2 == 'r' && b3 == 'u' && b4 == 'e' {
@ -1091,11 +1252,14 @@ func MatchBoolean() Handler {
}
if b1 == 'f' || b1 == 'F' {
b2, _ := t.PeekByte(1)
b2, err := t.PeekByte(1)
if err != nil || (b2 != 'A' && b2 != 'a') {
t.acceptBytes(b1)
return true
}
b3, _ := t.PeekByte(2)
b4, _ := t.PeekByte(3)
b5, err := t.PeekByte(4)
if err == nil && b2 == 'a' && b3 == 'l' && b4 == 's' && b5 == 'e' {
t.acceptBytes(b1, b2, b3, b4, b5)
return true
@ -1114,19 +1278,19 @@ func MatchBoolean() Handler {
// MatchASCII creates a Handler function that matches against any
// ASCII value on the input.
func MatchASCII() Handler {
return MatchRuneRange('\x00', '\x7F')
return MatchByteRange('\x00', '\x7F')
}
// MatchASCIILower creates a Handler function that matches against any
// lower case ASCII letter on the input (a - z).
func MatchASCIILower() Handler {
return MatchRuneRange('a', 'z')
return MatchByteRange('a', 'z')
}
// MatchASCIIUpper creates a Handler function that matches against any
// upper case ASCII letter on the input (A - Z).
func MatchASCIIUpper() Handler {
return MatchRuneRange('A', 'Z')
return MatchByteRange('A', 'Z')
}
// MatchUnicodeLetter creates a Handler function that matches against any
@ -1365,19 +1529,15 @@ func MatchIPv6Net(normalize bool) Handler {
// In both cases, it would match the first form.
func ModifyDrop(handler Handler) Handler {
return func(t *API) bool {
child := t.Fork()
runeEnd := t.stackFrame.runeEnd
tokenEnd := t.stackFrame.tokenEnd
if handler(t) {
// Do a partial merge: only move the cursor and read offset forward.
// Any produced runes and tokens are ignored and not merged to the parent
// (since we're dropping those here).
parent := &t.stackFrames[t.stackLevel-1]
parent.offset = t.stackFrame.offset
parent.line = t.stackFrame.line
parent.column = t.stackFrame.column
t.Dispose(child)
// We keep offset and cursor updates, but rollback any runes / tokens
// that were added by the handler.
t.stackFrame.runeEnd = runeEnd
t.stackFrame.tokenEnd = tokenEnd
return true
}
t.Dispose(child)
return false
}
}

View File

@ -231,22 +231,27 @@ func TestAtoms(t *testing.T) {
{"F", a.HexDigit, true, "F"},
{"g", a.HexDigit, false, "g"},
{"G", a.HexDigit, false, "G"},
{"09", a.Integer, true, "9"},
{"0000129", a.Integer, true, "129"},
{"0", a.Integer, true, "0"},
{"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is valid for the integer
{"00000", a.Integer, true, "0"},
{"1", a.Integer, true, "1"},
{"-10X", a.Integer, false, ""},
{"+10X", a.Integer, false, ""},
{"-10X", a.Signed(a.Integer), true, "-10"},
{"+10X", a.Signed(a.Integer), true, "+10"},
{"+10.1X", a.Signed(a.Integer), true, "+10"},
{"0X", a.Float, true, "0"},
{"0X", a.Float, true, "0"},
{"1X", a.Float, true, "1"},
{"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up
{"123.321X", a.Float, true, "123.321"},
{"-3.14X", a.Float, false, ""},
{"-3.14X", a.Signed(a.Float), true, "-3.14"},
{"-003.0014X", a.Signed(a.Float), true, "-003.0014"},
{"0X", a.Decimal, true, "0"},
{"0000X", a.Decimal, true, "0"},
{"1X", a.Decimal, true, "1"},
{"01X", a.Decimal, true, "1"},
{"000001X", a.Decimal, true, "1"},
{"1.", a.Decimal, true, "1"}, // incomplete float, so only the 1 is picked up
{"123.321X", a.Decimal, true, "123.321"},
{"0.6X", a.Decimal, true, "0.6"},
{"-3.14X", a.Decimal, false, ""},
{"-3.14X", a.Signed(a.Decimal), true, "-3.14"},
{"-003.0014X", a.Signed(a.Decimal), true, "-3.0014"},
{"-11", a.IntegerBetween(-10, 10), false, "0"},
{"-10", a.IntegerBetween(-10, 10), true, "-10"},
{"0", a.IntegerBetween(-10, 10), true, "0"},
@ -430,8 +435,8 @@ func TestTokenMakers(t *testing.T) {
{`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Value: uint32(4294967295)}}},
{`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}},
{`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}},
{`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}},
{`3.1415=PI`, tok.Float32("N", a.Decimal), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}},
{`24.19287=PI`, tok.Float64("O", a.Decimal), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}},
{`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{
{Type: "P", Value: true},