From b9cc91c0aee34e41ab77c0ea20bd3a20c9a50531 Mon Sep 17 00:00:00 2001 From: Maurice Makaay Date: Mon, 29 Jul 2019 22:52:38 +0000 Subject: [PATCH] More speed improvements. --- tokenize/api_test.go | 30 -- tokenize/handlers_builtin.go | 654 +++++++++++++++++------------- tokenize/handlers_builtin_test.go | 2 +- 3 files changed, 362 insertions(+), 324 deletions(-) diff --git a/tokenize/api_test.go b/tokenize/api_test.go index 34728ed..9ab6cc7 100644 --- a/tokenize/api_test.go +++ b/tokenize/api_test.go @@ -8,36 +8,6 @@ import ( "git.makaay.nl/mauricem/go-parsekit/tokenize" ) -func BenchmarkMemclrOptimization(b *testing.B) { - // TODO use or cleanup this one and the next. I'm playing around here. - type s struct { - a int - b string - } - x := []s{{10, "hoi"}, {20, "doei"}, {30, "jadag"}} - - for i := 0; i < b.N; i++ { - for i := range x { - x[i] = s{} - } - } -} - -func BenchmarkCodedClear(b *testing.B) { - type s struct { - a int - b string - } - - x := []s{{10, "hoi"}, {20, "doei"}, {30, "jadag"}} - - for i := 0; i < b.N; i++ { - x[0] = s{} - x[1] = s{} - x[2] = s{} - } -} - func ExampleNewAPI() { tokenize.NewAPI("The input that the API will handle") } diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index 8d8ff6a..42a76c9 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -25,37 +25,39 @@ import ( // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. var C = struct { - Any func(...Handler) Handler - Not func(Handler) Handler - Seq func(...Handler) Handler - Min func(min int, handler Handler) Handler - Max func(max int, handler Handler) Handler - Repeated func(times int, handler Handler) Handler - Optional func(Handler) Handler - ZeroOrMore func(Handler) Handler - OneOrMore func(Handler) Handler - MinMax func(min int, max int, handler Handler) Handler - Separated func(separator Handler, separated Handler) Handler - Except func(except Handler, handler Handler) Handler - FollowedBy func(lookAhead Handler, handler Handler) Handler - NotFollowedBy func(lookAhead Handler, handler Handler) Handler - FlushInput func(Handler) Handler + Any func(...Handler) Handler + Not func(Handler) Handler + Seq func(...Handler) Handler + Min func(min int, handler Handler) Handler + Max func(max int, handler Handler) Handler + Repeated func(times int, handler Handler) Handler + Optional func(Handler) Handler + ZeroOrMore func(Handler) Handler + OneOrMore func(Handler) Handler + MinMax func(min int, max int, handler Handler) Handler + Separated func(separator Handler, separated Handler) Handler + Except func(except Handler, handler Handler) Handler + FollowedBy func(lookAhead Handler, handler Handler) Handler + NotFollowedBy func(lookAhead Handler, handler Handler) Handler + InOptionalBlanks func(handler Handler) Handler + FlushInput func(Handler) Handler }{ - Any: MatchAny, - Not: MatchNot, - Seq: MatchSeq, - Min: MatchMin, - Max: MatchMax, - Repeated: MatchRep, - Optional: MatchOptional, - ZeroOrMore: MatchZeroOrMore, - OneOrMore: MatchOneOrMore, - MinMax: MatchMinMax, - Separated: MatchSeparated, - Except: MatchExcept, - FollowedBy: MatchFollowedBy, - NotFollowedBy: MatchNotFollowedBy, - FlushInput: MakeInputFlusher, + Any: MatchAny, + Not: MatchNot, + Seq: MatchSeq, + Min: MatchMin, + Max: MatchMax, + Repeated: MatchRep, + Optional: MatchOptional, + ZeroOrMore: MatchZeroOrMore, + OneOrMore: MatchOneOrMore, + MinMax: MatchMinMax, + Separated: MatchSeparated, + Except: MatchExcept, + FollowedBy: MatchFollowedBy, + NotFollowedBy: MatchNotFollowedBy, + InOptionalBlanks: MatchInOptionalBlanks, + FlushInput: MakeInputFlusher, } // A provides convenient access to a range of atoms or functions to build atoms. @@ -67,181 +69,183 @@ var C = struct { // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. var A = struct { - Char func(...rune) Handler - CharRange func(...rune) Handler - ByteByCallback func(func(byte) bool) Handler - RuneByCallback func(func(rune) bool) Handler - AnyByte Handler - AnyRune Handler - ValidRune Handler - InvalidRune Handler - Str func(string) Handler - StrNoCase func(string) Handler - EndOfLine Handler - EndOfFile Handler - UntilEndOfLine Handler - Space Handler - Tab Handler - CR Handler - LF Handler - CRLF Handler - Excl Handler - DoubleQuote Handler - Hash Handler - Dollar Handler - Percent Handler - Amp Handler - SingleQuote Handler - RoundOpen Handler - LeftParen Handler - RoundClose Handler - RightParen Handler - Asterisk Handler - Multiply Handler - Plus Handler - Add Handler - Comma Handler - Minus Handler - Subtract Handler - Dot Handler - Slash Handler - Divide Handler - Colon Handler - Semicolon Handler - AngleOpen Handler - LessThan Handler - Equal Handler - AngleClose Handler - GreaterThan Handler - Question Handler - At Handler - SquareOpen Handler - Backslash Handler - SquareClose Handler - Caret Handler - Underscore Handler - Backquote Handler - CurlyOpen Handler - Pipe Handler - CurlyClose Handler - Tilde Handler - Newline Handler - Blank Handler - Blanks Handler - Whitespace Handler - UnicodeSpace Handler - Digit Handler - DigitNotZero Handler - Digits Handler - Zero Handler - Boolean Handler - Signed func(Handler) Handler - Integer Handler - IntegerBetween func(min int64, max int64) Handler - Decimal Handler - ASCII Handler - ASCIILower Handler - ASCIIUpper Handler - Letter Handler - Lower Handler - Upper Handler - HexDigit Handler - Octet Handler - IPv4 Handler - IPv4CIDRMask Handler - IPv4Netmask Handler - IPv4Net Handler - IPv6 Handler - IPv6CIDRMask Handler - IPv6Net Handler + Char func(...rune) Handler + CharRange func(...rune) Handler + ByteByCallback func(func(byte) bool) Handler + BytesByCallback func(func(byte) bool) Handler + RuneByCallback func(func(rune) bool) Handler + AnyByte Handler + AnyRune Handler + ValidRune Handler + InvalidRune Handler + Str func(string) Handler + StrNoCase func(string) Handler + EndOfLine Handler + EndOfFile Handler + UntilEndOfLine Handler + Space Handler + Tab Handler + CR Handler + LF Handler + CRLF Handler + Excl Handler + DoubleQuote Handler + Hash Handler + Dollar Handler + Percent Handler + Amp Handler + SingleQuote Handler + RoundOpen Handler + LeftParen Handler + RoundClose Handler + RightParen Handler + Asterisk Handler + Multiply Handler + Plus Handler + Add Handler + Comma Handler + Minus Handler + Subtract Handler + Dot Handler + Slash Handler + Divide Handler + Colon Handler + Semicolon Handler + AngleOpen Handler + LessThan Handler + Equal Handler + AngleClose Handler + GreaterThan Handler + Question Handler + At Handler + SquareOpen Handler + Backslash Handler + SquareClose Handler + Caret Handler + Underscore Handler + Backquote Handler + CurlyOpen Handler + Pipe Handler + CurlyClose Handler + Tilde Handler + Newline Handler + Blank Handler + Blanks Handler + Whitespace Handler + UnicodeSpace Handler + Digit Handler + DigitNotZero Handler + Digits Handler + Zero Handler + Boolean Handler + Signed func(Handler) Handler + Integer Handler + IntegerBetween func(min int64, max int64) Handler + Decimal Handler + ASCII Handler + ASCIILower Handler + ASCIIUpper Handler + Letter Handler + Lower Handler + Upper Handler + HexDigit Handler + Octet Handler + IPv4 Handler + IPv4CIDRMask Handler + IPv4Netmask Handler + IPv4Net Handler + IPv6 Handler + IPv6CIDRMask Handler + IPv6Net Handler }{ - Char: MatchChar, - CharRange: MatchCharRange, - ByteByCallback: MatchByteByCallback, - RuneByCallback: MatchRuneByCallback, - AnyByte: MatchAnyByte(), - AnyRune: MatchAnyRune(), - ValidRune: MatchValidRune(), - InvalidRune: MatchInvalidRune(), - Str: MatchStr, - StrNoCase: MatchStrNoCase, - EndOfFile: MatchEndOfFile(), - EndOfLine: MatchEndOfLine(), - UntilEndOfLine: MatchUntilEndOfLine(), - Space: MatchChar(' '), - Tab: MatchChar('\t'), - CR: MatchChar('\r'), - LF: MatchChar('\n'), - CRLF: MatchStr("\r\n"), - Excl: MatchChar('!'), - DoubleQuote: MatchChar('"'), - Hash: MatchChar('#'), - Dollar: MatchChar('$'), - Percent: MatchChar('%'), - Amp: MatchChar('&'), - SingleQuote: MatchChar('\''), - RoundOpen: MatchChar('('), - LeftParen: MatchChar('('), - RoundClose: MatchChar(')'), - RightParen: MatchChar(')'), - Asterisk: MatchChar('*'), - Multiply: MatchChar('*'), - Plus: MatchChar('+'), - Add: MatchChar('+'), - Comma: MatchChar(','), - Minus: MatchChar('-'), - Subtract: MatchChar('-'), - Dot: MatchChar('.'), - Slash: MatchChar('/'), - Divide: MatchChar('/'), - Colon: MatchChar(':'), - Semicolon: MatchChar(';'), - AngleOpen: MatchChar('<'), - LessThan: MatchChar('<'), - Equal: MatchChar('='), - AngleClose: MatchChar('>'), - GreaterThan: MatchChar('>'), - Question: MatchChar('?'), - At: MatchChar('@'), - SquareOpen: MatchChar('['), - Backslash: MatchChar('\\'), - SquareClose: MatchChar(']'), - Caret: MatchChar('^'), - Underscore: MatchChar('_'), - Backquote: MatchChar('`'), - CurlyOpen: MatchChar('{'), - Pipe: MatchChar('|'), - CurlyClose: MatchChar('}'), - Tilde: MatchChar('~'), - Newline: MatchNewline(), - Blank: MatchBlank(), - Blanks: MatchBlanks(), - Whitespace: MatchWhitespace(), - UnicodeSpace: MatchUnicodeSpace(), - Digit: MatchDigit(), - DigitNotZero: MatchDigitNotZero(), - Digits: MatchDigits(), - Zero: MatchChar('0'), - Signed: MatchSigned, - Integer: MatchInteger(true), - IntegerBetween: MatchIntegerBetween, - Decimal: MatchDecimal(true), - Boolean: MatchBoolean(), - ASCII: MatchASCII(), - ASCIILower: MatchASCIILower(), - ASCIIUpper: MatchASCIIUpper(), - Letter: MatchUnicodeLetter(), - Lower: MatchUnicodeLower(), - Upper: MatchUnicodeUpper(), - HexDigit: MatchHexDigit(), - Octet: MatchOctet(true), - IPv4: MatchIPv4(true), - IPv4CIDRMask: MatchIPv4CIDRMask(true), - IPv4Netmask: MatchIPv4Netmask(true), - IPv4Net: MatchIPv4Net(true), - IPv6: MatchIPv6(true), - IPv6CIDRMask: MatchIPv6CIDRMask(true), - IPv6Net: MatchIPv6Net(true), + Char: MatchChar, + CharRange: MatchCharRange, + ByteByCallback: MatchByteByCallback, + BytesByCallback: MatchBytesByCallback, + RuneByCallback: MatchRuneByCallback, + AnyByte: MatchAnyByte(), + AnyRune: MatchAnyRune(), + ValidRune: MatchValidRune(), + InvalidRune: MatchInvalidRune(), + Str: MatchStr, + StrNoCase: MatchStrNoCase, + EndOfFile: MatchEndOfFile(), + EndOfLine: MatchEndOfLine(), + UntilEndOfLine: MatchUntilEndOfLine(), + Space: MatchChar(' '), + Tab: MatchChar('\t'), + CR: MatchChar('\r'), + LF: MatchChar('\n'), + CRLF: MatchStr("\r\n"), + Excl: MatchChar('!'), + DoubleQuote: MatchChar('"'), + Hash: MatchChar('#'), + Dollar: MatchChar('$'), + Percent: MatchChar('%'), + Amp: MatchChar('&'), + SingleQuote: MatchChar('\''), + RoundOpen: MatchChar('('), + LeftParen: MatchChar('('), + RoundClose: MatchChar(')'), + RightParen: MatchChar(')'), + Asterisk: MatchChar('*'), + Multiply: MatchChar('*'), + Plus: MatchChar('+'), + Add: MatchChar('+'), + Comma: MatchChar(','), + Minus: MatchChar('-'), + Subtract: MatchChar('-'), + Dot: MatchChar('.'), + Slash: MatchChar('/'), + Divide: MatchChar('/'), + Colon: MatchChar(':'), + Semicolon: MatchChar(';'), + AngleOpen: MatchChar('<'), + LessThan: MatchChar('<'), + Equal: MatchChar('='), + AngleClose: MatchChar('>'), + GreaterThan: MatchChar('>'), + Question: MatchChar('?'), + At: MatchChar('@'), + SquareOpen: MatchChar('['), + Backslash: MatchChar('\\'), + SquareClose: MatchChar(']'), + Caret: MatchChar('^'), + Underscore: MatchChar('_'), + Backquote: MatchChar('`'), + CurlyOpen: MatchChar('{'), + Pipe: MatchChar('|'), + CurlyClose: MatchChar('}'), + Tilde: MatchChar('~'), + Newline: MatchNewline(), + Blank: MatchBlank(), + Blanks: MatchBlanks(), + Whitespace: MatchWhitespace(), + UnicodeSpace: MatchUnicodeSpace(), + Digit: MatchDigit(), + DigitNotZero: MatchDigitNotZero(), + Digits: MatchDigits(), + Zero: MatchChar('0'), + Signed: MatchSigned, + Integer: MatchInteger(true), + IntegerBetween: MatchIntegerBetween, + Decimal: MatchDecimal(true), + Boolean: MatchBoolean(), + ASCII: MatchASCII(), + ASCIILower: MatchASCIILower(), + ASCIIUpper: MatchASCIIUpper(), + Letter: MatchUnicodeLetter(), + Lower: MatchUnicodeLower(), + Upper: MatchUnicodeUpper(), + HexDigit: MatchHexDigit(), + Octet: MatchOctet(true), + IPv4: MatchIPv4(true), + IPv4CIDRMask: MatchIPv4CIDRMask(true), + IPv4Netmask: MatchIPv4Netmask(true), + IPv4Net: MatchIPv4Net(true), + IPv6: MatchIPv6(true), + IPv6CIDRMask: MatchIPv6CIDRMask(true), + IPv6Net: MatchIPv6Net(true), } // M provides convenient access to a range of modifiers (which in their nature are @@ -552,21 +556,32 @@ func MatchBlank() Handler { // like a vertical tab, then make use of MatchUnicodeSpace(). func MatchBlanks() Handler { return func(tokenAPI *API) bool { - // Match the first blank. - b, err := tokenAPI.Input.Byte.Peek(0) - if err != nil || (b != ' ' && b != '\t') { - return false + f := tokenAPI.Input.Byte.AcceptMulti + if tokenAPI.Output.suspended > 0 { + f = tokenAPI.Input.Byte.MoveCursorMulti } - tokenAPI.Input.Byte.Accept(b) - - // Now match any number of followup blanks. We've already got - // a successful match at this point, so we'll always return true at the end. + ok := false for { - b, err := tokenAPI.Input.Byte.Peek(0) - if err != nil || (b != ' ' && b != '\t') { - return true + chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128) + for i, b := range chunk { + if b != ' ' && b != '\t' { + if i > 0 { + f(chunk[:i]...) + } + return ok + } + ok = true } - tokenAPI.Input.Byte.Accept(b) + if err != nil { + if err == io.EOF { + if len(chunk) > 0 { + f(chunk...) + } + return ok + } + return false + } + f(chunk...) } } } @@ -576,37 +591,32 @@ func MatchBlanks() Handler { // carriage return '\r' followed by a newline '\n' (CRLF). func MatchWhitespace() Handler { return func(tokenAPI *API) bool { - // Match the first whitespace. - b1, err := tokenAPI.Input.Byte.Peek(0) - if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') { - return false + f := tokenAPI.Input.Byte.AcceptMulti + if tokenAPI.Output.suspended > 0 { + f = tokenAPI.Input.Byte.MoveCursorMulti } - if b1 == '\r' { - b2, err := tokenAPI.Input.Byte.Peek(1) - if err != nil || b2 != '\n' { + ok := false + for { + chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128) + for i, b := range chunk { + if b != ' ' && b != '\t' && b != '\n' && b != '\r' { + if i > 0 { + f(chunk[:i]...) + } + return ok + } + ok = true + } + if err != nil { + if err == io.EOF { + if len(chunk) > 0 { + f(chunk...) + } + return ok + } return false } - tokenAPI.Input.Byte.AcceptMulti(b1, b2) - } else { - tokenAPI.Input.Byte.Accept(b1) - } - - // Now match any number of followup whitespace. We've already got - // a successful match at this point, so we'll always return true at the end. - for { - b1, err := tokenAPI.Input.Byte.Peek(0) - if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') { - return true - } - if b1 == '\r' { - b2, err := tokenAPI.Input.Byte.Peek(1) - if err != nil || b2 != '\n' { - return true - } - tokenAPI.Input.Byte.AcceptMulti(b1, b2) - } else { - tokenAPI.Input.Byte.Accept(b1) - } + f(chunk...) } } } @@ -620,9 +630,6 @@ func MatchUnicodeSpace() Handler { // MatchByteByCallback creates a Handler that matches a single byte from the // input against the provided callback function. When the callback returns true, // it is considered a match. -// -// Note that the callback function matches the signature of the unicode.Is* functions, -// so those can be used. E.g. MatchRuneByCallback(unicode.IsLower). func MatchByteByCallback(callback func(byte) bool) Handler { return func(tokenAPI *API) bool { b, err := tokenAPI.Input.Byte.Peek(0) @@ -634,6 +641,41 @@ func MatchByteByCallback(callback func(byte) bool) Handler { } } +// MatchBytesByCallback creates a Handler that matches one or more bytes from the +// input against the provided callback function. As long as the callback returns true, +// it is considered a match. +func MatchBytesByCallback(callback func(byte) bool) Handler { + return func(tokenAPI *API) bool { + f := tokenAPI.Input.Byte.AcceptMulti + if tokenAPI.Output.suspended > 0 { + f = tokenAPI.Input.Byte.MoveCursorMulti + } + ok := false + for { + chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128) + for i, b := range chunk { + if !callback(b) { + if i > 0 { + f(chunk[:i]...) + } + return ok + } + ok = true + } + if err != nil { + if err == io.EOF { + if len(chunk) > 0 { + f(chunk...) + } + return ok + } + return false + } + f(chunk...) + } + } +} + // MatchRuneByCallback creates a Handler that matches a single rune from the // input against the provided callback function. When the callback returns true, // it is considered a match. @@ -947,6 +989,37 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler { } } +func MatchInOptionalBlanks(handler Handler) Handler { + return func(tokenAPI *API) bool { + skipBlanks(tokenAPI) + if !handler(tokenAPI) { + return false + } + skipBlanks(tokenAPI) + return true + } +} + +func skipBlanks(tokenAPI *API) { + for { + bs, err := tokenAPI.Input.Byte.PeekMulti(0, 128) + for i, b := range bs { + if b != ' ' && b != '\t' { + if i > 0 { + tokenAPI.Input.Byte.MoveCursorMulti(bs[:i]...) + } + return + } + } + if err != nil { + if len(bs) > 0 { + tokenAPI.Input.Byte.MoveCursorMulti(bs...) + } + return + } + } +} + // MakeInputFlusher creates a Handler that will flush the input buffer when the // provided handler matches. // @@ -1037,31 +1110,35 @@ func MatchUntilEndOfLine() Handler { f = tokenAPI.Input.Byte.MoveCursorMulti } for { - bs, err := tokenAPI.Input.Byte.PeekMulti(0, 128) + chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128) state := 0 - for i, b := range bs { + ok := false + for i, b := range chunk { if b == '\r' { state = 1 continue } if b == '\n' { if state == 1 { - f(bs[:i+1]...) - } else { - f(bs[:i]...) + f(chunk[:i+1]...) + } else if i > 0 { + f(chunk[:i]...) } - return true + return ok } state = 0 + ok = true } if err != nil { if err == io.EOF { - f(bs...) - return true + if len(chunk) > 0 { + f(chunk...) + } + return ok } return false } - f(bs...) + f(chunk...) } } } @@ -1350,50 +1427,41 @@ func MatchHexDigit() Handler { // stripped from the octet. func MatchOctet(normalize bool) Handler { return func(tokenAPI *API) bool { - // Digit 1 - b0, err := tokenAPI.Input.Byte.Peek(0) - if err != nil || b0 < '0' || b0 > '9' { + chunk, _ := tokenAPI.Input.Byte.PeekMulti(0, 3) + value := 0 + start := 0 + end := 0 + for i, b := range chunk { + if b < '0' || b > '9' { + if i == 0 { + return false + } + break + } + if b == '0' && value == 0 { + start++ + } else { + value = value*10 + int(b-'0') + } + end++ + } + + if value > 255 { return false } - // Digit 2 - b1, err := tokenAPI.Input.Byte.Peek(1) - if err != nil || b1 < '0' || b1 > '9' { - // Output 1-digit octet. - tokenAPI.Input.Byte.Accept(b0) - return true - } - - // Digit 3 - b2, err := tokenAPI.Input.Byte.Peek(2) - if err != nil || b2 < '0' || b2 > '9' { - // Output 2-digit octet. - if normalize && b0 == '0' { - tokenAPI.Input.Byte.MoveCursor(b0) - tokenAPI.Input.Byte.Accept(b1) - } else { - tokenAPI.Input.Byte.AcceptMulti(b0, b1) + if normalize { + if value == 0 { + start-- } - return true - } - - // The value of the octet must be between 0 - 255. - if b0 > '2' || (b0 == '2' && b1 > '5') || (b0 == '2' && b1 == '5' && b2 > '5') { - return false - } - - // Output 3-digit octet. - if normalize && b0 == '0' { - tokenAPI.Input.Byte.MoveCursor(b0) - if b1 == '0' { - tokenAPI.Input.Byte.MoveCursor(b1) - } else { - tokenAPI.Input.Byte.Accept(b1) + if start > 0 { + tokenAPI.Input.Byte.MoveCursorMulti(chunk[0:start]...) } - tokenAPI.Input.Byte.Accept(b2) + tokenAPI.Input.Byte.AcceptMulti(chunk[start:end]...) } else { - tokenAPI.Input.Byte.AcceptMulti(b0, b1, b2) + tokenAPI.Input.Byte.AcceptMulti(chunk[0:end]...) } + return true } } diff --git a/tokenize/handlers_builtin_test.go b/tokenize/handlers_builtin_test.go index 1f95df6..ad0f0d6 100644 --- a/tokenize/handlers_builtin_test.go +++ b/tokenize/handlers_builtin_test.go @@ -267,7 +267,7 @@ func TestIPv4Atoms(t *testing.T) { {"256123", tokenize.MatchOctet(false), false, ""}, {"300", tokenize.MatchOctet(false), false, ""}, - // Octet. + // // Octet. {"0", tokenize.MatchOctet(false), true, "0"}, {"02", tokenize.MatchOctet(false), true, "02"}, {"003", tokenize.MatchOctet(false), true, "003"},