More speed improvements.

This commit is contained in:
Maurice Makaay 2019-07-29 22:52:38 +00:00
parent 8ef9aed096
commit b9cc91c0ae
3 changed files with 362 additions and 324 deletions

View File

@ -8,36 +8,6 @@ import (
"git.makaay.nl/mauricem/go-parsekit/tokenize"
)
// BenchmarkMemclrOptimization measures resetting every element of a small
// slice of structs to its zero value by ranging over the slice.
// NOTE(review): judging by the name, this is meant to exercise the compiler's
// memclr rewrite of zeroing range loops — confirm before relying on it.
func BenchmarkMemclrOptimization(b *testing.B) {
	// TODO use or cleanup this one and the next. I'm playing around here.
	type record struct {
		num  int
		text string
	}
	records := []record{{10, "hoi"}, {20, "doei"}, {30, "jadag"}}
	for round := 0; round < b.N; round++ {
		// The inner index gets its own name so it no longer shadows the
		// benchmark iteration counter.
		for idx := range records {
			records[idx] = record{}
		}
	}
}
// BenchmarkCodedClear measures resetting the three elements of a small slice
// of structs to their zero value using one explicit assignment per element
// (no inner loop).
func BenchmarkCodedClear(b *testing.B) {
	type record struct {
		num  int
		text string
	}
	records := []record{{10, "hoi"}, {20, "doei"}, {30, "jadag"}}
	for round := 0; round < b.N; round++ {
		records[0] = record{}
		records[1] = record{}
		records[2] = record{}
	}
}
// ExampleNewAPI shows the call to tokenize.NewAPI, passing the input to be
// handled as a string.
func ExampleNewAPI() {
	// Whatever NewAPI returns is not used here; the example only shows the call.
	tokenize.NewAPI("The input that the API will handle")
}

View File

@ -25,37 +25,39 @@ import (
//
// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
var C = struct {
	// Each field is either a function that builds a Handler from one or more
	// other Handlers, or (for Min/Max/Repeated/MinMax) from counts plus a
	// Handler. Every field name is declared exactly once.
	Any              func(...Handler) Handler
	Not              func(Handler) Handler
	Seq              func(...Handler) Handler
	Min              func(min int, handler Handler) Handler
	Max              func(max int, handler Handler) Handler
	Repeated         func(times int, handler Handler) Handler
	Optional         func(Handler) Handler
	ZeroOrMore       func(Handler) Handler
	OneOrMore        func(Handler) Handler
	MinMax           func(min int, max int, handler Handler) Handler
	Separated        func(separator Handler, separated Handler) Handler
	Except           func(except Handler, handler Handler) Handler
	FollowedBy       func(lookAhead Handler, handler Handler) Handler
	NotFollowedBy    func(lookAhead Handler, handler Handler) Handler
	InOptionalBlanks func(handler Handler) Handler
	FlushInput       func(Handler) Handler
}{
	// Bind every field to the package-level function that implements it.
	Any:              MatchAny,
	Not:              MatchNot,
	Seq:              MatchSeq,
	Min:              MatchMin,
	Max:              MatchMax,
	Repeated:         MatchRep,
	Optional:         MatchOptional,
	ZeroOrMore:       MatchZeroOrMore,
	OneOrMore:        MatchOneOrMore,
	MinMax:           MatchMinMax,
	Separated:        MatchSeparated,
	Except:           MatchExcept,
	FollowedBy:       MatchFollowedBy,
	NotFollowedBy:    MatchNotFollowedBy,
	InOptionalBlanks: MatchInOptionalBlanks,
	FlushInput:       MakeInputFlusher,
}
// A provides convenient access to a range of atoms or functions to build atoms.
@ -67,181 +69,183 @@ var C = struct {
//
// Doing so saves you a lot of typing, and it makes your code a lot cleaner.
var A = struct {
	// Fields of type Handler are ready-made atoms; fields of a func type
	// build an atom from their arguments. Every field name is declared
	// exactly once.
	Char            func(...rune) Handler
	CharRange       func(...rune) Handler
	ByteByCallback  func(func(byte) bool) Handler
	BytesByCallback func(func(byte) bool) Handler
	RuneByCallback  func(func(rune) bool) Handler
	AnyByte         Handler
	AnyRune         Handler
	ValidRune       Handler
	InvalidRune     Handler
	Str             func(string) Handler
	StrNoCase       func(string) Handler
	EndOfLine       Handler
	EndOfFile       Handler
	UntilEndOfLine  Handler
	Space           Handler
	Tab             Handler
	CR              Handler
	LF              Handler
	CRLF            Handler
	Excl            Handler
	DoubleQuote     Handler
	Hash            Handler
	Dollar          Handler
	Percent         Handler
	Amp             Handler
	SingleQuote     Handler
	RoundOpen       Handler
	LeftParen       Handler
	RoundClose      Handler
	RightParen      Handler
	Asterisk        Handler
	Multiply        Handler
	Plus            Handler
	Add             Handler
	Comma           Handler
	Minus           Handler
	Subtract        Handler
	Dot             Handler
	Slash           Handler
	Divide          Handler
	Colon           Handler
	Semicolon       Handler
	AngleOpen       Handler
	LessThan        Handler
	Equal           Handler
	AngleClose      Handler
	GreaterThan     Handler
	Question        Handler
	At              Handler
	SquareOpen      Handler
	Backslash       Handler
	SquareClose     Handler
	Caret           Handler
	Underscore      Handler
	Backquote       Handler
	CurlyOpen       Handler
	Pipe            Handler
	CurlyClose      Handler
	Tilde           Handler
	Newline         Handler
	Blank           Handler
	Blanks          Handler
	Whitespace      Handler
	UnicodeSpace    Handler
	Digit           Handler
	DigitNotZero    Handler
	Digits          Handler
	Zero            Handler
	Boolean         Handler
	Signed          func(Handler) Handler
	Integer         Handler
	IntegerBetween  func(min int64, max int64) Handler
	Decimal         Handler
	ASCII           Handler
	ASCIILower      Handler
	ASCIIUpper      Handler
	Letter          Handler
	Lower           Handler
	Upper           Handler
	HexDigit        Handler
	Octet           Handler
	IPv4            Handler
	IPv4CIDRMask    Handler
	IPv4Netmask     Handler
	IPv4Net         Handler
	IPv6            Handler
	IPv6CIDRMask    Handler
	IPv6Net         Handler
}{
	// Several names are deliberate aliases for the same character, e.g.
	// RoundOpen/LeftParen, Asterisk/Multiply, Plus/Add, Minus/Subtract,
	// Slash/Divide, AngleOpen/LessThan and AngleClose/GreaterThan.
	Char:            MatchChar,
	CharRange:       MatchCharRange,
	ByteByCallback:  MatchByteByCallback,
	BytesByCallback: MatchBytesByCallback,
	RuneByCallback:  MatchRuneByCallback,
	AnyByte:         MatchAnyByte(),
	AnyRune:         MatchAnyRune(),
	ValidRune:       MatchValidRune(),
	InvalidRune:     MatchInvalidRune(),
	Str:             MatchStr,
	StrNoCase:       MatchStrNoCase,
	EndOfFile:       MatchEndOfFile(),
	EndOfLine:       MatchEndOfLine(),
	UntilEndOfLine:  MatchUntilEndOfLine(),
	Space:           MatchChar(' '),
	Tab:             MatchChar('\t'),
	CR:              MatchChar('\r'),
	LF:              MatchChar('\n'),
	CRLF:            MatchStr("\r\n"),
	Excl:            MatchChar('!'),
	DoubleQuote:     MatchChar('"'),
	Hash:            MatchChar('#'),
	Dollar:          MatchChar('$'),
	Percent:         MatchChar('%'),
	Amp:             MatchChar('&'),
	SingleQuote:     MatchChar('\''),
	RoundOpen:       MatchChar('('),
	LeftParen:       MatchChar('('),
	RoundClose:      MatchChar(')'),
	RightParen:      MatchChar(')'),
	Asterisk:        MatchChar('*'),
	Multiply:        MatchChar('*'),
	Plus:            MatchChar('+'),
	Add:             MatchChar('+'),
	Comma:           MatchChar(','),
	Minus:           MatchChar('-'),
	Subtract:        MatchChar('-'),
	Dot:             MatchChar('.'),
	Slash:           MatchChar('/'),
	Divide:          MatchChar('/'),
	Colon:           MatchChar(':'),
	Semicolon:       MatchChar(';'),
	AngleOpen:       MatchChar('<'),
	LessThan:        MatchChar('<'),
	Equal:           MatchChar('='),
	AngleClose:      MatchChar('>'),
	GreaterThan:     MatchChar('>'),
	Question:        MatchChar('?'),
	At:              MatchChar('@'),
	SquareOpen:      MatchChar('['),
	Backslash:       MatchChar('\\'),
	SquareClose:     MatchChar(']'),
	Caret:           MatchChar('^'),
	Underscore:      MatchChar('_'),
	Backquote:       MatchChar('`'),
	CurlyOpen:       MatchChar('{'),
	Pipe:            MatchChar('|'),
	CurlyClose:      MatchChar('}'),
	Tilde:           MatchChar('~'),
	Newline:         MatchNewline(),
	Blank:           MatchBlank(),
	Blanks:          MatchBlanks(),
	Whitespace:      MatchWhitespace(),
	UnicodeSpace:    MatchUnicodeSpace(),
	Digit:           MatchDigit(),
	DigitNotZero:    MatchDigitNotZero(),
	Digits:          MatchDigits(),
	Zero:            MatchChar('0'),
	Signed:          MatchSigned,
	Integer:         MatchInteger(true),
	IntegerBetween:  MatchIntegerBetween,
	Decimal:         MatchDecimal(true),
	Boolean:         MatchBoolean(),
	ASCII:           MatchASCII(),
	ASCIILower:      MatchASCIILower(),
	ASCIIUpper:      MatchASCIIUpper(),
	Letter:          MatchUnicodeLetter(),
	Lower:           MatchUnicodeLower(),
	Upper:           MatchUnicodeUpper(),
	HexDigit:        MatchHexDigit(),
	Octet:           MatchOctet(true),
	IPv4:            MatchIPv4(true),
	IPv4CIDRMask:    MatchIPv4CIDRMask(true),
	IPv4Netmask:     MatchIPv4Netmask(true),
	IPv4Net:         MatchIPv4Net(true),
	IPv6:            MatchIPv6(true),
	IPv6CIDRMask:    MatchIPv6CIDRMask(true),
	IPv6Net:         MatchIPv6Net(true),
}
// M provides convenient access to a range of modifiers (which in their nature are
@ -552,21 +556,32 @@ func MatchBlank() Handler {
// like a vertical tab, then make use of MatchUnicodeSpace().
func MatchBlanks() Handler {
return func(tokenAPI *API) bool {
// Match the first blank.
b, err := tokenAPI.Input.Byte.Peek(0)
if err != nil || (b != ' ' && b != '\t') {
return false
f := tokenAPI.Input.Byte.AcceptMulti
if tokenAPI.Output.suspended > 0 {
f = tokenAPI.Input.Byte.MoveCursorMulti
}
tokenAPI.Input.Byte.Accept(b)
// Now match any number of followup blanks. We've already got
// a successful match at this point, so we'll always return true at the end.
ok := false
for {
b, err := tokenAPI.Input.Byte.Peek(0)
if err != nil || (b != ' ' && b != '\t') {
return true
chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128)
for i, b := range chunk {
if b != ' ' && b != '\t' {
if i > 0 {
f(chunk[:i]...)
}
return ok
}
ok = true
}
tokenAPI.Input.Byte.Accept(b)
if err != nil {
if err == io.EOF {
if len(chunk) > 0 {
f(chunk...)
}
return ok
}
return false
}
f(chunk...)
}
}
}
@ -576,37 +591,32 @@ func MatchBlanks() Handler {
// carriage return '\r' followed by a newline '\n' (CRLF).
func MatchWhitespace() Handler {
return func(tokenAPI *API) bool {
// Match the first whitespace.
b1, err := tokenAPI.Input.Byte.Peek(0)
if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') {
return false
f := tokenAPI.Input.Byte.AcceptMulti
if tokenAPI.Output.suspended > 0 {
f = tokenAPI.Input.Byte.MoveCursorMulti
}
if b1 == '\r' {
b2, err := tokenAPI.Input.Byte.Peek(1)
if err != nil || b2 != '\n' {
ok := false
for {
chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128)
for i, b := range chunk {
if b != ' ' && b != '\t' && b != '\n' && b != '\r' {
if i > 0 {
f(chunk[:i]...)
}
return ok
}
ok = true
}
if err != nil {
if err == io.EOF {
if len(chunk) > 0 {
f(chunk...)
}
return ok
}
return false
}
tokenAPI.Input.Byte.AcceptMulti(b1, b2)
} else {
tokenAPI.Input.Byte.Accept(b1)
}
// Now match any number of followup whitespace. We've already got
// a successful match at this point, so we'll always return true at the end.
for {
b1, err := tokenAPI.Input.Byte.Peek(0)
if err != nil || (b1 != ' ' && b1 != '\t' && b1 != '\n' && b1 != '\r') {
return true
}
if b1 == '\r' {
b2, err := tokenAPI.Input.Byte.Peek(1)
if err != nil || b2 != '\n' {
return true
}
tokenAPI.Input.Byte.AcceptMulti(b1, b2)
} else {
tokenAPI.Input.Byte.Accept(b1)
}
f(chunk...)
}
}
}
@ -620,9 +630,6 @@ func MatchUnicodeSpace() Handler {
// MatchByteByCallback creates a Handler that matches a single byte from the
// input against the provided callback function. When the callback returns true,
// it is considered a match.
//
// Note that the callback receives a single byte, so the unicode.Is* functions
// (which take a rune) cannot be passed here; use MatchRuneByCallback for those,
// e.g. MatchRuneByCallback(unicode.IsLower).
func MatchByteByCallback(callback func(byte) bool) Handler {
return func(tokenAPI *API) bool {
b, err := tokenAPI.Input.Byte.Peek(0)
@ -634,6 +641,41 @@ func MatchByteByCallback(callback func(byte) bool) Handler {
}
}
// MatchBytesByCallback creates a Handler that matches a run of one or more
// bytes from the input, for as long as the provided callback returns true for
// each byte. The Handler reports a match when at least one byte satisfied the
// callback.
//
// The input is inspected in chunks of up to 128 bytes. Matched bytes are
// consumed through AcceptMulti, or through MoveCursorMulti while
// tokenAPI.Output.suspended is greater than zero. A read error other than
// io.EOF makes the Handler return false.
func MatchBytesByCallback(callback func(byte) bool) Handler {
	return func(tokenAPI *API) bool {
		consume := tokenAPI.Input.Byte.AcceptMulti
		if tokenAPI.Output.suspended > 0 {
			consume = tokenAPI.Input.Byte.MoveCursorMulti
		}

		matched := false
		for {
			chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128)

			// Count the leading bytes of this chunk that satisfy the callback.
			n := 0
			for n < len(chunk) && callback(chunk[n]) {
				n++
			}
			if n > 0 {
				matched = true
			}

			// A rejected byte inside the chunk ends the run right there.
			if n < len(chunk) {
				if n > 0 {
					consume(chunk[:n]...)
				}
				return matched
			}

			// The whole chunk matched; what happens next depends on the
			// peek result.
			switch {
			case err == nil:
				consume(chunk...)
			case err == io.EOF:
				if len(chunk) > 0 {
					consume(chunk...)
				}
				return matched
			default:
				return false
			}
		}
	}
}
// MatchRuneByCallback creates a Handler that matches a single rune from the
// input against the provided callback function. When the callback returns true,
// it is considered a match.
@ -947,6 +989,37 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
}
}
// MatchInOptionalBlanks creates a Handler that applies the provided handler
// with optional blanks (spaces and tabs) skipped on both sides of it.
//
// Blanks in front of the match are skipped before the handler is tried. When
// the handler does not match, false is returned and no trailing blanks are
// skipped. When it does match, blanks following the match are skipped as well
// and true is returned.
func MatchInOptionalBlanks(handler Handler) Handler {
	return func(tokenAPI *API) bool {
		skipBlanks(tokenAPI)
		matched := handler(tokenAPI)
		if matched {
			skipBlanks(tokenAPI)
		}
		return matched
	}
}
// skipBlanks moves the input cursor past a run of blanks (spaces and tabs)
// at the current read position. The blanks are consumed via MoveCursorMulti,
// presumably so that they do not end up in the output (TODO confirm against
// the Input.Byte implementation).
//
// The input is peeked in chunks of up to 128 bytes. Scanning stops at the
// first non-blank byte, on any peek error (including end of input), or when
// no bytes could be peeked at all.
func skipBlanks(tokenAPI *API) {
	for {
		bs, err := tokenAPI.Input.Byte.PeekMulti(0, 128)
		for i, b := range bs {
			if b != ' ' && b != '\t' {
				// Only the blanks in front of this byte are skipped.
				if i > 0 {
					tokenAPI.Input.Byte.MoveCursorMulti(bs[:i]...)
				}
				return
			}
		}

		// The whole chunk consists of blanks. It must be consumed before
		// peeking again: peeking at offset 0 without moving the cursor would
		// return the same bytes and this loop would never terminate.
		if len(bs) > 0 {
			tokenAPI.Input.Byte.MoveCursorMulti(bs...)
		}

		// Stop on error / end of input, and also when nothing was peeked,
		// since an empty chunk can never make progress.
		if err != nil || len(bs) == 0 {
			return
		}
	}
}
// MakeInputFlusher creates a Handler that will flush the input buffer when the
// provided handler matches.
//
@ -1037,31 +1110,35 @@ func MatchUntilEndOfLine() Handler {
f = tokenAPI.Input.Byte.MoveCursorMulti
}
for {
bs, err := tokenAPI.Input.Byte.PeekMulti(0, 128)
chunk, err := tokenAPI.Input.Byte.PeekMulti(0, 128)
state := 0
for i, b := range bs {
ok := false
for i, b := range chunk {
if b == '\r' {
state = 1
continue
}
if b == '\n' {
if state == 1 {
f(bs[:i+1]...)
} else {
f(bs[:i]...)
f(chunk[:i+1]...)
} else if i > 0 {
f(chunk[:i]...)
}
return true
return ok
}
state = 0
ok = true
}
if err != nil {
if err == io.EOF {
f(bs...)
return true
if len(chunk) > 0 {
f(chunk...)
}
return ok
}
return false
}
f(bs...)
f(chunk...)
}
}
}
@ -1350,50 +1427,41 @@ func MatchHexDigit() Handler {
// stripped from the octet.
func MatchOctet(normalize bool) Handler {
return func(tokenAPI *API) bool {
// Digit 1
b0, err := tokenAPI.Input.Byte.Peek(0)
if err != nil || b0 < '0' || b0 > '9' {
chunk, _ := tokenAPI.Input.Byte.PeekMulti(0, 3)
value := 0
start := 0
end := 0
for i, b := range chunk {
if b < '0' || b > '9' {
if i == 0 {
return false
}
break
}
if b == '0' && value == 0 {
start++
} else {
value = value*10 + int(b-'0')
}
end++
}
if value > 255 {
return false
}
// Digit 2
b1, err := tokenAPI.Input.Byte.Peek(1)
if err != nil || b1 < '0' || b1 > '9' {
// Output 1-digit octet.
tokenAPI.Input.Byte.Accept(b0)
return true
}
// Digit 3
b2, err := tokenAPI.Input.Byte.Peek(2)
if err != nil || b2 < '0' || b2 > '9' {
// Output 2-digit octet.
if normalize && b0 == '0' {
tokenAPI.Input.Byte.MoveCursor(b0)
tokenAPI.Input.Byte.Accept(b1)
} else {
tokenAPI.Input.Byte.AcceptMulti(b0, b1)
if normalize {
if value == 0 {
start--
}
return true
}
// The value of the octet must be between 0 - 255.
if b0 > '2' || (b0 == '2' && b1 > '5') || (b0 == '2' && b1 == '5' && b2 > '5') {
return false
}
// Output 3-digit octet.
if normalize && b0 == '0' {
tokenAPI.Input.Byte.MoveCursor(b0)
if b1 == '0' {
tokenAPI.Input.Byte.MoveCursor(b1)
} else {
tokenAPI.Input.Byte.Accept(b1)
if start > 0 {
tokenAPI.Input.Byte.MoveCursorMulti(chunk[0:start]...)
}
tokenAPI.Input.Byte.Accept(b2)
tokenAPI.Input.Byte.AcceptMulti(chunk[start:end]...)
} else {
tokenAPI.Input.Byte.AcceptMulti(b0, b1, b2)
tokenAPI.Input.Byte.AcceptMulti(chunk[0:end]...)
}
return true
}
}

View File

@ -267,7 +267,7 @@ func TestIPv4Atoms(t *testing.T) {
{"256123", tokenize.MatchOctet(false), false, ""},
{"300", tokenize.MatchOctet(false), false, ""},
// Octet.
// // Octet.
{"0", tokenize.MatchOctet(false), true, "0"},
{"02", tokenize.MatchOctet(false), true, "02"},
{"003", tokenize.MatchOctet(false), true, "003"},