diff --git a/examples/example_basiccalculator2_test.go b/examples/example_basiccalculator2_test.go index e404eba..3038862 100644 --- a/examples/example_basiccalculator2_test.go +++ b/examples/example_basiccalculator2_test.go @@ -133,7 +133,7 @@ func (calc *calculator) factor(p *parse.API) { var A, T = tokenize.A, tokenize.T p.Accept(A.Blanks) switch { - case p.Accept(T.Float64(nil, A.Signed(A.Float))): + case p.Accept(T.Float64(nil, A.Signed(A.Decimal))): value := p.Result.Tokens[0].Value.(float64) calc.interpreter.pushValue(value) case p.Accept(A.LeftParen): diff --git a/tokenize/api.go b/tokenize/api.go index eb9948f..e768ee3 100644 --- a/tokenize/api.go +++ b/tokenize/api.go @@ -170,6 +170,14 @@ func (i *API) Accept() { i.acceptRunes(i.lastRuneWidth, i.lastRune) } +func (i *API) skipBytes(bytes ...byte) { + for _, b := range bytes { + i.stackFrame.moveCursorByByte(b) + } + i.stackFrame.offset += len(bytes) + i.runeRead = false +} + func (i *API) acceptBytes(bytes ...byte) { curRuneEnd := i.stackFrame.runeEnd newRuneEnd := curRuneEnd + len(bytes) @@ -190,6 +198,14 @@ func (i *API) acceptBytes(bytes ...byte) { i.runeRead = false } +func (i *API) skipRunes(width int, runes ...rune) { + for _, r := range runes { + i.stackFrame.moveCursorByRune(r) + } + i.stackFrame.offset += width + i.runeRead = false +} + func (i *API) acceptRunes(width int, runes ...rune) { curRuneEnd := i.stackFrame.runeEnd newRuneEnd := curRuneEnd + len(runes) diff --git a/tokenize/handlers_builtin.go b/tokenize/handlers_builtin.go index 095965e..a181ecc 100644 --- a/tokenize/handlers_builtin.go +++ b/tokenize/handlers_builtin.go @@ -70,9 +70,11 @@ var A = struct { Byte func(byte) Handler Bytes func(...byte) Handler ByteRange func(byte, byte) Handler + ByteByCallback func(func(byte) bool) Handler Rune func(rune) Handler Runes func(...rune) Handler RuneRange func(rune, rune) Handler + RuneByCallback func(func(rune) bool) Handler Str func(string) Handler StrNoCase func(string) Handler EndOfLine 
Handler @@ -136,11 +138,11 @@ var A = struct { DigitNotZero Handler Digits Handler Zero Handler - Float Handler Boolean Handler - Integer Handler Signed func(Handler) Handler + Integer Handler IntegerBetween func(min int64, max int64) Handler + Decimal Handler ASCII Handler ASCIILower Handler ASCIIUpper Handler @@ -160,9 +162,11 @@ var A = struct { Byte: MatchByte, Bytes: MatchBytes, ByteRange: MatchByteRange, + ByteByCallback: MatchByteByCallback, Rune: MatchRune, Runes: MatchRunes, RuneRange: MatchRuneRange, + RuneByCallback: MatchRuneByCallback, Str: MatchStr, StrNoCase: MatchStrNoCase, EndOfFile: MatchEndOfFile(), @@ -172,51 +176,51 @@ var A = struct { AnyRune: MatchAnyRune(), ValidRune: MatchValidRune(), InvalidRune: MatchInvalidRune(), - Space: MatchRune(' '), - Tab: MatchRune('\t'), - CR: MatchRune('\r'), - LF: MatchRune('\n'), + Space: MatchByte(' '), + Tab: MatchByte('\t'), + CR: MatchByte('\r'), + LF: MatchByte('\n'), CRLF: MatchStr("\r\n"), - Excl: MatchRune('!'), - DoubleQuote: MatchRune('"'), - Hash: MatchRune('#'), - Dollar: MatchRune('$'), - Percent: MatchRune('%'), - Amp: MatchRune('&'), - SingleQuote: MatchRune('\''), - RoundOpen: MatchRune('('), - LeftParen: MatchRune('('), - RoundClose: MatchRune(')'), - RightParen: MatchRune(')'), - Asterisk: MatchRune('*'), - Multiply: MatchRune('*'), - Plus: MatchRune('+'), - Add: MatchRune('+'), - Comma: MatchRune(','), - Minus: MatchRune('-'), - Subtract: MatchRune('-'), - Dot: MatchRune('.'), - Slash: MatchRune('/'), - Divide: MatchRune('/'), - Colon: MatchRune(':'), - Semicolon: MatchRune(';'), - AngleOpen: MatchRune('<'), - LessThan: MatchRune('<'), - Equal: MatchRune('='), - AngleClose: MatchRune('>'), - GreaterThan: MatchRune('>'), - Question: MatchRune('?'), - At: MatchRune('@'), - SquareOpen: MatchRune('['), - Backslash: MatchRune('\\'), - SquareClose: MatchRune(']'), - Caret: MatchRune('^'), - Underscore: MatchRune('_'), - Backquote: MatchRune('`'), - CurlyOpen: MatchRune('{'), - Pipe: 
MatchRune('|'), - CurlyClose: MatchRune('}'), - Tilde: MatchRune('~'), + Excl: MatchByte('!'), + DoubleQuote: MatchByte('"'), + Hash: MatchByte('#'), + Dollar: MatchByte('$'), + Percent: MatchByte('%'), + Amp: MatchByte('&'), + SingleQuote: MatchByte('\''), + RoundOpen: MatchByte('('), + LeftParen: MatchByte('('), + RoundClose: MatchByte(')'), + RightParen: MatchByte(')'), + Asterisk: MatchByte('*'), + Multiply: MatchByte('*'), + Plus: MatchByte('+'), + Add: MatchByte('+'), + Comma: MatchByte(','), + Minus: MatchByte('-'), + Subtract: MatchByte('-'), + Dot: MatchByte('.'), + Slash: MatchByte('/'), + Divide: MatchByte('/'), + Colon: MatchByte(':'), + Semicolon: MatchByte(';'), + AngleOpen: MatchByte('<'), + LessThan: MatchByte('<'), + Equal: MatchByte('='), + AngleClose: MatchByte('>'), + GreaterThan: MatchByte('>'), + Question: MatchByte('?'), + At: MatchByte('@'), + SquareOpen: MatchByte('['), + Backslash: MatchByte('\\'), + SquareClose: MatchByte(']'), + Caret: MatchByte('^'), + Underscore: MatchByte('_'), + Backquote: MatchByte('`'), + CurlyOpen: MatchByte('{'), + Pipe: MatchByte('|'), + CurlyClose: MatchByte('}'), + Tilde: MatchByte('~'), Newline: MatchNewline(), Blank: MatchBlank(), Blanks: MatchBlanks(), @@ -225,11 +229,11 @@ var A = struct { Digit: MatchDigit(), DigitNotZero: MatchDigitNotZero(), Digits: MatchDigits(), - Zero: MatchRune('0'), - Integer: MatchInteger(), + Zero: MatchByte('0'), Signed: MatchSigned, + Integer: MatchInteger(true), IntegerBetween: MatchIntegerBetween, - Float: MatchFloat(), + Decimal: MatchDecimal(true), Boolean: MatchBoolean(), ASCII: MatchASCII(), ASCIILower: MatchASCIILower(), @@ -355,7 +359,7 @@ func MatchByte(expected byte) Handler { // MatchRune creates a Handler function that matches against the provided rune. 
func MatchRune(expected rune) Handler { - if expected <= 127 { + if expected <= '\x7F' { return MatchByte(byte(expected)) } return func(t *API) bool { @@ -392,7 +396,7 @@ func MatchRunes(expected ...rune) Handler { onlyBytes := true expectedBytes := make([]byte, len(expected)) for i, r := range expected { - if r > 255 { + if r > '\x7F' { onlyBytes = false break } @@ -448,7 +452,7 @@ func MatchRuneRange(start rune, end rune) Handler { if end < start { callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: start %q must not be < end %q", start, end) } - if end <= 127 { + if end <= '\x7F' { return MatchByteRange(byte(start), byte(end)) } return func(t *API) bool { @@ -574,6 +578,23 @@ func MatchUnicodeSpace() Handler { return MatchOneOrMore(MatchRuneByCallback(unicode.IsSpace)) } +// MatchByteByCallback creates a Handler that matches a single byte from the +// input against the provided callback function. When the callback returns true, +// it is considered a match. +// +// Note that the callback takes a byte, not a rune, so the unicode.Is* functions +// cannot be passed in directly; wrap them or use MatchRuneByCallback instead. +func MatchByteByCallback(callback func(byte) bool) Handler { + return func(t *API) bool { + b, err := t.PeekByte(0) + if err == nil && callback(b) { + t.acceptBytes(b) + return true + } + return false + } +} + // MatchRuneByCallback creates a Handler that matches a single rune from the // input against the provided callback function. When the callback returns true, // it is considered a match. 
@@ -621,7 +642,7 @@ func MatchStr(expected string) Handler { return func(t *API) bool { offset := 0 for _, e := range expectedRunes { - if e <= 127 { + if e <= '\x7F' { b, err := t.PeekByte(offset) if err != nil || b != byte(e) { return false @@ -650,7 +671,7 @@ func MatchStrNoCase(expected string) Handler { width := 0 i := 0 for _, e := range expected { - if e <= 127 { + if e <= '\x7F' { b, err := t.PeekByte(width) if err != nil || (b != byte(e) && unicode.ToUpper(rune(b)) != unicode.ToUpper(e)) { return false @@ -732,9 +753,9 @@ func MatchNot(handler Handler) Handler { return false } t.Dispose(child) - _, err := t.NextRune() + r, w, err := t.PeekRune(0) if err == nil { - t.Accept() + t.acceptRunes(w, r) return true } return false @@ -924,8 +945,24 @@ func MakeInputFlusher(handler Handler) Handler { // // C.Signed(A.Integer) func MatchSigned(handler Handler) Handler { - sign := MatchOptional(MatchAny(MatchRune('+'), MatchRune('-'))) - return MatchSeq(sign, handler) + return func(t *API) bool { + child := t.Fork() + b, err := t.PeekByte(0) + if err != nil { + t.Dispose(child) + return false + } + if b == '-' || b == '+' { + t.acceptBytes(b) + } + if handler(t) { + t.Merge(child) + t.Dispose(child) + return true + } + t.Dispose(child) + return false + } } // MatchIntegerBetween creates a Handler that checks for an integer @@ -956,7 +993,7 @@ func MatchIntegerBetween(min int64, max int64) Handler { func MatchEndOfFile() Handler { return func(t *API) bool { child := t.Fork() - _, err := t.NextRune() + _, err := t.PeekByte(0) t.Dispose(child) return err == io.EOF } @@ -1024,37 +1061,157 @@ func MatchInvalidRune() Handler { // MatchDigit creates a Handler that checks if a single digit can be read // from the input. func MatchDigit() Handler { - return MatchRuneRange('0', '9') + return MatchByteRange('0', '9') } // MatchDigits creates a Handler that checks if one or more digits can be read // from the input. 
func MatchDigits() Handler { - return MatchOneOrMore(MatchDigit()) + return func(t *API) bool { + // Check if the first character is a digit. + b, err := t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + return false + } + t.acceptBytes(b) + + // Continue accepting bytes as long as they are digits. + for { + b, err := t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + return true + } + t.acceptBytes(b) + } + } } // MatchDigitNotZero creates a Handler that checks if a single digit not equal // to zero '0' can be read from the input. func MatchDigitNotZero() Handler { - return MatchRuneRange('1', '9') + return MatchByteRange('1', '9') } // MatchInteger creates a Handler function that checks if a valid integer -// can be read from the input. In line with Go, an integer cannot start with -// a zero. Starting with a zero is used to indicate other bases, like octal or -// hexadecimal. -func MatchInteger() Handler { - justZero := MatchRune('0') - integer := MatchSeq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit())) - return MatchAny(integer, justZero) +// can be read from the input. +// +// Leading zeroes are allowed. When the normalize parameter is true, these +// will be stripped from the input. +func MatchInteger(normalize bool) Handler { + return func(t *API) bool { + // Check if the first character is a digit. + b, err := t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + return false + } + + // When normalization is requested, drop leading zeroes. + if normalize && b == '0' { + for { + b2, err := t.PeekByte(1) + + // The next character is a zero, skip the leading zero and check again. + if err == nil && b2 == b { + t.skipBytes('0') + continue + } + // The next character is not a zero, nor a digit at all. + // We're looking at a zero on its own here. + if err != nil || b2 < '1' || b2 > '9' { + t.acceptBytes('0') + return true + } + // The next character is a digit. Skip the leading zero and go with the digit. 
+ t.skipBytes('0') + t.acceptBytes(b2) + break + } + } + + // Continue accepting bytes as long as they are digits. + for { + b, err := t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + return true + } + t.acceptBytes(b) + } + } } -// MatchFloat creates a Handler function that checks if a valid float value -// can be read from the input. In case the fractional part is missing, this -// Handler will report a match, so both "123" and "123.123" will match. -func MatchFloat() Handler { - digits := MatchDigits() - return MatchSeq(digits, MatchOptional(MatchSeq(MatchRune('.'), digits))) +// MatchDecimal creates a Handler function that checks if a valid decimal value +// can be read from the input. In case the fractional part is missing (which is +// a valid decimal number), this Handler will report a match, so both "123" and +// "123.123" will match. +// +// Leading zeroes are allowed. When the normalize parameter is true, these +// will be stripped from the input. +func MatchDecimal(normalize bool) Handler { + return func(t *API) bool { + // Check if the first character is a digit. + b, err := t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + return false + } + + // When normalization is requested, drop leading zeroes. + if normalize && b == '0' { + for { + b2, err := t.PeekByte(1) + + // The next character is a zero, skip the leading zero and check again. + if err == nil && b2 == b { + t.skipBytes('0') + continue + } + // The next character is a dot, go with the zero before the dot and + // let the upcoming code handle the dot. + if err == nil && b2 == '.' { + t.acceptBytes('0') + break + } + // The next character is not a zero, nor a digit at all. + // We're looking at a zero on its own here. + if err != nil || b2 < '1' || b2 > '9' { + t.acceptBytes('0') + return true + } + // The next character is a digit. Skip the leading zero and go with the digit. 
+ t.skipBytes('0') + t.acceptBytes(b2) + break + } + } + + // Continue accepting bytes as long as they are digits. + for { + b, err = t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + break + } + t.acceptBytes(b) + } + + // No dot or no digit after a dot? Then we're done. + if b != '.' { + return true + } + b, err = t.PeekByte(1) + if err != nil || b < '0' || b > '9' { + return true + } + + // Continue accepting bytes as long as they are digits. + t.acceptBytes('.', b) + for { + b, err = t.PeekByte(0) + if err != nil || b < '0' || b > '9' { + break + } + t.acceptBytes(b) + } + return true + } } // MatchBoolean creates a Handler function that checks if a boolean @@ -1075,7 +1232,11 @@ func MatchBoolean() Handler { return true } if b1 == 't' || b1 == 'T' { - b2, _ := t.PeekByte(1) + b2, err := t.PeekByte(1) + if err != nil || (b2 != 'R' && b2 != 'r') { + t.acceptBytes(b1) + return true + } b3, _ := t.PeekByte(2) b4, err := t.PeekByte(3) if err == nil && b2 == 'r' && b3 == 'u' && b4 == 'e' { @@ -1091,11 +1252,14 @@ func MatchBoolean() Handler { } if b1 == 'f' || b1 == 'F' { - b2, _ := t.PeekByte(1) + b2, err := t.PeekByte(1) + if err != nil || (b2 != 'A' && b2 != 'a') { + t.acceptBytes(b1) + return true + } b3, _ := t.PeekByte(2) b4, _ := t.PeekByte(3) b5, err := t.PeekByte(4) - if err == nil && b2 == 'a' && b3 == 'l' && b4 == 's' && b5 == 'e' { t.acceptBytes(b1, b2, b3, b4, b5) return true @@ -1114,19 +1278,19 @@ func MatchBoolean() Handler { // MatchASCII creates a Handler function that matches against any // ASCII value on the input. func MatchASCII() Handler { - return MatchRuneRange('\x00', '\x7F') + return MatchByteRange('\x00', '\x7F') } // MatchASCIILower creates a Handler function that matches against any // lower case ASCII letter on the input (a - z). 
func MatchASCIILower() Handler { - return MatchRuneRange('a', 'z') + return MatchByteRange('a', 'z') } // MatchASCIIUpper creates a Handler function that matches against any // upper case ASCII letter on the input (a - z). func MatchASCIIUpper() Handler { - return MatchRuneRange('A', 'Z') + return MatchByteRange('A', 'Z') } // MatchUnicodeLetter creates a Handler function that matches against any @@ -1365,19 +1529,15 @@ func MatchIPv6Net(normalize bool) Handler { // In both cases, it would match the first form. func ModifyDrop(handler Handler) Handler { return func(t *API) bool { - child := t.Fork() + runeEnd := t.stackFrame.runeEnd + tokenEnd := t.stackFrame.tokenEnd if handler(t) { - // Do a partial merge: only move the cursor and read offset forward. - // Any produced runes and tokens are ignored and not merged to the parent - // (since we're dropping those here). - parent := &t.stackFrames[t.stackLevel-1] - parent.offset = t.stackFrame.offset - parent.line = t.stackFrame.line - parent.column = t.stackFrame.column - t.Dispose(child) + // We keep offset and cursor updates, but rollback any runes / tokens + // that were added by the handler. 
+ t.stackFrame.runeEnd = runeEnd + t.stackFrame.tokenEnd = tokenEnd return true } - t.Dispose(child) return false } } diff --git a/tokenize/handlers_builtin_test.go b/tokenize/handlers_builtin_test.go index 1d754e2..673e9b4 100644 --- a/tokenize/handlers_builtin_test.go +++ b/tokenize/handlers_builtin_test.go @@ -231,22 +231,27 @@ func TestAtoms(t *testing.T) { {"F", a.HexDigit, true, "F"}, {"g", a.HexDigit, false, "g"}, {"G", a.HexDigit, false, "G"}, + {"09", a.Integer, true, "9"}, + {"0000129", a.Integer, true, "129"}, {"0", a.Integer, true, "0"}, - {"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is valid for the integer + {"00000", a.Integer, true, "0"}, {"1", a.Integer, true, "1"}, {"-10X", a.Integer, false, ""}, {"+10X", a.Integer, false, ""}, {"-10X", a.Signed(a.Integer), true, "-10"}, {"+10X", a.Signed(a.Integer), true, "+10"}, {"+10.1X", a.Signed(a.Integer), true, "+10"}, - {"0X", a.Float, true, "0"}, - {"0X", a.Float, true, "0"}, - {"1X", a.Float, true, "1"}, - {"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up - {"123.321X", a.Float, true, "123.321"}, - {"-3.14X", a.Float, false, ""}, - {"-3.14X", a.Signed(a.Float), true, "-3.14"}, - {"-003.0014X", a.Signed(a.Float), true, "-003.0014"}, + {"0X", a.Decimal, true, "0"}, + {"0000X", a.Decimal, true, "0"}, + {"1X", a.Decimal, true, "1"}, + {"01X", a.Decimal, true, "1"}, + {"000001X", a.Decimal, true, "1"}, + {"1.", a.Decimal, true, "1"}, // incomplete float, so only the 1 is picked up + {"123.321X", a.Decimal, true, "123.321"}, + {"0.6X", a.Decimal, true, "0.6"}, + {"-3.14X", a.Decimal, false, ""}, + {"-3.14X", a.Signed(a.Decimal), true, "-3.14"}, + {"-003.0014X", a.Signed(a.Decimal), true, "-3.0014"}, {"-11", a.IntegerBetween(-10, 10), false, "0"}, {"-10", a.IntegerBetween(-10, 10), true, "-10"}, {"0", a.IntegerBetween(-10, 10), true, "0"}, @@ -430,8 +435,8 @@ func TestTokenMakers(t *testing.T) { {`4294967295XYZ`, tok.Uint32("L", a.Integer), 
[]tokenize.Token{{Type: "L", Value: uint32(4294967295)}}}, {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}}, - {`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}}, - {`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}}, + {`3.1415=PI`, tok.Float32("N", a.Decimal), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}}, + {`24.19287=PI`, tok.Float64("O", a.Decimal), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}}, {`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ {Type: "P", Value: true},