diff --git a/examples/example_basiccalculator1_test.go b/examples/example_basiccalculator1_test.go index a2eb35d..634ca34 100644 --- a/examples/example_basiccalculator1_test.go +++ b/examples/example_basiccalculator1_test.go @@ -3,8 +3,8 @@ // // 10 + 20 - 8+4 // -// So positive numbers that can be either added or substracted, and whitespace -// is ignored. +// So positive numbers that can be either added or subtracted, and blanks +// around numbers are ignored. package examples import ( @@ -69,9 +69,9 @@ type simpleCalculator struct { op int64 // represents operation for next term (+1 = add, -1 = subtract) } -// A definition of an int64, which conveniently drops surrounding whitespace. -var dropWhitespace = parsekit.M.Drop(parsekit.C.Opt(parsekit.A.Whitespace)) -var bareInteger = parsekit.C.Seq(dropWhitespace, parsekit.A.Integer, dropWhitespace) +// A definition of an int64, which conveniently drops surrounding blanks. +var dropBlank = parsekit.M.Drop(parsekit.C.Opt(parsekit.A.Blank)) +var bareInteger = parsekit.C.Seq(dropBlank, parsekit.A.Integer, dropBlank) var int64Token = parsekit.T.Int64(nil, bareInteger) func (c *simpleCalculator) number(p *parsekit.ParseAPI) { diff --git a/examples/example_basiccalculator2_test.go b/examples/example_basiccalculator2_test.go index dfd7ff5..47ce0ce 100644 --- a/examples/example_basiccalculator2_test.go +++ b/examples/example_basiccalculator2_test.go @@ -130,7 +130,7 @@ func (c *calculator) term(p *parsekit.ParseAPI) { // = (FLOAT | LPAREN RPAREN) func (c *calculator) factor(p *parsekit.ParseAPI) { var A, T = parsekit.A, parsekit.T - p.On(A.Whitespace).Skip() + p.On(A.Blank).Skip() switch { case p.On(T.Float64(nil, A.Signed(A.Float))).Accept(): value := p.Result().Value(0).(float64) @@ -147,7 +147,7 @@ func (c *calculator) factor(p *parsekit.ParseAPI) { p.UnexpectedInput("factor or (expression)") return } - p.On(A.Whitespace).Skip() + p.On(A.Blank).Skip() } // 
--------------------------------------------------------------------------- diff --git a/examples/example_dutchpostcode_test.go b/examples/example_dutchpostcode_test.go index dbd6fa4..7d81efc 100644 --- a/examples/example_dutchpostcode_test.go +++ b/examples/example_dutchpostcode_test.go @@ -65,7 +65,7 @@ func createPostcodeTokenizer() *parsekit.Tokenizer { pcDigits := C.Seq(digitNotZero, C.Rep(3, A.Digit)) pcLetter := C.Any(A.ASCIILower, A.ASCIIUpper) pcLetters := M.ToUpper(C.Seq(pcLetter, pcLetter)) - space := M.Replace(C.Opt(A.Whitespace), " ") + space := M.Replace(C.Opt(A.Blank), " ") postcode := C.Seq(T.Str("PCD", pcDigits), space, T.Str("PCL", pcLetters), A.EndOfFile) // Create a Tokenizer that wraps the 'postcode' TokenHandler and allows diff --git a/examples/example_helloManyStateParser_test.go b/examples/example_helloManyStateParser_test.go index 2c0be6c..f2bb6b6 100644 --- a/examples/example_helloManyStateParser_test.go +++ b/examples/example_helloManyStateParser_test.go @@ -90,7 +90,7 @@ func (h *helloparser1) start(p *parsekit.ParseAPI) { func (h *helloparser1) comma(p *parsekit.ParseAPI) { a := parsekit.A switch { - case p.On(a.Whitespace).Skip(): + case p.On(a.Blank).Skip(): p.Handle(h.comma) case p.On(a.Comma).Skip(): p.Handle(h.startName) @@ -102,7 +102,7 @@ func (h *helloparser1) comma(p *parsekit.ParseAPI) { func (h *helloparser1) startName(p *parsekit.ParseAPI) { c, a := parsekit.C, parsekit.A switch { - case p.On(a.Whitespace).Skip(): + case p.On(a.Blank).Skip(): p.Handle(h.startName) case p.On(c.Not(a.Excl)).Stay(): p.Handle(h.name) diff --git a/examples/example_helloParserCombinator_test.go b/examples/example_helloParserCombinator_test.go index 76faa16..2137317 100644 --- a/examples/example_helloParserCombinator_test.go +++ b/examples/example_helloParserCombinator_test.go @@ -54,8 +54,8 @@ func createHelloTokenizer() *parsekit.Tokenizer { // that does all the work. 
The 'greeting' TokenHandler matches the whole input and // drops all but the name from it. hello := a.StrNoCase("hello") - comma := c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace)) - separator := c.Any(comma, a.Whitespace) + comma := c.Seq(c.Opt(a.Blank), a.Comma, c.Opt(a.Blank)) + separator := c.Any(comma, a.Blank) name := c.OneOrMore(c.Not(a.Excl)) greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl), a.EndOfFile) diff --git a/examples/example_helloSingleStateParser_test.go b/examples/example_helloSingleStateParser_test.go index a798da2..691ac75 100644 --- a/examples/example_helloSingleStateParser_test.go +++ b/examples/example_helloSingleStateParser_test.go @@ -84,7 +84,7 @@ func (h *helloparser2) start(p *parsekit.ParseAPI) { p.Error("the greeting is not being friendly") return } - if !p.On(c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace))).Skip() { + if !p.On(c.Seq(c.Opt(a.Blank), a.Comma, c.Opt(a.Blank))).Skip() { p.Error("the greeting is not properly separated") return } diff --git a/parseapi.go b/parseapi.go index c6db4a1..d596b4f 100644 --- a/parseapi.go +++ b/parseapi.go @@ -66,7 +66,7 @@ func (p *ParseAPI) checkForLoops() { // // So an example chain could look like this: // -// p.On(parsekit.A.Whitespace).Skip() +// p.On(parsekit.A.Blank).Skip() // // The chain as a whole returns a boolean that indicates whether or not at match // was found. When no match was found, false is returned and Skip() and Accept() @@ -103,31 +103,36 @@ func (p *ParseAPI) On(tokenHandler TokenHandler) *ParseAPIOnAction { ok := tokenHandler(child) return &ParseAPIOnAction{ - parseAPI: p, - tokenAPI: child, - ok: ok, + parseAPI: p, + tokenAPI: p.tokenAPI, + forkedTokenAPI: child, + ok: ok, } } // ParseAPIOnAction is a struct that is used for building the On()-method chain. // The On() method will return an initialized struct of this type. 
type ParseAPIOnAction struct { - parseAPI *ParseAPI - tokenAPI *TokenAPI - ok bool + parseAPI *ParseAPI + tokenAPI *TokenAPI + forkedTokenAPI *TokenAPI + ok bool } // Accept tells the parser to move the read cursor past a match that was -// found, and to make the TokenHandlerResult from the TokenAPI available in the -// ParseAPI through the ParseAPI.Result() method. +// found by a TokenHandler, and to make the TokenHandlerResult from the +// TokenAPI available in the ParseAPI through the ParseAPI.Result() method. // // Returns true in case a match was found. // When no match was found, then no action is taken and false is returned. func (a *ParseAPIOnAction) Accept() bool { if a.ok { - a.tokenAPI.Merge() - a.flushReader() - a.parseAPI.result = a.tokenAPI.root.result + a.forkedTokenAPI.Merge() + a.parseAPI.result = a.tokenAPI.Result() + a.tokenAPI.detachChilds() + if a.tokenAPI.flushReader() { + a.parseAPI.initLoopCheck() + } } return a.ok } @@ -145,10 +150,12 @@ func (a *ParseAPIOnAction) Accept() bool { func (a *ParseAPIOnAction) Skip() bool { if a.ok { a.parseAPI.result = nil - a.tokenAPI.clearResults() - a.tokenAPI.syncCursorTo(a.tokenAPI.root) + a.forkedTokenAPI.clearResults() a.tokenAPI.detachChilds() - a.flushReader() + a.forkedTokenAPI.syncCursorTo(a.tokenAPI) + if a.tokenAPI.flushReader() { + a.parseAPI.initLoopCheck() + } } return a.ok } @@ -170,14 +177,6 @@ func (a *ParseAPIOnAction) Stay() bool { return a.ok } -func (a *ParseAPIOnAction) flushReader() { - if a.tokenAPI.result.offset > 0 { - a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset) - a.tokenAPI.root.result.offset = 0 - a.parseAPI.initLoopCheck() - } -} - // Result returns a TokenHandlerResult struct, containing results as produced by the // last ParseAPI.On().Accept() call. 
func (p *ParseAPI) Result() *TokenHandlerResult { diff --git a/parser_test.go b/parser_test.go index 74edc89..137695a 100644 --- a/parser_test.go +++ b/parser_test.go @@ -298,7 +298,7 @@ func TestGivenLoopingParserDefinition_ParserPanics(t *testing.T) { // p.On(c.Max(5, a.AnyRune)) // // The problem here is that Max(5, ...) will also match when there is -// no more input, since Max(5, ---) is actually MinMax(0, 5, ...). +// no more input, since Max(5, ...) is actually MinMax(0, 5, ...). // Therefore the loop will never stop. Solving the loop was simple: // // p.On(c.MinMax(1, 5, a.AnyRune)) diff --git a/tokenapi.go b/tokenapi.go index 34f1e23..ede53ab 100644 --- a/tokenapi.go +++ b/tokenapi.go @@ -8,15 +8,15 @@ import ( ) // TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from -// the reader and to report back tokenizing results. For easy lookahead support, -// a forking strategy is provided. +// a parsekit.reader.Reader and to report back tokenizing results. For easy +// lookahead support, a forking strategy is provided. // // BASIC OPERATION: // // To retrieve the next rune from the TokenAPI, call the NextRune() method. // // When the rune is to be accepted as input, call the method Accept(). The rune -// is then added to the results of the TokenAPI and the read cursor is moved +// is then added to the result runes of the TokenAPI and the read cursor is moved // forward. // // By invoking NextRune() + Accept() multiple times, the result can be extended @@ -63,7 +63,6 @@ import ( // no bookkeeping has to be implemented when implementing a parser. 
type TokenAPI struct { reader *reader.Reader - root *TokenAPI // the root TokenAPI parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position) @@ -75,7 +74,6 @@ func NewTokenAPI(r io.Reader) *TokenAPI { reader: reader.New(r), result: newTokenHandlerResult(), } - input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?". return input } @@ -141,7 +139,6 @@ func (i *TokenAPI) Fork() *TokenAPI { // Create the new fork. child := &TokenAPI{ reader: i.reader, - root: i.root, parent: i, } child.result = newTokenHandlerResult() @@ -200,6 +197,15 @@ func (i *TokenAPI) detachChildsRecurse() { i.parent = nil } +func (i *TokenAPI) flushReader() bool { + if i.result.offset > 0 { + i.reader.Flush(i.result.offset) + i.result.offset = 0 + return true + } + return false +} + // Result returns the TokenHandlerResult data for the TokenAPI. The returned struct // can be used to retrieve and to modify result data. func (i *TokenAPI) Result() *TokenHandlerResult { diff --git a/tokenhandlers_builtin.go b/tokenhandlers_builtin.go index c877036..69a9667 100644 --- a/tokenhandlers_builtin.go +++ b/tokenhandlers_builtin.go @@ -62,160 +62,168 @@ var C = struct { // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
var A = struct { - Rune func(rune) TokenHandler - Runes func(...rune) TokenHandler - RuneRange func(rune, rune) TokenHandler - Str func(string) TokenHandler - StrNoCase func(string) TokenHandler - EndOfFile TokenHandler - AnyRune TokenHandler - ValidRune TokenHandler - Space TokenHandler - Tab TokenHandler - CR TokenHandler - LF TokenHandler - CRLF TokenHandler - Excl TokenHandler - DoubleQuote TokenHandler - Hash TokenHandler - Dollar TokenHandler - Percent TokenHandler - Amp TokenHandler - SingleQuote TokenHandler - RoundOpen TokenHandler - LeftParen TokenHandler - RoundClose TokenHandler - RightParen TokenHandler - Asterisk TokenHandler - Multiply TokenHandler - Plus TokenHandler - Add TokenHandler - Comma TokenHandler - Minus TokenHandler - Subtract TokenHandler - Dot TokenHandler - Slash TokenHandler - Divide TokenHandler - Colon TokenHandler - Semicolon TokenHandler - AngleOpen TokenHandler - LessThan TokenHandler - Equal TokenHandler - AngleClose TokenHandler - GreaterThan TokenHandler - Question TokenHandler - At TokenHandler - SquareOpen TokenHandler - Backslash TokenHandler - SquareClose TokenHandler - Caret TokenHandler - Underscore TokenHandler - Backquote TokenHandler - CurlyOpen TokenHandler - Pipe TokenHandler - CurlyClose TokenHandler - Tilde TokenHandler - Newline TokenHandler - Whitespace TokenHandler - WhitespaceAndNewlines TokenHandler - EndOfLine TokenHandler - Digit TokenHandler - DigitNotZero TokenHandler - Digits TokenHandler - Float TokenHandler - Boolean TokenHandler - Integer TokenHandler - Signed func(TokenHandler) TokenHandler - IntegerBetween func(min int64, max int64) TokenHandler - ASCII TokenHandler - ASCIILower TokenHandler - ASCIIUpper TokenHandler - HexDigit TokenHandler - Octet TokenHandler - IPv4 TokenHandler - IPv4CIDRMask TokenHandler - IPv4Netmask TokenHandler - IPv4Net TokenHandler - IPv6 TokenHandler - IPv6CIDRMask TokenHandler - IPv6Net TokenHandler + Rune func(rune) TokenHandler + Runes func(...rune) TokenHandler + 
RuneRange func(rune, rune) TokenHandler + Str func(string) TokenHandler + StrNoCase func(string) TokenHandler + EndOfFile TokenHandler + AnyRune TokenHandler + ValidRune TokenHandler + Space TokenHandler + Tab TokenHandler + CR TokenHandler + LF TokenHandler + CRLF TokenHandler + Excl TokenHandler + DoubleQuote TokenHandler + Hash TokenHandler + Dollar TokenHandler + Percent TokenHandler + Amp TokenHandler + SingleQuote TokenHandler + RoundOpen TokenHandler + LeftParen TokenHandler + RoundClose TokenHandler + RightParen TokenHandler + Asterisk TokenHandler + Multiply TokenHandler + Plus TokenHandler + Add TokenHandler + Comma TokenHandler + Minus TokenHandler + Subtract TokenHandler + Dot TokenHandler + Slash TokenHandler + Divide TokenHandler + Colon TokenHandler + Semicolon TokenHandler + AngleOpen TokenHandler + LessThan TokenHandler + Equal TokenHandler + AngleClose TokenHandler + GreaterThan TokenHandler + Question TokenHandler + At TokenHandler + SquareOpen TokenHandler + Backslash TokenHandler + SquareClose TokenHandler + Caret TokenHandler + Underscore TokenHandler + Backquote TokenHandler + CurlyOpen TokenHandler + Pipe TokenHandler + CurlyClose TokenHandler + Tilde TokenHandler + Newline TokenHandler + Blank TokenHandler + Blanks TokenHandler + Whitespace TokenHandler + EndOfLine TokenHandler + Digit TokenHandler + DigitNotZero TokenHandler + Digits TokenHandler + Float TokenHandler + Boolean TokenHandler + Integer TokenHandler + Signed func(TokenHandler) TokenHandler + IntegerBetween func(min int64, max int64) TokenHandler + ASCII TokenHandler + ASCIILower TokenHandler + ASCIIUpper TokenHandler + Letter TokenHandler + Lower TokenHandler + Upper TokenHandler + HexDigit TokenHandler + Octet TokenHandler + IPv4 TokenHandler + IPv4CIDRMask TokenHandler + IPv4Netmask TokenHandler + IPv4Net TokenHandler + IPv6 TokenHandler + IPv6CIDRMask TokenHandler + IPv6Net TokenHandler }{ - Rune: MatchRune, - Runes: MatchRunes, - RuneRange: MatchRuneRange, - Str: MatchStr, 
- StrNoCase: MatchStrNoCase, - EndOfFile: MatchEndOfFile(), - AnyRune: MatchAnyRune(), - ValidRune: MatchValidRune(), - Space: MatchRune(' '), - Tab: MatchRune('\t'), - CR: MatchRune('\r'), - LF: MatchRune('\n'), - CRLF: MatchStr("\r\n"), - Excl: MatchRune('!'), - DoubleQuote: MatchRune('"'), - Hash: MatchRune('#'), - Dollar: MatchRune('$'), - Percent: MatchRune('%'), - Amp: MatchRune('&'), - SingleQuote: MatchRune('\''), - RoundOpen: MatchRune('('), - LeftParen: MatchRune('('), - RoundClose: MatchRune(')'), - RightParen: MatchRune(')'), - Asterisk: MatchRune('*'), - Multiply: MatchRune('*'), - Plus: MatchRune('+'), - Add: MatchRune('+'), - Comma: MatchRune(','), - Minus: MatchRune('-'), - Subtract: MatchRune('-'), - Dot: MatchRune('.'), - Slash: MatchRune('/'), - Divide: MatchRune('/'), - Colon: MatchRune(':'), - Semicolon: MatchRune(';'), - AngleOpen: MatchRune('<'), - LessThan: MatchRune('<'), - Equal: MatchRune('='), - AngleClose: MatchRune('>'), - GreaterThan: MatchRune('>'), - Question: MatchRune('?'), - At: MatchRune('@'), - SquareOpen: MatchRune('['), - Backslash: MatchRune('\\'), - SquareClose: MatchRune(']'), - Caret: MatchRune('^'), - Underscore: MatchRune('_'), - Backquote: MatchRune('`'), - CurlyOpen: MatchRune('{'), - Pipe: MatchRune('|'), - CurlyClose: MatchRune('}'), - Tilde: MatchRune('~'), - Whitespace: MatchWhitespace(), - WhitespaceAndNewlines: MatchWhitespaceAndNewlines(), - EndOfLine: MatchEndOfLine(), - Digit: MatchDigit(), - DigitNotZero: MatchDigitNotZero(), - Digits: MatchDigits(), - Integer: MatchInteger(), - Signed: MatchSigned, - IntegerBetween: MatchIntegerBetween, - Float: MatchFloat(), - Boolean: MatchBoolean(), - ASCII: MatchASCII(), - ASCIILower: MatchASCIILower(), - ASCIIUpper: MatchASCIIUpper(), - HexDigit: MatchHexDigit(), - Octet: MatchOctet(false), - IPv4: MatchIPv4(true), - IPv4CIDRMask: MatchIPv4CIDRMask(true), - IPv4Netmask: MatchIPv4Netmask(true), - IPv4Net: MatchIPv4Net(true), - IPv6: MatchIPv6(true), - IPv6CIDRMask: 
MatchIPv6CIDRMask(true), - IPv6Net: MatchIPv6Net(true), + Rune: MatchRune, + Runes: MatchRunes, + RuneRange: MatchRuneRange, + Str: MatchStr, + StrNoCase: MatchStrNoCase, + EndOfFile: MatchEndOfFile(), + AnyRune: MatchAnyRune(), + ValidRune: MatchValidRune(), + Space: MatchRune(' '), + Tab: MatchRune('\t'), + CR: MatchRune('\r'), + LF: MatchRune('\n'), + CRLF: MatchStr("\r\n"), + Excl: MatchRune('!'), + DoubleQuote: MatchRune('"'), + Hash: MatchRune('#'), + Dollar: MatchRune('$'), + Percent: MatchRune('%'), + Amp: MatchRune('&'), + SingleQuote: MatchRune('\''), + RoundOpen: MatchRune('('), + LeftParen: MatchRune('('), + RoundClose: MatchRune(')'), + RightParen: MatchRune(')'), + Asterisk: MatchRune('*'), + Multiply: MatchRune('*'), + Plus: MatchRune('+'), + Add: MatchRune('+'), + Comma: MatchRune(','), + Minus: MatchRune('-'), + Subtract: MatchRune('-'), + Dot: MatchRune('.'), + Slash: MatchRune('/'), + Divide: MatchRune('/'), + Colon: MatchRune(':'), + Semicolon: MatchRune(';'), + AngleOpen: MatchRune('<'), + LessThan: MatchRune('<'), + Equal: MatchRune('='), + AngleClose: MatchRune('>'), + GreaterThan: MatchRune('>'), + Question: MatchRune('?'), + At: MatchRune('@'), + SquareOpen: MatchRune('['), + Backslash: MatchRune('\\'), + SquareClose: MatchRune(']'), + Caret: MatchRune('^'), + Underscore: MatchRune('_'), + Backquote: MatchRune('`'), + CurlyOpen: MatchRune('{'), + Pipe: MatchRune('|'), + CurlyClose: MatchRune('}'), + Tilde: MatchRune('~'), + Blank: MatchBlank(), + Blanks: MatchBlanks(), + Whitespace: MatchWhitespace(), + EndOfLine: MatchEndOfLine(), + Digit: MatchDigit(), + DigitNotZero: MatchDigitNotZero(), + Digits: MatchDigits(), + Integer: MatchInteger(), + Signed: MatchSigned, + IntegerBetween: MatchIntegerBetween, + Float: MatchFloat(), + Boolean: MatchBoolean(), + ASCII: MatchASCII(), + ASCIILower: MatchASCIILower(), + ASCIIUpper: MatchASCIIUpper(), + Letter: MatchUnicodeLetter(), + Lower: MatchUnicodeLower(), + Upper: MatchUnicodeUpper(), + HexDigit: 
MatchHexDigit(), + Octet: MatchOctet(false), + IPv4: MatchIPv4(true), + IPv4CIDRMask: MatchIPv4CIDRMask(true), + IPv4Netmask: MatchIPv4Netmask(true), + IPv4Net: MatchIPv4Net(true), + IPv6: MatchIPv6(true), + IPv6CIDRMask: MatchIPv6CIDRMask(true), + IPv6Net: MatchIPv6Net(true), } // M provides convenient access to a range of modifiers (which in their nature are @@ -352,20 +360,45 @@ func MatchRuneRange(start rune, end rune) TokenHandler { } } -// MatchWhitespace creates a TokenHandler that matches the input against one -// or more whitespace characters, meansing tabs and spaces. +// MatchBlank creates a TokenHandler that matches one rune from the input +// against blank characters, meaning tabs and spaces. // -// When you need whitespace matching to also include newlines, then make use -// of MatchWhitespaceAndNewlines(). -func MatchWhitespace() TokenHandler { - return MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t'))) +// When you need whitespace matching, which also includes characters like +// newlines, then take a look at MatchWhitespace(). +func MatchBlank() TokenHandler { + return MatchAny(MatchRune(' '), MatchRune('\t')) } -// MatchWhitespaceAndNewlines creates a TokenHandler that matches the input -// against one or more whitespace and/or newline characters, meaning tabs, -// spaces and newlines ("\r\n" and "\n"). -func MatchWhitespaceAndNewlines() TokenHandler { - return MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t'), MatchStr("\r\n"), MatchRune('\n'))) +// MatchBlanks creates a TokenHandler that matches the input against one +// or more blank characters, meaning tabs and spaces. +// +// When you need whitespace matching, which also includes characters like +// newlines, then make use of MatchWhitespace(). +func MatchBlanks() TokenHandler { + return MatchOneOrMore(MatchBlank()) +} + +// MatchWhitespace creates a TokenHandler that matches the input against one or more +// whitespace characters, as defined by unicode. 
+func MatchWhitespace() TokenHandler { + return MatchOneOrMore(MatchRuneByCallback(unicode.IsSpace)) +} + +// MatchRuneByCallback creates a TokenHandler that matches a single rune from the +// input against the provided callback function. When the callback returns true, +// it is considered a match. +// +// Note that the callback function matches the signature of the unicode.Is* functions, +// so those can be used. E.g. MatchRuneByCallback(unicode.IsLower). +func MatchRuneByCallback(callback func(rune) bool) TokenHandler { + return func(t *TokenAPI) bool { + input, err := t.NextRune() + if err == nil && callback(input) { + t.Accept() + return true + } + return false + } } // MatchEndOfLine creates a TokenHandler that matches a newline ("\r\n" or "\n") or EOF. @@ -649,7 +682,7 @@ func MatchDigit() TokenHandler { // MatchDigits creates a TokenHandler that checks if one or more digits can be read // from the input. func MatchDigits() TokenHandler { - return MatchOneOrMore(MatchRuneRange('0', '9')) + return MatchOneOrMore(MatchDigit()) } // MatchDigitNotZero creates a TokenHandler that checks if a single digit not equal @@ -707,6 +740,24 @@ func MatchASCIIUpper() TokenHandler { return MatchRuneRange('A', 'Z') } +// MatchUnicodeLetter creates a TokenHandler function that matches against any +// unicode letter on the input (see unicode.IsLetter(rune)). +func MatchUnicodeLetter() TokenHandler { + return MatchRuneByCallback(unicode.IsLetter) +} + +// MatchUnicodeUpper creates a TokenHandler function that matches against any +// upper case unicode letter on the input (see unicode.IsUpper(rune)). +func MatchUnicodeUpper() TokenHandler { + return MatchRuneByCallback(unicode.IsUpper) +} + +// MatchUnicodeLower creates a TokenHandler function that matches against any +// lower case unicode letter on the input (see unicode.IsLower(rune)). 
+func MatchUnicodeLower() TokenHandler { + return MatchRuneByCallback(unicode.IsLower) +} + // MatchHexDigit creates a TokenHandler function that check if a single hexadecimal // digit can be read from the input. func MatchHexDigit() TokenHandler { @@ -908,15 +959,15 @@ func MatchIPv6Net(normalize bool) TokenHandler { // // Note that if the TokenHandler does not apply, a mismatch will be reported back, // even though we would have dropped the output anyway. So if you would like -// to drop optional whitespace, then use something like: +// to drop optional blanks (spaces and tabs), then use something like: // -// M.Drop(C.Opt(A.Whitespace)) +// M.Drop(C.Opt(A.Blanks)) // // instead of: // -// M.Drop(A.Whitespace) +// M.Drop(A.Blanks) // -// Since whitespace is defined as "1 or more spaces and/or tabs", the input +// Since A.Blanks is defined as "1 or more spaces and/or tabs", the input // string "bork" would not match against the second form, but " bork" would. // In both cases, it would match the first form. func ModifyDrop(handler TokenHandler) TokenHandler { @@ -960,8 +1011,8 @@ func modifyTrim(handler TokenHandler, cutset string, trimLeft bool, trimRight bo } // ModifyTrimSpace creates a TokenHandler that checks if the provided TokenHandler applies. -// If it does, then its output is taken and all leading and trailing whitespace charcters, -// as defined by Unicode (spaces, tabs, carriage returns and newlines) are removed from it. +// If it does, then its output is taken and all leading and trailing whitespace characters, +// as defined by Unicode, are removed from it. 
func ModifyTrimSpace(handler TokenHandler) TokenHandler { return ModifyByCallback(handler, strings.TrimSpace) } diff --git a/tokenhandlers_builtin_test.go b/tokenhandlers_builtin_test.go index 46be6ad..3f90c47 100644 --- a/tokenhandlers_builtin_test.go +++ b/tokenhandlers_builtin_test.go @@ -157,9 +157,12 @@ func TestAtoms(t *testing.T) { {"|", a.Pipe, true, "|"}, {"}", a.CurlyClose, true, "}"}, {"~", a.Tilde, true, "~"}, - {" \t \t \r\n", a.Whitespace, true, " \t \t "}, - {"\r", a.WhitespaceAndNewlines, false, ""}, - {" \t\r\n \r", a.WhitespaceAndNewlines, true, " \t\r\n "}, + {"\t \t \r\n", a.Blank, true, "\t"}, + {" \t \t \r\n", a.Blanks, true, " \t \t "}, + {"xxx", a.Whitespace, false, ""}, + {" ", a.Whitespace, true, " "}, + {"\t", a.Whitespace, true, "\t"}, + {" \t\r\n \r\v\f ", a.Whitespace, true, " \t\r\n \r\v\f "}, {"", a.EndOfLine, true, ""}, {"\r\n", a.EndOfLine, true, "\r\n"}, {"\n", a.EndOfLine, true, "\n"}, @@ -182,6 +185,13 @@ func TestAtoms(t *testing.T) { {"Z", a.ASCIIUpper, true, "Z"}, {"a", a.ASCIIUpper, false, ""}, {"z", a.ASCIIUpper, false, ""}, + {"1", a.Letter, false, ""}, + {"a", a.Letter, true, "a"}, + {"Ø", a.Letter, true, "Ø"}, + {"Ë", a.Lower, false, ""}, + {"ë", a.Lower, true, "ë"}, + {"ä", a.Upper, false, "ä"}, + {"Ä", a.Upper, true, "Ä"}, {"0", a.HexDigit, true, "0"}, {"9", a.HexDigit, true, "9"}, {"a", a.HexDigit, true, "a"}, @@ -403,16 +413,16 @@ func TestCombination(t *testing.T) { c.Opt(a.SquareOpen), m.Trim( c.Seq( - c.Opt(a.Whitespace), + c.Opt(a.Blanks), c.Rep(3, a.AngleClose), m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string { return fmt.Sprintf("%d", len(s)) }), - m.Replace(c.Separated(a.Comma, c.Opt(a.Whitespace)), ", "), + m.Replace(c.Separated(a.Comma, c.Opt(a.Blanks)), ", "), m.ToUpper(c.Min(1, a.ASCIILower)), m.Drop(a.Excl), c.Rep(3, a.AngleOpen), - c.Opt(a.Whitespace), + c.Opt(a.Blanks), ), " \t", ),