package parsekit import ( "fmt" "strings" "unicode" ) // C provides convenient access to a range of parser/combinators that can be // used to construct TokenHandler functions. // // When using C in your own parser, then it is advised to create a variable // to reference it: // // var c = parsekit.C // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. var C = struct { Rune func(rune) TokenHandler Runes func(...rune) TokenHandler RuneRange func(rune, rune) TokenHandler Str func(string) TokenHandler StrNoCase func(string) TokenHandler Any func(...TokenHandler) TokenHandler Not func(TokenHandler) TokenHandler Opt func(TokenHandler) TokenHandler Seq func(...TokenHandler) TokenHandler Rep func(times int, handler TokenHandler) TokenHandler Min func(min int, handler TokenHandler) TokenHandler Max func(max int, handler TokenHandler) TokenHandler ZeroOrMore func(TokenHandler) TokenHandler OneOrMore func(TokenHandler) TokenHandler MinMax func(min int, max int, handler TokenHandler) TokenHandler Separated func(separated TokenHandler, separator TokenHandler) TokenHandler // TODO reverse args for consistency Except func(except TokenHandler, handler TokenHandler) TokenHandler }{ Rune: MatchRune, Runes: MatchRunes, RuneRange: MatchRuneRange, Str: MatchStr, StrNoCase: MatchStrNoCase, Opt: MatchOpt, Any: MatchAny, Not: MatchNot, Seq: MatchSeq, Rep: MatchRep, Min: MatchMin, Max: MatchMax, ZeroOrMore: MatchZeroOrMore, OneOrMore: MatchOneOrMore, MinMax: MatchMinMax, Separated: MatchSeparated, Except: MatchExcept, } // MatchRune creates a TokenHandler function that checks if the next rune from // the input matches the provided rune. func MatchRune(expected rune) TokenHandler { return func(t *TokenAPI) bool { input, ok := t.NextRune() if ok && input == expected { t.Accept() return true } return false } } // MatchRunes creates a TokenHandler function that that checks if the next rune // from the input is one of the provided runes. 
func MatchRunes(expected ...rune) TokenHandler { s := string(expected) return func(t *TokenAPI) bool { input, ok := t.NextRune() if ok { if strings.ContainsRune(s, input) { t.Accept() return true } } return false } } // MatchRuneRange creates a TokenHandler function that that checks if the next rune // from the input is contained by the provided rune range. // // The rune range is defined by a start and an end rune, inclusive, so: // // MatchRuneRange('g', 'k') // // creates a TokenHandler that will match any of 'g', 'h', 'i', 'j' or 'k'. func MatchRuneRange(start rune, end rune) TokenHandler { return func(t *TokenAPI) bool { if end < start { panic(fmt.Sprintf("internal parser error: MatchRuneRange definition error: start %q must not be < end %q", start, end)) } input, ok := t.NextRune() if ok && input >= start && input <= end { t.Accept() return true } return false } } // MatchStr creates a TokenHandler that will check if the upcoming runes on the // input match the provided string. // TODO make this a more efficient string-level match? func MatchStr(expected string) TokenHandler { var handlers = []TokenHandler{} for _, r := range expected { handlers = append(handlers, MatchRune(r)) } return MatchSeq(handlers...) } // MatchStrNoCase creates a TokenHandler that will check if the upcoming runes // on the input match the provided string in a case-insensitive manner. // TODO make this a more efficient string-level match? func MatchStrNoCase(expected string) TokenHandler { var handlers = []TokenHandler{} for _, r := range expected { u := unicode.ToUpper(r) l := unicode.ToLower(r) handlers = append(handlers, MatchRunes(u, l)) } return MatchSeq(handlers...) } // MatchOpt creates a TokenHandler that makes the provided TokenHandler optional. // When the provided TokenHandler applies, then its output is used, otherwise // no output is generated but still a successful match is reported. 
func MatchOpt(handler TokenHandler) TokenHandler { return func(t *TokenAPI) bool { child := t.Fork() if handler(child) { child.Merge() } return true } } // MatchSeq creates a TokenHandler that checks if the provided TokenHandlers can be // applied in their exact order. Only if all matcher apply, the sequence // reports successful match. func MatchSeq(handlers ...TokenHandler) TokenHandler { return func(t *TokenAPI) bool { child := t.Fork() for _, matcher := range handlers { if !matcher(child) { return false } } child.Merge() return true } } // MatchAny creates a TokenHandler that checks if any of the provided TokenHandlers // can be applied. They are applied in their provided order. The first TokenHandler // that applies is used for reporting back a match. func MatchAny(handlers ...TokenHandler) TokenHandler { return func(t *TokenAPI) bool { for _, handler := range handlers { child := t.Fork() if handler(child) { return child.Merge() } } return false } } // MatchNot creates a TokenHandler that checks if the provided TokenHandler applies to // the current input. If it does, then a failed match will be reported. If it // does not, then the next rune from the input will be reported as a match. func MatchNot(handler TokenHandler) TokenHandler { return func(t *TokenAPI) bool { probe := t.Fork() if handler(probe) { return false } _, ok := t.NextRune() if ok { t.Accept() return true } return false } } // MatchRep creates a TokenHandler that checks if the provided TokenHandler can be // applied exactly the provided amount of times. // // Note that the input can contain more than the provided number of matches, e.g.: // // MatchRep(4, MatchRune('X')) // // will not match input "XXX", it will match input "XXXX", but also "XXXXXX". // In that last case, there will be a remainder "XX" on the input. 
func MatchRep(times int, handler TokenHandler) TokenHandler {
	// An exact repetition count is a range whose bounds are equal.
	lower, upper := times, times
	return matchMinMax(lower, upper, handler)
}

// MatchMin creates a TokenHandler that checks if the provided TokenHandler can
// be applied at least the provided minimum number of times.
// Matches beyond the minimum are included in the output as well.
func MatchMin(min int, handler TokenHandler) TokenHandler {
	const noUpperBound = -1
	return matchMinMax(min, noUpperBound, handler)
}

// MatchMax creates a TokenHandler that applies the provided TokenHandler up
// to the provided maximum number of times. Matches beyond the maximum are not
// consumed, so they are left on the input.
// Zero matches are considered a successful match.
func MatchMax(max int, handler TokenHandler) TokenHandler {
	const noneRequired = 0
	return matchMinMax(noneRequired, max, handler)
}

// MatchZeroOrMore creates a TokenHandler that applies the provided
// TokenHandler as often as it matches. All matches are included in the output.
// Zero matches are considered a successful match.
func MatchZeroOrMore(handler TokenHandler) TokenHandler {
	const (
		noneRequired = 0
		noUpperBound = -1
	)
	return matchMinMax(noneRequired, noUpperBound, handler)
}

// MatchOneOrMore creates a TokenHandler that applies the provided TokenHandler
// as often as it matches, requiring at least one match. All matches are
// included in the output.
func MatchOneOrMore(handler TokenHandler) TokenHandler {
	const (
		oneRequired  = 1
		noUpperBound = -1
	)
	return matchMinMax(oneRequired, noUpperBound, handler)
}

// MatchMinMax creates a TokenHandler that checks if the provided TokenHandler can
// be applied between the provided minimum and maximum number of times,
// inclusive. All matches will be included in the output.
func MatchMinMax(min int, max int, handler TokenHandler) TokenHandler { if max < 0 { panic("internal parser error: MatchMinMax definition error: max must be >= 0 ") } if min < 0 { panic("internal parser error: MatchMinMax definition error: min must be >= 0 ") } return matchMinMax(min, max, handler) } func matchMinMax(min int, max int, handler TokenHandler) TokenHandler { return func(t *TokenAPI) bool { child := t.Fork() if max >= 0 && min > max { panic(fmt.Sprintf("internal parser error: MatchRep definition error: max %d must not be < min %d", max, min)) } total := 0 // Check for the minimum required amount of matches. for total < min { total++ if !handler(child) { return false } } // No specified max: include the rest of the available matches. // Specified max: include the rest of the availble matches, up to the max. child.Merge() for max < 0 || total < max { total++ if !handler(child) { break } child.Merge() } return true } } // MatchSeparated creates a TokenHandler that checks for a pattern of one or more // TokenHandlers of one type (the separated), separated by TokenHandler of another type // (the separator). All matches (separated + separator) are included in the // output. func MatchSeparated(separator TokenHandler, separated TokenHandler) TokenHandler { return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated))) } // MatchExcept creates a TokenHandler that checks if the provided TokenHandler can be // applied to the upcoming input. It also checks if the except TokenHandler can be // applied. If the handler applies, but the except TokenHandler as well, then the match // as a whole will be treated as a mismatch. func MatchExcept(except TokenHandler, handler TokenHandler) TokenHandler { return func(t *TokenAPI) bool { if except(t.Fork()) { return false } return handler(t) } } // A provides convenient access to a range of atoms that can be used to // build TokenHandlers or parser rules. 
// // In parsekit, an atom is defined as a ready for use TokenHandler function. // // When using A in your own parser, then it is advised to create a variable // to reference it: // // var a = parsekit.A // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. var A = struct { EndOfFile TokenHandler AnyRune TokenHandler Space TokenHandler Tab TokenHandler CR TokenHandler LF TokenHandler CRLF TokenHandler Excl TokenHandler DoubleQuote TokenHandler Hash TokenHandler Dollar TokenHandler Percent TokenHandler Amp TokenHandler SingleQuote TokenHandler RoundOpen TokenHandler RoundClose TokenHandler Asterisk TokenHandler Plus TokenHandler Comma TokenHandler Minus TokenHandler Dot TokenHandler Slash TokenHandler Colon TokenHandler Semicolon TokenHandler AngleOpen TokenHandler Equal TokenHandler AngleClose TokenHandler Question TokenHandler At TokenHandler SquareOpen TokenHandler Backslash TokenHandler SquareClose TokenHandler Caret TokenHandler Underscore TokenHandler Backquote TokenHandler CurlyOpen TokenHandler Pipe TokenHandler CurlyClose TokenHandler Tilde TokenHandler Newline TokenHandler Whitespace TokenHandler WhitespaceAndNewlines TokenHandler EndOfLine TokenHandler Digit TokenHandler ASCII TokenHandler ASCIILower TokenHandler ASCIIUpper TokenHandler HexDigit TokenHandler }{ EndOfFile: MatchEndOfFile(), AnyRune: MatchAnyRune(), Space: C.Rune(' '), Tab: C.Rune('\t'), CR: C.Rune('\r'), LF: C.Rune('\n'), CRLF: C.Str("\r\n"), Excl: C.Rune('!'), DoubleQuote: C.Rune('"'), Hash: C.Rune('#'), Dollar: C.Rune('$'), Percent: C.Rune('%'), Amp: C.Rune('&'), SingleQuote: C.Rune('\''), RoundOpen: C.Rune('('), RoundClose: C.Rune(')'), Asterisk: C.Rune('*'), Plus: C.Rune('+'), Comma: C.Rune(','), Minus: C.Rune('-'), Dot: C.Rune('.'), Slash: C.Rune('/'), Colon: C.Rune(':'), Semicolon: C.Rune(';'), AngleOpen: C.Rune('<'), Equal: C.Rune('='), AngleClose: C.Rune('>'), Question: C.Rune('?'), At: C.Rune('@'), SquareOpen: C.Rune('['), Backslash: C.Rune('\\'), 
SquareClose: C.Rune(']'), Caret: C.Rune('^'), Underscore: C.Rune('_'), Backquote: C.Rune('`'), CurlyOpen: C.Rune('{'), Pipe: C.Rune('|'), CurlyClose: C.Rune('}'), Tilde: C.Rune('~'), Whitespace: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'))), WhitespaceAndNewlines: C.OneOrMore(C.Any(C.Rune(' '), C.Rune('\t'), C.Str("\r\n"), C.Rune('\n'))), EndOfLine: C.Any(C.Str("\r\n"), C.Rune('\n'), MatchEndOfFile()), Digit: C.RuneRange('0', '9'), ASCII: C.RuneRange('\x00', '\x7F'), ASCIILower: C.RuneRange('a', 'z'), ASCIIUpper: C.RuneRange('A', 'Z'), HexDigit: C.Any(C.RuneRange('0', '9'), C.RuneRange('a', 'f'), C.RuneRange('A', 'F')), } // MatchEndOfFile creates a TokenHandler that checks if the end of the input data // has been reached. This TokenHandler will never produce output. It only reports // a successful or a failing match through its boolean return value. func MatchEndOfFile() TokenHandler { return func(t *TokenAPI) bool { fork := t.Fork() input, ok := fork.NextRune() return !ok && input == eofRune } } // MatchAnyRune creates a TokenHandler function that checks if a valid rune can be // read from the input. It reports back a successful match if the end of the // input has not yet been reached and the upcoming input is a valid UTF8 rune. func MatchAnyRune() TokenHandler { return func(t *TokenAPI) bool { _, ok := t.NextRune() if ok { t.Accept() return true } return false } } // M provides convenient access to a range of modifiers (which in their nature are // parser/combinators) that can be used when creating TokenHandler functions. // // In parsekit, a modifier is defined as a TokenHandler function that modifies the // resulting output of another TokenHandler in some way. It does not do any matching // against input of its own. // // When using M in your own parser, then it is advised to create a variable // to reference it: // // var m = parsekit.M // // Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
var M = struct { Drop func(TokenHandler) TokenHandler Trim func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? TrimLeft func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? TrimRight func(handler TokenHandler, cutset string) TokenHandler // TODO reverse arguments? ToLower func(TokenHandler) TokenHandler ToUpper func(TokenHandler) TokenHandler Replace func(handler TokenHandler, replaceWith string) TokenHandler // TODO reverse arguments? ModifyByCallback func(TokenHandler, func(string) string) TokenHandler }{ Drop: ModifyDrop, Trim: ModifyTrim, TrimLeft: ModifyTrimLeft, TrimRight: ModifyTrimRight, ToLower: ModifyToLower, ToUpper: ModifyToUpper, Replace: ModifyReplace, ModifyByCallback: ModifyByCallback, } // ModifyDrop creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is discarded completely. // // Note that if the TokenHandler does not apply, a mismatch will be reported back, // even though we would have dropped the output anyway. So if you would like // to drop optional whitespace, then use something like: // // M.Drop(C.Opt(A.Whitespace)) // // instead of: // // M.Drop(A.Whitespace) // // Since whitespace is defined as "1 or more spaces and/or tabs", the input // string "bork" would not match against the second form, but " bork" would. // In both cases, it would match the first form. func ModifyDrop(handler TokenHandler) TokenHandler { return ModifyByCallback(handler, func(s string) string { return "" }) } // ModifyTrim creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is taken and characters from the provided // cutset are trimmed from both the left and the right of the output. func ModifyTrim(handler TokenHandler, cutset string) TokenHandler { return modifyTrim(handler, cutset, true, true) } // ModifyTrimLeft creates a TokenHandler that checks if the provided TokenHandler applies. 
// If it does, then its output is taken and characters from the provided // cutset are trimmed from the left of the output. func ModifyTrimLeft(handler TokenHandler, cutset string) TokenHandler { return modifyTrim(handler, cutset, true, false) } // ModifyTrimRight creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is taken and characters from the provided // cutset are trimmed from the right of the output. func ModifyTrimRight(handler TokenHandler, cutset string) TokenHandler { return modifyTrim(handler, cutset, false, true) } func modifyTrim(handler TokenHandler, cutset string, trimLeft bool, trimRight bool) TokenHandler { modfunc := func(s string) string { if trimLeft { s = strings.TrimLeft(s, cutset) } if trimRight { s = strings.TrimRight(s, cutset) } return s } return ModifyByCallback(handler, modfunc) } // ModifyToUpper creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is taken and characters from the provided // cutset are converted into upper case. func ModifyToUpper(handler TokenHandler) TokenHandler { return ModifyByCallback(handler, strings.ToUpper) } // ModifyToLower creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is taken and characters from the provided // cutset are converted into lower case. func ModifyToLower(handler TokenHandler) TokenHandler { return ModifyByCallback(handler, strings.ToLower) } // ModifyReplace creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is replaced by the provided string. func ModifyReplace(handler TokenHandler, replaceWith string) TokenHandler { return ModifyByCallback(handler, func(string) string { return replaceWith }) } // ModifyByCallback creates a TokenHandler that checks if the provided TokenHandler applies. // If it does, then its output is taken and it is fed to the provided modfunc. 
// This is a simple function that takes a string on input and returns a possibly // modified string on output. The return value of the modfunc will replace the // resulting output. func ModifyByCallback(handler TokenHandler, modfunc func(string) string) TokenHandler { return func(t *TokenAPI) bool { child := t.Fork() if handler(child) { s := modfunc(string(child.output)) child.output = []rune(s) child.Merge() return true } return false } }