Further code cleaning for the interaction between ParseAPI and TokenAPI. Extra atoms added, including one that accepts a single rune based on the result of a callback function.

This commit is contained in:
Maurice Makaay 2019-06-07 15:48:49 +00:00
parent 98d2db0374
commit 9a5bf8b9af
11 changed files with 286 additions and 220 deletions

View File

@ -3,8 +3,8 @@
//
// 10 + 20 - 8+4
//
// So positive numbers that can be either added or subtracted, and whitespace
// is ignored.
// So positive numbers that can be either added or subtracted, and blanks
// around numbers are ignored.
package examples
import (
@ -69,9 +69,9 @@ type simpleCalculator struct {
op int64 // represents operation for next term (+1 = add, -1 = subtract)
}
// A definition of an int64, which conveniently drops surrounding whitespace.
var dropWhitespace = parsekit.M.Drop(parsekit.C.Opt(parsekit.A.Whitespace))
var bareInteger = parsekit.C.Seq(dropWhitespace, parsekit.A.Integer, dropWhitespace)
// A definition of an int64, which conveniently drops surrounding blanks.
var dropBlank = parsekit.M.Drop(parsekit.C.Opt(parsekit.A.Blank))
var bareInteger = parsekit.C.Seq(dropBlank, parsekit.A.Integer, dropBlank)
var int64Token = parsekit.T.Int64(nil, bareInteger)
func (c *simpleCalculator) number(p *parsekit.ParseAPI) {

View File

@ -130,7 +130,7 @@ func (c *calculator) term(p *parsekit.ParseAPI) {
// <factor> = <space> (FLOAT | LPAREN <expr> RPAREN) <space>
func (c *calculator) factor(p *parsekit.ParseAPI) {
var A, T = parsekit.A, parsekit.T
p.On(A.Whitespace).Skip()
p.On(A.Blank).Skip()
switch {
case p.On(T.Float64(nil, A.Signed(A.Float))).Accept():
value := p.Result().Value(0).(float64)
@ -147,7 +147,7 @@ func (c *calculator) factor(p *parsekit.ParseAPI) {
p.UnexpectedInput("factor or (expression)")
return
}
p.On(A.Whitespace).Skip()
p.On(A.Blank).Skip()
}
// ---------------------------------------------------------------------------

View File

@ -65,7 +65,7 @@ func createPostcodeTokenizer() *parsekit.Tokenizer {
pcDigits := C.Seq(digitNotZero, C.Rep(3, A.Digit))
pcLetter := C.Any(A.ASCIILower, A.ASCIIUpper)
pcLetters := M.ToUpper(C.Seq(pcLetter, pcLetter))
space := M.Replace(C.Opt(A.Whitespace), " ")
space := M.Replace(C.Opt(A.Blank), " ")
postcode := C.Seq(T.Str("PCD", pcDigits), space, T.Str("PCL", pcLetters), A.EndOfFile)
// Create a Tokenizer that wraps the 'postcode' TokenHandler and allows

View File

@ -90,7 +90,7 @@ func (h *helloparser1) start(p *parsekit.ParseAPI) {
func (h *helloparser1) comma(p *parsekit.ParseAPI) {
a := parsekit.A
switch {
case p.On(a.Whitespace).Skip():
case p.On(a.Blank).Skip():
p.Handle(h.comma)
case p.On(a.Comma).Skip():
p.Handle(h.startName)
@ -102,7 +102,7 @@ func (h *helloparser1) comma(p *parsekit.ParseAPI) {
func (h *helloparser1) startName(p *parsekit.ParseAPI) {
c, a := parsekit.C, parsekit.A
switch {
case p.On(a.Whitespace).Skip():
case p.On(a.Blank).Skip():
p.Handle(h.startName)
case p.On(c.Not(a.Excl)).Stay():
p.Handle(h.name)

View File

@ -54,8 +54,8 @@ func createHelloTokenizer() *parsekit.Tokenizer {
// that does all the work. The 'greeting' TokenHandler matches the whole input and
// drops all but the name from it.
hello := a.StrNoCase("hello")
comma := c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace))
separator := c.Any(comma, a.Whitespace)
comma := c.Seq(c.Opt(a.Blank), a.Comma, c.Opt(a.Blank))
separator := c.Any(comma, a.Blank)
name := c.OneOrMore(c.Not(a.Excl))
greeting := c.Seq(m.Drop(hello), m.Drop(separator), name, m.Drop(a.Excl), a.EndOfFile)

View File

@ -84,7 +84,7 @@ func (h *helloparser2) start(p *parsekit.ParseAPI) {
p.Error("the greeting is not being friendly")
return
}
if !p.On(c.Seq(c.Opt(a.Whitespace), a.Comma, c.Opt(a.Whitespace))).Skip() {
if !p.On(c.Seq(c.Opt(a.Blank), a.Comma, c.Opt(a.Blank))).Skip() {
p.Error("the greeting is not properly separated")
return
}

View File

@ -66,7 +66,7 @@ func (p *ParseAPI) checkForLoops() {
//
// So an example chain could look like this:
//
// p.On(parsekit.A.Whitespace).Skip()
// p.On(parsekit.A.Blank).Skip()
//
// The chain as a whole returns a boolean that indicates whether or not a match
// was found. When no match was found, false is returned and Skip() and Accept()
@ -104,7 +104,8 @@ func (p *ParseAPI) On(tokenHandler TokenHandler) *ParseAPIOnAction {
return &ParseAPIOnAction{
parseAPI: p,
tokenAPI: child,
tokenAPI: p.tokenAPI,
forkedTokenAPI: child,
ok: ok,
}
}
@ -114,20 +115,24 @@ func (p *ParseAPI) On(tokenHandler TokenHandler) *ParseAPIOnAction {
type ParseAPIOnAction struct {
// parseAPI is the ParseAPI on which On() was called. Accept() and Skip()
// write its result field and may re-initialize its loop check.
parseAPI *ParseAPI
// tokenAPI is the TokenAPI from which Accept() reads the result and
// through which Accept() and Skip() flush the reader.
// NOTE(review): presumably the ParseAPI's own TokenAPI - confirm in On().
tokenAPI *TokenAPI
// forkedTokenAPI is the "child" TokenAPI handed over by On(). Accept()
// merges it and Skip() clears its results.
// NOTE(review): presumably a fork on which the TokenHandler was run; the
// fork is created outside the code visible here - confirm.
forkedTokenAPI *TokenAPI
// ok is the value returned by Accept(), Skip() and Stay(); the action
// methods only do their work when it is true (a match was found).
ok bool
}
// Accept tells the parser to move the read cursor past a match that was
// found, and to make the TokenHandlerResult from the TokenAPI available in the
// ParseAPI through the ParseAPI.Result() method.
// found by a TokenHandler, and to make the TokenHandlerResult from the
// TokenAPI available in the ParseAPI through the ParseAPI.Result() method.
//
// Returns true in case a match was found.
// When no match was found, then no action is taken and false is returned.
func (a *ParseAPIOnAction) Accept() bool {
if a.ok {
a.tokenAPI.Merge()
a.flushReader()
a.parseAPI.result = a.tokenAPI.root.result
a.forkedTokenAPI.Merge()
a.parseAPI.result = a.tokenAPI.Result()
a.tokenAPI.detachChilds()
if a.tokenAPI.flushReader() {
a.parseAPI.initLoopCheck()
}
}
return a.ok
}
@ -145,10 +150,12 @@ func (a *ParseAPIOnAction) Accept() bool {
func (a *ParseAPIOnAction) Skip() bool {
if a.ok {
a.parseAPI.result = nil
a.tokenAPI.clearResults()
a.tokenAPI.syncCursorTo(a.tokenAPI.root)
a.forkedTokenAPI.clearResults()
a.tokenAPI.detachChilds()
a.flushReader()
a.forkedTokenAPI.syncCursorTo(a.tokenAPI)
if a.tokenAPI.flushReader() {
a.parseAPI.initLoopCheck()
}
}
return a.ok
}
@ -170,14 +177,6 @@ func (a *ParseAPIOnAction) Stay() bool {
return a.ok
}
func (a *ParseAPIOnAction) flushReader() {
if a.tokenAPI.result.offset > 0 {
a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset)
a.tokenAPI.root.result.offset = 0
a.parseAPI.initLoopCheck()
}
}
// Result returns a TokenHandlerResult struct, containing results as produced by the
// last ParseAPI.On().Accept() call.
func (p *ParseAPI) Result() *TokenHandlerResult {

View File

@ -298,7 +298,7 @@ func TestGivenLoopingParserDefinition_ParserPanics(t *testing.T) {
// p.On(c.Max(5, a.AnyRune))
//
// The problem here is that Max(5, ...) will also match when there is
// no more input, since Max(5, ---) is actually MinMax(0, 5, ...).
// no more input, since Max(5, ...) is actually MinMax(0, 5, ...).
// Therefore the loop will never stop. Solving the loop was simple:
//
// p.On(c.MinMax(1, 5, a.AnyRune))

View File

@ -8,15 +8,15 @@ import (
)
// TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from
// the reader and to report back tokenizing results. For easy lookahead support,
// a forking strategy is provided.
// a parsekit.reader.Reader and to report back tokenizing results. For easy
// lookahead support, a forking strategy is provided.
//
// BASIC OPERATION:
//
// To retrieve the next rune from the TokenAPI, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the results of the TokenAPI and the read cursor is moved
// is then added to the result runes of the TokenAPI and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
@ -63,7 +63,6 @@ import (
// no bookkeeping has to be implemented when implementing a parser.
type TokenAPI struct {
reader *reader.Reader
root *TokenAPI // the root TokenAPI
parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child
child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent
result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
@ -75,7 +74,6 @@ func NewTokenAPI(r io.Reader) *TokenAPI {
reader: reader.New(r),
result: newTokenHandlerResult(),
}
input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?".
return input
}
@ -141,7 +139,6 @@ func (i *TokenAPI) Fork() *TokenAPI {
// Create the new fork.
child := &TokenAPI{
reader: i.reader,
root: i.root,
parent: i,
}
child.result = newTokenHandlerResult()
@ -200,6 +197,15 @@ func (i *TokenAPI) detachChildsRecurse() {
i.parent = nil
}
// flushReader hands the current result offset to the reader's Flush method
// and resets that offset to zero. Nothing is done when the offset is not
// positive.
//
// The return value tells the caller whether a flush was actually performed.
// NOTE(review): Flush presumably discards input that was already consumed
// from the reader's buffer - confirm against the reader package.
func (i *TokenAPI) flushReader() bool {
	if i.result.offset <= 0 {
		// Nothing was read since the last flush.
		return false
	}
	i.reader.Flush(i.result.offset)
	i.result.offset = 0
	return true
}
// Result returns the TokenHandlerResult data for the TokenAPI. The returned struct
// can be used to retrieve and to modify result data.
func (i *TokenAPI) Result() *TokenHandlerResult {

View File

@ -116,8 +116,9 @@ var A = struct {
CurlyClose TokenHandler
Tilde TokenHandler
Newline TokenHandler
Blank TokenHandler
Blanks TokenHandler
Whitespace TokenHandler
WhitespaceAndNewlines TokenHandler
EndOfLine TokenHandler
Digit TokenHandler
DigitNotZero TokenHandler
@ -130,6 +131,9 @@ var A = struct {
ASCII TokenHandler
ASCIILower TokenHandler
ASCIIUpper TokenHandler
Letter TokenHandler
Lower TokenHandler
Upper TokenHandler
HexDigit TokenHandler
Octet TokenHandler
IPv4 TokenHandler
@ -193,8 +197,9 @@ var A = struct {
Pipe: MatchRune('|'),
CurlyClose: MatchRune('}'),
Tilde: MatchRune('~'),
Blank: MatchBlank(),
Blanks: MatchBlanks(),
Whitespace: MatchWhitespace(),
WhitespaceAndNewlines: MatchWhitespaceAndNewlines(),
EndOfLine: MatchEndOfLine(),
Digit: MatchDigit(),
DigitNotZero: MatchDigitNotZero(),
@ -207,6 +212,9 @@ var A = struct {
ASCII: MatchASCII(),
ASCIILower: MatchASCIILower(),
ASCIIUpper: MatchASCIIUpper(),
Letter: MatchUnicodeLetter(),
Lower: MatchUnicodeLower(),
Upper: MatchUnicodeUpper(),
HexDigit: MatchHexDigit(),
Octet: MatchOctet(false),
IPv4: MatchIPv4(true),
@ -352,20 +360,45 @@ func MatchRuneRange(start rune, end rune) TokenHandler {
}
}
// MatchWhitespace creates a TokenHandler that matches the input against one
// or more whitespace characters, meansing tabs and spaces.
// MatchBlank creates a TokenHandler that matches one rune from the input
// against blank characters, meaning tabs and spaces.
//
// When you need whitespace matching to also include newlines, then make use
// of MatchWhitespaceAndNewlines().
func MatchWhitespace() TokenHandler {
return MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t')))
// When you need whitespace matching, which also includes characters like
// newlines, then take a look at MatchWhitespace().
func MatchBlank() TokenHandler {
return MatchAny(MatchRune(' '), MatchRune('\t'))
}
// MatchWhitespaceAndNewlines creates a TokenHandler that matches the input
// against one or more whitespace and/or newline characters, meaning tabs,
// spaces and newlines ("\r\n" and "\n").
func MatchWhitespaceAndNewlines() TokenHandler {
return MatchOneOrMore(MatchAny(MatchRune(' '), MatchRune('\t'), MatchStr("\r\n"), MatchRune('\n')))
// MatchBlanks creates a TokenHandler that matches the input against one
// or more blank characters, meaning tabs and spaces.
//
// When you need whitespace matching, which also includes characters like
// newlines, then make use of MatchWhitespace().
func MatchBlanks() TokenHandler {
return MatchOneOrMore(MatchBlank())
}
// MatchWhitespace creates a TokenHandler that matches a run of one or more
// whitespace runes on the input, where a rune counts as whitespace when
// unicode.IsSpace reports true for it.
func MatchWhitespace() TokenHandler {
	singleWhitespace := MatchRuneByCallback(unicode.IsSpace)
	return MatchOneOrMore(singleWhitespace)
}
// MatchRuneByCallback creates a TokenHandler that reads a single rune from
// the input and passes it to the provided callback function. The rune is
// accepted, and a match is reported, only when reading the rune succeeded
// and the callback returned true for it.
//
// The callback has the same signature as the unicode.Is* functions, so those
// can be passed in directly. E.g. MatchRuneByCallback(unicode.IsLower).
func MatchRuneByCallback(callback func(rune) bool) TokenHandler {
	return func(t *TokenAPI) bool {
		r, err := t.NextRune()
		// The callback is only consulted when a rune could be read.
		if err != nil || !callback(r) {
			return false
		}
		t.Accept()
		return true
	}
}
// MatchEndOfLine creates a TokenHandler that matches a newline ("\r\n" or "\n") or EOF.
@ -649,7 +682,7 @@ func MatchDigit() TokenHandler {
// MatchDigits creates a TokenHandler that checks if one or more digits can be read
// from the input.
func MatchDigits() TokenHandler {
return MatchOneOrMore(MatchRuneRange('0', '9'))
return MatchOneOrMore(MatchDigit())
}
// MatchDigitNotZero creates a TokenHandler that checks if a single digit not equal
@ -707,6 +740,24 @@ func MatchASCIIUpper() TokenHandler {
return MatchRuneRange('A', 'Z')
}
// MatchUnicodeLetter creates a TokenHandler function that matches a single
// rune from the input when unicode.IsLetter(rune) reports it to be a letter.
func MatchUnicodeLetter() TokenHandler {
	isLetter := unicode.IsLetter
	return MatchRuneByCallback(isLetter)
}
// MatchUnicodeUpper creates a TokenHandler function that matches a single
// rune from the input when unicode.IsUpper(rune) reports it to be an upper
// case letter.
func MatchUnicodeUpper() TokenHandler {
	isUpper := unicode.IsUpper
	return MatchRuneByCallback(isUpper)
}
// MatchUnicodeLower creates a TokenHandler function that matches a single
// rune from the input when unicode.IsLower(rune) reports it to be a lower
// case letter.
func MatchUnicodeLower() TokenHandler {
	isLower := unicode.IsLower
	return MatchRuneByCallback(isLower)
}
// MatchHexDigit creates a TokenHandler function that check if a single hexadecimal
// digit can be read from the input.
func MatchHexDigit() TokenHandler {
@ -908,15 +959,15 @@ func MatchIPv6Net(normalize bool) TokenHandler {
//
// Note that if the TokenHandler does not apply, a mismatch will be reported back,
// even though we would have dropped the output anyway. So if you would like
// to drop optional whitespace, then use something like:
// to drop optional blanks (spaces and tabs), then use something like:
//
// M.Drop(C.Opt(A.Whitespace))
// M.Drop(C.Opt(A.Blank))
//
// instead of:
//
// M.Drop(A.Whitespace)
// M.Drop(A.Blank)
//
// Since whitespace is defined as "1 or more spaces and/or tabs", the input
// Since A.Blank is defined as "a single space or tab", the input
// string "bork" would not match against the second form, but " bork" would.
// In both cases, it would match the first form.
func ModifyDrop(handler TokenHandler) TokenHandler {
@ -960,8 +1011,8 @@ func modifyTrim(handler TokenHandler, cutset string, trimLeft bool, trimRight bo
}
// ModifyTrimSpace creates a TokenHandler that checks if the provided TokenHandler applies.
// If it does, then its output is taken and all leading and trailing whitespace charcters,
// as defined by Unicode (spaces, tabs, carriage returns and newlines) are removed from it.
// If it does, then its output is taken and all leading and trailing whitespace characters,
// as defined by Unicode are removed from it.
func ModifyTrimSpace(handler TokenHandler) TokenHandler {
return ModifyByCallback(handler, strings.TrimSpace)
}

View File

@ -157,9 +157,12 @@ func TestAtoms(t *testing.T) {
{"|", a.Pipe, true, "|"},
{"}", a.CurlyClose, true, "}"},
{"~", a.Tilde, true, "~"},
{" \t \t \r\n", a.Whitespace, true, " \t \t "},
{"\r", a.WhitespaceAndNewlines, false, ""},
{" \t\r\n \r", a.WhitespaceAndNewlines, true, " \t\r\n "},
{"\t \t \r\n", a.Blank, true, "\t"},
{" \t \t \r\n", a.Blanks, true, " \t \t "},
{"xxx", a.Whitespace, false, ""},
{" ", a.Whitespace, true, " "},
{"\t", a.Whitespace, true, "\t"},
{" \t\r\n \r\v\f ", a.Whitespace, true, " \t\r\n \r\v\f "},
{"", a.EndOfLine, true, ""},
{"\r\n", a.EndOfLine, true, "\r\n"},
{"\n", a.EndOfLine, true, "\n"},
@ -182,6 +185,13 @@ func TestAtoms(t *testing.T) {
{"Z", a.ASCIIUpper, true, "Z"},
{"a", a.ASCIIUpper, false, ""},
{"z", a.ASCIIUpper, false, ""},
{"1", a.Letter, false, ""},
{"a", a.Letter, true, "a"},
{"Ø", a.Letter, true, "Ø"},
{"Ë", a.Lower, false, ""},
{"ë", a.Lower, true, "ë"},
{"ä", a.Upper, false, "ä"},
{"Ä", a.Upper, true, "Ä"},
{"0", a.HexDigit, true, "0"},
{"9", a.HexDigit, true, "9"},
{"a", a.HexDigit, true, "a"},
@ -403,16 +413,16 @@ func TestCombination(t *testing.T) {
c.Opt(a.SquareOpen),
m.Trim(
c.Seq(
c.Opt(a.Whitespace),
c.Opt(a.Blanks),
c.Rep(3, a.AngleClose),
m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string {
return fmt.Sprintf("%d", len(s))
}),
m.Replace(c.Separated(a.Comma, c.Opt(a.Whitespace)), ", "),
m.Replace(c.Separated(a.Comma, c.Opt(a.Blanks)), ", "),
m.ToUpper(c.Min(1, a.ASCIILower)),
m.Drop(a.Excl),
c.Rep(3, a.AngleOpen),
c.Opt(a.Whitespace),
c.Opt(a.Blanks),
),
" \t",
),