From 7598b62dd06fb03e0c79abd336b5f67ab6461fc3 Mon Sep 17 00:00:00 2001
From: Maurice Makaay
Date: Wed, 10 Jul 2019 20:36:21 +0000
Subject: [PATCH] Finished working through the new version of the tokenizer code.

---
 tokenize2/api_test.go                |  2 +-
 tokenize2/handlers_builtin.go        | 27 +++-------
 tokenize2/handlers_builtin_test.go   | 75 ++++++++++++++++++++++++++--
 tokenize2/token.go                   |  8 +--
 tokenize2/token_test.go              | 31 ++++++++++++
 tokenize2/tokenizer_whitebox_test.go | 21 ++++++++
 6 files changed, 136 insertions(+), 28 deletions(-)
 create mode 100644 tokenize2/token_test.go

diff --git a/tokenize2/api_test.go b/tokenize2/api_test.go
index 8986fc1..e331cd0 100644
--- a/tokenize2/api_test.go
+++ b/tokenize2/api_test.go
@@ -274,7 +274,7 @@ func TestClearRunes(t *testing.T) {
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'a'
 	api.Accept()     // Add to runes
-	api.ClearRunes() // Clear the runes
+	api.ClearRunes() // Clear the runes, giving us a fresh start.
 	api.NextRune()   // Read 'p'
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'r'
diff --git a/tokenize2/handlers_builtin.go b/tokenize2/handlers_builtin.go
index 48fd908..6821b71 100644
--- a/tokenize2/handlers_builtin.go
+++ b/tokenize2/handlers_builtin.go
@@ -636,17 +636,12 @@ func MatchExcept(handler Handler, except Handler) Handler {
 // for the lookAhead handler is ignored.
 func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-			}
+			child := t.Fork()
+			result := lookAhead(t)
 			t.Dispose(child)
-			return true
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
@@ -657,17 +652,12 @@ func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 // the handler is accepted.
 func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if !lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-				t.Dispose(child)
-				return true
-			}
+			child := t.Fork()
+			result := !lookAhead(t)
+			t.Dispose(child)
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
@@ -681,7 +671,7 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 //
 // Without flushing the input, the input reader will allocate memory
 // during the parsing process, eventually enough to hold the full input
-// in memory. By wrapping Handlers with DoFlushInput, you can tell parsekit
+// in memory. By wrapping Handlers with an input flusher, you can tell parsekit
 // that the accumulated input so far will no longer be needed, allowing
 // this input to be flushed from memory.
 //
@@ -1203,7 +1193,6 @@ func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler {
 	})
 }
 
-// TODO Use better interpreter from parser code?
 func interpretString(str string) (string, error) {
 	var sb strings.Builder
 	for len(str) > 0 {
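A quick usage sketch of the reworked look-ahead combinators above (not part of
the patch; it assumes the C/A handler aliases, the tokenize.New entry point,
and the String() accessor on the tokenizer result, as used by the test files
below):

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
    )

    func main() {
        var c, a = tokenize.C, tokenize.A

        // Match one or more letters, but only when a digit follows them.
        // The look-ahead runs against a forked child that is always
        // disposed, so the digit itself is never consumed.
        letters := c.FollowedBy(a.Digit, c.OneOrMore(a.Letter))

        tokenizer := tokenize.New(letters)
        if result, err := tokenizer("abc1"); err == nil {
            fmt.Println(result.String()) // prints "abc", leaving the "1" unread
        }
    }
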
diff --git a/tokenize2/handlers_builtin_test.go b/tokenize2/handlers_builtin_test.go
index 26de338..a2a8897 100644
--- a/tokenize2/handlers_builtin_test.go
+++ b/tokenize2/handlers_builtin_test.go
@@ -19,8 +19,10 @@ func TestCombinatorsTempDebug(t *testing.T) {
 func TestCombinators(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"", c.Not(a.Rune('b')), false, ""},
 		{"abc not", c.Not(a.Rune('b')), true, "a"},
 		{"bcd not", c.Not(a.Rune('b')), false, ""},
+		{"aaaxxxb", c.OneOrMore(c.Not(a.Rune('b'))), true, "aaaxxx"},
 		{"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"},
 		{"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""},
 		{"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"},
@@ -71,8 +73,20 @@ func TestCombinators(t *testing.T) {
 		{"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"},
 		{`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`},
 		{" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""},
+		{" a", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{"a ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{" a ", m.TrimSpace(c.OneOrMore(a.AnyRune)), true, "a"},
+		{"ab", c.FollowedBy(a.Rune('b'), a.Rune('a')), true, "a"},
+		{"ba", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aa", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aaabbbcccddd", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), true, "aaabbbccc"},
+		{"aaabbbcccxxx", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), false, ""},
+		{"xy", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"yx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"xa", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xxxyyyzzzaaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), false, ""},
+		{"xxxyyyzzzbaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), true, "xxxyyyzzz"},
 	})
 }
 
@@ -119,8 +133,10 @@ func TestAtoms(t *testing.T) {
 		{"\xbc with AnyRune", a.AnyRune, true, "�"},
 		{"", a.AnyRune, false, ""},
 		{"⌘", a.ValidRune, true, "⌘"},
-		{"\xbc with ValidRune", a.ValidRune, false, "�"},
+		{"\xbc with ValidRune", a.ValidRune, false, ""},
 		{"", a.ValidRune, false, ""},
+		{"\xbc with InvalidRune", a.InvalidRune, true, "�"},
+		{"ok with InvalidRune", a.InvalidRune, false, ""},
 		{" ", a.Space, true, " "},
 		{"X", a.Space, false, ""},
 		{"\t", a.Tab, true, "\t"},
@@ -234,6 +250,7 @@ func TestAtoms(t *testing.T) {
 		{"0", a.IntegerBetween(-10, 10), true, "0"},
 		{"10", a.IntegerBetween(-10, 10), true, "10"},
 		{"11", a.IntegerBetween(0, 10), false, ""},
+		{"fifteen", a.IntegerBetween(0, 10), false, ""},
 	})
 }
 
@@ -250,7 +267,11 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256123", tokenize.MatchOctet(false), false, ""},
 		{"300", tokenize.MatchOctet(false), false, ""},
 
-		// Normalized octet.
+		// Octet.
+		{"0", tokenize.MatchOctet(false), true, "0"},
+		{"02", tokenize.MatchOctet(false), true, "02"},
+		{"003", tokenize.MatchOctet(false), true, "003"},
+		{"256", tokenize.MatchOctet(false), false, ""},
 		{"0X", a.Octet, true, "0"},
 		{"00X", a.Octet, true, "0"},
 		{"000X", a.Octet, true, "0"},
@@ -261,6 +282,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"300", a.Octet, false, ""},
 
 		// IPv4 address.
+		{"0.0.0.0", tokenize.MatchIPv4(false), true, "0.0.0.0"},
+		{"010.0.255.01", tokenize.MatchIPv4(false), true, "010.0.255.01"},
 		{"0.0.0.0", a.IPv4, true, "0.0.0.0"},
 		{"10.20.30.40", a.IPv4, true, "10.20.30.40"},
 		{"010.020.003.004", a.IPv4, true, "10.20.3.4"},
@@ -268,6 +291,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256.255.255.255", a.IPv4, false, ""},
 
 		// IPv4 CIDR netmask.
+		{"0", tokenize.MatchIPv4CIDRMask(false), true, "0"},
+		{"000", tokenize.MatchIPv4CIDRMask(false), true, "000"},
 		{"0", a.IPv4CIDRMask, true, "0"},
 		{"00", a.IPv4CIDRMask, true, "0"},
 		{"000", a.IPv4CIDRMask, true, "0"},
@@ -276,6 +301,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"32", a.IPv4CIDRMask, true, "32"},
 		{"33", a.IPv4CIDRMask, false, ""},
 
 		// IPv4 netmask in dotted quad format.
+		{"0.0.0.0", tokenize.MatchIPv4Netmask(false), true, "0.0.0.0"},
+		{"255.128.000.000", tokenize.MatchIPv4Netmask(false), true, "255.128.000.000"},
 		{"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"},
 		{"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"},
@@ -283,6 +310,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"},
 
 		// IPv4 address + CIDR or dotted quad netmask.
 		{"192.168.6.123", a.IPv4Net, false, ""},
+		{"192.168.6.123/24", tokenize.MatchIPv4Net(false), true, "192.168.6.123/24"},
+		{"001.002.003.004/016", tokenize.MatchIPv4Net(false), true, "001.002.003.004/016"},
 		{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
 		{"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"},
 		{"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"},
@@ -323,6 +352,8 @@ func TestIPv6Atoms(t *testing.T) {
 func TestModifiers(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"missed me!", m.Drop(a.Rune('w')), false, ""},
+		{"where are you?", m.Drop(a.Rune('w')), true, ""},
 		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"},
 		{"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"},
 		{" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
@@ -332,6 +363,7 @@ func TestModifiers(t *testing.T) {
 		{" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"},
 		{"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"},
 		{"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"},
+		{"abcdefghijk", m.ByCallback(a.Str("xyz"), func(s string) string { return "X" }), false, ""},
 		{"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"},
 		{"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"},
 	})
 }
@@ -363,6 +395,12 @@ func TestTokenMakers(t *testing.T) {
 		{`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)),
 			[]tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}},
 
+		{`\uD801 invalid rune`, tok.StrInterpreted("D", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "D", Value: "� invalid rune"}}},
+
+		// I don't check the returned error here; seeing that the parsing
+		// stopped after the illegal \g escape sequence is good enough.
+		{`invalid \g escape`, tok.StrInterpreted("E", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "E", Value: "invalid "}}},
+
 		{"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}},
 		{"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{
 			{Type: "bar", Value: byte('R')},
@@ -410,9 +448,38 @@ func TestTokenMakers(t *testing.T) {
 			{Type: "P", Value: false},
 			{Type: "P", Value: false},
 		}},
+
+		{`anything`, tok.ByValue("Q", c.OneOrMore(a.AnyRune), "Kaboom!"), []tokenize.Token{{Type: "Q", Value: "Kaboom!"}}},
 	})
 }
 
+func TestTokenGroup_Match(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))))
+
+	api, err := tokenizer("xxxxx")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 1, len(tokens), "Length of tokens slice")
+	contained := tokens[0].Value.([]tokenize.Token)
+	AssertEqual(t, 3, len(contained), "Length of contained tokens")
+	AssertEqual(t, 1, contained[0].Type.(int), "Type of contained Token 1")
+	AssertEqual(t, 2, contained[1].Type.(int), "Type of contained Token 2")
+	AssertEqual(t, 3, contained[2].Type.(int), "Type of contained Token 3")
+}
+
+func TestTokenGroup_Mismatch(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))).Optional())
+
+	api, err := tokenizer("12345")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 0, len(tokens), "Length of tokens slice")
+}
+
 // I know, this is hell, but that's the whole point for this test :->
 func TestCombination(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
diff --git a/tokenize2/token.go b/tokenize2/token.go
index 166367a..a55add4 100644
--- a/tokenize2/token.go
+++ b/tokenize2/token.go
@@ -6,8 +6,8 @@ import (
 
 // Token defines a lexical token as produced by tokenize.Handlers.
 //
-// The only mandatory data in a Token are the Runes. The Type and Value fields
-// are optional fields that can be filled with data at will.
+// The Type and Value fields are optional and can be filled
+// with data at will.
 //
 // The use of the Type field is to let a tokenizer communicate to
 // the parser what type of token it's handling.
@@ -30,12 +30,12 @@ func (t Token) String() string {
 	value := ""
 	if t.Value != nil {
 		switch t.Value.(type) {
-		case []*Token:
+		case []Token:
 			return fmt.Sprintf("%v%v", tokenType, t.Value)
 		case string:
 			value = fmt.Sprintf("%q", t.Value)
 		case rune:
-			value = fmt.Sprintf("%v", t.Value)
+			value = fmt.Sprintf("'%c'", t.Value)
 		case bool:
 			value = fmt.Sprintf("%v", t.Value)
 		default:
diff --git a/tokenize2/token_test.go b/tokenize2/token_test.go
new file mode 100644
index 0000000..e82b7cf
--- /dev/null
+++ b/tokenize2/token_test.go
@@ -0,0 +1,31 @@
+package tokenize2_test
+
+import (
+	"fmt"
+
+	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
+)
+
+func ExampleToken_String() {
+	fmt.Println(tokenize.Token{Type: "Name", Value: "Peter Pan"})
+
+	fmt.Println(tokenize.Token{Type: "Gender", Value: 'm'})
+
+	fmt.Println(tokenize.Token{Type: "CanFly", Value: true})
+
+	fmt.Println(tokenize.Token{Type: "Friends", Value: []tokenize.Token{
+		{Type: "Name", Value: "Tinkerbell"},
+		{Type: "Name", Value: "Tootles"},
+		{Type: "Name", Value: "Slightly"},
+		{Type: "Name", Value: "Nibs"},
+	}})
+
+	fmt.Println(tokenize.Token{Type: "FirstMovieYear", Value: 1924})
+
+	// Output:
+	// Name("Peter Pan")
+	// Gender('m')
+	// CanFly(true)
+	// Friends[Name("Tinkerbell") Name("Tootles") Name("Slightly") Name("Nibs")]
+	// FirstMovieYear((int)1924)
+}
diff --git a/tokenize2/tokenizer_whitebox_test.go b/tokenize2/tokenizer_whitebox_test.go
index 10ae253..5c6556c 100644
--- a/tokenize2/tokenizer_whitebox_test.go
+++ b/tokenize2/tokenizer_whitebox_test.go
@@ -95,6 +95,27 @@ func TestFlushInput(t *testing.T) {
 	AssertEqual(t, "cool", api.String(), "end result")
 }
 
+func TestInputFlusherWrapper(t *testing.T) {
+	runeA := A.Rune('a')
+	flushB := C.FlushInput(A.Rune('b'))
+	api := NewAPI("abaab")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 1 read")
+	AssertEqual(t, "a", api.String(), "runes after 1 read")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 2 reads + input flush")
+	AssertEqual(t, "ab", api.String(), "runes after 2 reads")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 3 reads")
+	AssertEqual(t, "aba", api.String(), "runes after 3 reads")
+	runeA(api)
+	AssertEqual(t, 2, api.stackFrame.offset, "offset after 4 reads")
+	AssertEqual(t, "abaa", api.String(), "runes after 4 reads")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 5 reads + input flush")
+	AssertEqual(t, "abaab", api.String(), "runes after 5 reads")
+}
+
 func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
 	if expected != actual {
 		t.Errorf(
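A companion sketch for the input-flushing behavior that the DoFlushInput docs
and TestInputFlusherWrapper above cover (again an assumption-hedged example,
not part of the patch; C.FlushInput, C.Separated, and the aliases are taken
from the tests, and the argument order of Separated follows the
c.Separated(a.Comma, a.Digit) test case):

    package main

    import (
        "fmt"

        tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
    )

    func main() {
        var c, a = tokenize.C, tokenize.A

        // Wrapping the per-number handler in FlushInput tells parsekit that
        // input read up to that point will not be needed again, so the
        // reader buffer can be flushed instead of growing with the input.
        number := c.FlushInput(c.OneOrMore(a.Digit))
        numbers := c.Separated(a.Comma, number)

        tokenizer := tokenize.New(numbers)
        if result, err := tokenizer("10,20,30"); err == nil {
            fmt.Println(result.String()) // prints "10,20,30"
        }
    }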