Finished working through the new version of the tokenizer code.

Maurice Makaay 2019-07-10 20:36:21 +00:00
parent 48d7fda9f8
commit 7598b62dd0
6 changed files with 136 additions and 28 deletions

View File

@@ -274,7 +274,7 @@ func TestClearRunes(t *testing.T) {
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'a'
 	api.Accept()     // Add to runes
-	api.ClearRunes() // Clear the runes
+	api.ClearRunes() // Clear the runes, giving us a fresh start.
 	api.NextRune()   // Read 'p'
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'r'

View File

@@ -636,17 +636,12 @@ func MatchExcept(handler Handler, except Handler) Handler {
 // for the lookAhead handler is ignored.
 func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-			}
+			child := t.Fork()
+			result := lookAhead(t)
 			t.Dispose(child)
-			return true
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
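
For reference, a minimal usage sketch of the reworked positive lookahead, assuming the combinator is exposed as c.FollowedBy (as in the test file further down) and using the tokenize2 import path from the new token_test.go:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// Match "a", but only when "b" follows it. The lookahead runs on a
	// fork that is disposed afterwards, so the "b" is not consumed.
	followedBy := tokenize.New(c.FollowedBy(a.Rune('b'), a.Rune('a')))
	if api, err := followedBy("ab"); err == nil {
		fmt.Println(api.String()) // a
	}
}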
@@ -657,17 +652,12 @@ func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 // the handler is accepted.
 func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if !lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-				t.Dispose(child)
-				return true
-			}
+			child := t.Fork()
+			result := !lookAhead(t)
+			t.Dispose(child)
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
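
The negative variant follows the same shape; since the lookahead fork is disposed whether it matches or not, the old Merge bookkeeping is no longer needed. A sketch under the same assumptions as above:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// Match "x" only when it is not followed by "a".
	notFollowedBy := tokenize.New(c.NotFollowedBy(a.Rune('a'), a.Rune('x')))
	if api, err := notFollowedBy("xy"); err == nil {
		fmt.Println(api.String()) // x
	}
}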
@@ -681,7 +671,7 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 //
 // Without flushing the input, the input reader will allocate memory
 // during the parsing process, eventually enough to hold the full input
-// in memory. By wrapping Handlers with DoFlushInput, you can tell parsekit
+// in memory. By wrapping Handlers with an input flusher, you can tell parsekit
 // that the accumulated input so far will no longer be needed, allowing
 // this input to be flushed from memory.
 //
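
A sketch of what such a wrapper looks like in use, assuming the flusher is exposed as C.FlushInput (the internal test at the bottom of this commit calls it that); the record handler itself is a made-up placeholder:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// A hypothetical record: one or more digits, terminated by a newline.
	record := c.Seq(c.OneOrMore(a.Digit), a.Rune('\n'))
	// Each completed record marks its input as no longer needed, so the
	// reader can flush it instead of buffering the whole input stream.
	records := c.OneOrMore(c.FlushInput(record))
	if api, err := tokenize.New(records)("123\n456\n"); err == nil {
		fmt.Println(api.String())
	}
}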
@@ -1203,7 +1193,6 @@ func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler {
 	})
 }
 
-// TODO Use better interpreter from parser code?
 func interpretString(str string) (string, error) {
 	var sb strings.Builder
 	for len(str) > 0 {
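
The diff view cuts interpretString off here. For reference, one way such a loop can be finished around the standard library; this is a sketch, not necessarily the implementation this commit contains:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func interpretStringSketch(str string) (string, error) {
	var sb strings.Builder
	for len(str) > 0 {
		// UnquoteChar decodes one character, resolving escape sequences
		// like \n and \u2318, and returns the unconsumed tail. An illegal
		// escape such as \g makes it return an error, which stops the
		// interpretation loop right there.
		r, _, tail, err := strconv.UnquoteChar(str, '"')
		if err != nil {
			return sb.String(), err
		}
		sb.WriteRune(r)
		str = tail
	}
	return sb.String(), nil
}

func main() {
	s, err := interpretStringSketch(`interpreted \n string \u2318`)
	fmt.Printf("%q %v\n", s, err)
}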

View File

@@ -19,8 +19,10 @@ func TestCombinatorsTempDebug(t *testing.T) {
 func TestCombinators(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"", c.Not(a.Rune('b')), false, ""},
 		{"abc not", c.Not(a.Rune('b')), true, "a"},
 		{"bcd not", c.Not(a.Rune('b')), false, ""},
+		{"aaaxxxb", c.OneOrMore(c.Not(a.Rune('b'))), true, "aaaxxx"},
 		{"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"},
 		{"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""},
 		{"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"},
@@ -71,8 +73,20 @@ func TestCombinators(t *testing.T) {
 		{"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"},
 		{`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`},
 		{" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""},
+		{" a", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{"a ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{" a ", m.TrimSpace(c.OneOrMore(a.AnyRune)), true, "a"},
+		{"ab", c.FollowedBy(a.Rune('b'), a.Rune('a')), true, "a"},
+		{"ba", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aa", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aaabbbcccddd", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), true, "aaabbbccc"},
+		{"aaabbbcccxxx", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), false, ""},
+		{"xy", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"yx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"xa", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xxxyyyzzzaaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), false, ""},
+		{"xxxyyyzzzbaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), true, "xxxyyyzzz"},
 	})
 }
@@ -119,8 +133,10 @@ func TestAtoms(t *testing.T) {
 		{"\xbc with AnyRune", a.AnyRune, true, "�"},
 		{"", a.AnyRune, false, ""},
 		{"⌘", a.ValidRune, true, "⌘"},
-		{"\xbc with ValidRune", a.ValidRune, false, "�"},
+		{"\xbc with ValidRune", a.ValidRune, false, ""},
 		{"", a.ValidRune, false, ""},
+		{"\xbc with InvalidRune", a.InvalidRune, true, "�"},
+		{"ok with InvalidRune", a.InvalidRune, false, ""},
 		{" ", a.Space, true, " "},
 		{"X", a.Space, false, ""},
 		{"\t", a.Tab, true, "\t"},
@@ -234,6 +250,7 @@ func TestAtoms(t *testing.T) {
 		{"0", a.IntegerBetween(-10, 10), true, "0"},
 		{"10", a.IntegerBetween(-10, 10), true, "10"},
 		{"11", a.IntegerBetween(0, 10), false, ""},
+		{"fifteen", a.IntegerBetween(0, 10), false, ""},
 	})
 }
@@ -250,7 +267,11 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256123", tokenize.MatchOctet(false), false, ""},
 		{"300", tokenize.MatchOctet(false), false, ""},
-		// Normalized octet.
+		// Octet.
+		{"0", tokenize.MatchOctet(false), true, "0"},
+		{"02", tokenize.MatchOctet(false), true, "02"},
+		{"003", tokenize.MatchOctet(false), true, "003"},
+		{"256", tokenize.MatchOctet(false), false, ""},
 		{"0X", a.Octet, true, "0"},
 		{"00X", a.Octet, true, "0"},
 		{"000X", a.Octet, true, "0"},
@@ -261,6 +282,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"300", a.Octet, false, ""},
 		// IPv4 address.
+		{"0.0.0.0", tokenize.MatchIPv4(false), true, "0.0.0.0"},
+		{"010.0.255.01", tokenize.MatchIPv4(false), true, "010.0.255.01"},
 		{"0.0.0.0", a.IPv4, true, "0.0.0.0"},
 		{"10.20.30.40", a.IPv4, true, "10.20.30.40"},
 		{"010.020.003.004", a.IPv4, true, "10.20.3.4"},
@@ -268,6 +291,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256.255.255.255", a.IPv4, false, ""},
 		// IPv4 CIDR netmask.
+		{"0", tokenize.MatchIPv4CIDRMask(false), true, "0"},
+		{"000", tokenize.MatchIPv4CIDRMask(false), true, "000"},
 		{"0", a.IPv4CIDRMask, true, "0"},
 		{"00", a.IPv4CIDRMask, true, "0"},
 		{"000", a.IPv4CIDRMask, true, "0"},
@@ -276,6 +301,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"33", a.IPv4CIDRMask, false, ""},
 		// IPv4 netmask in dotted quad format.
+		{"0.0.0.0", tokenize.MatchIPv4Netmask(false), true, "0.0.0.0"},
+		{"255.128.000.000", tokenize.MatchIPv4Netmask(false), true, "255.128.000.000"},
 		{"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"},
 		{"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"},
 		{"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"},
@@ -283,6 +310,8 @@ func TestIPv4Atoms(t *testing.T) {
 		// IPv4 address + CIDR or dotted quad netmask.
 		{"192.168.6.123", a.IPv4Net, false, ""},
+		{"192.168.6.123/24", tokenize.MatchIPv4Net(false), true, "192.168.6.123/24"},
+		{"001.002.003.004/016", tokenize.MatchIPv4Net(false), true, "001.002.003.004/016"},
 		{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
 		{"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"},
 		{"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"},
@@ -323,6 +352,8 @@ func TestIPv6Atoms(t *testing.T) {
 func TestModifiers(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"missed me!", m.Drop(a.Rune('w')), false, ""},
+		{"where are you?", m.Drop(a.Rune('w')), true, ""},
 		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"},
 		{"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"},
 		{" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
@@ -332,6 +363,7 @@ func TestModifiers(t *testing.T) {
 		{" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"},
 		{"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"},
 		{"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"},
+		{"abcdefghijk", m.ByCallback(a.Str("xyz"), func(s string) string { return "X" }), false, ""},
 		{"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"},
 		{"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"},
 	})
@@ -363,6 +395,12 @@ func TestTokenMakers(t *testing.T) {
 		{`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)),
 			[]tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}},
 		{`\uD801 invalid rune`, tok.StrInterpreted("D", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "D", Value: "� invalid rune"}}},
+		// The returned error is not checked here, but it is enough to see that
+		// parsing stopped after the illegal \g escape sequence.
+		{`invalid \g escape`, tok.StrInterpreted("E", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "E", Value: "invalid "}}},
 		{"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}},
 		{"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{
 			{Type: "bar", Value: byte('R')},
@@ -410,9 +448,38 @@ func TestTokenMakers(t *testing.T) {
 			{Type: "P", Value: false},
 			{Type: "P", Value: false},
 		}},
+		{`anything`, tok.ByValue("Q", c.OneOrMore(a.AnyRune), "Kaboom!"), []tokenize.Token{{Type: "Q", Value: "Kaboom!"}}},
 	})
 }
 
+func TestTokenGroup_Match(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))))
+	api, err := tokenizer("xxxxx")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 1, len(tokens), "Length of tokens slice")
+	contained := tokens[0].Value.([]tokenize.Token)
+	AssertEqual(t, 3, len(contained), "Length of contained tokens")
+	AssertEqual(t, 1, contained[0].Type.(int), "Type of contained Token 1")
+	AssertEqual(t, 2, contained[1].Type.(int), "Type of contained Token 2")
+	AssertEqual(t, 3, contained[2].Type.(int), "Type of contained Token 3")
+}
+
+func TestTokenGroup_Mismatch(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))).Optional())
+	api, err := tokenizer("12345")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 0, len(tokens), "Length of tokens slice")
+}
+
 // I know, this is hell, but that's the whole point of this test :->
 func TestCombination(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M

View File

@@ -6,8 +6,8 @@ import (
 // Token defines a lexical token as produced by tokenize.Handlers.
 //
-// The only mandatory data in a Token are the Runes. The Type and Value fields
-// are optional fields that can be filled with data at will.
+// The Type and Value fields are optional fields that can be filled
+// with data at will.
 //
 // The use of the Type field is to let a tokenizer communicate to
 // the parser what type of token it's handling.
@@ -30,12 +30,12 @@ func (t Token) String() string {
 	value := ""
 	if t.Value != nil {
 		switch t.Value.(type) {
-		case []*Token:
+		case []Token:
 			return fmt.Sprintf("%v%v", tokenType, t.Value)
 		case string:
 			value = fmt.Sprintf("%q", t.Value)
 		case rune:
-			value = fmt.Sprintf("%v", t.Value)
+			value = fmt.Sprintf("'%c'", t.Value)
 		case bool:
 			value = fmt.Sprintf("%v", t.Value)
 		default:

tokenize2/token_test.go (new file, 31 additions)
View File

@@ -0,0 +1,31 @@
+package tokenize2_test
+
+import (
+	"fmt"
+
+	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
+)
+
+func ExampleToken_String() {
+	fmt.Println(tokenize.Token{Type: "Name", Value: "Peter Pan"})
+	fmt.Println(tokenize.Token{Type: "Gender", Value: 'm'})
+	fmt.Println(tokenize.Token{Type: "CanFly", Value: true})
+	fmt.Println(tokenize.Token{Type: "Friends", Value: []tokenize.Token{
+		{Type: "Name", Value: "Tinkerbell"},
+		{Type: "Name", Value: "Tootles"},
+		{Type: "Name", Value: "Slightly"},
+		{Type: "Name", Value: "Nibs"},
+	}})
+	fmt.Println(tokenize.Token{Type: "FirstMovieYear", Value: 1924})
+
+	// Output:
+	// Name("Peter Pan")
+	// Gender('m')
+	// CanFly(true)
+	// Friends[Name("Tinkerbell") Name("Tootles") Name("Slightly") Name("Nibs")]
+	// FirstMovieYear((int)1924)
+}

View File

@@ -95,6 +95,27 @@ func TestFlushInput(t *testing.T) {
 	AssertEqual(t, "cool", api.String(), "end result")
 }
 
+func TestInputFlusherWrapper(t *testing.T) {
+	runeA := A.Rune('a')
+	flushB := C.FlushInput(A.Rune('b'))
+	api := NewAPI("abaab")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 1 read")
+	AssertEqual(t, "a", api.String(), "runes after 1 read")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 2 reads + input flush")
+	AssertEqual(t, "ab", api.String(), "runes after 2 reads")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 3 reads")
+	AssertEqual(t, "aba", api.String(), "runes after 3 reads")
+	runeA(api)
+	AssertEqual(t, 2, api.stackFrame.offset, "offset after 4 reads")
+	AssertEqual(t, "abaa", api.String(), "runes after 4 reads")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 5 reads + input flush")
+	AssertEqual(t, "abaab", api.String(), "runes after 5 reads")
+}
+
 func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
 	if expected != actual {
 		t.Errorf(