Finished working through the new version of the tokenizer code.

Maurice Makaay 2019-07-10 20:36:21 +00:00
parent 48d7fda9f8
commit 7598b62dd0
6 changed files with 136 additions and 28 deletions

View File

@@ -274,7 +274,7 @@ func TestClearRunes(t *testing.T) {
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'a'
 	api.Accept()     // Add to runes
-	api.ClearRunes() // Clear the runes
+	api.ClearRunes() // Clear the runes, giving us a fresh start.
 	api.NextRune()   // Read 'p'
 	api.Accept()     // Add to runes
 	api.NextRune()   // Read 'r'

View File

@@ -636,17 +636,12 @@ func MatchExcept(handler Handler, except Handler) Handler {
 // for the lookAhead handler is ignored.
 func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-			}
+			child := t.Fork()
+			result := lookAhead(t)
 			t.Dispose(child)
-			return true
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
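
For reference, a minimal usage sketch of the reworked positive lookahead, assuming the combinator is exposed as c.FollowedBy (as in the test file further down) and using the tokenize2 import path from the new token_test.go:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// Match "a", but only when "b" follows it. The lookahead runs on a
	// fork that is disposed afterwards, so the "b" is not consumed.
	followedBy := tokenize.New(c.FollowedBy(a.Rune('b'), a.Rune('a')))
	if api, err := followedBy("ab"); err == nil {
		fmt.Println(api.String()) // a
	}
}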
@@ -657,17 +652,12 @@ func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
 // the handler is accepted.
 func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 	return func(t *API) bool {
-		child := t.Fork()
 		if handler(t) {
-			subChild := t.Fork()
-			if !lookAhead(t) {
-				t.Dispose(subChild)
-				t.Merge(child)
-				t.Dispose(child)
-				return true
-			}
+			child := t.Fork()
+			result := !lookAhead(t)
+			t.Dispose(child)
+			return result
 		}
-		t.Dispose(child)
 		return false
 	}
 }
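
The negative variant follows the same shape; since the lookahead fork is disposed whether it matches or not, the old Merge bookkeeping is no longer needed. A sketch under the same assumptions as above:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// Match "x" only when it is not followed by "a".
	notFollowedBy := tokenize.New(c.NotFollowedBy(a.Rune('a'), a.Rune('x')))
	if api, err := notFollowedBy("xy"); err == nil {
		fmt.Println(api.String()) // x
	}
}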
@@ -681,7 +671,7 @@ func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
 //
 // Without flushing the input, the input reader will allocate memory
 // during the parsing process, eventually enough to hold the full input
-// in memory. By wrapping Handlers with DoFlushInput, you can tell parsekit
+// in memory. By wrapping Handlers with an input flusher, you can tell parsekit
 // that the accumulated input so far will no longer be needed, allowing
 // this input to be flushed from memory.
 //
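
A sketch of what such a wrapper looks like in use, assuming the flusher is exposed as C.FlushInput (the internal test at the bottom of this commit calls it that); the record handler itself is a made-up placeholder:

package main

import (
	"fmt"

	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)

func main() {
	c, a := tokenize.C, tokenize.A
	// A hypothetical record: one or more digits, terminated by a newline.
	record := c.Seq(c.OneOrMore(a.Digit), a.Rune('\n'))
	// Each completed record marks its input as no longer needed, so the
	// reader can flush it instead of buffering the whole input stream.
	records := c.OneOrMore(c.FlushInput(record))
	if api, err := tokenize.New(records)("123\n456\n"); err == nil {
		fmt.Println(api.String())
	}
}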
@@ -1203,7 +1193,6 @@ func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler {
 	})
 }
 
-// TODO Use better interpreter from parser code?
 func interpretString(str string) (string, error) {
 	var sb strings.Builder
 	for len(str) > 0 {
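
The diff view cuts interpretString off here. For reference, one way such a loop can be finished around the standard library; this is a sketch, not necessarily the implementation this commit contains:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

func interpretStringSketch(str string) (string, error) {
	var sb strings.Builder
	for len(str) > 0 {
		// UnquoteChar decodes one character, resolving escape sequences
		// like \n and \u2318, and returns the unconsumed tail. An illegal
		// escape such as \g makes it return an error, which stops the
		// interpretation loop right there.
		r, _, tail, err := strconv.UnquoteChar(str, '"')
		if err != nil {
			return sb.String(), err
		}
		sb.WriteRune(r)
		str = tail
	}
	return sb.String(), nil
}

func main() {
	s, err := interpretStringSketch(`interpreted \n string \u2318`)
	fmt.Printf("%q %v\n", s, err)
}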

View File

@@ -19,8 +19,10 @@ func TestCombinatorsTempDebug(t *testing.T) {
 func TestCombinators(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"", c.Not(a.Rune('b')), false, ""},
 		{"abc not", c.Not(a.Rune('b')), true, "a"},
 		{"bcd not", c.Not(a.Rune('b')), false, ""},
+		{"aaaxxxb", c.OneOrMore(c.Not(a.Rune('b'))), true, "aaaxxx"},
 		{"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"},
 		{"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""},
 		{"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"},
@@ -71,8 +73,20 @@ func TestCombinators(t *testing.T) {
 		{"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"},
 		{`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`},
 		{" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""},
-		{" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""},
+		{" a", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{"a ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, "a"},
+		{" a ", m.TrimSpace(c.OneOrMore(a.AnyRune)), true, "a"},
+		{"ab", c.FollowedBy(a.Rune('b'), a.Rune('a')), true, "a"},
+		{"ba", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aa", c.FollowedBy(a.Rune('b'), a.Rune('a')), false, ""},
+		{"aaabbbcccddd", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), true, "aaabbbccc"},
+		{"aaabbbcccxxx", c.FollowedBy(c.OneOrMore(a.Rune('d')), c.OneOrMore(a.Rune('a')).Then(c.OneOrMore(c.Not(a.Rune('d'))))), false, ""},
+		{"xy", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"yx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xx", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), true, "x"},
+		{"xa", c.NotFollowedBy(a.Rune('a'), a.Rune('x')), false, ""},
+		{"xxxyyyzzzaaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), false, ""},
+		{"xxxyyyzzzbaa", c.NotFollowedBy(a.Rune('a'), c.OneOrMore(a.Runes('x', 'y', 'z'))), true, "xxxyyyzzz"},
 	})
 }
@@ -119,8 +133,10 @@ func TestAtoms(t *testing.T) {
 		{"\xbc with AnyRune", a.AnyRune, true, "�"},
 		{"", a.AnyRune, false, ""},
 		{"⌘", a.ValidRune, true, "⌘"},
-		{"\xbc with ValidRune", a.ValidRune, false, "�"},
+		{"\xbc with ValidRune", a.ValidRune, false, ""},
 		{"", a.ValidRune, false, ""},
+		{"\xbc with InvalidRune", a.InvalidRune, true, "�"},
+		{"ok with InvalidRune", a.InvalidRune, false, ""},
 		{" ", a.Space, true, " "},
 		{"X", a.Space, false, ""},
 		{"\t", a.Tab, true, "\t"},
@@ -234,6 +250,7 @@ func TestAtoms(t *testing.T) {
 		{"0", a.IntegerBetween(-10, 10), true, "0"},
 		{"10", a.IntegerBetween(-10, 10), true, "10"},
 		{"11", a.IntegerBetween(0, 10), false, ""},
+		{"fifteen", a.IntegerBetween(0, 10), false, ""},
 	})
 }
@@ -250,7 +267,11 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256123", tokenize.MatchOctet(false), false, ""},
 		{"300", tokenize.MatchOctet(false), false, ""},
-		// Normalized octet.
+		// Octet.
+		{"0", tokenize.MatchOctet(false), true, "0"},
+		{"02", tokenize.MatchOctet(false), true, "02"},
+		{"003", tokenize.MatchOctet(false), true, "003"},
+		{"256", tokenize.MatchOctet(false), false, ""},
 		{"0X", a.Octet, true, "0"},
 		{"00X", a.Octet, true, "0"},
 		{"000X", a.Octet, true, "0"},
@@ -261,6 +282,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"300", a.Octet, false, ""},
 		// IPv4 address.
+		{"0.0.0.0", tokenize.MatchIPv4(false), true, "0.0.0.0"},
+		{"010.0.255.01", tokenize.MatchIPv4(false), true, "010.0.255.01"},
 		{"0.0.0.0", a.IPv4, true, "0.0.0.0"},
 		{"10.20.30.40", a.IPv4, true, "10.20.30.40"},
 		{"010.020.003.004", a.IPv4, true, "10.20.3.4"},
@@ -268,6 +291,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"256.255.255.255", a.IPv4, false, ""},
 		// IPv4 CIDR netmask.
+		{"0", tokenize.MatchIPv4CIDRMask(false), true, "0"},
+		{"000", tokenize.MatchIPv4CIDRMask(false), true, "000"},
 		{"0", a.IPv4CIDRMask, true, "0"},
 		{"00", a.IPv4CIDRMask, true, "0"},
 		{"000", a.IPv4CIDRMask, true, "0"},
@@ -276,6 +301,8 @@ func TestIPv4Atoms(t *testing.T) {
 		{"33", a.IPv4CIDRMask, false, ""},
 		// IPv4 netmask in dotted quad format.
+		{"0.0.0.0", tokenize.MatchIPv4Netmask(false), true, "0.0.0.0"},
+		{"255.128.000.000", tokenize.MatchIPv4Netmask(false), true, "255.128.000.000"},
 		{"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"},
 		{"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"},
 		{"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"},
@@ -283,6 +310,8 @@ func TestIPv4Atoms(t *testing.T) {
 		// IPv4 address + CIDR or dotted quad netmask.
 		{"192.168.6.123", a.IPv4Net, false, ""},
+		{"192.168.6.123/24", tokenize.MatchIPv4Net(false), true, "192.168.6.123/24"},
+		{"001.002.003.004/016", tokenize.MatchIPv4Net(false), true, "001.002.003.004/016"},
 		{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
 		{"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"},
 		{"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"},
@@ -323,6 +352,8 @@ func TestIPv6Atoms(t *testing.T) {
 func TestModifiers(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M
 	AssertHandlers(t, []HandlerT{
+		{"missed me!", m.Drop(a.Rune('w')), false, ""},
+		{"where are you?", m.Drop(a.Rune('w')), true, ""},
 		{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"},
 		{"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"},
 		{" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
@@ -332,6 +363,7 @@ func TestModifiers(t *testing.T) {
 		{" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"},
 		{"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"},
 		{"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"},
+		{"abcdefghijk", m.ByCallback(a.Str("xyz"), func(s string) string { return "X" }), false, ""},
 		{"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"},
 		{"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"},
 	})
@@ -363,6 +395,12 @@ func TestTokenMakers(t *testing.T) {
 		{`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)),
 			[]tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}},
 		{`\uD801 invalid rune`, tok.StrInterpreted("D", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "D", Value: "� invalid rune"}}},
+		// The returned error is not checked here, but it is enough to see that
+		// parsing stopped after the illegal \g escape sequence.
+		{`invalid \g escape`, tok.StrInterpreted("E", c.OneOrMore(a.AnyRune)), []tokenize.Token{{Type: "E", Value: "invalid "}}},
 		{"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}},
 		{"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{
 			{Type: "bar", Value: byte('R')},
@@ -410,9 +448,38 @@ func TestTokenMakers(t *testing.T) {
 			{Type: "P", Value: false},
 			{Type: "P", Value: false},
 		}},
+		{`anything`, tok.ByValue("Q", c.OneOrMore(a.AnyRune), "Kaboom!"), []tokenize.Token{{Type: "Q", Value: "Kaboom!"}}},
 	})
 }
 
+func TestTokenGroup_Match(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))))
+	api, err := tokenizer("xxxxx")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 1, len(tokens), "Length of tokens slice")
+	contained := tokens[0].Value.([]tokenize.Token)
+	AssertEqual(t, 3, len(contained), "Length of contained tokens")
+	AssertEqual(t, 1, contained[0].Type.(int), "Type of contained Token 1")
+	AssertEqual(t, 2, contained[1].Type.(int), "Type of contained Token 2")
+	AssertEqual(t, 3, contained[2].Type.(int), "Type of contained Token 3")
+}
+
+func TestTokenGroup_Mismatch(t *testing.T) {
+	var c, a, tok = tokenize.C, tokenize.A, tokenize.T
+	tokenizer := tokenize.New(tok.Group("Group",
+		c.Seq(tok.Rune(1, a.Letter), tok.Rune(2, a.Letter), tok.Rune(3, a.Letter))).Optional())
+	api, err := tokenizer("12345")
+	AssertTrue(t, err == nil, "Tokenizer result")
+	tokens := api.Tokens()
+	AssertEqual(t, 0, len(tokens), "Length of tokens slice")
+}
+
 // I know, this is hell, but that's the whole point of this test :->
 func TestCombination(t *testing.T) {
 	var c, a, m = tokenize.C, tokenize.A, tokenize.M

View File

@@ -6,8 +6,8 @@ import (
 // Token defines a lexical token as produced by tokenize.Handlers.
 //
-// The only mandatory data in a Token are the Runes. The Type and Value fields
-// are optional fields that can be filled with data at will.
+// The Type and Value fields are optional fields that can be filled
+// with data at will.
 //
 // The use of the Type field is to let a tokenizer communicate to
 // the parser what type of token it's handling.
@@ -30,12 +30,12 @@ func (t Token) String() string {
 	value := ""
 	if t.Value != nil {
 		switch t.Value.(type) {
-		case []*Token:
+		case []Token:
 			return fmt.Sprintf("%v%v", tokenType, t.Value)
 		case string:
 			value = fmt.Sprintf("%q", t.Value)
 		case rune:
-			value = fmt.Sprintf("%v", t.Value)
+			value = fmt.Sprintf("'%c'", t.Value)
 		case bool:
 			value = fmt.Sprintf("%v", t.Value)
 		default:

tokenize2/token_test.go (new file, 31 additions)
View File

@@ -0,0 +1,31 @@
+package tokenize2_test
+
+import (
+	"fmt"
+
+	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
+)
+
+func ExampleToken_String() {
+	fmt.Println(tokenize.Token{Type: "Name", Value: "Peter Pan"})
+	fmt.Println(tokenize.Token{Type: "Gender", Value: 'm'})
+	fmt.Println(tokenize.Token{Type: "CanFly", Value: true})
+	fmt.Println(tokenize.Token{Type: "Friends", Value: []tokenize.Token{
+		{Type: "Name", Value: "Tinkerbell"},
+		{Type: "Name", Value: "Tootles"},
+		{Type: "Name", Value: "Slightly"},
+		{Type: "Name", Value: "Nibs"},
+	}})
+	fmt.Println(tokenize.Token{Type: "FirstMovieYear", Value: 1924})
+
+	// Output:
+	// Name("Peter Pan")
+	// Gender('m')
+	// CanFly(true)
+	// Friends[Name("Tinkerbell") Name("Tootles") Name("Slightly") Name("Nibs")]
+	// FirstMovieYear((int)1924)
+}

View File

@@ -95,6 +95,27 @@ func TestFlushInput(t *testing.T) {
 	AssertEqual(t, "cool", api.String(), "end result")
 }
 
+func TestInputFlusherWrapper(t *testing.T) {
+	runeA := A.Rune('a')
+	flushB := C.FlushInput(A.Rune('b'))
+	api := NewAPI("abaab")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 1 read")
+	AssertEqual(t, "a", api.String(), "runes after 1 read")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 2 reads + input flush")
+	AssertEqual(t, "ab", api.String(), "runes after 2 reads")
+	runeA(api)
+	AssertEqual(t, 1, api.stackFrame.offset, "offset after 3 reads")
+	AssertEqual(t, "aba", api.String(), "runes after 3 reads")
+	runeA(api)
+	AssertEqual(t, 2, api.stackFrame.offset, "offset after 4 reads")
+	AssertEqual(t, "abaa", api.String(), "runes after 4 reads")
+	flushB(api)
+	AssertEqual(t, 0, api.stackFrame.offset, "offset after 5 reads + input flush")
+	AssertEqual(t, "abaab", api.String(), "runes after 5 reads")
+}
+
 func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
 	if expected != actual {
 		t.Errorf(