From 48d7fda9f8a62dd6dfa4205fe5e4bd2230d94e3e Mon Sep 17 00:00:00 2001
From: Maurice Makaay
Date: Wed, 10 Jul 2019 11:26:47 +0000
Subject: [PATCH] New implementation for performance.

---
 tokenize2/api.go                     |  374 +++
 tokenize2/api_test.go                |  330 ++
 tokenize2/assertions_test.go         |  118 ++
 tokenize2/callerinfo.go              |   33 +
 tokenize2/callerinfo_test.go         |   35 +
 tokenize2/cursor.go                  |   45 +
 tokenize2/cursor_test.go             |   69 ++
 tokenize2/handler.go                 |   53 +
 tokenize2/handler_test.go            |  101 ++
 tokenize2/handlers_builtin.go        | 1500 ++++++++++++++++++++++++++
 tokenize2/handlers_builtin_test.go   |  445 ++++++++
 tokenize2/token.go                   |   47 +
 tokenize2/tokenize.go                |   41 +
 tokenize2/tokenizer_test.go          |  223 ++++
 tokenize2/tokenizer_whitebox_test.go |  110 ++
 15 files changed, 3524 insertions(+)
 create mode 100644 tokenize2/api.go
 create mode 100644 tokenize2/api_test.go
 create mode 100644 tokenize2/assertions_test.go
 create mode 100644 tokenize2/callerinfo.go
 create mode 100644 tokenize2/callerinfo_test.go
 create mode 100644 tokenize2/cursor.go
 create mode 100644 tokenize2/cursor_test.go
 create mode 100644 tokenize2/handler.go
 create mode 100644 tokenize2/handler_test.go
 create mode 100644 tokenize2/handlers_builtin.go
 create mode 100644 tokenize2/handlers_builtin_test.go
 create mode 100644 tokenize2/token.go
 create mode 100644 tokenize2/tokenize.go
 create mode 100644 tokenize2/tokenizer_test.go
 create mode 100644 tokenize2/tokenizer_whitebox_test.go

diff --git a/tokenize2/api.go b/tokenize2/api.go
new file mode 100644
index 0000000..2b0aa07
--- /dev/null
+++ b/tokenize2/api.go
@@ -0,0 +1,374 @@
+package tokenize2
+
+import (
+	"git.makaay.nl/mauricem/go-parsekit/read"
+)
+
+// API holds the internal state of a tokenizer run and provides an API that
+// tokenize.Handler functions can use to:
+//
+// • read and accept runes from the input (NextRune, Accept)
+//
+// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
+//
+// • flush already read input data when it is no longer needed (FlushInput)
+//
+// • retrieve the tokenizer Result struct (Result) to read or modify the results
+//
+// BASIC OPERATION:
+//
+// To retrieve the next rune from the API, call the NextRune() method.
+//
+// When the rune is to be accepted as input, call the method Accept(). The rune
+// is then added to the result runes of the API and the read cursor is moved
+// forward.
+//
+// By invoking NextRune() + Accept() multiple times, the result can be extended
+// with as many runes as needed. Runes collected this way can later on be
+// retrieved using the method Result().Runes().
+//
+// It is mandatory to call Accept() after retrieving a rune, before calling
+// NextRune() again. Failing to do so will result in a panic.
+//
+// Besides adding runes to the result, it is also possible to modify the
+// stored runes or to add lexical Tokens to the result. For all things
+// concerning results, take a look at the Result struct, which
+// can be accessed through the method Result().
+//
+// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
+//
+// Sometimes, we must be able to perform a lookahead, which might either
+// succeed or fail. In case of a failing lookahead, the state of the
+// API must be brought back to the original state, so we can try
+// a different route.
+//
+// The way in which this is supported, is by forking an API struct by
+// calling method Fork(). This will return a forked child API, with
+// empty result data, but using the same read cursor position as the
+// forked parent.
+//
+// After forking, the same interface as described for BASIC OPERATION can be
+// used to fill the results. When the lookahead was successful, then
+// Merge() can be called on the forked child to append the child's results
+// to the parent's results, and to move the read cursor position to that
+// of the child.
+//
+// When the lookahead was unsuccessful, then the forked child API can be
+// disposed by calling Dispose() on the forked child. This is not mandatory.
+// Garbage collection will take care of this automatically.
+// The parent API was never modified, so it can safely be used after disposal
+// as if the lookahead never happened.
+//
+// Opinionated note:
+// Many tokenizers/parsers take a different approach to lookaheads, by using
+// peeks and by moving the read cursor position back and forth, or by putting
+// read input back on the input stream. That often leads to code that is
+// efficient, but in my opinion not very intuitive to read. It can also
+// be tedious to get the cursor back to the correct position, which can lead
+// to hard-to-track bugs. I much prefer this forking method, since no
+// bookkeeping has to be implemented when writing a parser.
+type API struct {
+	reader      *read.Buffer // the input data reader
+	lastRune    rune         // the rune as retrieved by the last NextRune() call
+	lastRuneErr error        // the error for the last NextRune() call
+	runeRead    bool         // whether or not a rune was read using NextRune()
+	runes       []rune       // the rune stack
+	tokens      []Token      // the token stack
+	stackFrames []stackFrame // the stack frames, containing stack level-specific data
+	stackLevel  int          // the current stack level
+	stackFrame  *stackFrame  // the current stack frame
+}
+
+type stackFrame struct {
+	offset     int // current rune offset relative to the Reader's sliding window
+	runeStart  int
+	runeEnd    int
+	tokenStart int
+	tokenEnd   int
+	cursor     Cursor
+
+	// TODO
+	err error // can be used by a Handler to report a specific issue with the input
+}
+
+const initialStackDepth = 10
+const initialTokenDepth = 10
+const initialRuneDepth = 10
+
+// NewAPI initializes a new API struct, wrapped around the provided input.
+// For an overview of allowed inputs, take a look at the documentation
+// for parsekit.read.New().
+func NewAPI(input interface{}) *API {
+	api := &API{
+		reader:      read.New(input),
+		runes:       make([]rune, 0, initialRuneDepth),
+		tokens:      make([]Token, 0, initialTokenDepth),
+		stackFrames: make([]stackFrame, 1, initialStackDepth),
+	}
+	api.stackFrame = &api.stackFrames[0]
+
+	return api
+}
+
+// NextRune returns the rune at the current read offset.
+//
+// When an invalid UTF8 rune is encountered on the input, it is replaced with
+// the utf8.RuneError rune. It's up to the caller to handle this as an error
+// when needed.
+//
+// After reading a rune, it must be Accept()-ed to move the read cursor forward
+// to the next rune. Doing so is mandatory. When doing a second call to NextRune()
+// without explicitly accepting, this method will panic. You can see this as a
+// built-in unit test, enforcing correct serialization of API method calls.
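+//
+// A minimal sketch of the intended call pattern (the input text is just an
+// example):
+//
+// api := NewAPI("abc")
+// r, err := api.NextRune() // r is now 'a'
+// if err == nil {
+//     api.Accept() // 'a' is added to the results, the cursor moves to 'b'
+// }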
+func (i *API) NextRune() (rune, error) {
+	if i.runeRead {
+		callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
+			"without a prior call to Accept()")
+	}
+
+	readRune, err := i.reader.RuneAt(i.stackFrame.offset)
+	i.lastRune = readRune
+	i.lastRuneErr = err
+	i.runeRead = true
+
+	return readRune, err
+}
+
+// Accept the last rune as read by NextRune() into the Result runes and move
+// the cursor forward.
+//
+// It is not allowed to call Accept() when the previous call to NextRune()
+// returned an error. Calling Accept() in such a case will result in a panic.
+func (i *API) Accept() {
+	if !i.runeRead {
+		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+
+			"without first calling NextRune()")
+	} else if i.lastRuneErr != nil {
+		callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, "+
+			"but the prior call to NextRune() failed")
+	}
+
+	i.runes = append(i.runes, i.lastRune)
+	i.stackFrame.runeEnd++
+	i.stackFrame.cursor.moveByRune(i.lastRune)
+	i.stackFrame.offset++
+	i.runeRead = false
+}
+
+// Fork forks off a child of the API struct. It will reuse the same
+// read buffer and cursor position, but for the rest this is a fresh API.
+//
+// By forking an API, you can freely work with the forked child, without
+// affecting the parent API. This is for example useful when you must perform
+// some form of lookahead.
+//
+// When processing of the Handler was successful and you want to add the results
+// to the parent API, you can call Merge() on the forked child.
+// This will add the results to the results of the parent (runes, tokens).
+// It also updates the read cursor position of the parent to that of the child.
+//
+// When the lookahead was unsuccessful, then the forked child API can be
+// disposed by calling Dispose() on the forked child. This is not mandatory.
+// Garbage collection will take care of this automatically.
+// The parent API was never modified, so it can safely be used after disposal
+// as if the lookahead never happened.
+func (i *API) Fork() int {
+	newStackLevel := i.stackLevel + 1
+	newStackSize := newStackLevel + 1
+
+	// Grow the stack frames capacity when needed.
+	if cap(i.stackFrames) < newStackSize {
+		newFrames := make([]stackFrame, newStackSize, newStackSize*2)
+		copy(newFrames, i.stackFrames)
+		i.stackFrames = newFrames
+	} else {
+		i.stackFrames = i.stackFrames[0:newStackSize]
+	}
+
+	parent := i.stackFrame
+	i.stackLevel++
+	i.stackFrame = &i.stackFrames[i.stackLevel]
+	*i.stackFrame = *parent
+	i.stackFrame.runeStart = parent.runeEnd
+	i.stackFrame.tokenStart = parent.tokenEnd
+	i.runeRead = false
+
+	return i.stackLevel
+}
+
+// Merge appends the results of a forked child API (runes, tokens) to the
+// results of its parent. The read cursor of the parent is also updated
+// to that of the forked child.
+//
+// After the merge operation, the child results are reset, so it can immediately
+// be reused for performing another match. This means that all result data are
+// cleared, but the read cursor position is kept at its current position.
+// This allows a child to feed results in chunks to its parent.
+//
+// Once the child is no longer needed, it can be disposed of by using the
+// method Dispose(), which will return the tokenizer to the parent stack level.
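+//
+// A sketch of the typical lookahead flow (someHandler is a hypothetical
+// Handler):
+//
+// child := api.Fork()
+// if someHandler(api) {
+//     api.Merge(child) // lookahead succeeded: keep the child's results
+// }
+// api.Dispose(child) // in both cases, continue at the parent level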
+func (i *API) Merge(stackLevel int) { + if stackLevel == 0 { + callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on the top-level API stack level 0") + } + if stackLevel != i.stackLevel { + callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on API stack level %d, but the current stack level is %d "+ + "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) + } + + parent := &i.stackFrames[stackLevel-1] + + if parent.runeEnd == i.stackFrame.runeStart { + // The end of the parent slice aligns with the start of the child slice. + // Because of this, to merge the parent slice can simply be expanded + // to include the child slice. + // parent : |----------| + // child: |------| + // After merge operation: + // parent: |-----------------| + // child: |---> continue reading from here + parent.runeEnd = i.stackFrame.runeEnd + i.stackFrame.runeStart = i.stackFrame.runeEnd + } else { + // The end of the parent slice does not align with the start of the + // child slice. The child slice has to be copied onto the end of + // the parent slice. + // parent : |----------| + // child: |------| + // After merge operation: + // parent: |-----------------| + // child: |---> continue reading from here + i.runes = append(i.runes[:parent.runeEnd], i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd]...) + parent.runeEnd = len(i.runes) + i.stackFrame.runeStart = parent.runeEnd + i.stackFrame.runeEnd = parent.runeEnd + } + + // The same logic applies to tokens. + if parent.tokenEnd == i.stackFrame.tokenStart { + parent.tokenEnd = i.stackFrame.tokenEnd + i.stackFrame.tokenStart = i.stackFrame.tokenEnd + } else { + i.tokens = append(i.tokens[:parent.tokenEnd], i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]...) + parent.tokenEnd = len(i.tokens) + i.stackFrame.tokenStart = parent.tokenEnd + i.stackFrame.tokenEnd = parent.tokenEnd + } + + parent.offset = i.stackFrame.offset + parent.cursor = i.stackFrame.cursor + + i.stackFrame.err = nil + i.runeRead = false +} + +func (i *API) Dispose(stackLevel int) { + if stackLevel == 0 { + callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on the top-level API stack level 0") + } + if stackLevel != i.stackLevel { + callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+ + "on API stack level %d, but the current stack level is %d "+ + "(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel) + } + + i.runeRead = false + i.stackLevel = stackLevel - 1 + i.stackFrames = i.stackFrames[:stackLevel] + i.stackFrame = &i.stackFrames[stackLevel-1] + i.runes = i.runes[0:i.stackFrame.runeEnd] + i.tokens = i.tokens[0:i.stackFrame.tokenEnd] +} + +func (i *API) Reset() { + i.runeRead = false + i.stackFrame.runeStart = i.stackFrame.runeEnd + i.stackFrame.tokenStart = i.stackFrame.tokenEnd + i.stackFrame.err = nil +} + +// FlushInput flushes processed input data from the read.Buffer. +// In this context 'processed' means all runes that were read using NextRune() +// and that were added to the results using Accept(). +// +// Note: +// When writing your own TokenHandler, you normally won't have to call this +// method yourself. It is automatically called by parsekit when needed. 
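+//
+// A minimal sketch of manual use in a long-running loop (lineHandler is a
+// hypothetical Handler):
+//
+// for lineHandler(api) {
+//     api.FlushInput() // processed input can now be released from memory
+// }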
+func (i *API) FlushInput() bool { + // result := &(i.state.stack[i.stackLevel]) + if i.stackFrame.offset > 0 { + i.reader.Flush(i.stackFrame.offset) + i.stackFrame.offset = 0 + return true + } + return false +} + +func (i *API) String() string { + return string(i.Runes()) +} + +func (i *API) Runes() []rune { + return i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd] +} + +func (i *API) Rune(offset int) rune { + return i.runes[i.stackFrame.runeStart+offset] +} + +func (i *API) ClearRunes() { + i.runes = i.runes[:i.stackFrame.runeStart] + i.stackFrame.runeEnd = i.stackFrame.runeStart +} + +func (i *API) SetRunes(runes ...rune) { + i.runes = append(i.runes[:i.stackFrame.runeStart], runes...) + i.stackFrame.runeEnd = i.stackFrame.runeStart + len(runes) +} + +func (i *API) AddRunes(runes ...rune) { + i.runes = append(i.runes[:i.stackFrame.runeEnd], runes...) + i.stackFrame.runeEnd += len(runes) +} + +func (i *API) AddString(s string) { + i.AddRunes([]rune(s)...) +} + +func (i *API) SetString(s string) { + i.SetRunes([]rune(s)...) +} + +func (i *API) Cursor() Cursor { + return i.stackFrame.cursor +} + +func (i *API) Tokens() []Token { + return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd] +} + +func (i *API) Token(offset int) Token { + return i.tokens[i.stackFrame.tokenStart+offset] +} + +func (i *API) TokenValue(offset int) interface{} { + return i.tokens[i.stackFrame.tokenStart+offset].Value +} + +func (i *API) ClearTokens() { + i.tokens = i.tokens[:i.stackFrame.tokenStart] + i.stackFrame.tokenEnd = i.stackFrame.tokenStart +} + +func (i *API) SetTokens(tokens ...Token) { + i.tokens = append(i.tokens[:i.stackFrame.tokenStart], tokens...) + i.stackFrame.tokenEnd = i.stackFrame.tokenStart + len(tokens) +} + +func (i *API) AddTokens(tokens ...Token) { + i.tokens = append(i.tokens[:i.stackFrame.tokenEnd], tokens...) 
+ i.stackFrame.tokenEnd += len(tokens) +} diff --git a/tokenize2/api_test.go b/tokenize2/api_test.go new file mode 100644 index 0000000..8986fc1 --- /dev/null +++ b/tokenize2/api_test.go @@ -0,0 +1,330 @@ +package tokenize2_test + +import ( + "fmt" + "testing" + + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" +) + +func ExampleNewAPI() { + tokenize.NewAPI("The input that the API will handle") + + // Output: +} + +func ExampleAPI_NextRune() { + api := tokenize.NewAPI("The input that the API will handle") + r, err := api.NextRune() + fmt.Printf("Rune read from input; %c\n", r) + fmt.Printf("The error: %v\n", err) + fmt.Printf("API results: %q\n", api.String()) + + // Output: + // Rune read from input; T + // The error: + // API results: "" +} + +func ExampleAPI_Accept() { + api := tokenize.NewAPI("The input that the API will handle") + api.NextRune() // reads 'T' + api.Accept() // adds 'T' to the API results + api.NextRune() // reads 'h' + api.Accept() // adds 'h' to the API results + api.NextRune() // reads 'e', but it is not added to the API results + + fmt.Printf("API results: %q\n", api.String()) + + // Output: + // API results: "Th" +} + +func ExampleAPI_modifyingResults() { + api := tokenize.NewAPI("") + + api.AddString("Some runes") + api.AddRunes(' ', 'a', 'd', 'd', 'e', 'd') + api.AddRunes(' ', 'i', 'n', ' ') + api.AddString("various ways") + fmt.Printf("API result first 10 runes: %q\n", api.Runes()[0:10]) + fmt.Printf("API result runes as string: %q\n", api.String()) + + api.SetString("new ") + api.AddString("set ") + api.AddString("of ") + api.AddRunes('r', 'u', 'n', 'e', 's') + fmt.Printf("API result runes as string: %q\n", api.String()) + fmt.Printf("API result runes: %q\n", api.Runes()) + fmt.Printf("API third rune: %q\n", api.Rune(2)) + + api.AddTokens(tokenize.Token{ + Type: 42, + Value: "towel"}) + api.AddTokens(tokenize.Token{ + Type: 73, + Value: "Zaphod"}) + fmt.Printf("API result tokens: %v\n", api.Tokens()) + fmt.Printf("API second result token: %v\n", api.Token(1)) + + // Output: + // API result first 10 runes: ['S' 'o' 'm' 'e' ' ' 'r' 'u' 'n' 'e' 's'] + // API result runes as string: "Some runes added in various ways" + // API result runes as string: "new set of runes" + // API result runes: ['n' 'e' 'w' ' ' 's' 'e' 't' ' ' 'o' 'f' ' ' 'r' 'u' 'n' 'e' 's'] + // API third rune: 'w' + // API result tokens: [42("towel") 73("Zaphod")] + // API second result token: 73("Zaphod") +} + +func ExampleAPI_Reset() { + api := tokenize.NewAPI("Very important input!") + + api.NextRune() + api.Accept() + api.NextRune() + api.Accept() + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) + + // Reset clears the results, but keeps the cursor position. + api.Reset() + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) + + api.NextRune() + api.Accept() + api.NextRune() + api.Accept() + fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor()) + + // Output: + // API results: "Ve" at line 1, column 3 + // API results: "" at line 1, column 3 + // API results: "ry" at line 1, column 5 +} + +func ExampleAPI_Fork() { + // This custom Handler checks for input 'a', 'b' or 'c'. 
+	abcHandler := func(t *tokenize.API) bool {
+		a := tokenize.A
+		for _, r := range []rune{'a', 'b', 'c'} {
+			child := t.Fork() // fork, so we won't change parent t
+			if a.Rune(r)(t) {
+				t.Merge(child)   // accept results into parent of child
+				t.Dispose(child) // return to the parent level
+				return true      // and report a successful match
+			}
+			t.Dispose(child) // return to the parent level
+		}
+		// If we get here, then no match was found. Return false to communicate
+		// this to the caller.
+		return false
+	}
+
+	// Note: a custom Handler is normally not what you need.
+	// You can make use of the parser/combinator tooling to make the
+	// implementation a lot simpler and to take care of forking at
+	// the appropriate places. The handler from above can be replaced with:
+	simpler := tokenize.A.RuneRange('a', 'c')
+
+	result, err := tokenize.New(abcHandler)("another test")
+	fmt.Println(result, err)
+	result, err = tokenize.New(simpler)("curious")
+	fmt.Println(result, err)
+	result, err = tokenize.New(abcHandler)("bang on!")
+	fmt.Println(result, err)
+	result, err = tokenize.New(abcHandler)("not a match")
+	fmt.Println(result, err)
+
+	// Output:
+	// a
+	// c
+	// b
+	// mismatch at start of file
+}
+
+func ExampleAPI_Merge() {
+	tokenHandler := func(t *tokenize.API) bool {
+		child1 := t.Fork()
+		t.NextRune() // reads 'H'
+		t.Accept()
+		t.NextRune() // reads 'i'
+		t.Accept()
+
+		child2 := t.Fork()
+		t.NextRune() // reads ' '
+		t.Accept()
+		t.NextRune() // reads 'm'
+		t.Accept()
+		t.Dispose(child2)
+
+		t.Merge(child1)   // We merge child1, which has read 'H' and 'i' only.
+		t.Dispose(child1) // and clean up child1 to return to the parent
+		return true
+	}
+
+	result, _ := tokenize.New(tokenHandler)("Hi mister X!")
+	fmt.Println(result.String())
+
+	// Output:
+	// Hi
+}
+
+func TestMultipleLevelsOfForksAndMerges(t *testing.T) {
+	api := tokenize.NewAPI("abcdefghijklmnopqrstuvwxyz")
+
+	// Fork a few levels.
+	child1 := api.Fork()
+	child2 := api.Fork()
+	child3 := api.Fork()
+	child4 := api.Fork()
+
+	// Read a rune 'a' from child4.
+	r, _ := api.NextRune()
+	AssertEqual(t, 'a', r, "child4 rune 1")
+	api.Accept()
+	AssertEqual(t, "a", api.String(), "child4 runes after rune 1")
+
+	// Read another rune 'b' from child4.
+	r, _ = api.NextRune()
+	AssertEqual(t, 'b', r, "child4 rune 2")
+	api.Accept()
+	AssertEqual(t, "ab", api.String(), "child4 runes after rune 2")
+
+	// Merge "ab" from child4 to child3.
+	api.Merge(child4)
+	AssertEqual(t, "", api.String(), "child4 runes after first merge")
+
+	// Read some more from child4.
+	r, _ = api.NextRune()
+	AssertEqual(t, 'c', r, "child4 rune 3")
+	api.Accept()
+	AssertEqual(t, "c", api.String(), "child4 runes after rune 3")
+	AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child4 rune 3")
+
+	// Merge "c" from child4 to child3.
+	api.Merge(child4)
+
+	// And dispose of child4, making child3 the active stack level.
+	api.Dispose(child4)
+
+	// Child3 should now have the combined results "abc" from child4's work.
+	AssertEqual(t, "abc", api.String(), "child3 after merge of child4")
+	AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child3 rune 3, after merge of child4")
+
+	// Now read some data from child3.
+	r, _ = api.NextRune()
+	AssertEqual(t, 'd', r, "child3 rune 4")
+	api.Accept()
+
+	r, _ = api.NextRune()
+	AssertEqual(t, 'e', r, "child3 rune 5")
+	api.Accept()
+
+	r, _ = api.NextRune()
+	AssertEqual(t, 'f', r, "child3 rune 6")
+	api.Accept()
+
+	AssertEqual(t, "abcdef", api.String(), "child3 total result after rune 6")
+
+	// Temporarily create some new forks from here, but don't use their outcome.
+	child3sub1 := api.Fork()
+	api.NextRune()
+	api.Accept()
+	api.NextRune()
+	api.Accept()
+	child3sub2 := api.Fork()
+	api.NextRune()
+	api.Accept()
+	api.Merge(child3sub2)   // do merge sub2 down to sub1
+	api.Dispose(child3sub2) // and dispose of sub2
+	api.Dispose(child3sub1) // but dispose of sub1 without merging
+
+	// Instead, merge the results from before this forking segue from child3
+	// to child2 and dispose of child3.
+	api.Merge(child3)
+	api.Dispose(child3)
+
+	AssertEqual(t, "abcdef", api.String(), "child2 total result after merge of child3")
+	AssertEqual(t, "line 1, column 7", api.Cursor().String(), "cursor child2 after merge child3")
+
+	// Merge child2 to child1 and dispose of it.
+	api.Merge(child2)
+	api.Dispose(child2)
+
+	// Merge child1 a few times to the top level api.
+	api.Merge(child1)
+	api.Merge(child1)
+	api.Merge(child1)
+	api.Merge(child1)
+
+	// And dispose of it.
+	api.Dispose(child1)
+
+	// Read some data from the top level api.
+	r, _ = api.NextRune()
+	api.Accept()
+
+	AssertEqual(t, "abcdefg", api.String(), "api string end result")
+	AssertEqual(t, "line 1, column 8", api.Cursor().String(), "api cursor end result")
+}
+
+func TestClearRunes(t *testing.T) {
+	api := tokenize.NewAPI("Laphroaig")
+	api.NextRune()   // Read 'L'
+	api.Accept()     // Add to runes
+	api.NextRune()   // Read 'a'
+	api.Accept()     // Add to runes
+	api.ClearRunes() // Clear the runes
+	api.NextRune()   // Read 'p'
+	api.Accept()     // Add to runes
+	api.NextRune()   // Read 'h'
+	api.Accept()     // Add to runes
+
+	AssertEqual(t, "ph", api.String(), "api string end result")
+}
+
+func TestMergeScenariosForTokens(t *testing.T) {
+	api := tokenize.NewAPI("")
+
+	token1 := tokenize.Token{Value: 1}
+	token2 := tokenize.Token{Value: 2}
+	token3 := tokenize.Token{Value: 3}
+	token4 := tokenize.Token{Value: 4}
+
+	api.SetTokens(token1)
+	tokens := api.Tokens()
+	AssertEqual(t, 1, len(tokens), "Tokens 1")
+
+	child := api.Fork()
+
+	tokens = api.Tokens()
+	AssertEqual(t, 0, len(tokens), "Tokens 2")
+
+	api.AddTokens(token2)
+
+	// Here we can merge by expanding the token slice on the parent,
+	// because the end of the parent slice and the start of the child
+	// slice align.
+	api.Merge(child)
+	api.Dispose(child)
+
+	tokens = api.Tokens()
+	AssertEqual(t, 2, len(tokens), "Tokens 3")
+
+	child = api.Fork()
+	api.AddTokens(token3)
+	api.Reset()
+	api.AddTokens(token4)
+
+	// Here the merge means that token4 will be copied to the end of
+	// the token slice of the parent, since there's a gap at the place
+	// where token3 used to be.
+	api.Merge(child)
+	api.Dispose(child)
+
+	tokens = api.Tokens()
+	AssertEqual(t, 3, len(tokens), "Tokens 4")
+	AssertEqual(t, 1, api.TokenValue(0).(int), "Tokens 4, value 0")
+	AssertEqual(t, 2, api.TokenValue(1).(int), "Tokens 4, value 1")
+	AssertEqual(t, 4, api.TokenValue(2).(int), "Tokens 4, value 2")
+}
diff --git a/tokenize2/assertions_test.go b/tokenize2/assertions_test.go
new file mode 100644
index 0000000..7aa8831
--- /dev/null
+++ b/tokenize2/assertions_test.go
@@ -0,0 +1,118 @@
+package tokenize2_test
+
+// This file contains some tools that are used for writing tests.
+ +import ( + "regexp" + "testing" + + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" +) + +func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) { + if expected != actual { + t.Errorf( + "Unexpected value for %s:\nexpected: %q\nactual: %q", + forWhat, expected, actual) + } +} + +func AssertTrue(t *testing.T, b bool, assertion string) { + if !b { + t.Errorf("Assertion %s is false", assertion) + } +} + +type PanicT struct { + Function func() + Regexp bool + Expect string +} + +func AssertPanics(t *testing.T, testSet []PanicT) { + for _, test := range testSet { + AssertPanic(t, test) + } +} + +func AssertPanic(t *testing.T, p PanicT) { + defer func() { + if r := recover(); r != nil { + mismatch := false + if p.Regexp && !regexp.MustCompile(p.Expect).MatchString(r.(string)) { + mismatch = true + } + if !p.Regexp && p.Expect != r.(string) { + mismatch = true + } + if mismatch { + t.Errorf( + "Code did panic, but unexpected panic message received:\nexpected: %q\nactual: %q", + p.Expect, r) + } + } else { + t.Errorf("Function did not panic (expected panic message: %s)", p.Expect) + } + }() + p.Function() +} + +type HandlerT struct { + Input string + Handler tokenize.Handler + MustMatch bool + Expected string +} + +func AssertHandlers(t *testing.T, testSet []HandlerT) { + for _, test := range testSet { + AssertHandler(t, test) + } +} + +func AssertHandler(t *testing.T, test HandlerT) { + result, err := tokenize.New(test.Handler)(test.Input) + if test.MustMatch { + if err != nil { + t.Errorf("Test %q failed with error: %s", test.Input, err) + } else if output := result.String(); output != test.Expected { + t.Errorf("Test %q failed: not expected output:\nexpected: %q\nactual: %q\n", test.Input, test.Expected, output) + } + } else { + if err == nil { + t.Errorf("Test %q failed: should not match, but it did", test.Input) + } + } +} + +type TokenMakerT struct { + Input string + Handler tokenize.Handler + Expected []tokenize.Token +} + +func AssertTokenMakers(t *testing.T, testSet []TokenMakerT) { + for _, test := range testSet { + AssertTokenMaker(t, test) + } +} + +func AssertTokenMaker(t *testing.T, test TokenMakerT) { + result, err := tokenize.New(test.Handler)(test.Input) + if err != nil { + t.Errorf("Test %q failed with error: %s", test.Input, err) + } else { + if len(result.Tokens()) != len(test.Expected) { + t.Errorf("Unexpected number of tokens in output:\nexpected: %d\nactual: %d", len(test.Expected), len(result.Tokens())) + } + for i, expected := range test.Expected { + actual := result.Token(i) + if expected.Type != actual.Type { + t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type) + } + if expected.Value != actual.Value { + t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value) + } + } + } +} diff --git a/tokenize2/callerinfo.go b/tokenize2/callerinfo.go new file mode 100644 index 0000000..dcb4f21 --- /dev/null +++ b/tokenize2/callerinfo.go @@ -0,0 +1,33 @@ +package tokenize2 + +import ( + "fmt" + "runtime" + "strings" +) + +func callerPanic(name, f string, data ...interface{}) { + filepos := callerBefore(name) + m := fmt.Sprintf(f, data...) 
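+	// Fill the {caller} and {name} placeholders in the formatted message.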
+ m = strings.Replace(m, "{caller}", filepos, -1) + m = strings.Replace(m, "{name}", name, -1) + panic(m) +} + +func callerBefore(name string) string { + found := false + for i := 1; ; i++ { + pc, file, line, ok := runtime.Caller(i) + if found { + return fmt.Sprintf("%s:%d", file, line) + } + if !ok { + return "unknown caller" + } + f := runtime.FuncForPC(pc) + + if strings.HasSuffix(f.Name(), "."+name) { + found = true + } + } +} diff --git a/tokenize2/callerinfo_test.go b/tokenize2/callerinfo_test.go new file mode 100644 index 0000000..d0f1107 --- /dev/null +++ b/tokenize2/callerinfo_test.go @@ -0,0 +1,35 @@ +package tokenize2 + +import ( + "strings" + "testing" +) + +func SomeFunc1() { + SomeFunc2() +} + +func SomeFunc2() { + SomeFunc3() +} + +func SomeFunc3() { + callerPanic("SomeFunc2", "{name} was called from {caller}") +} + +func TestCallerPanic(t *testing.T) { + defer func() { + r := recover() + err := r.(string) + + if !strings.Contains(err, "SomeFunc2 was called from") || !strings.Contains(err, "callerinfo_test.go:") { + t.Fatalf("Unexpected error message: %s", err) + } + }() + SomeFunc1() +} + +func TestCallerBefore_WithFunctionNameNotInStack(t *testing.T) { + caller := callerBefore("NotExistingAtAll") + AssertEqual(t, "unknown caller", caller, "result for name not in stack") +} diff --git a/tokenize2/cursor.go b/tokenize2/cursor.go new file mode 100644 index 0000000..a5e8799 --- /dev/null +++ b/tokenize2/cursor.go @@ -0,0 +1,45 @@ +package tokenize2 + +import ( + "fmt" + "unicode/utf8" +) + +// Cursor represents the position of a cursor in various ways. +type Cursor struct { + Byte int // The cursor offset in bytes + Rune int // The cursor offset in UTF8 runes + Column int // The column at which the cursor is (0-indexed) + Line int // The line at which the cursor is (0-indexed) +} + +// String produces a string representation of the cursor position. +func (c Cursor) String() string { + if c.Line == 0 && c.Column == 0 { + return fmt.Sprintf("start of file") + } + return fmt.Sprintf("line %d, column %d", c.Line+1, c.Column+1) +} + +// move updates the position of the cursor, based on the provided input string. +// The input string represents the runes that the cursor must be moved over. +// This method will take newlines into account to keep track of line numbers and +// column positions automatically. 
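+//
+// For example:
+//
+// c := Cursor{}
+// c.move("foo\nbar") // c.String() now reports "line 2, column 4"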
+func (c *Cursor) move(input string) *Cursor { + for _, r := range input { + c.moveByRune(r) + } + return c +} + +func (c *Cursor) moveByRune(r rune) *Cursor { + c.Byte += utf8.RuneLen(r) + c.Rune++ + if r == '\n' { + c.Column = 0 + c.Line++ + } else { + c.Column++ + } + return c +} diff --git a/tokenize2/cursor_test.go b/tokenize2/cursor_test.go new file mode 100644 index 0000000..8569354 --- /dev/null +++ b/tokenize2/cursor_test.go @@ -0,0 +1,69 @@ +package tokenize2 + +import ( + "fmt" + "testing" +) + +func ExampleCursor_move() { + c := Cursor{} + fmt.Printf("after initialization : %s\n", c) + fmt.Printf("after 'some words' : %s\n", c.move("some words")) + fmt.Printf("after '\\n' : %s\n", c.move("\n")) + fmt.Printf("after '\\r\\nskip\\nlines' : %s\n", c.move("\r\nskip\nlines")) + + // Output: + // after initialization : start of file + // after 'some words' : line 1, column 11 + // after '\n' : line 2, column 1 + // after '\r\nskip\nlines' : line 4, column 6 +} + +func ExampleCursor_String() { + c := Cursor{} + fmt.Println(c.String()) + + c.move("\nfoobar") + fmt.Println(c.String()) + + // Output: + // start of file + // line 2, column 7 +} + +func TestGivenCursor_WhenMoving_CursorIsUpdated(t *testing.T) { + for _, test := range []struct { + name string + input []string + byte int + rune int + line int + column int + }{ + {"No input at all", []string{""}, 0, 0, 0, 0}, + {"One ASCII char", []string{"a"}, 1, 1, 0, 1}, + {"Multiple ASCII chars", []string{"abc"}, 3, 3, 0, 3}, + {"One newline", []string{"\n"}, 1, 1, 1, 0}, + {"Carriage return", []string{"\r\r\r"}, 3, 3, 0, 3}, + {"One UTF8 3 byte char", []string{"⌘"}, 3, 1, 0, 1}, + {"Mixture", []string{"Hello\n\npretty\nW⌘O⌘R⌘L⌘D"}, 31, 23, 3, 9}, + {"Multiple calls", []string{"hello", "world"}, 10, 10, 0, 10}, + } { + c := Cursor{} + for _, s := range test.input { + c.move(s) + } + if c.Byte != test.byte { + t.Errorf("[%s] Unexpected byte offset %d (expected %d)", test.name, c.Byte, test.byte) + } + if c.Rune != test.rune { + t.Errorf("[%s] Unexpected rune offset %d (expected %d)", test.name, c.Rune, test.rune) + } + if c.Line != test.line { + t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, c.Line, test.line) + } + if c.Column != test.column { + t.Errorf("[%s] Unexpected column offset %d (expected %d)", test.name, c.Column, test.column) + } + } +} diff --git a/tokenize2/handler.go b/tokenize2/handler.go new file mode 100644 index 0000000..a2c637b --- /dev/null +++ b/tokenize2/handler.go @@ -0,0 +1,53 @@ +package tokenize2 + +// Handler is the function type that is involved in turning a low level +// stream of UTF8 runes into lexical tokens. Its purpose is to check if input +// data matches some kind of pattern and to report back the results. +// +// A Handler function gets an API as its input and returns a boolean to +// indicate whether or not it found a match on the input. The API is used +// for retrieving input data to match against and for reporting back results. +type Handler func(t *API) bool + +// Match is syntactic sugar that allows you to write a construction like +// NewTokenizer(handler).Execute(input) as handler.Match(input). +func (handler Handler) Match(input interface{}) (*API, error) { + tokenizer := New(handler) + return tokenizer(input) +} + +// Or is syntactic sugar that allows you to write a construction like +// MatchAny(tokenHandler1, tokenHandler2) as tokenHandler1.Or(tokenHandler2). 
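+//
+// For example:
+//
+// trueOrFalse := MatchStr("true").Or(MatchStr("false"))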
+func (handler Handler) Or(otherHandler Handler) Handler {
+	return MatchAny(handler, otherHandler)
+}
+
+// Times is syntactic sugar that allows you to write a construction like
+// MatchRep(3, handler) as handler.Times(3).
+func (handler Handler) Times(n int) Handler {
+	return MatchRep(n, handler)
+}
+
+// Then is syntactic sugar that allows you to write a construction like
+// MatchSeq(handler1, handler2, handler3) as handler1.Then(handler2).Then(handler3).
+func (handler Handler) Then(otherHandler Handler) Handler {
+	return MatchSeq(handler, otherHandler)
+}
+
+// SeparatedBy is syntactic sugar that allows you to write a construction like
+// MatchSeparated(separator, handler) as handler.SeparatedBy(separator).
+func (handler Handler) SeparatedBy(separator Handler) Handler {
+	return MatchSeparated(separator, handler)
+}
+
+// Optional is syntactic sugar that allows you to write a construction like
+// MatchOptional(handler) as handler.Optional().
+func (handler Handler) Optional() Handler {
+	return MatchOptional(handler)
+}
+
+// Except is syntactic sugar that allows you to write a construction like
+// MatchExcept(handler, exceptHandler) as handler.Except(exceptHandler).
+func (handler Handler) Except(exceptHandler Handler) Handler {
+	return MatchExcept(handler, exceptHandler)
+}
diff --git a/tokenize2/handler_test.go b/tokenize2/handler_test.go
new file mode 100644
index 0000000..7688d1e
--- /dev/null
+++ b/tokenize2/handler_test.go
@@ -0,0 +1,101 @@
+package tokenize2_test
+
+import (
+	"fmt"
+	"testing"
+
+	tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
+)
+
+func TestSyntacticSugar(t *testing.T) {
+	var a = tokenize.A
+	AssertHandlers(t, []HandlerT{
+		{"aaaaaa", a.Rune('a').Times(4), true, "aaaa"},
+		{"ababab", a.Rune('a').Or(a.Rune('b')).Times(4), true, "abab"},
+		{"ababab", a.Rune('a').Then(a.Rune('b')), true, "ab"},
+		{"bababa", a.Rune('a').Then(a.Rune('b')), false, ""},
+		{"cccccc", a.Rune('c').Optional(), true, "c"},
+		{"dddddd", a.Rune('c').Optional(), true, ""},
+		{"a,b,c,d", a.ASCII.SeparatedBy(a.Comma), true, "a,b,c,d"},
+		{"a, b, c, d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space)), true, "a, b, c, d"},
+		{"a, b,c,d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space.Optional())), true, "a, b,c,d"},
+		{"a, b, c, d", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma.Then(a.Space.Optional()))), true, "a, b, c, d"},
+		{"a,b ,c, d|", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma).Then(a.Space.Optional())), true, "a,b ,c, d"},
+	})
+}
+
+func ExampleHandler_Times() {
+	c, a := tokenize.C, tokenize.A
+	phoneNumber := c.Seq(a.Rune('0'), a.Digit.Times(9))
+
+	fmt.Println(phoneNumber.Match("0201234567"))
+	// Output:
+	// 0201234567
+}
+
+func ExampleHandler_Then() {
+	c, a := tokenize.C, tokenize.A
+	phoneNumber := a.Rune('0').Then(c.Repeated(9, a.Digit))
+
+	fmt.Println(phoneNumber.Match("0208888888"))
+	// Output:
+	// 0208888888
+}
+
+func ExampleHandler_Or() {
+	c, a := tokenize.C, tokenize.A
+	phoneNumber := c.Seq(a.Str("00").Or(a.Plus), a.Str("31"), a.DigitNotZero, c.Repeated(8, a.Digit))
+
+	fmt.Println(phoneNumber.Match("+31209876543"))
+	fmt.Println(phoneNumber.Match("0031209876543"))
+	fmt.Println(phoneNumber.Match("0031020991234"))
+	fmt.Println(phoneNumber.Match("0031201234"))
+	// Output:
+	// +31209876543
+	// 0031209876543
+	// mismatch at start of file
+	// mismatch at start of file
+}
+
+func ExampleHandler_SeparatedBy() {
+	a, t := tokenize.A, tokenize.T
+	csv := t.Int("number", a.Digits).SeparatedBy(a.Comma)
+
+	r, _ := csv.Match("123,456,7,8,9")
+	for i, token := range
r.Tokens() { + fmt.Printf("[%d] %v\n", i, token) + } + // Output: + // [0] number((int)123) + // [1] number((int)456) + // [2] number((int)7) + // [3] number((int)8) + // [4] number((int)9) +} + +func ExampleHandler_Optional() { + c, a := tokenize.C, tokenize.A + + spanish := c.Seq( + a.Rune('¿').Optional(), + c.OneOrMore(a.AnyRune.Except(a.Question)), + a.Rune('?').Optional()) + + fmt.Println(spanish.Match("¿Habla español María?")) + fmt.Println(spanish.Match("Sí, María habla español.")) + // Output: + // ¿Habla español María? + // Sí, María habla español. +} + +func ExampleHandler_Match() { + r, err := tokenize.A.IPv4.Match("001.002.003.004") + fmt.Println(r, err) + + r, err = tokenize.A.IPv4.Match("1.2.3") + fmt.Println(r, err) + + // Output: + // 1.2.3.4 + // mismatch at start of file +} diff --git a/tokenize2/handlers_builtin.go b/tokenize2/handlers_builtin.go new file mode 100644 index 0000000..48fd908 --- /dev/null +++ b/tokenize2/handlers_builtin.go @@ -0,0 +1,1500 @@ +package tokenize2 + +import ( + "fmt" + "io" + "net" + "strconv" + "strings" + "unicode" + "unicode/utf8" +) + +// C provides convenient access to a range of parser/combinators that can be +// used to construct Handler functions. +// +// Parser/combinators are so called higher order functions that take in one +// or more other Handler functions and output a new Handler. They can be +// used to combine Handler functions in useful ways to create new more complex +// Handler functions. +// +// When using C in your own parser, then it is advised to create a variable +// to reference it, for example: +// +// c := tokenize.C +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. +var C = struct { + Any func(...Handler) Handler + Not func(Handler) Handler + Seq func(...Handler) Handler + Min func(min int, handler Handler) Handler + Max func(max int, handler Handler) Handler + Repeated func(times int, handler Handler) Handler + Optional func(Handler) Handler + ZeroOrMore func(Handler) Handler + OneOrMore func(Handler) Handler + MinMax func(min int, max int, handler Handler) Handler + Separated func(separator Handler, separated Handler) Handler + Except func(except Handler, handler Handler) Handler + FollowedBy func(lookAhead Handler, handler Handler) Handler + NotFollowedBy func(lookAhead Handler, handler Handler) Handler + FlushInput func(Handler) Handler +}{ + Any: MatchAny, + Not: MatchNot, + Seq: MatchSeq, + Min: MatchMin, + Max: MatchMax, + Repeated: MatchRep, + Optional: MatchOptional, + ZeroOrMore: MatchZeroOrMore, + OneOrMore: MatchOneOrMore, + MinMax: MatchMinMax, + Separated: MatchSeparated, + Except: MatchExcept, + FollowedBy: MatchFollowedBy, + NotFollowedBy: MatchNotFollowedBy, + FlushInput: MakeInputFlusher, +} + +// A provides convenient access to a range of atoms or functions to build atoms. +// +// When using A in your own parser, then it is advised to create a variable +// to reference it: +// +// a := tokenize.A +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
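+//
+// For example, a Handler that matches the input "Hi!":
+//
+// a := tokenize.A
+// greeting := a.Str("Hi").Then(a.Excl)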
+var A = struct { + Rune func(rune) Handler + Runes func(...rune) Handler + RuneRange func(rune, rune) Handler + Str func(string) Handler + StrNoCase func(string) Handler + EndOfLine Handler + EndOfFile Handler + UntilEndOfLine Handler + AnyRune Handler + ValidRune Handler + InvalidRune Handler + Space Handler + Tab Handler + CR Handler + LF Handler + CRLF Handler + Excl Handler + DoubleQuote Handler + Hash Handler + Dollar Handler + Percent Handler + Amp Handler + SingleQuote Handler + RoundOpen Handler + LeftParen Handler + RoundClose Handler + RightParen Handler + Asterisk Handler + Multiply Handler + Plus Handler + Add Handler + Comma Handler + Minus Handler + Subtract Handler + Dot Handler + Slash Handler + Divide Handler + Colon Handler + Semicolon Handler + AngleOpen Handler + LessThan Handler + Equal Handler + AngleClose Handler + GreaterThan Handler + Question Handler + At Handler + SquareOpen Handler + Backslash Handler + SquareClose Handler + Caret Handler + Underscore Handler + Backquote Handler + CurlyOpen Handler + Pipe Handler + CurlyClose Handler + Tilde Handler + Newline Handler + Blank Handler + Blanks Handler + Whitespace Handler + UnicodeSpace Handler + Digit Handler + DigitNotZero Handler + Digits Handler + Zero Handler + Float Handler + Boolean Handler + Integer Handler + Signed func(Handler) Handler + IntegerBetween func(min int64, max int64) Handler + ASCII Handler + ASCIILower Handler + ASCIIUpper Handler + Letter Handler + Lower Handler + Upper Handler + HexDigit Handler + Octet Handler + IPv4 Handler + IPv4CIDRMask Handler + IPv4Netmask Handler + IPv4Net Handler + IPv6 Handler + IPv6CIDRMask Handler + IPv6Net Handler +}{ + Rune: MatchRune, + Runes: MatchRunes, + RuneRange: MatchRuneRange, + Str: MatchStr, + StrNoCase: MatchStrNoCase, + EndOfFile: MatchEndOfFile(), + EndOfLine: MatchEndOfLine(), + UntilEndOfLine: MatchUntilEndOfLine(), + AnyRune: MatchAnyRune(), + ValidRune: MatchValidRune(), + InvalidRune: MatchInvalidRune(), + Space: MatchRune(' '), + Tab: MatchRune('\t'), + CR: MatchRune('\r'), + LF: MatchRune('\n'), + CRLF: MatchStr("\r\n"), + Excl: MatchRune('!'), + DoubleQuote: MatchRune('"'), + Hash: MatchRune('#'), + Dollar: MatchRune('$'), + Percent: MatchRune('%'), + Amp: MatchRune('&'), + SingleQuote: MatchRune('\''), + RoundOpen: MatchRune('('), + LeftParen: MatchRune('('), + RoundClose: MatchRune(')'), + RightParen: MatchRune(')'), + Asterisk: MatchRune('*'), + Multiply: MatchRune('*'), + Plus: MatchRune('+'), + Add: MatchRune('+'), + Comma: MatchRune(','), + Minus: MatchRune('-'), + Subtract: MatchRune('-'), + Dot: MatchRune('.'), + Slash: MatchRune('/'), + Divide: MatchRune('/'), + Colon: MatchRune(':'), + Semicolon: MatchRune(';'), + AngleOpen: MatchRune('<'), + LessThan: MatchRune('<'), + Equal: MatchRune('='), + AngleClose: MatchRune('>'), + GreaterThan: MatchRune('>'), + Question: MatchRune('?'), + At: MatchRune('@'), + SquareOpen: MatchRune('['), + Backslash: MatchRune('\\'), + SquareClose: MatchRune(']'), + Caret: MatchRune('^'), + Underscore: MatchRune('_'), + Backquote: MatchRune('`'), + CurlyOpen: MatchRune('{'), + Pipe: MatchRune('|'), + CurlyClose: MatchRune('}'), + Tilde: MatchRune('~'), + Newline: MatchNewline(), + Blank: MatchBlank(), + Blanks: MatchBlanks(), + Whitespace: MatchWhitespace(), + UnicodeSpace: MatchUnicodeSpace(), + Digit: MatchDigit(), + DigitNotZero: MatchDigitNotZero(), + Digits: MatchDigits(), + Zero: MatchRune('0'), + Integer: MatchInteger(), + Signed: MatchSigned, + IntegerBetween: MatchIntegerBetween, + Float: 
MatchFloat(), + Boolean: MatchBoolean(), + ASCII: MatchASCII(), + ASCIILower: MatchASCIILower(), + ASCIIUpper: MatchASCIIUpper(), + Letter: MatchUnicodeLetter(), + Lower: MatchUnicodeLower(), + Upper: MatchUnicodeUpper(), + HexDigit: MatchHexDigit(), + Octet: MatchOctet(true), + IPv4: MatchIPv4(true), + IPv4CIDRMask: MatchIPv4CIDRMask(true), + IPv4Netmask: MatchIPv4Netmask(true), + IPv4Net: MatchIPv4Net(true), + IPv6: MatchIPv6(true), + IPv6CIDRMask: MatchIPv6CIDRMask(true), + IPv6Net: MatchIPv6Net(true), +} + +// M provides convenient access to a range of modifiers (which in their nature are +// parser/combinators) that can be used when creating Handler functions. +// +// In parsekit, a modifier is defined as a Handler function that modifies the +// resulting output of another Handler in some way. It does not do any matching +// against input of its own. +// +// When using M in your own parser, then it is advised to create a variable +// to reference it: +// +// m := tokenize.M +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. +var M = struct { + Drop func(Handler) Handler + Trim func(handler Handler, cutset string) Handler + TrimLeft func(handler Handler, cutset string) Handler + TrimRight func(handler Handler, cutset string) Handler + TrimSpace func(handler Handler) Handler + ToLower func(Handler) Handler + ToUpper func(Handler) Handler + Replace func(handler Handler, replaceWith string) Handler + ByCallback func(Handler, func(string) string) Handler +}{ + Drop: ModifyDrop, + Trim: ModifyTrim, + TrimLeft: ModifyTrimLeft, + TrimRight: ModifyTrimRight, + TrimSpace: ModifyTrimSpace, + ToLower: ModifyToLower, + ToUpper: ModifyToUpper, + Replace: ModifyReplace, + ByCallback: ModifyByCallback, +} + +// T provides convenient access to a range of Token producers (which in their +// nature are parser/combinators) that can be used when creating Handler +// functions. +// +// When using T in your own parser, then it is advised to create a variable +// to reference it: +// +// t := tokenize.T +// +// Doing so saves you a lot of typing, and it makes your code a lot cleaner. 
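+//
+// For example, producing an integer-valued Token from matched digits (the
+// token type 42 is an arbitrary example):
+//
+// t := tokenize.T
+// number := t.Int(42, tokenize.A.Digits)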
+var T = struct {
+	Str            func(interface{}, Handler) Handler
+	StrInterpreted func(interface{}, Handler) Handler
+	Byte           func(interface{}, Handler) Handler
+	Rune           func(interface{}, Handler) Handler
+	Int            func(interface{}, Handler) Handler
+	Int8           func(interface{}, Handler) Handler
+	Int16          func(interface{}, Handler) Handler
+	Int32          func(interface{}, Handler) Handler
+	Int64          func(interface{}, Handler) Handler
+	Int64Base      func(interface{}, int, Handler) Handler
+	Uint           func(interface{}, Handler) Handler
+	Uint8          func(interface{}, Handler) Handler
+	Uint16         func(interface{}, Handler) Handler
+	Uint32         func(interface{}, Handler) Handler
+	Uint64         func(interface{}, Handler) Handler
+	Uint64Base     func(interface{}, int, Handler) Handler
+	Float32        func(interface{}, Handler) Handler
+	Float64        func(interface{}, Handler) Handler
+	Boolean        func(interface{}, Handler) Handler
+	ByValue        func(toktype interface{}, handler Handler, value interface{}) Handler
+	ByCallback     func(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler
+	Group          func(interface{}, Handler) Handler
+}{
+	Str:            MakeStrLiteralToken,
+	StrInterpreted: MakeStrInterpretedToken,
+	Byte:           MakeByteToken,
+	Rune:           MakeRuneToken,
+	Int:            MakeIntToken,
+	Int8:           MakeInt8Token,
+	Int16:          MakeInt16Token,
+	Int32:          MakeInt32Token,
+	Int64:          MakeInt64Token,
+	Int64Base:      MakeInt64BaseToken,
+	Uint:           MakeUintToken,
+	Uint8:          MakeUint8Token,
+	Uint16:         MakeUint16Token,
+	Uint32:         MakeUint32Token,
+	Uint64:         MakeUint64Token,
+	Uint64Base:     MakeUint64BaseToken,
+	Float32:        MakeFloat32Token,
+	Float64:        MakeFloat64Token,
+	Boolean:        MakeBooleanToken,
+	ByValue:        MakeTokenByValue,
+	ByCallback:     MakeTokenByCallback,
+	Group:          MakeTokenGroup,
+}
+
+// MatchRune creates a Handler function that matches against the provided rune.
+func MatchRune(expected rune) Handler {
+	return MatchRuneByCallback(func(r rune) bool { return r == expected })
+}
+
+// MatchRunes creates a Handler function that checks if the input matches
+// one of the provided runes. The first match counts.
+func MatchRunes(expected ...rune) Handler {
+	s := string(expected)
+	return MatchRuneByCallback(func(r rune) bool { return strings.ContainsRune(s, r) })
+}
+
+// MatchRuneRange creates a Handler function that checks if the input
+// matches the provided rune range. The rune range is defined by a start and
+// an end rune, inclusive, so:
+//
+// MatchRuneRange('g', 'k')
+//
+// creates a Handler that will match any of 'g', 'h', 'i', 'j' or 'k'.
+func MatchRuneRange(start rune, end rune) Handler {
+	if end < start {
+		callerPanic("MatchRuneRange", "Handler: {name} definition error at {caller}: end %q must not be < start %q", end, start)
+	}
+	return MatchRuneByCallback(func(r rune) bool { return r >= start && r <= end })
+}
+
+// MatchNewline creates a Handler that matches a newline, which is either
+// a DOS-style newline (CRLF, \r\n) or a UNIX-style newline (just a LF, \n).
+func MatchNewline() Handler {
+	return MatchAny(MatchStr("\r\n"), MatchRune('\n'))
+}
+
+// MatchBlank creates a Handler that matches one rune from the input
+// against blank characters, meaning tabs and spaces.
+//
+// When you need whitespace matching, which also includes characters like
+// newlines, then take a look at MatchWhitespace().
+func MatchBlank() Handler {
+	return MatchRuneByCallback(func(r rune) bool { return r == ' ' || r == '\t' })
+}
+
+// MatchBlanks creates a Handler that matches the input against one
+// or more blank characters, meaning tabs and spaces.
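+//
+// For example, on the input "  \t data", MatchBlanks() will match the
+// leading "  \t " part.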
+//
+// When you need whitespace matching, which also includes characters like
+// newlines, then make use of MatchWhitespace().
+// When you need unicode whitespace matching, which also includes characters
+// like a vertical tab, then make use of MatchUnicodeSpace().
+func MatchBlanks() Handler {
+	return MatchOneOrMore(MatchBlank())
+}
+
+// MatchWhitespace creates a Handler that matches the input against one or more
+// whitespace characters, defined as space ' ', tab '\t', newline '\n' (LF) and
+// carriage return '\r' followed by a newline '\n' (CRLF).
+func MatchWhitespace() Handler {
+	return MatchOneOrMore(MatchBlank().Or(MatchNewline()))
+}
+
+// MatchUnicodeSpace creates a Handler that matches the input against one or more
+// whitespace characters, as defined by unicode.
+func MatchUnicodeSpace() Handler {
+	return MatchOneOrMore(MatchRuneByCallback(unicode.IsSpace))
+}
+
+// MatchRuneByCallback creates a Handler that matches a single rune from the
+// input against the provided callback function. When the callback returns true,
+// it is considered a match.
+//
+// Note that the callback function matches the signature of the unicode.Is*
+// functions, so those can be used. E.g. MatchRuneByCallback(unicode.IsLower).
+func MatchRuneByCallback(callback func(rune) bool) Handler {
+	return func(t *API) bool {
+		r, err := t.NextRune()
+		if err == nil && callback(r) {
+			t.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchEndOfLine creates a Handler that matches a newline ("\r\n" or "\n") or EOF.
+func MatchEndOfLine() Handler {
+	return MatchAny(MatchNewline(), MatchEndOfFile())
+}
+
+// MatchStr creates a Handler that matches the input against the provided string.
+func MatchStr(expected string) Handler {
+	var handlers = []Handler{}
+	for _, r := range expected {
+		handlers = append(handlers, MatchRune(r))
+	}
+	return MatchSeq(handlers...)
+}
+
+// MatchStrNoCase creates a Handler that matches the input against the
+// provided string in a case-insensitive manner.
+func MatchStrNoCase(expected string) Handler {
+	var handlers = []Handler{}
+	for _, r := range expected {
+		u := unicode.ToUpper(r)
+		l := unicode.ToLower(r)
+		handlers = append(handlers, MatchRunes(u, l))
+	}
+	return MatchSeq(handlers...)
+}
+
+// MatchOptional creates a Handler that makes the provided Handler optional.
+// When the provided Handler applies, then its output is used, otherwise
+// no output is generated, but a successful match is still reported (with an
+// empty result).
+func MatchOptional(handler Handler) Handler {
+	return matchMinMax(0, 1, handler, "MatchOptional")
+}
+
+// MatchSeq creates a Handler that checks if the provided Handlers can be
+// applied in their exact order. Only if all Handlers apply, the sequence
+// reports a successful match.
+func MatchSeq(handlers ...Handler) Handler {
+	return func(t *API) bool {
+		child := t.Fork()
+		for _, handler := range handlers {
+			subchild := t.Fork()
+			if !handler(t) {
+				t.Dispose(subchild)
+				t.Dispose(child)
+				return false
+			}
+			t.Merge(subchild)
+			t.Dispose(subchild)
+		}
+		t.Merge(child)
+		t.Dispose(child)
+		return true
+	}
+}
+
+// MatchAny creates a Handler that checks if any of the provided Handlers
+// can be applied. They are applied in their provided order. The first Handler
+// that applies is used for reporting back a match.
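+//
+// For example:
+//
+// yesOrNo := MatchAny(MatchStr("yes"), MatchStr("no"))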
+func MatchAny(handlers ...Handler) Handler {
+	return func(t *API) bool {
+		for _, handler := range handlers {
+			child := t.Fork()
+			if handler(t) {
+				t.Merge(child)
+				t.Dispose(child)
+				return true
+			}
+			t.Dispose(child) // TODO switch to Reset() and move forking outside the loop?
+		}
+
+		return false
+	}
+}
+
+// MatchNot creates a Handler that checks if the provided Handler applies to
+// the current input. If it does, then a failed match will be reported. If it
+// does not, then the next rune from the input will be reported as a match.
+func MatchNot(handler Handler) Handler {
+	return func(t *API) bool {
+		child := t.Fork()
+		if handler(t) {
+			t.Dispose(child)
+			return false
+		}
+		t.Dispose(child)
+		_, err := t.NextRune()
+		if err == nil {
+			t.Accept()
+			return true
+		}
+		return false
+	}
+}
+
+// MatchRep creates a Handler that checks if the provided Handler can be
+// applied exactly the provided number of times.
+//
+// Note that the input can contain more than the provided number of matches, e.g.:
+//
+// MatchRep(4, MatchRune('X'))
+//
+// will not match input "XXX", it will match input "XXXX", but also "XXXXXX".
+// In that last case, there will be a remainder "XX" on the input.
+//
+// Another way to use this method, is by applying the following syntactic sugar:
+//
+// MatchRune('X').Times(4)
+func MatchRep(times int, handler Handler) Handler {
+	return matchMinMax(times, times, handler, "MatchRep")
+}
+
+// MatchMin creates a Handler that checks if the provided Handler can be
+// applied at least the provided minimum number of times.
+// When more matches are possible, these will be included in the output.
+func MatchMin(min int, handler Handler) Handler {
+	if min < 0 {
+		callerPanic("MatchMin", "Handler: {name} definition error at {caller}: min must be >= 0")
+	}
+	return matchMinMax(min, -1, handler, "MatchMin")
+}
+
+// MatchMax creates a Handler that checks if the provided Handler can be
+// applied at most the provided maximum number of times.
+// When more matches are possible, these will be included in the output.
+// Zero matches are considered a successful match.
+func MatchMax(max int, handler Handler) Handler {
+	if max < 0 {
+		callerPanic("MatchMax", "Handler: {name} definition error at {caller}: max must be >= 0")
+	}
+	return matchMinMax(0, max, handler, "MatchMax")
+}
+
+// MatchZeroOrMore creates a Handler that checks if the provided Handler can
+// be applied zero or more times. All matches will be included in the output.
+// Zero matches are considered a successful match.
+func MatchZeroOrMore(handler Handler) Handler {
+	return matchMinMax(0, -1, handler, "MatchZeroOrMore")
+}
+
+// MatchOneOrMore creates a Handler that checks if the provided Handler can
+// be applied one or more times. All matches will be included in the output.
+func MatchOneOrMore(handler Handler) Handler {
+	return matchMinMax(1, -1, handler, "MatchOneOrMore")
+}
+
+// MatchMinMax creates a Handler that checks if the provided Handler can
+// be applied between the provided minimum and maximum number of times,
+// inclusive. All matches will be included in the output.
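+//
+// For example, matching two up to four digits:
+//
+// MatchMinMax(2, 4, MatchDigit())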
+func MatchMinMax(min int, max int, handler Handler) Handler {
+	if max < 0 {
+		callerPanic("MatchMinMax", "Handler: {name} definition error at {caller}: max must be >= 0")
+	}
+	if min < 0 {
+		callerPanic("MatchMinMax", "Handler: {name} definition error at {caller}: min must be >= 0")
+	}
+	return matchMinMax(min, max, handler, "MatchMinMax")
+}
+
+func matchMinMax(min int, max int, handler Handler, name string) Handler {
+	if max >= 0 && min > max {
+		callerPanic(name, "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min)
+	}
+	return func(t *API) bool {
+		total := 0
+
+		// Check for the minimum required amount of matches.
+		child := t.Fork()
+		for total < min {
+			total++
+			if !handler(t) {
+				t.Dispose(child)
+				return false
+			}
+		}
+
+		// No specified max: include the rest of the available matches.
+		// Specified max: include the rest of the available matches, up to the max.
+		for max < 0 || total < max {
+			total++
+			if !handler(t) {
+				break
+			}
+		}
+		t.Merge(child)
+		t.Dispose(child)
+		return true
+	}
+}
+
+// MatchSeparated creates a Handler that checks for a pattern of one or more
+// Handlers of one type (the separated), separated by a Handler of another type
+// (the separator). All matches (separated + separator) are included in the
+// output.
+func MatchSeparated(separator Handler, separated Handler) Handler {
+	return MatchSeq(separated, MatchZeroOrMore(MatchSeq(separator, separated)))
+}
+
+// MatchExcept creates a Handler that checks if the provided Handler can be
+// applied to the upcoming input. It also checks if the except Handler can be
+// applied. If the handler applies, but the except Handler does as well, then
+// the match as a whole will be treated as a mismatch.
+func MatchExcept(handler Handler, except Handler) Handler {
+	return func(t *API) bool {
+		child := t.Fork()
+		if except(t) {
+			t.Dispose(child)
+			return false
+		}
+		t.Dispose(child)
+		return handler(t)
+	}
+}
+
+// MatchFollowedBy creates a Handler that checks if the provided handler matches
+// and if the provided lookAhead handler matches after the handler.
+// When both handlers match, the match for the handler is accepted and the match
+// for the lookAhead handler is ignored.
+func MatchFollowedBy(lookAhead Handler, handler Handler) Handler {
+	return func(t *API) bool {
+		child := t.Fork()
+		if handler(t) {
+			subChild := t.Fork()
+			if lookAhead(t) {
+				t.Dispose(subChild) // only the handler match is kept
+				t.Merge(child)
+				t.Dispose(child)
+				return true
+			}
+			t.Dispose(subChild)
+		}
+		t.Dispose(child)
+		return false
+	}
+}
+
+// MatchNotFollowedBy creates a Handler that checks if the provided handler matches
+// and if the provided lookAhead handler does not match after the handler.
+// If the handler matches and the lookAhead handler doesn't, then the match for
+// the handler is accepted.
+func MatchNotFollowedBy(lookAhead Handler, handler Handler) Handler {
+	return func(t *API) bool {
+		child := t.Fork()
+		if handler(t) {
+			subChild := t.Fork()
+			if !lookAhead(t) {
+				t.Dispose(subChild)
+				t.Merge(child)
+				t.Dispose(child)
+				return true
+			}
+			t.Dispose(subChild)
+		}
+		t.Dispose(child)
+		return false
+	}
+}
+
+// MakeInputFlusher creates a Handler that will flush the input buffer when the
+// provided handler matches.
+//
+// This is useful when constructing a grammar using only parsekit.tokenize
+// functionality (parsekit.parse will automatically flush the input for you)
+// that has to process large input data.
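+//
+// A minimal sketch of such a wrapper (csvRow is a hypothetical Handler):
+//
+// row := MakeInputFlusher(csvRow)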
+func MakeInputFlusher(handler Handler) Handler {
+ return func(t *API) bool {
+ if handler(t) {
+ t.FlushInput()
+ return true
+ }
+ return false
+ }
+}
+
+// MatchSigned creates a Handler that checks if the provided Handler is
+// prefixed by an optional '+' or '-' sign. This can be used to turn numeric
+// atoms into a signed version, e.g.
+//
+//   C.Signed(A.Integer)
+func MatchSigned(handler Handler) Handler {
+ sign := MatchOptional(MatchAny(MatchRune('+'), MatchRune('-')))
+ return MatchSeq(sign, handler)
+}
+
+// MatchIntegerBetween creates a Handler that checks for an integer
+// value between the provided min and max boundaries (inclusive).
+// It uses an int64 for checking internally, so you can check values
+// ranging from -9223372036854775808 to 9223372036854775807.
+func MatchIntegerBetween(min int64, max int64) Handler {
+ if max < min {
+ callerPanic("MatchIntegerBetween", "Handler: {name} definition error at {caller}: max %d must not be < min %d", max, min)
+ }
+ digits := MatchSigned(MatchDigits())
+
+ return func(t *API) bool {
+ if !digits(t) {
+ return false
+ }
+ value, _ := strconv.ParseInt(t.String(), 10, 64)
+ if value < min || value > max {
+ return false
+ }
+ return true
+ }
+}
+
+// MatchEndOfFile creates a Handler that checks if the end of the input data
+// has been reached. This Handler will never produce output. It only reports
+// a successful or a failing match through its boolean return value.
+func MatchEndOfFile() Handler {
+ return func(t *API) bool {
+ child := t.Fork()
+ _, err := t.NextRune()
+ t.Dispose(child)
+ return err == io.EOF
+ }
+}
+
+// MatchUntilEndOfLine creates a Handler function that accepts one or
+// more runes until the end of the line (or file when that's the case).
+// The newline itself is not included in the match.
+func MatchUntilEndOfLine() Handler {
+ return MatchOneOrMore(MatchNot(MatchEndOfLine()))
+}
+
+// MatchAnyRune creates a Handler function that checks if a rune can be
+// read from the input. Invalid runes on the input are replaced with the UTF8
+// replacement rune \uFFFD (i.e. utf8.RuneError), which displays as �.
+func MatchAnyRune() Handler {
+ return func(t *API) bool {
+ _, err := t.NextRune()
+ if err == nil {
+ t.Accept()
+ return true
+ }
+ return false
+ }
+}
+
+// MatchValidRune creates a Handler function that checks if a valid
+// UTF8 rune can be read from the input.
+func MatchValidRune() Handler {
+ return func(t *API) bool {
+ r, err := t.NextRune()
+ if err == nil && r != utf8.RuneError {
+ t.Accept()
+ return true
+ }
+ return false
+ }
+}
+
+// MatchInvalidRune creates a Handler function that checks if an invalid
+// UTF8 rune can be read from the input.
+func MatchInvalidRune() Handler {
+ return func(t *API) bool {
+ r, err := t.NextRune()
+ if err == nil && r == utf8.RuneError {
+ t.Accept()
+ return true
+ }
+ return false
+ }
+}
+
+// MatchDigit creates a Handler that checks if a single digit can be read
+// from the input.
+func MatchDigit() Handler {
+ return MatchRuneRange('0', '9')
+}
+
+// MatchDigits creates a Handler that checks if one or more digits can be read
+// from the input.
+func MatchDigits() Handler {
+ return MatchOneOrMore(MatchDigit())
+}
+
+// MatchDigitNotZero creates a Handler that checks if a single digit not equal
+// to zero '0' can be read from the input.
+func MatchDigitNotZero() Handler {
+ return MatchRuneRange('1', '9')
+}
+
+// MatchInteger creates a Handler function that checks if a valid integer
+// can be read from the input. In line with Go, an integer cannot start with
+// a zero. A leading zero is used to indicate other bases, like octal or
+// hexadecimal.
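+//
+// For example (an illustrative sketch):
+//
+//   MatchInteger() // matches "12345", but only the "0" of "0123"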
+func MatchInteger() Handler {
+ justZero := MatchRune('0')
+ integer := MatchSeq(MatchDigitNotZero(), MatchZeroOrMore(MatchDigit()))
+ return MatchAny(integer, justZero)
+}
+
+// MatchFloat creates a Handler function that checks if a valid float value
+// can be read from the input. In case the fractional part is missing, this
+// Handler will report a match, so both "123" and "123.123" will match.
+func MatchFloat() Handler {
+ digits := MatchDigits()
+ return MatchSeq(digits, MatchOptional(MatchSeq(MatchRune('.'), digits)))
+}
+
+// MatchBoolean creates a Handler function that checks if a boolean
+// value can be read from the input. It supports the boolean values as understood
+// by Go's strconv.ParseBool() function.
+//
+// True values: true, TRUE, True, 1, t, T
+//
+// False values: false, FALSE, False, 0, f, F
+func MatchBoolean() Handler {
+ trues := MatchAny(MatchStr("true"), MatchStr("TRUE"), MatchStr("True"), MatchRune('1'), MatchRune('t'), MatchRune('T'))
+ falses := MatchAny(MatchStr("false"), MatchStr("FALSE"), MatchStr("False"), MatchRune('0'), MatchRune('f'), MatchRune('F'))
+ return MatchAny(trues, falses)
+}
+
+// MatchASCII creates a Handler function that matches against any
+// ASCII value on the input.
+func MatchASCII() Handler {
+ return MatchRuneRange('\x00', '\x7F')
+}
+
+// MatchASCIILower creates a Handler function that matches against any
+// lower case ASCII letter on the input (a - z).
+func MatchASCIILower() Handler {
+ return MatchRuneRange('a', 'z')
+}
+
+// MatchASCIIUpper creates a Handler function that matches against any
+// upper case ASCII letter on the input (A - Z).
+func MatchASCIIUpper() Handler {
+ return MatchRuneRange('A', 'Z')
+}
+
+// MatchUnicodeLetter creates a Handler function that matches against any
+// unicode letter on the input (see unicode.IsLetter(rune)).
+func MatchUnicodeLetter() Handler {
+ return MatchRuneByCallback(unicode.IsLetter)
+}
+
+// MatchUnicodeUpper creates a Handler function that matches against any
+// upper case unicode letter on the input (see unicode.IsUpper(rune)).
+func MatchUnicodeUpper() Handler {
+ return MatchRuneByCallback(unicode.IsUpper)
+}
+
+// MatchUnicodeLower creates a Handler function that matches against any
+// lower case unicode letter on the input (see unicode.IsLower(rune)).
+func MatchUnicodeLower() Handler {
+ return MatchRuneByCallback(unicode.IsLower)
+}
+
+// MatchHexDigit creates a Handler function that checks if a single hexadecimal
+// digit can be read from the input.
+func MatchHexDigit() Handler {
+ return MatchAny(MatchRuneRange('0', '9'), MatchRuneRange('a', 'f'), MatchRuneRange('A', 'F'))
+}
+
+// MatchOctet creates a Handler function that checks if a valid octet value
+// can be read from the input (octet = byte value representation, with a value
+// between 0 and 255 inclusive). It only looks at the upcoming 1 to 3 digits,
+// not at whether a non-digit follows them, meaning that "123255" would be
+// a valid sequence of two octets.
+//
+// When the normalize parameter is set to true, leading zeroes will be
+// stripped from the octet.
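+//
+// For example (an illustrative sketch):
+//
+//   MatchOctet(true) // matches the "078" of "0789", normalized to "78"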
+func MatchOctet(normalize bool) Handler {
+ max3Digits := MatchMinMax(1, 3, MatchDigit())
+ return func(t *API) bool {
+ if !max3Digits(t) {
+ return false
+ }
+ value, _ := strconv.ParseInt(t.String(), 10, 16)
+ if value > 255 {
+ return false
+ }
+ if normalize {
+ runes := t.Runes()
+ for len(runes) > 1 && runes[0] == '0' {
+ runes = runes[1:]
+ }
+ t.SetRunes(runes...)
+ }
+ return true
+ }
+}
+
+// MatchIPv4 creates a Handler function that checks if a valid IPv4
+// IP address value can be read from the input.
+//
+// When the normalize parameter is true, IP-addresses that look like
+// "192.168.001.012" will be normalized to "192.168.1.12".
+func MatchIPv4(normalize bool) Handler {
+ octet := MatchOctet(normalize)
+ dot := MatchRune('.')
+ return MatchSeq(octet, dot, octet, dot, octet, dot, octet)
+}
+
+// MatchIPv4CIDRMask creates a Handler function that checks if a
+// valid IPv4 CIDR mask (0 - 32) value can be read from the input.
+func MatchIPv4CIDRMask(normalize bool) Handler {
+ return matchCIDRMask(32, normalize)
+}
+
+// MatchIPv4Netmask creates a Handler function that checks if a valid
+// IPv4 netmask can be read from the input (e.g. 255.255.255.0).
+// Only a netmask in canonical form is accepted (meaning that in binary form
+// it starts with zero or more 1-bits, followed by only 0-bits up to the
+// 32 bit length).
+//
+// When the normalize parameter is true, netmasks that look like
+// "255.255.192.000" will be normalized to "255.255.192.0".
+func MatchIPv4Netmask(normalize bool) Handler {
+ octet := MakeUint8Token(nil, MatchOctet(normalize))
+ dot := MatchRune('.')
+ netmask := MatchSeq(octet, dot, octet, dot, octet, dot, octet)
+
+ return func(t *API) bool {
+ if !netmask(t) {
+ return false
+ }
+
+ // Check if the mask is provided in canonical form (at the binary level, ones followed by zeroes).
+ mask := net.IPv4Mask(t.TokenValue(0).(byte), t.TokenValue(1).(byte), t.TokenValue(2).(byte), t.TokenValue(3).(byte))
+ ones, bits := mask.Size()
+ if ones == 0 && bits == 0 {
+ return false
+ }
+
+ t.ClearTokens()
+ return true
+ }
+}
+
+// MatchIPv4Net creates a Handler function that checks the input for an
+// IPv4 + mask input. Both ip/cidr (e.g. 192.168.0.1/24) and ip/netmask
+// (e.g. 172.16.10.254/255.255.192.0) are acceptable.
+//
+// When the normalize parameter is true, then the IP address and the mask are
+// normalized. The mask will be normalized to CIDR notation, so the above
+// example would be normalized to 172.16.10.254/18.
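+//
+// For example (an illustrative sketch):
+//
+//   MatchIPv4Net(true) // "192.168.001.010/255.255.255.000" -> "192.168.1.10/24"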
+func MatchIPv4Net(normalize bool) Handler {
+ ip := MakeStrLiteralToken("ip", MatchIPv4(normalize))
+ slash := MatchRune('/')
+ mask := MatchAny(
+ MakeStrLiteralToken("mask", MatchIPv4Netmask(normalize)),
+ MakeUint8Token("cidr", MatchIPv4CIDRMask(normalize)))
+ ipnet := MatchSeq(ip, slash, mask)
+
+ return func(t *API) bool {
+ if !ipnet(t) {
+ return false
+ }
+
+ if !normalize {
+ return true
+ }
+
+ maskToken := t.Token(1)
+ if maskToken.Type == "cidr" {
+ t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), t.TokenValue(1).(uint8)))
+ } else {
+ o := strings.Split(t.TokenValue(1).(string), ".")
+ b := func(idx int) byte { i, _ := strconv.Atoi(o[idx]); return byte(i) }
+ mask := net.IPv4Mask(b(0), b(1), b(2), b(3))
+ bits, _ := mask.Size()
+ t.SetString(fmt.Sprintf("%s/%d", t.TokenValue(0), bits))
+ }
+
+ t.ClearTokens()
+ return true
+ }
+}
+
+// MatchIPv6 creates a Handler function that checks if an IPv6 address
+// can be read from the input.
+func MatchIPv6(normalize bool) Handler {
+ hextet := MatchMinMax(1, 4, MatchHexDigit())
+ colon := MatchRune(':')
+ empty := MatchSeq(colon, colon)
+
+ return func(t *API) bool {
+ nrOfHextets := 0
+ for nrOfHextets < 8 {
+ if hextet(t) {
+ nrOfHextets++
+ } else if empty(t) {
+ nrOfHextets += 2
+ } else if !colon(t) {
+ break
+ }
+ }
+ // No hextets or too many hextets (e.g. 1:1:1:1:1:1:1:: <-- since :: is 2 or more hextets).
+ if nrOfHextets == 0 || nrOfHextets > 8 {
+ return false
+ }
+
+ // The IPv6 address is invalid when net.ParseIP() cannot handle it.
+ parsed := net.ParseIP(t.String())
+ if parsed == nil {
+ return false
+ }
+
+ if normalize {
+ t.SetString(parsed.String())
+ }
+ return true
+ }
+}
+
+// MatchIPv6CIDRMask creates a Handler function that checks if a
+// valid IPv6 CIDR mask (0 - 128) value can be read from the input.
+func MatchIPv6CIDRMask(normalize bool) Handler {
+ return matchCIDRMask(128, normalize)
+}
+
+func matchCIDRMask(bits int64, normalize bool) Handler {
+ mask := MatchIntegerBetween(0, bits)
+
+ if !normalize {
+ return mask
+ }
+
+ return func(t *API) bool {
+ if !mask(t) {
+ return false
+ }
+ bits, _ := strconv.Atoi(t.String())
+ t.SetString(fmt.Sprintf("%d", bits))
+ return true
+ }
+}
+
+// MatchIPv6Net creates a Handler function that checks the input for an
+// IPv6 + mask input, e.g. fe80:0:0:0:0216:3eff:fe96:0002/64.
+//
+// When the normalize parameter is true, then the IP address and the mask are
+// normalized. The above example would be normalized to fe80::216:3eff:fe96:2/64.
+func MatchIPv6Net(normalize bool) Handler {
+ ip := MatchIPv6(normalize)
+ slash := MatchRune('/')
+ mask := MatchIPv6CIDRMask(normalize)
+ return MatchSeq(ip, slash, mask)
+}
+
+// ModifyDrop creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is disposed completely.
+//
+// Note that if the Handler does not apply, a mismatch will be reported back,
+// even though we would have dropped the output anyway. So if you would like
+// to drop optional blanks (spaces and tabs), then use something like:
+//
+//   M.Drop(C.Optional(A.Blanks))
+//
+// instead of:
+//
+//   M.Drop(A.Blanks)
+//
+// Since A.Blanks is defined as "1 or more spaces and/or tabs", the input
+// string "bork" would not match against the second form, but " bork" would.
+// In both cases, it would match the first form.
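+//
+// A tokenize-level equivalent of the first form above (an illustrative sketch):
+//
+//   blank := MatchAny(MatchRune(' '), MatchRune('\t'))
+//   skip := ModifyDrop(MatchOptional(MatchOneOrMore(blank)))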
+func ModifyDrop(handler Handler) Handler {
+ return func(t *API) bool {
+ child := t.Fork()
+ if handler(t) {
+ t.Reset()
+ t.Merge(child)
+ t.Dispose(child)
+ return true
+ }
+ t.Dispose(child)
+ return false
+ }
+}
+
+// ModifyTrim creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from both the left and the right of the output.
+func ModifyTrim(handler Handler, cutset string) Handler {
+ return modifyTrim(handler, cutset, true, true)
+}
+
+// ModifyTrimLeft creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from the left of the output.
+func ModifyTrimLeft(handler Handler, cutset string) Handler {
+ return modifyTrim(handler, cutset, true, false)
+}
+
+// ModifyTrimRight creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and characters from the provided
+// cutset are trimmed from the right of the output.
+func ModifyTrimRight(handler Handler, cutset string) Handler {
+ return modifyTrim(handler, cutset, false, true)
+}
+
+func modifyTrim(handler Handler, cutset string, trimLeft bool, trimRight bool) Handler {
+ modfunc := func(s string) string {
+ if trimLeft {
+ s = strings.TrimLeft(s, cutset)
+ }
+ if trimRight {
+ s = strings.TrimRight(s, cutset)
+ }
+ return s
+ }
+ return ModifyByCallback(handler, modfunc)
+}
+
+// ModifyTrimSpace creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and all leading and trailing whitespace
+// characters, as defined by Unicode, are removed from it.
+func ModifyTrimSpace(handler Handler) Handler {
+ return ModifyByCallback(handler, strings.TrimSpace)
+}
+
+// ModifyToUpper creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and converted into upper case.
+func ModifyToUpper(handler Handler) Handler {
+ return ModifyByCallback(handler, strings.ToUpper)
+}
+
+// ModifyToLower creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and converted into lower case.
+func ModifyToLower(handler Handler) Handler {
+ return ModifyByCallback(handler, strings.ToLower)
+}
+
+// ModifyReplace creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is replaced by the provided string.
+func ModifyReplace(handler Handler, replaceWith string) Handler {
+ return ModifyByCallback(handler, func(string) string {
+ return replaceWith
+ })
+}
+
+// ModifyByCallback creates a Handler that checks if the provided Handler applies.
+// If it does, then its output is taken and fed to the provided modfunc, a
+// simple function that takes a string as input and returns a possibly
+// modified string as output. The return value of the modfunc will replace
+// the resulting output.
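+//
+// For example (an illustrative sketch): wrapping matched digits in brackets:
+//
+//   bracketed := ModifyByCallback(MatchDigits(), func(s string) string {
+//       return "[" + s + "]"
+//   })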
+func ModifyByCallback(handler Handler, modfunc func(string) string) Handler {
+ return func(t *API) bool {
+ child := t.Fork()
+ if handler(t) {
+ s := modfunc(t.String())
+ t.SetString(s)
+ t.Merge(child)
+ t.Dispose(child)
+ return true
+ }
+ t.Dispose(child)
+ return false
+ }
+}
+
+// MakeStrLiteralToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a string-typed
+// representation of the read Runes. This string is literal, meaning that an
+// escape sequence like "\n" is kept as-is (a backslash character, followed by
+// an 'n'-character).
+func MakeStrLiteralToken(toktype interface{}, handler Handler) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
+ return t.String()
+ })
+}
+
+// MakeStrInterpretedToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a string-typed
+// representation of the read Runes. This string is interpreted, meaning that an
+// escape sequence like "\n" is translated to an actual newline control character.
+func MakeStrInterpretedToken(toktype interface{}, handler Handler) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
+ // TODO ERROR HANDLING
+ interpreted, _ := interpretString(t.String())
+ return interpreted
+ })
+}
+
+// TODO Use better interpreter from parser code?
+func interpretString(str string) (string, error) {
+ var sb strings.Builder
+ for len(str) > 0 {
+ r, _, remainder, err := strconv.UnquoteChar(str, '"')
+ if err != nil {
+ return sb.String(), err
+ }
+ str = remainder
+ sb.WriteRune(r)
+ }
+ return sb.String(), nil
+}
+
+// MakeRuneToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a rune-representation
+// of the read Rune.
+func MakeRuneToken(toktype interface{}, handler Handler) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
+ // TODO ERROR HANDLING --- not a 1 rune input
+ return t.Rune(0)
+ })
+}
+
+// MakeByteToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a byte-representation
+// of the read Rune.
+func MakeByteToken(toktype interface{}, handler Handler) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
+ // TODO ERROR HANDLING --- not a 1 byte input
+ return byte(t.Rune(0))
+ })
+}
+
+// MakeIntToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int-representation
+// of the read Runes.
+func MakeIntToken(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("int", toktype, handler, func(s string) (interface{}, error) {
+ return strconv.Atoi(s)
+ })
+}
+
+// MakeInt8Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int8-representation
+// of the read Runes.
+// TODO allow other Go types for oct and hex too.
+func MakeInt8Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("int8", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseInt(s, 10, 8)
+ if err == nil {
+ return int8(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeInt16Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int16-representation
+// of the read Runes.
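+//
+// For example (an illustrative sketch):
+//
+//   MakeInt16Token("offset", MatchSigned(MatchDigits()))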
+func MakeInt16Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("int16", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseInt(s, 10, 16)
+ if err == nil {
+ return int16(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeInt32Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int32-representation
+// of the read Runes.
+func MakeInt32Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("int32", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseInt(s, 10, 32)
+ if err == nil {
+ return int32(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeInt64BaseToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int64-representation
+// of the read Runes, using the provided base (e.g. 2 = binary, 8 = octal,
+// 10 = decimal, 16 = hexadecimal).
+func MakeInt64BaseToken(toktype interface{}, base int, handler Handler) Handler {
+ return makeStrconvToken("int64", toktype, handler,
+ func(s string) (interface{}, error) {
+ return strconv.ParseInt(s, base, 64)
+ })
+}
+
+// MakeInt64Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to an int64-representation
+// of the read Runes.
+func MakeInt64Token(toktype interface{}, handler Handler) Handler {
+ return MakeInt64BaseToken(toktype, 10, handler)
+}
+
+// MakeUintToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint-representation
+// of the read Runes.
+func MakeUintToken(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("uint", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseUint(s, 10, 0)
+ if err == nil {
+ return uint(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeUint8Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint8-representation
+// of the read Runes.
+// TODO allow other Go types for oct and hex too.
+func MakeUint8Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("uint8", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseUint(s, 10, 8)
+ if err == nil {
+ return uint8(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeUint16Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint16-representation
+// of the read Runes.
+func MakeUint16Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("uint16", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseUint(s, 10, 16)
+ if err == nil {
+ return uint16(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeUint32Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint32-representation
+// of the read Runes.
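+//
+// For example (an illustrative sketch):
+//
+//   MakeUint32Token("serial", MatchDigits())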
+func MakeUint32Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("uint32", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseUint(s, 10, 32)
+ if err == nil {
+ return uint32(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeUint64BaseToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint64-representation
+// of the read Runes, using the provided base (e.g. 2 = binary, 8 = octal,
+// 10 = decimal, 16 = hexadecimal).
+func MakeUint64BaseToken(toktype interface{}, base int, handler Handler) Handler {
+ return makeStrconvToken("uint64", toktype, handler,
+ func(s string) (interface{}, error) {
+ return strconv.ParseUint(s, base, 64)
+ })
+}
+
+// MakeUint64Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a uint64-representation
+// of the read Runes.
+func MakeUint64Token(toktype interface{}, handler Handler) Handler {
+ return MakeUint64BaseToken(toktype, 10, handler)
+}
+
+// MakeFloat32Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a float32-representation
+// of the read Runes.
+func MakeFloat32Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("float32", toktype, handler,
+ func(s string) (interface{}, error) {
+ value, err := strconv.ParseFloat(s, 32)
+ if err == nil {
+ return float32(value), err
+ }
+ return value, err
+ })
+}
+
+// MakeFloat64Token creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a float64-representation
+// of the read Runes.
+func MakeFloat64Token(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("float64", toktype, handler,
+ func(s string) (interface{}, error) {
+ return strconv.ParseFloat(s, 64)
+ })
+}
+
+// MakeBooleanToken creates a Handler that will add a Token to the
+// Result, for which the Token.Value is set to a bool-representation
+// of the read Runes.
+func MakeBooleanToken(toktype interface{}, handler Handler) Handler {
+ return makeStrconvToken("boolean", toktype, handler,
+ func(s string) (interface{}, error) {
+ return strconv.ParseBool(s)
+ })
+}
+
+func makeStrconvToken(name string, toktype interface{}, handler Handler, convert func(s string) (interface{}, error)) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} {
+ value, err := convert(t.String())
+ if err != nil {
+ // TODO meh, panic feels so bad here. Maybe just turn this case into "no match"?
+ panic(fmt.Sprintf("%s token invalid (%s)", name, err))
+ }
+ return value
+ })
+}
+
+// MakeTokenByValue creates a Handler that will add a static Token value
+// to the Result.
+func MakeTokenByValue(toktype interface{}, handler Handler, value interface{}) Handler {
+ return MakeTokenByCallback(toktype, handler, func(t *API) interface{} { return value })
+}
+
+// MakeTokenByCallback creates a Handler that will add a Token to the
+// Result, for which the Token.Value is to be generated by the provided
+// makeValue() callback function. The function gets the current API as
+// its input and must return the token value.
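+//
+// For example (an illustrative sketch): a token whose value is the length
+// of the match in runes:
+//
+//   MakeTokenByCallback("len", MatchDigits(), func(t *API) interface{} {
+//       return len(t.Runes())
+//   })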
+func MakeTokenByCallback(toktype interface{}, handler Handler, makeValue func(t *API) interface{}) Handler { + return func(t *API) bool { + child := t.Fork() + if handler(t) { + // The token is not added to the child here. The child might have produced its own + // tokens and we want those to come after the token for the current parsing level. + // By adding the token to the input API and then merging the child tokens, the order + // of the tokens will match the expectations. + // e.g. when a parsing hierarchy looks like ("date" ("year", "month" "day")), the + // tokens will end up in the order "date", "year", "month", "day". When we'd add the + // token to the child here, the order would have been "year", "month", "day", "date". + token := Token{Type: toktype, Value: makeValue(t)} + t.AddTokens(token) + t.Merge(child) + t.Dispose(child) + + return true + } + t.Dispose(child) + return false + } +} + +// MakeTokenGroup checks if the provided handler matches the input. If yes, then it will +// take the tokens as produced by the handler and group them together in a single token. +func MakeTokenGroup(toktype interface{}, handler Handler) Handler { + return func(t *API) bool { + child := t.Fork() + if handler(t) { + tokens := t.Tokens() + tokensCopy := make([]Token, len(tokens)) + copy(tokensCopy, tokens) + t.SetTokens(Token{Type: toktype, Value: tokensCopy}) + t.Merge(child) + t.Dispose(child) + return true + } + t.Dispose(child) + return false + } +} diff --git a/tokenize2/handlers_builtin_test.go b/tokenize2/handlers_builtin_test.go new file mode 100644 index 0000000..26de338 --- /dev/null +++ b/tokenize2/handlers_builtin_test.go @@ -0,0 +1,445 @@ +package tokenize2_test + +import ( + "fmt" + "testing" + + tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2" +) + +func TestCombinatorsTempDebug(t *testing.T) { + var a = tokenize.A + AssertHandlers(t, []HandlerT{ + // {"024", a.IPv4CIDRMask, true, "24"}, + // {"024", a.Octet, true, "24"}, + {"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"}, + }) +} + +func TestCombinators(t *testing.T) { + var c, a, m = tokenize.C, tokenize.A, tokenize.M + AssertHandlers(t, []HandlerT{ + {"abc not", c.Not(a.Rune('b')), true, "a"}, + {"bcd not", c.Not(a.Rune('b')), false, ""}, + {"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"}, + {"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""}, + {"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"}, + {"bcd any", c.Any(a.Rune('a'), a.Rune('b')), true, "b"}, + {"cde any", c.Any(a.Rune('a'), a.Rune('b')), false, ""}, + {"ababc repeated", c.Repeated(4, a.Runes('a', 'b')), true, "abab"}, + {"ababc repeated", c.Repeated(5, a.Runes('a', 'b')), false, ""}, + {"", c.Min(0, a.Rune('a')), true, ""}, + {"a", c.Min(0, a.Rune('a')), true, "a"}, + {"aaaaa", c.Min(4, a.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(5, a.Rune('a')), true, "aaaaa"}, + {"aaaaa", c.Min(6, a.Rune('a')), false, ""}, + {"", c.Max(4, a.Rune('b')), true, ""}, + {"X", c.Max(4, a.Rune('b')), true, ""}, + {"bbbbbX", c.Max(4, a.Rune('b')), true, "bbbb"}, + {"bbbbbX", c.Max(5, a.Rune('b')), true, "bbbbb"}, + {"bbbbbX", c.Max(6, a.Rune('b')), true, "bbbbb"}, + {"", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"X", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"cccc", c.MinMax(0, 5, a.Rune('c')), true, "cccc"}, + {"ccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(0, 0, a.Rune('c')), true, ""}, + {"cccccX", c.MinMax(0, 1, a.Rune('c')), true, "c"}, + 
{"cccccX", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(0, 6, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(1, 1, a.Rune('c')), true, "c"}, + {"", c.MinMax(1, 1, a.Rune('c')), false, ""}, + {"X", c.MinMax(1, 1, a.Rune('c')), false, ""}, + {"cccccX", c.MinMax(1, 3, a.Rune('c')), true, "ccc"}, + {"cccccX", c.MinMax(1, 6, a.Rune('c')), true, "ccccc"}, + {"cccccX", c.MinMax(3, 4, a.Rune('c')), true, "cccc"}, + {"", c.OneOrMore(a.Rune('d')), false, ""}, + {"X", c.OneOrMore(a.Rune('d')), false, ""}, + {"dX", c.OneOrMore(a.Rune('d')), true, "d"}, + {"dddddX", c.OneOrMore(a.Rune('d')), true, "ddddd"}, + {"", c.ZeroOrMore(a.Rune('e')), true, ""}, + {"X", c.ZeroOrMore(a.Rune('e')), true, ""}, + {"eX", c.ZeroOrMore(a.Rune('e')), true, "e"}, + {"eeeeeX", c.ZeroOrMore(a.Rune('e')), true, "eeeee"}, + {"HI!", c.Seq(a.Rune('H'), a.Rune('I'), a.Rune('!')), true, "HI!"}, + {"Hello, world!X", c.Seq(a.Str("Hello"), a.Comma, a.Space, a.Str("world"), a.Excl), true, "Hello, world!"}, + {"101010123", c.OneOrMore(c.Seq(a.Rune('1'), a.Rune('0'))), true, "101010"}, + {"", c.Optional(c.OneOrMore(a.Rune('f'))), true, ""}, + {"ghijkl", c.Optional(a.Rune('h')), true, ""}, + {"ghijkl", c.Optional(a.Rune('g')), true, "g"}, + {"fffffX", c.Optional(c.OneOrMore(a.Rune('f'))), true, "fffff"}, + {"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"}, + {`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`}, + {" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""}, + {" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""}, + }) +} + +func TestCombinatorPanics(t *testing.T) { + var c, a = tokenize.C, tokenize.A + AssertPanics(t, []PanicT{ + {func() { a.RuneRange('z', 'a') }, true, + `Handler: MatchRuneRange definition error at /.*/handlers_builtin_test\.go:\d+: start 'z' must not be < end 'a'`}, + {func() { c.MinMax(-1, 1, a.Space) }, true, + `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`}, + {func() { c.MinMax(1, -1, a.Space) }, true, + `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`}, + {func() { c.MinMax(10, 5, a.Space) }, true, + `Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max 5 must not be < min 10`}, + {func() { c.Min(-10, a.Space) }, true, + `Handler: MatchMin definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`}, + {func() { c.Max(-42, a.Space) }, true, + `Handler: MatchMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`}, + {func() { a.IntegerBetween(10, -10) }, true, + `Handler: MatchIntegerBetween definition error at /.*/handlers_builtin_test.go:\d+: max -10 must not be < min 10`}, + }) +} + +func TestAtoms(t *testing.T) { + var a = tokenize.A + AssertHandlers(t, []HandlerT{ + {"dd", a.RuneRange('b', 'e'), true, "d"}, + {"ee", a.RuneRange('b', 'e'), true, "e"}, + {"ff", a.RuneRange('b', 'e'), false, ""}, + {"Hello, world!", a.Str("Hello"), true, "Hello"}, + {"HellÖ, world!", a.StrNoCase("hellö"), true, "HellÖ"}, + {"+X", a.Runes('+', '-', '*', '/'), true, "+"}, + {"-X", a.Runes('+', '-', '*', '/'), true, "-"}, + {"*X", a.Runes('+', '-', '*', '/'), true, "*"}, + {"/X", a.Runes('+', '-', '*', '/'), true, "/"}, + {"!X", a.Runes('+', '-', '*', '/'), false, ""}, + {"xxx", a.Rune('x'), true, "x"}, + {"x ", a.Rune(' '), false, ""}, + {"aa", a.RuneRange('b', 'e'), false, ""}, + {"bb", 
a.RuneRange('b', 'e'), true, "b"}, + {"cc", a.RuneRange('b', 'e'), true, "c"}, + {"", a.EndOfFile, true, ""}, + {"⌘", a.AnyRune, true, "⌘"}, + {"\xbc with AnyRune", a.AnyRune, true, "�"}, + {"", a.AnyRune, false, ""}, + {"⌘", a.ValidRune, true, "⌘"}, + {"\xbc with ValidRune", a.ValidRune, false, "�"}, + {"", a.ValidRune, false, ""}, + {" ", a.Space, true, " "}, + {"X", a.Space, false, ""}, + {"\t", a.Tab, true, "\t"}, + {"\r", a.CR, true, "\r"}, + {"\n", a.LF, true, "\n"}, + {"!", a.Excl, true, "!"}, + {"\"", a.DoubleQuote, true, "\""}, + {"#", a.Hash, true, "#"}, + {"$", a.Dollar, true, "$"}, + {"%", a.Percent, true, "%"}, + {"&", a.Amp, true, "&"}, + {"'", a.SingleQuote, true, "'"}, + {"(", a.LeftParen, true, "("}, + {"(", a.RoundOpen, true, "("}, + {")", a.RightParen, true, ")"}, + {")", a.RoundClose, true, ")"}, + {"*", a.Asterisk, true, "*"}, + {"*", a.Multiply, true, "*"}, + {"+", a.Plus, true, "+"}, + {"+", a.Add, true, "+"}, + {",", a.Comma, true, ","}, + {"-", a.Minus, true, "-"}, + {"-", a.Subtract, true, "-"}, + {".", a.Dot, true, "."}, + {"/", a.Slash, true, "/"}, + {"/", a.Divide, true, "/"}, + {":", a.Colon, true, ":"}, + {";", a.Semicolon, true, ";"}, + {"<", a.AngleOpen, true, "<"}, + {"<", a.LessThan, true, "<"}, + {"=", a.Equal, true, "="}, + {">", a.AngleClose, true, ">"}, + {">", a.GreaterThan, true, ">"}, + {"?", a.Question, true, "?"}, + {"@", a.At, true, "@"}, + {"[", a.SquareOpen, true, "["}, + {"\\", a.Backslash, true, "\\"}, + {"]", a.SquareClose, true, "]"}, + {"^", a.Caret, true, "^"}, + {"_", a.Underscore, true, "_"}, + {"`", a.Backquote, true, "`"}, + {"{", a.CurlyOpen, true, "{"}, + {"|", a.Pipe, true, "|"}, + {"}", a.CurlyClose, true, "}"}, + {"~", a.Tilde, true, "~"}, + {"\t \t \r\n", a.Blank, true, "\t"}, + {" \t \t \r\n", a.Blanks, true, " \t \t "}, + {"xxx", a.Whitespace, false, ""}, + {" ", a.Whitespace, true, " "}, + {"\t", a.Whitespace, true, "\t"}, + {"\n", a.Whitespace, true, "\n"}, + {"\r\n", a.Whitespace, true, "\r\n"}, + {" \t\r\n \n \t\t\r\n ", a.Whitespace, true, " \t\r\n \n \t\t\r\n "}, + {"xxx", a.UnicodeSpace, false, ""}, + {" \t\r\n \r\v\f ", a.UnicodeSpace, true, " \t\r\n \r\v\f "}, + {"", a.EndOfLine, true, ""}, + {"\r\n", a.EndOfLine, true, "\r\n"}, + {"\n", a.EndOfLine, true, "\n"}, + {"0", a.Digit, true, "0"}, + {"1", a.Digit, true, "1"}, + {"2", a.Digit, true, "2"}, + {"3", a.Digit, true, "3"}, + {"4", a.Digit, true, "4"}, + {"5", a.Digit, true, "5"}, + {"6", a.Digit, true, "6"}, + {"7", a.Digit, true, "7"}, + {"8", a.Digit, true, "8"}, + {"9", a.Digit, true, "9"}, + {"X", a.Digit, false, ""}, + {"a", a.ASCIILower, true, "a"}, + {"z", a.ASCIILower, true, "z"}, + {"A", a.ASCIILower, false, ""}, + {"Z", a.ASCIILower, false, ""}, + {"A", a.ASCIIUpper, true, "A"}, + {"Z", a.ASCIIUpper, true, "Z"}, + {"a", a.ASCIIUpper, false, ""}, + {"z", a.ASCIIUpper, false, ""}, + {"1", a.Letter, false, ""}, + {"a", a.Letter, true, "a"}, + {"Ø", a.Letter, true, "Ø"}, + {"Ë", a.Lower, false, ""}, + {"ë", a.Lower, true, "ë"}, + {"ä", a.Upper, false, "ä"}, + {"Ä", a.Upper, true, "Ä"}, + {"0", a.HexDigit, true, "0"}, + {"9", a.HexDigit, true, "9"}, + {"a", a.HexDigit, true, "a"}, + {"f", a.HexDigit, true, "f"}, + {"A", a.HexDigit, true, "A"}, + {"F", a.HexDigit, true, "F"}, + {"g", a.HexDigit, false, "g"}, + {"G", a.HexDigit, false, "G"}, + {"0", a.Integer, true, "0"}, + {"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is valid for the integer + {"1", a.Integer, true, "1"}, + {"-10X", a.Integer, false, ""}, + {"+10X", a.Integer, 
false, ""}, + {"-10X", a.Signed(a.Integer), true, "-10"}, + {"+10X", a.Signed(a.Integer), true, "+10"}, + {"+10.1X", a.Signed(a.Integer), true, "+10"}, + {"0X", a.Float, true, "0"}, + {"0X", a.Float, true, "0"}, + {"1X", a.Float, true, "1"}, + {"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up + {"123.321X", a.Float, true, "123.321"}, + {"-3.14X", a.Float, false, ""}, + {"-3.14X", a.Signed(a.Float), true, "-3.14"}, + {"-003.0014X", a.Signed(a.Float), true, "-003.0014"}, + {"-11", a.IntegerBetween(-10, 10), false, "0"}, + {"-10", a.IntegerBetween(-10, 10), true, "-10"}, + {"0", a.IntegerBetween(-10, 10), true, "0"}, + {"10", a.IntegerBetween(-10, 10), true, "10"}, + {"11", a.IntegerBetween(0, 10), false, ""}, + }) +} + +func TestIPv4Atoms(t *testing.T) { + var a = tokenize.A + AssertHandlers(t, []HandlerT{ + // Not normalized octet. + {"0X", tokenize.MatchOctet(false), true, "0"}, + {"00X", tokenize.MatchOctet(false), true, "00"}, + {"000X", tokenize.MatchOctet(false), true, "000"}, + {"10X", tokenize.MatchOctet(false), true, "10"}, + {"010X", tokenize.MatchOctet(false), true, "010"}, + {"255123", tokenize.MatchOctet(false), true, "255"}, + {"256123", tokenize.MatchOctet(false), false, ""}, + {"300", tokenize.MatchOctet(false), false, ""}, + + // Normalized octet. + {"0X", a.Octet, true, "0"}, + {"00X", a.Octet, true, "0"}, + {"000X", a.Octet, true, "0"}, + {"10X", a.Octet, true, "10"}, + {"010X", a.Octet, true, "10"}, + {"255123", a.Octet, true, "255"}, + {"256123", a.Octet, false, ""}, + {"300", a.Octet, false, ""}, + + // IPv4 address. + {"0.0.0.0", a.IPv4, true, "0.0.0.0"}, + {"10.20.30.40", a.IPv4, true, "10.20.30.40"}, + {"010.020.003.004", a.IPv4, true, "10.20.3.4"}, + {"255.255.255.255", a.IPv4, true, "255.255.255.255"}, + {"256.255.255.255", a.IPv4, false, ""}, + + // IPv4 CIDR netmask. + {"0", a.IPv4CIDRMask, true, "0"}, + {"00", a.IPv4CIDRMask, true, "0"}, + {"000", a.IPv4CIDRMask, true, "0"}, + {"32", a.IPv4CIDRMask, true, "32"}, + {"032", a.IPv4CIDRMask, true, "32"}, + {"33", a.IPv4CIDRMask, false, ""}, + + // IPv4 netmask in dotted quad format. + {"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"}, + {"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"}, + {"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"}, + {"255.255.132.0", a.IPv4Netmask, false, ""}, // not a canonical netmask (1-bits followed by 0-bits) + + // IPv4 address + CIDR or dotted quad netmask. 
+ {"192.168.6.123", a.IPv4Net, false, ""}, + {"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"}, + {"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"}, + {"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"}, + {"10.0.0.10/193.0.0.0", a.IPv4Net, false, ""}, // invalid netmask and 193 is also invalid cidr + {"010.000.000.010/16.000.000.000", a.IPv4Net, true, "10.0.0.10/16"}, // invalid netmask, but 16 cidr is ok, remainder input = ".0.0.0" + }) +} + +func TestIPv6Atoms(t *testing.T) { + var a = tokenize.A + AssertHandlers(t, []HandlerT{ + {"", a.IPv6, false, ""}, + {"::", a.IPv6, true, "::"}, + {"1::", a.IPv6, true, "1::"}, + {"1::1", a.IPv6, true, "1::1"}, + {"::1", a.IPv6, true, "::1"}, + {"1:2:3:4:5:6:7::", a.IPv6, false, ""}, + {"::1:2:3:4:5:6:7:8:9", a.IPv6, true, "::1:2:3:4:5:6"}, + {"1:2:3:4::5:6:7:8:9", a.IPv6, true, "1:2:3:4::5:6"}, + {"a:b::ffff:0:1111", a.IPv6, true, "a:b::ffff:0:1111"}, + {"000a:000b:0000:000:00:ffff:0000:1111", a.IPv6, true, "a:b::ffff:0:1111"}, + {"000a:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "a::1:0:0:ffff:1111"}, + {"0000:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "::1:0:0:ffff:1111"}, + {"aaaa:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, true, "aaaa:bbbb:cccc:dddd:eeee:ffff:0:1111"}, + {"gggg:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, false, ""}, + {"ffff::gggg:eeee:ffff:0000:1111", a.IPv6, true, "ffff::"}, + {"0", a.IPv6CIDRMask, true, "0"}, + {"128", a.IPv6CIDRMask, true, "128"}, + {"129", a.IPv6CIDRMask, false, ""}, + {"::1/128", a.IPv6Net, true, "::1/128"}, + {"::1/129", a.IPv6Net, false, ""}, + {"1.1.1.1/24", a.IPv6Net, false, ""}, + {"ffff:0:0:0::1010/0", a.IPv6Net, true, "ffff::1010/0"}, + {"fe80:0:0:0:0216:3eff:fe96:0002/64", a.IPv6Net, true, "fe80::216:3eff:fe96:2/64"}, + }) +} + +func TestModifiers(t *testing.T) { + var c, a, m = tokenize.C, tokenize.A, tokenize.M + AssertHandlers(t, []HandlerT{ + {"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"}, + {"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"}, + {" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"}, + {" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"}, + {" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "}, + {" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"}, + {" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"}, + {"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"}, + {"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"}, + {"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"}, + {"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"}, + }) +} + +// When a TokenMaker encounters an error, this is considered a programmer error. +// A TokenMaker should not be called, unless the input is already validated to +// follow the correct pattern. Therefore, tokenmakers will panic when the +// input cannot be processed successfully. 
+func TestTokenMakerErrorHandling(t *testing.T) { + var a, tok = tokenize.A, tokenize.T + invalid := tok.Boolean("BOOL", a.Str("no")) // not valid for strconv.ParseBool() + tokenizer := tokenize.New(invalid) + AssertPanic(t, PanicT{ + func() { tokenizer("no") }, false, + `boolean token invalid (strconv.ParseBool: parsing "no": invalid syntax)`, + }) +} + +func TestTokenMakers(t *testing.T) { + var c, a, tok = tokenize.C, tokenize.A, tokenize.T + AssertTokenMakers(t, []TokenMakerT{ + {`empty token`, tok.Str("A", c.ZeroOrMore(a.Digit)), + []tokenize.Token{{Type: "A", Value: ""}}}, + + {`Ѝюج literal \string`, tok.Str("B", c.OneOrMore(a.AnyRune)), + []tokenize.Token{{Type: "B", Value: `Ѝюج literal \string`}}}, + + {`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)), + []tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}}, + + {"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}}, + {"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{ + {Type: "bar", Value: byte('R')}, + {Type: "bar", Value: byte('O')}, + {Type: "bar", Value: byte('C')}, + {Type: "bar", Value: byte('K')}, + {Type: "bar", Value: byte('S')}, + }}, + + {"Ø*", tok.Rune("P", a.AnyRune), []tokenize.Token{{Type: "P", Value: rune('Ø')}}}, + + {`2147483647XYZ`, tok.Int("D", a.Integer), []tokenize.Token{{Type: "D", Value: int(2147483647)}}}, + {`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []tokenize.Token{{Type: "D", Value: int(-2147483647)}}}, + {`127XYZ`, tok.Int8("E", a.Integer), []tokenize.Token{{Type: "E", Value: int8(127)}}}, + {`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []tokenize.Token{{Type: "E", Value: int8(-127)}}}, + {`32767XYZ`, tok.Int16("F", a.Integer), []tokenize.Token{{Type: "F", Value: int16(32767)}}}, + {`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []tokenize.Token{{Type: "F", Value: int16(-32767)}}}, + {`2147483647XYZ`, tok.Int32("G", a.Integer), []tokenize.Token{{Type: "G", Value: int32(2147483647)}}}, + {`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []tokenize.Token{{Type: "G", Value: int32(-2147483647)}}}, + {`-9223372036854775807XYZ`, tok.Int64("H", a.Signed(a.Integer)), []tokenize.Token{{Type: "H", Value: int64(-9223372036854775807)}}}, + + {`4294967295`, tok.Uint("I", a.Integer), []tokenize.Token{{Type: "I", Value: uint(4294967295)}}}, + {`255XYZ`, tok.Uint8("J", a.Integer), []tokenize.Token{{Type: "J", Value: uint8(255)}}}, + {`65535XYZ`, tok.Uint16("K", a.Integer), []tokenize.Token{{Type: "K", Value: uint16(65535)}}}, + {`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Value: uint32(4294967295)}}}, + {`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}}, + + {`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}}, + {`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}}, + + {`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + {Type: "P", Value: true}, + }}, + + {`0fFfalseFALSEFalse`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{ + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + {Type: "P", Value: false}, + }}, + }) +} + +// I know, this is hell, 
but that's the whole point for this test :->
+func TestCombination(t *testing.T) {
+ var c, a, m = tokenize.C, tokenize.A, tokenize.M
+ demonic := c.Seq(
+ c.Optional(a.SquareOpen),
+ m.Trim(
+ c.Seq(
+ c.Optional(a.Blanks),
+ c.Repeated(3, a.AngleClose),
+ m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string {
+ return fmt.Sprintf("%d", len(s))
+ }),
+ m.Replace(c.Separated(a.Comma, c.Optional(a.Blanks)), ", "),
+ m.ToUpper(c.Min(1, a.ASCIILower)),
+ m.Drop(a.Excl),
+ c.Repeated(3, a.AngleOpen),
+ c.Optional(a.Blanks),
+ ),
+ " \t",
+ ),
+ c.Optional(a.SquareClose),
+ )
+
+ AssertHandlers(t, []HandlerT{
+ {"[ \t >>>Hello, world!<<< ]", demonic, true, "[>>>5, WORLD<<<]"},
+ {"[ \t >>>Hello, world!<<< ", demonic, true, "[>>>5, WORLD<<<"},
+ {">>>HellohellO, world!<<< ]", demonic, true, ">>>10, WORLD<<<]"},
+ {"[ \t >>>HellohellO , , , world!<<< ", demonic, true, "[>>>10, WORLD<<<"},
+ })
+}
diff --git a/tokenize2/token.go b/tokenize2/token.go
new file mode 100644
index 0000000..166367a
--- /dev/null
+++ b/tokenize2/token.go
@@ -0,0 +1,47 @@
+package tokenize2
+
+import (
+ "fmt"
+)
+
+// Token defines a lexical token as produced by tokenize.Handlers.
+//
+// Both the Type field and the Value field are optional and can be filled
+// with data at will.
+//
+// The use of the Type field is to let a tokenizer communicate to
+// the parser what type of token it's handling.
+//
+// The use of the Value field is to store any kind of data along with the token.
+// One use of this can be found in the built-in token maker functions like
+// MakeInt8Token(), which stores an interpreted version of the input string
+// in the Value field.
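+//
+// For example (illustrative):
+//
+//   Token{Type: "NUMBER", Value: int64(42)}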
+type Token struct {
+ Type interface{} // optional token type, can be any type that a parser author sees fit
+ Value interface{} // optional token value, of any type as well
+}
+
+func (t Token) String() string {
+ tokenType := ""
+ if t.Type != nil {
+ tokenType = fmt.Sprintf("%v", t.Type)
+ }
+
+ value := ""
+ if t.Value != nil {
+ switch t.Value.(type) {
+ case []Token:
+ return fmt.Sprintf("%v%v", tokenType, t.Value)
+ case string:
+ value = fmt.Sprintf("%q", t.Value)
+ case rune:
+ value = fmt.Sprintf("%v", t.Value)
+ case bool:
+ value = fmt.Sprintf("%v", t.Value)
+ default:
+ value = fmt.Sprintf("(%T)%v", t.Value, t.Value)
+ }
+ }
+
+ return fmt.Sprintf("%v(%s)", tokenType, value)
+}
diff --git a/tokenize2/tokenize.go b/tokenize2/tokenize.go
new file mode 100644
index 0000000..1fc35c3
--- /dev/null
+++ b/tokenize2/tokenize.go
@@ -0,0 +1,41 @@
+// Package tokenize provides tooling to build a tokenizer in
+// parser/combinator-style, used to feed data to the parser.
+package tokenize2
+
+import (
+ "fmt"
+)
+
+// Func is the function signature as returned by New: a function that takes
+// any supported type of input, executes a tokenizer run and returns the
+// resulting API struct (possibly nil) and an error (possibly nil).
+type Func func(input interface{}) (*API, error)
+
+// New instantiates a new tokenizer.
+//
+// The tokenizer is a tokenizing state machine, in which tokenize.Handler
+// functions are used to move the state machine forward during tokenizing.
+// Using the New function, you can wrap a tokenize.Handler in a simple way,
+// making it possible to feed some input to the handler and retrieve the
+// tokenizing results.
+//
+// The startHandler argument points the tokenizer to the tokenize.Handler function
+// that must be executed at the start of the tokenizing process. From there on
+// other tokenize.Handler functions can be invoked recursively to implement the
+// tokenizing process.
+//
+// This function returns a function that can be invoked to run the tokenizer
+// against the provided input data. For an overview of allowed inputs, take a
+// look at the documentation for parsekit.read.New().
+func New(tokenHandler Handler) Func {
+ return func(input interface{}) (*API, error) {
+ api := NewAPI(input)
+ ok := tokenHandler(api)
+
+ if !ok {
+ err := fmt.Errorf("mismatch at %s", Cursor{})
+ return nil, err
+ }
+ return api, nil
+ }
+}
diff --git a/tokenize2/tokenizer_test.go b/tokenize2/tokenizer_test.go
new file mode 100644
index 0000000..55fe905
--- /dev/null
+++ b/tokenize2/tokenizer_test.go
@@ -0,0 +1,223 @@
+package tokenize2_test
+
+import (
+ "fmt"
+ "io"
+ "strings"
+ "testing"
+ "unicode/utf8"
+
+ tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
+)
+
+// TODO For error handling, it would be really cool if for example the
+// 10.0.300.1/24 case would return an actual error stating that
+// 300 is not a valid octet for an IPv4 address.
+// Biggest thing to take care of here, is that errors should not stop
+// a Parser flow (since we might be trying to match different cases in
+// sequence), but a Parser flow should optionally be able to make use
+// of the actual error.
+// The same goes for a Tokenizer, since those can also make use of
+// optional matching using tokenize.C.Any(...) for example. If matching
+// for Any(IPv4, Digits), the example case should simply end up with 10
+// after the IPv4 mismatch.
+func ExampleNew() {
+ // Build the tokenizer for ip/mask.
+ var c, a, t = tokenize.C, tokenize.A, tokenize.T
+ ip := t.Str("ip", a.IPv4)
+ mask := t.Int8("mask", a.IPv4CIDRMask)
+ cidr := c.Seq(ip, a.Slash, mask)
+ tokenizer := tokenize.New(cidr)
+
+ for _, input := range []string{
+ "000.000.000.000/000",
+ "192.168.0.1/24",
+ "255.255.255.255/32",
+ "10.0.300.1/24",
+ "not an IPv4 CIDR",
+ } {
+ // The tokenizer returns a result and an error, which is nil on success.
+ result, err := tokenizer(input) + + if err == nil { + fmt.Printf("Result: %s\n", result.Tokens()) + } else { + fmt.Printf("Error: %s\n", err) + } + } + // Output: + // Result: [ip("0.0.0.0") mask((int8)0)] + // Result: [ip("192.168.0.1") mask((int8)24)] + // Result: [ip("255.255.255.255") mask((int8)32)] + // Error: mismatch at start of file + // Error: mismatch at start of file +} + +func TestCallingNextRune_ReturnsNextRune(t *testing.T) { + api := makeTokenizeAPI() + r, _ := api.NextRune() + AssertEqual(t, 'T', r, "first rune") +} + +func TestInputCanAcceptRunesFromReader(t *testing.T) { + i := makeTokenizeAPI() + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + i.NextRune() + i.Accept() + AssertEqual(t, "Tes", i.String(), "i.String()") +} + +func TestCallingNextRuneTwice_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.NextRune() + i.NextRune() + }, + Regexp: true, + Expect: `tokenize\.API\.NextRune\(\): NextRune\(\) called at /.*_test\.go:\d+ ` + + `without a prior call to Accept\(\)`, + }) +} + +func TestCallingAcceptWithoutCallingNextRune_Panics(t *testing.T) { + api := makeTokenizeAPI() + AssertPanic(t, PanicT{ + Function: api.Accept, + Regexp: true, + Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*test\.go:\d+ ` + + `without first calling NextRune\(\)`, + }) +} + +func TestCallingAcceptAfterReadError_Panics(t *testing.T) { + api := tokenize.NewAPI("") + AssertPanic(t, PanicT{ + Function: func() { + api.NextRune() + api.Accept() + }, + Regexp: true, + Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+` + + `, but the prior call to NextRune\(\) failed`, + }) +} + +func TestCallingMergeOnTopLevelAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.Merge(0) + }, + Regexp: true, + Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ on the top-level API`}) +} + +func TestCallingMergeOnForkParentAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + child := i.Fork() + i.Fork() + i.Merge(child) + }, + Regexp: true, + Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + + `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) +} + +func TestCallingDisposeOnTopLevelAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.Dispose(0) + }, + Regexp: true, + Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ on the top-level API`}) +} + +func TestCallingDisposeOnForkParentAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + child := i.Fork() + i.Fork() + i.Dispose(child) + }, + Regexp: true, + Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ ` + + `on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`}) +} + +func TestCallingForkOnForkedParentAPI_Panics(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.Fork() + g := i.Fork() + i.Fork() + i.Merge(g) + }, + Regexp: true, + Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` + + `on API stack level 2, but the current stack level is 3 \(forgot to Dispose\(\) a forked child\?\)`}) +} + +func TestForkingInput_ClearsLastRune(t *testing.T) { + AssertPanic(t, PanicT{ + Function: func() { + i := makeTokenizeAPI() + i.NextRune() + i.Fork() 
+ i.Accept()
+ },
+ Regexp: true,
+ Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+ without first calling NextRune\(\)`,
+ })
+}
+
+func TestAccept_UpdatesCursor(t *testing.T) {
+ i := tokenize.NewAPI(strings.NewReader("input\r\nwith\r\nnewlines"))
+ AssertEqual(t, "start of file", i.Cursor().String(), "cursor 1")
+ for j := 0; j < 6; j++ { // read "input\r", cursor ends up at "\n"
+ i.NextRune()
+ i.Accept()
+ }
+ AssertEqual(t, "line 1, column 7", i.Cursor().String(), "cursor 2")
+ i.NextRune() // read "\n", cursor ends up at start of new line
+ i.Accept()
+ AssertEqual(t, "line 2, column 1", i.Cursor().String(), "cursor 3")
+ for j := 0; j < 10; j++ { // read "with\r\nnewl", cursor ends up at "i"
+ i.NextRune()
+ i.Accept()
+ }
+ AssertEqual(t, "line 3, column 5", i.Cursor().String(), "cursor 4")
+}
+
+func TestWhenCallingNextRuneAtEndOfFile_EOFIsReturned(t *testing.T) {
+ i := tokenize.NewAPI(strings.NewReader("X"))
+ i.NextRune()
+ i.Accept()
+ r, err := i.NextRune()
+ AssertEqual(t, true, r == utf8.RuneError, "returned rune from NextRune()")
+ AssertEqual(t, true, err == io.EOF, "returned error from NextRune()")
+}
+
+func TestAfterReadingRuneAtEndOfFile_EarlierRunesCanStillBeAccessed(t *testing.T) {
+ i := tokenize.NewAPI(strings.NewReader("X"))
+ child := i.Fork()
+ i.NextRune()
+ i.Accept()
+ r, err := i.NextRune()
+ AssertEqual(t, true, r == utf8.RuneError, "returned rune from 2nd NextRune()")
+ i.Dispose(child) // brings the read offset back to the start
+ r, err = i.NextRune() // so here we should see the first rune again
+ AssertEqual(t, 'X', r, "returned rune from 3rd NextRune()")
+ AssertEqual(t, true, err == nil, "returned error from 3rd NextRune()")
+}
+
+func makeTokenizeAPI() *tokenize.API {
+ return tokenize.NewAPI("Testing")
+}
diff --git a/tokenize2/tokenizer_whitebox_test.go b/tokenize2/tokenizer_whitebox_test.go
new file mode 100644
index 0000000..10ae253
--- /dev/null
+++ b/tokenize2/tokenizer_whitebox_test.go
@@ -0,0 +1,110 @@
+package tokenize2
+
+import (
+ "testing"
+)
+
+func TestFork_CreatesForkOfInputAtSameCursorPosition(t *testing.T) {
+ // Create input, accept the first rune.
+ i := NewAPI("Testing")
+ i.NextRune()
+ i.Accept() // T
+ AssertEqual(t, "T", i.String(), "accepted rune in input")
+ // Fork
+ child := i.Fork()
+ AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte")
+ AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset")
+ AssertEqual(t, 1, i.stackFrame.cursor.Byte, "child cursor.Byte")
+ AssertEqual(t, 1, i.stackFrame.offset, "child offset")
+ // Accept two runes via fork.
+ i.NextRune()
+ i.Accept() // e
+ i.NextRune()
+ i.Accept() // s
+ AssertEqual(t, "es", i.String(), "result runes in fork")
+ AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte")
+ AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset")
+ AssertEqual(t, 3, i.stackFrame.cursor.Byte, "child cursor.Byte")
+ AssertEqual(t, 3, i.stackFrame.offset, "child offset")
+ // Merge fork back into parent.
+ i.Merge(child)
+ i.Dispose(child)
+ AssertEqual(t, "Tes", i.String(), "result runes in parent Input after Merge()")
+ AssertEqual(t, 3, i.stackFrame.cursor.Byte, "parent cursor.Byte")
+ AssertEqual(t, 3, i.stackFrame.offset, "parent offset")
+}
+
+func TestGivenForkedChildWhichAcceptedRune_AfterMerging_RuneEndsUpInParentResult(t *testing.T) {
+ i := NewAPI("Testing")
+ i.NextRune()
+ i.Accept()
+ f1 := i.Fork()
+ i.NextRune()
+ i.Accept()
+ f2 := i.Fork()
+ i.NextRune()
+ i.Accept()
+ AssertEqual(t, "s", i.String(), "f2 String()")
+ AssertEqual(t, 3, i.stackFrame.offset, "f2 offset")
+ i.Merge(f2)
+ i.Dispose(f2)
+ AssertEqual(t, "es", i.String(), "f1 String()")
+ AssertEqual(t, 3, i.stackFrame.offset, "f1 offset")
+ i.Merge(f1)
+ i.Dispose(f1)
+ AssertEqual(t, "Tes", i.String(), "top-level API String()")
+ AssertEqual(t, 3, i.stackFrame.offset, "top-level offset")
+}
+
+func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *testing.T) {
+ i := NewAPI("Testing")
+ r, _ := i.NextRune()
+ AssertEqual(t, 'T', r, "result from 1st call to NextRune()")
+ AssertTrue(t, i.lastRune == 'T', "API.lastRune after NextRune() is not 'T'")
+ AssertTrue(t, i.runeRead, "API.runeRead after NextRune() is not true")
+ i.Accept()
+ AssertTrue(t, i.runeRead == false, "API.runeRead after Accept() is not false")
+ AssertEqual(t, 1, i.stackFrame.offset, "API.stackFrame.offset")
+ r, _ = i.NextRune()
+ AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
+}
+
+func TestFlushInput(t *testing.T) {
+ api := NewAPI("cool")
+
+ // Flushing without any read data is okay. FlushInput() will return
+ // false in this case, and nothing else happens.
+ AssertTrue(t, api.FlushInput() == false, "flush input at start")
+
+ api.NextRune()
+ api.Accept()
+ api.NextRune()
+ api.Accept()
+
+ AssertTrue(t, api.FlushInput() == true, "flush input after reading some data")
+ AssertEqual(t, 0, api.stackFrame.offset, "offset after flush input")
+
+ AssertTrue(t, api.FlushInput() == false, "flush input after flush input")
+
+ // Read offset is now zero, but reading should continue after "co".
+ api.NextRune()
+ api.Accept()
+ api.NextRune()
+ api.Accept()
+
+ AssertEqual(t, "cool", api.String(), "end result")
+}
+
+func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
+ if expected != actual {
+ t.Errorf(
+ "Unexpected value for %s:\nexpected: %q\nactual: %q",
+ forWhat, expected, actual)
+ }
+}
+
+func AssertTrue(t *testing.T, b bool, assertion string) {
+ if !b {
+ t.Errorf("Assertion %s is false", assertion)
+ }
+}