New implementation for performance.

Maurice Makaay 2019-07-10 11:26:47 +00:00
parent 7795588fe6
commit 48d7fda9f8
15 changed files with 3524 additions and 0 deletions

tokenize2/api.go

@ -0,0 +1,374 @@
package tokenize2
import (
"git.makaay.nl/mauricem/go-parsekit/read"
)
// API holds the internal state of a tokenizer run and provides an API that
// tokenize.Handler functions can use to:
//
// • read and accept runes from the input (NextRune, Accept)
//
// • fork the API for easy lookahead support (Fork, Merge, Reset, Dispose)
//
// • flush already read input data when not needed anymore (FlushInput)
//
// • inspect and modify the tokenizer results (Runes, Tokens and related methods)
//
// BASIC OPERATION:
//
// To retrieve the next rune from the API, call the NextRune() method.
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the result runes of the API and the read cursor is moved
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the methods Runes() and String().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the rune- and token-related methods
// on the API (Runes, SetRunes, Tokens, AddTokens and so on).
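//
// A minimal sketch of this basic flow (the input string is illustrative):
//
//	api := NewAPI("Hi!")
//	r, err := api.NextRune() // reads 'H'
//	if err == nil {
//		api.Accept() // adds 'H' to the result runes
//	}
//	fmt.Println(api.String()) // "H"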
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
// Sometimes, we must be able to perform a lookahead, which might either
// succeed or fail. In case of a failing lookahead, the state of the
// API must be brought back to the original state, so we can try
// a different route.
//
// This is supported by forking an API struct through the method Fork().
// Fork() returns a forked child API with empty result data, but with the
// same read cursor position as the forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
// used to fill the results. When the lookahead was successful, then
// Merge() can be called on the forked child to append the child's results
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful, the forked child API can be disposed
// of by calling Dispose() on it. This brings the API back to the parent
// stack level. The parent API was never modified by the child, so it can
// safely be used after disposal as if the lookahead never happened.
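//
// A rough sketch of a lookahead using Fork/Merge/Dispose (tryLookahead is a
// hypothetical handler, used here for illustration only):
//
//	child := api.Fork()
//	if tryLookahead(api) {
//		api.Merge(child) // lookahead succeeded, keep the child's results
//	}
//	api.Dispose(child) // in both cases, return to the parent stack level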
//
// Opinionated note:
// Many tokenizers/parsers take a different approach to lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient but, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor back to the correct position, which can
// lead to hard-to-track bugs. I much prefer this forking method, since
// no such bookkeeping has to be implemented when writing a parser.
type API struct {
reader *read.Buffer // the input data reader
lastRune rune // the rune as retrieved by the last NextRune() call
lastRuneErr error // the error for the last NextRune() call
runeRead bool // whether or not a rune was read using NextRune()
runes []rune // the rune stack
tokens []Token // the token stack
stackFrames []stackFrame // the stack frames, containing stack level-specific data
stackLevel int // the current stack level
stackFrame *stackFrame // the current stack frame
}
type stackFrame struct {
offset int // current rune offset relative to the Reader's sliding window
runeStart int
runeEnd int
tokenStart int
tokenEnd int
cursor Cursor
// TODO
err error // can be used by a Handler to report a specific issue with the input
}
const initialStackDepth = 10
const initialTokenDepth = 10
const initialRuneDepth = 10
// NewAPI initializes a new API struct, wrapped around the provided input.
// For an overview of allowed inputs, take a look at the documentation
// for parsekit.read.New().
func NewAPI(input interface{}) *API {
api := &API{
reader: read.New(input),
runes: make([]rune, 0, initialRuneDepth),
tokens: make([]Token, 0, initialTokenDepth),
stackFrames: make([]stackFrame, 1, initialStackDepth),
}
api.stackFrame = &api.stackFrames[0]
return api
}
// NextRune returns the rune at the current read offset.
//
// When an invalid UTF8 rune is encountered on the input, it is replaced with
// the utf8.RuneError rune. It's up to the caller to handle this as an error
// when needed.
//
// After reading a rune, it must be Accept()-ed to move the read cursor forward
// to the next rune. Doing so is mandatory: a second call to NextRune() without
// an explicit Accept() will make this method panic. You can see this as a
// built-in unit test, enforcing correct serialization of API method calls.
func (i *API) NextRune() (rune, error) {
if i.runeRead {
callerPanic("NextRune", "tokenize.API.{name}(): {name}() called at {caller} "+
"without a prior call to Accept()")
}
readRune, err := i.reader.RuneAt(i.stackFrame.offset)
i.lastRune = readRune
i.lastRuneErr = err
i.runeRead = true
return readRune, err
}
// Accept adds the last rune as read by NextRune() to the result runes and
// moves the read cursor forward.
//
// It is not allowed to call Accept() when the previous call to NextRune()
// returned an error. Calling Accept() in such case will result in a panic.
func (i *API) Accept() {
if !i.runeRead {
callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller} "+
"without first calling NextRune()")
} else if i.lastRuneErr != nil {
callerPanic("Accept", "tokenize.API.{name}(): {name}() called at {caller}, "+
"but the prior call to NextRune() failed")
}
i.runes = append(i.runes, i.lastRune)
i.stackFrame.runeEnd++
i.stackFrame.cursor.moveByRune(i.lastRune)
i.stackFrame.offset++
i.runeRead = false
}
// Fork forks off a child of the API struct. It will reuse the same
// read buffer and cursor position, but otherwise it is a fresh API.
//
// By forking an API, you can freely work with the forked child, without
// affecting the parent API. This is for example useful when you must perform
// some form of lookahead.
//
// When processing of the Handler was successful and you want to add the results
// to the parent API, you can call Merge() on the forked child.
// This will add the results to the results of the parent (runes, tokens).
// It also updates the read cursor position of the parent to that of the child.
//
// When the lookahead was unsuccessful, the forked child API can be disposed
// of by calling Dispose() on the forked child. This brings the API back to
// the parent stack level. The parent API was never modified by the child, so
// it can safely be used after disposal as if the lookahead never happened.
func (i *API) Fork() int {
newStackLevel := i.stackLevel + 1
newStackSize := newStackLevel + 1
// Grow the stack frames capacity when needed.
if cap(i.stackFrames) < newStackSize {
newFrames := make([]stackFrame, newStackSize, newStackSize*2)
copy(newFrames, i.stackFrames)
i.stackFrames = newFrames
} else {
i.stackFrames = i.stackFrames[0:newStackSize]
}
parent := i.stackFrame
i.stackLevel++
i.stackFrame = &i.stackFrames[i.stackLevel]
*i.stackFrame = *parent
i.stackFrame.runeStart = parent.runeEnd
i.stackFrame.tokenStart = parent.tokenEnd
i.runeRead = false
return i.stackLevel
}
// Merge appends the results of a forked child API (runes, tokens) to the
// results of its parent. The read cursor of the parent is also updated
// to that of the forked child.
//
// After the merge operation, the child results are reset so the child can
// immediately be reused for performing another match. This means that all
// result data are cleared, but the read cursor position is kept at its
// current position. This allows a child to feed results in chunks to its
// parent.
//
// Once the child is no longer needed, it can be disposed of by using the
// method Dispose(), which returns the API to the parent stack level.
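//
// A rough sketch of feeding results in chunks (scanChunk is a hypothetical
// handler, used here for illustration only):
//
//	child := api.Fork()
//	for scanChunk(api) {
//		api.Merge(child) // feed each chunk of results to the parent
//	}
//	api.Dispose(child)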
func (i *API) Merge(stackLevel int) {
if stackLevel == 0 {
callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
"on the top-level API stack level 0")
}
if stackLevel != i.stackLevel {
callerPanic("Merge", "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the current stack level is %d "+
"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
}
parent := &i.stackFrames[stackLevel-1]
if parent.runeEnd == i.stackFrame.runeStart {
// The end of the parent slice aligns with the start of the child slice.
// Because of this, the merge can be done by simply expanding the parent
// slice to include the child slice.
// parent : |----------|
// child: |------|
// After merge operation:
// parent: |-----------------|
// child: |---> continue reading from here
parent.runeEnd = i.stackFrame.runeEnd
i.stackFrame.runeStart = i.stackFrame.runeEnd
} else {
// The end of the parent slice does not align with the start of the
// child slice. The child slice has to be copied onto the end of
// the parent slice.
// parent : |----------|
// child: |------|
// After merge operation:
// parent: |-----------------|
// child: |---> continue reading from here
i.runes = append(i.runes[:parent.runeEnd], i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd]...)
parent.runeEnd = len(i.runes)
i.stackFrame.runeStart = parent.runeEnd
i.stackFrame.runeEnd = parent.runeEnd
}
// The same logic applies to tokens.
if parent.tokenEnd == i.stackFrame.tokenStart {
parent.tokenEnd = i.stackFrame.tokenEnd
i.stackFrame.tokenStart = i.stackFrame.tokenEnd
} else {
i.tokens = append(i.tokens[:parent.tokenEnd], i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]...)
parent.tokenEnd = len(i.tokens)
i.stackFrame.tokenStart = parent.tokenEnd
i.stackFrame.tokenEnd = parent.tokenEnd
}
parent.offset = i.stackFrame.offset
parent.cursor = i.stackFrame.cursor
i.stackFrame.err = nil
i.runeRead = false
}
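// Dispose disposes of the forked child API for the provided stack level and
// makes its parent the active stack level again. Any results that were not
// merged into the parent are discarded.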
func (i *API) Dispose(stackLevel int) {
if stackLevel == 0 {
callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
"on the top-level API stack level 0")
}
if stackLevel != i.stackLevel {
callerPanic("Dispose", "tokenize.API.{name}(): {name}() called at {caller} "+
"on API stack level %d, but the current stack level is %d "+
"(forgot to Dispose() a forked child?)", stackLevel, i.stackLevel)
}
i.runeRead = false
i.stackLevel = stackLevel - 1
i.stackFrames = i.stackFrames[:stackLevel]
i.stackFrame = &i.stackFrames[stackLevel-1]
i.runes = i.runes[0:i.stackFrame.runeEnd]
i.tokens = i.tokens[0:i.stackFrame.tokenEnd]
}
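// Reset clears the results for the active stack level, but keeps the read
// cursor at its current position.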
func (i *API) Reset() {
i.runeRead = false
i.stackFrame.runeStart = i.stackFrame.runeEnd
i.stackFrame.tokenStart = i.stackFrame.tokenEnd
i.stackFrame.err = nil
}
// FlushInput flushes processed input data from the read.Buffer.
// In this context 'processed' means all runes that were read using NextRune()
// and that were added to the results using Accept().
//
// Note:
// When writing your own Handler, you normally won't have to call this
// method yourself. It is automatically called by parsekit when needed.
func (i *API) FlushInput() bool {
// result := &(i.state.stack[i.stackLevel])
if i.stackFrame.offset > 0 {
i.reader.Flush(i.stackFrame.offset)
i.stackFrame.offset = 0
return true
}
return false
}
func (i *API) String() string {
return string(i.Runes())
}
func (i *API) Runes() []rune {
return i.runes[i.stackFrame.runeStart:i.stackFrame.runeEnd]
}
func (i *API) Rune(offset int) rune {
return i.runes[i.stackFrame.runeStart+offset]
}
func (i *API) ClearRunes() {
i.runes = i.runes[:i.stackFrame.runeStart]
i.stackFrame.runeEnd = i.stackFrame.runeStart
}
func (i *API) SetRunes(runes ...rune) {
i.runes = append(i.runes[:i.stackFrame.runeStart], runes...)
i.stackFrame.runeEnd = i.stackFrame.runeStart + len(runes)
}
func (i *API) AddRunes(runes ...rune) {
i.runes = append(i.runes[:i.stackFrame.runeEnd], runes...)
i.stackFrame.runeEnd += len(runes)
}
func (i *API) AddString(s string) {
i.AddRunes([]rune(s)...)
}
func (i *API) SetString(s string) {
i.SetRunes([]rune(s)...)
}
func (i *API) Cursor() Cursor {
return i.stackFrame.cursor
}
func (i *API) Tokens() []Token {
return i.tokens[i.stackFrame.tokenStart:i.stackFrame.tokenEnd]
}
func (i *API) Token(offset int) Token {
return i.tokens[i.stackFrame.tokenStart+offset]
}
func (i *API) TokenValue(offset int) interface{} {
return i.tokens[i.stackFrame.tokenStart+offset].Value
}
func (i *API) ClearTokens() {
i.tokens = i.tokens[:i.stackFrame.tokenStart]
i.stackFrame.tokenEnd = i.stackFrame.tokenStart
}
func (i *API) SetTokens(tokens ...Token) {
i.tokens = append(i.tokens[:i.stackFrame.tokenStart], tokens...)
i.stackFrame.tokenEnd = i.stackFrame.tokenStart + len(tokens)
}
func (i *API) AddTokens(tokens ...Token) {
i.tokens = append(i.tokens[:i.stackFrame.tokenEnd], tokens...)
i.stackFrame.tokenEnd += len(tokens)
}

tokenize2/api_test.go

@ -0,0 +1,330 @@
package tokenize2_test
import (
"fmt"
"testing"
tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)
func ExampleNewAPI() {
tokenize.NewAPI("The input that the API will handle")
// Output:
}
func ExampleAPI_NextRune() {
api := tokenize.NewAPI("The input that the API will handle")
r, err := api.NextRune()
fmt.Printf("Rune read from input; %c\n", r)
fmt.Printf("The error: %v\n", err)
fmt.Printf("API results: %q\n", api.String())
// Output:
// Rune read from input; T
// The error: <nil>
// API results: ""
}
func ExampleAPI_Accept() {
api := tokenize.NewAPI("The input that the API will handle")
api.NextRune() // reads 'T'
api.Accept() // adds 'T' to the API results
api.NextRune() // reads 'h'
api.Accept() // adds 'h' to the API results
api.NextRune() // reads 'e', but it is not added to the API results
fmt.Printf("API results: %q\n", api.String())
// Output:
// API results: "Th"
}
func ExampleAPI_modifyingResults() {
api := tokenize.NewAPI("")
api.AddString("Some runes")
api.AddRunes(' ', 'a', 'd', 'd', 'e', 'd')
api.AddRunes(' ', 'i', 'n', ' ')
api.AddString("various ways")
fmt.Printf("API result first 10 runes: %q\n", api.Runes()[0:10])
fmt.Printf("API result runes as string: %q\n", api.String())
api.SetString("new ")
api.AddString("set ")
api.AddString("of ")
api.AddRunes('r', 'u', 'n', 'e', 's')
fmt.Printf("API result runes as string: %q\n", api.String())
fmt.Printf("API result runes: %q\n", api.Runes())
fmt.Printf("API third rune: %q\n", api.Rune(2))
api.AddTokens(tokenize.Token{
Type: 42,
Value: "towel"})
api.AddTokens(tokenize.Token{
Type: 73,
Value: "Zaphod"})
fmt.Printf("API result tokens: %v\n", api.Tokens())
fmt.Printf("API second result token: %v\n", api.Token(1))
// Output:
// API result first 10 runes: ['S' 'o' 'm' 'e' ' ' 'r' 'u' 'n' 'e' 's']
// API result runes as string: "Some runes added in various ways"
// API result runes as string: "new set of runes"
// API result runes: ['n' 'e' 'w' ' ' 's' 'e' 't' ' ' 'o' 'f' ' ' 'r' 'u' 'n' 'e' 's']
// API third rune: 'w'
// API result tokens: [42("towel") 73("Zaphod")]
// API second result token: 73("Zaphod")
}
func ExampleAPI_Reset() {
api := tokenize.NewAPI("Very important input!")
api.NextRune()
api.Accept()
api.NextRune()
api.Accept()
fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor())
// Reset clears the results, but keeps the cursor position.
api.Reset()
fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor())
api.NextRune()
api.Accept()
api.NextRune()
api.Accept()
fmt.Printf("API results: %q at %s\n", api.String(), api.Cursor())
// Output:
// API results: "Ve" at line 1, column 3
// API results: "" at line 1, column 3
// API results: "ry" at line 1, column 5
}
func ExampleAPI_Fork() {
// This custom Handler checks for input 'a', 'b' or 'c'.
abcHandler := func(t *tokenize.API) bool {
a := tokenize.A
for _, r := range []rune{'a', 'b', 'c'} {
child := t.Fork() // fork, so we won't change parent t
if a.Rune(r)(t) {
t.Merge(child) // accept results into parent of child
t.Dispose(child) // return to the parent level
return true // and report a successful match
}
t.Dispose(child) // return to the parent level
}
// If we get here, then no match was found. Return false to communicate
// this to the caller.
return false
}
// Note: a custom Handler is normally not what you need.
// You can make use of the parser/combinator tooling to make the
// implementation a lot simpler and to take care of forking at
// the appropriate places. The handler from above can be replaced with:
simpler := tokenize.A.RuneRange('a', 'c')
result, err := tokenize.New(abcHandler)("another test")
fmt.Println(result, err)
result, err = tokenize.New(simpler)("curious")
fmt.Println(result, err)
result, err = tokenize.New(abcHandler)("bang on!")
fmt.Println(result, err)
result, err = tokenize.New(abcHandler)("not a match")
fmt.Println(result, err)
// Output:
// a <nil>
// c <nil>
// b <nil>
// <nil> mismatch at start of file
}
func ExampleAPI_Merge() {
tokenHandler := func(t *tokenize.API) bool {
child1 := t.Fork()
t.NextRune() // reads 'H'
t.Accept()
t.NextRune() // reads 'i'
t.Accept()
child2 := t.Fork()
t.NextRune() // reads ' '
t.Accept()
t.NextRune() // reads 'm'
t.Accept()
t.Dispose(child2)
t.Merge(child1) // We merge child1, which has read 'H' and 'i' only.
t.Dispose(child1) // and clean up child1 to return to the parent
return true
}
result, _ := tokenize.New(tokenHandler)("Hi mister X!")
fmt.Println(result.String())
// Output:
// Hi
}
func TestMultipleLevelsOfForksAndMerges(t *testing.T) {
api := tokenize.NewAPI("abcdefghijklmnopqrstuvwxyz")
// Fork a few levels.
child1 := api.Fork()
child2 := api.Fork()
child3 := api.Fork()
child4 := api.Fork()
// Read a rune 'a' from child4.
r, _ := api.NextRune()
AssertEqual(t, 'a', r, "child4 rune 1")
api.Accept()
AssertEqual(t, "a", api.String(), "child4 runes after rune 1")
// Read another rune 'b' from child4.
r, _ = api.NextRune()
AssertEqual(t, 'b', r, "child4 rune 2")
api.Accept()
AssertEqual(t, "ab", api.String(), "child4 runes after rune 2")
// Merge "ab" from child4 to child3.
api.Merge(child4)
AssertEqual(t, "", api.String(), "child4 runes after first merge")
// Read some more from child4.
r, _ = api.NextRune()
AssertEqual(t, 'c', r, "child4 rune 3")
api.Accept()
AssertEqual(t, "c", api.String(), "child4 runes after rune 1")
AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child4 rune 3")
// Merge "c" from child4 to child3.
api.Merge(child4)
// And dispose of child4, making child3 the active stack level.
api.Dispose(child4)
// Child3 should now have the combined results "abc" from child4's work.
AssertEqual(t, "abc", api.String(), "child3 after merge of child4")
AssertEqual(t, "line 1, column 4", api.Cursor().String(), "cursor child3 rune 3, after merge of child4")
// Now read some data from child3.
r, _ = api.NextRune()
AssertEqual(t, 'd', r, "child3 rune 4")
api.Accept()
r, _ = api.NextRune()
AssertEqual(t, 'e', r, "child3 rune 5")
api.Accept()
r, _ = api.NextRune()
AssertEqual(t, 'f', r, "child3 rune 6")
api.Accept()
AssertEqual(t, "abcdef", api.String(), "child3 total result after rune 6")
// Temporarily create some new forks from here, but don't use their outcome.
child3sub1 := api.Fork()
api.NextRune()
api.Accept()
api.NextRune()
api.Accept()
child3sub2 := api.Fork()
api.NextRune()
api.Accept()
api.Merge(child3sub2) // do merge sub2 down to sub1
api.Dispose(child3sub2) // and dispose of sub2
api.Dispose(child3sub1) // but dispose of sub1 without merging
// Instead, merge the results from before this forking detour from child3 to
// child2, and dispose of child3.
api.Merge(child3)
api.Dispose(child3)
AssertEqual(t, "abcdef", api.String(), "child2 total result after merge of child3")
AssertEqual(t, "line 1, column 7", api.Cursor().String(), "cursor child2 after merge child3")
// Merge child2 to child1 and dispose of it.
api.Merge(child2)
api.Dispose(child2)
// Merge child1 a few times to the top level api.
api.Merge(child1)
api.Merge(child1)
api.Merge(child1)
api.Merge(child1)
// And dispose of it.
api.Dispose(child1)
// Read some data from the top level api.
r, _ = api.NextRune()
api.Accept()
AssertEqual(t, "abcdefg", api.String(), "api string end result")
AssertEqual(t, "line 1, column 8", api.Cursor().String(), "api cursor end result")
}
func TestClearRunes(t *testing.T) {
api := tokenize.NewAPI("Laphroaig")
api.NextRune() // Read 'L'
api.Accept() // Add to runes
api.NextRune() // Read 'a'
api.Accept() // Add to runes
api.ClearRunes() // Clear the runes
api.NextRune() // Read 'p'
api.Accept() // Add to runes
api.NextRune() // Read 'r'
api.Accept() // Add to runes
AssertEqual(t, "ph", api.String(), "api string end result")
}
func TestMergeScenariosForTokens(t *testing.T) {
api := tokenize.NewAPI("")
token1 := tokenize.Token{Value: 1}
token2 := tokenize.Token{Value: 2}
token3 := tokenize.Token{Value: 3}
token4 := tokenize.Token{Value: 4}
api.SetTokens(token1)
tokens := api.Tokens()
AssertEqual(t, 1, len(tokens), "Tokens 1")
child := api.Fork()
tokens = api.Tokens()
AssertEqual(t, 0, len(tokens), "Tokens 2")
api.AddTokens(token2)
// Here we can merge by expanding the token slice on the parent,
// because the end of the parent slice and the start of the child
// slice align.
api.Merge(child)
api.Dispose(child)
tokens = api.Tokens()
AssertEqual(t, 2, len(tokens), "Tokens 3")
child = api.Fork()
api.AddTokens(token3)
api.Reset()
api.AddTokens(token4)
// Here the merge means that token4 will be copied to the end of
// the token slice of the parent, since there's a gap at the place
// where token3 used to be.
api.Merge(child)
api.Dispose(child)
tokens = api.Tokens()
AssertEqual(t, 3, len(tokens), "Tokens 4")
AssertEqual(t, 1, api.TokenValue(0).(int), "Tokens 4, value 0")
AssertEqual(t, 2, api.TokenValue(1).(int), "Tokens 4, value 1")
AssertEqual(t, 4, api.TokenValue(2).(int), "Tokens 4, value 2")
}


@ -0,0 +1,118 @@
package tokenize2_test
// This file contains some tools that are used for writing tests.
import (
"regexp"
"testing"
tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)
func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
if expected != actual {
t.Errorf(
"Unexpected value for %s:\nexpected: %q\nactual: %q",
forWhat, expected, actual)
}
}
func AssertTrue(t *testing.T, b bool, assertion string) {
if !b {
t.Errorf("Assertion %s is false", assertion)
}
}
type PanicT struct {
Function func()
Regexp bool
Expect string
}
func AssertPanics(t *testing.T, testSet []PanicT) {
for _, test := range testSet {
AssertPanic(t, test)
}
}
func AssertPanic(t *testing.T, p PanicT) {
defer func() {
if r := recover(); r != nil {
mismatch := false
if p.Regexp && !regexp.MustCompile(p.Expect).MatchString(r.(string)) {
mismatch = true
}
if !p.Regexp && p.Expect != r.(string) {
mismatch = true
}
if mismatch {
t.Errorf(
"Code did panic, but unexpected panic message received:\nexpected: %q\nactual: %q",
p.Expect, r)
}
} else {
t.Errorf("Function did not panic (expected panic message: %s)", p.Expect)
}
}()
p.Function()
}
type HandlerT struct {
Input string
Handler tokenize.Handler
MustMatch bool
Expected string
}
func AssertHandlers(t *testing.T, testSet []HandlerT) {
for _, test := range testSet {
AssertHandler(t, test)
}
}
func AssertHandler(t *testing.T, test HandlerT) {
result, err := tokenize.New(test.Handler)(test.Input)
if test.MustMatch {
if err != nil {
t.Errorf("Test %q failed with error: %s", test.Input, err)
} else if output := result.String(); output != test.Expected {
t.Errorf("Test %q failed: not expected output:\nexpected: %q\nactual: %q\n", test.Input, test.Expected, output)
}
} else {
if err == nil {
t.Errorf("Test %q failed: should not match, but it did", test.Input)
}
}
}
type TokenMakerT struct {
Input string
Handler tokenize.Handler
Expected []tokenize.Token
}
func AssertTokenMakers(t *testing.T, testSet []TokenMakerT) {
for _, test := range testSet {
AssertTokenMaker(t, test)
}
}
func AssertTokenMaker(t *testing.T, test TokenMakerT) {
result, err := tokenize.New(test.Handler)(test.Input)
if err != nil {
t.Errorf("Test %q failed with error: %s", test.Input, err)
} else {
if len(result.Tokens()) != len(test.Expected) {
t.Errorf("Unexpected number of tokens in output:\nexpected: %d\nactual: %d", len(test.Expected), len(result.Tokens()))
}
for i, expected := range test.Expected {
actual := result.Token(i)
if expected.Type != actual.Type {
t.Errorf("Unexpected Type in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Type, expected.Type, actual.Type, actual.Type)
}
if expected.Value != actual.Value {
t.Errorf("Unexpected Value in result.Tokens[%d]:\nexpected: (%T) %s\nactual: (%T) %s", i, expected.Value, expected.Value, actual.Value, actual.Value)
}
}
}
}

tokenize2/callerinfo.go

@ -0,0 +1,33 @@
package tokenize2
import (
"fmt"
"runtime"
"strings"
)
func callerPanic(name, f string, data ...interface{}) {
filepos := callerBefore(name)
m := fmt.Sprintf(f, data...)
m = strings.Replace(m, "{caller}", filepos, -1)
m = strings.Replace(m, "{name}", name, -1)
panic(m)
}
func callerBefore(name string) string {
found := false
for i := 1; ; i++ {
pc, file, line, ok := runtime.Caller(i)
if found {
return fmt.Sprintf("%s:%d", file, line)
}
if !ok {
return "unknown caller"
}
f := runtime.FuncForPC(pc)
if strings.HasSuffix(f.Name(), "."+name) {
found = true
}
}
}


@ -0,0 +1,35 @@
package tokenize2
import (
"strings"
"testing"
)
func SomeFunc1() {
SomeFunc2()
}
func SomeFunc2() {
SomeFunc3()
}
func SomeFunc3() {
callerPanic("SomeFunc2", "{name} was called from {caller}")
}
func TestCallerPanic(t *testing.T) {
defer func() {
r := recover()
err := r.(string)
if !strings.Contains(err, "SomeFunc2 was called from") || !strings.Contains(err, "callerinfo_test.go:") {
t.Fatalf("Unexpected error message: %s", err)
}
}()
SomeFunc1()
}
func TestCallerBefore_WithFunctionNameNotInStack(t *testing.T) {
caller := callerBefore("NotExistingAtAll")
AssertEqual(t, "unknown caller", caller, "result for name not in stack")
}

tokenize2/cursor.go

@ -0,0 +1,45 @@
package tokenize2
import (
"fmt"
"unicode/utf8"
)
// Cursor represents the position of a cursor in various ways.
type Cursor struct {
Byte int // The cursor offset in bytes
Rune int // The cursor offset in UTF8 runes
Column int // The column at which the cursor is (0-indexed)
Line int // The line at which the cursor is (0-indexed)
}
// String produces a string representation of the cursor position.
func (c Cursor) String() string {
if c.Line == 0 && c.Column == 0 {
return "start of file"
}
return fmt.Sprintf("line %d, column %d", c.Line+1, c.Column+1)
}
// move updates the position of the cursor, based on the provided input string.
// The input string represents the runes that the cursor must be moved over.
// This method will take newlines into account to keep track of line numbers and
// column positions automatically.
func (c *Cursor) move(input string) *Cursor {
for _, r := range input {
c.moveByRune(r)
}
return c
}
func (c *Cursor) moveByRune(r rune) *Cursor {
c.Byte += utf8.RuneLen(r)
c.Rune++
if r == '\n' {
c.Column = 0
c.Line++
} else {
c.Column++
}
return c
}

tokenize2/cursor_test.go

@ -0,0 +1,69 @@
package tokenize2
import (
"fmt"
"testing"
)
func ExampleCursor_move() {
c := Cursor{}
fmt.Printf("after initialization : %s\n", c)
fmt.Printf("after 'some words' : %s\n", c.move("some words"))
fmt.Printf("after '\\n' : %s\n", c.move("\n"))
fmt.Printf("after '\\r\\nskip\\nlines' : %s\n", c.move("\r\nskip\nlines"))
// Output:
// after initialization : start of file
// after 'some words' : line 1, column 11
// after '\n' : line 2, column 1
// after '\r\nskip\nlines' : line 4, column 6
}
func ExampleCursor_String() {
c := Cursor{}
fmt.Println(c.String())
c.move("\nfoobar")
fmt.Println(c.String())
// Output:
// start of file
// line 2, column 7
}
func TestGivenCursor_WhenMoving_CursorIsUpdated(t *testing.T) {
for _, test := range []struct {
name string
input []string
byte int
rune int
line int
column int
}{
{"No input at all", []string{""}, 0, 0, 0, 0},
{"One ASCII char", []string{"a"}, 1, 1, 0, 1},
{"Multiple ASCII chars", []string{"abc"}, 3, 3, 0, 3},
{"One newline", []string{"\n"}, 1, 1, 1, 0},
{"Carriage return", []string{"\r\r\r"}, 3, 3, 0, 3},
{"One UTF8 3 byte char", []string{"⌘"}, 3, 1, 0, 1},
{"Mixture", []string{"Hello\n\npretty\nW⌘O⌘R⌘L⌘D"}, 31, 23, 3, 9},
{"Multiple calls", []string{"hello", "world"}, 10, 10, 0, 10},
} {
c := Cursor{}
for _, s := range test.input {
c.move(s)
}
if c.Byte != test.byte {
t.Errorf("[%s] Unexpected byte offset %d (expected %d)", test.name, c.Byte, test.byte)
}
if c.Rune != test.rune {
t.Errorf("[%s] Unexpected rune offset %d (expected %d)", test.name, c.Rune, test.rune)
}
if c.Line != test.line {
t.Errorf("[%s] Unexpected line offset %d (expected %d)", test.name, c.Line, test.line)
}
if c.Column != test.column {
t.Errorf("[%s] Unexpected column offset %d (expected %d)", test.name, c.Column, test.column)
}
}
}

tokenize2/handler.go

@ -0,0 +1,53 @@
package tokenize2
// Handler is the function type that is involved in turning a low level
// stream of UTF8 runes into lexical tokens. Its purpose is to check if input
// data matches some kind of pattern and to report back the results.
//
// A Handler function gets an API as its input and returns a boolean to
// indicate whether or not it found a match on the input. The API is used
// for retrieving input data to match against and for reporting back results.
type Handler func(t *API) bool
// Match is syntactic sugar that allows you to write a construction like
// New(handler)(input) as handler.Match(input).
func (handler Handler) Match(input interface{}) (*API, error) {
tokenizer := New(handler)
return tokenizer(input)
}
// Or is syntactic sugar that allows you to write a construction like
// MatchAny(tokenHandler1, tokenHandler2) as tokenHandler1.Or(tokenHandler2).
func (handler Handler) Or(otherHandler Handler) Handler {
return MatchAny(handler, otherHandler)
}
// Times is syntactic sugar that allows you to write a construction like
// MatchRep(3, handler) as handler.Times(3).
func (handler Handler) Times(n int) Handler {
return MatchRep(n, handler)
}
// Then is syntactic sugar that allows you to write a construction like
// MatchSeq(handler1, handler2, handler3) as handler1.Then(handler2).Then(handler3).
func (handler Handler) Then(otherHandler Handler) Handler {
return MatchSeq(handler, otherHandler)
}
// SeparatedBy is syntactic sugar that allows you to write a construction like
// MatchSeparated(separator, handler) as handler.SeparatedBy(separator).
func (handler Handler) SeparatedBy(separator Handler) Handler {
return MatchSeparated(separator, handler)
}
// Optional is syntactic sugar that allows you to write a construction like
// MatchOptional(handler) as handler.Optional().
func (handler Handler) Optional() Handler {
return MatchOptional(handler)
}
// Except is syntactic sugar that allows you to write a construction like
// MatchExcept(handler, exceptHandler) as handler.Except(exceptHandler).
func (handler Handler) Except(exceptHandler Handler) Handler {
return MatchExcept(handler, exceptHandler)
}

tokenize2/handler_test.go

@ -0,0 +1,101 @@
package tokenize2_test
import (
"fmt"
"testing"
tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)
func TestSyntacticSugar(t *testing.T) {
var a = tokenize.A
AssertHandlers(t, []HandlerT{
{"aaaaaa", a.Rune('a').Times(4), true, "aaaa"},
{"ababab", a.Rune('a').Or(a.Rune('b')).Times(4), true, "abab"},
{"ababab", a.Rune('a').Then(a.Rune('b')), true, "ab"},
{"bababa", a.Rune('a').Then(a.Rune('b')), false, ""},
{"cccccc", a.Rune('c').Optional(), true, "c"},
{"dddddd", a.Rune('c').Optional(), true, ""},
{"a,b,c,d", a.ASCII.SeparatedBy(a.Comma), true, "a,b,c,d"},
{"a, b, c, d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space)), true, "a, b, c, d"},
{"a, b,c,d", a.ASCII.SeparatedBy(a.Comma.Then(a.Space.Optional())), true, "a, b,c,d"},
{"a, b, c, d", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma.Then(a.Space.Optional()))), true, "a, b, c, d"},
{"a,b ,c, d|", a.ASCII.SeparatedBy(a.Space.Optional().Then(a.Comma).Then(a.Space.Optional())), true, "a,b ,c, d"},
})
}
func ExampleHandler_Times() {
c, a := tokenize.C, tokenize.A
phoneNumber := c.Seq(a.Rune('0'), a.Digit.Times(9))
fmt.Println(phoneNumber.Match("0201234567"))
// Output:
// 0201234567 <nil>
}
func ExampleHandler_Then() {
c, a := tokenize.C, tokenize.A
phoneNumber := a.Rune('0').Then(c.Repeated(9, a.Digit))
fmt.Println(phoneNumber.Match("0208888888"))
// Output:
// 0208888888 <nil>
}
func ExampleHandler_Or() {
c, a := tokenize.C, tokenize.A
phoneNumber := c.Seq(a.Str("00").Or(a.Plus), a.Str("31"), a.DigitNotZero, c.Repeated(8, a.Digit))
fmt.Println(phoneNumber.Match("+31209876543"))
fmt.Println(phoneNumber.Match("0031209876543"))
fmt.Println(phoneNumber.Match("0031020991234"))
fmt.Println(phoneNumber.Match("0031201234"))
// Output:
// +31209876543 <nil>
// 0031209876543 <nil>
// <nil> mismatch at start of file
// <nil> mismatch at start of file
}
func ExampleHandler_SeparatedBy() {
a, t := tokenize.A, tokenize.T
csv := t.Int("number", a.Digits).SeparatedBy(a.Comma)
r, _ := csv.Match("123,456,7,8,9")
for i, token := range r.Tokens() {
fmt.Printf("[%d] %v\n", i, token)
}
// Output:
// [0] number((int)123)
// [1] number((int)456)
// [2] number((int)7)
// [3] number((int)8)
// [4] number((int)9)
}
func ExampleHandler_Optional() {
c, a := tokenize.C, tokenize.A
spanish := c.Seq(
a.Rune('¿').Optional(),
c.OneOrMore(a.AnyRune.Except(a.Question)),
a.Rune('?').Optional())
fmt.Println(spanish.Match("¿Habla español María?"))
fmt.Println(spanish.Match("Sí, María habla español."))
// Output:
// ¿Habla español María? <nil>
// Sí, María habla español. <nil>
}
func ExampleHandler_Match() {
r, err := tokenize.A.IPv4.Match("001.002.003.004")
fmt.Println(r, err)
r, err = tokenize.A.IPv4.Match("1.2.3")
fmt.Println(r, err)
// Output:
// 1.2.3.4 <nil>
// <nil> mismatch at start of file
}

File diff suppressed because it is too large


@ -0,0 +1,445 @@
package tokenize2_test
import (
"fmt"
"testing"
tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)
func TestCombinatorsTempDebug(t *testing.T) {
var a = tokenize.A
AssertHandlers(t, []HandlerT{
// {"024", a.IPv4CIDRMask, true, "24"},
// {"024", a.Octet, true, "24"},
{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
})
}
func TestCombinators(t *testing.T) {
var c, a, m = tokenize.C, tokenize.A, tokenize.M
AssertHandlers(t, []HandlerT{
{"abc not", c.Not(a.Rune('b')), true, "a"},
{"bcd not", c.Not(a.Rune('b')), false, ""},
{"1010 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), true, "1"},
{"2020 not", c.Not(c.Seq(a.Rune('2'), a.Rune('0'))), false, ""},
{"abc any", c.Any(a.Rune('a'), a.Rune('b')), true, "a"},
{"bcd any", c.Any(a.Rune('a'), a.Rune('b')), true, "b"},
{"cde any", c.Any(a.Rune('a'), a.Rune('b')), false, ""},
{"ababc repeated", c.Repeated(4, a.Runes('a', 'b')), true, "abab"},
{"ababc repeated", c.Repeated(5, a.Runes('a', 'b')), false, ""},
{"", c.Min(0, a.Rune('a')), true, ""},
{"a", c.Min(0, a.Rune('a')), true, "a"},
{"aaaaa", c.Min(4, a.Rune('a')), true, "aaaaa"},
{"aaaaa", c.Min(5, a.Rune('a')), true, "aaaaa"},
{"aaaaa", c.Min(6, a.Rune('a')), false, ""},
{"", c.Max(4, a.Rune('b')), true, ""},
{"X", c.Max(4, a.Rune('b')), true, ""},
{"bbbbbX", c.Max(4, a.Rune('b')), true, "bbbb"},
{"bbbbbX", c.Max(5, a.Rune('b')), true, "bbbbb"},
{"bbbbbX", c.Max(6, a.Rune('b')), true, "bbbbb"},
{"", c.MinMax(0, 0, a.Rune('c')), true, ""},
{"X", c.MinMax(0, 0, a.Rune('c')), true, ""},
{"cccc", c.MinMax(0, 5, a.Rune('c')), true, "cccc"},
{"ccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
{"cccccc", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
{"cccccX", c.MinMax(0, 0, a.Rune('c')), true, ""},
{"cccccX", c.MinMax(0, 1, a.Rune('c')), true, "c"},
{"cccccX", c.MinMax(0, 5, a.Rune('c')), true, "ccccc"},
{"cccccX", c.MinMax(0, 6, a.Rune('c')), true, "ccccc"},
{"cccccX", c.MinMax(1, 1, a.Rune('c')), true, "c"},
{"", c.MinMax(1, 1, a.Rune('c')), false, ""},
{"X", c.MinMax(1, 1, a.Rune('c')), false, ""},
{"cccccX", c.MinMax(1, 3, a.Rune('c')), true, "ccc"},
{"cccccX", c.MinMax(1, 6, a.Rune('c')), true, "ccccc"},
{"cccccX", c.MinMax(3, 4, a.Rune('c')), true, "cccc"},
{"", c.OneOrMore(a.Rune('d')), false, ""},
{"X", c.OneOrMore(a.Rune('d')), false, ""},
{"dX", c.OneOrMore(a.Rune('d')), true, "d"},
{"dddddX", c.OneOrMore(a.Rune('d')), true, "ddddd"},
{"", c.ZeroOrMore(a.Rune('e')), true, ""},
{"X", c.ZeroOrMore(a.Rune('e')), true, ""},
{"eX", c.ZeroOrMore(a.Rune('e')), true, "e"},
{"eeeeeX", c.ZeroOrMore(a.Rune('e')), true, "eeeee"},
{"HI!", c.Seq(a.Rune('H'), a.Rune('I'), a.Rune('!')), true, "HI!"},
{"Hello, world!X", c.Seq(a.Str("Hello"), a.Comma, a.Space, a.Str("world"), a.Excl), true, "Hello, world!"},
{"101010123", c.OneOrMore(c.Seq(a.Rune('1'), a.Rune('0'))), true, "101010"},
{"", c.Optional(c.OneOrMore(a.Rune('f'))), true, ""},
{"ghijkl", c.Optional(a.Rune('h')), true, ""},
{"ghijkl", c.Optional(a.Rune('g')), true, "g"},
{"fffffX", c.Optional(c.OneOrMore(a.Rune('f'))), true, "fffff"},
{"1,2,3,b,c", c.Separated(a.Comma, a.Digit), true, "1,2,3"},
{`\x9a\x01\xF0\xfCAndSomeMoreStuff`, c.OneOrMore(c.Seq(a.Backslash, a.Rune('x'), c.Repeated(2, a.HexDigit))), true, `\x9a\x01\xF0\xfC`},
{" ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, ""},
{" ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, ""},
{" ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, ""},
})
}
func TestCombinatorPanics(t *testing.T) {
var c, a = tokenize.C, tokenize.A
AssertPanics(t, []PanicT{
{func() { a.RuneRange('z', 'a') }, true,
`Handler: MatchRuneRange definition error at /.*/handlers_builtin_test\.go:\d+: start 'z' must not be < end 'a'`},
{func() { c.MinMax(-1, 1, a.Space) }, true,
`Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`},
{func() { c.MinMax(1, -1, a.Space) }, true,
`Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`},
{func() { c.MinMax(10, 5, a.Space) }, true,
`Handler: MatchMinMax definition error at /.*/handlers_builtin_test\.go:\d+: max 5 must not be < min 10`},
{func() { c.Min(-10, a.Space) }, true,
`Handler: MatchMin definition error at /.*/handlers_builtin_test\.go:\d+: min must be >= 0`},
{func() { c.Max(-42, a.Space) }, true,
`Handler: MatchMax definition error at /.*/handlers_builtin_test\.go:\d+: max must be >= 0`},
{func() { a.IntegerBetween(10, -10) }, true,
`Handler: MatchIntegerBetween definition error at /.*/handlers_builtin_test.go:\d+: max -10 must not be < min 10`},
})
}
func TestAtoms(t *testing.T) {
var a = tokenize.A
AssertHandlers(t, []HandlerT{
{"dd", a.RuneRange('b', 'e'), true, "d"},
{"ee", a.RuneRange('b', 'e'), true, "e"},
{"ff", a.RuneRange('b', 'e'), false, ""},
{"Hello, world!", a.Str("Hello"), true, "Hello"},
{"HellÖ, world!", a.StrNoCase("hellö"), true, "HellÖ"},
{"+X", a.Runes('+', '-', '*', '/'), true, "+"},
{"-X", a.Runes('+', '-', '*', '/'), true, "-"},
{"*X", a.Runes('+', '-', '*', '/'), true, "*"},
{"/X", a.Runes('+', '-', '*', '/'), true, "/"},
{"!X", a.Runes('+', '-', '*', '/'), false, ""},
{"xxx", a.Rune('x'), true, "x"},
{"x ", a.Rune(' '), false, ""},
{"aa", a.RuneRange('b', 'e'), false, ""},
{"bb", a.RuneRange('b', 'e'), true, "b"},
{"cc", a.RuneRange('b', 'e'), true, "c"},
{"", a.EndOfFile, true, ""},
{"⌘", a.AnyRune, true, "⌘"},
{"\xbc with AnyRune", a.AnyRune, true, "<22>"},
{"", a.AnyRune, false, ""},
{"⌘", a.ValidRune, true, "⌘"},
{"\xbc with ValidRune", a.ValidRune, false, "<22>"},
{"", a.ValidRune, false, ""},
{" ", a.Space, true, " "},
{"X", a.Space, false, ""},
{"\t", a.Tab, true, "\t"},
{"\r", a.CR, true, "\r"},
{"\n", a.LF, true, "\n"},
{"!", a.Excl, true, "!"},
{"\"", a.DoubleQuote, true, "\""},
{"#", a.Hash, true, "#"},
{"$", a.Dollar, true, "$"},
{"%", a.Percent, true, "%"},
{"&", a.Amp, true, "&"},
{"'", a.SingleQuote, true, "'"},
{"(", a.LeftParen, true, "("},
{"(", a.RoundOpen, true, "("},
{")", a.RightParen, true, ")"},
{")", a.RoundClose, true, ")"},
{"*", a.Asterisk, true, "*"},
{"*", a.Multiply, true, "*"},
{"+", a.Plus, true, "+"},
{"+", a.Add, true, "+"},
{",", a.Comma, true, ","},
{"-", a.Minus, true, "-"},
{"-", a.Subtract, true, "-"},
{".", a.Dot, true, "."},
{"/", a.Slash, true, "/"},
{"/", a.Divide, true, "/"},
{":", a.Colon, true, ":"},
{";", a.Semicolon, true, ";"},
{"<", a.AngleOpen, true, "<"},
{"<", a.LessThan, true, "<"},
{"=", a.Equal, true, "="},
{">", a.AngleClose, true, ">"},
{">", a.GreaterThan, true, ">"},
{"?", a.Question, true, "?"},
{"@", a.At, true, "@"},
{"[", a.SquareOpen, true, "["},
{"\\", a.Backslash, true, "\\"},
{"]", a.SquareClose, true, "]"},
{"^", a.Caret, true, "^"},
{"_", a.Underscore, true, "_"},
{"`", a.Backquote, true, "`"},
{"{", a.CurlyOpen, true, "{"},
{"|", a.Pipe, true, "|"},
{"}", a.CurlyClose, true, "}"},
{"~", a.Tilde, true, "~"},
{"\t \t \r\n", a.Blank, true, "\t"},
{" \t \t \r\n", a.Blanks, true, " \t \t "},
{"xxx", a.Whitespace, false, ""},
{" ", a.Whitespace, true, " "},
{"\t", a.Whitespace, true, "\t"},
{"\n", a.Whitespace, true, "\n"},
{"\r\n", a.Whitespace, true, "\r\n"},
{" \t\r\n \n \t\t\r\n ", a.Whitespace, true, " \t\r\n \n \t\t\r\n "},
{"xxx", a.UnicodeSpace, false, ""},
{" \t\r\n \r\v\f ", a.UnicodeSpace, true, " \t\r\n \r\v\f "},
{"", a.EndOfLine, true, ""},
{"\r\n", a.EndOfLine, true, "\r\n"},
{"\n", a.EndOfLine, true, "\n"},
{"0", a.Digit, true, "0"},
{"1", a.Digit, true, "1"},
{"2", a.Digit, true, "2"},
{"3", a.Digit, true, "3"},
{"4", a.Digit, true, "4"},
{"5", a.Digit, true, "5"},
{"6", a.Digit, true, "6"},
{"7", a.Digit, true, "7"},
{"8", a.Digit, true, "8"},
{"9", a.Digit, true, "9"},
{"X", a.Digit, false, ""},
{"a", a.ASCIILower, true, "a"},
{"z", a.ASCIILower, true, "z"},
{"A", a.ASCIILower, false, ""},
{"Z", a.ASCIILower, false, ""},
{"A", a.ASCIIUpper, true, "A"},
{"Z", a.ASCIIUpper, true, "Z"},
{"a", a.ASCIIUpper, false, ""},
{"z", a.ASCIIUpper, false, ""},
{"1", a.Letter, false, ""},
{"a", a.Letter, true, "a"},
{"Ø", a.Letter, true, "Ø"},
{"Ë", a.Lower, false, ""},
{"ë", a.Lower, true, "ë"},
{"ä", a.Upper, false, "ä"},
{"Ä", a.Upper, true, "Ä"},
{"0", a.HexDigit, true, "0"},
{"9", a.HexDigit, true, "9"},
{"a", a.HexDigit, true, "a"},
{"f", a.HexDigit, true, "f"},
{"A", a.HexDigit, true, "A"},
{"F", a.HexDigit, true, "F"},
{"g", a.HexDigit, false, "g"},
{"G", a.HexDigit, false, "G"},
{"0", a.Integer, true, "0"},
{"09", a.Integer, true, "0"}, // following Go: 09 is invalid octal, so only 0 is valid for the integer
{"1", a.Integer, true, "1"},
{"-10X", a.Integer, false, ""},
{"+10X", a.Integer, false, ""},
{"-10X", a.Signed(a.Integer), true, "-10"},
{"+10X", a.Signed(a.Integer), true, "+10"},
{"+10.1X", a.Signed(a.Integer), true, "+10"},
{"0X", a.Float, true, "0"},
{"0X", a.Float, true, "0"},
{"1X", a.Float, true, "1"},
{"1.", a.Float, true, "1"}, // incomplete float, so only the 1 is picked up
{"123.321X", a.Float, true, "123.321"},
{"-3.14X", a.Float, false, ""},
{"-3.14X", a.Signed(a.Float), true, "-3.14"},
{"-003.0014X", a.Signed(a.Float), true, "-003.0014"},
{"-11", a.IntegerBetween(-10, 10), false, "0"},
{"-10", a.IntegerBetween(-10, 10), true, "-10"},
{"0", a.IntegerBetween(-10, 10), true, "0"},
{"10", a.IntegerBetween(-10, 10), true, "10"},
{"11", a.IntegerBetween(0, 10), false, ""},
})
}
func TestIPv4Atoms(t *testing.T) {
var a = tokenize.A
AssertHandlers(t, []HandlerT{
// Not normalized octet.
{"0X", tokenize.MatchOctet(false), true, "0"},
{"00X", tokenize.MatchOctet(false), true, "00"},
{"000X", tokenize.MatchOctet(false), true, "000"},
{"10X", tokenize.MatchOctet(false), true, "10"},
{"010X", tokenize.MatchOctet(false), true, "010"},
{"255123", tokenize.MatchOctet(false), true, "255"},
{"256123", tokenize.MatchOctet(false), false, ""},
{"300", tokenize.MatchOctet(false), false, ""},
// Normalized octet.
{"0X", a.Octet, true, "0"},
{"00X", a.Octet, true, "0"},
{"000X", a.Octet, true, "0"},
{"10X", a.Octet, true, "10"},
{"010X", a.Octet, true, "10"},
{"255123", a.Octet, true, "255"},
{"256123", a.Octet, false, ""},
{"300", a.Octet, false, ""},
// IPv4 address.
{"0.0.0.0", a.IPv4, true, "0.0.0.0"},
{"10.20.30.40", a.IPv4, true, "10.20.30.40"},
{"010.020.003.004", a.IPv4, true, "10.20.3.4"},
{"255.255.255.255", a.IPv4, true, "255.255.255.255"},
{"256.255.255.255", a.IPv4, false, ""},
// IPv4 CIDR netmask.
{"0", a.IPv4CIDRMask, true, "0"},
{"00", a.IPv4CIDRMask, true, "0"},
{"000", a.IPv4CIDRMask, true, "0"},
{"32", a.IPv4CIDRMask, true, "32"},
{"032", a.IPv4CIDRMask, true, "32"},
{"33", a.IPv4CIDRMask, false, ""},
// IPv4 netmask in dotted quad format.
{"0.0.0.0", a.IPv4Netmask, true, "0.0.0.0"},
{"255.255.128.0", a.IPv4Netmask, true, "255.255.128.0"},
{"255.255.255.255", a.IPv4Netmask, true, "255.255.255.255"},
{"255.255.132.0", a.IPv4Netmask, false, ""}, // not a canonical netmask (1-bits followed by 0-bits)
// IPv4 address + CIDR or dotted quad netmask.
{"192.168.6.123", a.IPv4Net, false, ""},
{"192.168.6.123/024", a.IPv4Net, true, "192.168.6.123/24"},
{"192.168.6.123/255.255.255.0", a.IPv4Net, true, "192.168.6.123/24"},
{"10.0.0.10/192.0.0.0", a.IPv4Net, true, "10.0.0.10/2"},
{"10.0.0.10/193.0.0.0", a.IPv4Net, false, ""}, // invalid netmask and 193 is also invalid cidr
{"010.000.000.010/16.000.000.000", a.IPv4Net, true, "10.0.0.10/16"}, // invalid netmask, but 16 cidr is ok, remainder input = ".0.0.0"
})
}
func TestIPv6Atoms(t *testing.T) {
var a = tokenize.A
AssertHandlers(t, []HandlerT{
{"", a.IPv6, false, ""},
{"::", a.IPv6, true, "::"},
{"1::", a.IPv6, true, "1::"},
{"1::1", a.IPv6, true, "1::1"},
{"::1", a.IPv6, true, "::1"},
{"1:2:3:4:5:6:7::", a.IPv6, false, ""},
{"::1:2:3:4:5:6:7:8:9", a.IPv6, true, "::1:2:3:4:5:6"},
{"1:2:3:4::5:6:7:8:9", a.IPv6, true, "1:2:3:4::5:6"},
{"a:b::ffff:0:1111", a.IPv6, true, "a:b::ffff:0:1111"},
{"000a:000b:0000:000:00:ffff:0000:1111", a.IPv6, true, "a:b::ffff:0:1111"},
{"000a:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "a::1:0:0:ffff:1111"},
{"0000:0000:0000:001:00:0:ffff:1111", a.IPv6, true, "::1:0:0:ffff:1111"},
{"aaaa:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, true, "aaaa:bbbb:cccc:dddd:eeee:ffff:0:1111"},
{"gggg:bbbb:cccc:dddd:eeee:ffff:0000:1111", a.IPv6, false, ""},
{"ffff::gggg:eeee:ffff:0000:1111", a.IPv6, true, "ffff::"},
{"0", a.IPv6CIDRMask, true, "0"},
{"128", a.IPv6CIDRMask, true, "128"},
{"129", a.IPv6CIDRMask, false, ""},
{"::1/128", a.IPv6Net, true, "::1/128"},
{"::1/129", a.IPv6Net, false, ""},
{"1.1.1.1/24", a.IPv6Net, false, ""},
{"ffff:0:0:0::1010/0", a.IPv6Net, true, "ffff::1010/0"},
{"fe80:0:0:0:0216:3eff:fe96:0002/64", a.IPv6Net, true, "fe80::216:3eff:fe96:2/64"},
})
}
func TestModifiers(t *testing.T) {
var c, a, m = tokenize.C, tokenize.A, tokenize.M
AssertHandlers(t, []HandlerT{
{"--cool", c.Seq(m.Drop(c.OneOrMore(a.Minus)), a.Str("cool")), true, "cool"},
{"12345", c.Seq(a.Digit, m.Drop(a.Digit), a.Digit, m.Drop(a.Digit), a.Digit), true, "135"},
{" trim ", m.Trim(c.OneOrMore(a.AnyRune), " "), true, "trim"},
{" \t trim \t ", m.Trim(c.OneOrMore(a.AnyRune), " \t"), true, "trim"},
{" trim ", m.TrimLeft(c.OneOrMore(a.AnyRune), " "), true, "trim "},
{" trim ", m.TrimRight(c.OneOrMore(a.AnyRune), " "), true, " trim"},
{" \t trim \t ", m.TrimRight(c.OneOrMore(a.AnyRune), " \t"), true, " \t trim"},
{"dirtyword", m.Replace(c.OneOrMore(a.AnyRune), "*******"), true, "*******"},
{"abcdefghijk", m.ByCallback(a.Str("abc"), func(s string) string { return "X" }), true, "X"},
{"NoTaLlUpPeR", m.ToUpper(a.StrNoCase("notallUPPER")), true, "NOTALLUPPER"},
{"NoTaLlLoWeR", m.ToLower(a.StrNoCase("NOTALLlower")), true, "notalllower"},
})
}
// When a TokenMaker encounters an error, this is considered a programmer error.
// A TokenMaker should not be called, unless the input is already validated to
// follow the correct pattern. Therefore, tokenmakers will panic when the
// input cannot be processed successfully.
func TestTokenMakerErrorHandling(t *testing.T) {
var a, tok = tokenize.A, tokenize.T
invalid := tok.Boolean("BOOL", a.Str("no")) // not valid for strconv.ParseBool()
tokenizer := tokenize.New(invalid)
AssertPanic(t, PanicT{
func() { tokenizer("no") }, false,
`boolean token invalid (strconv.ParseBool: parsing "no": invalid syntax)`,
})
}
func TestTokenMakers(t *testing.T) {
var c, a, tok = tokenize.C, tokenize.A, tokenize.T
AssertTokenMakers(t, []TokenMakerT{
{`empty token`, tok.Str("A", c.ZeroOrMore(a.Digit)),
[]tokenize.Token{{Type: "A", Value: ""}}},
{`Ѝюج literal \string`, tok.Str("B", c.OneOrMore(a.AnyRune)),
[]tokenize.Token{{Type: "B", Value: `Ѝюج literal \string`}}},
{`Ѝюجinterpreted \n string \u2318`, tok.StrInterpreted("C", c.OneOrMore(a.AnyRune)),
[]tokenize.Token{{Type: "C", Value: "Ѝюجinterpreted \n string ⌘"}}},
{"Ø*", tok.Byte("Q", a.AnyRune), []tokenize.Token{{Type: "Q", Value: byte('Ø')}}},
{"ROCKS", c.OneOrMore(tok.Byte("bar", a.ASCII)), []tokenize.Token{
{Type: "bar", Value: byte('R')},
{Type: "bar", Value: byte('O')},
{Type: "bar", Value: byte('C')},
{Type: "bar", Value: byte('K')},
{Type: "bar", Value: byte('S')},
}},
{"Ø*", tok.Rune("P", a.AnyRune), []tokenize.Token{{Type: "P", Value: rune('Ø')}}},
{`2147483647XYZ`, tok.Int("D", a.Integer), []tokenize.Token{{Type: "D", Value: int(2147483647)}}},
{`-2147483647XYZ`, tok.Int("D", a.Signed(a.Integer)), []tokenize.Token{{Type: "D", Value: int(-2147483647)}}},
{`127XYZ`, tok.Int8("E", a.Integer), []tokenize.Token{{Type: "E", Value: int8(127)}}},
{`-127XYZ`, tok.Int8("E", a.Signed(a.Integer)), []tokenize.Token{{Type: "E", Value: int8(-127)}}},
{`32767XYZ`, tok.Int16("F", a.Integer), []tokenize.Token{{Type: "F", Value: int16(32767)}}},
{`-32767XYZ`, tok.Int16("F", a.Signed(a.Integer)), []tokenize.Token{{Type: "F", Value: int16(-32767)}}},
{`2147483647XYZ`, tok.Int32("G", a.Integer), []tokenize.Token{{Type: "G", Value: int32(2147483647)}}},
{`-2147483647XYZ`, tok.Int32("G", a.Signed(a.Integer)), []tokenize.Token{{Type: "G", Value: int32(-2147483647)}}},
{`-9223372036854775807XYZ`, tok.Int64("H", a.Signed(a.Integer)), []tokenize.Token{{Type: "H", Value: int64(-9223372036854775807)}}},
{`4294967295`, tok.Uint("I", a.Integer), []tokenize.Token{{Type: "I", Value: uint(4294967295)}}},
{`255XYZ`, tok.Uint8("J", a.Integer), []tokenize.Token{{Type: "J", Value: uint8(255)}}},
{`65535XYZ`, tok.Uint16("K", a.Integer), []tokenize.Token{{Type: "K", Value: uint16(65535)}}},
{`4294967295XYZ`, tok.Uint32("L", a.Integer), []tokenize.Token{{Type: "L", Value: uint32(4294967295)}}},
{`18446744073709551615XYZ`, tok.Uint64("M", a.Integer), []tokenize.Token{{Type: "M", Value: uint64(18446744073709551615)}}},
{`3.1415=PI`, tok.Float32("N", a.Float), []tokenize.Token{{Type: "N", Value: float32(3.1415)}}},
{`24.19287=PI`, tok.Float64("O", a.Float), []tokenize.Token{{Type: "O", Value: float64(24.19287)}}},
{`1tTtrueTRUETrue`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{
{Type: "P", Value: true},
{Type: "P", Value: true},
{Type: "P", Value: true},
{Type: "P", Value: true},
{Type: "P", Value: true},
{Type: "P", Value: true},
}},
{`0fFfalseFALSEFalse`, c.OneOrMore(tok.Boolean("P", a.Boolean)), []tokenize.Token{
{Type: "P", Value: false},
{Type: "P", Value: false},
{Type: "P", Value: false},
{Type: "P", Value: false},
{Type: "P", Value: false},
{Type: "P", Value: false},
}},
})
}
// I know, this is hell, but that's the whole point for this test :->
func TestCombination(t *testing.T) {
var c, a, m = tokenize.C, tokenize.A, tokenize.M
demonic := c.Seq(
c.Optional(a.SquareOpen),
m.Trim(
c.Seq(
c.Optional(a.Blanks),
c.Repeated(3, a.AngleClose),
m.ByCallback(c.OneOrMore(a.StrNoCase("hello")), func(s string) string {
return fmt.Sprintf("%d", len(s))
}),
m.Replace(c.Separated(a.Comma, c.Optional(a.Blanks)), ", "),
m.ToUpper(c.Min(1, a.ASCIILower)),
m.Drop(a.Excl),
c.Repeated(3, a.AngleOpen),
c.Optional(a.Blanks),
),
" \t",
),
c.Optional(a.SquareClose),
)
AssertHandlers(t, []HandlerT{
{"[ \t >>>Hello, world!<<< ]", demonic, true, "[>>>5, WORLD<<<]"},
{"[ \t >>>Hello, world!<<< ", demonic, true, "[>>>5, WORLD<<<"},
{">>>HellohellO, world!<<< ]", demonic, true, ">>>10, WORLD<<<]"},
{"[ \t >>>HellohellO , , , world!<<< ", demonic, true, "[>>>10, WORLD<<<"},
})
}

tokenize2/token.go

@ -0,0 +1,47 @@
package tokenize2
import (
"fmt"
)
// Token defines a lexical token as produced by tokenize.Handlers.
//
// The Type and Value fields are both optional and can be filled with data
// at will.
//
// The use of the Type field is to let a tokenizer communicate to
// the parser what type of token it's handling.
//
// The use of the Value field is to store any kind of data along with the token.
// One use of this can be found in the built-in token maker handlers like
// T.Int8(), which store an interpreted version of the input string
// in the Value field.
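//
// For example (the values are illustrative):
//
//	t := Token{Type: "NUMBER", Value: 42}
//	fmt.Println(t) // NUMBER((int)42)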
type Token struct {
Type interface{} // optional token type, can be any type that a parser author sees fit
Value interface{} // optional token value, of any type as well
}
func (t Token) String() string {
tokenType := ""
if t.Type != nil {
tokenType = fmt.Sprintf("%v", t.Type)
}
value := ""
if t.Value != nil {
switch t.Value.(type) {
case []*Token:
return fmt.Sprintf("%v%v", tokenType, t.Value)
case string:
value = fmt.Sprintf("%q", t.Value)
case rune:
value = fmt.Sprintf("%v", t.Value)
case bool:
value = fmt.Sprintf("%v", t.Value)
default:
value = fmt.Sprintf("(%T)%v", t.Value, t.Value)
}
}
return fmt.Sprintf("%v(%s)", tokenType, value)
}

tokenize2/tokenize.go

@ -0,0 +1,41 @@
// Package tokenize provides tooling to build a tokenizer in
// parser/combinator-style, used to feed data to the parser.
package tokenize2
import (
"fmt"
)
// Func is the function signature as returned by New: a function that takes
// any supported type of input, executes a tokenizer run and returns the
// resulting API struct (possibly nil) and an error (possibly nil).
type Func func(input interface{}) (*API, error)
// New instantiates a new tokenizer.
//
// The tokenizer is a tokenizing state machine, in which tokenize.Handler
// functions are used to move the state machine forward during tokenizing.
// Using the New function, you can wrap a tokenize.Handler in a simple way,
// making it possible to feed some input to the handler and retrieve the
// tokenizing results.
//
// The tokenHandler argument points the tokenizer to the tokenize.Handler function
// that must be executed at the start of the tokenizing process. From there on
// other tokenize.Handler functions can be invoked recursively to implement the
// tokenizing process.
//
// This function returns a function that can be invoked to run the tokenizer
// against the provided input data. For an overview of allowed inputs, take a
// look at the documentation for parsekit.read.New().
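//
// A minimal sketch of its use (the input string is illustrative):
//
//	tokenizer := New(A.Digits)
//	result, err := tokenizer("12345 and some more")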
func New(tokenHandler Handler) Func {
return func(input interface{}) (*API, error) {
api := NewAPI(input)
ok := tokenHandler(api)
if !ok {
err := fmt.Errorf("mismatch at %s", Cursor{})
return nil, err
}
return api, nil
}
}

tokenize2/tokenizer_test.go

@ -0,0 +1,223 @@
package tokenize2_test
import (
"fmt"
"io"
"strings"
"testing"
"unicode/utf8"
tokenize "git.makaay.nl/mauricem/go-parsekit/tokenize2"
)
// TODO For error handling, it would be really cool if for example the
// 10.0.300.1/24 case would return an actual error stating that
// 300 is not a valid octet for an IPv4 address.
// Biggest thing to take care of here, is that errors should not stop
// a Parser flow (since we might be trying to match different cases in
// sequence), but a Parser flow should optionally be able to make use
// of the actual error.
// The same goes for a Tokenizer, since those can also make use of
// optional matching using tokenize.C.Any(...) for example. If matching
// for Any(IPv4, Digits), the example case should simply end up with 10
// after the IPv4 mismatch.
func ExampleNew() {
// Build the tokenizer for ip/mask.
var c, a, t = tokenize.C, tokenize.A, tokenize.T
ip := t.Str("ip", a.IPv4)
mask := t.Int8("mask", a.IPv4CIDRMask)
cidr := c.Seq(ip, a.Slash, mask)
tokenizer := tokenize.New(cidr)
for _, input := range []string{
"000.000.000.000/000",
"192.168.0.1/24",
"255.255.255.255/32",
"10.0.300.1/24",
"not an IPv4 CIDR",
} {
// Execute returns a Result and an error, which is nil on success.
result, err := tokenizer(input)
if err == nil {
fmt.Printf("Result: %s\n", result.Tokens())
} else {
fmt.Printf("Error: %s\n", err)
}
}
// Output:
// Result: [ip("0.0.0.0") mask((int8)0)]
// Result: [ip("192.168.0.1") mask((int8)24)]
// Result: [ip("255.255.255.255") mask((int8)32)]
// Error: mismatch at start of file
// Error: mismatch at start of file
}
func TestCallingNextRune_ReturnsNextRune(t *testing.T) {
api := makeTokenizeAPI()
r, _ := api.NextRune()
AssertEqual(t, 'T', r, "first rune")
}
func TestInputCanAcceptRunesFromReader(t *testing.T) {
i := makeTokenizeAPI()
i.NextRune()
i.Accept()
i.NextRune()
i.Accept()
i.NextRune()
i.Accept()
AssertEqual(t, "Tes", i.String(), "i.String()")
}
func TestCallingNextRuneTwice_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
i.NextRune()
i.NextRune()
},
Regexp: true,
Expect: `tokenize\.API\.NextRune\(\): NextRune\(\) called at /.*_test\.go:\d+ ` +
`without a prior call to Accept\(\)`,
})
}
func TestCallingAcceptWithoutCallingNextRune_Panics(t *testing.T) {
api := makeTokenizeAPI()
AssertPanic(t, PanicT{
Function: api.Accept,
Regexp: true,
Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*test\.go:\d+ ` +
`without first calling NextRune\(\)`,
})
}
func TestCallingAcceptAfterReadError_Panics(t *testing.T) {
api := tokenize.NewAPI("")
AssertPanic(t, PanicT{
Function: func() {
api.NextRune()
api.Accept()
},
Regexp: true,
Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+` +
`, but the prior call to NextRune\(\) failed`,
})
}
func TestCallingMergeOnTopLevelAPI_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
i.Merge(0)
},
Regexp: true,
Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ on the top-level API`})
}
func TestCallingMergeOnForkParentAPI_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
child := i.Fork()
i.Fork()
i.Merge(child)
},
Regexp: true,
Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` +
`on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`})
}
func TestCallingDisposeOnTopLevelAPI_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
i.Dispose(0)
},
Regexp: true,
Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ on the top-level API`})
}
func TestCallingDisposeOnForkParentAPI_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
child := i.Fork()
i.Fork()
i.Dispose(child)
},
Regexp: true,
Expect: `tokenize\.API\.Dispose\(\): Dispose\(\) called at /.*_test.go:\d+ ` +
`on API stack level 1, but the current stack level is 2 \(forgot to Dispose\(\) a forked child\?\)`})
}
func TestCallingForkOnForkedParentAPI_Panics(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
i.Fork()
g := i.Fork()
i.Fork()
i.Merge(g)
},
Regexp: true,
Expect: `tokenize\.API\.Merge\(\): Merge\(\) called at /.*_test.go:\d+ ` +
`on API stack level 2, but the current stack level is 3 \(forgot to Dispose\(\) a forked child\?\)`})
}
func TestForkingInput_ClearsLastRune(t *testing.T) {
AssertPanic(t, PanicT{
Function: func() {
i := makeTokenizeAPI()
i.NextRune()
i.Fork()
i.Accept()
},
Regexp: true,
Expect: `tokenize\.API\.Accept\(\): Accept\(\) called at /.*_test\.go:\d+ without first calling NextRune\(\)`,
})
}
func TestAccept_UpdatesCursor(t *testing.T) {
i := tokenize.NewAPI(strings.NewReader("input\r\nwith\r\nnewlines"))
AssertEqual(t, "start of file", i.Cursor().String(), "cursor 1")
for j := 0; j < 6; j++ { // read "input\r", cursor ends up at "\n"
i.NextRune()
i.Accept()
}
AssertEqual(t, "line 1, column 7", i.Cursor().String(), "cursor 2")
i.NextRune() // read "\n", cursor ends up at start of new line
i.Accept()
AssertEqual(t, "line 2, column 1", i.Cursor().String(), "cursor 3")
for j := 0; j < 10; j++ { // read "with\r\nnewl", cursor ends up at "i"
i.NextRune()
i.Accept()
}
AssertEqual(t, "line 3, column 5", i.Cursor().String(), "cursor 4")
}
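// cursorSketch is a minimal sketch (not part of the original code) of what the
// cursor from the test above is typically used for: reporting where in the
// input a mismatch occurred, in terms of lines and columns.
func cursorSketch(api *tokenize.API) string {
	return fmt.Sprintf("mismatch at %s", api.Cursor())
}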
func TestWhenCallingNextruneAtEndOfFile_EOFIsReturned(t *testing.T) {
i := tokenize.NewAPI(strings.NewReader("X"))
i.NextRune()
i.Accept()
r, err := i.NextRune()
AssertEqual(t, true, r == utf8.RuneError, "returned rune from NextRune()")
AssertEqual(t, true, err == io.EOF, "returned error from NextRune()")
}
func TestAfterReadingruneAtEndOfFile_EarlierRunesCanStillBeAccessed(t *testing.T) {
i := tokenize.NewAPI(strings.NewReader("X"))
child := i.Fork()
i.NextRune()
i.Accept()
r, err := i.NextRune()
AssertEqual(t, true, r == utf8.RuneError, "returned rune from 2nd NextRune()")
i.Dispose(child) // brings the read offset back to the start
r, err = i.NextRune() // so here we should see the same rune
AssertEqual(t, 'X', r, "returned rune from 3rd NextRune()")
AssertEqual(t, true, err == nil, "returned error from 3rd NextRune()")
}
func makeTokenizeAPI() *tokenize.API {
return tokenize.NewAPI("Testing")
}

View File

@ -0,0 +1,110 @@
package tokenize2
import (
"testing"
)
func TestFork_CreatesForkOfInputAtSameCursorPosition(t *testing.T) {
// Create input, accept the first rune.
i := NewAPI("Testing")
i.NextRune()
i.Accept() // T
AssertEqual(t, "T", i.String(), "accepted rune in input")
// Fork
child := i.Fork()
AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte")
AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset")
AssertEqual(t, 1, i.stackFrame.cursor.Byte, "child cursor.Byte")
AssertEqual(t, 1, i.stackFrame.offset, "child offset")
// Accept two runes via fork.
i.NextRune()
i.Accept() // e
i.NextRune()
i.Accept() // s
AssertEqual(t, "es", i.String(), "result runes in fork")
AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].cursor.Byte, "parent cursor.Byte")
AssertEqual(t, 1, i.stackFrames[i.stackLevel-1].offset, "parent offset")
AssertEqual(t, 3, i.stackFrame.cursor.Byte, "child cursor.Byte")
AssertEqual(t, 3, i.stackFrame.offset, "child offset")
// Merge fork back into parent
i.Merge(child)
i.Dispose(child)
AssertEqual(t, "Tes", i.String(), "result runes in parent Input after Merge()")
AssertEqual(t, 3, i.stackFrame.cursor.Byte, "parent cursor.Byte")
AssertEqual(t, 3, i.stackFrame.offset, "parent offset")
}
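// lookaheadSketch is a minimal sketch (not part of the original code) of the
// fork/merge workflow that the test above exercises: fork the API, try to
// match a single rune, and either merge the child's progress into the parent
// or dispose of the child so the parent is left exactly as it was.
func lookaheadSketch(i *API, wanted rune) bool {
	child := i.Fork()
	r, err := i.NextRune()
	if err != nil || r != wanted {
		i.Dispose(child) // lookahead failed; the parent state is untouched
		return false
	}
	i.Accept()
	i.Merge(child) // lookahead succeeded; move the accepted rune to the parent
	i.Dispose(child)
	return true
}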
func TestGivenForkedChildWhichAcceptedRune_AfterMerging_RuneEndsUpInParentResult(t *testing.T) {
i := NewAPI("Testing")
i.NextRune()
i.Accept()
f1 := i.Fork()
i.NextRune()
i.Accept()
f2 := i.Fork()
i.NextRune()
i.Accept()
AssertEqual(t, "s", i.String(), "f2 String()")
AssertEqual(t, 3, i.stackFrame.offset, "f2.offset A")
i.Merge(f2)
i.Dispose(f2)
AssertEqual(t, "es", i.String(), "f1 String()")
AssertEqual(t, 3, i.stackFrame.offset, "f1.offset A")
i.Merge(f1)
i.Dispose(f1)
AssertEqual(t, "Tes", i.String(), "top-level API String()")
AssertEqual(t, 3, i.stackFrame.offset, "top-level offset")
}
func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *testing.T) {
i := NewAPI("Testing")
r, _ := i.NextRune()
AssertEqual(t, 'T', r, "result from 1st call to NextRune()")
AssertTrue(t, i.lastRune == 'T', "API.lastRune after NextRune() is not 'T'")
AssertTrue(t, i.runeRead, "API.runeRead after NextRune() is not true")
i.Accept()
AssertTrue(t, i.runeRead == false, "API.runeRead after Accept() is not false")
AssertEqual(t, 1, i.stackFrame.offset, "API.stackFrame.offset")
r, _ = i.NextRune()
AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
}
func TestFlushInput(t *testing.T) {
api := NewAPI("cool")
// Flushing without any read data is okay. FlushInput() will return
// false in this case, and nothing else happens.
AssertTrue(t, api.FlushInput() == false, "flush input at start")
api.NextRune()
api.Accept()
api.NextRune()
api.Accept()
AssertTrue(t, api.FlushInput() == true, "flush input after reading some data")
AssertEqual(t, 0, api.stackFrame.offset, "offset after flush input")
AssertTrue(t, api.FlushInput() == false, "flush input after flush input")
// Read offset is now zero, but reading should continue after "co".
api.NextRune()
api.Accept()
api.NextRune()
api.Accept()
AssertEqual(t, "cool", api.String(), "end result")
}
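// flushSketch is a minimal sketch (not part of the original code) of the
// pattern that TestFlushInput exercises: once read data has been accepted,
// FlushInput() drops it from the read buffer and resets the read offset to
// zero, so long inputs do not have to be kept in memory in full. It is safe
// to call repeatedly; it simply returns false when there is nothing to flush.
func flushSketch(api *API) {
	for {
		if _, err := api.NextRune(); err != nil {
			break // stop at EOF or on a read error
		}
		api.Accept()
		api.FlushInput()
	}
}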
func AssertEqual(t *testing.T, expected interface{}, actual interface{}, forWhat string) {
if expected != actual {
t.Errorf(
"Unexpected value for %s:\nexpected: %q\nactual: %q",
forWhat, expected, actual)
}
}
func AssertTrue(t *testing.T, b bool, assertion string) {
if !b {
t.Errorf("Assertion %s is false", assertion)
}
}