Moved Reader into its own package.

2019-06-07 10:55:55 +00:00 · 2019-06-07 10:55:55 +00:00 · 98d2db0374
parent 6d92e1dc68
commit 98d2db0374
9 changed files with 291 additions and 204 deletions
--- a/go.mod
+++ b/go.mod
@ -1,3 +1,5 @@
 module git.makaay.nl/mauricem/go-parsekit
 go 1.12
 require github.com/stretchr/testify v1.3.0
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,7 @@
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
--- a/parseapi.go
+++ b/parseapi.go
@ -172,7 +172,7 @@ func (a *ParseAPIOnAction) Stay() bool {
 func (a *ParseAPIOnAction) flushReader() {
 	if a.tokenAPI.result.offset > 0 {
-		a.tokenAPI.root.reader.flush(a.tokenAPI.root.result.offset)
+		a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset)
 		a.tokenAPI.root.result.offset = 0
 		a.parseAPI.initLoopCheck()
 	}
--- a/reader.go
+++ b/reader.go
@ -1,84 +0,0 @@
 package parsekit
 import (
 	"bufio"
 	"fmt"
 	"io"
 	"unicode/utf8"
 )
 // reader wraps around an io.Reader and provides buffering that allows us to
 // read the same runes over and over again. This is useful for implementing a
 // parser that must be able to do lookahead on the input, returning to the
 // original input position after finishing that lookahead.
 //
 // To minimize memory use, it is also possible to flush the buffer when there
 // is no more need to go back to previously read runes.
 //
 // The reader is used internally by parsekit.TokenAPI.
 type reader struct {
 	bufio        *bufio.Reader // Used for ReadRune()
 	buffer       []rune        // Input buffer, holding runes that were read from input
 	bufferOffset int           // The offset of the buffer, relative to the start of the input
 	bufferLen    int           // Input size, the number of runes in the buffer
 }
 // newReader initializes a new reader struct, wrapped around the provided io.Reader.
 func newReader(r io.Reader) *reader {
 	return &reader{
 		bufio:  bufio.NewReader(r),
 		buffer: []rune{},
 	}
 }
 // runeAt reads the rune at the provided rune offset.
 //
 // This offset is relative to the current starting position of the buffer in
 // the reader. When starting reading, offset 0 will point at the start of the
 // input. After flushing, offset 0 will point at the first rune that was not
 // flushed.
 //
 // The error return value will be nil when reading was successful.
 // When an invalid rune is encountered on the input, the error will be nil,
 // but the rune will be utf8.RuneError.
 //
 // When reading failed, the rune will be utf8.RuneError and the error will be
 // the one reported by the underlying reader. One special read fail is
 // actually a normal situation: end of file reached. In that case, the
 // returned error will be io.EOF.
 func (r *reader) runeAt(offset int) (rune, error) {
 	// Rune at provided offset is not yet available in the input buffer.
 	// Read runes until we have enough runes to satisfy the offset.
 	for r.bufferLen <= offset {
 		readRune, _, err := r.bufio.ReadRune()
 		if err != nil {
 			return utf8.RuneError, err
 		}
 		// Skip a BOM, but only when it is the very first rune of the input.
 		// Both counters being zero means that nothing was consumed yet; a
 		// U+FEFF found later on is regular input and must be kept.
 		if readRune == '\uFEFF' && r.bufferOffset == 0 && r.bufferLen == 0 {
 			r.bufferOffset++
 			continue
 		}
 		r.buffer = append(r.buffer, readRune)
 		r.bufferLen++
 	}
 	return r.buffer[offset], nil
 }
 // flush deletes the provided number of runes from the start of the
 // reader buffer. After flushing the buffer, offset 0 as used by runeAt()
 // will point to the rune that comes after the flushed runes.
 // So what this basically does is turn the reader into a sliding window.
 //
 // flush panics when numberOfRunes is negative or exceeds the number of
 // runes that are currently in the buffer. The reader state is left
 // untouched in that case.
 func (r *reader) flush(numberOfRunes int) {
 	// Validate before touching any state, so a recovered panic does not
 	// leave the offset and length out of sync with the buffer.
 	if numberOfRunes < 0 {
 		panic(fmt.Sprintf(
 			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
 				"must not be negative", numberOfRunes))
 	}
 	if numberOfRunes > r.bufferLen {
 		panic(fmt.Sprintf(
 			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
 				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
 	}
 	r.bufferOffset += numberOfRunes
 	r.bufferLen -= numberOfRunes
 	r.buffer = r.buffer[numberOfRunes:]
 }
--- a/reader/reader.go
+++ b/reader/reader.go
@ -0,0 +1,123 @@
 // Package reader provides a buffered Reader that wraps around an io.Reader.
 //
 // Functionally, it provides an input buffer in the form of a sliding window.
 // Let's say we've got the following input coming up in the io.Reader that is
 // wrapped by the Reader:
 //
 //     |H|e|l|l|o|,| |w|o|r|l|d|!|  <-- runes
 //      0           6           12  <-- rune offset
 //
 // The Reader can now be used to retrieve runes from the input, based on their
 // offset, using RuneAt(offset). Normally these runes will be retrieved in
 // sequence, but that is not a requirement. Let's say we retrieve the rune with
 // offset 7 from the input (the 'w'), then the Reader buffer will be filled with
 // runes from the io.Reader until there are enough runes available to return the
 // rune for offset 7:
 //
 //     |H|e|l|l|o|,| |w|
 //      0             7
 //
 // Using RuneAt, you can retrieve arbitrary runes. If you request one that is
 // in the Reader buffer, then the buffered rune is returned. If you request one
 // that is not in the buffer, then the buffer will be expanded.
 //
 // To make this into a sliding window, the Reader provides the method
 // Flush(numberOfRunes). This method will drop the provided number of runes from
 // the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
 // then the Reader buffer would become:
 //
 //     |l|o|,| |w|
 //      0       4
 //
 // Note that the offset for the first rune 'l' in the buffer is now 0.
 // You can consider the input to be changed in a similar way:
 //
 //     |l|o|,| |w|o|r|l|d|!|
 //      0           6     9
 //
 // So after a flush, the first upcoming rune after the flushed runes
 // will always have index 0.
 package reader
 import (
 	"bufio"
 	"fmt"
 	"io"
 	"unicode/utf8"
 )
 // Reader wraps around an io.Reader and provides buffering that allows us to read
 // the same runes over and over again. This is useful for implementing a parser
 // that must be able to do lookahead on the input, returning to the original
 // input position after finishing that lookahead.
 //
 // To minimize memory use, it is also possible to flush the read buffer when
 // there is no more need to go back to previously read runes.
 //
 // The Reader is used internally by parsekit.TokenAPI.
 type Reader struct {
 	bufio        *bufio.Reader // Used for ReadRune()
 	buffer       []rune        // Input buffer, holding runes that were read from input
 	bufferOffset int           // The offset of the buffer, relative to the start of the input
 	bufferLen    int           // Input size, the number of runes in the buffer
 }
 // New initializes a new Reader struct, wrapped around the provided io.Reader.
 func New(r io.Reader) *Reader {
 	return &Reader{
 		bufio:  bufio.NewReader(r),
 		buffer: []rune{},
 	}
 }
 // RuneAt reads the rune at the provided rune offset.
 //
 // This offset is relative to the current starting position of the buffer in
 // the reader. When starting reading, offset 0 will point at the start of the
 // input. After flushing, offset 0 will point at the first rune that was not
 // flushed.
 //
 // The error return value will be nil when reading was successful.
 // When an invalid rune is encountered on the input, the error will be nil,
 // but the rune will be utf8.RuneError.
 //
 // When reading failed, the rune will be utf8.RuneError and the error will
 // be not nil. One special read fail is actually a normal situation: end
 // of file reached. In that case, the returned error will be io.EOF.
 func (r *Reader) RuneAt(offset int) (rune, error) {
 	// Rune at provided offset is not yet available in the input buffer.
 	// Read runes until we have enough runes to satisfy the offset.
 	for r.bufferLen <= offset {
 		readRune, _, err := r.bufio.ReadRune()
 		if err != nil {
 			return utf8.RuneError, err
 		}
 		// Skip a BOM, but only when it is the very first rune of the input.
 		// Both counters being zero means that nothing was consumed yet; a
 		// U+FEFF found later on is regular input and must be kept.
 		if readRune == '\uFEFF' && r.bufferOffset == 0 && r.bufferLen == 0 {
 			r.bufferOffset++
 			continue
 		}
 		r.buffer = append(r.buffer, readRune)
 		r.bufferLen++
 	}
 	return r.buffer[offset], nil
 }
 // Flush deletes the provided number of runes from the start of the
 // reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
 // will point to the rune that comes after the flushed runes.
 // So what this basically does is turn the Reader into a sliding window.
 //
 // Flush panics when numberOfRunes is negative or exceeds the number of
 // runes that are currently in the buffer. The Reader state is left
 // untouched in that case.
 func (r *Reader) Flush(numberOfRunes int) {
 	// Validate before touching any state, so a recovered panic does not
 	// leave the offset and length out of sync with the buffer.
 	if numberOfRunes < 0 {
 		panic(fmt.Sprintf(
 			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
 				"must not be negative", numberOfRunes))
 	}
 	if numberOfRunes > r.bufferLen {
 		panic(fmt.Sprintf(
 			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
 				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
 	}
 	r.bufferOffset += numberOfRunes
 	r.bufferLen -= numberOfRunes
 	r.buffer = r.buffer[numberOfRunes:]
 }
--- a/reader/reader_test.go
+++ b/reader/reader_test.go
@ -0,0 +1,131 @@
 package reader_test
 import (
 	"fmt"
 	"io"
 	"strings"
 	"testing"
 	"git.makaay.nl/mauricem/go-parsekit/reader"
 	"github.com/stretchr/testify/assert"
 )
 func ExampleNew() {
 	// Any io.Reader can be wrapped; a strings.Reader keeps the example small.
 	input := strings.NewReader("Hello, world!")
 	r := reader.New(input)
 	// Fetch the first rune of the input by its offset.
 	first, _ := r.RuneAt(0)
 	fmt.Printf("%c", first)
 	// Fetch the last rune; this buffers everything in between as well.
 	last, _ := r.RuneAt(12)
 	fmt.Printf("%c", last)
 	// Output:
 	// H!
 }
 // TestReader_RuneAt checks that runes can be requested in arbitrary order.
 func TestReader_RuneAt(t *testing.T) {
 	r := reader.New(strings.NewReader("Hello, world!"))
 	// The offsets go forward to the last rune, back into the buffered part
 	// and then back to the start of the input.
 	var result strings.Builder
 	for _, offset := range []int{0, 12, 7, 0} {
 		c, _ := r.RuneAt(offset)
 		fmt.Fprintf(&result, "%c", c)
 	}
 	assert.Equal(t, "H!wH", result.String())
 }
 // TestReader_RuneAt_endOfFile checks that reading at or beyond the end of
 // the input yields utf8.RuneError (U+FFFD) together with io.EOF.
 func TestReader_RuneAt_endOfFile(t *testing.T) {
 	r := reader.New(strings.NewReader("Hello, world!"))
 	// Offset 13 is the first offset past the 13 runes of the input.
 	rn, err := r.RuneAt(13)
 	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	assert.Equal(t, "'\uFFFD' EOF true", result)
 	// An offset far beyond the end of the input behaves the same.
 	rn, err = r.RuneAt(20)
 	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	assert.Equal(t, "'\uFFFD' EOF true", result)
 }
 // TestReader_RuneAt_invalidRune checks that an invalid UTF-8 byte on the
 // input is returned as utf8.RuneError (U+FFFD) and that reading continues
 // normally with the runes after it.
 func TestReader_RuneAt_invalidRune(t *testing.T) {
 	r := reader.New(strings.NewReader("Hello, \xcdworld!"))
 	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
 	// Offset 7 holds the invalid byte \xcd.
 	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
 	assert.Equal(t, " \uFFFDwo", result, "result")
 }
 func ExampleReader_RuneAt() {
 	input := reader.New(strings.NewReader("Hello, world!"))
 	fmt.Printf("Runes: ")
 	// Walk over the input one offset at a time, until reading fails.
 	offset := 0
 	for {
 		c, err := input.RuneAt(offset)
 		if err != nil {
 			fmt.Printf("\nErr: %s\n", err)
 			break
 		}
 		fmt.Printf("%c", c)
 		offset++
 	}
 	// Output:
 	// Runes: Hello, world!
 	// Err: EOF
 }
 // TestRuneAt_SkipsBOMAtStartOfFile checks that a byte order mark at the
 // start of the input is not returned as a rune.
 func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
 	r := reader.New(strings.NewReader("\uFEFFBommetje!"))
 	// Collect the runes at offsets 0, 1 and 2.
 	var bom strings.Builder
 	for offset := 0; offset < 3; offset++ {
 		c, _ := r.RuneAt(offset)
 		fmt.Fprintf(&bom, "%c", c)
 	}
 	assert.Equal(t, "Bom", bom.String(), "first three runes")
 }
 // TestReader_Flush checks that offsets are relative to the flushed position.
 func TestReader_Flush(t *testing.T) {
 	r := reader.New(strings.NewReader("Hello, world!"))
 	// Requesting offset 7 buffers the first 8 runes of the input: "Hello, w"
 	first, _ := r.RuneAt(7)
 	assert.Equal(t, "w", fmt.Sprintf("%c", first), "first read")
 	// Drop the first 4 runes ("Hell") from the buffer.
 	r.Flush(4)
 	// Offset 0 now refers to what originally was rune offset 4, and reading
 	// continues from there.
 	var second strings.Builder
 	for offset := 0; offset <= 5; offset++ {
 		c, _ := r.RuneAt(offset)
 		fmt.Fprintf(&second, "%c", c)
 	}
 	assert.Equal(t, "o, wor", second.String())
 }
 func ExampleReader_Flush() {
 	r := reader.New(strings.NewReader("dog eat dog!"))
 	// show prints the runes at the provided offsets, in the provided order.
 	show := func(offsets ...int) {
 		for _, offset := range offsets {
 			c, _ := r.RuneAt(offset)
 			fmt.Printf("%c", c)
 		}
 	}
 	// Print the first 4 runes of the input.
 	show(0, 1, 2, 3)
 	// Flush those 4 runes, bringing offset 0 to the start of "eat dog!".
 	r.Flush(4)
 	// Print another 4 runes, shuffled; offsets count from the flushed position.
 	show(1, 2, 0, 3)
 	// Again, flush 4 runes, bringing offset 0 to the start of "dog!".
 	r.Flush(4)
 	// Print runes picked from the remainder.
 	show(2, 1, 1, 0, 3)
 	// Output:
 	// dog ate good!
 }
 // TestGivenNumberOfRunesTooHigh_Flush_Panics checks the panic value that is
 // produced when more runes are flushed than the buffer holds.
 func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
 	r := reader.New(strings.NewReader("Hello, world!"))
 	// Requesting offset 10 buffers the first 11 runes: "Hello, worl".
 	r.RuneAt(10)
 	// Flushing 12 runes asks for one rune more than the buffer holds.
 	flushTooMany := func() { r.Flush(12) }
 	expected := "parsekit.Input.Reader.Flush(): number of runes to flush " +
 		"(12) exceeds size of the buffer (11)"
 	assert.PanicsWithValue(t, expected, flushTooMany)
 }
--- a/reader_test.go
+++ b/reader_test.go
@ -1,94 +0,0 @@
 package parsekit
 import (
 	"fmt"
 	"io"
 	"strings"
 	"testing"
 )
 // NOTE(review): the letter following "Example" is lowercase, so go test
 // presumably does not pick this function up as a runnable example - confirm
 // against the testing package documentation.
 func ExamplenewReader() {
 	r := newReader(strings.NewReader("Hello, world!"))
 	// Fetch the first rune of the input by its offset.
 	first, _ := r.runeAt(0)
 	fmt.Printf("%c", first)
 	// Fetch the last rune; this buffers everything in between as well.
 	last, _ := r.runeAt(12)
 	fmt.Printf("%c", last)
 	// Output:
 	// H!
 }
 // TestReader_runeAt checks that runes can be requested in arbitrary order.
 func TestReader_runeAt(t *testing.T) {
 	r := newReader(strings.NewReader("Hello, world!"))
 	// The offsets go forward to the last rune, back into the buffered part
 	// and then back to the start of the input.
 	var result strings.Builder
 	for _, offset := range []int{0, 12, 7, 0} {
 		c, _ := r.runeAt(offset)
 		fmt.Fprintf(&result, "%c", c)
 	}
 	AssertEqual(t, "H!wH", result.String(), "result")
 }
 // TestReader_runeAt_endOfFile checks that reading at or beyond the end of
 // the input yields utf8.RuneError (U+FFFD) together with io.EOF.
 func TestReader_runeAt_endOfFile(t *testing.T) {
 	in := strings.NewReader("Hello, world!")
 	r := newReader(in)
 	// Offset 13 is the first offset past the 13 runes of the input.
 	rn, err := r.runeAt(13)
 	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	AssertEqual(t, "'\uFFFD' EOF true", result, "result")
 	// An offset far beyond the end of the input behaves the same.
 	rn, err = r.runeAt(20)
 	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
 	AssertEqual(t, "'\uFFFD' EOF true", result, "result")
 }
 // TestReader_runeAt_invalidRune checks that an invalid UTF-8 byte on the
 // input is returned as utf8.RuneError (U+FFFD) and that reading continues
 // normally with the runes after it.
 func TestReader_runeAt_invalidRune(t *testing.T) {
 	in := strings.NewReader("Hello, \xcdworld!")
 	r := newReader(in)
 	at := func(i int) rune { r, _ := r.runeAt(i); return r }
 	// Offset 7 holds the invalid byte \xcd.
 	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
 	AssertEqual(t, " \uFFFDwo", result, "result")
 }
 // TestRuneAt_SkipsBOMAtStartOfFile checks that a byte order mark at the
 // start of the input is not returned as a rune.
 func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
 	r := newReader(strings.NewReader("\uFEFFBommetje!"))
 	// Collect the runes at offsets 0, 1 and 2.
 	var bom strings.Builder
 	for offset := 0; offset < 3; offset++ {
 		c, _ := r.runeAt(offset)
 		fmt.Fprintf(&bom, "%c", c)
 	}
 	AssertEqual(t, "Bom", bom.String(), "first three runes")
 }
 // TestReader_Flush checks that offsets are relative to the flushed position.
 func TestReader_Flush(t *testing.T) {
 	r := newReader(strings.NewReader("Hello, world!"))
 	// Requesting offset 7 buffers the first 8 runes of the input: "Hello, w"
 	first, _ := r.runeAt(7)
 	AssertEqual(t, "w", fmt.Sprintf("%c", first), "first read")
 	// Drop the first 4 runes ("Hell") from the buffer.
 	r.flush(4)
 	// Offset 0 now refers to what originally was rune offset 4, and reading
 	// continues from there.
 	var second strings.Builder
 	for offset := 0; offset <= 5; offset++ {
 		c, _ := r.runeAt(offset)
 		fmt.Fprintf(&second, "%c", c)
 	}
 	AssertEqual(t, "o, wor", second.String(), "second read")
 }
 // TestGivenNumberOfRunesTooHigh_Flush_Panics checks the panic value that is
 // produced when more runes are flushed than the buffer holds.
 func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
 	r := newReader(strings.NewReader("Hello, world!"))
 	// Requesting offset 10 buffers the first 11 runes: "Hello, worl".
 	r.runeAt(10)
 	// Flushing 12 runes asks for one rune more than the buffer holds.
 	flushTooMany := func() { r.flush(12) }
 	expectation := PanicT{
 		Function: flushTooMany,
 		Expect:   "parsekit.Input.Reader.Flush(): number of runes to flush (12) exceeds size of the buffer (11)",
 	}
 	AssertPanic(t, expectation)
 }
--- a/tokenapi.go
+++ b/tokenapi.go
@ -3,11 +3,13 @@ package parsekit
 import (
 	"fmt"
 	"io"
 	"git.makaay.nl/mauricem/go-parsekit/reader"
 )
-// TokenAPI wraps a parsekit.reader and its purpose is to retrieve input data and
+// TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from
-// to report back results. For easy lookahead support, a forking strategy is
+// the reader and to report back tokenizing results. For easy lookahead support,
-// provided.
+// a forking strategy is provided.
 //
 // BASIC OPERATION:
 //
@ -15,19 +17,19 @@ import (
 //
 // When the rune is to be accepted as input, call the method Accept(). The rune
 // is then added to the results of the TokenAPI and the read cursor is moved
-// forward. Runes collected this way can later on be retrieved using for
+// forward.
-// example the method Result().Runes().
+//
 // By invoking NextRune() + Accept() multiple times, the result can be extended
 // with as many runes as needed. Runes collected this way can later on be
 // retrieved using the method
 //
 // It is mandatory to call Accept() after retrieving a rune, before calling
 // NextRune() again. Failing to do so will result in a panic.
 //
-// By invoking NextRune() + Accept() multiple times, the result can be extended
+// Next to adding runes to the result, it is also possible to modify the
-// with as many runes as needed.
+// stored runes or to add lexical Tokens to the result. For all things
-//
+// concerning results, take a look at the TokenHandlerResult struct, which
-// Next to adding runes to the output, it is also possible to modify the
+// can be accessed through the method Result().
 // already collected runes or to produce lexical Tokens. For all things
 // concerning results, take a look at the TokenHandlerResult struct, which can be
 // accessed though the method Result().
 //
 // FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
 //
@ -38,7 +40,7 @@ import (
 //
 // The way in which this is supported, is by forking a TokenAPI struct by
 // calling method Fork(). This will return a forked child TokenAPI, with
-// an empty result buffer, but using the same read cursor position as the
+// empty result data, but using the same read cursor position as the
 // forked parent.
 //
 // After forking, the same interface as described for BASIC OPERATION can be
@ -47,28 +49,30 @@ import (
 // to the parent's results, and to move the read cursor position to that
 // of the child.
 //
-// When the lookahead was unsuccessful or when the results of the forked child
+// When the lookahead was unsuccessful, then the forked child TokenAPI can
-// are not to be used, then the forked child TokenAPI can simply be discarded.
+// simply be discarded. The parent TokenAPI was never modified, so it can
-// The parent TokenAPI was never modified, so it can safely be used as if the
+// safely be used as if the lookahead never happened.
 // lookahead never happened.
 //
-// Note:
+// Opinionized note:
 // Many tokenizers/parsers take a different approach on lookaheads by using
 // peeks and by moving the read cursor position back and forth, or by putting
 // read input back on the input stream. That often leads to code that is
-// efficient, however, in my opinion, not very intuitive to read.
+// efficient, however, in my opinion, not very intuitive to read. It can also
 // be tedious to get the cursor position back at the correct position, which
 // can lead to hard to track bugs. I much prefer this forking method, since
 // no bookkeeping has to be implemented when implementing a parser.
 type TokenAPI struct {
-	reader *reader
+	reader *reader.Reader
 	root   *TokenAPI           // the root TokenAPI
 	parent *TokenAPI           // parent TokenAPI in case this TokenAPI is a fork child
 	child  *TokenAPI           // child TokenAPI in case this TokenAPI is a fork parent
-	result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens)
+	result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
 }
 // NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader.
 func NewTokenAPI(r io.Reader) *TokenAPI {
 	input := &TokenAPI{
-		reader: newReader(r),
+		reader: reader.New(r),
 		result: newTokenHandlerResult(),
 	}
 	input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?".
@ -91,7 +95,7 @@ func (i *TokenAPI) NextRune() (rune, error) {
 	}
 	i.detachChilds()
-	readRune, err := i.reader.runeAt(i.result.offset)
+	readRune, err := i.reader.RuneAt(i.result.offset)
 	i.result.lastRune = &runeInfo{r: readRune, err: err}
 	return readRune, err
 }
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@ -177,7 +177,6 @@ func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *test
 	i.Accept()
 	AssertTrue(t, i.result.lastRune == nil, "TokenAPI.result.lastRune after Accept() is nil")
 	AssertEqual(t, 1, i.result.offset, "TokenAPI.result.offset")
 	AssertEqual(t, 'T', i.reader.buffer[0], "TokenAPI.reader.buffer[0]")
 	r, _ = i.NextRune()
 	AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
 }
@ -188,7 +187,6 @@ func TestCallingMultipleAccepts_FillsInputWithData(t *testing.T) {
 		i.NextRune()
 		i.Accept()
 	}
 	AssertEqual(t, "Testing", string(i.reader.buffer), "reader input buffer")
 	AssertEqual(t, "Testing", i.Result().String(), "i.Result().String()")
 }