Moved Reader into its own package.

2019-06-07 10:55:55 +00:00 · 2019-06-07 10:55:55 +00:00 · 98d2db0374
parent 6d92e1dc68
commit 98d2db0374
9 changed files with 291 additions and 204 deletions
--- a/go.mod
+++ b/go.mod
@ -1,3 +1,5 @@
 module git.makaay.nl/mauricem/go-parsekit

 go 1.12
+
+require github.com/stretchr/testify v1.3.0
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,7 @@
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
--- a/parseapi.go
+++ b/parseapi.go
@ -172,7 +172,7 @@ func (a *ParseAPIOnAction) Stay() bool {

 func (a *ParseAPIOnAction) flushReader() {
 	if a.tokenAPI.result.offset > 0 {
-		a.tokenAPI.root.reader.flush(a.tokenAPI.root.result.offset)
+		a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset)
 		a.tokenAPI.root.result.offset = 0
 		a.parseAPI.initLoopCheck()
 	}
--- a/reader.go
+++ b/reader.go
@ -1,84 +0,0 @@
-package parsekit
-
-import (
-	"bufio"
-	"fmt"
-	"io"
-	"unicode/utf8"
-)
-
-// reader wraps around an io.Reader and provides buffering to allows us to read
-// the same runes over and over again. This is useful for implementing a parser
-// that must be able to do lookahead on the input, returning to the original
-// input position after finishing that lookahead).
-//
-// To minimze memory use, it is also possible to flush the buffer when there is
-// no more need to go back to previously read runes.
-//
-// The reader is used internally by parsekit.TokenAPI.
-type reader struct {
-	bufio        *bufio.Reader // Used for ReadRune()
-	buffer       []rune        // Input buffer, holding runes that were read from input
-	bufferOffset int           // The offset of the buffer, relative to the start of the input
-	bufferLen    int           // Input size, the number of runes in the buffer
-}
-
-// newwReader initializes a new reader struct, wrapped around the provided io.Reader.
-func newReader(r io.Reader) *reader {
-	return &reader{
-		bufio:  bufio.NewReader(r),
-		buffer: []rune{},
-	}
-}
-
-// runeAt reads the rune at the provided rune offset.
-//
-// This offset is relative to the current starting position of the buffer in
-// the reader. When starting reading, offset 0 will point at the start of the
-// input. After flushing, offset 0 will point at the input up to where
-// the flush was done.
-//
-// The error return value will be nil when reading was successful.
-// When an invalid rune is encountered on the input, the error will be nil,
-// but the rune will be utf8.RuneError
-//
-// When reading failed, the rune will be utf8.RuneError. One special read
-// fail is actually a normal situation: end of file reached. In that case,
-// the returned error wille be io.EOF.
-func (r *reader) runeAt(offset int) (rune, error) {
-	// Rune at provided offset is not yet available in the input buffer.
-	// Read runes until we have enough runes to satisfy the offset.
-	for r.bufferLen <= offset {
-		readRune, _, err := r.bufio.ReadRune()
-
-		// Handle errors.
-		if err != nil {
-			return utf8.RuneError, err
-		}
-
-		// Skip BOM.
-		if readRune == '\uFEFF' && r.bufferOffset == 0 {
-			r.bufferOffset++
-			continue
-		}
-
-		r.buffer = append(r.buffer, readRune)
-		r.bufferLen++
-	}
-	return r.buffer[offset], nil
-}
-
-// Flush deletes the provided number of runes from the start of the
-// reader buffer. After flushing the buffer, offset 0 as used by runeAt()
-// will point to the rune that comes after the flushed runes.
-// So what this basically does is turn the Reader into a sliding window.
-func (r *reader) flush(numberOfRunes int) {
-	if numberOfRunes > r.bufferLen {
-		panic(fmt.Sprintf(
-			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
-				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
-	}
-	r.bufferOffset += numberOfRunes
-	r.bufferLen -= numberOfRunes
-	r.buffer = r.buffer[numberOfRunes:]
-}
--- a/reader/reader.go
+++ b/reader/reader.go
@ -0,0 +1,123 @@
+// Package reader provides a buffered Reader that wraps around an io.Reader.
+//
+// Functionally, it provides an input buffer in the form of a sliding window.
+// Let's say we've got the following input coming up in the io.Reader that is
+// wrapped by the Reader:
+//
+//     |H|e|l|l|o|,| |w|o|r|l|d|!|  <-- runes
+//      0           6           12  <-- rune offset
+//
+// The Reader can now be used to retrieve runes from the input, based on their
+// offset, using RuneAt(offset). Normally these runes will be retrieved in
+// sequence, but that is not a requirement. Let's say we retrieve the rune with
+// offset 7 from the input (the 'w'), then the Reader buffer will be filled with
+// runes from the io.Reader until there are enough runes available to return the
+// rune for offset 7:
+//
+//     |H|e|l|l|o|,| |w|
+//      0             7
+//
+// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
+// in the Reader buffer, then the buffered rune is returned. If you request one
+// that is not in the buffer, then the buffer will be expanded.
+//
+// To make this into a sliding window, the Reader provides the method
+// Flush(numberOfRunes). This method will drop the provided number of runes from
+// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
+// then the Reader buffer would become:
+//
+//     |l|o|,| |w|
+//      0       4
+//
+// Note that the offset for the first rune 'l' in the buffer is now 0.
+// You can consider the input to be changed in a similar way:
+//
+//     |l|o|,| |w|o|r|l|d|!|
+//      0           6     9
+//
+// So after a flush, the first upcoming rune after the flushed runes
+// will always have index 0.
+package reader
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"unicode/utf8"
+)
+
+// Reader wraps around an io.Reader and provides buffering that allows us to read
+// the same runes over and over again. This is useful for implementing a parser
+// that must be able to do lookahead on the input, returning to the original
+// input position after finishing that lookahead.
+//
+// To minimize memory use, it is also possible to flush the read buffer when there is
+// no more need to go back to previously read runes.
+//
+// The parsekit.reader.Reader is used internally by parsekit.TokenAPI.
+type Reader struct {
+	bufio        *bufio.Reader // Used for ReadRune()
+	buffer       []rune        // Input buffer, holding runes that were read from input
+	bufferOffset int           // The offset of the buffer, relative to the start of the input
+	bufferLen    int           // Input size, the number of runes in the buffer
+}
+
+// New initializes a new Reader struct, wrapped around the provided io.Reader.
+func New(r io.Reader) *Reader {
+	return &Reader{
+		bufio:  bufio.NewReader(r),
+		buffer: []rune{},
+	}
+}
+
+// RuneAt reads the rune at the provided rune offset.
+//
+// This offset is relative to the current starting position of the buffer in
+// the reader. When starting reading, offset 0 will point at the start of the
+// input. After flushing, offset 0 will point at the input up to where
+// the flush was done.
+//
+// The error return value will be nil when reading was successful.
+// When an invalid rune is encountered on the input, the error will be nil,
+// but the rune will be utf8.RuneError.
+//
+// When reading failed, the rune will be utf8.RuneError and the error will
+// be not nil. One special read fail is actually a normal situation: end
+// of file reached. In that case, the returned error will be io.EOF.
+func (r *Reader) RuneAt(offset int) (rune, error) {
+	// Rune at provided offset is not yet available in the input buffer.
+	// Read runes until we have enough runes to satisfy the offset.
+	for r.bufferLen <= offset {
+		readRune, _, err := r.bufio.ReadRune()
+
+		// Handle errors.
+		if err != nil {
+			return utf8.RuneError, err
+		}
+
+		// Skip BOM.
+		if readRune == '\uFEFF' && r.bufferOffset == 0 {
+			r.bufferOffset++
+			continue
+		}
+
+		r.buffer = append(r.buffer, readRune)
+		r.bufferLen++
+	}
+	return r.buffer[offset], nil
+}
+
+// Flush deletes the provided number of runes from the start of the
+// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
+// will point to the rune that comes after the flushed runes.
+// So what this basically does is turn the Reader into a sliding window.
+func (r *Reader) Flush(numberOfRunes int) {
+	if numberOfRunes > r.bufferLen {
+		panic(fmt.Sprintf(
+			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
+				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
+	}
+	r.bufferOffset += numberOfRunes
+	r.bufferLen -= numberOfRunes
+	r.buffer = r.buffer[numberOfRunes:]
+}
--- a/reader/reader_test.go
+++ b/reader/reader_test.go
@ -0,0 +1,131 @@
+package reader_test
+
+import (
+	"fmt"
+	"io"
+	"strings"
+	"testing"
+
+	"git.makaay.nl/mauricem/go-parsekit/reader"
+	"github.com/stretchr/testify/assert"
+)
+
+func ExampleNew() {
+	r := reader.New(strings.NewReader("Hello, world!"))
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	fmt.Printf("%c", at(0))
+	fmt.Printf("%c", at(12))
+
+	// Output:
+	// H!
+}
+
+func TestReader_RuneAt(t *testing.T) {
+	r := reader.New(strings.NewReader("Hello, world!"))
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	// It is possible to go back and forth while reading the input.
+	result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
+	assert.Equal(t, "H!wH", result)
+}
+
+func TestReader_RuneAt_endOfFile(t *testing.T) {
+	r := reader.New(strings.NewReader("Hello, world!"))
+
+	rn, err := r.RuneAt(13)
+	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
+	assert.Equal(t, "'<27>' EOF true", result)
+
+	rn, err = r.RuneAt(20)
+	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
+	assert.Equal(t, "'<27>' EOF true", result)
+}
+
+func TestReader_RuneAt_invalidRune(t *testing.T) {
+	r := reader.New(strings.NewReader("Hello, \xcdworld!"))
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
+	assert.Equal(t, " <20>wo", result, "result")
+}
+
+func ExampleReader_RuneAt() {
+	reader := reader.New(strings.NewReader("Hello, world!"))
+
+	fmt.Printf("Runes: ")
+	for i := 0; ; i++ {
+		r, err := reader.RuneAt(i)
+		if err != nil {
+			fmt.Printf("\nErr: %s\n", err)
+			break
+		}
+		fmt.Printf("%c", r)
+	}
+
+	// Output:
+	// Runes: Hello, world!
+	// Err: EOF
+}
+
+func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
+	r := reader.New(strings.NewReader("\uFEFFBommetje!"))
+	b, _ := r.RuneAt(0)
+	o, _ := r.RuneAt(1)
+	m, _ := r.RuneAt(2)
+	bom := fmt.Sprintf("%c%c%c", b, o, m)
+	assert.Equal(t, "Bom", bom, "first three runes")
+}
+
+func TestReader_Flush(t *testing.T) {
+	r := reader.New(strings.NewReader("Hello, world!"))
+	at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+	// Fills the buffer with the first 8 runes on the input: "Hello, w"
+	result := fmt.Sprintf("%c", at(7))
+	assert.Equal(t, "w", result, "first read")
+
+	// Now flush the first 4 runes from the buffer (dropping "Hell" from it)
+	r.Flush(4)
+
+	// Rune 0 is now pointing at what originally was rune offset 4.
+	// We can continue reading from there.
+	result = fmt.Sprintf("%c%c%c%c%c%c", at(0), at(1), at(2), at(3), at(4), at(5))
+	assert.Equal(t, "o, wor", result)
+}
+
+func ExampleReader_Flush() {
+	r := reader.New(strings.NewReader("dog eat dog!"))
+	at := func(offset int) rune { c, _ := r.RuneAt(offset); return c }
+
+	// Read from the first 4 runes of the input.
+	fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3))
+
+	// Flush those 4 runes, bringing offset 0 to the start of "eat dog".
+	r.Flush(4)
+
+	// Read another 4 runes, because of the flushing, we start at offset 0.
+	fmt.Printf("%c%c%c%c", at(1), at(2), at(0), at(3))
+
+	// Again, flush 4 runes, bringing offset 0 to the start of "dog!".
+	r.Flush(4)
+
+	// Read from the remaining runes.
+	fmt.Printf("%c%c%c%c%c", at(2), at(1), at(1), at(0), at(3))
+
+	// Output:
+	// dog ate good!
+}
+
+func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
+	r := reader.New(strings.NewReader("Hello, world!"))
+
+	// Fill buffer with "Hello, worl", the first 11 runes.
+	r.RuneAt(10)
+
+	// However, we flush 12 runes, which exceeds the buffer size.
+	assert.PanicsWithValue(t,
+		"parsekit.Input.Reader.Flush(): number of runes to flush "+
+			"(12) exceeds size of the buffer (11)",
+		func() { r.Flush(12) })
+}
--- a/reader_test.go
+++ b/reader_test.go
@ -1,94 +0,0 @@
-package parsekit
-
-import (
-	"fmt"
-	"io"
-	"strings"
-	"testing"
-)
-
-func ExamplenewReader() {
-	in := strings.NewReader("Hello, world!")
-	r := newReader(in)
-	at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-	fmt.Printf("%c", at(0))
-	fmt.Printf("%c", at(12))
-
-	// Output:
-	// H!
-}
-
-func TestReader_runeAt(t *testing.T) {
-	in := strings.NewReader("Hello, world!")
-	r := newReader(in)
-	at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-	// It is possible to go back and forth while reading the input.
-	result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
-	AssertEqual(t, "H!wH", result, "result")
-}
-
-func TestReader_runeAt_endOfFile(t *testing.T) {
-	in := strings.NewReader("Hello, world!")
-	r := newReader(in)
-
-	rn, err := r.runeAt(13)
-	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
-	AssertEqual(t, "'<27>' EOF true", result, "result")
-
-	rn, err = r.runeAt(20)
-	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
-	AssertEqual(t, "'<27>' EOF true", result, "result")
-}
-
-func TestReader_runeAt_invalidRune(t *testing.T) {
-	in := strings.NewReader("Hello, \xcdworld!")
-	r := newReader(in)
-	at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
-	AssertEqual(t, " <20>wo", result, "result")
-}
-
-func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
-	in := strings.NewReader("\uFEFFBommetje!")
-	r := newReader(in)
-	b, _ := r.runeAt(0)
-	o, _ := r.runeAt(1)
-	m, _ := r.runeAt(2)
-	bom := fmt.Sprintf("%c%c%c", b, o, m)
-	AssertEqual(t, "Bom", bom, "first three runes")
-}
-
-func TestReader_Flush(t *testing.T) {
-	in := strings.NewReader("Hello, world!")
-	r := newReader(in)
-	at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-	// Fills the buffer with the first 8 runes on the input: "Hello, w"
-	result := fmt.Sprintf("%c", at(7))
-	AssertEqual(t, "w", result, "first read")
-
-	// Now flush the first 4 runes from the buffer (dropping "Hell" from it)
-	r.flush(4)
-
-	// Rune 0 is now pointing at what originally was rune offset 4.
-	// We can continue reading from there.
-	result = fmt.Sprintf("%c%c%c%c%c%c", at(0), at(1), at(2), at(3), at(4), at(5))
-	AssertEqual(t, "o, wor", result, "second read")
-}
-
-func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
-	in := strings.NewReader("Hello, world!")
-	r := newReader(in)
-
-	// Fill buffer with "Hello, worl", the first 11 runes.
-	r.runeAt(10)
-
-	// However, we flush 12 runes, which exceeds the buffer size.
-	AssertPanic(t, PanicT{
-		Function: func() { r.flush(12) },
-		Expect:   "parsekit.Input.Reader.Flush(): number of runes to flush (12) exceeds size of the buffer (11)",
-	})
-}
--- a/tokenapi.go
+++ b/tokenapi.go
@ -3,11 +3,13 @@ package parsekit
 import (
 	"fmt"
 	"io"
+
+	"git.makaay.nl/mauricem/go-parsekit/reader"
 )

-// TokenAPI wraps a parsekit.reader and its purpose is to retrieve input data and
-// to report back results. For easy lookahead support, a forking strategy is
-// provided.
+// TokenAPI wraps a parsekit.reader and its purpose is to retrieve data from
+// the reader and to report back tokenizing results. For easy lookahead support,
+// a forking strategy is provided.
 //
 // BASIC OPERATION:
 //
@ -15,19 +17,19 @@ import (
 //
 // When the rune is to be accepted as input, call the method Accept(). The rune
 // is then added to the results of the TokenAPI and the read cursor is moved
-// forward. Runes collected this way can later on be retrieved using for
-// example the method Result().Runes().
+// forward.
+//
+// By invoking NextRune() + Accept() multiple times, the result can be extended
+// with as many runes as needed. Runes collected this way can later on be
+// retrieved using the method Result().Runes().
 //
 // It is mandatory to call Accept() after retrieving a rune, before calling
 // NextRune() again. Failing to do so will result in a panic.
 //
-// By invoking NextRune() + Accept() multiple times, the result can be extended
-// with as many runes as needed.
-//
-// Next to adding runes to the output, it is also possible to modify the
-// already collected runes or to produce lexical Tokens. For all things
-// concerning results, take a look at the TokenHandlerResult struct, which can be
-// accessed though the method Result().
+// Next to adding runes to the result, it is also possible to modify the
+// stored runes or to add lexical Tokens to the result. For all things
+// concerning results, take a look at the TokenHandlerResult struct, which
+// can be accessed through the method Result().
 //
 // FORKING OPERATION FOR EASY LOOKEAHEAD SUPPORT:
 //
@ -38,7 +40,7 @@ import (
 //
 // The way in which this is supported, is by forking a TokenAPI struct by
 // calling method Fork(). This will return a forked child TokenAPI, with
-// an empty result buffer, but using the same read cursor position as the
+// empty result data, but using the same read cursor position as the
 // forked parent.
 //
 // After forking, the same interface as described for BASIC OPERATION can be
@ -47,28 +49,30 @@ import (
 // to the parent's results, and to move the read cursor position to that
 // of the child.
 //
-// When the lookahead was unsuccessful or when the results of the forked child
-// are not to be used, then the forked child TokenAPI can simply be discarded.
-// The parent TokenAPI was never modified, so it can safely be used as if the
-// lookahead never happened.
+// When the lookahead was unsuccessful, then the forked child TokenAPI can
+// simply be discarded. The parent TokenAPI was never modified, so it can
+// safely be used as if the lookahead never happened.
 //
-// Note:
+// Opinionated note:
 // Many tokenizers/parsers take a different approach on lookaheads by using
 // peeks and by moving the read cursor position back and forth, or by putting
 // read input back on the input stream. That often leads to code that is
-// efficient, however, in my opinion, not very intuitive to read.
+// efficient, however, in my opinion, not very intuitive to read. It can also
+// be tedious to get the cursor position back at the correct position, which
+// can lead to hard to track bugs. I much prefer this forking method, since
+// no bookkeeping has to be implemented when implementing a parser.
 type TokenAPI struct {
-	reader *reader
+	reader *reader.Reader
 	root   *TokenAPI           // the root TokenAPI
 	parent *TokenAPI           // parent TokenAPI in case this TokenAPI is a fork child
 	child  *TokenAPI           // child TokenAPI in case this TokenAPI is a fork parent
-	result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens)
+	result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
 }

 // NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader.
 func NewTokenAPI(r io.Reader) *TokenAPI {
 	input := &TokenAPI{
-		reader: newReader(r),
+		reader: reader.New(r),
 		result: newTokenHandlerResult(),
 	}
 	input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?".
@ -91,7 +95,7 @@ func (i *TokenAPI) NextRune() (rune, error) {
 	}
 	i.detachChilds()

-	readRune, err := i.reader.runeAt(i.result.offset)
+	readRune, err := i.reader.RuneAt(i.result.offset)
 	i.result.lastRune = &runeInfo{r: readRune, err: err}
 	return readRune, err
 }
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@ -177,7 +177,6 @@ func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *test
 	i.Accept()
 	AssertTrue(t, i.result.lastRune == nil, "TokenAPI.result.lastRune after Accept() is nil")
 	AssertEqual(t, 1, i.result.offset, "TokenAPI.result.offset")
-	AssertEqual(t, 'T', i.reader.buffer[0], "TokenAPI.reader.buffer[0]")
 	r, _ = i.NextRune()
 	AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
 }
@ -188,7 +187,6 @@ func TestCallingMultipleAccepts_FillsInputWithData(t *testing.T) {
 		i.NextRune()
 		i.Accept()
 	}
-	AssertEqual(t, "Testing", string(i.reader.buffer), "reader input buffer")
 	AssertEqual(t, "Testing", i.Result().String(), "i.Result().String()")
 }