Moved Reader into its own package.

This commit is contained in:
Maurice Makaay 2019-06-07 10:55:55 +00:00
parent 6d92e1dc68
commit 98d2db0374
9 changed files with 291 additions and 204 deletions

2
go.mod
View File

@ -1,3 +1,5 @@
module git.makaay.nl/mauricem/go-parsekit
go 1.12
require github.com/stretchr/testify v1.3.0

7
go.sum Normal file
View File

@ -0,0 +1,7 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=

View File

@ -172,7 +172,7 @@ func (a *ParseAPIOnAction) Stay() bool {
func (a *ParseAPIOnAction) flushReader() {
if a.tokenAPI.result.offset > 0 {
a.tokenAPI.root.reader.flush(a.tokenAPI.root.result.offset)
a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset)
a.tokenAPI.root.result.offset = 0
a.parseAPI.initLoopCheck()
}

View File

@ -1,84 +0,0 @@
package parsekit
import (
"bufio"
"fmt"
"io"
"unicode/utf8"
)
// reader wraps around an io.Reader and provides buffering that allows us to
// read the same runes over and over again. This is useful for implementing a
// parser that must be able to do lookahead on the input, returning to the
// original input position after finishing that lookahead.
//
// To minimize memory use, it is also possible to flush the buffer when there
// is no more need to go back to previously read runes.
//
// The reader is used internally by parsekit.TokenAPI.
type reader struct {
	bufio        *bufio.Reader // Used for ReadRune()
	buffer       []rune        // Input buffer, holding runes that were read from input
	bufferOffset int           // The offset of the buffer, relative to the start of the input
	bufferLen    int           // Input size, the number of runes in the buffer
}

// newReader initializes a new reader struct, wrapped around the provided io.Reader.
func newReader(r io.Reader) *reader {
	return &reader{
		bufio:  bufio.NewReader(r),
		buffer: []rune{},
	}
}

// runeAt reads the rune at the provided rune offset.
//
// This offset is relative to the current starting position of the buffer in
// the reader. When starting reading, offset 0 will point at the start of the
// input. After flushing, offset 0 will point at the first rune that was not
// flushed.
//
// The error return value will be nil when reading was successful.
// When an invalid rune is encountered on the input, the error will be nil,
// but the rune will be utf8.RuneError.
//
// When reading failed, the rune will be utf8.RuneError and the error will be
// non-nil. One special read failure is actually a normal situation: end of
// file reached. In that case, the returned error will be io.EOF.
//
// A negative offset is a programming error and results in a panic.
func (r *reader) runeAt(offset int) (rune, error) {
	if offset < 0 {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.RuneAt(): offset (%d) must not be negative", offset))
	}

	// Rune at provided offset is not yet available in the input buffer.
	// Read runes until we have enough runes to satisfy the offset.
	for r.bufferLen <= offset {
		readRune, _, err := r.bufio.ReadRune()
		if err != nil {
			return utf8.RuneError, err
		}

		// Skip the BOM, but only when it is the very first rune of the input.
		// Checking bufferOffset alone is not enough: it stays 0 until the first
		// flush, so a U+FEFF further down the input would be dropped as well.
		if readRune == '\uFEFF' && r.bufferOffset == 0 && r.bufferLen == 0 {
			r.bufferOffset++
			continue
		}

		r.buffer = append(r.buffer, readRune)
		r.bufferLen++
	}
	return r.buffer[offset], nil
}

// flush deletes the provided number of runes from the start of the
// reader buffer. After flushing the buffer, offset 0 as used by runeAt()
// will point to the rune that comes after the flushed runes.
// So what this basically does is turn the reader into a sliding window.
//
// Flushing a negative number of runes, or more runes than the buffer holds,
// is a programming error and results in a panic. The reader is left untouched
// in that case.
func (r *reader) flush(numberOfRunes int) {
	// Validate before touching any bookkeeping, so a recovered panic does not
	// leave bufferOffset and bufferLen out of sync with the buffer.
	if numberOfRunes < 0 {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
				"must not be negative", numberOfRunes))
	}
	if numberOfRunes > r.bufferLen {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
	}
	r.bufferOffset += numberOfRunes
	r.bufferLen -= numberOfRunes
	r.buffer = r.buffer[numberOfRunes:]
}

123
reader/reader.go Normal file
View File

@ -0,0 +1,123 @@
// Package reader provides a buffered Reader that wraps around an io.Reader.
//
// Functionally, it provides an input buffer in the form of a sliding window.
// Let's say we've got the following input coming up in the io.Reader that is
// wrapped by the Reader:
//
//     |H|e|l|l|o|,| |w|o|r|l|d|!|    <-- runes
//      0           6           12    <-- rune offset
//
// The Reader can now be used to retrieve runes from the input, based on their
// offset, using RuneAt(offset). Normally these runes will be retrieved in
// sequence, but that is not a requirement. Let's say we retrieve the rune with
// offset 7 from the input (the 'w'), then the Reader buffer will be filled with
// runes from the io.Reader until there are enough runes available to return the
// rune for offset 7:
//
//     |H|e|l|l|o|,| |w|
//      0             7
//
// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
// in the Reader buffer, then the buffered rune is returned. If you request one
// that is not in the buffer, then the buffer will be expanded.
//
// To make this into a sliding window, the Reader provides the method
// Flush(numberOfRunes). This method will drop the provided number of runes from
// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
// then the Reader buffer would become:
//
//     |l|o|,| |w|
//      0       4
//
// Note that the offset for the first rune 'l' in the buffer is now 0.
// You can consider the input to be changed in a similar way:
//
//     |l|o|,| |w|o|r|l|d|!|
//      0           6     9
//
// So after a flush, the first upcoming rune after the flushed runes
// will always have index 0.
package reader
import (
"bufio"
"fmt"
"io"
"unicode/utf8"
)
// Reader wraps around an io.Reader and provides buffering that allows us to
// read the same runes over and over again. This is useful for implementing a
// parser that must be able to do lookahead on the input, returning to the
// original input position after finishing that lookahead.
//
// To minimize memory use, it is also possible to flush the read buffer when
// there is no more need to go back to previously read runes.
//
// The parsekit.reader.Reader is used internally by parsekit.TokenAPI.
type Reader struct {
	bufio        *bufio.Reader // Used for ReadRune()
	buffer       []rune        // Input buffer, holding runes that were read from input
	bufferOffset int           // The offset of the buffer, relative to the start of the input
	bufferLen    int           // Input size, the number of runes in the buffer
}

// New initializes a new Reader struct, wrapped around the provided io.Reader.
func New(r io.Reader) *Reader {
	return &Reader{
		bufio:  bufio.NewReader(r),
		buffer: []rune{},
	}
}

// RuneAt reads the rune at the provided rune offset.
//
// This offset is relative to the current starting position of the buffer in
// the reader. When starting reading, offset 0 will point at the start of the
// input. After flushing, offset 0 will point at the first rune that was not
// flushed.
//
// The error return value will be nil when reading was successful.
// When an invalid rune is encountered on the input, the error will be nil,
// but the rune will be utf8.RuneError.
//
// When reading failed, the rune will be utf8.RuneError and the error will
// be non-nil. One special read failure is actually a normal situation: end
// of file reached. In that case, the returned error will be io.EOF.
//
// A negative offset is a programming error and results in a panic.
func (r *Reader) RuneAt(offset int) (rune, error) {
	if offset < 0 {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.RuneAt(): offset (%d) must not be negative", offset))
	}

	// Rune at provided offset is not yet available in the input buffer.
	// Read runes until we have enough runes to satisfy the offset.
	for r.bufferLen <= offset {
		readRune, _, err := r.bufio.ReadRune()
		if err != nil {
			return utf8.RuneError, err
		}

		// Skip the BOM, but only when it is the very first rune of the input.
		// Checking bufferOffset alone is not enough: it stays 0 until the first
		// flush, so a U+FEFF further down the input would be dropped as well.
		if readRune == '\uFEFF' && r.bufferOffset == 0 && r.bufferLen == 0 {
			r.bufferOffset++
			continue
		}

		r.buffer = append(r.buffer, readRune)
		r.bufferLen++
	}
	return r.buffer[offset], nil
}

// Flush deletes the provided number of runes from the start of the
// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
// will point to the rune that comes after the flushed runes.
// So what this basically does is turn the Reader into a sliding window.
//
// Flushing a negative number of runes, or more runes than the buffer holds,
// is a programming error and results in a panic. The Reader is left untouched
// in that case.
func (r *Reader) Flush(numberOfRunes int) {
	// Validate before touching any bookkeeping, so a recovered panic does not
	// leave bufferOffset and bufferLen out of sync with the buffer.
	if numberOfRunes < 0 {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
				"must not be negative", numberOfRunes))
	}
	if numberOfRunes > r.bufferLen {
		panic(fmt.Sprintf(
			"parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
				"exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
	}
	r.bufferOffset += numberOfRunes
	r.bufferLen -= numberOfRunes
	r.buffer = r.buffer[numberOfRunes:]
}

131
reader/reader_test.go Normal file
View File

@ -0,0 +1,131 @@
package reader_test
import (
"fmt"
"io"
"strings"
"testing"
"git.makaay.nl/mauricem/go-parsekit/reader"
"github.com/stretchr/testify/assert"
)
// ExampleNew wraps a string in a Reader and prints the runes found at the
// first and the last offset of the input.
func ExampleNew() {
	input := reader.New(strings.NewReader("Hello, world!"))

	// Both offsets lie within the input, so the read errors are not checked.
	first, _ := input.RuneAt(0)
	last, _ := input.RuneAt(12)

	fmt.Printf("%c", first)
	fmt.Printf("%c", last)

	// Output:
	// H!
}
// TestReader_RuneAt checks that runes can be requested in arbitrary order,
// including going back to offsets that were read before.
func TestReader_RuneAt(t *testing.T) {
	input := reader.New(strings.NewReader("Hello, world!"))

	var got strings.Builder
	for _, offset := range []int{0, 12, 7, 0} {
		// All offsets lie within the input, so the error is not checked.
		rn, _ := input.RuneAt(offset)
		got.WriteRune(rn)
	}

	assert.Equal(t, "H!wH", got.String())
}
// TestReader_RuneAt_endOfFile checks that reading at or beyond the end of the
// input yields utf8.RuneError together with io.EOF.
func TestReader_RuneAt_endOfFile(t *testing.T) {
	// %q renders utf8.RuneError as the quoted replacement character U+FFFD.
	// The escape is used so the expectation survives encoding mishaps.
	const expected = "'\uFFFD' EOF true"

	r := reader.New(strings.NewReader("Hello, world!"))

	// Offset 13 is the first offset past the 13 runes of the input.
	rn, err := r.RuneAt(13)
	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
	assert.Equal(t, expected, result)

	// Reading even further past the end gives the same outcome.
	rn, err = r.RuneAt(20)
	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
	assert.Equal(t, expected, result)
}
// TestReader_RuneAt_invalidRune checks that an invalid UTF-8 byte on the input
// is returned as utf8.RuneError, without disturbing the runes around it.
func TestReader_RuneAt_invalidRune(t *testing.T) {
	r := reader.New(strings.NewReader("Hello, \xcdworld!"))
	// Read errors are ignored on purpose: only the returned runes matter here.
	at := func(i int) rune { rn, _ := r.RuneAt(i); return rn }

	// Offset 7 holds the invalid byte \xcd, which comes back as U+FFFD.
	// The escape is used so the expectation survives encoding mishaps.
	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
	assert.Equal(t, " \uFFFDwo", result, "result")
}
// ExampleReader_RuneAt walks over the input one offset at a time, printing
// every rune until RuneAt reports an error.
func ExampleReader_RuneAt() {
	r := reader.New(strings.NewReader("Hello, world!"))

	fmt.Printf("Runes: ")
	offset := 0
	for {
		rn, err := r.RuneAt(offset)
		if err != nil {
			// Reading stops at the first error.
			fmt.Printf("\nErr: %s\n", err)
			break
		}
		fmt.Printf("%c", rn)
		offset++
	}

	// Output:
	// Runes: Hello, world!
	// Err: EOF
}
// TestRuneAt_SkipsBOMAtStartOfFile checks that a byte order mark at the start
// of the input is not exposed as a rune: offset 0 is the rune following it.
func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
	input := reader.New(strings.NewReader("\uFEFFBommetje!"))

	firstThree := make([]rune, 0, 3)
	for offset := 0; offset < 3; offset++ {
		// The offsets lie within the input, so the error is not checked.
		rn, _ := input.RuneAt(offset)
		firstThree = append(firstThree, rn)
	}

	assert.Equal(t, "Bom", string(firstThree), "first three runes")
}
// TestReader_Flush checks that flushed runes are dropped from the buffer and
// that offsets restart at 0 from the first rune after the flushed ones.
func TestReader_Flush(t *testing.T) {
	input := reader.New(strings.NewReader("Hello, world!"))

	// collect renders the runes at the given offsets as a single string.
	// Read errors are ignored: only the returned runes are compared.
	collect := func(offsets ...int) string {
		var out strings.Builder
		for _, offset := range offsets {
			rn, _ := input.RuneAt(offset)
			out.WriteRune(rn)
		}
		return out.String()
	}

	// Asking for offset 7 buffers the first 8 runes of the input: "Hello, w".
	assert.Equal(t, "w", collect(7), "first read")

	// Drop the first 4 runes ("Hell") from the buffer.
	input.Flush(4)

	// Offset 0 now refers to what originally was offset 4, and reading can
	// simply continue from there.
	assert.Equal(t, "o, wor", collect(0, 1, 2, 3, 4, 5))
}
// ExampleReader_Flush demonstrates the sliding window: after each flush,
// offset 0 points at the first rune that was not flushed.
func ExampleReader_Flush() {
	input := reader.New(strings.NewReader("dog eat dog!"))

	// show prints the runes at the given offsets, in the given order.
	// All offsets used below lie within the input, so errors are not checked.
	show := func(offsets ...int) {
		for _, offset := range offsets {
			rn, _ := input.RuneAt(offset)
			fmt.Printf("%c", rn)
		}
	}

	// Print the first 4 runes of the input.
	show(0, 1, 2, 3)

	// Flush those 4 runes, which moves offset 0 to the start of "eat dog!".
	input.Flush(4)

	// Print another 4 runes; because of the flush we start at offset 0 again.
	show(1, 2, 0, 3)

	// Flush 4 more runes, which moves offset 0 to the start of "dog!".
	input.Flush(4)

	// Print from the remaining runes.
	show(2, 1, 1, 0, 3)

	// Output:
	// dog ate good!
}
// TestGivenNumberOfRunesTooHigh_Flush_Panics checks that flushing more runes
// than the buffer holds panics with a descriptive message.
func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
	const expectedPanic = "parsekit.Input.Reader.Flush(): number of runes to flush " +
		"(12) exceeds size of the buffer (11)"

	input := reader.New(strings.NewReader("Hello, world!"))

	// Reading offset 10 fills the buffer with the first 11 runes: "Hello, worl".
	// Only the buffering side effect is needed, so the result is discarded.
	_, _ = input.RuneAt(10)

	// Flushing 12 runes asks for one rune more than the buffer holds.
	flushTooMuch := func() { input.Flush(12) }
	assert.PanicsWithValue(t, expectedPanic, flushTooMuch)
}

View File

@ -1,94 +0,0 @@
package parsekit
import (
"fmt"
"io"
"strings"
"testing"
)
// ExamplenewReader wraps a string in a reader and prints the runes found at
// the first and the last offset of the input.
//
// NOTE(review): the lowercase letter directly after "Example" probably keeps
// `go test` from recognizing this function as a runnable example. Confirm,
// and consider a name such as Example_newReader.
func ExamplenewReader() {
	input := newReader(strings.NewReader("Hello, world!"))

	// Both offsets lie within the input, so the read errors are not checked.
	first, _ := input.runeAt(0)
	last, _ := input.runeAt(12)

	fmt.Printf("%c", first)
	fmt.Printf("%c", last)

	// Output:
	// H!
}
// TestReader_runeAt checks that runes can be requested in arbitrary order,
// including going back to offsets that were read before.
func TestReader_runeAt(t *testing.T) {
	input := newReader(strings.NewReader("Hello, world!"))

	var got strings.Builder
	for _, offset := range []int{0, 12, 7, 0} {
		// All offsets lie within the input, so the error is not checked.
		rn, _ := input.runeAt(offset)
		got.WriteRune(rn)
	}

	AssertEqual(t, "H!wH", got.String(), "result")
}
// TestReader_runeAt_endOfFile checks that reading at or beyond the end of the
// input yields utf8.RuneError together with io.EOF.
func TestReader_runeAt_endOfFile(t *testing.T) {
	// %q renders utf8.RuneError as the quoted replacement character U+FFFD.
	// The escape is used so the expectation survives encoding mishaps.
	const expected = "'\uFFFD' EOF true"

	in := strings.NewReader("Hello, world!")
	r := newReader(in)

	// Offset 13 is the first offset past the 13 runes of the input.
	rn, err := r.runeAt(13)
	result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
	AssertEqual(t, expected, result, "result")

	// Reading even further past the end gives the same outcome.
	rn, err = r.runeAt(20)
	result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
	AssertEqual(t, expected, result, "result")
}
// TestReader_runeAt_invalidRune checks that an invalid UTF-8 byte on the input
// is returned as utf8.RuneError, without disturbing the runes around it.
func TestReader_runeAt_invalidRune(t *testing.T) {
	in := strings.NewReader("Hello, \xcdworld!")
	r := newReader(in)
	// Read errors are ignored on purpose: only the returned runes matter here.
	at := func(i int) rune { rn, _ := r.runeAt(i); return rn }

	// Offset 7 holds the invalid byte \xcd, which comes back as U+FFFD.
	// The escape is used so the expectation survives encoding mishaps.
	result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
	AssertEqual(t, " \uFFFDwo", result, "result")
}
// TestRuneAt_SkipsBOMAtStartOfFile checks that a byte order mark at the start
// of the input is not exposed as a rune: offset 0 is the rune following it.
func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
	input := newReader(strings.NewReader("\uFEFFBommetje!"))

	firstThree := make([]rune, 0, 3)
	for offset := 0; offset < 3; offset++ {
		// The offsets lie within the input, so the error is not checked.
		rn, _ := input.runeAt(offset)
		firstThree = append(firstThree, rn)
	}

	AssertEqual(t, "Bom", string(firstThree), "first three runes")
}
// TestReader_Flush checks that flushed runes are dropped from the buffer and
// that offsets restart at 0 from the first rune after the flushed ones.
func TestReader_Flush(t *testing.T) {
	input := newReader(strings.NewReader("Hello, world!"))

	// collect renders the runes at the given offsets as a single string.
	// Read errors are ignored: only the returned runes are compared.
	collect := func(offsets ...int) string {
		var out strings.Builder
		for _, offset := range offsets {
			rn, _ := input.runeAt(offset)
			out.WriteRune(rn)
		}
		return out.String()
	}

	// Asking for offset 7 buffers the first 8 runes of the input: "Hello, w".
	AssertEqual(t, "w", collect(7), "first read")

	// Drop the first 4 runes ("Hell") from the buffer.
	input.flush(4)

	// Offset 0 now refers to what originally was offset 4, and reading can
	// simply continue from there.
	AssertEqual(t, "o, wor", collect(0, 1, 2, 3, 4, 5), "second read")
}
// TestGivenNumberOfRunesTooHigh_Flush_Panics checks that flushing more runes
// than the buffer holds panics with a descriptive message.
func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
	input := newReader(strings.NewReader("Hello, world!"))

	// Reading offset 10 fills the buffer with the first 11 runes: "Hello, worl".
	// Only the buffering side effect is needed, so the result is discarded.
	_, _ = input.runeAt(10)

	// Flushing 12 runes asks for one rune more than the buffer holds.
	flushTooMuch := func() { input.flush(12) }
	AssertPanic(t, PanicT{
		Function: flushTooMuch,
		Expect:   "parsekit.Input.Reader.Flush(): number of runes to flush (12) exceeds size of the buffer (11)",
	})
}

View File

@ -3,11 +3,13 @@ package parsekit
import (
"fmt"
"io"
"git.makaay.nl/mauricem/go-parsekit/reader"
)
// TokenAPI wraps a parsekit.reader and its purpose is to retrieve input data and
// to report back results. For easy lookahead support, a forking strategy is
// provided.
// TokenAPI wraps a reader.Reader and its purpose is to retrieve data from
// the reader and to report back tokenizing results. For easy lookahead support,
// a forking strategy is provided.
//
// BASIC OPERATION:
//
@ -15,19 +17,19 @@ import (
//
// When the rune is to be accepted as input, call the method Accept(). The rune
// is then added to the results of the TokenAPI and the read cursor is moved
// forward. Runes collected this way can later on be retrieved using for
// example the method Result().Runes().
// forward.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed. Runes collected this way can later on be
// retrieved using the method Result().Runes().
//
// It is mandatory to call Accept() after retrieving a rune, before calling
// NextRune() again. Failing to do so will result in a panic.
//
// By invoking NextRune() + Accept() multiple times, the result can be extended
// with as many runes as needed.
//
// Next to adding runes to the output, it is also possible to modify the
// already collected runes or to produce lexical Tokens. For all things
// concerning results, take a look at the TokenHandlerResult struct, which can be
// accessed though the method Result().
// Next to adding runes to the result, it is also possible to modify the
// stored runes or to add lexical Tokens to the result. For all things
// concerning results, take a look at the TokenHandlerResult struct, which
// can be accessed through the method Result().
//
// FORKING OPERATION FOR EASY LOOKAHEAD SUPPORT:
//
@ -38,7 +40,7 @@ import (
//
// The way in which this is supported, is by forking a TokenAPI struct by
// calling method Fork(). This will return a forked child TokenAPI, with
// an empty result buffer, but using the same read cursor position as the
// empty result data, but using the same read cursor position as the
// forked parent.
//
// After forking, the same interface as described for BASIC OPERATION can be
@ -47,28 +49,30 @@ import (
// to the parent's results, and to move the read cursor position to that
// of the child.
//
// When the lookahead was unsuccessful or when the results of the forked child
// are not to be used, then the forked child TokenAPI can simply be discarded.
// The parent TokenAPI was never modified, so it can safely be used as if the
// lookahead never happened.
// When the lookahead was unsuccessful, then the forked child TokenAPI can
// simply be discarded. The parent TokenAPI was never modified, so it can
// safely be used as if the lookahead never happened.
//
// Note:
// Opinionated note:
// Many tokenizers/parsers take a different approach on lookaheads by using
// peeks and by moving the read cursor position back and forth, or by putting
// read input back on the input stream. That often leads to code that is
// efficient, however, in my opinion, not very intuitive to read.
// efficient, however, in my opinion, not very intuitive to read. It can also
// be tedious to get the cursor position back at the correct position, which
// can lead to hard to track bugs. I much prefer this forking method, since
// no bookkeeping has to be implemented when implementing a parser.
type TokenAPI struct {
reader *reader
reader *reader.Reader
root *TokenAPI // the root TokenAPI
parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child
child *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent
result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens)
result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
}
// NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader.
func NewTokenAPI(r io.Reader) *TokenAPI {
input := &TokenAPI{
reader: newReader(r),
reader: reader.New(r),
result: newTokenHandlerResult(),
}
input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?".
@ -91,7 +95,7 @@ func (i *TokenAPI) NextRune() (rune, error) {
}
i.detachChilds()
readRune, err := i.reader.runeAt(i.result.offset)
readRune, err := i.reader.RuneAt(i.result.offset)
i.result.lastRune = &runeInfo{r: readRune, err: err}
return readRune, err
}

View File

@ -177,7 +177,6 @@ func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *test
i.Accept()
AssertTrue(t, i.result.lastRune == nil, "TokenAPI.result.lastRune after Accept() is nil")
AssertEqual(t, 1, i.result.offset, "TokenAPI.result.offset")
AssertEqual(t, 'T', i.reader.buffer[0], "TokenAPI.reader.buffer[0]")
r, _ = i.NextRune()
AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
}
@ -188,7 +187,6 @@ func TestCallingMultipleAccepts_FillsInputWithData(t *testing.T) {
i.NextRune()
i.Accept()
}
AssertEqual(t, "Testing", string(i.reader.buffer), "reader input buffer")
AssertEqual(t, "Testing", i.Result().String(), "i.Result().String()")
}