diff --git a/go.mod b/go.mod
index c336e8c..ff46249 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module git.makaay.nl/mauricem/go-parsekit
 
 go 1.12
+
+require github.com/stretchr/testify v1.3.0
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..4347755
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,7 @@
+github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
diff --git a/parseapi.go b/parseapi.go
index ffe7169..c6db4a1 100644
--- a/parseapi.go
+++ b/parseapi.go
@@ -172,7 +172,7 @@ func (a *ParseAPIOnAction) Stay() bool {
 
 func (a *ParseAPIOnAction) flushReader() {
     if a.tokenAPI.result.offset > 0 {
-        a.tokenAPI.root.reader.flush(a.tokenAPI.root.result.offset)
+        a.tokenAPI.root.reader.Flush(a.tokenAPI.root.result.offset)
         a.tokenAPI.root.result.offset = 0
         a.parseAPI.initLoopCheck()
     }
diff --git a/reader.go b/reader.go
deleted file mode 100644
index 3f37779..0000000
--- a/reader.go
+++ /dev/null
@@ -1,84 +0,0 @@
-package parsekit
-
-import (
-    "bufio"
-    "fmt"
-    "io"
-    "unicode/utf8"
-)
-
-// reader wraps around an io.Reader and provides buffering to allows us to read
-// the same runes over and over again. This is useful for implementing a parser
-// that must be able to do lookahead on the input, returning to the original
-// input position after finishing that lookahead).
-//
-// To minimze memory use, it is also possible to flush the buffer when there is
-// no more need to go back to previously read runes.
-//
-// The reader is used internally by parsekit.TokenAPI.
-type reader struct {
-    bufio        *bufio.Reader // Used for ReadRune()
-    buffer       []rune        // Input buffer, holding runes that were read from input
-    bufferOffset int           // The offset of the buffer, relative to the start of the input
-    bufferLen    int           // Input size, the number of runes in the buffer
-}
-
-// newwReader initializes a new reader struct, wrapped around the provided io.Reader.
-func newReader(r io.Reader) *reader {
-    return &reader{
-        bufio:  bufio.NewReader(r),
-        buffer: []rune{},
-    }
-}
-
-// runeAt reads the rune at the provided rune offset.
-//
-// This offset is relative to the current starting position of the buffer in
-// the reader. When starting reading, offset 0 will point at the start of the
-// input. After flushing, offset 0 will point at the input up to where
-// the flush was done.
-//
-// The error return value will be nil when reading was successful.
-// When an invalid rune is encountered on the input, the error will be nil,
-// but the rune will be utf8.RuneError
-//
-// When reading failed, the rune will be utf8.RuneError. One special read
-// fail is actually a normal situation: end of file reached. In that case,
-// the returned error wille be io.EOF.
-func (r *reader) runeAt(offset int) (rune, error) {
-    // Rune at provided offset is not yet available in the input buffer.
-    // Read runes until we have enough runes to satisfy the offset.
-    for r.bufferLen <= offset {
-        readRune, _, err := r.bufio.ReadRune()
-
-        // Handle errors.
-        if err != nil {
-            return utf8.RuneError, err
-        }
-
-        // Skip BOM.
-        if readRune == '\uFEFF' && r.bufferOffset == 0 {
-            r.bufferOffset++
-            continue
-        }
-
-        r.buffer = append(r.buffer, readRune)
-        r.bufferLen++
-    }
-    return r.buffer[offset], nil
-}
-
-// Flush deletes the provided number of runes from the start of the
-// reader buffer. After flushing the buffer, offset 0 as used by runeAt()
-// will point to the rune that comes after the flushed runes.
-// So what this basically does is turn the Reader into a sliding window.
-func (r *reader) flush(numberOfRunes int) {
-    if numberOfRunes > r.bufferLen {
-        panic(fmt.Sprintf(
-            "parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
-            "exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
-    }
-    r.bufferOffset += numberOfRunes
-    r.bufferLen -= numberOfRunes
-    r.buffer = r.buffer[numberOfRunes:]
-}
diff --git a/reader/reader.go b/reader/reader.go
new file mode 100644
index 0000000..44c9f70
--- /dev/null
+++ b/reader/reader.go
@@ -0,0 +1,123 @@
+// Package reader provides a buffered Reader that wraps around an io.Reader.
+//
+// Functionally, it provides an input buffer in the form of a sliding window.
+// Let's say we've got the following input coming up in the io.Reader that is
+// wrapped by the Reader:
+//
+//   |H|e|l|l|o|,| |w|o|r|l|d|!|   <-- runes
+//    0           6           12   <-- rune offsets
+//
+// The Reader can now be used to retrieve runes from the input, based on their
+// offset, using RuneAt(offset). Normally these runes will be retrieved in
+// sequence, but that is not a requirement. Let's say we retrieve the rune with
+// offset 7 from the input (the 'w'), then the Reader buffer will be filled with
+// runes from the io.Reader until there are enough runes available to return the
+// rune for offset 7:
+//
+//   |H|e|l|l|o|,| |w|
+//    0             7
+//
+// Using RuneAt, you can retrieve arbitrary runes. If you request one that is
+// in the Reader buffer, then the buffered rune is returned. If you request one
+// that is not in the buffer, then the buffer will be expanded.
+//
+// To make this into a sliding window, the Reader provides the method
+// Flush(numberOfRunes). This method will drop the provided number of runes from
+// the Reader buffer. So when we'd do a Flush(3) on the example buffer from above,
+// then the Reader buffer would become:
+//
+//   |l|o|,| |w|
+//    0       4
+//
+// Note that the offset for the first rune 'l' in the buffer is now 0.
+// You can consider the input to be changed in a similar way:
+//
+//   |l|o|,| |w|o|r|l|d|!|
+//    0     3           9
+//
+// So after a flush, the first upcoming rune after the flushed runes
+// will always have offset 0.
+package reader
+
+import (
+    "bufio"
+    "fmt"
+    "io"
+    "unicode/utf8"
+)
+
+// Reader wraps around an io.Reader and provides buffering that allows us to read
+// the same runes over and over again. This is useful for implementing a parser
+// that must be able to do lookahead on the input, returning to the original
+// input position after finishing that lookahead.
+//
+// To minimize memory use, it is also possible to flush the read buffer when there is
+// no more need to go back to previously read runes.
+//
+// The reader.Reader is used internally by parsekit.TokenAPI.
+type Reader struct {
+    bufio        *bufio.Reader // Used for ReadRune()
+    buffer       []rune        // Input buffer, holding runes that were read from input
+    bufferOffset int           // The offset of the buffer, relative to the start of the input
+    bufferLen    int           // Input size, the number of runes in the buffer
+}
+
+// New initializes a new Reader struct, wrapped around the provided io.Reader.
+func New(r io.Reader) *Reader {
+    return &Reader{
+        bufio:  bufio.NewReader(r),
+        buffer: []rune{},
+    }
+}
+
+// RuneAt reads the rune at the provided rune offset.
+//
+// This offset is relative to the current starting position of the buffer in
+// the reader. When reading starts, offset 0 will point at the start of the
+// input. After flushing, offset 0 will point at the first rune following
+// the flushed input.
+//
+// The error return value will be nil when reading was successful.
+// When an invalid rune is encountered on the input, the error will be nil,
+// but the rune will be utf8.RuneError.
+//
+// When reading failed, the rune will be utf8.RuneError and the error will
+// be non-nil. One special read failure is actually a normal situation: end
+// of file reached. In that case, the returned error will be io.EOF.
+func (r *Reader) RuneAt(offset int) (rune, error) {
+    // Rune at provided offset is not yet available in the input buffer.
+    // Read runes until we have enough runes to satisfy the offset.
+    for r.bufferLen <= offset {
+        readRune, _, err := r.bufio.ReadRune()
+
+        // Handle errors.
+        if err != nil {
+            return utf8.RuneError, err
+        }
+
+        // Skip BOM.
+        if readRune == '\uFEFF' && r.bufferOffset == 0 {
+            r.bufferOffset++
+            continue
+        }
+
+        r.buffer = append(r.buffer, readRune)
+        r.bufferLen++
+    }
+    return r.buffer[offset], nil
+}
+
+// Flush deletes the provided number of runes from the start of the
+// reader buffer. After flushing the buffer, offset 0 as used by RuneAt()
+// will point to the rune that comes after the flushed runes.
+// This effectively turns the Reader into a sliding window over the input.
+func (r *Reader) Flush(numberOfRunes int) {
+    if numberOfRunes > r.bufferLen {
+        panic(fmt.Sprintf(
+            "parsekit.Input.Reader.Flush(): number of runes to flush (%d) "+
+            "exceeds size of the buffer (%d)", numberOfRunes, r.bufferLen))
+    }
+    r.bufferOffset += numberOfRunes
+    r.bufferLen -= numberOfRunes
+    r.buffer = r.buffer[numberOfRunes:]
+}
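
The sliding window described in the package documentation above is what gives a parser cheap lookahead: RuneAt() can peek arbitrarily far ahead without consuming anything, and Flush() is only called once input has definitely been processed. The sketch below illustrates that usage pattern; it is not part of this change set, and the matchKeyword helper is purely hypothetical.

    // Hypothetical usage sketch: peek ahead with RuneAt() and only Flush()
    // once the lookahead has been accepted.
    package main

    import (
        "fmt"
        "strings"

        "git.makaay.nl/mauricem/go-parsekit/reader"
    )

    // matchKeyword reports whether the upcoming input equals the given keyword.
    // On a mismatch it consumes nothing, so offset 0 still points at the same rune.
    func matchKeyword(r *reader.Reader, keyword string) bool {
        for i, want := range []rune(keyword) {
            c, err := r.RuneAt(i)
            if err != nil || c != want {
                return false // lookahead failed; the window was not moved
            }
        }
        r.Flush(len([]rune(keyword))) // consume the keyword: slide the window
        return true
    }

    func main() {
        r := reader.New(strings.NewReader("dog eat dog!"))
        fmt.Println(matchKeyword(r, "cat")) // false, input untouched
        fmt.Println(matchKeyword(r, "dog")) // true, buffer now starts at " eat dog!"
    }

A failed match leaves the window untouched, so the caller can immediately try a different match at the same offset.
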
diff --git a/reader/reader_test.go b/reader/reader_test.go
new file mode 100644
index 0000000..85df380
--- /dev/null
+++ b/reader/reader_test.go
@@ -0,0 +1,131 @@
+package reader_test
+
+import (
+    "fmt"
+    "io"
+    "strings"
+    "testing"
+
+    "git.makaay.nl/mauricem/go-parsekit/reader"
+    "github.com/stretchr/testify/assert"
+)
+
+func ExampleNew() {
+    r := reader.New(strings.NewReader("Hello, world!"))
+    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+    fmt.Printf("%c", at(0))
+    fmt.Printf("%c", at(12))
+
+    // Output:
+    // H!
+}
+
+func TestReader_RuneAt(t *testing.T) {
+    r := reader.New(strings.NewReader("Hello, world!"))
+    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+    // It is possible to go back and forth while reading the input.
+    result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
+    assert.Equal(t, "H!wH", result)
+}
+
+func TestReader_RuneAt_endOfFile(t *testing.T) {
+    r := reader.New(strings.NewReader("Hello, world!"))
+
+    rn, err := r.RuneAt(13)
+    result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
+    assert.Equal(t, "'�' EOF true", result)
+
+    rn, err = r.RuneAt(20)
+    result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
+    assert.Equal(t, "'�' EOF true", result)
+}
+
+func TestReader_RuneAt_invalidRune(t *testing.T) {
+    r := reader.New(strings.NewReader("Hello, \xcdworld!"))
+    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+    result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
+    assert.Equal(t, " �wo", result, "result")
+}
+
+func ExampleReader_RuneAt() {
+    reader := reader.New(strings.NewReader("Hello, world!"))
+
+    fmt.Printf("Runes: ")
+    for i := 0; ; i++ {
+        r, err := reader.RuneAt(i)
+        if err != nil {
+            fmt.Printf("\nErr: %s\n", err)
+            break
+        }
+        fmt.Printf("%c", r)
+    }
+
+    // Output:
+    // Runes: Hello, world!
+    // Err: EOF
+}
+
+func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
+    r := reader.New(strings.NewReader("\uFEFFBommetje!"))
+    b, _ := r.RuneAt(0)
+    o, _ := r.RuneAt(1)
+    m, _ := r.RuneAt(2)
+    bom := fmt.Sprintf("%c%c%c", b, o, m)
+    assert.Equal(t, "Bom", bom, "first three runes")
+}
+
+func TestReader_Flush(t *testing.T) {
+    r := reader.New(strings.NewReader("Hello, world!"))
+    at := func(i int) rune { r, _ := r.RuneAt(i); return r }
+
+    // Fills the buffer with the first 8 runes on the input: "Hello, w"
+    result := fmt.Sprintf("%c", at(7))
+    assert.Equal(t, "w", result, "first read")
+
+    // Now flush the first 4 runes from the buffer (dropping "Hell" from it)
+    r.Flush(4)
+
+    // Rune 0 is now pointing at what originally was rune offset 4.
+    // We can continue reading from there.
+    result = fmt.Sprintf("%c%c%c%c%c%c", at(0), at(1), at(2), at(3), at(4), at(5))
+    assert.Equal(t, "o, wor", result)
+}
+
+func ExampleReader_Flush() {
+    r := reader.New(strings.NewReader("dog eat dog!"))
+    at := func(offset int) rune { c, _ := r.RuneAt(offset); return c }
+
+    // Read from the first 4 runes of the input.
+    fmt.Printf("%c%c%c%c", at(0), at(1), at(2), at(3))
+
+    // Flush those 4 runes, bringing offset 0 to the start of "eat dog".
+    r.Flush(4)
+
+    // Read another 4 runes. Because of the flushing, we start at offset 0 again.
+    fmt.Printf("%c%c%c%c", at(1), at(2), at(0), at(3))
+
+    // Again, flush 4 runes, bringing offset 0 to the start of "dog!".
+    r.Flush(4)
+
+    // Read from the remaining runes.
+    fmt.Printf("%c%c%c%c%c", at(2), at(1), at(1), at(0), at(3))
+
+    // Output:
+    // dog ate good!
+}
+
+func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
+    r := reader.New(strings.NewReader("Hello, world!"))
+
+    // Fill buffer with "Hello, worl", the first 11 runes.
+    r.RuneAt(10)
+
+    // However, we flush 12 runes, which exceeds the buffer size.
+    assert.PanicsWithValue(t,
+        "parsekit.Input.Reader.Flush(): number of runes to flush "+
+        "(12) exceeds size of the buffer (11)",
+        func() { r.Flush(12) })
+}
diff --git a/reader_test.go b/reader_test.go
deleted file mode 100644
index f9d2967..0000000
--- a/reader_test.go
+++ /dev/null
@@ -1,94 +0,0 @@
-package parsekit
-
-import (
-    "fmt"
-    "io"
-    "strings"
-    "testing"
-)
-
-func ExamplenewReader() {
-    in := strings.NewReader("Hello, world!")
-    r := newReader(in)
-    at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-    fmt.Printf("%c", at(0))
-    fmt.Printf("%c", at(12))
-
-    // Output:
-    // H!
-}
-
-func TestReader_runeAt(t *testing.T) {
-    in := strings.NewReader("Hello, world!")
-    r := newReader(in)
-    at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-    // It is possible to go back and forth while reading the input.
-    result := fmt.Sprintf("%c%c%c%c", at(0), at(12), at(7), at(0))
-    AssertEqual(t, "H!wH", result, "result")
-}
-
-func TestReader_runeAt_endOfFile(t *testing.T) {
-    in := strings.NewReader("Hello, world!")
-    r := newReader(in)
-
-    rn, err := r.runeAt(13)
-    result := fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
-    AssertEqual(t, "'�' EOF true", result, "result")
-
-    rn, err = r.runeAt(20)
-    result = fmt.Sprintf("%q %s %t", rn, err, err == io.EOF)
-    AssertEqual(t, "'�' EOF true", result, "result")
-}
-
-func TestReader_runeAt_invalidRune(t *testing.T) {
-    in := strings.NewReader("Hello, \xcdworld!")
-    r := newReader(in)
-    at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-    result := fmt.Sprintf("%c%c%c%c", at(6), at(7), at(8), at(9))
-    AssertEqual(t, " �wo", result, "result")
-}
-
-func TestRuneAt_SkipsBOMAtStartOfFile(t *testing.T) {
-    in := strings.NewReader("\uFEFFBommetje!")
-    r := newReader(in)
-    b, _ := r.runeAt(0)
-    o, _ := r.runeAt(1)
-    m, _ := r.runeAt(2)
-    bom := fmt.Sprintf("%c%c%c", b, o, m)
-    AssertEqual(t, "Bom", bom, "first three runes")
-}
-
-func TestReader_Flush(t *testing.T) {
-    in := strings.NewReader("Hello, world!")
-    r := newReader(in)
-    at := func(i int) rune { r, _ := r.runeAt(i); return r }
-
-    // Fills the buffer with the first 8 runes on the input: "Hello, w"
-    result := fmt.Sprintf("%c", at(7))
-    AssertEqual(t, "w", result, "first read")
-
-    // Now flush the first 4 runes from the buffer (dropping "Hell" from it)
-    r.flush(4)
-
-    // Rune 0 is now pointing at what originally was rune offset 4.
-    // We can continue reading from there.
-    result = fmt.Sprintf("%c%c%c%c%c%c", at(0), at(1), at(2), at(3), at(4), at(5))
-    AssertEqual(t, "o, wor", result, "second read")
-}
-
-func TestGivenNumberOfRunesTooHigh_Flush_Panics(t *testing.T) {
-    in := strings.NewReader("Hello, world!")
-    r := newReader(in)
-
-    // Fill buffer with "Hello, worl", the first 11 runes.
-    r.runeAt(10)
-
-    // However, we flush 12 runes, which exceeds the buffer size.
-    AssertPanic(t, PanicT{
-        Function: func() { r.flush(12) },
-        Expect:   "parsekit.Input.Reader.Flush(): number of runes to flush (12) exceeds size of the buffer (11)",
-    })
-}
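
The tests deleted above relied on parsekit's project-internal assertion helpers (AssertEqual, AssertPanic), while their replacements in reader/reader_test.go use the github.com/stretchr/testify/assert package that go.mod now requires. A minimal sketch of the same check in both styles, using only calls that appear elsewhere in this diff:

    // Old style, project-internal helper (signature as used in the deleted file):
    //     AssertEqual(t, "H!wH", result, "result")
    //
    // New style, testify assert as introduced by this change:
    package reader_test

    import (
        "testing"

        "github.com/stretchr/testify/assert"
    )

    func TestEqualStyles(t *testing.T) {
        result := "H!wH"
        // The trailing failure-message argument is optional with testify.
        assert.Equal(t, "H!wH", result, "result")
    }
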
diff --git a/tokenapi.go b/tokenapi.go
index bc6b359..34f1e23 100644
--- a/tokenapi.go
+++ b/tokenapi.go
@@ -3,11 +3,13 @@ package parsekit
 import (
     "fmt"
     "io"
+
+    "git.makaay.nl/mauricem/go-parsekit/reader"
 )
 
-// TokenAPI wraps a parsekit.reader and its purpose is to retrieve input data and
-// to report back results. For easy lookahead support, a forking strategy is
-// provided.
+// TokenAPI wraps a reader.Reader and its purpose is to retrieve data from
+// the reader and to report back tokenizing results. For easy lookahead support,
+// a forking strategy is provided.
 //
 // BASIC OPERATION:
 //
@@ -15,19 +17,19 @@ import (
 //
 // When the rune is to be accepted as input, call the method Accept(). The rune
 // is then added to the results of the TokenAPI and the read cursor is moved
-// forward. Runes collected this way can later on be retrieved using for
-// example the method Result().Runes().
+// forward.
+//
+// By invoking NextRune() + Accept() multiple times, the result can be extended
+// with as many runes as needed. Runes collected this way can later on be
+// retrieved using the method Result().Runes().
 //
 // It is mandatory to call Accept() after retrieving a rune, before calling
 // NextRune() again. Failing to do so will result in a panic.
 //
-// By invoking NextRune() + Accept() multiple times, the result can be extended
-// with as many runes as needed.
-//
-// Next to adding runes to the output, it is also possible to modify the
-// already collected runes or to produce lexical Tokens. For all things
-// concerning results, take a look at the TokenHandlerResult struct, which can be
-// accessed though the method Result().
+// Next to adding runes to the result, it is also possible to modify the
+// stored runes or to add lexical Tokens to the result. For all things
+// concerning results, take a look at the TokenHandlerResult struct, which
+// can be accessed through the method Result().
 //
 // FORKING OPERATION FOR EASY LOOKEAHEAD SUPPORT:
 //
@@ -38,7 +40,7 @@
 //
 // The way in which this is supported, is by forking a TokenAPI struct by
 // calling method Fork(). This will return a forked child TokenAPI, with
-// an empty result buffer, but using the same read cursor position as the
+// empty result data, but using the same read cursor position as the
 // forked parent.
 //
 // After forking, the same interface as described for BASIC OPERATION can be
@@ -47,28 +49,30 @@
 // to the parent's results, and to move the read cursor position to that
 // of the child.
 //
-// When the lookahead was unsuccessful or when the results of the forked child
-// are not to be used, then the forked child TokenAPI can simply be discarded.
-// The parent TokenAPI was never modified, so it can safely be used as if the
-// lookahead never happened.
+// When the lookahead was unsuccessful, then the forked child TokenAPI can
+// simply be discarded. The parent TokenAPI was never modified, so it can
+// safely be used as if the lookahead never happened.
 //
-// Note:
+// Opinionated note:
 // Many tokenizers/parsers take a different approach on lookaheads by using
 // peeks and by moving the read cursor position back and forth, or by putting
 // read input back on the input stream. That often leads to code that is
-// efficient, however, in my opinion, not very intuitive to read.
+// efficient, but in my opinion not very intuitive to read. It can also
+// be tedious to get the cursor back to the correct position, which
+// can lead to hard-to-track bugs. I much prefer this forking method, since
+// no cursor bookkeeping has to be implemented when writing a parser.
 type TokenAPI struct {
-    reader *reader
+    reader *reader.Reader
     root   *TokenAPI // the root TokenAPI
     parent *TokenAPI // parent TokenAPI in case this TokenAPI is a fork child
     child  *TokenAPI // child TokenAPI in case this TokenAPI is a fork parent
-    result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens)
+    result *TokenHandlerResult // results as produced by a TokenHandler (runes, Tokens, cursor position)
 }
 
 // NewTokenAPI initializes a new TokenAPI struct, wrapped around the provided io.Reader.
 func NewTokenAPI(r io.Reader) *TokenAPI {
     input := &TokenAPI{
-        reader: newReader(r),
+        reader: reader.New(r),
         result: newTokenHandlerResult(),
     }
     input.root = input // TODO remove this one from root input, input.root == nil is also a good check for "is root?".
@@ -91,7 +95,7 @@ func (i *TokenAPI) NextRune() (rune, error) {
     }
     i.detachChilds()
 
-    readRune, err := i.reader.runeAt(i.result.offset)
+    readRune, err := i.reader.RuneAt(i.result.offset)
     i.result.lastRune = &runeInfo{r: readRune, err: err}
     return readRune, err
 }
diff --git a/tokenizer_test.go b/tokenizer_test.go
index 10186cd..0c583a1 100644
--- a/tokenizer_test.go
+++ b/tokenizer_test.go
@@ -177,7 +177,6 @@ func TestCallingAcceptAfterNextRune_AcceptsRuneAndMovesReadOffsetForward(t *test
     i.Accept()
     AssertTrue(t, i.result.lastRune == nil, "TokenAPI.result.lastRune after Accept() is nil")
     AssertEqual(t, 1, i.result.offset, "TokenAPI.result.offset")
-    AssertEqual(t, 'T', i.reader.buffer[0], "TokenAPI.reader.buffer[0]")
     r, _ = i.NextRune()
     AssertEqual(t, 'e', r, "result from 2nd call to NextRune()")
 }
@@ -188,7 +187,6 @@ func TestCallingMultipleAccepts_FillsInputWithData(t *testing.T) {
     i.NextRune()
     i.Accept()
 }
-    AssertEqual(t, "Testing", string(i.reader.buffer), "reader input buffer")
     AssertEqual(t, "Testing", i.Result().String(), "i.Result().String()")
 }
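
The reworked doc comment in tokenapi.go describes the fork-based lookahead flow (NextRune, Accept, Fork), but the diff does not show it in action. Below is a rough sketch of that call pattern. Fork() and the child semantics are taken from the comment; the Merge() call used here to fold a successful lookahead back into the parent is not visible anywhere in this diff, so its name and signature are assumptions.

    // Hypothetical sketch of fork-based lookahead with the TokenAPI.
    package example

    import "git.makaay.nl/mauricem/go-parsekit"

    // tryMatchABC reports whether the upcoming input is "abc". On a mismatch
    // the child fork is simply dropped and the parent TokenAPI is untouched.
    func tryMatchABC(t *parsekit.TokenAPI) bool {
        child := t.Fork() // same read cursor as the parent, but empty result data

        for _, want := range "abc" {
            r, err := child.NextRune()
            if err != nil || r != want {
                return false // discard the child; no cleanup needed on the parent
            }
            child.Accept() // mandatory after NextRune(); adds the rune to the child's result
        }

        // Merge() is assumed here: the doc comment only says that the child's
        // results "can be merged back explicitly into the parent".
        child.Merge()
        return true
    }
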