diff --git a/parsekit/emitting.go b/parsekit/emitting.go index dc419b8..5f53ba3 100644 --- a/parsekit/emitting.go +++ b/parsekit/emitting.go @@ -5,26 +5,45 @@ import ( "strings" ) +// ItemType represents the type of a parser Item. +type ItemType int + +// TODO private? +// ItemEOF is a built-in parser item type that is used for flagging that the +// end of the input was reached. +const ItemEOF ItemType = -1 + +// TODO private? +// ItemError is a built-in parser item type that is used for flagging that +// an error has occurred during parsing. +const ItemError ItemType = -2 + +// Item represents an item that can be emitted from the parser. +type Item struct { + Type ItemType + Value string +} + // Emit passes a Parser item to the client, including the provided string. func (p *P) Emit(t ItemType, s string) { p.items <- Item{t, s} p.buffer.reset() } -// EmitLiteral passes a Parser item to the client, including the accumulated +// EmitLiteral passes a Parser item to the client, including accumulated // string buffer data as a literal string. func (p *P) EmitLiteral(t ItemType) { p.Emit(t, p.buffer.asLiteralString()) } -// EmitLiteralTrim passes a Parser item to the client, including the +// EmitLiteralTrim passes a Parser item to the client, including // accumulated string buffer data as a literal string with whitespace // trimmed from it. func (p *P) EmitLiteralTrim(t ItemType) { p.Emit(t, strings.TrimSpace(p.buffer.asLiteralString())) } -// EmitInterpreted passes a Parser item to the client, including the +// EmitInterpreted passes a Parser item to the client, including // accumulated string buffer data a Go doubled quoted interpreted string // (handling escape codes like \n, \t, \uXXXX, etc.) // This method might return an error, in case there is data in the @@ -38,6 +57,19 @@ func (p *P) EmitInterpreted(t ItemType) error { return nil } +// Error is used as the error type when parsing errors occur. 
+// The error includes some extra meta information to allow for useful +// error messages to the user. +type Error struct { + Message string + Row int + Column int +} + +func (err *Error) Error() string { + return err.Message +} + // EmitError emits a Parser error item to the client. func (p *P) EmitError(format string, args ...interface{}) { message := fmt.Sprintf(format, args...) @@ -51,17 +83,17 @@ func (p *P) UnexpectedInput() { r, _, ok := p.peek(0) switch { case ok: - p.EmitError("unexpected character %q%s", r, p.fmtExpects()) + p.EmitError("unexpected character %q%s", r, fmtExpects(p)) case r == EOF: - p.EmitError("unexpected end of file%s", p.fmtExpects()) + p.EmitError("unexpected end of file%s", fmtExpects(p)) case r == INVALID: - p.EmitError("invalid UTF8 character in input%s", p.fmtExpects()) + p.EmitError("invalid UTF8 character in input%s", fmtExpects(p)) default: panic("Unhandled output from peek()") } } -func (p *P) fmtExpects() string { +func fmtExpects(p *P) string { if p.expecting == "" { return "" } diff --git a/parsekit/internals.go b/parsekit/internals.go deleted file mode 100644 index ac50d94..0000000 --- a/parsekit/internals.go +++ /dev/null @@ -1,95 +0,0 @@ -package parsekit - -import ( - "unicode/utf8" -) - -// P holds the internal state of the parser. 
-type P struct { - state StateFn // the function that handles the current state - nextState StateFn // the function that will handle the next state - stack []StateFn // state function stack, for nested parsing - input string // the scanned input - len int // the total length of the input in bytes - pos int // current byte scanning position in the input - newline bool // keep track of when we have scanned a newline - cursorRow int // current row number in the input - cursorColumn int // current column position in the input - expecting string // a description of what the current state expects to find - buffer stringBuffer // an efficient buffer, used to build string values - items chan Item // channel of resulting Parser items - item Item // the current item as reached by Next() and retrieved by Get() - err *Error // an error when lexing failed, retrieved by Error() -} - -// peek returns but does not advance the cursor to the next rune(s) in the input. -// Returns the rune, its width in bytes and a boolean. -// The boolean will be false in case no upcoming rune can be peeked -// (end of data or invalid UTF8 character). -func (p *P) peek(offsetInBytes int) (rune, int, bool) { - r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) - return handleRuneError(r, w) -} - -// handleRuneError is used to normale rune value in case of errors. -// When an error occurs, then utf8.RuneError will be in the rune. -// This can however indicate one of two situations: -// * w == 0: end of file is reached -// * w == 1: invalid UTF character on input -// This function lets these two cases return respectively the -// package's own EOF or INVALID runes, to make it easy for client -// code to distinct between these two cases. 
-func handleRuneError(r rune, w int) (rune, int, bool) { - if r == utf8.RuneError { - if w == 0 { - return EOF, 0, false - } - return INVALID, w, false - } - return r, w, true -} - -// EOF is a special rune, which is used to indicate an end of file when -// reading a character from the input. -// It can be treated as a rune when writing parsing rules, so a valid way to -// say 'I now expect the end of the file' is using something like: -// if (p.On(c.Rune(EOF)).Skip()) { ... } -const EOF rune = -1 - -// INVALID is a special rune, which is used to indicate an invalid UTF8 -// rune on the input. -const INVALID rune = utf8.RuneError - -// StateFn defines the type of function that can be used to -// handle a parser state. -type StateFn func(*P) - -// ItemType represents the type of a parser Item. -type ItemType int - -// ItemEOF is a built-in parser item type that is used for flagging that the -// end of the input was reached. -const ItemEOF ItemType = -1 - -// ItemError is a built-in parser item type that is used for flagging that -// an error has occurred during parsing. -const ItemError ItemType = -2 - -// Item represents an item returned from the parser. -type Item struct { - Type ItemType - Value string -} - -// Error is used as the error type when parsing errors occur. -// The error includes some extra meta information to allow for useful -// error messages to the user. 
-type Error struct { - Message string - Row int - Column int -} - -func (err *Error) Error() string { - return err.Message -} diff --git a/parsekit/matchers.go b/parsekit/matchers.go index 17d7712..0ffa575 100644 --- a/parsekit/matchers.go +++ b/parsekit/matchers.go @@ -6,8 +6,7 @@ import ( ) // Not in need of it myself, but nice to have I guess: -// - NotFollowedBy -// - Separated +// - LookAhead // MatchDialog is used by Matcher implementations as a means // to retrieve data to match against and to report back @@ -92,6 +91,7 @@ type Matcher interface { } type matcherConstructors struct { + EndOfFile func() MatchEndOfFile Any func() MatchAny Rune func(rune) MatchRune RuneRange func(rune, rune) MatchRuneRange @@ -99,20 +99,28 @@ type matcherConstructors struct { String func(string) MatchSequence StringNoCase func(string) MatchSequence AnyOf func(...Matcher) MatchAnyOf - Repeat func(int, Matcher) MatchRepeat - Sequence func(...Matcher) MatchSequence - ZeroOrMore func(Matcher) MatchZeroOrMore - OneOrMore func(Matcher) MatchOneOrMore + Not func(Matcher) MatchNot Optional func(Matcher) MatchOptional + Sequence func(...Matcher) MatchSequence + Repeat func(int, Matcher) MatchRepeat + Min func(int, Matcher) MatchRepeat + Max func(int, Matcher) MatchRepeat + Bounded func(int, int, Matcher) MatchRepeat + ZeroOrMore func(Matcher) MatchRepeat + OneOrMore func(Matcher) MatchRepeat + Separated func(Matcher, Matcher) MatchSeparated Drop func(Matcher) MatchDrop } // C provides access to a wide range of parser/combinator -// constructors that can be used to build matching expressions. +// constructorshat can be used to build matching expressions. 
// When using C in your own parser, then it is advised to create // an alias in your own package for easy reference: // var c = parsekit.C var C = matcherConstructors{ + EndOfFile: func() MatchEndOfFile { + return MatchEndOfFile{} + }, Any: func() MatchAny { return MatchAny{} }, @@ -130,44 +138,73 @@ var C = matcherConstructors{ return MatchAnyOf{m} }, String: func(s string) MatchSequence { - m := make([]Matcher, len(s)) - for i, r := range s { - m[i] = MatchRune{r} + var m = []Matcher{} + for _, r := range s { + m = append(m, MatchRune{r}) } return MatchSequence{m} }, StringNoCase: func(s string) MatchSequence { - m := make([]Matcher, len(s)) - for i, r := range s { + var m = []Matcher{} + for _, r := range s { u := MatchRune{unicode.ToUpper(r)} l := MatchRune{unicode.ToLower(r)} - m[i] = MatchAnyOf{[]Matcher{u, l}} + m = append(m, MatchAnyOf{[]Matcher{u, l}}) } return MatchSequence{m} }, - AnyOf: func(matchers ...Matcher) MatchAnyOf { - return MatchAnyOf{matchers} - }, - Repeat: func(count int, matcher Matcher) MatchRepeat { - return MatchRepeat{count, matcher} - }, - Sequence: func(matchers ...Matcher) MatchSequence { - return MatchSequence{matchers} - }, - OneOrMore: func(matcher Matcher) MatchOneOrMore { - return MatchOneOrMore{matcher} - }, - ZeroOrMore: func(matcher Matcher) MatchZeroOrMore { - return MatchZeroOrMore{matcher} - }, Optional: func(matcher Matcher) MatchOptional { return MatchOptional{matcher} }, + Not: func(matcher Matcher) MatchNot { + return MatchNot{matcher} + }, + AnyOf: func(matchers ...Matcher) MatchAnyOf { + return MatchAnyOf{matchers} + }, + Sequence: func(matchers ...Matcher) MatchSequence { + return MatchSequence{matchers} + }, + Repeat: func(count int, matcher Matcher) MatchRepeat { + return MatchRepeat{count, count, matcher} + }, + Min: func(min int, matcher Matcher) MatchRepeat { + return MatchRepeat{min, -1, matcher} + }, + Max: func(max int, matcher Matcher) MatchRepeat { + return MatchRepeat{-1, max, matcher} + }, + Bounded: 
func(min int, max int, matcher Matcher) MatchRepeat { + return MatchRepeat{min, max, matcher} + }, + OneOrMore: func(matcher Matcher) MatchRepeat { + return MatchRepeat{1, -1, matcher} + }, + ZeroOrMore: func(matcher Matcher) MatchRepeat { + return MatchRepeat{0, -1, matcher} + }, + Separated: func(separator Matcher, matcher Matcher) MatchSeparated { + return MatchSeparated{separator, matcher} + }, Drop: func(matcher Matcher) MatchDrop { return MatchDrop{matcher} }, } +type MatchEndOfFile struct{} + +func (c MatchEndOfFile) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return !ok && r == EOF +} + +type MatchInvalidRune struct{} + +func (c MatchInvalidRune) Match(m *MatchDialog) bool { + r, ok := m.NextRune() + return !ok && r == INVALID +} + type MatchAny struct{} func (c MatchAny) Match(m *MatchDialog) bool { @@ -175,6 +212,31 @@ func (c MatchAny) Match(m *MatchDialog) bool { return ok } +type MatchNot struct { + matcher Matcher +} + +func (c MatchNot) Match(m *MatchDialog) bool { + child := m.Fork() + if !c.matcher.Match(child) { + child.Merge() + return true + } + return false +} + +type MatchOptional struct { + matcher Matcher +} + +func (c MatchOptional) Match(m *MatchDialog) bool { + child := m.Fork() + if c.matcher.Match(child) { + child.Merge() + } + return true +} + type MatchRune struct { match rune } @@ -209,18 +271,41 @@ func (c MatchAnyOf) Match(m *MatchDialog) bool { } type MatchRepeat struct { - count int + min int + max int matcher Matcher } func (c MatchRepeat) Match(m *MatchDialog) bool { child := m.Fork() - for i := 0; i < c.count; i++ { + if c.min >= 0 && c.max >= 0 && c.min > c.max { + panic("MatchRepeat definition error: max must not be < min") + } + total := 0 + // Specified min: check for the minimal required amount of matches. + for total < c.min { + total++ if !c.matcher.Match(child) { return false } } + // No specified max: include the rest of the available matches. 
+ if c.max < 0 { + child.Merge() + for c.matcher.Match(child) { + child.Merge() + } + return true + } + // Specified max: include the rest of the availble matches, up to the max. child.Merge() + for total < c.max { + total++ + if !c.matcher.Match(child) { + break + } + child.Merge() + } return true } @@ -239,40 +324,14 @@ func (c MatchSequence) Match(m *MatchDialog) bool { return true } -type MatchOneOrMore struct { - matcher Matcher +type MatchSeparated struct { + separator Matcher + matcher Matcher } -func (c MatchOneOrMore) Match(m *MatchDialog) bool { - child := m.Fork() - for c.matcher.Match(child) { - child.Merge() - } - return len(m.runes) > 0 -} - -type MatchZeroOrMore struct { - matcher Matcher -} - -func (c MatchZeroOrMore) Match(m *MatchDialog) bool { - child := m.Fork() - for c.matcher.Match(child) { - child.Merge() - } - return true -} - -type MatchOptional struct { - matcher Matcher -} - -func (c MatchOptional) Match(m *MatchDialog) bool { - child := m.Fork() - if c.matcher.Match(child) { - child.Merge() - } - return true +func (c MatchSeparated) Match(m *MatchDialog) bool { + seq := C.Sequence(c.matcher, C.ZeroOrMore(C.Sequence(c.separator, c.matcher))) + return seq.Match(m) } type MatchDrop struct { diff --git a/parsekit/matchers_test.go b/parsekit/matchers_test.go index 2d104e0..ff4efaa 100644 --- a/parsekit/matchers_test.go +++ b/parsekit/matchers_test.go @@ -15,7 +15,7 @@ func newParser(input string, matcher p.Matcher) *p.P { p.Expects("MATCH") if p.On(matcher).Accept() { p.EmitLiteral(TestItem) - p.Repeat() + p.RouteRepeat() } } return p.New(input, stateFn) @@ -120,20 +120,19 @@ func TestMatchString(t *testing.T) { } } -// TODO -// func TestMatchStringNoCase(t *testing.T) { -// p := newParser("HellÖ, world!", c.StringNoCase("hellö")) -// r, err, ok := p.Next() -// if !ok { -// t.Fatalf("Parsing failed: %s", err) -// } -// if r.Type != TestItem { -// t.Error("Parser item type not expected TestTitem") -// } -// if r.Value != "Hello" { -// 
t.Errorf("Parser item value is %q instead of expected \"Hello\"", r.Value) -// } -// } +func TestMatchStringNoCase(t *testing.T) { + p := newParser("HellÖ, world!", c.StringNoCase("hellö")) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Type != TestItem { + t.Error("Parser item type not expected TestTitem") + } + if r.Value != "HellÖ" { + t.Errorf("Parser item value is %q instead of expected \"HellÖ\"", r.Value) + } +} func TestMatchRunes(t *testing.T) { m := c.Runes('+', '-', '*', '/') @@ -156,6 +155,29 @@ func TestMatchRunes(t *testing.T) { } } +func TestMatchNot(t *testing.T) { + p := newParser("aabc", c.Not(c.Rune('b'))) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s", err) + } + if r.Value != "a" { + t.Errorf("Parser item value is %q instead of expected \"a\"", r.Value) + } +} + +func TestMatchNot_Mismatch(t *testing.T) { + p := newParser("aabc", c.Not(c.Rune('a'))) + _, err, ok := p.Next() + if ok { + t.Fatalf("Parsing unexpectedly succeeded") + } + expected := "unexpected character 'a' (expected MATCH)" + if err.Error() != expected { + t.Fatalf("Unexpected error from parser:\nexpectd: %s\nactual: %s\n", expected, err.Error()) + } +} + func TestMatchAnyOf(t *testing.T) { p := newParser("abc", c.AnyOf(c.Rune('a'), c.Rune('b'))) r, err, ok := p.Next() @@ -192,6 +214,30 @@ func TestMatchRepeat(t *testing.T) { } } +func TestMatchRepeat_Min(t *testing.T) { + p := newParser("1111112345", c.Min(4, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value != "111111" { + t.Errorf("Parser item value is %q instead of expected \"111111\"", r.Value) + } +} + +func TestMatchRepeat_Max(t *testing.T) { + p := newParser("1111112345", c.Max(4, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value != "1111" { + t.Errorf("Parser item value is %q instead of expected \"1111\"", r.Value) + } +} + +func TestMatchRepeat_Bounded(t *testing.T) { + p := newParser("1111112345", c.Bounded(3, 5, c.Rune('1'))) + r, _, _ := p.Next() + if r.Value 
!= "11111" { + t.Errorf("Parser item value is %q instead of expected \"11111\"", r.Value) + } +} + func TestMatchRepeat_Mismatch(t *testing.T) { p := newParser("xxxyyyy", c.Repeat(4, c.Rune('x'))) _, err, ok := p.Next() @@ -282,6 +328,21 @@ func TestMatchDrop(t *testing.T) { t.Errorf("Parser item value is %q instead of expected \"x\"", r.Value) } } + +func TestMatchSeparated(t *testing.T) { + number := c.Bounded(1, 3, c.RuneRange('0', '9')) + separators := c.Runes('|', ';', ',') + separated_numbers := c.Separated(separators, number) + p := newParser("1,2;3|44,55|66;777,abc", separated_numbers) + r, err, ok := p.Next() + if !ok { + t.Fatalf("Parsing failed: %s at row: %d, column %d\n", err, err.Row, err.Column) + } + if r.Value != "1,2;3|44,55|66;777" { + t.Errorf("Parser item value is %q instead of expected \"1,2;3|44,55|66;777\"", r.Value) + } +} + func TestMixAndMatch(t *testing.T) { hex := c.AnyOf(c.RuneRange('0', '9'), c.RuneRange('a', 'f'), c.RuneRange('A', 'F')) backslash := c.Rune('\\') diff --git a/parsekit/matching.go b/parsekit/matching.go index 5da0567..8185e79 100644 --- a/parsekit/matching.go +++ b/parsekit/matching.go @@ -3,7 +3,7 @@ package parsekit // Expects is used to let a state function describe what input it is expecting. // This expectation is used in error messages to make them more descriptive. // -// Also, when defining an expectation inside a StateFn, you do not need +// Also, when defining an expectation inside a StateHandler, you do not need // to handle unexpected input yourself. When the end of the function is // reached without setting the next state, an automatic error will be // emitted. This error differentiates between issues: @@ -14,47 +14,18 @@ func (p *P) Expects(description string) { p.expecting = description } -// AtEndOfFile returns true when there is no more data available in the input. 
-func (p *P) AtEndOfFile() bool { - return p.pos >= p.len -} - -// AtEndOfLine returns true when the cursor is either at the end of the line -// or at the end of the file. The cursor is not moved to a new position -// by this method. -func (p *P) AtEndOfLine() bool { - return p.AtEndOfFile() || - p.On(C.String("\r\n")).Stay() || - p.On(C.Rune('\n')).Stay() -} - -// SkipEndOfLine returns true when the cursor is either at the end of the line -// or at the end of the file. Additionally, when not at the end of the file, -// the cursor is moved forward to beyond the newline. -func (p *P) SkipEndOfLine() bool { - return p.AtEndOfFile() || - p.On(C.String("\r\n")).Skip() || - p.On(C.Rune('\n')).Skip() -} - -// AcceptEndOfLine returns true when the cursor is either at the end of the line -// or at the end of the file. When not at the end of the file, a normalized -// newline (only a '\n' character, even with '\r\n' on the input) -// is added to the string buffer. -func (p *P) AcceptEndOfLine() bool { - if p.AtEndOfFile() { - return true - } - if p.SkipEndOfLine() { - p.buffer.writeRune('\n') - return true - } - return false -} - -func (p *P) On(m Matcher) *action { - runes, widths, ok := p.Match(m) - return &action{ +// On checks if the current input matches the provided Matcher. +// It returns a MatchAction struct, which provides methods that +// can be used to tell the parser what to do with a match. +// +// The intended way to use this, is by chaining some methods, +// for example: p.On(...).Accept() +// The chained methods will as a whole return a boolean value, +// indicating whether or not a match was found and processed. 
+func (p *P) On(m Matcher) *MatchAction { + runes, widths, ok := p.match(m) + p.LastMatch = string(runes) + return &MatchAction{ p: p, runes: runes, widths: widths, @@ -62,24 +33,29 @@ func (p *P) On(m Matcher) *action { } } -func (p *P) Match(matcher Matcher) ([]rune, []int, bool) { - return p.match(0, matcher) -} - -func (p *P) match(offset int, matcher Matcher) ([]rune, []int, bool) { +// Match checks if the provided Matcher matches the current input. +// Returns a slice of matching runes, a slice of their respective +// byte widths and a boolean. +// The boolean will be false and the slices will be empty in case +// the input did not match. +func (p *P) match(matcher Matcher) ([]rune, []int, bool) { m := &MatchDialog{p: p} ok := matcher.Match(m) return m.runes, m.widths, ok } -type action struct { +type MatchAction struct { p *P runes []rune widths []int ok bool } -func (a *action) Accept() bool { +// Accept tells the parser to move the cursor past a match that was found, +// and to store the input that matched in the string buffer. +// Returns true in case a match was found. +// When no match was found, then no action is taken and false is returned. +func (a *MatchAction) Accept() bool { if a.ok { for i, r := range a.runes { a.p.buffer.writeRune(r) @@ -89,7 +65,11 @@ func (a *action) Accept() bool { return a.ok } -func (a *action) Skip() bool { +// Skip tells the parser to move the cursor past a match that was found, +// without storing the actual match in the string buffer. +// Returns true in case a match was found. +// When no match was found, then no action is taken and false is returned. +func (a *MatchAction) Skip() bool { if a.ok { for i, r := range a.runes { type C struct { @@ -102,13 +82,31 @@ func (a *action) Skip() bool { return a.ok } -func (a *action) Stay() bool { +// Stay tells the parser to not move the cursor after finding a match. +// Returns true in case a match was found, false otherwise. 
+func (a *MatchAction) Stay() bool { return a.ok } -// advanceCursor advances the rune cursor one position in the -// input data. While doing so, it keeps tracks of newlines, -// so we can report on row + column positions on error. +// RouteTo is a shortcut for p.On(...).Stay() + p.RouteTo(...). +func (a *MatchAction) RouteTo(state StateHandler) bool { + if a.ok { + a.p.RouteTo(state) + } + return a.ok +} + +// RouteReturn is a shortcut for p.On(...).Stay() + p.RouteReturn(). +func (a *MatchAction) RouteReturn() bool { + if a.ok { + a.p.RouteReturn() + } + return a.ok +} + +// advanceCursor advances the rune cursor one position in the input data. +// While doing so, it keeps tracks of newlines, so we can report on +// row + column positions on error. func (p *P) advanceCursor(r rune, w int) { p.pos += w if p.newline { @@ -119,17 +117,3 @@ func (p *P) advanceCursor(r rune, w int) { } p.newline = r == '\n' } - -func (a *action) RouteTo(state StateFn) bool { - if a.ok { - a.p.RouteTo(state) - } - return a.ok -} - -func (a *action) RouteReturn() bool { - if a.ok { - a.p.RouteReturn() - } - return a.ok -} diff --git a/parsekit/parsekit.go b/parsekit/parsekit.go index c8fb078..b4f1805 100644 --- a/parsekit/parsekit.go +++ b/parsekit/parsekit.go @@ -6,13 +6,36 @@ import ( "runtime" ) +// P holds the internal state of the parser. 
+type P struct { + state StateHandler // the function that handles the current state + nextState StateHandler // the function that will handle the next state + stack []StateHandler // state function stack, for nested parsing + input string // the scanned input + len int // the total length of the input in bytes + pos int // current byte scanning position in the input + newline bool // keep track of when we have scanned a newline + cursorRow int // current row number in the input + cursorColumn int // current column position in the input + expecting string // a description of what the current state expects to find + buffer stringBuffer // an efficient buffer, used to build string values + LastMatch string // a string representation of the last matched input data + items chan Item // channel of resulting Parser items + item Item // the current item as reached by Next() and retrieved by Get() + err *Error // an error when lexing failed, retrieved by Error() +} + +// StateHandler defines the type of function that can be used to +// handle a parser state. +type StateHandler func(*P) + // New takes an input string and a start state, // and initializes the parser for it. -func New(input string, startState StateFn) *P { +func New(input string, start StateHandler) *P { return &P{ input: input, len: len(input), - nextState: startState, + nextState: start, items: make(chan Item, 2), } } @@ -25,51 +48,80 @@ func (p *P) Next() (Item, *Error, bool) { for { select { case i := <-p.items: - switch { - case i.Type == ItemEOF: - return i, nil, false - case i.Type == ItemError: - p.err = &Error{i.Value, p.cursorRow, p.cursorColumn} - return i, p.err, false - default: - p.item = i - return i, nil, true - } + return p.makeReturnValues(i) default: - // When implementing a parser, a state function must provide - // a routing decision in every state function execution. - // When no route is specified, then it is considered a but - // in the parser implementation. 
- // An exception is when a function specified its expectation - // using the Expects() method. In that case, an unexpected - // input error is emitted. - if p.nextState == nil { - if p.expecting != "" { - p.UnexpectedInput() - continue - } else { - name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name() - panic(fmt.Sprintf("StateFn implementation bug: %s did not set next state or input expectation", name)) - } - } - p.state = p.nextState - p.nextState = nil - p.expecting = "" - p.state(p) + p.runStatusHandler() } } } -// ToArray returns Parser items as an array (mainly intended for testing purposes) -// When an error occurs during scanning, a partial result will be -// returned, accompanied by the error that occurred. -func (p *P) ToArray() ([]Item, *Error) { - var items []Item - for { - item, err, more := p.Next() - if !more { - return items, err - } - items = append(items, item) +// runStatusHandler moves the parser, which is bascially a state machine, +// to its next status. It does so by invoking a function of the +// type StateHandler. This function represents the current status. +func (p *P) runStatusHandler() { + if state, ok := p.getNextStateHandler(); ok { + p.invokeNextStatusHandler(state) + } +} + +// getNextStateHandler determintes the next StatusHandler to invoke in order +// to move the parsing state machine one step further. +// +// When implementing a parser, the StateHandler functions must provide +// a routing decision in every invocation. A routing decision is one +// of the following: +// +// * A route is specified explicitly, which means that the next StatusHandler +// function to invoke is registered during the StateHandler function +// invocation. For example: p.RouteTo(nextStatus) +// +// * A route is specified implicitly, which means that a previous StateHandler +// invocation has registered the followup route for the current state. 
+// For example: p.RouteTo(nextStatus).ThenTo(otherStatus) +// In this example, the nextStatus StateHandler will not have to specify +// a route explicitly, but otherStatus will be used implicitly after +// the nextStatus function has returned. +// +// * An expectation is registered by the StatusHandler. +// For example: p.Expects("a cool thing") +// When the StatusHandler returns without having specified a route, this +// expectation is used to generate an "unexpected input" error message. +// +// When no routing decision is provided by a StateHandler, then this is +// considered a bug in the state handler, and the parser will panic. +func (p *P) getNextStateHandler() (StateHandler, bool) { + switch { + case p.nextState != nil: + return p.nextState, true + case len(p.stack) > 0: + return p.popState(), true + case p.expecting != "": + p.UnexpectedInput() + return nil, false + default: + name := runtime.FuncForPC(reflect.ValueOf(p.state).Pointer()).Name() + panic(fmt.Sprintf("StateHandler %s did not provide a routing decision", name)) + } +} + +// invokeNextStatusHandler moves the parser state to the provided state +// and invokes the StatusHandler function. +func (p *P) invokeNextStatusHandler(state StateHandler) { + p.state = state + p.nextState = nil + p.expecting = "" + p.state(p) +} + +func (p *P) makeReturnValues(i Item) (Item, *Error, bool) { + switch { + case i.Type == ItemEOF: + return i, nil, false + case i.Type == ItemError: + p.err = &Error{i.Value, p.cursorRow, p.cursorColumn} + return i, p.err, false + default: + p.item = i + return i, nil, true } } diff --git a/parsekit/peek.go b/parsekit/peek.go new file mode 100644 index 0000000..c45e2d5 --- /dev/null +++ b/parsekit/peek.go @@ -0,0 +1,43 @@ +package parsekit + +import ( + "unicode/utf8" +) + +// peek returns but does not advance the cursor to the next rune(s) in the input. +// Returns the rune, its width in bytes and a boolean. 
+// The boolean will be false in case no upcoming rune can be peeked +// (end of data or invalid UTF8 character). +func (p *P) peek(offsetInBytes int) (rune, int, bool) { + r, w := utf8.DecodeRuneInString(p.input[p.pos+offsetInBytes:]) + return handleRuneError(r, w) +} + +// handleRuneError is used to normale rune value in case of errors. +// When an error occurs, then utf8.RuneError will be in the rune. +// This can however indicate one of two situations: +// * w == 0: end of file is reached +// * w == 1: invalid UTF character on input +// This function lets these two cases return respectively the +// package's own EOF or INVALID runes, to make it easy for client +// code to distinct between these two cases. +func handleRuneError(r rune, w int) (rune, int, bool) { + if r == utf8.RuneError { + if w == 0 { + return EOF, 0, false + } + return INVALID, w, false + } + return r, w, true +} + +// EOF is a special rune, which is used to indicate an end of file when +// reading a character from the input. +// It can be treated as a rune when writing parsing rules, so a valid way to +// say 'I now expect the end of the file' is using something like: +// if (p.On(c.Rune(EOF)).Skip()) { ... } +const EOF rune = -1 + +// INVALID is a special rune, which is used to indicate an invalid UTF8 +// rune on the input. +const INVALID rune = utf8.RuneError diff --git a/parsekit/staterouting.go b/parsekit/staterouting.go index 3182480..c629aa3 100644 --- a/parsekit/staterouting.go +++ b/parsekit/staterouting.go @@ -1,40 +1,58 @@ package parsekit -func (p *P) Repeat() { - p.nextState = p.state - return +// RouteRepeat indicates that on the next parsing cycle, +// the current StateHandler must be invoked again. +func (p *P) RouteRepeat() { + p.RouteTo(p.state) } -func (p *P) RouteTo(state StateFn) *routeFollowup { +// RouteTo tells the parser what StateHandler function to invoke +// in the next parsing cycle. 
+func (p *P) RouteTo(state StateHandler) *RouteFollowup { p.nextState = state - return &routeFollowup{p} + return &RouteFollowup{p} } -type routeFollowup struct { +// RouteFollowup chains parsing routes. +// It allows for routing code like p.RouteTo(handlerA).ThenTo(handlerB). +type RouteFollowup struct { p *P } -func (r *routeFollowup) ThenTo(state StateFn) *routeFollowup { +// ThenTo schedules a StateHandler that must be invoked +// after the RouteTo StateHandler has been completed. +// For example: p.RouteTo(handlerA).ThenTo(handlerB) +func (r *RouteFollowup) ThenTo(state StateHandler) { r.p.pushState(state) - return r } -func (r *routeFollowup) ThenReturnHere() { +// ThenReturnHere schedules the current StateHandler to be +// invoked after the RouteTo StateHandler has been completed. +// For example: p.RouteTo(handlerA).ThenReturnHere() +func (r *RouteFollowup) ThenReturnHere() { r.p.pushState(r.p.state) } +// RouteReturn tells the parser that on the next cycle the +// next scheduled route must be invoked. +// Using this method is optional. When implementating a +// StateHandler that is used as a sort of subroutine (using +// constructions like p.RouteTo(subroutine).ThenReturnHere()), +// then you can refrain from providing a routing decision +// from that handler. The parser will automatically assume +// a RouteReturn in that case. func (p *P) RouteReturn() { p.nextState = p.popState() } // PushState adds the state function to the state stack. // This is used for implementing nested parsing. -func (p *P) pushState(state StateFn) { +func (p *P) pushState(state StateHandler) { p.stack = append(p.stack, state) } // PopState pops the last pushed state from the state stack. 
-func (p *P) popState() StateFn { +func (p *P) popState() StateHandler { last := len(p.stack) - 1 head, tail := p.stack[:last], p.stack[last] p.stack = head diff --git a/parser/syn_comments.go b/parser/comment.go similarity index 71% rename from parser/syn_comments.go rename to parser/comment.go index 88210dd..07c4912 100644 --- a/parser/syn_comments.go +++ b/parser/comment.go @@ -6,18 +6,20 @@ import ( // A '#' hash symbol marks the rest of the line as a comment. func startComment(p *parsekit.P) { - p.On(c.OneOrMore(hash)).Skip() - p.RouteTo(commentContents) + p.Expects("start of comment") + if p.On(c.OneOrMore(hash)).Skip() { + p.RouteTo(commentContents) + } } // All characters up to the end of the line are included in the comment. func commentContents(p *parsekit.P) { p.Expects("comment contents") switch { - case p.AtEndOfLine() || p.On(endOfLine).Skip(): // TODO drop AtEndOfLine support + case p.On(endOfLine).Skip(): p.EmitLiteralTrim(ItemComment) p.RouteReturn() case p.On(any).Accept(): - p.Repeat() + p.RouteRepeat() } } diff --git a/parser/syn_comments_test.go b/parser/comment_test.go similarity index 100% rename from parser/syn_comments_test.go rename to parser/comment_test.go diff --git a/parser/syn_eof.go b/parser/eof.go similarity index 72% rename from parser/syn_eof.go rename to parser/eof.go index ebcaa40..97c6bb5 100644 --- a/parser/syn_eof.go +++ b/parser/eof.go @@ -2,9 +2,10 @@ package parser import "github.com/mmakaay/toml/parsekit" +// TODO move into parsekit func endOfFile(p *parsekit.P) { p.Expects("end of file") - if p.AtEndOfFile() { + if p.On(c.EndOfFile()).Stay() { p.Emit(parsekit.ItemEOF, "EOF") } } diff --git a/parser/helpers_test.go b/parser/helpers_test.go index fa07691..6e6dd7e 100644 --- a/parser/helpers_test.go +++ b/parser/helpers_test.go @@ -22,8 +22,23 @@ func runStatesTs(t *testing.T, tests []statesT) { } } +// parseItemsToArray returns Parser items as an array.
+// When an error occurs during scanning, a partial result will be +// returned, accompanied by the error that occurred. +func parseItemsToArray(p *parsekit.P) ([]parsekit.Item, *parsekit.Error) { + var items []parsekit.Item + for { + item, err, more := p.Next() + if !more { + return items, err + } + items = append(items, item) + } +} + func runStatesT(t *testing.T, c statesT) { - l, err := parser.NewParser(c.in).ToArray() + p := parser.NewParser(c.in) + l, err := parseItemsToArray(p) if err == nil && c.err != "" { t.Errorf("[%s] Expected error '%s', but no error occurred", c.name, c.err) } diff --git a/parser/syn_keyvaluepair.go b/parser/keyvaluepair.go similarity index 100% rename from parser/syn_keyvaluepair.go rename to parser/keyvaluepair.go diff --git a/parser/syn_keyvaluepair_test.go b/parser/keyvaluepair_test.go similarity index 100% rename from parser/syn_keyvaluepair_test.go rename to parser/keyvaluepair_test.go diff --git a/parser/parser.go b/parser/toml.go similarity index 95% rename from parser/parser.go rename to parser/toml.go index d62479e..8b86a65 100644 --- a/parser/parser.go +++ b/parser/toml.go @@ -33,7 +33,7 @@ var ( whitespace = c.OneOrMore(c.AnyOf(space, tab)) whitespaceOrNewlines = c.OneOrMore(c.AnyOf(space, tab, carriageReturn, lineFeed)) optionalWhitespace = c.Optional(whitespace) - endOfLine = c.AnyOf(lineFeed, c.Rune(parsekit.EOF)) + endOfLine = c.AnyOf(lineFeed, c.EndOfFile()) ) // NewParser creates a new parser, using the provided input string diff --git a/parser/parser_test.go b/parser/toml_test.go similarity index 90% rename from parser/parser_test.go rename to parser/toml_test.go index 9561a8a..f983ea6 100644 --- a/parser/parser_test.go +++ b/parser/toml_test.go @@ -11,7 +11,8 @@ func TestEmptyInput(t *testing.T) { } func TestErrorsIncludeLineAndRowPosition(t *testing.T) { - _, err := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc").ToArray() + p := parser.NewParser("# 12345 abcde\t\n\n\n# 67890\r\n# 12345\xbc") + _, 
err := parseItemsToArray(p) t.Logf("Got error: %s", err.Error()) if err.Row != 4 { t.Errorf("Unexpected line number: %d (expected %d)", err.Row, 4) @@ -23,7 +24,7 @@ func TestErrorsIncludeLineAndRowPosition(t *testing.T) { func TestInvalidUtf8Data(t *testing.T) { runStatesTs(t, []statesT{ - {"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected comment contents)"}, + {"inside comment", "# \xbc", "", "invalid UTF8 character in input (expected end of file)"}, {"bare key 1", "\xbc", "", "invalid UTF8 character in input (expected end of file)"}, {"bare key 2", "key\xbc", "[key]", "invalid UTF8 character in input (expected a value assignment)"}, {"start of value", "key=\xbc", "[key]=", "invalid UTF8 character in input (expected a value)"}, diff --git a/parser/syn_strings.go b/parser/value_string.go similarity index 91% rename from parser/syn_strings.go rename to parser/value_string.go index f0b8466..1d3705f 100644 --- a/parser/syn_strings.go +++ b/parser/value_string.go @@ -42,21 +42,6 @@ func startString(p *parsekit.P) { } } -func parseBasicString(p *parsekit.P) { - p.Expects("string contents") - switch { - case p.On(charThatMustBeEscaped).Stay(): - r, _, _ := p.Match(charThatMustBeEscaped) - p.EmitError("Invalid character in basic string: %q (must be escaped)", r[0]) - case p.On(validEscape).Accept(): - p.Repeat() - case p.On(backslash).Stay() || p.On(doubleQuote).Stay(): - p.RouteReturn() - case p.On(any).Accept(): - p.Repeat() - } -} - func startBasicString(p *parsekit.P) { p.Expects("a basic string") if p.On(doubleQuote).Skip() { @@ -64,12 +49,27 @@ func startBasicString(p *parsekit.P) { } } +func parseBasicString(p *parsekit.P) { + p.Expects("string contents") + switch { + case p.On(charThatMustBeEscaped).Stay(): + p.EmitError("Invalid character in basic string: %q (must be escaped)", p.LastMatch) + case p.On(validEscape).Accept(): + p.RouteRepeat() + case p.On(backslash).RouteReturn(): + case p.On(doubleQuote).RouteReturn(): + case 
p.On(any).Accept(): + p.RouteRepeat() + } +} + // Specific handling of input for basic strings. // * A double quote ends the string // * No additional \escape sequences are allowed. What the spec say about this: // "All other escape sequences [..] are reserved and, if used, TOML should // produce an error."" func basicStringSpecifics(p *parsekit.P) { + p.Expects("string contents") switch { case p.On(doubleQuote).Skip(): if err := p.EmitInterpreted(ItemString); err != nil { // TODO testcase? @@ -79,8 +79,6 @@ func basicStringSpecifics(p *parsekit.P) { } case p.On(backslash).Stay(): p.EmitError("Invalid escape sequence") - default: - panic("String parsing should not have ended up here") } } diff --git a/parser/syn_strings_test.go b/parser/value_tring_test.go similarity index 94% rename from parser/syn_strings_test.go rename to parser/value_tring_test.go index cc67b2e..62ddf18 100644 --- a/parser/syn_strings_test.go +++ b/parser/value_tring_test.go @@ -13,9 +13,9 @@ func TestUnterminatedBasicString(t *testing.T) { func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { runStatesTs(t, []statesT{ - {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: '\x00' (must be escaped)`}, - {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: '\n' (must be escaped)`}, - {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: '\u007f' (must be escaped)`}, + {"null char", "a=\"\u0000\"", "[a]=", `Invalid character in basic string: "\x00" (must be escaped)`}, + {"newline", "a=\"b\nc\nd\"", "[a]=", `Invalid character in basic string: "\n" (must be escaped)`}, + {"delete", "a=\"\u007F\"", "[a]=", `Invalid character in basic string: "\u007f" (must be escaped)`}, }) // No need to write all test cases for disallowed characters by hand. 
@@ -23,7 +23,7 @@ func TestBasicStringWithUnescapedControlCharacters(t *testing.T) { name := fmt.Sprintf("control character %x", rune(i)) runStatesT( t, statesT{name, fmt.Sprintf(`_="%c"`, rune(i)), "[_]=", - fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, rune(i))}) + fmt.Sprintf(`Invalid character in basic string: %q (must be escaped)`, string(rune(i)))}) } }