diff --git a/lexer/lexer.no b/lexer/lexer.no deleted file mode 100644 index 7a2d92a..0000000 --- a/lexer/lexer.no +++ /dev/null @@ -1,305 +0,0 @@ -package lexer - -import ( - "fmt" - "strings" - "unicode/utf8" - - "github.com/mmakaay/toml/parser" -) - -// Lexer holds the state of the lexer. -type Lexer struct { - input string // the scanned input - state parser.StateFn // a function that handles the current state - stack []parser.StateFn // state function stack, for nested parsing - len int // the total length of the input in bytes - pos int // current byte scanning position in the input - newline bool // keep track of when we have scanned a newline - cursorRow int // current row number in the input - cursorColumn int // current column position in the input - buffer StringBuffer // an efficient buffer, used to build string values - items chan parser.Item // channel of resulting lexer items - item parser.Item // the current item as reached by Next() and retrieved by Get() - err *Error // an error when lexing failed, retrieved by Error() -} - -// Error is used as the error type when lexing errors occur. -// The error includes some extra meta information to allow for useful -// error messages to the user. -type Error struct { - Message string - Row int - Column int -} - -func (err *Error) Error() string { - return err.Message -} - -// New takes an input string and initializes the lexer for it. -func New(input string) *Lexer { - return &Lexer{ - input: input, - len: len(input), - state: stateKeyValuePair, - items: make(chan parser.Item, 2), - } -} - -// Next advances to the next lexer item in the input string. -// When a valid item was found, then the boolean return parameter will be true. -// On error or when reaching the end of the input, false is returned. -// When an error occurred, it will be set in the error return value, nil otherwise. -func (l *Lexer) Next() (parser.Item, *Error, bool) { - for { - select { - case i := <-l.items: - switch { - case i.Type == ItemEOF: - return i, nil, false - case i.Type == ItemError: - l.err = &Error{i.Value, l.cursorRow, l.cursorColumn} - return i, l.err, false - default: - l.item = i - return i, nil, true - } - default: - l.state = l.state(l) - } - } -} - -// ToArray returns lexer items as an array (mainly intended for testing purposes) -// When an error occurs during scanning, a partial result will be -// returned, accompanied by the error that occurred. -func (l *Lexer) ToArray() ([]parser.Item, *Error) { - var items []parser.Item - for { - item, err, more := l.Next() - if !more { - return items, err - } - items = append(items, item) - } -} - -// pushState adds the state function to its stack. -// This is used for implementing nested parsing. -func (l *Lexer) pushState(state stateFn) { - l.stack = append(l.stack, state) -} - -// popState pops the last pushed state from its stack. -func (l *Lexer) popState() stateFn { - last := len(l.stack) - 1 - head, tail := l.stack[:last], l.stack[last] - l.stack = head - return tail -} - -// atEndOfFile returns true when there is no more data available in the input. -func (l *Lexer) atEndOfFile() bool { - return l.pos >= l.len -} - -// emit passes a lexer item back to the client, including the provided string. -func (l *Lexer) emit(t parser.ItemType, s string) { - l.items <- parser.Item{Type: t, Value: s} - l.buffer.Reset() -} - -// emitLiteral passes a lexer item back to the client, including the accumulated -// string buffer data as a literal string. -func (l *Lexer) emitLiteral(t parser.ItemType) { - l.emit(t, l.buffer.AsLiteralString()) -} - -// emitTrimmedLiteral passes a lexer item back to the client, including the -// accumulated string buffer data as a literal string with whitespace -// trimmed from it. -func (l *Lexer) emitTrimmedLiteral(t parser.ItemType) { - l.emit(t, strings.TrimSpace(l.buffer.AsLiteralString())) -} - -// emitInterpreted passes a lexer item back to the client, including the -// accumulated string buffer data an interpreted string (handling escape -// codes like \n, \t, \uXXXX, etc.) -// This method might return an error, in case there is data in the -// string buffer that is not valid for string interpretation. -func (l *Lexer) emitInterpreted(t parser.ItemType) error { - s, err := l.buffer.AsInterpretedString() - if err != nil { - return err - } - l.emit(t, s) - return nil -} - -// emitError emits a lexer error item back to the client. -func (l *Lexer) emitError(format string, args ...interface{}) stateFn { - message := fmt.Sprintf(format, args...) - l.emit(ItemError, message) - return nil -} - -// peek returns but does not advance to the next rune(s) in the input. -// Returns the rune, its width and a boolean. The boolean will be false in case -// no upcoming rune can be peeked (end of data or invalid UTF8 character). -func (l *Lexer) peek() (rune, int, bool) { - r, w := utf8.DecodeRuneInString(l.input[l.pos:]) - return r, w, r != utf8.RuneError -} - -// peekMulti takes a peek at multiple upcoming runes in the input. -// Returns a slice of runes and a boolean. The boolean will be false in case -// less upcoming runes can be peeked than the requested amount -// (end of data or invalid UTF8 character). -func (l *Lexer) peekMulti(amount int) ([]rune, int, bool) { - width := 0 - var peeked []rune - for i := 0; i < amount; i++ { - r, w := utf8.DecodeRuneInString(l.input[l.pos+width:]) - switch { - case r == utf8.RuneError: - return peeked, width, false - default: - width += w - peeked = append(peeked, r) - } - } - return peeked, width, true -} - -// acceptAny adds the next rune from the input to the string buffer. -// If no rune could be read (end of file or invalid UTF8 data), then -// false is returned. -func (l *Lexer) acceptAny() bool { - if r, ok := l.next(); ok { - l.buffer.WriteRune(r) - return true - } - return false -} - -// accept adds the next rune to the string buffer and returns true if it's -// from the valid set of runes. Otherwise false is returned. -func (l *Lexer) accept(matches ...string) bool { - return l.acceptPattern(matches...) -} - -// AcceptMatching adds the next runes to the string buffer, but only -// if the upcoming runes satisfy the provided pattern. -// When runes were added then true is returned, false otherwise. -func (l *Lexer) acceptPattern(pattern ...string) bool { - return l.progress(func(r rune) { l.buffer.WriteRune(r) }, pattern...) -} - -func (l *Lexer) progress(callback func(rune), matches ...string) bool { - if runes, w, ok := l.match(matches...); ok { - l.pos += w - for _, r := range runes { - callback(r) - l.advanceCursor(r) - } - return true - } - return false -} - -// acceptConsecutive adds consecutive runes from the input to the string -// buffer when they match the rune match. -// If any runes were added then true is returned, false otherwise. -func (l *Lexer) acceptConsecutive(match string) bool { - accepted := false - for l.accept(match) { - accepted = true - } - return accepted -} - -// advanceCursor advances the rune cursor one position in the -// input data. While doing so, it keeps tracks of newlines, -// so we can report on row + column positions on error. -func (l *Lexer) advanceCursor(r rune) { - if l.newline { - l.cursorColumn = 0 - l.cursorRow++ - } else { - l.cursorColumn++ - } - l.newline = r == '\n' -} - -// skip skips runes, but only when all provided matches are satisfied. -// Returns true when one or more runes were skipped. -func (l *Lexer) skipMatching(pattern ...string) bool { - return l.progress(func(r rune) {}, pattern...) -} - -// skipConsecutive skips consecutive runes from the provided match. -// Returns true when one or more runes were skipped. -func (l *Lexer) skipConsecutive(pattern string) bool { - didSkip := false - for l.skipMatching(pattern) { - didSkip = true - } - return didSkip -} - -// upcoming checks if the upcoming runes satisfy the provided rune matches. -// This is a lot like the match method, with the difference that -// this one only returns the boolean value. -func (l *Lexer) upcoming(matches ...string) bool { - _, _, ok := l.match(matches...) - return ok -} - -// next returns the next rune from the input and a boolean indicating if -// reading the input was successful. -// When the end of input is reached, or an invalid UTF8 character is -// read, then false is returned. -func (l *Lexer) next() (rune, bool) { - r, w, ok := l.peek() - if ok { - l.pos += w - l.advanceCursor(r) - return r, true - } - if r == utf8.RuneError && w == 0 { - l.emitError("unexpected end of file") - } else { - l.emitError("invalid UTF8 character") - } - return r, false -} - -// match checks if the upcoming runes satisfy the provided rune matches. -// It returns a slice of runes that were found, their total byte width -// and a boolean indicating whether or not all provided matches matched -// the input data. -func (l *Lexer) match(matches ...string) ([]rune, int, bool) { - peeked, width, ok := l.peekMulti(len(matches)) - if ok { - for i, r := range matches { - if strings.IndexRune(r, peeked[i]) < 0 { - return peeked, width, false - } - } - return peeked, width, true - } - return peeked, width, false -} - -func (l *Lexer) unexpectedInputError(expected string) stateFn { - // next() takes care of emitting errors for ok == false. - if r, ok := l.next(); ok { - return l.emitError(fmt.Sprintf("unexpected character %q (expected %s)", r, expected)) - } - return nil -} - -func (l *Lexer) unexpectedEndOfFile(expected string) stateFn { - return l.emitError("Unexpected end of file (expected %s)", expected) -} diff --git a/lexer/stringbuf.no b/lexer/stringbuf.no deleted file mode 100644 index 69030ce..0000000 --- a/lexer/stringbuf.no +++ /dev/null @@ -1,62 +0,0 @@ -package lexer - -import ( - "bytes" - "strconv" - "strings" -) - -// StringBuffer is a string buffer implementation, which is used by the lexer -// to efficiently accumulate runes from the input and eventually turn these -// into a string, either literal or interpreted. -type StringBuffer struct { - buffer bytes.Buffer -} - -// Reset resets the string buffer, in order to build a new string. -func (b *StringBuffer) Reset() *StringBuffer { - b.buffer.Reset() - return b -} - -// WriteString adds the runes of the input string to the string buffer. -func (b *StringBuffer) WriteString(s string) *StringBuffer { - for _, r := range s { - b.WriteRune(r) - } - return b -} - -// WriteRune adds a single rune to the string buffer. -func (b *StringBuffer) WriteRune(r rune) *StringBuffer { - b.buffer.WriteRune(r) - return b -} - -// AsLiteralString returns the string buffer as a literal string. -// Literal means that no escape sequences are processed. -func (b *StringBuffer) AsLiteralString() string { - return b.buffer.String() -} - -// AsInterpretedString returns the string in its interpreted form. -// Interpreted means that escape sequences are handled in the way that Go would -// have, had it been inside double quotes. It translates for example escape -// sequences like "\n", "\t", \uXXXX" and "\UXXXXXXXX" into their string -// representations. -// Since the input might contain invalid escape sequences, this method -// also returns an error. When an error is returned, the returned string will -// contain the string as far as it could be interpreted. -func (b *StringBuffer) AsInterpretedString() (string, error) { - var sb strings.Builder - tail := b.buffer.String() - for len(tail) > 0 { - r, _, newtail, err := strconv.UnquoteChar(tail, '"') - if err != nil { - return sb.String(), err - } - tail = newtail - sb.WriteRune(r) - } - return sb.String(), nil -} diff --git a/lexer/stringbuf_test.no b/lexer/stringbuf_test.no deleted file mode 100644 index c751581..0000000 --- a/lexer/stringbuf_test.no +++ /dev/null @@ -1,87 +0,0 @@ -package lexer_test - -import "testing" -import "github.com/mmakaay/toml/lexer" - -func TestGeneratingStringDoesNotResetBuffer(t *testing.T) { - var b lexer.StringBuffer - s1, _ := b.WriteString(`hi\nthere`).AsInterpretedString() - s2 := b.AsLiteralString() - if s1 != "hi\nthere" { - t.Fatalf("Did not get expected string\"X\" for try 1, but %q", s1) - } - if s2 != "hi\\nthere" { - t.Fatalf("Did not get expected string\"X\" for try 2, but %q", s2) - } -} - -func TestResetResetsBuffer(t *testing.T) { - var b lexer.StringBuffer - s := b.WriteRune('X').Reset().AsLiteralString() - if s != "" { - t.Fatalf("Did not get expected empty string, but %q", s) - } -} - -func TestAsLiteralString(t *testing.T) { - b := lexer.StringBuffer{} - for _, c := range []stringbufT{ - {"empty string", ``, ``, OK}, - {"simple string", `Simple string!`, `Simple string!`, OK}, - {"single quote", `'`, `'`, OK}, - {"double quote", `"`, `"`, OK}, - {"escaped single quote", `\'`, `\'`, OK}, - {"escaped double quote", `\"`, `\"`, OK}, - {"escape anything", `\x\t\f\n\r\'\"\\`, `\x\t\f\n\r\'\"\\`, OK}, - {"UTF8 escapes", `\uceb2\U00e0b8bf`, `\uceb2\U00e0b8bf`, OK}, - {"actual newline", "on\nmultiple\nlines", "on\nmultiple\nlines", OK}, - } { - s := b.Reset().WriteString(c.in).AsLiteralString() - if s != c.out { - t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) - } - } -} - -func TestAsInterpretedString(t *testing.T) { - b := lexer.StringBuffer{} - for _, c := range []stringbufT{ - {"empty string", "", "", OK}, - {"one character", "Simple string!", "Simple string!", OK}, - {"escaped single quote", `\'`, "", FAIL}, - {"escaped double quote", `\"`, `"`, OK}, - {"bare single quote", `'`, "'", OK}, - {"string in single quotes", `'Hello'`, `'Hello'`, OK}, - {"string in escaped double quotes", `\"Hello\"`, `"Hello"`, OK}, - {"escape something", `\t\f\n\r\"\\`, "\t\f\n\r\"\\", OK}, - {"short UTF8 escapes", `\u2318Wh\u00e9\u00e9!`, `⌘Whéé!`, OK}, - {"long UTF8 escapes", `\U0001014D \u2318 Wh\u00e9\u00e9!`, `𐅍 ⌘ Whéé!`, OK}, - {"UTF8 characters", "Ѝюج wut Ж ?", "Ѝюج wut Ж ?", OK}, - {"example from spec", - `I'm a string. \"You can quote me\". Name\tJos\u00E9\nLocation\tSF.`, - "I'm a string. \"You can quote me\". Name\tJosé\nLocation\tSF.", OK}, - } { - s, err := b.Reset().WriteString(c.in).AsInterpretedString() - if c.isSuccessCase && err != nil { - t.Fatalf("[%s] unexpected error for input %q: %s", c.name, c.in, err) - } - if !c.isSuccessCase && err == nil { - t.Fatalf("[%s] expected a failure, but no failure occurred", c.name) - } - if s != c.out && c.isSuccessCase { - t.Fatalf("[%s] %q -> %q failed: actual result = %q", c.name, c.in, c.out, s) - } - } -} - -type stringbufT struct { - name string - in string - out string - isSuccessCase bool -} - -const ( - OK bool = true - FAIL bool = false -)