// go-toml/parse2/grammar.go

package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math"
	"os"
	"path"
	"time"

	"git.makaay.nl/mauricem/go-parsekit/tokenize"
	"github.com/pkg/profile"
)

// doProfile points at the value of the -p command line flag: the number of
// parse loops to run while profiling. It is assigned in init; main only
// profiles when the value is greater than zero.
var doProfile *int

func init() {
	doProfile = flag.Int("p", 0, "Perform pprof profiling (value is number of run loops)")
	flag.Usage = usage
	flag.Parse()
}

// usage logs a one-line synopsis for the program (named after the base name
// of os.Args[0]), prints the defaults of all registered flags and terminates
// the process with exit status 1.
func usage() {
	program := path.Base(os.Args[0])
	log.Printf("Usage: %s < <path to TOML-file>\n", program)
	flag.PrintDefaults()

	os.Exit(1)
}

// main tokenizes the TOML document supplied on stdin with the grammar
// returned by BuildGrammar.
//
// Normal mode (-p not given or 0): stdin is handed to the tokenizer and every
// resulting token is printed as "[index] token". A parse error is fatal.
//
// Profiling mode (-p N, N > 0): stdin is read into memory once and then
// matched N times between profile.Start() and Stop(), after which the total
// duration is printed. Read and parse errors are fatal in this mode as well.
func main() {
	toml := BuildGrammar()

	if *doProfile > 0 {
		fmt.Println("Profiling ...")
		inputBytes, err := ioutil.ReadAll(os.Stdin)
		if err != nil {
			log.Fatalf("Error in reading input: %s\n", err)
		}
		inputStr := string(inputBytes)
		p := profile.Start()
		start := time.Now()
		for i := 0; i < *doProfile; i++ {
			result, err := toml.Match(inputStr)
			if err != nil {
				// The result must not be touched after a failed match.
				// Stop the profiler first, because log.Fatalf exits the
				// process right away.
				p.Stop()
				fmt.Printf("\n")
				log.Fatalf("Error in parsing TOML: %s\n", err)
			}
			// "\r" keeps the progress indicator on a single terminal line.
			fmt.Printf("cycle %d / %d, tokens=%d\r", i+1, *doProfile, len(result.Tokens))
		}
		duration := time.Since(start)
		p.Stop()
		fmt.Printf("\n")
		fmt.Println("Duration:", duration)
		return
	}

	result, err := toml.Match(os.Stdin)
	if err != nil {
		log.Fatalf("Error in parsing TOML: %s\n", err)
	}
	for i, t := range result.Tokens {
		fmt.Printf("[%d] %v\n", i, t)
	}
}

// Grammar is a registry of named grammar rules: it maps a rule name to the
// tokenize.Handler that implements the rule. Rules are added with Rule and
// looked up with Get.
type Grammar map[string]tokenize.Handler

// Rule registers definition under the given rule name.
// It panics when a rule with that name has been registered before.
func (g Grammar) Rule(name string, definition tokenize.Handler) {
	_, taken := g[name]
	if taken {
		msg := fmt.Sprintf("Grammar rule %q already exists", name)
		panic(msg)
	}
	g[name] = definition
}

// Get returns the handler for the rule with the given name.
//
// When the rule is already registered, its handler is returned directly.
// Otherwise a placeholder handler is returned that looks the rule up each
// time it is invoked, which allows rules to reference rules that are only
// registered later on. The placeholder panics when the rule still does not
// exist at the time it is invoked.
func (g Grammar) Get(name string) tokenize.Handler {
	registered, known := g[name]
	if known {
		return registered
	}

	deferred := func(t *tokenize.API) bool {
		resolved, found := g[name]
		if !found {
			panic(fmt.Sprintf("Grammar rule %q does not exist", name))
		}
		return resolved(t)
	}
	return deferred
}

// BuildGrammar constructs the TOML grammar as a set of named rules and
// returns the handler for the top-level "toml" rule.
//
// Rules are registered with R (Grammar.Rule) and referenced with G
// (Grammar.Get). A rule may be referenced before it is registered (e.g.
// "keyval" and "val", which are used recursively by inline tables and
// arrays): Get then hands out a handler that resolves the rule on use.
//
// The function panics when a rule name is registered twice. The date/time
// value callback panics when a matched date/time cannot be parsed by
// time.Parse.
func BuildGrammar() tokenize.Handler {

	// Shorthands for the handler collections of the tokenize package, as they
	// are used below: c combines handlers, a matches input, m modifies handler
	// output and tok produces tokens.
	c, a, m, tok := tokenize.C, tokenize.A, tokenize.M, tokenize.T

	g := make(Grammar)
	R := g.Rule
	G := g.Get

	R("alpha", a.Letter)
	R("digit", a.Digit)
	R("minus", a.Minus)
	R("plus", a.Plus)
	R("underscore", a.Underscore)
	R("quotation-mark", a.DoubleQuote)
	R("apostrophe", a.SingleQuote)
	R("colon", a.Colon)
	R("escape", a.Backslash)
	R("hex-digit", a.HexDigit)

	// Whitespace, Newline

	R("tab", a.Tab)
	R("space", a.Space)
	R("whitespaceChar", G("tab").Or(G("space")))
	R("ws", c.ZeroOrMore(G("whitespaceChar")))
	R("newline", a.Newline)
	R("whitespaceChar-or-newline", G("whitespaceChar").Or(G("newline")))

	// Comment

	R("comment-start-symbol", a.Hash)
	R("non-ascii", a.RuneRange(0x80, 0xD7FF).Or(a.RuneRange(0xE000, 0x10FFFF)))
	R("non-eol", c.Any(a.Rune(0x09), a.RuneRange(0x20, 0x7E), G("non-ascii")))
	R("comment", G("comment-start-symbol").Then(c.ZeroOrMore(G("non-eol"))))

	// Basic String

	R("escape-seq-char", c.Any(
		a.Runes('"', '\\', 'b', 'f', 'n', 'r', 't'),
		a.Rune('u').Then(G("hex-digit").Times(4)),
		a.Rune('U').Then(G("hex-digit").Times(8))))
	R("escaped", G("escape").Then(G("escape-seq-char")))
	R("basic-unescaped", c.Any(a.RuneRange(0x20, 0x21), a.RuneRange(0x23, 0x5B), a.RuneRange(0x5D, 0x7E), G("non-ascii")))
	R("basic-char", G("escaped").Or(G("basic-unescaped")))
	R("basic-string", c.Seq(m.Drop(G("quotation-mark")), c.ZeroOrMore(G("basic-char")), m.Drop(G("quotation-mark"))))

	// Multiline Basic String

	R("ml-basic-string-delim", G("quotation-mark").Times(3))
	R("ml-basic-unescaped", c.Any(a.RuneRange(0x20, 0x5B), a.RuneRange(0x5D, 0x7E), G("non-ascii")))
	R("ml-basic-char", G("ml-basic-unescaped").Or(G("escaped")))
	R("ml-basic-body-concat", c.Seq(G("escape"), G("ws"), G("newline"), c.ZeroOrMore(G("whitespaceChar-or-newline"))))
	R("ml-basic-body-content", c.Any(G("ml-basic-char"), G("newline"), m.Drop(G("ml-basic-body-concat"))))
	R("ml-basic-body", c.ZeroOrMore(G("ml-basic-body-content").Except(G("ml-basic-string-delim"))))
	// Only the single newline that directly follows the opening delimiter is
	// trimmed; any further newlines are part of the string value (same as
	// for the multiline literal string below).
	R("ml-basic-string", c.Seq(
		m.Drop(G("ml-basic-string-delim").Then(c.Optional(G("newline")))),
		G("ml-basic-body"),
		m.Drop(G("ml-basic-string-delim"))))

	// Literal String

	R("literal-char", c.Any(G("tab"), a.RuneRange(0x20, 0x26), a.RuneRange(0x28, 0x7E), G("non-ascii")))
	R("literal-string", c.Seq(
		m.Drop(G("apostrophe")),
		c.ZeroOrMore(G("literal-char")),
		m.Drop(G("apostrophe"))))

	// Multiline Literal String

	R("ml-literal-string-delim", G("apostrophe").Times(3))
	R("ml-literal-char", c.Any(G("tab"), a.RuneRange(0x20, 0x7E), G("non-ascii")))
	R("ml-literal-body-content", G("ml-literal-char").Or(G("newline")))
	R("ml-literal-body", c.ZeroOrMore(G("ml-literal-body-content").Except(G("ml-literal-string-delim"))))
	R("ml-literal-string", c.Seq(
		m.Drop(G("ml-literal-string-delim").Then(c.Optional(G("newline")))),
		G("ml-literal-body"),
		m.Drop(G("ml-literal-string-delim"))))

	// String

	// The multiline variants are tried first, because their delimiters start
	// with the delimiter of the corresponding single-line variant.
	R("string", c.Any(
		tok.StrInterpreted("string", G("ml-basic-string")),
		tok.StrInterpreted("string", G("basic-string")),
		tok.Str("string", G("ml-literal-string")),
		tok.Str("string", G("literal-string"))))

	// Integer

	// In all integer forms, underscores between digits are dropped from the
	// output, so only the digits reach the number conversion.
	R("digit1-9", a.DigitNotZero)
	R("underscore-int-digit", c.Any(G("digit"), m.Drop(G("underscore")).Then(G("digit"))))
	R("unsigned-dec-int", c.Any(G("digit1-9").Then(c.OneOrMore(G("underscore-int-digit"))), G("digit")))
	R("dec-int", c.Optional(G("plus").Or(G("minus"))).Then(G("unsigned-dec-int")))

	R("hex-prefix", a.Zero.Then(a.Rune('x')))
	R("underscore-hex-digit", c.Any(G("hex-digit"), m.Drop(G("underscore")).Then(G("hex-digit"))))
	R("hex-int", c.Seq(m.Drop(G("hex-prefix")), G("hex-digit"), c.ZeroOrMore(G("underscore-hex-digit"))))

	R("oct-prefix", a.Zero.Then(a.Rune('o')))
	R("digit0-7", a.RuneRange('0', '7'))
	R("underscore-oct-digit", c.Any(G("digit0-7"), m.Drop(G("underscore")).Then(G("digit0-7"))))
	R("oct-int", c.Seq(m.Drop(G("oct-prefix")), G("digit0-7"), c.ZeroOrMore(G("underscore-oct-digit"))))

	R("bin-prefix", a.Zero.Then(a.Rune('b')))
	R("digit0-1", a.Runes('0', '1'))
	R("underscore-bin-digit", c.Any(G("digit0-1"), m.Drop(G("underscore")).Then(G("digit0-1"))))
	R("bin-int", c.Seq(m.Drop(G("bin-prefix")), G("digit0-1"), c.ZeroOrMore(G("underscore-bin-digit"))))

	// The prefixed forms are tried before the decimal form, because a decimal
	// integer would already match the leading "0" of a prefix.
	R("integer", c.Any(
		tok.Int64Base("integer", 16, G("hex-int")),
		tok.Int64Base("integer", 8, G("oct-int")),
		tok.Int64Base("integer", 2, G("bin-int")),
		tok.Int64("integer", G("dec-int"))))

	// Float

	R("float-int-part", G("dec-int"))
	R("exp", a.StrNoCase("e").Then(G("float-int-part")))
	R("decimal-point", a.Dot)
	// All digits must be kept in the output here (the underscores are already
	// dropped by underscore-int-digit); dropping them would truncate the
	// fraction of a float to its first digit.
	R("zero-prefixable-int", c.Seq(G("digit"), c.ZeroOrMore(G("underscore-int-digit"))))
	R("frac", c.Seq(G("decimal-point"), G("zero-prefixable-int")))
	R("standard-float", c.Seq(G("float-int-part"), G("exp").Or(G("frac").Then(c.Optional(G("exp"))))))

	R("inf-float", c.Optional(G("plus").Or(G("minus"))).Then(a.Str("inf")))

	R("nan-float", c.Optional(G("plus").Or(G("minus"))).Then(a.Str("nan")))

	R("float", c.Any(
		tok.Float64("float", G("standard-float")),
		tok.ByCallback("float", G("inf-float"), func(t *tokenize.API) interface{} {
			// A leading minus sign selects negative infinity.
			if t.Rune(0) == '-' {
				return math.Inf(-1)
			}
			return math.Inf(+1)
		}),
		tok.ByValue("float", G("nan-float"), math.NaN())))

	// Boolean

	R("true", a.Str("true"))
	R("false", a.Str("false"))

	R("boolean", tok.Boolean("boolean", G("true").Or(G("false"))))

	// Date and time (as defined in RFC 3339)

	// Each date/time component is emitted as a string token whose type is the
	// time.Parse layout fragment that describes the component. The fragments
	// are joined into a full layout by makeDateTimeValue below.
	R("date-year", G("digit").Times(4))
	R("date-month", G("digit").Times(2))
	R("date-mday", G("digit").Times(2))
	R("date", tok.Str("2006-01-02", c.Seq(G("date-year"), G("minus"), G("date-month"), G("minus"), G("date-mday"))))

	R("time-delim", c.Any(
		tok.Str("T", a.Rune('T')),
		tok.Str("t", a.Rune('t')),
		tok.Str(" ", a.Rune(' '))))

	R("time-hour", G("digit").Times(2))
	R("time-minute", G("digit").Times(2))
	R("time-second", G("digit").Times(2))
	R("time", tok.Str("15:04:05", c.Seq(G("time-hour"), G("colon"), G("time-minute"), G("colon"), G("time-second"))))

	// At most 9 fractional digits are kept (nanosecond precision); any
	// additional digits are matched, but dropped from the output.
	R("time-sec-frac", tok.Str(".999999999", c.Seq(G("decimal-point"), c.MinMax(1, 9, a.Digit), m.Drop(c.ZeroOrMore(a.Digit)))))

	R("time-zulu", m.Replace(a.Runes('Z', 'z'), "Z"))
	R("time-num-offset", c.Seq(G("plus").Or(G("minus")), G("time-hour"), G("colon"), G("time-minute")))
	R("time-offset", tok.Str("Z07:00", c.Any(G("time-zulu"), G("time-num-offset"))))

	// Fractional seconds are optional in every form that contains a time,
	// not only in the offset date-time.
	R("offset-date-time", c.Seq(G("date"), G("time-delim"), G("time"), G("time-sec-frac").Optional(), G("time-offset")))
	R("local-date-time", c.Seq(G("date"), G("time-delim"), G("time"), G("time-sec-frac").Optional()))
	R("local-date", G("date"))
	R("local-time", c.Seq(G("time"), G("time-sec-frac").Optional()))

	// makeDateTimeValue turns the component tokens of a matched date/time
	// into a single time.Time value: the token types form the layout, the
	// token values form the input for time.Parse. The component tokens are
	// cleared afterwards.
	makeDateTimeValue := func(t *tokenize.API) interface{} {
		layout := ""
		input := ""
		for _, part := range t.Tokens() {
			layout += part.Type.(string)
			input += part.Value.(string)
		}
		t.ClearTokens()
		value, err := time.Parse(layout, input)
		if err != nil {
			panic(fmt.Sprintf("Ow, we must implement a way to report date parse errors: %s", err))
		}
		return value
	}

	// The most specific form is tried first, since the shorter forms are
	// prefixes of the longer ones.
	R("date-time", c.Any(
		tok.ByCallback("offset-date-time", G("offset-date-time"), makeDateTimeValue),
		tok.ByCallback("local-date-time", G("local-date-time"), makeDateTimeValue),
		tok.ByCallback("local-date", G("local-date"), makeDateTimeValue),
		tok.ByCallback("local-time", G("local-time"), makeDateTimeValue)))

	// Inline Table

	R("inline-table-open", a.CurlyOpen.Then(G("ws")))
	R("inline-table-sep", c.Seq(G("ws"), a.Comma, G("ws")))
	// "keyval" is registered further down; it is resolved on use.
	R("inline-table-keyvals", c.Separated(G("inline-table-sep"), G("keyval")))
	R("inline-table-close", G("ws").Then(a.CurlyClose))

	R("inline-table", tok.Group("inline-table", c.Seq(
		G("inline-table-open"),
		G("inline-table-keyvals").Optional(),
		G("inline-table-close"))))

	// Inline Array

	R("array-open", a.SquareOpen)
	R("array-sep", G("ws").Then(a.Comma))
	R("ws-comment-newline", c.ZeroOrMore(G("whitespaceChar").Or(G("comment").Optional().Then(G("newline")))))
	// "val" is registered further down; it is resolved on use.
	R("array-values", c.Seq(
		G("ws-comment-newline"),
		G("val"),
		c.ZeroOrMore(c.Seq(G("ws"), G("array-sep"), G("ws-comment-newline"), G("val"))),
		G("array-sep").Optional()))
	R("array-close", a.SquareClose)

	R("inline-array", tok.Group("array", c.Seq(G("array-open"), G("array-values").Optional(), G("ws-comment-newline"), G("array-close"))))

	// Standard Table

	R("std-table-open", a.SquareOpen.Then(G("ws")))
	R("std-table-close", G("ws").Then(a.SquareClose))

	R("std-table", c.Seq(G("std-table-open"), tok.Group("table", G("key")), G("std-table-close")))

	// Array Table

	R("array-table", c.Seq(G("array-table-open"), tok.Group("array-of-tables", G("key")), G("array-table-close")))

	R("array-table-open", a.SquareOpen.Times(2).Then(G("ws")))
	R("array-table-close", G("ws").Then(a.SquareClose.Times(2)))

	// Table

	// The array table is tried first, because "[[" also matches the opening
	// of a standard table.
	R("table", G("array-table").Or(G("std-table")))

	// Key-Value Pairs

	R("unquoted-key", c.OneOrMore(c.Any(G("alpha"), G("digit"), G("minus"), G("underscore"))))
	R("quoted-key", G("basic-string").Or(G("literal-string")))
	R("key-sep", c.Seq(G("ws"), a.Dot, G("ws")))
	R("simple-key", tok.Str("key-part", G("quoted-key").Or(G("unquoted-key"))))
	R("dotted-key", c.Separated(G("key-sep"), G("simple-key")))

	R("key", c.FlushInput(tok.Group("key", G("dotted-key").Or(G("simple-key")))))

	R("keyval-sep", c.FlushInput(c.Seq(G("ws"), a.Equal, G("ws"))))

	R("val", tok.Group("val", c.Any(
		G("string"),
		G("date-time"),
		G("float"),
		G("integer"),
		G("boolean"),
		G("inline-array"),
		G("inline-table"),
	)))

	R("keyval", tok.Group("keyval", c.Seq(G("key"), G("keyval-sep"), G("val"))))

	// Overall Structure

	R("expression", c.Seq(
		c.FlushInput(m.Drop(G("ws"))),
		c.FlushInput(c.Optional(G("table").Or(G("keyval")))),
		c.FlushInput(m.Drop(G("ws"))),
		c.FlushInput(m.Drop(c.Optional(G("comment"))))))

	R("toml", c.Seq(G("expression"), c.ZeroOrMore(G("newline").Then(G("expression"))), a.EndOfFile))

	return G("toml")
}