diff --git a/README.md b/README.md
index 679d677..a9f6d30 100644
--- a/README.md
+++ b/README.md
@@ -1,80 +1,111 @@
 # vartan
 
-vartan provides a compiler that generates a LALR(1) or SLR(1) parsing table and a driver for golang.
+vartan is a parser generator for Go that supports LALR(1) and SLR(1). vartan also provides a command that performs syntax analysis, so you can easily debug your grammar.
 
 [![Test](https://github.com/nihei9/vartan/actions/workflows/test.yml/badge.svg?branch=main)](https://github.com/nihei9/vartan/actions/workflows/test.yml)
 
-## Status
-
-🚧 Now Developing
-
 ## Installation
 
+Compiler:
+
 ```sh
 $ go install github.com/nihei9/vartan/cmd/vartan@latest
 ```
 
+Code Generator:
+
+```sh
+$ go install github.com/nihei9/vartan/cmd/vartan-go@latest
+```
+
 ## Usage
 
+### 1. Define your grammar
+
 vartan uses a BNF-like DSL to define your grammar. As an example, let's write a grammar that represents a simple expression.
 
 ```
+%name expr
+
+%left mul div
+%left add sub
+
 expr
-    : expr add_op term
-    | term
-    ;
-term
-    : term mul_op factor
-    | factor
-    ;
-factor
-    : number
-    | id
-    ;
+    : expr add expr
+    | expr sub expr
+    | expr mul expr
+    | expr div expr
+    | func_call
+    | integer
+    | id
+    | '(' expr ')' #ast #(expr $2)
+    ;
+func_call
+    : id '(' args ')' #ast #(func_call $1 $3)
+    | id '(' ')' #ast #(func_call $1)
+    ;
+args
+    : args ',' expr #ast #(args $1... $3)
+    | expr
+    ;
 
 whitespaces: "[\u{0009}\u{0020}]+" #skip;
-number: "[0-9]+";
-id: "[A-Za-z_]+";
-add_op: '+';
-mul_op: '*';
+integer: "0|[1-9][0-9]*";
+id: "[A-Za-z_][0-9A-Za-z_]*";
+add: '+';
+sub: '-';
+mul: '*';
+div: '/';
 ```
 
+The `%left` directives make the operator tokens left-associative and define their precedence: `mul` and `div` bind tighter than `add` and `sub`.
+
 Save the above grammar to a file in UTF-8. In this explanation, the file name is `expr.vr`.
 
+⚠️ The input file must be encoded in UTF-8.
+
+### 2. Compile the grammar
+
 Next, generate a parsing table using the `vartan compile` command.
 
 ```sh
 $ vartan compile -g expr.vr -o expr.json
+16 conflicts
 ```
 
+### 3. Debug
+
+#### 3.1. Parse
+
 If you want to make sure that the grammar behaves as expected, you can use the `vartan parse` command to try parsing without implementing a driver.
 
 ⚠️ The only encoding the `vartan parse` command and the driver can handle is UTF-8.
 
 ```sh
-$ echo -n 'foo + bar * baz * 100' | vartan parse expr.json
+$ echo -n 'foo(10, bar(a)) + 99 * x' | vartan parse expr.json
 expr
 ├─ expr
-│  └─ term
-│     └─ factor
-│        └─ id "foo"
-├─ add_op "+"
-└─ term
-   ├─ term
-   │  ├─ term
-   │  │  └─ factor
-   │  │     └─ id "bar"
-   │  ├─ mul_op "*"
-   │  └─ factor
-   │     └─ id "baz"
-   ├─ mul_op "*"
-   └─ factor
-      └─ number "100"
+│  └─ func_call
+│     ├─ id "foo"
+│     └─ args
+│        ├─ expr
+│        │  └─ integer "10"
+│        └─ expr
+│           └─ func_call
+│              ├─ id "bar"
+│              └─ args
+│                 └─ expr
+│                    └─ id "a"
+├─ add "+"
+└─ expr
+   ├─ expr
+   │  └─ integer "99"
+   ├─ mul "*"
+   └─ expr
+      └─ id "x"
 ```
 
 When the `vartan parse` command successfully parses the input data, it prints a CST or an AST (if any).
 
-## Debug
+#### 3.2. Resolve conflicts
 
 The `vartan compile` command also generates a description file with a `-description.json` suffix along with the parsing table. This file describes each state in the parsing table in detail. If your grammar contains conflicts, see the `Conflicts` and `States` sections of this file.
 
 Using the `vartan show` command, you can view the description file in a readable format.
 
@@ -86,27 +117,38 @@ LALR(1)
 
 # Conflicts
 
-No conflict was detected.
+16 conflicts were detected.
 
 # Terminals
 
   1  -  -  <eof>
   2  -  -  error
-  3  -  -  whitespaces
-  4  -  -  number
-  5  -  -  id
-  6  -  -  add_op (+)
-  7  -  -  mul_op (*)
+  3  -  -  x_1 (\()
+  4  -  -  x_2 (\))
+  5  -  -  x_3 (,)
+  6  -  -  whitespaces
+  7  -  -  integer
+  8  -  -  id
+  9  2  l  add (+)
+ 10  2  l  sub (-)
+ 11  1  l  mul (*)
+ 12  1  l  div (/)
 
 # Productions
 
   1  -  -  expr' → expr
-  2  -  -  expr → expr + term
-  3  -  -  expr → term
-  4  -  -  term → term * factor
-  5  -  -  term → factor
-  6  -  -  factor → number
-  7  -  -  factor → id
+  2  2  l  expr → expr + expr
+  3  2  l  expr → expr - expr
+  4  1  l  expr → expr * expr
+  5  1  l  expr → expr / expr
+  6  -  -  expr → func_call
+  7  -  -  expr → integer
+  8  -  -  expr → id
+  9  -  -  expr → \( expr \)
+ 10  -  -  func_call → id \( args \)
+ 11  -  -  func_call → id \( \)
+ 12  -  -  args → args , expr
+ 13  -  -  args → expr
 
 # States
 
@@ -114,11 +156,144 @@ No conflict was detected.
 
   1  expr' → ・ expr
 
-shift 4 on number
+shift 3 on \(
+shift 4 on integer
 shift 5 on id
 goto 1 on expr
-goto 2 on term
-goto 3 on factor
+goto 2 on func_call
+
+
+## State 1
+
+  1  expr' → expr ・
+  2  expr → expr ・ + expr
+  3  expr → expr ・ - expr
+  4  expr → expr ・ * expr
+  5  expr → expr ・ / expr
+
+shift 6 on +
+shift 7 on -
+shift 8 on *
+shift 9 on /
+reduce 1 on <eof>
+
+
+## State 2
+
+  6  expr → func_call ・
+
+reduce 6 on <eof>, \), ,, +, -, *, /
 
 ...
 ```
+
+### 4. Generate a parser
+
+Using the `vartan-go` command, you can generate the source code of a parser that recognizes your grammar.
+
+```sh
+$ vartan-go expr.json
+```
+
+Then you will get the following files.
+
+* `expr_parser.go`
+* `expr_lexer.go`
+* `expr_semantic_action.go`
+
+You need to implement a driver to use the parser. An example is below.
+
+```go
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+)
+
+func main() {
+	toks, err := NewTokenStream(os.Stdin)
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	gram := NewGrammar()
+	treeAct := NewSyntaxTreeActionSet(gram, true, false)
+	p, err := NewParser(toks, gram, SemanticAction(treeAct))
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	err = p.Parse()
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	synErrs := p.SyntaxErrors()
+	if len(synErrs) > 0 {
+		for _, synErr := range synErrs {
+			printSyntaxError(os.Stderr, synErr, gram)
+		}
+		os.Exit(1)
+	}
+	fmt.Println("accepted")
+	PrintTree(os.Stdout, treeAct.AST())
+}
+
+func printSyntaxError(w io.Writer, synErr *SyntaxError, gram Grammar) {
+	var msg string
+	tok := synErr.Token
+	switch {
+	case tok.EOF():
+		msg = "<eof>"
+	case tok.Invalid():
+		msg = fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme()))
+	default:
+		if alias := gram.TerminalAlias(tok.TerminalID()); alias != "" {
+			msg = fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), alias)
+		} else {
+			msg = fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), gram.Terminal(tok.TerminalID()))
+		}
+	}
+	fmt.Fprintf(w, "%v:%v: %v: %v", synErr.Row+1, synErr.Col+1, synErr.Message, msg)
+
+	if len(synErr.ExpectedTerminals) > 0 {
+		fmt.Fprintf(w, "; expected: %v", synErr.ExpectedTerminals[0])
+		for _, t := range synErr.ExpectedTerminals[1:] {
+			fmt.Fprintf(w, ", %v", t)
+		}
+	}
+
+	fmt.Fprintf(w, "\n")
+}
+```
+
+Please save the above source code to `main.go` and create a directory structure like the following.
+
+```
+/project_root
+├── expr_parser.go
+├── expr_lexer.go
+├── expr_semantic_action.go
+└── main.go (the driver you implemented)
+```
+
+Now you can perform syntax analysis.
+
+```sh
+$ echo -n 'foo+99' | go run .
+accepted
+expr
+├─ expr
+│  └─ id "foo"
+├─ add "+"
+└─ expr
+   └─ integer "99"
+```
+
+```sh
+$ echo -n 'foo+99?' | go run .
+1:7: unexpected token: '?' (<invalid>); expected: <eof>, +, -, *, /
+exit status 1
+```
diff --git a/cmd/vartan-go/generate.go b/cmd/vartan-go/generate.go
new file mode 100644
index 0000000..27f7236
--- /dev/null
+++ b/cmd/vartan-go/generate.go
@@ -0,0 +1,153 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"runtime/debug"
+
+	mldriver "github.com/nihei9/maleeni/driver"
+	"github.com/nihei9/vartan/driver"
+	"github.com/nihei9/vartan/spec"
+	"github.com/spf13/cobra"
+)
+
+func Execute() error {
+	err := generateCmd.Execute()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "%v\n", err)
+		return err
+	}
+
+	return nil
+}
+
+var generateFlags = struct {
+	pkgName *string
+}{}
+
+var generateCmd = &cobra.Command{
+	Use:           "vartan-go",
+	Short:         "Generate a parser for Go",
+	Long:          `vartan-go generates a parser for Go.`,
+	Example:       `  vartan-go grammar.json`,
+	Args:          cobra.ExactArgs(1),
+	RunE:          runGenerate,
+	SilenceErrors: true,
+	SilenceUsage:  true,
+}
+
+func init() {
+	generateFlags.pkgName = generateCmd.Flags().StringP("package", "p", "main", "package name")
+}
+
+func runGenerate(cmd *cobra.Command, args []string) (retErr error) {
+	defer func() {
+		panicked := false
+		v := recover()
+		if v != nil {
+			err, ok := v.(error)
+			if !ok {
+				retErr = fmt.Errorf("an unexpected error occurred: %v", v)
+				fmt.Fprintf(os.Stderr, "%v:\n%v", retErr, string(debug.Stack()))
+				return
+			}
+
+			retErr = err
+			panicked = true
+		}
+
+		if retErr != nil {
+			if panicked {
+				fmt.Fprintf(os.Stderr, "%v:\n%v", retErr, string(debug.Stack()))
+			} else {
+				fmt.Fprintf(os.Stderr, "%v\n", retErr)
+			}
+		}
+	}()
+
+	cgram, err := readCompiledGrammar(args[0])
+	if err != nil {
+		return fmt.Errorf("Cannot read a compiled grammar: %w", err)
+	}
+
+	{
+		b, err := mldriver.GenLexer(cgram.LexicalSpecification.Maleeni.Spec, *generateFlags.pkgName)
+		if err != nil {
+			return fmt.Errorf("Failed to generate a lexer: %w", err)
+		}
+
+		filePath := fmt.Sprintf("%v_lexer.go", cgram.Name)
+
+		f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+		if err != nil {
+			return fmt.Errorf("Failed to create an output file: %v", err)
+		}
+		defer f.Close()
+
+		_, err = f.Write(b)
+		if err != nil {
+			return fmt.Errorf("Failed to write lexer source code: %v", err)
+		}
+	}
+
+	{
+		b, err := driver.GenParser(cgram, *generateFlags.pkgName)
+		if err != nil {
+			return fmt.Errorf("Failed to generate a parser: %w", err)
+		}
+
+		filePath := fmt.Sprintf("%v_parser.go", cgram.Name)
+
+		f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+		if err != nil {
+			return fmt.Errorf("Failed to create an output file: %v", err)
+		}
+		defer f.Close()
+
+		_, err = f.Write(b)
+		if err != nil {
+			return fmt.Errorf("Failed to write parser source code: %v", err)
+		}
+	}
+
+	{
+		b, err := driver.GenSemanticAction(*generateFlags.pkgName)
+		if err != nil {
+			return fmt.Errorf("Failed to generate a semantic action set: %w", err)
+		}
+
+		filePath := fmt.Sprintf("%v_semantic_action.go", cgram.Name)
+
+		f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
+		if err != nil {
+			return fmt.Errorf("Failed to create an output file: %v", err)
+		}
+		defer f.Close()
+
+		_, err = f.Write(b)
+		if err != nil {
+			return fmt.Errorf("Failed to write semantic action source code: %v", err)
+		}
+	}
+
+	return nil
+}
+
+func readCompiledGrammar(path string) (*spec.CompiledGrammar, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	data, err := ioutil.ReadAll(f)
+	if err != nil {
+		return nil, err
+	}
+	cgram := &spec.CompiledGrammar{}
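+	// Decode the compiled grammar JSON emitted by `vartan compile`.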
+	err = json.Unmarshal(data, cgram)
+	if err != nil {
+		return nil, err
+	}
+	return cgram, nil
+}
diff --git a/cmd/vartan-go/main.go b/cmd/vartan-go/main.go
new file mode 100644
index 0000000..701f02f
--- /dev/null
+++ b/cmd/vartan-go/main.go
@@ -0,0 +1,12 @@
+package main
+
+import (
+	"os"
+)
+
+func main() {
+	err := Execute()
+	if err != nil {
+		os.Exit(1)
+	}
+}
diff --git a/cmd/vartan/parse.go b/cmd/vartan/parse.go
index caf1cbd..9b15251 100644
--- a/cmd/vartan/parse.go
+++ b/cmd/vartan/parse.go
@@ -63,7 +63,7 @@ func runParse(cmd *cobra.Command, args []string) (retErr error) {
 		return fmt.Errorf("You cannot enable --only-parse and --cst at the same time")
 	}
 
-	cgram, err := readCompiledGrammar(args[0])
+	cg, err := readCompiledGrammar(args[0])
 	if err != nil {
 		return fmt.Errorf("Cannot read a compiled grammar: %w", err)
 	}
@@ -81,13 +81,15 @@ func runParse(cmd *cobra.Command, args []string) (retErr error) {
 		src = f
 	}
 
+	gram := driver.NewGrammar(cg)
+
 	var opts []driver.ParserOption
 	{
 		switch {
 		case *parseFlags.cst:
-			treeAct = driver.NewSyntaxTreeActionSet(cgram, false, true)
+			treeAct = driver.NewSyntaxTreeActionSet(gram, false, true)
 		case !*parseFlags.onlyParse:
-			treeAct = driver.NewSyntaxTreeActionSet(cgram, true, false)
+			treeAct = driver.NewSyntaxTreeActionSet(gram, true, false)
 		}
 		if treeAct != nil {
 			opts = append(opts, driver.SemanticAction(treeAct))
@@ -98,12 +100,12 @@ func runParse(cmd *cobra.Command, args []string) (retErr error) {
 		}
 	}
 
-	toks, err := driver.NewTokenStream(cgram, src)
+	toks, err := driver.NewTokenStream(cg, src)
 	if err != nil {
 		return err
 	}
 
-	p, err = driver.NewParser(toks, driver.NewGrammar(cgram), opts...)
+	p, err = driver.NewParser(toks, gram, opts...)
 	if err != nil {
 		return err
 	}
@@ -125,7 +127,7 @@
 	case tok.Invalid():
 		msg = fmt.Sprintf("'%v' (<invalid>)", string(tok.Lexeme()))
 	default:
-		t := cgram.ParsingTable.Terminals[tok.TerminalID()]
+		t := cg.ParsingTable.Terminals[tok.TerminalID()]
 		msg = fmt.Sprintf("'%v' (%v)", string(tok.Lexeme()), t)
 	}
 
@@ -161,10 +163,10 @@
 	if err != nil {
 		return nil, err
 	}
-	cgram := &spec.CompiledGrammar{}
-	err = json.Unmarshal(data, cgram)
+	cg := &spec.CompiledGrammar{}
+	err = json.Unmarshal(data, cg)
 	if err != nil {
 		return nil, err
 	}
-	return cgram, nil
+	return cg, nil
 }
diff --git a/driver/conflict_test.go b/driver/conflict_test.go
index c827107..f507d4f 100644
--- a/driver/conflict_test.go
+++ b/driver/conflict_test.go
@@ -344,18 +344,19 @@ assign: '=';
 				t.Fatal(err)
 			}
 
-			gram, err := grammar.Compile(g, grammar.SpecifyClass(grammar.ClassSLR))
+			cg, err := grammar.Compile(g, grammar.SpecifyClass(grammar.ClassSLR))
 			if err != nil {
 				t.Fatal(err)
 			}
 
-			toks, err := NewTokenStream(gram, strings.NewReader(tt.src))
+			toks, err := NewTokenStream(cg, strings.NewReader(tt.src))
 			if err != nil {
 				t.Fatal(err)
 			}
 
+			gram := NewGrammar(cg)
 			treeAct := NewSyntaxTreeActionSet(gram, false, true)
-			p, err := NewParser(toks, NewGrammar(gram), SemanticAction(treeAct))
+			p, err := NewParser(toks, gram, SemanticAction(treeAct))
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/driver/parser.go b/driver/parser.go
index b765982..4c7397f 100644
--- a/driver/parser.go
+++ b/driver/parser.go
@@ -32,6 +32,9 @@ type Grammar interface {
 	// RecoverProduction returns true when a production has the recover directive.
 	RecoverProduction(prod int) bool
 
+	// NonTerminal returns a string representation of a non-terminal symbol.
+	NonTerminal(nonTerminal int) string
+
 	// TerminalCount returns a terminal symbol count of grammar.
 	TerminalCount() int
@@ -46,9 +49,12 @@ type Grammar interface {
 
 	// TerminalAlias returns an alias for a terminal.
 	TerminalAlias(terminal int) string
+
+	// ASTAction returns the AST action entries of a production.
+	ASTAction(prod int) []int
 }
 
-type Token interface {
+type VToken interface {
 	// TerminalID returns a terminal ID.
 	TerminalID() int
@@ -69,14 +75,14 @@
 }
 
 type TokenStream interface {
-	Next() (Token, error)
+	Next() (VToken, error)
 }
 
 type SyntaxError struct {
 	Row               int
 	Col               int
 	Message           string
-	Token             Token
+	Token             VToken
 	ExpectedTerminals []string
 }
@@ -269,7 +275,7 @@ func (p *Parser) validateLookahead(term int) bool {
 	}
 }
 
-func (p *Parser) nextToken() (Token, error) {
+func (p *Parser) nextToken() (VToken, error) {
 	for {
 		// We don't have to check whether the token is invalid because the kind ID of the invalid token is 0,
 		// and the parsing table doesn't have an entry corresponding to the kind ID 0. Thus we can detect
@@ -287,7 +293,7 @@
 	}
 }
 
-func (p *Parser) tokenToTerminal(tok Token) int {
+func (p *Parser) tokenToTerminal(tok VToken) int {
 	if tok.EOF() {
 		return p.gram.EOF()
 	}
@@ -295,7 +301,7 @@
 	return tok.TerminalID()
 }
 
-func (p *Parser) lookupAction(tok Token) int {
+func (p *Parser) lookupAction(tok VToken) int {
 	if !p.disableLAC {
 		term := p.tokenToTerminal(tok)
 		if !p.validateLookahead(term) {
diff --git a/driver/parser_test.go b/driver/parser_test.go
index 4e60f52..89d0e79 100644
--- a/driver/parser_test.go
+++ b/driver/parser_test.go
@@ -762,18 +762,19 @@ error: 'error' #skip;
 				}
 			}
 
-			gram, err := grammar.Compile(g, grammar.SpecifyClass(class))
+			cg, err := grammar.Compile(g, grammar.SpecifyClass(class))
 			if err != nil {
 				t.Fatal(err)
 			}
 
-			toks, err := NewTokenStream(gram, strings.NewReader(tt.src))
+			toks, err := NewTokenStream(cg, strings.NewReader(tt.src))
 			if err != nil {
 				t.Fatal(err)
 			}
 
+			gram := NewGrammar(cg)
 			treeAct := NewSyntaxTreeActionSet(gram, true, true)
-			p, err := NewParser(toks, NewGrammar(gram), SemanticAction(treeAct))
+			p, err := NewParser(toks, gram, SemanticAction(treeAct))
 			if err != nil {
 				t.Fatal(err)
 			}
diff --git a/driver/semantic_action.go b/driver/semantic_action.go
index 3023a36..d88d5fa 100644
--- a/driver/semantic_action.go
+++ b/driver/semantic_action.go
@@ -3,14 +3,12 @@ package driver
 import (
 	"fmt"
 	"io"
-
-	"github.com/nihei9/vartan/spec"
 )
 
 type SemanticActionSet interface {
 	// Shift runs when the driver shifts a symbol onto the state stack. `tok` is a token corresponding to
 	// the symbol. When the driver recovered from an error state by shifting the token, `recovered` is true.
-	Shift(tok Token, recovered bool)
+	Shift(tok VToken, recovered bool)
 
 	// Reduce runs when the driver reduces an RHS of a production to its LHS. `prodNum` is a number of
 	// the production. When the driver recovered from an error state by reducing the production,
@@ -25,10 +23,10 @@ type SemanticActionSet interface {
 	// from the state stack.
 	// Unlike `Shift` function, this function doesn't take a token to be shifted as an argument because a token
 	// corresponding to the error symbol doesn't exist.
-	TrapAndShiftError(cause Token, popped int)
+	TrapAndShiftError(cause VToken, popped int)
 
 	// MissError runs when the driver fails to trap a syntax error. `cause` is a token that caused a syntax error.
-	MissError(cause Token)
+	MissError(cause VToken)
 }
 
 var _ SemanticActionSet = &SyntaxTreeActionSet{}
@@ -77,7 +75,7 @@ func printTree(w io.Writer, node *Node, ruledLine string, childRuledLinePrefix s
 }
 
 type SyntaxTreeActionSet struct {
-	gram     *spec.CompiledGrammar
+	gram     Grammar
 	makeAST  bool
 	makeCST  bool
 	ast      *Node
@@ -85,7 +83,7 @@ type SyntaxTreeActionSet struct {
 	semStack *semanticStack
 }
 
-func NewSyntaxTreeActionSet(gram *spec.CompiledGrammar, makeAST bool, makeCST bool) *SyntaxTreeActionSet {
+func NewSyntaxTreeActionSet(gram Grammar, makeAST bool, makeCST bool) *SyntaxTreeActionSet {
 	return &SyntaxTreeActionSet{
 		gram:    gram,
 		makeAST: makeAST,
@@ -94,7 +92,7 @@ func NewSyntaxTreeActionSet(gram *spec.CompiledGrammar, makeAST bool, makeCST bo
 	}
 }
 
-func (a *SyntaxTreeActionSet) Shift(tok Token, recovered bool) {
+func (a *SyntaxTreeActionSet) Shift(tok VToken, recovered bool) {
 	term := a.tokenToTerminal(tok)
 
 	var ast *Node
@@ -102,7 +100,7 @@ func (a *SyntaxTreeActionSet) Shift(tok Token, recovered bool) {
 	if a.makeAST {
 		row, col := tok.Position()
 		ast = &Node{
-			KindName: a.gram.ParsingTable.Terminals[term],
+			KindName: a.gram.Terminal(term),
 			Text:     string(tok.Lexeme()),
 			Row:      row,
 			Col:      col,
@@ -111,7 +109,7 @@ func (a *SyntaxTreeActionSet) Shift(tok Token, recovered bool) {
 	if a.makeCST {
 		row, col := tok.Position()
 		cst = &Node{
-			KindName: a.gram.ParsingTable.Terminals[term],
+			KindName: a.gram.Terminal(term),
 			Text:     string(tok.Lexeme()),
 			Row:      row,
 			Col:      col,
@@ -125,16 +123,16 @@ func (a *SyntaxTreeActionSet) Shift(tok Token, recovered bool) {
 }
 
 func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
-	lhs := a.gram.ParsingTable.LHSSymbols[prodNum]
+	lhs := a.gram.LHS(prodNum)
 
 	// When an alternative is empty, `n` will be 0, and `handle` will be empty slice.
-	n := a.gram.ParsingTable.AlternativeSymbolCounts[prodNum]
+	n := a.gram.AlternativeSymbolCount(prodNum)
 	handle := a.semStack.pop(n)
 
 	var ast *Node
 	var cst *Node
 	if a.makeAST {
-		act := a.gram.ASTAction.Entries[prodNum]
+		act := a.gram.ASTAction(prodNum)
 		var children []*Node
 		if act != nil {
 			// Count the number of children in advance to avoid frequent growth in a slice for children.
@@ -177,7 +175,7 @@ func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
 		}
 
 		ast = &Node{
-			KindName: a.gram.ParsingTable.NonTerminals[lhs],
+			KindName: a.gram.NonTerminal(lhs),
 			Children: children,
 		}
 	}
@@ -188,7 +186,7 @@ func (a *SyntaxTreeActionSet) Reduce(prodNum int, recovered bool) {
 		}
 
 		cst = &Node{
-			KindName: a.gram.ParsingTable.NonTerminals[lhs],
+			KindName: a.gram.NonTerminal(lhs),
 			Children: children,
 		}
 	}
@@ -206,21 +204,19 @@ func (a *SyntaxTreeActionSet) Accept() {
 	a.ast = top[0].ast
 }
 
-func (a *SyntaxTreeActionSet) TrapAndShiftError(cause Token, popped int) {
+func (a *SyntaxTreeActionSet) TrapAndShiftError(cause VToken, popped int) {
 	a.semStack.pop(popped)
 
-	errSym := a.gram.ParsingTable.ErrorSymbol
-
 	var ast *Node
 	var cst *Node
 	if a.makeAST {
 		ast = &Node{
-			KindName: a.gram.ParsingTable.Terminals[errSym],
+			KindName: a.gram.Terminal(a.gram.Error()),
 		}
 	}
 	if a.makeCST {
 		cst = &Node{
-			KindName: a.gram.ParsingTable.Terminals[errSym],
+			KindName: a.gram.Terminal(a.gram.Error()),
 		}
 	}
 
@@ -230,7 +226,7 @@ func (a *SyntaxTreeActionSet) TrapAndShiftError(cause Token, popped int) {
 	})
 }
 
-func (a *SyntaxTreeActionSet) MissError(cause Token) {
+func (a *SyntaxTreeActionSet) MissError(cause VToken) {
 }
 
 func (a *SyntaxTreeActionSet) CST() *Node {
@@ -241,9 +237,9 @@ func (a *SyntaxTreeActionSet) AST() *Node {
 	return a.ast
 }
 
-func (a *SyntaxTreeActionSet) tokenToTerminal(tok Token) int {
+func (a *SyntaxTreeActionSet) tokenToTerminal(tok VToken) int {
 	if tok.EOF() {
-		return a.gram.ParsingTable.EOFSymbol
+		return a.gram.EOF()
 	}
 
 	return tok.TerminalID()
diff --git a/driver/semantic_action_test.go b/driver/semantic_action_test.go
index 9c66a85..791b5f0 100644
--- a/driver/semantic_action_test.go
+++ b/driver/semantic_action_test.go
@@ -14,7 +14,7 @@ type testSemAct struct {
 	actLog []string
 }
 
-func (a *testSemAct) Shift(tok Token, recovered bool) {
+func (a *testSemAct) Shift(tok VToken, recovered bool) {
 	t := a.gram.ParsingTable.Terminals[tok.TerminalID()]
 	if recovered {
 		a.actLog = append(a.actLog, fmt.Sprintf("shift/%v/recovered", t))
@@ -37,11 +37,11 @@ func (a *testSemAct) Accept() {
 	a.actLog = append(a.actLog, "accept")
 }
 
-func (a *testSemAct) TrapAndShiftError(cause Token, popped int) {
+func (a *testSemAct) TrapAndShiftError(cause VToken, popped int) {
 	a.actLog = append(a.actLog, fmt.Sprintf("trap/%v/shift/error", popped))
 }
 
-func (a *testSemAct) MissError(cause Token) {
+func (a *testSemAct) MissError(cause VToken) {
 	a.actLog = append(a.actLog, "miss")
 }
diff --git a/driver/spec.go b/driver/spec.go
index a935577..6127e73 100644
--- a/driver/spec.go
+++ b/driver/spec.go
@@ -48,6 +48,10 @@ func (g *grammarImpl) ErrorTrapperState(state int) bool {
 	return g.g.ParsingTable.ErrorTrapperStates[state] != 0
 }
 
+func (g *grammarImpl) NonTerminal(nonTerminal int) string {
+	return g.g.ParsingTable.NonTerminals[nonTerminal]
+}
+
 func (g *grammarImpl) LHS(prod int) int {
 	return g.g.ParsingTable.LHSSymbols[prod]
 }
@@ -67,3 +71,7 @@ func (g *grammarImpl) Terminal(terminal int) string {
 func (g *grammarImpl) TerminalAlias(terminal int) string {
 	return g.g.LexicalSpecification.Maleeni.KindAliases[terminal]
 }
+
+func (g *grammarImpl) ASTAction(prod int) []int {
+	return g.g.ASTAction.Entries[prod]
+}
diff --git a/driver/template.go b/driver/template.go
new file mode 100644
index 0000000..aa1fbd3
--- /dev/null
+++ b/driver/template.go
@@ -0,0 +1,554 @@
+package driver
+
+import (
+	"bytes"
+	_ "embed"
+	"fmt"
+	"go/ast"
+	"go/format"
+	"go/parser"
+	"go/token"
+	goToken "go/token"
+	"strconv"
+	"strings"
+	"text/template"
+
+	"github.com/nihei9/vartan/spec"
+)
+
+//go:embed parser.go
+var parserCoreSrc string
+
+//go:embed semantic_action.go
+var semActSrc string
+
+func GenParser(cgram *spec.CompiledGrammar, pkgName string) ([]byte, error) {
+	var parserSrc string
+	{
+		fset := goToken.NewFileSet()
+		f, err := parser.ParseFile(fset, "parser.go", parserCoreSrc, parser.ParseComments)
+		if err != nil {
+			return nil, err
+		}
+
+		var b strings.Builder
+		err = format.Node(&b, fset, f)
+		if err != nil {
+			return nil, err
+		}
+
+		parserSrc = b.String()
+	}
+
+	var grammarSrc string
+	{
+		t, err := template.New("").Funcs(genGrammarTemplateFuncs(cgram)).Parse(grammarSrcTmplate)
+		if err != nil {
+			return nil, err
+		}
+
+		var b strings.Builder
+		err = t.Execute(&b, map[string]interface{}{
+			"class":            cgram.ParsingTable.Class,
+			"initialState":     cgram.ParsingTable.InitialState,
+			"startProduction":  cgram.ParsingTable.StartProduction,
+			"terminalCount":    cgram.ParsingTable.TerminalCount,
+			"nonTerminalCount": cgram.ParsingTable.NonTerminalCount,
+			"eofSymbol":        cgram.ParsingTable.EOFSymbol,
+			"errorSymbol":      cgram.ParsingTable.ErrorSymbol,
+		})
+		if err != nil {
+			return nil, err
+		}
+
+		grammarSrc = b.String()
+	}
+
+	var lexerSrc string
+	{
+		t, err := template.New("").Funcs(genLexerTemplateFuncs(cgram)).Parse(lexerSrcTmplate)
+		if err != nil {
+			return nil, err
+		}
+
+		var b strings.Builder
+		err = t.Execute(&b, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		lexerSrc = b.String()
+	}
+
+	var src string
+	{
+		tmpl := `// Code generated by vartan-go. DO NOT EDIT.
+{{ .parserSrc }}
+
+{{ .grammarSrc }}
+
+{{ .lexerSrc }}
+`
+		t, err := template.New("").Parse(tmpl)
+		if err != nil {
+			return nil, err
+		}
+
+		var b strings.Builder
+		err = t.Execute(&b, map[string]string{
+			"parserSrc":  parserSrc,
+			"grammarSrc": grammarSrc,
+			"lexerSrc":   lexerSrc,
+		})
+		if err != nil {
+			return nil, err
+		}
+
+		src = b.String()
+	}
+
+	fset := goToken.NewFileSet()
+	f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+	if err != nil {
+		return nil, err
+	}
+
+	f.Name = ast.NewIdent(pkgName)
+
+	// Complete an import statement.
+	for _, d := range f.Decls {
+		gd, ok := d.(*ast.GenDecl)
+		if !ok || gd.Tok != token.IMPORT {
+			continue
+		}
+		gd.Specs = append(gd.Specs, &ast.ImportSpec{
+			Path: &ast.BasicLit{
+				Value: `"io"`,
+			},
+		})
+		break
+	}
+
+	var b bytes.Buffer
+	err = format.Node(&b, fset, f)
+	if err != nil {
+		return nil, err
+	}
+
+	return b.Bytes(), nil
+}
+
+const grammarSrcTmplate = `
+type grammarImpl struct {
+	recoverProductions      []int
+	action                  []int
+	goTo                    []int
+	alternativeSymbolCounts []int
+	errorTrapperStates      []int
+	nonTerminals            []string
+	lhsSymbols              []int
+	terminals               []string
+	terminalAliases         []string
+	astActions              [][]int
+}
+
+func NewGrammar() *grammarImpl {
+	return &grammarImpl{
+		recoverProductions:      {{ genRecoverProductions }},
+		action:                  {{ genAction }},
+		goTo:                    {{ genGoTo }},
+		alternativeSymbolCounts: {{ genAlternativeSymbolCounts }},
+		errorTrapperStates:      {{ genErrorTrapperStates }},
+		nonTerminals:            {{ genNonTerminals }},
+		lhsSymbols:              {{ genLHSSymbols }},
+		terminals:               {{ genTerminals }},
+		terminalAliases:         {{ genTerminalAliases }},
+		astActions:              {{ genASTActions }},
+	}
+}
+
+func (g *grammarImpl) Class() string {
+	return "{{ .class }}"
+}
+
+func (g *grammarImpl) InitialState() int {
+	return {{ .initialState }}
+}
+
+func (g *grammarImpl) StartProduction() int {
+	return {{ .startProduction }}
+}
+
+func (g *grammarImpl) RecoverProduction(prod int) bool {
+	return g.recoverProductions[prod] != 0
+}
+
+func (g *grammarImpl) Action(state int, terminal int) int {
+	return g.action[state*{{ .terminalCount }}+terminal]
+}
+
+func (g *grammarImpl) GoTo(state int, lhs int) int {
+	return g.goTo[state*{{ .nonTerminalCount }}+lhs]
+}
+
+func (g *grammarImpl) AlternativeSymbolCount(prod int) int {
+	return g.alternativeSymbolCounts[prod]
+}
+
+func (g *grammarImpl) TerminalCount() int {
+	return {{ .terminalCount }}
+}
+
+func (g *grammarImpl) ErrorTrapperState(state int) bool {
+	return g.errorTrapperStates[state] != 0
+}
+
+func (g *grammarImpl) NonTerminal(nonTerminal int) string {
+	return g.nonTerminals[nonTerminal]
+}
+
+func (g *grammarImpl) LHS(prod int) int {
+	return g.lhsSymbols[prod]
+}
+
+func (g *grammarImpl) EOF() int {
+	return {{ .eofSymbol }}
+}
+
+func (g *grammarImpl) Error() int {
+	return {{ .errorSymbol }}
+}
+
+func (g *grammarImpl) Terminal(terminal int) string {
+	return g.terminals[terminal]
+}
+
+func (g *grammarImpl) TerminalAlias(terminal int) string {
+	return g.terminalAliases[terminal]
+}
+
+func (g *grammarImpl) ASTAction(prod int) []int {
+	return g.astActions[prod]
+}
+`
+
+func genGrammarTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
+	return template.FuncMap{
+		"genRecoverProductions": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.RecoverProductions {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genAction": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.Action {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genGoTo": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.GoTo {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genAlternativeSymbolCounts": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.AlternativeSymbolCounts {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genErrorTrapperStates": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.ErrorTrapperStates {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genNonTerminals": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]string{\n")
+			for _, v := range cgram.ParsingTable.NonTerminals {
+				fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genLHSSymbols": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.ParsingTable.LHSSymbols {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genTerminals": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]string{\n")
+			for _, v := range cgram.ParsingTable.Terminals {
+				fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genTerminalAliases": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]string{\n")
+			for _, v := range cgram.LexicalSpecification.Maleeni.KindAliases {
+				fmt.Fprintf(&b, "%v,\n", strconv.Quote(v))
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genASTActions": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[][]int{\n")
+			for _, entries := range cgram.ASTAction.Entries {
+				if len(entries) == 0 {
+					fmt.Fprintf(&b, "nil,\n")
+					continue
+				}
+
+				fmt.Fprintf(&b, "{\n")
+				c := 1
+				for _, v := range entries {
+					fmt.Fprintf(&b, "%v, ", v)
+					if c == 20 {
+						fmt.Fprintf(&b, "\n")
+						c = 1
+					} else {
+						c++
+					}
+				}
+				if c > 1 {
+					fmt.Fprintf(&b, "\n")
+				}
+				fmt.Fprintf(&b, "},\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+	}
+}
+
+const lexerSrcTmplate = `
+type vToken struct {
+	terminalID int
+	skip       bool
+	tok        *Token
+}
+
+func (t *vToken) TerminalID() int {
+	return t.terminalID
+}
+
+func (t *vToken) Lexeme() []byte {
+	return t.tok.Lexeme
+}
+
+func (t *vToken) EOF() bool {
+	return t.tok.EOF
+}
+
+func (t *vToken) Invalid() bool {
+	return t.tok.Invalid
+}
+
+func (t *vToken) Skip() bool {
+	return t.skip
+}
+
+func (t *vToken) Position() (int, int) {
+	return t.tok.Row, t.tok.Col
+}
+
+var kindToTerminal = {{ genKindToTerminal }}
+
+var skip = {{ genSkip }}
+
+type tokenStream struct {
+	lex            *Lexer
+	kindToTerminal []int
+	skip           []int
+}
+
+func NewTokenStream(src io.Reader) (*tokenStream, error) {
+	lex, err := NewLexer(NewLexSpec(), src)
+	if err != nil {
+		return nil, err
+	}
+
+	return &tokenStream{
+		lex: lex,
+	}, nil
+}
+
+func (t *tokenStream) Next() (VToken, error) {
+	tok, err := t.lex.Next()
+	if err != nil {
+		return nil, err
+	}
+	return &vToken{
+		terminalID: kindToTerminal[tok.KindID],
+		skip:       skip[tok.KindID] > 0,
+		tok:        tok,
+	}, nil
+}
+`
+
+func genLexerTemplateFuncs(cgram *spec.CompiledGrammar) template.FuncMap {
+	return template.FuncMap{
+		"genKindToTerminal": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.LexicalSpecification.Maleeni.KindToTerminal {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+		"genSkip": func() string {
+			var b strings.Builder
+			fmt.Fprintf(&b, "[]int{\n")
+			c := 1
+			for _, v := range cgram.LexicalSpecification.Maleeni.Skip {
+				fmt.Fprintf(&b, "%v, ", v)
+				if c == 20 {
+					fmt.Fprintf(&b, "\n")
+					c = 1
+				} else {
+					c++
+				}
+			}
+			if c > 1 {
+				fmt.Fprintf(&b, "\n")
+			}
+			fmt.Fprintf(&b, "}")
+			return b.String()
+		},
+	}
+}
+
+func GenSemanticAction(pkgName string) ([]byte, error) {
+	var src string
+	{
+		tmpl := `// Code generated by vartan-go. DO NOT EDIT.
+{{ .semActSrc }}
+`
+		t, err := template.New("").Parse(tmpl)
+		if err != nil {
+			return nil, err
+		}
+
+		var b strings.Builder
+		err = t.Execute(&b, map[string]string{
+			"semActSrc": semActSrc,
+		})
+		if err != nil {
+			return nil, err
+		}
+
+		src = b.String()
+	}
+
+	fset := goToken.NewFileSet()
+	f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+	if err != nil {
+		return nil, err
+	}
+
+	f.Name = ast.NewIdent(pkgName)
+
+	var b bytes.Buffer
+	err = format.Node(&b, fset, f)
+	if err != nil {
+		return nil, err
+	}
+
+	return b.Bytes(), nil
+}
diff --git a/driver/token_stream.go b/driver/token_stream.go
index feb86ae..97c9f1f 100644
--- a/driver/token_stream.go
+++ b/driver/token_stream.go
@@ -7,33 +7,33 @@ import (
 	"github.com/nihei9/vartan/spec"
 )
 
-type token struct {
+type vToken struct {
 	terminalID int
 	skip       bool
 	tok        *mldriver.Token
 }
 
-func (t *token) TerminalID() int {
+func (t *vToken) TerminalID() int {
 	return t.terminalID
 }
 
-func (t *token) Lexeme() []byte {
+func (t *vToken) Lexeme() []byte {
 	return t.tok.Lexeme
}
 
-func (t *token) EOF() bool {
+func (t *vToken) EOF() bool {
 	return t.tok.EOF
 }
 
-func (t *token) Invalid() bool {
+func (t *vToken) Invalid() bool {
 	return t.tok.Invalid
 }
 
-func (t *token) Skip() bool {
+func (t *vToken) Skip() bool {
 	return t.skip
 }
 
-func (t *token) Position() (int, int) {
+func (t *vToken) Position() (int, int) {
 	return t.tok.Row, t.tok.Col
 }
 
@@ -56,12 +56,12 @@ func NewTokenStream(g *spec.CompiledGrammar, src io.Reader) (TokenStream, error)
 	}, nil
 }
 
-func (l *tokenStream) Next() (Token, error) {
+func (l *tokenStream) Next() (VToken, error) {
 	tok, err := l.lex.Next()
 	if err != nil {
 		return nil, err
 	}
-	return &token{
+	return &vToken{
 		terminalID: l.kindToTerminal[tok.KindID],
 		skip:       l.skip[tok.KindID] > 0,
 		tok:        tok,
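
Note: taken together, the driver changes above (the `Grammar` interface, the `Token` → `VToken` rename, and the reworked `NewSyntaxTreeActionSet`/`NewParser` signatures) change how the `driver` package is used as a library. The following is a minimal sketch based on the calls visible in `cmd/vartan/parse.go` in this diff; the file name `expr.json` and the use of `os.ReadFile` are illustrative assumptions, and `PrintTree` is the exported helper that the README's generated-code example relies on.

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/nihei9/vartan/driver"
	"github.com/nihei9/vartan/spec"
)

func main() {
	// Read the compiled grammar produced by `vartan compile` (file name assumed).
	data, err := os.ReadFile("expr.json")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	cg := &spec.CompiledGrammar{}
	if err := json.Unmarshal(data, cg); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// The compiled grammar is wrapped once, and the same wrapper is passed to both
	// the semantic action set and the parser, mirroring the change to parse.go.
	gram := driver.NewGrammar(cg)

	// The token stream still takes the raw compiled grammar.
	toks, err := driver.NewTokenStream(cg, os.Stdin)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	// Build an AST but not a CST.
	treeAct := driver.NewSyntaxTreeActionSet(gram, true, false)
	p, err := driver.NewParser(toks, gram, driver.SemanticAction(treeAct))
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	if err := p.Parse(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}

	driver.PrintTree(os.Stdout, treeAct.AST())
}
```

Because `NewSyntaxTreeActionSet` now accepts the `Grammar` interface instead of `*spec.CompiledGrammar`, this same code path is shared by generated parsers, which supply their own `grammarImpl` built from the templates above.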