使用 Menhir 构建 SimPL 的编译器
Lexer and Parser 语法分析模块
Lexer, Parser, AST 是三个依次耦合的模块,可以这么描述三者的关系:
Lexer ---tokens--> Parser ---nodes--> AST
相对于上面的图像化描述,cs3110 反过来构建整个 Lexer 和 Parser 的结构
在 ast.ml
中,定义了 AST 上节点 node 的类型:
type bop =
| Add
| Mul
| Leq
type expr =
| Var of string
| Int of int
| Bool of bool
| Binop of bop * expr * expr
| If of expr * expr * expr
| Let of string * expr * expr
在 parser.mly
中构建的 parser 接受 TOKEN
并将其翻译成 AST 上节点
(* Link the ast and the parser *)
%{
open Ast
%}
(* The name of tokens *)
%token <int> INT
%token <string> ID
%token TRUE
%token FALSE
%token LEQ
%token TIMES
%token PLUS
%token LPAREN
%token RPAREN
%token LET
%token EQUALS
%token IN
%token IF
%token THEN
%token ELSE
%token EOF
(* Compute the precedence and associativity of the operators *)
%nonassoc IN
%nonassoc ELSE
%left LEQ
%left PLUS
%left TIMES
(* Define the start symbol: where to parse the program? *)
%start <Ast.expr> prog
%%
prog:
| e = expr EOF { e }
;
expr:
| i = INT { Int i } (* integer *)
| TRUE { Bool true } (* booleans *)
| FALSE { Bool false }
| e1 = expr Add e2 = expr { Binop (Add, e1, e2) } (* binary operators *)
| e1 = expr Mul e2 = expr { Binop (Mul, e1, e2) }
| e1 = expr Leq e2 = expr { Binop (Leq, e1, e2) }
| LET; x = ID; EQUALS; e1 = expr; IN; e2 = expr { Let (x, e1, e2) } (* let expressions *)
| IF; e1 = expr; THEN; e2 = expr; ELSE; e3 = expr { If (e1, e2, e3) } (* if expressions *)
| LPAREN; e = expr; RPAREN { e } (* parentheses *)
;
lexer.mll
接受输入的 token 序列(也即从字符串角度看代码)并将其进行划分,返回 parser.mly
中早就定义好的 TOKEN
{
open Parser
}
(* Regular expressions for the lexer *)
let white = [' ' '\t']+
let digit = ['0'-'9']
let int = '-'?digit+
let letter = ['a'-'z' 'A'-'Z']
let id = letter (letter|digit)*
(* Define the rules of lexer *)
rule read =
parse
(* 'read lexbuf' reads the next token from lexbuf *)
| white { read lexbuf }
| "true" { TRUE }
| "false" { FALSE }
| "<=" { LEQ }
| "*" { TIMES }
| "+" { PLUS }
| "(" { LPAREN }
| ")" { RPAREN }
| "let" { LET }
| "=" { EQUALS }
| "in" { IN }
| "if" { IF }
| "then" { THEN }
| "else" { ELSE }
(* 'Lexing.lexeme lexbuf' returns the string that was matched *)
| id { ID (Lexing.lexeme lexbuf) }
(* 'int_of_string' converts a string to an integer *)
| int { INT (int_of_string (Lexing.lexeme lexbuf)) }
| eof { EOF }
对于 Menhir 工具来说,这样的构建顺序是合理的;毕竟看上去 lexer.mll
中定义 TOKEN
并不是那么方便