(* frontend/lexer.mll *)

(*
  Cours "Sémantique et Application à la Vérification de programmes"
  
  Antoine Miné 2014
  Ecole normale supérieure, Paris, France / CNRS / INRIA
*)

(*
  Lexer for a very simple C-like "curly bracket" language.
*)
    
{
open Lexing
open Abstract_syntax_tree
open Parser

(* Reserved words of the language.  Each spelling is mapped to the parser
   token it stands for; the token rule consults this table for every
   identifier-shaped lexeme it reads. *)
let kwd_table = Hashtbl.create 10

(* Fill the table once, when the lexer module is initialised. *)
let () =
  let register (word, tok) = Hashtbl.add kwd_table word tok in
  List.iter register
    [
      (* types *)
      ("bool", TOK_BOOL);
      ("int", TOK_INT);
      ("void", TOK_VOID);
      ("auto", TOK_AUTO);

      (* constants *)
      ("true", TOK_TRUE);
      ("false", TOK_FALSE);

      (* expression operators *)
      ("rand", TOK_RAND);

      (* control flow *)
      ("while", TOK_WHILE);
      ("if", TOK_IF);
      ("else", TOK_ELSE);
      ("halt", TOK_HALT);
      ("return", TOK_RETURN);
      ("break", TOK_BREAK);
      ("goto", TOK_GOTO);

      (* special statements *)
      ("assert", TOK_ASSERT);
      ("print", TOK_PRINT);
    ]
}

(* special character classes *)

(* A run of horizontal blanks.  The carriage return is deliberately left out
   of this class: it belongs to newline below.  ocamllex always prefers the
   longest match, so if '\r' were accepted here, a '\r' followed by blanks or
   by another '\r' would be swallowed as plain spacing and the line break
   would never be counted. *)
let space = [' ' '\t']+

(* A single line terminator: line feed, lone carriage return, or the
   carriage return + line feed pair (longest match keeps the pair together). *)
let newline = "\n" | "\r" | "\r\n"

(* utilities *)

(* one decimal digit *)
let digit = ['0'-'9']
(* one decimal digit or an underscore used as a digit separator *)
let digit_ = ['0'-'9' '_']

(* integers *)

(* Every form starts with a real digit (after the radix prefix, if any);
   underscores are only accepted after that first digit. *)
let int_dec = digit digit_*
let int_bin = ("0b" | "0B") ['0'-'1'] ['0'-'1' '_']*
let int_oct = ("0o" | "0O") ['0'-'7'] ['0'-'7' '_']*
let int_hex = ("0x" | "0X") ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F' '_']*

(* any integer literal, in any of the four radices *)
let const_int = int_bin | int_oct | int_dec | int_hex


(* tokens *)

(* Main entry point: reads the next lexeme from lexbuf and returns the
   matching parser token.  Layout and comments produce no token; the rule
   simply calls itself again after skipping them. *)
rule token = parse

(* layout and comments *)
(* newline is listed before space: ocamllex resolves equally long matches in
   favour of the earlier clause, so input that both could match is counted as
   a line break. *)
| newline               { new_line lexbuf; token lexbuf }
| space                 { token lexbuf }
| "//" [^ '\n' '\r']*   { token lexbuf }
| "/*"                  { comment lexbuf; token lexbuf }

(* integer literals: the matched text is handed over unconverted *)
| const_int as digits   { TOK_int digits }

(* two-character operators *)
| "<="   { TOK_LESS_EQUAL }
| ">="   { TOK_GREATER_EQUAL }
| "=="   { TOK_EQUAL_EQUAL }
| "!="   { TOK_NOT_EQUAL }
| "&&"   { TOK_AND_AND }
| "||"   { TOK_BAR_BAR }

(* one-character symbols *)
| '('    { TOK_LPAREN }
| ')'    { TOK_RPAREN }
| '{'    { TOK_LCURLY }
| '}'    { TOK_RCURLY }
| '+'    { TOK_PLUS }
| '-'    { TOK_MINUS }
| '*'    { TOK_STAR }
| '/'    { TOK_DIVIDE }
| '%'    { TOK_PERCENT }
| '!'    { TOK_EXCLAIM }
| '<'    { TOK_LESS }
| '>'    { TOK_GREATER }
| '='    { TOK_EQUAL }
| ';'    { TOK_SEMICOLON }
| ':'    { TOK_COLON }
| ','    { TOK_COMMA }

(* words: a reserved keyword if present in kwd_table, an identifier
   (TOK_id) otherwise *)
| ['a'-'z' 'A'-'Z' '_'] ['a'-'z' 'A'-'Z' '0'-'9' '_']* as word
    { match Hashtbl.find kwd_table word with
      | keyword -> keyword
      | exception Not_found -> TOK_id word }

(* end of input *)
| eof    { TOK_EOF }


(* Skips the remainder of a block comment; token calls this rule once the
   opening delimiter has been consumed.  The first closing delimiter ends the
   comment: an opening delimiter met inside is treated as ordinary text, so
   block comments do not nest.  Line breaks are reported through new_line so
   that positions stay accurate.  There is no eof clause: on an unterminated
   comment no clause matches and the generated lexer raises
   Failure "lexing: empty token". *)
and comment = parse
| newline          { new_line lexbuf; comment lexbuf }
| "*/"             { () }
| [^ '\n' '\r']    { comment lexbuf }