From bcde99fbe99174a094f38fdda70ad69d65a423f4 Mon Sep 17 00:00:00 2001
From: Alex AUVOLAT <alex.auvolat@ens.fr>
Date: Wed, 30 Apr 2014 17:19:08 +0200
Subject: First commit (WIP)

---
 frontend/lexer.mll | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 frontend/lexer.mll

(limited to 'frontend/lexer.mll')

diff --git a/frontend/lexer.mll b/frontend/lexer.mll
new file mode 100644
index 0000000..bc9267f
--- /dev/null
+++ b/frontend/lexer.mll
@@ -0,0 +1,114 @@
+(*
+  Cours "Sémantique et Application à la Vérification de programmes"
+  
+  Antoine Miné 2014
+  Ecole normale supérieure, Paris, France / CNRS / INRIA
+*)
+
+(*
+  Lexer for a very simple C-like "curly bracket" language.
+*)
+    
+{
+open Lexing
+open Abstract_syntax_tree
+open Parser
+
+(* Reserved words and their parser tokens; the identifier rule consults this table. *)
+let kwd_table = Hashtbl.create 16
+let () =
+  List.iter (fun (kwd, tok) -> Hashtbl.replace kwd_table kwd tok)
+    [
+     (* types *)
+     "bool",   TOK_BOOL;
+     "int",    TOK_INT;
+     "void",   TOK_VOID;
+     "auto",   TOK_AUTO;
+
+     (* constants *)
+     "true",   TOK_TRUE;
+     "false",  TOK_FALSE;
+
+     (* expression operators *)
+     "rand",   TOK_RAND;
+
+     (* control flow *)
+     "while",  TOK_WHILE;
+     "if",     TOK_IF;
+     "else",   TOK_ELSE;
+     "halt",   TOK_HALT;
+     "return", TOK_RETURN;
+     "break",  TOK_BREAK;
+     "goto",   TOK_GOTO;
+
+     (* special statements *)
+     "assert", TOK_ASSERT;
+     "print",  TOK_PRINT;
+   ]
+}
+
+(* special character classes; CR is matched by newline only, so a run of blanks can never swallow a line break *)
+let space = [' ' '\t']+
+let newline = "\n" | "\r" | "\r\n"
+
+(* digit classes shared by the literal definitions *)
+let digit = ['0'-'9']
+let digit_ = digit | '_'
+
+(* integer literals: the first digit is mandatory, later positions may also be underscore separators *)
+let int_dec = digit digit_*
+let int_bin = '0' ['b' 'B'] ['0' '1'] ['0' '1' '_']*
+let int_oct = '0' ['o' 'O'] ['0'-'7'] ['0'-'7' '_']*
+let int_hex = '0' ['x' 'X'] ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F' '_']*
+let const_int = int_dec | int_bin | int_oct | int_hex
+
+
+(* main entry point: returns the next parser token, skipping blanks and comments *)
+rule token = parse
+
+(* end of input *)
+| eof { TOK_EOF }
+
+(* layout: line breaks update the lexbuf position; blanks and comments are dropped *)
+| newline { new_line lexbuf; token lexbuf }
+| space { token lexbuf }
+| "//" [^ '\n' '\r']* { token lexbuf }
+| "/*" { comment lexbuf; token lexbuf }
+
+(* integer literals: the matched text is passed on unconverted *)
+| const_int as lit { TOK_int lit }
+
+(* names: reserved words come from kwd_table, anything else is an identifier *)
+| ['a'-'z' 'A'-'Z' '_'] ['a'-'z' 'A'-'Z' '0'-'9' '_']* as name
+{ try Hashtbl.find kwd_table name with Not_found -> TOK_id name }
+
+(* punctuation and operators, two-character forms first *)
+| "<="   { TOK_LESS_EQUAL }
+| ">="   { TOK_GREATER_EQUAL }
+| "=="   { TOK_EQUAL_EQUAL }
+| "!="   { TOK_NOT_EQUAL }
+| "&&"   { TOK_AND_AND }
+| "||"   { TOK_BAR_BAR }
+| '('    { TOK_LPAREN }
+| ')'    { TOK_RPAREN }
+| '{'    { TOK_LCURLY }
+| '}'    { TOK_RCURLY }
+| '*'    { TOK_STAR }
+| '+'    { TOK_PLUS }
+| '-'    { TOK_MINUS }
+| '/'    { TOK_DIVIDE }
+| '%'    { TOK_PERCENT }
+| '!'    { TOK_EXCLAIM }
+| '<'    { TOK_LESS }
+| '>'    { TOK_GREATER }
+| '='    { TOK_EQUAL }
+| ';'    { TOK_SEMICOLON }
+| ':'    { TOK_COLON }
+| ','    { TOK_COMMA }
+
+
+(* body of a block comment: skips input up to the first closing delimiter, so comments do not nest *)
+and comment = parse
+| newline { new_line lexbuf; comment lexbuf }
+| "*/" { () }
+| [^ '\n' '\r'] { comment lexbuf }  (* NOTE(review): no eof case, so an unterminated comment hits the lexer no-match failure *)
-- 
cgit v1.2.3