This repository has been archived by the owner on Dec 3, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathslexer.ml
300 lines (286 loc) · 10.5 KB
/
slexer.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
(** sedlex lexer for dill *)
open Sedlexing (* new_line, loc stuff that parser will use *)
open Syntax (* my SyntaxError exception type *)
open Parser (* token types from there *)
(* Not in common.ml for now, Other modules might have different error
types. Or, should I make a unified error type? *)
(* Idea: parameterize the module by encoding *)
module Enc = Sedlexing.Utf8
(* Regular expressions *)
let newline = [%sedlex.regexp? Opt '\r', '\n']
let whitespace = [%sedlex.regexp? (' ' | '\t')]
let digit = [%sedlex.regexp? '0'..'9']
let hexdigit = [%sedlex.regexp? ('0'..'9' | 'a'..'f' | 'A'..'F')]
let iconst = [%sedlex.regexp? Plus digit]
let hexconst = [%sedlex.regexp? "0x", Plus hexdigit]
let expon = [%sedlex.regexp? Opt ('e' | 'E'), Opt ('-' | '+'), Plus digit]
let fconst = [%sedlex.regexp? Plus digit, '.', Star digit, Opt expon]
let ident_lc =
[%sedlex.regexp? ('a'..'z' | '_'),
Star ('a'..'z' | 'A'..'Z' | '0'..'9' | '_')]
let qident_lc = [%sedlex.regexp? Opt (ident_lc, "::"), ident_lc]
let ident_uc =
[%sedlex.regexp? ('A'..'Z'), Star ('a'..'z' | 'A'..'Z' | '0'..'9' | '_')]
(* Currently parsing :: as a separate token. *)
(* let qident_uc = [%sedlex.regexp? Opt (ident_lc, "::"), ident_uc] *)
let vlabel = [%sedlex.regexp? '#', ident_lc]
let hexbyte = [%sedlex.regexp? "\\x", Rep (hexdigit, 2)]
(* ASCII printable except ' and \ (for char literals) *)
(* leaving out double quote (34), '"' works in ocaml. *)
let char_noesc = [%sedlex.regexp? (20 .. 38 | 40 .. 91 | 93 .. 126)]
(* Any unicode except ' and \ for string literals. May not be needed. *)
let str_noesc = [%sedlex.regexp? Sub (any, Chars "\"\\")]
(*
type ttype =
| I_LIT of Int64.t
| F_LIT of float
| S_LIT of string
| B_LIT of char
| LC_IDENT of string
| UC_IDENT of string
| VLABEL of string
| LPAREN | RPAREN | LBRACE | RBRACE | LSQRB | RSQRB
| PLUS | MINUS | TIMES | DIV | MOD
(* | UMINUS (* not lexed *) *)
| AMP | PIPE | CARAT | TILDE (* bitwise operators *)
| SHL | SHR (* logical shift *)
| EQ | NE | LT | GT | LE | GE
| AND | OR | NOT (* Which ones overloadable? Maybe not these. *)
| TRUE | FALSE | NULL (* | VAL *)
| COLON | DCOLON | SEMI | DOT | COMMA
| HASH | DOLLAR | QMARK | ARROW | DARROW
| ASSIGN | NULLASSIGN
| VAR | REF | IMMREF
| NOP
| IF | THEN | ELSIF | ELSE | ENDIF
| WHILE (* | LOOP *) | ENDWHILE
| CASE | OF | ENDCASE
| IS | DO
| PRIVATE | PROC | ENDPROC | RETURN
| MODULE (* | BEGIN *) | ENDMODULE | MODSPEC | ENDMODSPEC
| IMPORT | AS | OPEN (* | EXPORT *) | REQUIRE
| TYPE | ENDTYPE | OPAQUE | REC | RECORD | VARIANT | MUT
| EOF
type token = {
ttype: ttype;
loc: Lexing.position * Lexing.position
}
*)
(* Map and function for unparsing and error messages. *)
module Ttype =
struct
type t = token
let compare = Stdlib.compare
end
(* Used by the parser to generate error messages also. *)
module TokenMap = Map.Make(Ttype)
let ttype_strings = List.fold_left (* of_list in ocaml 5.1 *)
(fun m (k,v) -> TokenMap.add k v m) TokenMap.empty [
(LPAREN, "("); (RPAREN, ")"); (LBRACE, "{"); (RBRACE, "}");
(LSQRB, "["); (RSQRB, "]"); (PLUS, "+"); (MINUS, "-");
(TIMES, "*"); (DIV, "/"); (MOD, "%"); (CARAT, "^"); (TILDE, "~");
(BANG, "!"); (AMP, "&"); (SHL, "<<"); (SHR, ">>"); (EQ, "==");
(NE, "!="); (LT, "<"); (LE, "<="); (GT, ">"); (GE, ">=");
(AND, "&&"); (OR, "||"); (HASH, "#"); (QMARK, "?"); (DOLLAR, "$");
(ASSIGN, "="); (NULLASSIGN, "?="); (ARROW, "->"); (DARROW, "=>");
(DOT, "."); (SEMI, ";"); (DCOLON, "::"); (COMMA, ","); (COLON, ":");
(MODULE, "module"); (* (BEGIN, "begin"); *) (ENDMODULE, "/module");
(IMPORT, "import"); (OPEN, "open"); (AS, "as"); (PRIVATE, "private");
(MODSPEC, "modspec"); (ENDMODSPEC, "/modspec"); (REQUIRE, "require");
(IS, "is"); (DO, "do");
(PROC, "proc"); (ENDPROC, "/proc"); (RETURN, "return");
(VAR, "var"); (REF, "ref"); (NOP, "nop"); (IF, "if"); (THEN, "then");
(ELSIF, "elsif"); (ELSE, "else"); (ENDIF, "/if");
(WHILE, "while"); (* (LOOP, "loop"); *) (ENDWHILE, "/while");
(CASE, "case"); (OF, "of"); (ENDCASE, "/case");
(OPAQUE, "opaque"); (TYPE, "type"); (MUT, "mut"); (REC, "rec");
(STRUCT, "struct"); (VARIANT, "variant");
(TRUE, "#true"); (FALSE, "#false"); (NULL, "#null")
]
(** For converting tokens back to strings. Use this in Ast also *)
let string_of_ttype = function
| I_LIT n -> Int64.to_string n
| F_LIT x -> string_of_float x
| S_LIT s -> "\"" ^ String.escaped s ^ "\""
| B_LIT c -> "'" ^ Char.escaped c ^ "'"
| UC_IDENT s -> s
| LC_IDENT s -> s
| VLABEL s -> s
| t -> TokenMap.find t ttype_strings
(* Useful in parser for error messages. But does the lexer also store this? *)
let string_of_token token = string_of_ttype token
(* do I have to write my own to_string for the buf? *)
let string_of_ucarray arr =
String.of_seq (Array.to_seq (Array.map Uchar.to_char arr))
let syntax_error msg buf =
SyntaxError { msg=msg;
loc=lexing_positions buf }
let rec tparse (buf: Sedlexing.lexbuf) =
match%sedlex buf with
| whitespace ->
tparse buf
| newline -> (* sedlex doesn't need you to manually bump newlines *)
(* new_line buf; *) tparse buf
| "(*" ->
comment 0 buf
| '\'' ->
byte [] buf
| "\"" ->
string (Buffer.create 80) buf
| iconst -> I_LIT (Int64.of_string (Enc.lexeme buf))
| hexconst -> I_LIT (Int64.of_string (Enc.lexeme buf))
| fconst -> F_LIT (Float.of_string (Enc.lexeme buf))
| "#true" -> TRUE
| "#false" -> FALSE
| "#null" -> NULL
| vlabel -> VLABEL (Enc.lexeme buf)
(* | '#' -> HASH *)
| '(' -> LPAREN
| ')' -> RPAREN
| '{' -> LBRACE
| '}' -> RBRACE
| '[' -> LSQRB
| ']' -> RSQRB
| '+' -> PLUS
| '-' -> MINUS
| '*' -> TIMES
| '/' -> DIV
| '%' -> MOD
| '^' -> CARAT (* overloadable, maybe *)
| '|' -> PIPE
| '&' -> AMP
| '~' -> TILDE (* bitwise not *)
| "<<" -> SHL
| ">>" -> SHR
| "==" -> EQ
| "!=" -> NE
| '<' -> LT
| "<=" -> LE
| ">" -> GT
| ">=" -> GE
| "&&" -> AND
| "||" -> OR
| '!' -> BANG
| '?' -> QMARK (* Maybe later: attached to something else. *)
| '$' -> DOLLAR
| '=' -> ASSIGN
| "?=" -> NULLASSIGN
| "->" -> ARROW
| "=>" -> DARROW
| ',' -> COMMA
| '.' -> DOT
| ';' -> SEMI
| "::" -> DCOLON (* actually this should be attached too. *)
| ':' -> COLON
| "var" -> VAR
(* | "ref^" -> IMMREF *)
| "ref" -> REF
| "nop" -> NOP
| "if" -> IF
| "then" -> THEN
| "elsif" -> ELSIF
| "else" -> ELSE
| "/if" -> ENDIF
| "while" -> WHILE
(* | "loop" -> LOOP *)
| "/while" -> ENDWHILE
| "case" -> CASE
| "of" -> OF
| "/case" -> ENDCASE
| "proc" -> PROC
| "/proc" -> ENDPROC
| "return" -> RETURN
| "is" -> IS
| "do" -> DO
| "module" -> MODULE
(* | "begin" -> BEGIN *)
| "/module" -> ENDMODULE
| "modspec" -> MODSPEC
| "/modspec" -> ENDMODSPEC (* Should put these in separate parser? *)
| "import" -> IMPORT
| "as" -> AS
| "open" -> OPEN
(* | "export" -> EXPORT *)
| "private" -> PRIVATE
| "type" -> TYPE
| "/type" -> ENDTYPE
| "rec" -> REC
| "opaque" -> OPAQUE
| "mut" -> MUT
| "record" -> STRUCT
| "variant" -> VARIANT
| ident_lc -> LC_IDENT (Enc.lexeme buf)
| ident_uc -> UC_IDENT (Enc.lexeme buf)
| eof -> EOF
(* Lexeme will be empty unless we match an actual regex. *)
| any ->
raise (syntax_error ("Unexpected character: "
^ Enc.lexeme buf) buf)
| _ -> failwith "Unreachable: slexer.tparse"
(* lexing rule for ident chained together with restricted whitespace *)
(* but it has to return multiple tokens; do I have to do some sort of push/pop? *)
(* or generalize so multiple tokens can be returned? *)
(* nevermind *)
and comment depth buf =
match%sedlex buf with
| "(*" -> comment (depth+1) buf
| "*)" ->
if depth = 0 then tparse buf else comment (depth-1) buf
| newline -> (* new_line buf; *) comment depth buf
| eof -> raise (syntax_error "Unterminated comment at EOF" buf)
| any -> comment depth buf (* unicode in comments too, yay! *)
| _ -> failwith "Unreachable: slexer.comment"
and string acc buf =
match%sedlex buf with
| '"' -> S_LIT (Buffer.contents acc)
| "\\\"" -> Buffer.add_char acc '\"'; string acc buf
| "\\n" -> Buffer.add_char acc '\n'; string acc buf
| "\\r" -> Buffer.add_char acc '\r'; string acc buf
| "\\t" -> Buffer.add_char acc '\t'; string acc buf
| "\\b" -> Buffer.add_char acc '\b'; string acc buf
| "\\\\" -> Buffer.add_char acc '\\'; string acc buf
| hexbyte ->
let bytechar =
char_of_int (int_of_string ("0x" ^ String.sub (Enc.lexeme buf) 2 2)) in
Buffer.add_char acc bytechar; string acc buf
| "\\", any ->
raise (syntax_error ("Illegal backslash escape in string: "
^ Enc.lexeme buf) buf)
| '\n' -> raise (syntax_error "Unterminated string constant at newline" buf)
| eof -> raise (syntax_error "Unterminated string constant at EOF" buf)
| str_noesc -> Buffer.add_string acc (Enc.lexeme buf);
string acc buf
(* Probably this cannot be reached either. *)
| any -> raise (syntax_error ("Illegal character in string: "
^ Enc.lexeme buf) buf)
| _ -> failwith "Unreachable: slexer.string"
and byte (acc: char list) buf =
match%sedlex buf with
| '\'' ->
if List.length acc == 0 then
raise (syntax_error "Byte constant cannot be empty" buf)
else if List.length acc > 1 then
(* could match just the single-char expressions,
but this is more informative? *)
raise (syntax_error "Too many characters in byte constant" buf)
else
B_LIT (List.hd acc)
| "\\n" -> byte ('\n'::acc) buf
| "\\t" -> byte ('\t'::acc) buf
| "\\r" -> byte ('\r'::acc) buf
| "\\b" -> byte ('\b'::acc) buf (* backspace *)
| "\\\\" -> byte ('\\'::acc) buf
| hexbyte ->
let bytechar =
char_of_int (int_of_string ("0x" ^ String.sub (Enc.lexeme buf) 2 2)) in
byte (bytechar::acc) buf
| "\\", any ->
raise (syntax_error ("Illegal backslash escape in byte: "
^ Enc.lexeme buf) buf)
| char_noesc -> byte ((Enc.lexeme buf).[0]::acc) buf
| any ->
raise (syntax_error ("Illegal byte constant: " ^ Enc.lexeme buf) buf)
| _ -> failwith "Unreachable: slexer.byte"
(* Wait...will buf be different after? what if I get pos first? *)
let token buf = tparse buf
(* { ttype=tparse buf; loc=lexing_positions buf } *)