Skip to content

Compiler: Modernize the js lexer, now utf-8 aware #1386

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Jan 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
* Compiler: JavaScript files generated by `js_of_ocaml` are now UTF-8 encoded.
* Compiler: use identifier for object literals when possible
* Compiler: Cache function arity (the length prop of a function is slow with v8)
* Compiler: The js lexer is now UTF-8 aware; it recognizes and emits UTF-8 identifiers
* Compiler: Update the js lexer with new number literal syntax

## Bug fixes
- Effects: fix Js.export and Js.export_all to work with functions
Expand Down
83 changes: 83 additions & 0 deletions compiler/bin-js-lexer/cmp.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
open Js_of_ocaml_compiler

(* Files to check: every command-line argument after the program name. *)
let files = List.tl (Array.to_list Sys.argv)

(* Column of a position: its offset relative to the start of its line. *)
let col ({ pos_cnum; pos_bol; _ } : Lexing.position) = pos_cnum - pos_bol

(* Two positions are considered equal when they agree on both line number
   and column; the file name is not compared. *)
let loc_equal (a : Lexing.position) (b : Lexing.position) =
  if a.pos_lnum <> b.pos_lnum then false else col a = col b

(* Render a position as "file:line:column". *)
let loc (p : Lexing.position) =
  let column = p.pos_cnum - p.pos_bol in
  Printf.sprintf "%s:%d:%d" p.pos_fname p.pos_lnum column

(* Entry point.  Every file named on the command line is lexed twice, once
   with [Js_lexer] (over a [Lexing] buffer) and once with [Flow_lexer] (over
   a [Sedlexing] UTF-8 buffer).  The two token streams are walked in
   lock-step and every disagreement, on the token itself or on its start/end
   location, is reported on stderr.  Both input channels are closed once the
   file has been processed, including when a lexer raises. *)
let () =
  (* Flip to [true] to trace every pair of tokens on stderr. *)
  let trace_tokens = false in
  let start_pos f = { Lexing.pos_fname = f; pos_lnum = 1; pos_bol = 0; pos_cnum = 0 } in
  List.iter
    (fun f ->
      let c1 = open_in_bin f in
      let c2 =
        (* Do not leak [c1] if the second open fails. *)
        try open_in_bin f
        with e ->
          close_in_noerr c1;
          raise e
      in
      Fun.protect
        ~finally:(fun () ->
          close_in_noerr c1;
          close_in_noerr c2)
        (fun () ->
          let l1 = Lexing.from_channel c1 in
          Lexing.set_position l1 (start_pos f);
          Lexing.set_filename l1 f;
          let l2 = Sedlexing.Utf8.from_channel c2 in
          Sedlexing.set_filename l2 f;
          Sedlexing.set_position l2 (start_pos f);
          Sedlexing.start l2;
          (* Next token of the ocamllex lexer with its start/end positions. *)
          let f1 () =
            let t = Js_lexer.main l1 in
            t, l1.lex_start_p, l1.lex_curr_p
          in
          let drop1 () = Js_lexer.drop_line l1 in
          (* Same for the sedlex lexer, whose environment is threaded
             through a reference. *)
          let env = ref (Flow_lexer.Lex_env.create l2) in
          let f2 () =
            let nenv, res = Flow_lexer.token !env in
            env := nenv;
            let p = Flow_lexer.Lex_result.loc res in
            Flow_lexer.Lex_result.token res, fst p, snd p
          in
          let drop2 () = Flow_lexer.drop_line !env in
          let rec compare_tokens () =
            let t1, p1, p1' = f1 () in
            let t2, p2, p2' = f2 () in
            if trace_tokens
            then
              Printf.eprintf
                "%s: %s\n%s: %s\n"
                (loc p1)
                (Js_token.to_string_extra t1)
                (loc p2)
                (Js_token.to_string_extra t2);
            if t1 <> t2
            then (
              Printf.eprintf
                "Token mismatch: %s:%s <> %s\n"
                (loc p1)
                (Js_token.to_string_extra t1)
                (Js_token.to_string_extra t2);
              (* Ask both lexers to drop the current line before going on. *)
              drop1 ();
              drop2 ())
            else if not (loc_equal p1 p2 && loc_equal p1' p2')
            then
              Printf.eprintf
                "Location mismatch for %s: %s-%s <> %s-%s\n"
                (Js_token.to_string_extra t1)
                (loc p1)
                (loc p1')
                (loc p2)
                (loc p2');
            (* Stop only once both lexers have reached the end of the file. *)
            match t1, t2 with
            | T_EOF, T_EOF -> ()
            | _ -> compare_tokens ()
          in
          compare_tokens ()))
    files
3 changes: 3 additions & 0 deletions compiler/bin-js-lexer/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
; Builds the `cmp` executable (entry module cmp.ml) and links it against the
; js_of_ocaml-compiler library.
(executable
(name cmp)
(libraries js_of_ocaml-compiler))
6 changes: 3 additions & 3 deletions compiler/lib/driver.ml
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ let pack ~wrap_with_fun ~standalone { Linker.runtime_code = js; always_required_
{|((typeof module === 'object' && module.exports) || %s)|}
Constant.global_object
in
let lex = Parse_js.Lexer.of_lexbuf (Lexing.from_string s) in
let lex = Parse_js.Lexer.of_string s in
Parse_js.parse_expr lex
in
var Constant.exports_ export_node :: js
Expand Down Expand Up @@ -477,7 +477,7 @@ if (typeof module === 'object' && module.exports) {
|}
name
in
let lex = Parse_js.Lexer.of_lexbuf (Lexing.from_string s) in
let lex = Parse_js.Lexer.of_string s in
Parse_js.parse lex
in
js @ export_node
Expand All @@ -504,7 +504,7 @@ if (typeof module === 'object' && module.exports) {
}(Object));
|}
in
let lex = Parse_js.Lexer.of_lexbuf (Lexing.from_string s) in
let lex = Parse_js.Lexer.of_string s in
Parse_js.parse lex
in
e @ js
Expand Down
29 changes: 27 additions & 2 deletions compiler/lib/dune
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
compiler-libs.common
compiler-libs.bytecomp
menhirLib
sedlex
(select
source_map_io.ml
from
Expand All @@ -14,7 +15,7 @@
(flags
(:standard -w -7-37 -safe-string))
(preprocess
(pps ppx_optcomp_light)))
(pps ppx_optcomp_light sedlex.ppx)))

(ocamllex js_lexer annot_lexer)

Expand All @@ -30,7 +31,31 @@
--unused-token
TCommentLineDirective
--unused-token
TUnknown))
T_ERROR
--unused-token
T_AT
--unused-token
T_POUND
--unused-token
T_PLING_PERIOD
--unused-token
T_PLING_PLING
--unused-token
T_OR_ASSIGN
--unused-token
T_AND_ASSIGN
--unused-token
T_NULLISH_ASSIGN
--unused-token
T_EXP
--unused-token
T_EXP_ASSIGN
--unused-token
T_ARROW
--unused-token
T_BIGINT
--unused-token
T_TEMPLATE_PART))

(menhir
(modules annot_parser)
Expand Down
Loading