Skip to content

Commit b344aeb

Browse files
committed
Initial parser implementation
The parser and AST were taken from LPython. In subsequent commits we will start modifying them to parse C++.
1 parent 0c22405 commit b344aeb

12 files changed

+3771
-0
lines changed

build.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,13 @@
22

33
set -ex
44

5+
python src/libasr/asdl_cpp.py src/lc/LC.asdl src/lc/ast.h
56
python src/libasr/asdl_cpp.py src/libasr/ASR.asdl src/libasr/asr.h
67
python src/libasr/wasm_instructions_visitor.py
78

9+
(cd src/lc/parser && re2c -W -b tokenizer.re -o tokenizer.cpp)
10+
(cd src/lc/parser && bison -Wall -d -r all parser.yy)
11+
812
cmake \
913
-DCMAKE_BUILD_TYPE=Debug \
1014
-DWITH_LLVM=yes \

environment_unix.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@ dependencies:
1212
- python=3.12.0
1313
- toml=0.10.2
1414
- xtensor=0.24.7
15+
- bison=3.4
16+
- re2c=3.1

src/lc/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
set(SRC
2+
parser/tokenizer.cpp
3+
parser/parser.cpp
4+
parser/parser.tab.cc
5+
26
utils.cpp
37
)
48

src/lc/LC.asdl

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
-- ASDL's 4 builtin types are:
2+
-- identifier, int, string, constant
3+
4+
module LPython
5+
{
6+
mod = Module(stmt* body, type_ignore* type_ignores)
7+
| Interactive(stmt* body)
8+
| Expression(expr body)
9+
| FunctionType(expr* argtypes, expr returns)
10+
11+
stmt = FunctionDef(identifier name, arguments args,
12+
stmt* body, expr* decorator_list, expr? returns,
13+
string? type_comment)
14+
| AsyncFunctionDef(identifier name, arguments args,
15+
stmt* body, expr* decorator_list, expr? returns,
16+
string? type_comment)
17+
18+
| ClassDef(identifier name,
19+
expr* bases,
20+
keyword* keywords,
21+
stmt* body,
22+
expr* decorator_list)
23+
| Return(expr? value)
24+
25+
| Delete(expr* targets)
26+
| Assign(expr* targets, expr value, string? type_comment)
27+
| AugAssign(expr target, operator op, expr value)
28+
-- 'simple' indicates that we annotate simple name without parens
29+
| AnnAssign(expr target, expr annotation, expr? value, int simple)
30+
31+
-- use 'orelse' because else is a keyword in target languages
32+
| For(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
33+
| AsyncFor(expr target, expr iter, stmt* body, stmt* orelse, string? type_comment)
34+
| While(expr test, stmt* body, stmt* orelse)
35+
| If(expr test, stmt* body, stmt* orelse)
36+
| With(withitem* items, stmt* body, string? type_comment)
37+
| AsyncWith(withitem* items, stmt* body, string? type_comment)
38+
39+
| Match(expr subject, match_case* cases)
40+
41+
| Raise(expr? exc, expr? cause)
42+
| Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody)
43+
| Assert(expr test, expr? msg)
44+
45+
| Import(alias* names)
46+
| ImportFrom(identifier? module, alias* names, int level)
47+
48+
| Global(identifier* names)
49+
| Nonlocal(identifier* names)
50+
| Expr(expr value)
51+
| Pass | Break | Continue
52+
53+
-- col_offset is the byte offset in the utf8 string the parser uses
54+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
55+
56+
-- BoolOp() can use left & right?
57+
expr = BoolOp(boolop op, expr* values)
58+
| NamedExpr(expr target, expr value)
59+
| BinOp(expr left, operator op, expr right)
60+
| UnaryOp(unaryop op, expr operand)
61+
| Lambda(arguments args, expr body)
62+
| IfExp(expr test, expr body, expr orelse)
63+
| Dict(expr* keys, expr* values)
64+
| Set(expr* elts)
65+
| ListComp(expr elt, comprehension* generators)
66+
| SetComp(expr elt, comprehension* generators)
67+
| DictComp(expr key, expr value, comprehension* generators)
68+
| GeneratorExp(expr elt, comprehension* generators)
69+
-- the grammar constrains where yield expressions can occur
70+
| Await(expr value)
71+
| Yield(expr? value)
72+
| YieldFrom(expr value)
73+
-- need sequences for compare to distinguish between
74+
-- x < 4 < 3 and (x < 4) < 3
75+
| Compare(expr left, cmpop ops, expr* comparators)
76+
| Call(expr func, expr* args, keyword* keywords)
77+
| FormattedValue(expr value, int conversion, expr? format_spec)
78+
| JoinedStr(expr* values)
79+
-- | Constant(constant value, string? kind)
80+
-- Our specific nodes that are used instead of Constant:
81+
| ConstantStr(string value, string? kind)
82+
| ConstantInt(int value, string? kind)
83+
| ConstantBool(bool value, string? kind)
84+
| ConstantFloat(float value, string? kind)
85+
| ConstantComplex(float re, float im, string? kind)
86+
| ConstantEllipsis(string? kind)
87+
| ConstantNone(string? kind)
88+
| ConstantBytes(string value, string? kind)
89+
90+
-- the following expression can appear in assignment context
91+
| Attribute(expr value, identifier attr, expr_context ctx)
92+
| Subscript(expr value, expr slice, expr_context ctx)
93+
| Starred(expr value, expr_context ctx)
94+
| Name(identifier id, expr_context ctx)
95+
| List(expr* elts, expr_context ctx)
96+
| Tuple(expr* elts, expr_context ctx)
97+
98+
-- can appear only in Subscript
99+
| Slice(expr? lower, expr? upper, expr? step)
100+
101+
-- col_offset is the byte offset in the utf8 string the parser uses
102+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
103+
104+
expr_context = Load | Store | Del
105+
106+
boolop = And | Or
107+
108+
operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift
109+
| RShift | BitOr | BitXor | BitAnd | FloorDiv
110+
111+
unaryop = Invert | Not | UAdd | USub
112+
113+
cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn
114+
115+
comprehension = (expr target, expr iter, expr* ifs, int is_async)
116+
117+
excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
118+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
119+
120+
arguments = (arg* posonlyargs, arg* args, arg* vararg, arg* kwonlyargs,
121+
expr* kw_defaults, arg* kwarg, expr* defaults)
122+
123+
arg = (identifier arg, expr? annotation, string? type_comment)
124+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
125+
126+
-- keyword arguments supplied to call (NULL identifier for **kwargs)
127+
keyword = (identifier? arg, expr value)
128+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
129+
130+
-- import name with optional 'as' alias.
131+
alias = (identifier name, identifier? asname)
132+
attributes (int lineno, int col_offset, int? end_lineno, int? end_col_offset)
133+
134+
withitem = (expr context_expr, expr? optional_vars)
135+
136+
match_case = (pattern pattern, expr? guard, stmt* body)
137+
138+
pattern = MatchValue(expr value)
139+
-- | MatchSingleton(constant value)
140+
| MatchSingleton(expr value)
141+
| MatchSequence(pattern* patterns)
142+
| MatchMapping(expr* keys, pattern* patterns, identifier? rest)
143+
| MatchClass(expr cls, pattern* patterns, identifier* kwd_attrs, pattern* kwd_patterns)
144+
145+
| MatchStar(identifier? name)
146+
-- The optional "rest" MatchMapping parameter handles capturing extra mapping keys
147+
148+
| MatchAs(pattern? pattern, identifier? name)
149+
| MatchOr(pattern* patterns)
150+
151+
attributes (int lineno, int col_offset, int end_lineno, int end_col_offset)
152+
153+
type_ignore = TypeIgnore(int lineno, string tag)
154+
}

src/lc/parser/parser.cpp

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#include <chrono>
#include <fstream>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
6+
7+
#include <lc/parser/parser.h>
8+
#include <lc/parser/parser.tab.hh>
9+
#include <libasr/diagnostics.h>
10+
#include <libasr/string_utils.h>
11+
#include <libasr/utils.h>
12+
#include <lc/parser/parser_exception.h>
13+
14+
namespace LCompilers::LPython {
15+
16+
// Parses the source text `s` and wraps the parsed statements in a Module node.
//
// A Parser is created on `al` and run over `s`. If it throws a
// TokenizerError or ParserError, the diagnostic stored in the exception is
// appended to `diagnostics` and an Error result is returned instead.
//
// The Module's location runs from the start of the first statement to the
// end of the last one; an input with no statements gets the location [0, 0].
//
// al          - allocator used by the Parser and for the Module node
// s           - source text to parse
// prev_loc    - forwarded unchanged to Parser::parse()
// diagnostics - receives the diagnostic of a tokenizer/parser failure
Result<LPython::AST::Module_t*> parse(Allocator &al, const std::string &s,
        uint32_t prev_loc, diag::Diagnostics &diagnostics)
{
    Parser parser(al, diagnostics);
    try {
        parser.parse(s, prev_loc);
    } catch (const parser_local::TokenizerError &tokenizer_failure) {
        diagnostics.diagnostics.push_back(tokenizer_failure.d);
        return Error();
    } catch (const parser_local::ParserError &parser_failure) {
        diagnostics.diagnostics.push_back(parser_failure.d);
        return Error();
    }

    const auto n_stmts = parser.result.size();
    Location span;
    if (n_stmts != 0) {
        span.first = parser.result[0]->base.loc.first;
        span.last = parser.result[n_stmts - 1]->base.loc.last;
    } else {
        span.first = 0;
        span.last = 0;
    }
    return (LPython::AST::Module_t*)LPython::AST::make_Module_t(al, span,
        parser.result.p, n_stmts,
        parser.type_ignore.p, parser.type_ignore.size());
}
43+
44+
void Parser::parse(const std::string &input, uint32_t prev_loc)
45+
{
46+
inp = input;
47+
if (inp.size() > 0) {
48+
if (inp[inp.size()-1] != '\n') inp.append("\n");
49+
} else {
50+
inp.append("\n");
51+
}
52+
m_tokenizer.set_string(inp, prev_loc);
53+
if (yyparse(*this) == 0) {
54+
return;
55+
}
56+
throw parser_local::ParserError("Parsing unsuccessful (internal compiler error)");
57+
}
58+
59+
void Parser::handle_yyerror(const Location &loc, const std::string &msg)
60+
{
61+
std::string message;
62+
if (msg == "syntax is ambiguous") {
63+
message = "Internal Compiler Error: syntax is ambiguous in the parser";
64+
} else if (msg == "syntax error") {
65+
YYSTYPE yylval_;
66+
YYLTYPE yyloc_;
67+
this->m_tokenizer.cur = this->m_tokenizer.tok;
68+
int token = this->m_tokenizer.lex(this->m_a, yylval_, yyloc_, diag);
69+
if (token == yytokentype::END_OF_FILE) {
70+
message = "End of file is unexpected here";
71+
} else if (token == yytokentype::TK_NEWLINE) {
72+
message = "Newline is unexpected here";
73+
} else {
74+
std::string token_str = this->m_tokenizer.token();
75+
std::string token_type = token2text(token);
76+
if (token_str == token_type) {
77+
message = "Token '" + token_str + "' is unexpected here";
78+
} else {
79+
message = "Token '" + token_str + "' (of type '" + token2text(token) + "') is unexpected here";
80+
}
81+
}
82+
} else {
83+
message = "Internal Compiler Error: parser returned unknown error";
84+
}
85+
throw parser_local::ParserError(message, loc);
86+
}
87+
88+
// Returns true when `name` can be opened for reading, false otherwise.
// A file that exists but cannot be opened is therefore reported as absent.
bool file_exists(const std::string &name) {
    return std::ifstream(name).is_open();
}
95+
96+
std::string unique_filename(const std::string &prefix) {
97+
uint64_t ms = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
98+
ms = ms % 1000000000;
99+
srand((unsigned) ms);
100+
std::string hex = "0123456789ABCDEF";
101+
std::string random_hash;
102+
for (int i=0; i < 6; i++) {
103+
random_hash += hex[rand() % 16];
104+
}
105+
int counter = 1;
106+
std::string filename = prefix + random_hash + std::to_string(counter);
107+
while (file_exists(filename)) {
108+
counter++;
109+
filename = prefix + random_hash + std::to_string(counter);
110+
}
111+
return filename;
112+
}
113+
114+
// Reads `infile` with read_file() and parses its contents with parse().
//
// The first string argument (runtime library directory) is unused, and
// `new_parser` is overwritten with true before use, so its incoming value
// has no effect.
//
// Returns the parsed Module cast to a generic ast_t pointer on success.
// On failure returns an Error; parse() is then expected to have recorded an
// error in `diagnostics`, which is asserted.
Result<LPython::AST::ast_t*> parse_python_file(Allocator &al,
        const std::string &/*runtime_library_dir*/,
        const std::string &infile,
        diag::Diagnostics &diagnostics,
        uint32_t prev_loc,
        [[maybe_unused]] bool new_parser) {
    // We will be using the new parser from now on
    new_parser = true;
    LCOMPILERS_ASSERT(new_parser)
    const std::string source = read_file(infile);
    Result<LPython::AST::Module_t*> parsed = parse(al, source, prev_loc, diagnostics);
    if (!parsed.ok) {
        LCOMPILERS_ASSERT(diagnostics.has_error())
        return Error();
    }
    return (LPython::AST::ast_t*)parsed.result;
}
134+
135+
136+
} // namespace LCompilers::LPython

src/lc/parser/parser.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#ifndef LPYTHON_PARSER_PARSER_H
2+
#define LPYTHON_PARSER_PARSER_H
3+
4+
#include <libasr/containers.h>
5+
#include <libasr/diagnostics.h>
6+
#include <lc/parser/tokenizer.h>
7+
8+
namespace LCompilers::LPython {
9+
10+
class Parser
11+
{
12+
public:
13+
std::string inp;
14+
15+
public:
16+
diag::Diagnostics &diag;
17+
Allocator &m_a;
18+
Tokenizer m_tokenizer;
19+
Vec<LPython::AST::stmt_t*> result;
20+
Vec<LPython::AST::type_ignore_t*> type_ignore;
21+
22+
Parser(Allocator &al, diag::Diagnostics &diagnostics)
23+
: diag{diagnostics}, m_a{al} {
24+
result.reserve(al, 32);
25+
type_ignore.reserve(al, 4);
26+
}
27+
28+
void parse(const std::string &input, uint32_t prev_loc);
29+
void handle_yyerror(const Location &loc, const std::string &msg);
30+
};
31+
32+
33+
// Parses Python code to AST
34+
Result<LPython::AST::Module_t*> parse(Allocator &al,
35+
const std::string &s, uint32_t prev_loc,
36+
diag::Diagnostics &diagnostics);
37+
38+
// Reads the file `infile` and parses its contents, returning the Module as a
// generic AST node, or an Error with the cause recorded in `diagnostics`.
// `runtime_library_dir` and `new_parser` are accepted but not used by the
// current implementation.
Result<LPython::AST::ast_t*> parse_python_file(
    Allocator &al,
    const std::string &runtime_library_dir,
    const std::string &infile,
    diag::Diagnostics &diagnostics,
    uint32_t prev_loc,
    bool new_parser);
43+
44+
} // namespace LCompilers::LPython
45+
46+
#endif

0 commit comments

Comments
 (0)