tan  0.0.1
parser.cpp
1 #include "parser/parser.h"
2 #include "ast/ast_node_type.h"
3 #include "ast/stmt.h"
4 #include "ast/expr.h"
5 #include "ast/decl.h"
6 #include "ast/type.h"
7 #include "ast/context.h"
8 #include "ast/intrinsic.h"
9 #include "source_file/token.h"
10 #include <iostream>
11 
12 using namespace tanlang;
13 using tanlang::TokenType; // distinguish from the one in winnt.h
14 
15 // ========= helper functions' prototypes ========= //
16 
17 static bool check_typename_token(Token *token);
18 static bool check_terminal_token(Token *token);
19 
20 namespace tanlang {
21 
22 using nud_parsing_func_t = void (ParserImpl::*)(ASTBase *);
23 using led_parsing_func_t = void (ParserImpl::*)(ASTBase *, ASTBase *);
24 
25 class ScopeGuard {
26 public:
27  ScopeGuard() = delete;
28 
29  ScopeGuard(ASTBase *&variable, ASTBase *scoped_val) : _variable(variable), _original(variable) {
30  _variable = scoped_val;
31  }
32 
33  ~ScopeGuard() { _variable = _original; }
34 
35 private:
36  ASTBase *&_variable;
37  ASTBase *_original;
38 };
39 
40 class ParserImpl final {
41 public:
42  ParserImpl() = delete;
43  explicit ParserImpl(TokenizedSourceFile *src) : _src(src), _filename(src->get_filename()) {}
44 
45  Program *parse() {
46  _root = Program::Create(_src);
47 
48  ScopeGuard scope_guard(_curr_scope, _root);
49 
50  while (!eof(_curr)) {
51  ASTBase *node = next_expression(PREC_LOWEST);
52  if (!node)
53  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Unexpected terminal token");
54 
55  _root->append_child(node);
56 
57  expect_token(node->terminal_token());
58  ++_curr;
59  }
60 
61  return _root;
62  }
63 
64 private:
65  ASTBase *peek_keyword() {
66  ASTBase *ret = nullptr;
67  Token *token = at(_curr);
68 
69  str tok = token->get_value();
70 
71  if (tok == "pub" || tok == "extern") { // need to look ahead
72  if (tok == "pub") {
73  if (_pub)
74  error(ErrorType::SYNTAX_ERROR, _curr, _curr, fmt::format("Repeated `{}`", tok));
75  _pub = true;
76  }
77  if (tok == "extern") {
78  if (_extern)
79  error(ErrorType::SYNTAX_ERROR, _curr, _curr, fmt::format("Repeated `{}`", tok));
80  _extern = true;
81  }
82 
83  ++_curr;
84  ret = peek();
85 
86  // check if these keywords are used in the right way
87  auto node_type = ret->get_node_type();
88  bool is_func = node_type == ASTNodeType::FUNC_DECL;
89  bool is_struct = node_type == ASTNodeType::STRUCT_DECL;
90  if (tok == "pub" && !is_func && !is_struct)
91  error(ErrorType::SYNTAX_ERROR,
92  _curr,
93  _curr,
94  fmt::format("Cannot use `{}` keyword to qualify {}", tok, ASTBase::ASTTypeNames[node_type]));
95  if (tok == "extern" && !is_func)
96  error(ErrorType::SYNTAX_ERROR,
97  _curr,
98  _curr,
99  fmt::format("Cannot use `{}` keyword to qualify {}", tok, ASTBase::ASTTypeNames[node_type]));
100 
101  _pub = false;
102  _extern = false;
103  }
104 
105  else if (tok == "var")
106  ret = VarDecl::Create(_src);
107  else if (tok == "import")
108  ret = Import::Create(_src);
109  else if (tok == "if") /// else clause should be covered by If statement as well
110  ret = If::Create(_src);
111  else if (tok == "return")
112  ret = Return::Create(_src);
113  else if (tok == "while" || tok == "for")
114  ret = Loop::Create(_src);
115  else if (tok == "fn")
116  ret = FunctionDecl::Create(_src, _extern, _pub);
117  else if (tok == "struct")
118  ret = StructDecl::Create(_src, _extern, _pub);
119  else if (tok == "break")
120  ret = Break::Create(_src);
121  else if (tok == "continue")
122  ret = Continue::Create(_src);
123  else if (tok == "as")
124  ret = Cast::Create(_src);
125  else if (tok == "true")
126  ret = BoolLiteral::Create(_src, true);
127  else if (tok == "false")
128  ret = BoolLiteral::Create(_src, false);
129  else if (tok == "package")
130  ret = PackageDecl::Create(_src);
131 
132  TAN_ASSERT(ret);
133  return ret;
134  }
135 
136  Type *peek_type() {
137  str s = at(_curr)->get_value();
138 
139  if (!std::isalpha(s[0])) {
140  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Invalid type name");
141  }
142 
143  return Type::GetTypeRef(s); // placeholder type
144  }
145 
146  ASTBase *peek() {
147  if (eof(_curr)) {
148  return nullptr;
149  }
150 
151  Token *token = at(_curr);
152 
153  // skip comments
154  while (token && token->get_type() == TokenType::COMMENTS) {
155  ++_curr;
156  if (eof(_curr)) {
157  return nullptr;
158  }
159  token = at(_curr);
160  }
161 
162  TAN_ASSERT(token);
163 
164  ASTBase *node = nullptr;
165  if (token->get_value() == "@") { /// intrinsics
166  node = Intrinsic::Create(_src);
167  } else if (token->get_value() == "=" && token->get_type() == TokenType::BOP) {
168  node = Assignment::Create(_src);
169  } else if (token->get_value() == "!") { /// logical not
170  node = UnaryOperator::Create(UnaryOpKind::LNOT, _src);
171  } else if (token->get_value() == "~") { /// binary not
172  node = UnaryOperator::Create(UnaryOpKind::BNOT, _src);
173  } else if (token->get_value() == "[") {
174  Token *prev_token = at(_curr - 1);
175  if (prev_token->get_type() != TokenType::ID && prev_token->get_value() != "]" && prev_token->get_value() != ")") {
176  /// array literal if there is no identifier, "]", or ")" before
177  node = ArrayLiteral::Create(_src);
178  } else {
179  /// otherwise bracket access
180  node = MemberAccess::Create(_src);
181  }
182  } else if (token->get_type() == TokenType::RELOP) { /// comparisons
183  BinaryOpKind op;
184  str tok = token->get_value();
185  if (tok == ">")
186  op = BinaryOpKind::GT;
187  else if (tok == ">=")
188  op = BinaryOpKind::GE;
189  else if (tok == "<")
190  op = BinaryOpKind::LT;
191  else if (tok == "<=")
192  op = BinaryOpKind::LE;
193  else if (tok == "==")
194  op = BinaryOpKind::EQ;
195  else if (tok == "!=")
196  op = BinaryOpKind::NE;
197  else if (tok == "&&")
198  op = BinaryOpKind::LAND;
199  else if (tok == "||")
200  op = BinaryOpKind::LOR;
201  else
202  error(ErrorType::NOT_IMPLEMENTED,
203  _curr,
204  _curr,
205  fmt::format("Binary relational operator not implemented: {}", token->get_value().c_str()));
206 
207  node = BinaryOperator::Create(op, _src);
208  } else if (token->get_type() == TokenType::INT) {
209  node = IntegerLiteral::Create(_src, (uint64_t)std::stol(token->get_value()), token->is_unsigned());
210  } else if (token->get_type() == TokenType::FLOAT) {
211  node = FloatLiteral::Create(_src, std::stod(token->get_value()));
212  } else if (token->get_type() == TokenType::STRING) { /// string literal
213  node = StringLiteral::Create(_src, token->get_value());
214  } else if (token->get_type() == TokenType::CHAR) { /// char literal
215  node = CharLiteral::Create(_src, static_cast<uint8_t>(token->get_value()[0]));
216  } else if (check_typename_token(token)) { /// should not encounter types if parsed properly
217  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Unexpected type name");
218  } else if (token->get_type() == TokenType::ID) {
219  auto next = _curr;
220  Token *next_token = at(++next);
221  if (next_token->get_value() == "(") {
222  /// identifier followed by a "(" is a function call
223  node = FunctionCall::Create(_src);
224  } else {
225  /// actually an identifier
226  node = Identifier::Create(_src, token->get_value());
227  }
228  } else if (token->get_type() == TokenType::PUNCTUATION && token->get_value() == "(") {
229  node = Parenthesis::Create(_src);
230  } else if (token->get_type() == TokenType::KEYWORD) { /// keywords
231  node = peek_keyword();
232  if (!node) {
233  error(ErrorType::NOT_IMPLEMENTED, _curr, _curr, "Keyword not implemented: " + token->get_value());
234  }
235  } else if (token->get_type() == TokenType::BOP && token->get_value() == ".") { /// member access
236  node = MemberAccess::Create(_src);
237  } else if (token->get_value() == "&") {
238  /// BOP or UOP? ambiguous
239  node = BinaryOrUnary::Create(_src, BinaryOperator::BOPPrecedence[BinaryOpKind::BAND]);
240  } else if (token->get_type() == TokenType::PUNCTUATION && token->get_value() == "{") { /// statement(s)
241  node = CompoundStmt::Create(_src);
242  } else if (token->get_type() == TokenType::BOP) { /// binary operators that haven't been processed yet
243  TAN_ASSERT(token->get_value().length());
244  switch (token->get_value()[0]) {
245  case '/':
246  node = BinaryOperator::Create(BinaryOpKind::DIVIDE, _src);
247  break;
248  case '%':
249  node = BinaryOperator::Create(BinaryOpKind::MOD, _src);
250  break;
251  case '|':
252  node = BinaryOperator::Create(BinaryOpKind::BOR, _src);
253  break;
254  case '^':
255  node = BinaryOperator::Create(BinaryOpKind::XOR, _src);
256  break;
257  /// Operators that are possibly BOP or UOP at this stage
258  /// NOTE: using the precedence of the BOP form so that the parsing works correctly if it's really a BOP
259  case '*':
260  // MULTIPLY / PTR_DEREF
261  node = BinaryOrUnary::Create(_src, BinaryOperator::BOPPrecedence[BinaryOpKind::MULTIPLY]);
262  break;
263  case '+':
264  // SUM / PLUS
265  node = BinaryOrUnary::Create(_src, BinaryOperator::BOPPrecedence[BinaryOpKind::SUM]);
266  break;
267  case '-':
268  // SUBTRACT / MINUS
269  node = BinaryOrUnary::Create(_src, BinaryOperator::BOPPrecedence[BinaryOpKind::SUBTRACT]);
270  break;
271  default:
272  TAN_ASSERT(false);
273  return nullptr;
274  }
275  } else if (check_terminal_token(token)) { /// this MUST be the last thing to check
276  return nullptr;
277  } else {
278  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Unknown token " + token->get_value());
279  }
280 
281  node->set_start(_curr);
282  node->set_end(_curr);
283  return node;
284  }
285 
286  ASTBase *next_expression(int rbp) {
287  ASTBase *node = peek();
288  if (!node) {
289  return nullptr;
290  }
291  auto n = node;
292  parse_node(n);
293  auto left = n;
294  node = peek();
295  if (!node) {
296  return left;
297  }
298  while (rbp < node->get_bp()) {
299  node = peek();
300  n = node;
301  parse_node(left, n);
302  left = n;
303  node = peek();
304  if (!node) {
305  break;
306  }
307  }
308  return left;
309  }
310 
311  /// Parse NUD
312  void parse_node(ASTBase *p) {
313  /// special tokens that require whether p is led or nud to determine the node type
314  if (p->get_node_type() == ASTNodeType::BOP_OR_UOP) {
315  auto *pp = pcast<BinaryOrUnary>(p);
316  UnaryOperator *actual = nullptr;
317  str tok = _src->get_token_str(p->start());
318  TAN_ASSERT(tok.length());
319  switch (tok[0]) {
320  case '*':
321  actual = UnaryOperator::Create(UnaryOpKind::PTR_DEREF, _src);
322  break;
323  case '&':
324  actual = UnaryOperator::Create(UnaryOpKind::ADDRESS_OF, _src);
325  break;
326  case '+':
327  actual = UnaryOperator::Create(UnaryOpKind::PLUS, _src);
328  break;
329  case '-':
330  actual = UnaryOperator::Create(UnaryOpKind::MINUS, _src);
331  break;
332  default:
333  TAN_ASSERT(false);
334  break;
335  }
336  pp->set_uop(actual);
337 
338  parse_node(pp->get_expr_ptr());
339 
340  pp->set_end(pp->get_expr_ptr()->end());
341  return;
342  }
343 
344  // look up parser func from the table
345  auto it = NUD_PARSING_FUNC_TABLE.find(p->get_node_type());
346  if (it == NUD_PARSING_FUNC_TABLE.end()) {
347  error(ErrorType::SYNTAX_ERROR,
348  _curr,
349  _curr,
350  fmt::format("Unexpected token with type: {}", ASTBase::ASTTypeNames[p->get_node_type()]));
351  }
352  nud_parsing_func_t func = it->second;
353  (this->*func)(p);
354  }
355 
356  /// Parse LED
357  void parse_node(ASTBase *left, ASTBase *p) {
358  /// special tokens that require whether p is led or nud to determine the node type
359  if (p->get_node_type() == ASTNodeType::BOP_OR_UOP) {
360  auto *pp = pcast<BinaryOrUnary>(p);
361  BinaryOperator *actual = nullptr;
362  str tok = _src->get_token_str(p->start());
363  TAN_ASSERT(tok.length());
364  switch (tok[0]) {
365  case '*':
366  actual = BinaryOperator::Create(BinaryOpKind::MULTIPLY, _src);
367  break;
368  case '&':
369  actual = BinaryOperator::Create(BinaryOpKind::BAND, _src);
370  break;
371  case '+':
372  actual = BinaryOperator::Create(BinaryOpKind::SUM, _src);
373  break;
374  case '-':
375  actual = BinaryOperator::Create(BinaryOpKind::SUBTRACT, _src);
376  break;
377  default:
378  TAN_ASSERT(false);
379  break;
380  }
381  pp->set_bop(actual);
382  parse_node(left, pp->get_expr_ptr());
383 
384  pp->set_start(pp->get_expr_ptr()->start());
385  pp->set_end(pp->get_expr_ptr()->end());
386  return;
387  }
388 
389  // look up parser func from the table
390  auto it = LED_PARSING_FUNC_TABLE.find(p->get_node_type());
391  if (it == LED_PARSING_FUNC_TABLE.end()) {
392  error(ErrorType::SYNTAX_ERROR,
393  _curr,
394  _curr,
395  fmt::format("Unexpected token with type: {}", ASTBase::ASTTypeNames[p->get_node_type()]));
396  }
397  led_parsing_func_t func = it->second;
398  (this->*func)(left, p);
399  }
400 
401 private:
402  [[nodiscard]] Token *at(uint32_t loc) const {
403  if (this->eof(loc)) {
404  Error(ErrorType::SYNTAX_ERROR, _src->get_last_token(), _src->get_last_token(), "Unexpected EOF").raise();
405  }
406  return _src->get_token(loc);
407  }
408 
409  [[nodiscard]] bool eof(uint32_t loc) const { return _src->is_eof(loc); }
410 
411  [[noreturn]] void error(ErrorType type, ASTBase *node, const str &error_message) const {
412  Error(type, at(node->start()), at(node->end()), error_message).raise();
413  }
414 
415  [[noreturn]] void error(ErrorType type, uint32_t start, uint32_t end, const str &error_message) const {
416  Error(type, at(start), at(end), error_message).raise();
417  }
418 
419  Expr *expect_expression(ASTBase *p) {
420  TAN_ASSERT(p);
421  if (!p->is_expr())
422  error(ErrorType::SYNTAX_ERROR, p, "Expect an expression");
423 
424  return pcast<Expr>(p);
425  }
426 
427  Stmt *expect_stmt(ASTBase *p) {
428  TAN_ASSERT(p);
429  if (!p->is_stmt())
430  error(ErrorType::SYNTAX_ERROR, p, "Expect a statement");
431 
432  return pcast<Stmt>(p);
433  }
434 
435  void expect_token(const str &value) {
436  Token *token = at(_curr);
437  if (token->get_value() != value) {
438  Error(ErrorType::SYNTAX_ERROR,
439  token,
440  token,
441  fmt::format("Expect '{}' but got '{}' instead", value, token->get_value()))
442  .raise();
443  }
444  }
445 
446 private:
447  void parse_compound_stmt(ASTBase *_p) {
448  auto p = pcast<CompoundStmt>(_p);
449  ScopeGuard scope_guard(_curr_scope, p);
450 
451  ++_curr; // skip "{"
452 
453  ASTBase *node = nullptr;
454  while ((node = next_expression(PREC_LOWEST)) != nullptr) {
455  p->append_child(node);
456 
457  str s = at(_curr)->get_value();
458  expect_token(node->terminal_token());
459  ++_curr;
460  }
461 
462  expect_token("}");
463  p->set_end(_curr);
464  }
465 
466  void parse_assignment(ASTBase *left, ASTBase *_p) {
467  auto p = pcast<Assignment>(_p);
468 
469  ++_curr; // skip =
470 
471  /// lhs
472  p->set_lhs(left);
473 
474  /// rhs
475  auto rhs = next_expression(PREC_LOWEST);
476  p->set_rhs(expect_expression(rhs));
477  p->set_end(_curr - 1);
478  }
479 
480  void parse_cast(ASTBase *left, ASTBase *_p) {
481  auto lhs = pcast<Expr>(left);
482  auto p = pcast<Cast>(_p);
483 
484  ++_curr; // skip as
485 
486  /// lhs
487  p->set_lhs(lhs);
488 
489  /// rhs
490  auto *ty = peek_type();
491  p->set_type(parse_ty(ty));
492 
493  p->set_end(_curr - 1);
494  }
495 
496  void parse_generic_token(ASTBase *p) { p->set_end(_curr++); }
497 
498  void parse_if(ASTBase *_p) {
499  auto p = pcast<If>(_p);
500 
501  p->set_end(_curr); // the end of token of If AST ends here
502 
503  /// if then
504  parse_if_then_branch(p);
505  ++_curr; // skip "}"
506 
507  /// else or elif clause, if any
508  while (at(_curr)->get_value() == "else") {
509  ++_curr; // skip "else"
510  if (at(_curr)->get_value() == "if") { /// elif
511  parse_if_then_branch(p);
512  } else if (at(_curr)->get_value() == "{") { /// else
513  auto else_clause = peek();
514  parse_node(else_clause);
515  p->add_else_branch(expect_stmt(else_clause));
516  } else {
517  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Unexpected token");
518  }
519 
520  ++_curr;
521  }
522 
523  --_curr; /// return back to "}"
524  TAN_ASSERT(at(_curr)->get_value() == "}");
525  }
526 
527  void parse_if_then_branch(If *p) {
528  ++_curr; // skip "if"
529 
530  // predicate
531  expect_token("(");
532  auto *_pred = peek();
533  if (_pred->get_node_type() != ASTNodeType::PARENTHESIS) {
534  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect a parenthesis expression");
535  }
536  parse_parenthesis(_pred);
537  Expr *pred = pcast<Expr>(_pred);
538 
539  // then clause
540  expect_token("{");
541  auto *_then = peek();
542  parse_node(_then);
543  Stmt *then_clause = expect_stmt(_then);
544 
545  p->add_if_then_branch(pred, then_clause);
546  }
547 
548  void parse_loop(ASTBase *_p) {
549  auto p = pcast<Loop>(_p);
550 
551  // determine kind
552  if (at(_curr)->get_value() == "for") {
553  p->_loop_type = ASTLoopType::FOR;
554  } else if (at(_curr)->get_value() == "while") {
555  p->_loop_type = ASTLoopType::WHILE;
556  } else {
557  TAN_ASSERT(false);
558  }
559 
560  // actual parsing
561  p->set_end(_curr);
562  ++_curr; // skip while/for
563 
564  switch (p->_loop_type) {
565  case ASTLoopType::WHILE: {
566  // predicate
567  expect_token("(");
568  auto _pred = next_expression(PREC_LOWEST);
569  Expr *pred = expect_expression(_pred);
570  p->_predicate = pred;
571  break;
572  }
573  case ASTLoopType::FOR:
574  expect_token("(");
575  ++_curr;
576 
577  // initialization
578  auto *tmp = next_expression(PREC_LOWEST);
579  Expr *init = expect_expression(tmp);
580  p->_initialization = init;
581 
582  expect_token(";");
583  ++_curr;
584 
585  // predicate
586  tmp = next_expression(PREC_LOWEST);
587  Expr *pred = expect_expression(tmp);
588  p->_predicate = pred;
589 
590  expect_token(";");
591  ++_curr;
592 
593  // iteration
594  tmp = next_expression(PREC_LOWEST);
595  Expr *iteration = expect_expression(tmp);
596  p->_iteration = iteration;
597 
598  expect_token(")");
599  ++_curr;
600 
601  break;
602  }
603 
604  // body
605  expect_token("{");
606  auto *_body = next_expression(PREC_LOWEST);
607  Stmt *body = expect_stmt(_body);
608  p->_body = body;
609  }
610 
611  void parse_array_literal(ASTBase *_p) {
612  auto *p = pcast<ArrayLiteral>(_p);
613 
614  ++_curr; // skip '['
615 
616  if (at(_curr)->get_value() == "]") {
617  // TODO: support empty array literal, but raise error if the type cannot be inferred
618  error(ErrorType::SEMANTIC_ERROR, p->start(), _curr, "Empty array literal");
619  }
620 
621  vector<Literal *> elements{};
622  while (!eof(_curr)) {
623  if (at(_curr)->get_value() == ",") { /// skip ","
624  ++_curr;
625  continue;
626  } else if (at(_curr)->get_value() == "]") { /// skip "]"
627  ++_curr;
628  break;
629  }
630 
631  auto *node = peek();
632  auto *expr = expect_expression(node);
633  if (!expr->is_comptime_known()) {
634  error(ErrorType::SEMANTIC_ERROR, _curr, _curr, "Expected a compile-time known value");
635  }
636 
637  parse_node(node);
638  elements.push_back(pcast<Literal>(node));
639  }
640 
641  p->set_elements(elements);
642  p->set_end(_curr - 1);
643  }
644 
645  void parse_bop(ASTBase *_lhs, ASTBase *_p) {
646  Expr *lhs = pcast<Expr>(_lhs);
647 
648  Token *token = at(_p->start());
649  if (token->get_value() == "." || token->get_value() == "[") { /// delegate to parse_member_access
650  parse_member_access(lhs, pcast<MemberAccess>(_p));
651  return;
652  }
653 
654  auto *p = pcast<BinaryOperator>(_p);
655 
656  ++_curr; // skip the operator
657 
658  p->set_lhs(lhs);
659 
660  auto rhs = next_expression(p->get_bp());
661  p->set_rhs(expect_expression(rhs));
662 
663  p->set_start(lhs->start());
664  p->set_end(_curr - 1);
665  }
666 
667  void parse_uop(ASTBase *_p) {
668  auto *p = pcast<UnaryOperator>(_p);
669 
670  ++_curr;
671  auto rhs = pcast<Expr>(next_expression(p->get_bp()));
672  if (!rhs) {
673  error(ErrorType::SEMANTIC_ERROR, rhs, "Invalid operand");
674  }
675  p->set_rhs(rhs);
676  p->set_end(_curr - 1);
677  }
678 
679  void parse_parenthesis(ASTBase *_p) {
680  auto *p = pcast<Parenthesis>(_p);
681 
682  ++_curr; // skip "("
683  while (true) {
684  auto *t = at(_curr);
685  if (t->get_type() == TokenType::PUNCTUATION && t->get_value() == ")") {
686  ++_curr;
687  break;
688  }
689  // _curr points to whatever after ')'
690 
691  // NOTE: parenthesis without child expression inside are illegal
692  // (except function call, which is parsed elsewhere)
693  auto _sub = next_expression(PREC_LOWEST);
694  Expr *sub = expect_expression(_sub);
695  p->set_sub(sub);
696  }
697 
698  p->set_end(_curr - 1);
699  }
700 
701  void parse_func_decl(ASTBase *_p) {
702  auto *p = pcast<FunctionDecl>(_p);
703 
704  ++_curr; // skip "fn"
705 
706  // Get function name
707  // Don't use peek since it look ahead and returns ASTNodeType::FUNCTION when it finds "(",
708  // but we only want the function name as an identifier
709  Token *id_token = at(_curr);
710  auto id = Identifier::Create(_src, id_token->get_value());
711  id->set_start(_curr);
712  id->set_end(_curr);
713  if (id->get_node_type() != ASTNodeType::ID) {
714  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect a function name");
715  }
716  parse_node(id);
717  p->set_name(id->get_name());
718 
719  expect_token("(");
720  ++_curr;
721 
722  { // declarations of func arguments and variables in the body are within the local scope
723  ScopeGuard scope_guard(_curr_scope, p);
724 
725  /// arguments
726  vector<str> arg_names{};
727  vector<Type *> arg_types{};
728  vector<ArgDecl *> arg_decls{};
729  if (at(_curr)->get_value() != ")") {
730  while (!eof(_curr)) {
731  auto arg = ArgDecl::Create(_src);
732  arg->set_start(_curr);
733  parse_node(arg);
734 
735  arg_names.push_back(arg->get_name());
736  arg_types.push_back(arg->get_type());
737  arg_decls.push_back(arg);
738 
739  if (at(_curr)->get_value() == ",") {
740  ++_curr;
741  } else {
742  break;
743  }
744  }
745  }
746 
747  expect_token(")");
748  ++_curr;
749 
750  p->set_arg_names(arg_names);
751  p->set_arg_decls(arg_decls);
752 
753  expect_token(":");
754  ++_curr;
755 
756  /// function type
757  auto *ret_type = peek_type();
758  auto *func_type = Type::GetFunctionType(parse_ty(ret_type), arg_types);
759  p->set_type(func_type);
760 
761  p->set_end(_curr - 1);
762 
763  /// body
764  if (!p->is_external()) {
765  expect_token("{");
766  auto body = peek();
767  parse_node(body);
768  p->set_body(expect_stmt(body));
769  }
770  }
771  }
772 
773  void parse_func_call(ASTBase *_p) {
774  auto *p = pcast<FunctionCall>(_p);
775 
776  p->set_name(at(_curr)->get_value()); // function name
777  ++_curr;
778 
779  // No need to check since '(' is what distinguish a function call from an identifier at the first place
780  // auto *token = at(_curr); if (token->get_value() != "(") { error("Invalid function call"); }
781  ++_curr; // skip (
782 
783  // args
784  while (!eof(_curr) && at(_curr)->get_value() != ")") {
785  auto _arg = next_expression(PREC_LOWEST);
786  Expr *arg = expect_expression(_arg);
787  p->_args.push_back(arg);
788 
789  if (at(_curr)->get_value() == ",") { /// skip ,
790  ++_curr;
791  } else {
792  break;
793  }
794  }
795 
796  expect_token(")");
797  p->set_end(_curr);
798  ++_curr;
799  }
800 
801  // assuming _curr is at the token after "@"
802  void parse_test_comp_error_intrinsic(Intrinsic *p) {
803  ++_curr; // skip "test_comp_error"
804 
805  auto *e = peek();
806  if (e->get_node_type() != ASTNodeType::PARENTHESIS) {
807  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect a parenthesis");
808  }
809 
810  parse_node(e);
811  auto *test_name = pcast<Parenthesis>(e)->get_sub();
812 
813  p->set_end(_curr - 1);
814 
815  if (test_name->get_node_type() != ASTNodeType::ID) {
816  error(ErrorType::SEMANTIC_ERROR, test_name, "Expect a test name");
817  }
818 
819  p->set_name("test_comp_error");
820  p->set_intrinsic_type(IntrinsicType::TEST_COMP_ERROR);
821 
822  // parse body
823  auto *body = peek();
824  if (body->get_node_type() != ASTNodeType::COMPOUND_STATEMENT) {
825  error(ErrorType::SYNTAX_ERROR, body, "Expect a compound statement");
826  }
827 
828  // we need to check if there's a parsing error even before the TestCompError instance is fully initialized
829  bool caught = false;
830  auto *sub = new TestCompError(_src);
831  try {
832  parse_node(body);
833  } catch (const CompileException &e) {
834  caught = true;
835  std::cerr << fmt::format("Caught expected compile error: {}\nContinue compilation...\n", e.what());
836 
837  while (at(_curr)->get_value() != "}") // will throw if EOF
838  ++_curr;
839  }
840  sub->_caught = caught;
841 
842  if (!caught) {
843  for (auto *c : body->get_children()) {
844  sub->append_child(c);
845  }
846  }
847 
848  p->set_sub(sub);
849  }
850 
851  void parse_intrinsic(ASTBase *_p) {
852  auto *p = pcast<Intrinsic>(_p);
853 
854  ++_curr; // skip "@"
855 
856  if (_src->get_token_str(_curr) == Intrinsic::TEST_COMP_ERROR_NAME) {
857  parse_test_comp_error_intrinsic(p);
858  return;
859  }
860 
861  auto e = peek();
862  parse_node(e);
863  // Only allow identifier or function call as the valid intrinsic token
864  if (e->get_node_type() != ASTNodeType::ID && e->get_node_type() != ASTNodeType::FUNC_CALL) {
865  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Unexpected token");
866  }
867  p->set_sub(e);
868 
869  // find out the name
870  str name;
871  switch (e->get_node_type()) {
872  case ASTNodeType::FUNC_CALL:
873  name = pcast<FunctionCall>(e)->get_name();
874  break;
875  case ASTNodeType::ID:
876  name = pcast<Identifier>(e)->get_name();
877  break;
878  default:
879  TAN_ASSERT(false);
880  break;
881  }
882  TAN_ASSERT(!name.empty());
883 
884  // figure out which intrinsic
885  p->set_name(name);
886  auto q = Intrinsic::INTRINSIC_NAME_TO_TYPES.find(name);
887  if (q == Intrinsic::INTRINSIC_NAME_TO_TYPES.end()) {
888  error(ErrorType::UNKNOWN_SYMBOL, _curr, _curr, fmt::format("Unknown intrinsic {}", name));
889  }
890  p->set_intrinsic_type(q->second);
891 
892  // check if the syntax is correct
893  switch (p->get_intrinsic_type()) {
894  case IntrinsicType::LINENO:
895  case IntrinsicType::FILENAME:
896  if (e->get_node_type() != ASTNodeType::ID)
897  error(ErrorType::SEMANTIC_ERROR, _curr, _curr, "Invalid usage of intrinsic");
898  break;
899  case IntrinsicType::NOOP:
900  case IntrinsicType::STACK_TRACE:
901  case IntrinsicType::TEST_COMP_ERROR:
902  if (e->get_node_type() != ASTNodeType::FUNC_CALL)
903  error(ErrorType::SEMANTIC_ERROR, _curr, _curr, "Invalid usage of intrinsic");
904  break;
905  case IntrinsicType::INVALID:
906  TAN_ASSERT(false);
907  break;
908  default:
909  break;
910  }
911 
912  p->set_end(_curr - 1);
913  }
914 
915  void parse_import(ASTBase *_p) {
916  auto *p = pcast<Import>(_p);
917 
918  ++_curr; // skip "import"
919  p->set_end(_curr);
920 
921  auto rhs = peek();
922  if (rhs->get_node_type() != ASTNodeType::STRING_LITERAL) {
923  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Invalid import statement");
924  }
925  parse_node(rhs);
926  str name = pcast<StringLiteral>(rhs)->get_value();
927  p->set_name(name);
928  }
929 
930  void parse_package_stmt(ASTBase *_p) {
931  auto *p = pcast<PackageDecl>(_p);
932  ++_curr;
933 
934  auto rhs = peek();
935  if (rhs->get_node_type() != ASTNodeType::STRING_LITERAL) {
936  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Invalid package statement");
937  }
938  parse_node(rhs);
939  str name = pcast<StringLiteral>(rhs)->get_value();
940 
941  p->set_name(name);
942 
943  p->set_end(_curr - 1);
944  }
945 
946  void parse_member_access(Expr *left, MemberAccess *p) {
947  if (at(_curr)->get_value() == "[") {
948  p->_access_type = MemberAccess::MemberAccessBracket;
949  }
950 
951  ++_curr; // skip "." or "["
952 
953  // lhs
954  p->set_lhs(left);
955 
956  // rhs
957  auto _right = peek();
958  Expr *right = expect_expression(_right);
959  parse_node(right);
960  p->set_rhs(right);
961 
962  if (p->_access_type == MemberAccess::MemberAccessBracket) { // bracket access
963  ++_curr; // skip ]
964  }
965 
966  p->set_end(_curr - 1);
967  }
968 
969  void parse_return(ASTBase *_p) {
970  auto *p = pcast<Return>(_p);
971 
972  ++_curr; // skip "return"
973 
974  auto _rhs = next_expression(PREC_LOWEST);
975  if (_rhs) {
976  Expr *rhs = expect_expression(_rhs);
977  p->set_rhs(rhs);
978  }
979 
980  p->set_end(_curr - 1);
981  }
982 
983  void parse_struct_decl(ASTBase *_p) {
984  auto *p = pcast<StructDecl>(_p);
985 
986  ++_curr; // skip "struct"
987 
988  // struct typename
989  auto _id = peek();
990  if (_id->get_node_type() != ASTNodeType::ID) {
991  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expecting a typename");
992  }
993  parse_node(_id);
994  auto id = pcast<Identifier>(_id);
995  p->set_name(id->get_name());
996 
997  p->set_end(_curr - 1);
998 
999  // struct body
1000  if (at(_curr)->get_value() != "{") {
1001  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect struct body");
1002  }
1003 
1004  ScopeGuard scope_guard(_curr_scope, p);
1005 
1006  auto _comp_stmt = next_expression(PREC_LOWEST);
1007  if (!_comp_stmt || _comp_stmt->get_node_type() != ASTNodeType::COMPOUND_STATEMENT) {
1008  error(ErrorType::SEMANTIC_ERROR, _curr, _curr, "struct definition requires a valid body");
1009  }
1010  auto comp_stmt = pcast<CompoundStmt>(_comp_stmt);
1011 
1012  // copy member declarations
1013  auto children = comp_stmt->get_children();
1014  vector<Expr *> member_decls{};
1015  for (const auto &c : children) {
1016  if (!( //
1017  c->get_node_type() == ASTNodeType::VAR_DECL //
1018  || c->get_node_type() == ASTNodeType::ASSIGN //
1019  || c->get_node_type() == ASTNodeType::FUNC_DECL //
1020  )) {
1021  error(ErrorType::SEMANTIC_ERROR, c, "Invalid struct member");
1022  }
1023  member_decls.push_back(pcast<Expr>(c));
1024  }
1025  p->set_member_decls(member_decls);
1026  }
1027 
1028  ArrayType *parse_ty_array(Type *p) {
1029  ArrayType *ret = nullptr;
1030  while (true) {
1031  ++_curr; // skip "["
1032 
1033  // size
1034  ASTBase *_size = peek();
1035  if (_size->get_node_type() != ASTNodeType::INTEGER_LITERAL) {
1036  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect an unsigned integer as the array size");
1037  }
1038  parse_node(_size);
1039 
1040  auto size = pcast<IntegerLiteral>(_size);
1041  size_t array_size = size->get_value();
1042  if (static_cast<int64_t>(array_size) < 0) {
1043  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect an unsigned integer as the array size");
1044  }
1045 
1046  ret = Type::GetArrayType(p, (int)array_size);
1047 
1048  // skip "]"
1049  expect_token("]");
1050  ++_curr;
1051 
1052  // if followed by a "[", this is a multi-dimension array
1053  if (at(_curr)->get_value() == "[") {
1054  p = ret;
1055  ret = nullptr;
1056  } else {
1057  break;
1058  }
1059  }
1060 
1061  TAN_ASSERT(ret);
1062  return ret;
1063  }
1064 
1065  Type *parse_ty(Type *p) {
1066  Type *ret = p;
1067 
1068  while (!eof(_curr)) {
1069  Token *token = at(_curr);
1070 
1071  auto it = PrimitiveType::TYPENAME_TO_KIND.find(token->get_value());
1072  if (it != PrimitiveType::TYPENAME_TO_KIND.end()) { // primitive
1073  ret = PrimitiveType::Create(it->second);
1074  } else if (token->get_value() == "*") { // pointer
1075  TAN_ASSERT(ret);
1076  ret = Type::GetPointerType(ret);
1077  } else if (token->get_value() == "str") {
1078  ret = Type::GetStringType();
1079  } else if (token->get_type() == TokenType::ID) { // struct/typedefs etc.
1080  /// type referred will be resolved in analysis phase
1081  } else {
1082  break;
1083  }
1084  ++_curr;
1085  }
1086 
1087  // array
1088  Token *token = at(_curr);
1089  if (token->get_value() == "[") {
1090  TAN_ASSERT(ret);
1091  ret = parse_ty_array(ret);
1092  }
1093 
1094  return ret;
1095  }
1096 
1097  void parse_var_decl(ASTBase *_p) {
1098  auto *p = pcast<VarDecl>(_p);
1099 
1100  ++_curr; // skip 'var'
1101 
1102  // name
1103  auto name_token = at(_curr);
1104  p->set_name(name_token->get_value());
1105  ++_curr;
1106 
1107  // type
1108  if (at(_curr)->get_value() == ":") {
1109  ++_curr;
1110  Type *ty = peek_type();
1111  p->set_type(parse_ty(ty));
1112  }
1113 
1114  p->set_end(_curr - 1);
1115  }
1116 
1117  void parse_arg_decl(ASTBase *_p) {
1118  auto *p = pcast<ArgDecl>(_p);
1119 
1120  // name
1121  auto name_token = at(_curr);
1122  p->set_name(name_token->get_value());
1123  ++_curr;
1124 
1125  if (at(_curr)->get_value() != ":") {
1126  error(ErrorType::SYNTAX_ERROR, _curr, _curr, "Expect a type being specified");
1127  }
1128  ++_curr;
1129 
1130  // type
1131  Type *ty = peek_type();
1132  p->set_type(parse_ty(ty));
1133 
1134  p->set_end(_curr - 1);
1135  }
1136 
1137 private:
1138  TokenizedSourceFile *_src = nullptr;
1139  uint32_t _curr = 0;
1140  str _filename;
1141  Program *_root = nullptr;
1142  ASTBase *_curr_scope = nullptr;
1143 
1144  bool _pub = false; /// is processing pub keyword
1145  bool _extern = false; /// is processing extern keyword
1146 
1147 private:
1148  const static umap<ASTNodeType, nud_parsing_func_t> NUD_PARSING_FUNC_TABLE;
1149  const static umap<ASTNodeType, led_parsing_func_t> LED_PARSING_FUNC_TABLE;
1150 };
1151 
1152 Parser::Parser(TokenizedSourceFile *src) { _impl = new ParserImpl(src); }
1153 
1154 Program *Parser::parse() { return _impl->parse(); }
1155 
1156 Parser::~Parser() { delete _impl; }
1157 
1158 const umap<ASTNodeType, nud_parsing_func_t> ParserImpl::NUD_PARSING_FUNC_TABLE = {
1159  {ASTNodeType::COMPOUND_STATEMENT, &ParserImpl::parse_compound_stmt},
1160  {ASTNodeType::PARENTHESIS, &ParserImpl::parse_parenthesis },
1161  {ASTNodeType::IMPORT, &ParserImpl::parse_import },
1162  {ASTNodeType::INTRINSIC, &ParserImpl::parse_intrinsic },
1163  {ASTNodeType::IF, &ParserImpl::parse_if },
1164  {ASTNodeType::LOOP, &ParserImpl::parse_loop },
1165  {ASTNodeType::UOP, &ParserImpl::parse_uop },
1166  {ASTNodeType::RET, &ParserImpl::parse_return },
1167  {ASTNodeType::FUNC_CALL, &ParserImpl::parse_func_call },
1168  {ASTNodeType::ARRAY_LITERAL, &ParserImpl::parse_array_literal},
1169  {ASTNodeType::STRUCT_DECL, &ParserImpl::parse_struct_decl },
1170  {ASTNodeType::VAR_DECL, &ParserImpl::parse_var_decl },
1171  {ASTNodeType::ARG_DECL, &ParserImpl::parse_arg_decl },
1172  {ASTNodeType::FUNC_DECL, &ParserImpl::parse_func_decl },
1173  {ASTNodeType::BREAK, &ParserImpl::parse_generic_token},
1174  {ASTNodeType::CONTINUE, &ParserImpl::parse_generic_token},
1175  {ASTNodeType::ID, &ParserImpl::parse_generic_token},
1176  {ASTNodeType::INTEGER_LITERAL, &ParserImpl::parse_generic_token},
1177  {ASTNodeType::FLOAT_LITERAL, &ParserImpl::parse_generic_token},
1178  {ASTNodeType::CHAR_LITERAL, &ParserImpl::parse_generic_token},
1179  {ASTNodeType::STRING_LITERAL, &ParserImpl::parse_generic_token},
1180  {ASTNodeType::BOOL_LITERAL, &ParserImpl::parse_generic_token},
1181  {ASTNodeType::PACKAGE_DECL, &ParserImpl::parse_package_stmt },
1182 };
1183 } // namespace tanlang
1184 
1185 const umap<ASTNodeType, led_parsing_func_t> ParserImpl::LED_PARSING_FUNC_TABLE = {
1186  {ASTNodeType::BOP, &ParserImpl::parse_bop },
1187  {ASTNodeType::ASSIGN, &ParserImpl::parse_assignment},
1188  {ASTNodeType::CAST, &ParserImpl::parse_cast }
1189 };
1190 
1191 // implementations of helper functions
1192 
1193 static bool check_typename_token(Token *token) {
1194  return is_string_in(token->get_value(), tanlang::Type::ALL_TYPE_NAMES);
1195 }
1196 
1197 static bool check_terminal_token(Token *token) {
1198  // return token->get_type() == TokenType::PUNCTUATION && is_string_in(token->get_value(), TERMINAL_TOKENS);
1199  return is_string_in(token->get_value(), TERMINAL_TOKENS);
1200 }
static umap< ASTNodeType, str > ASTTypeNames
string representation of ASTNodeType
Definition: ast_base.h:16
virtual str terminal_token() const
Which terminal token is expected immediately after this node.
Definition: ast_base.h:45
static umap< BinaryOpKind, int > BOPPrecedence
binary operator precedence
Definition: expr.h:209
static CompoundStmt * Create(TokenizedSourceFile *src)
Definition: stmt.cpp:14
static FunctionCall * Create(TokenizedSourceFile *src)
Definition: expr.cpp:253
Represent if-[else] or if-elif-[else] statements.
Definition: stmt.h:144
static Import * Create(TokenizedSourceFile *src)
Definition: stmt.cpp:53
A generic representation of Intrinsic variables/functions.
Definition: intrinsic.h:55
static umap< str, IntrinsicType > INTRINSIC_NAME_TO_TYPES
A mapping from intrinsic names to IntrinsicType.
Definition: intrinsic.h:65
static Loop * Create(TokenizedSourceFile *src)
Definition: stmt.cpp:81
static MemberAccess * Create(TokenizedSourceFile *src)
Definition: expr.cpp:240
static Parenthesis * Create(TokenizedSourceFile *src)
Definition: expr.cpp:223
static Program * Create(TokenizedSourceFile *src)
Definition: stmt.cpp:35
Different from SourceFile, TokenizedSourceFile manages the tokenized text of a source file.
Type is immutable once created. The exception is StructType. Its information is updated in multiple s...
Definition: type.h:22