tan  0.0.1
lexer.cpp
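/// Hand-written lexer for the tan language: converts the contents of a SourceFile
/// into a flat list of heap-allocated Token objects (see tokenize() below).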
#include "lexer/lexer.h"
#include "base.h"
#include "source_file/token.h"
#include "source_file/source_file.h"
#include <algorithm>
#include <cctype>
#include <regex>

namespace tanlang {

#define IS_DELIMITER(x)                                                                                      \
  (x == ';' || std::isspace(x) || x == ',' || x == '.' || x == '!' || x == '@' || x == '#' || x == '$' ||    \
   x == '%' || x == '^' || x == '&' || x == '*' || x == '(' || x == ')' || x == '-' || x == '+' ||           \
   x == '=' || x == '<' || x == '>' || x == '/' || x == '?' || x == '\\' || x == '|' || x == '{' ||          \
   x == '}' || x == '[' || x == ']' || x == '\'' || x == '"' || x == ':')

[[noreturn]] static void report_error(SrcLoc c, const str &message) {
  Error(ErrorType::SYNTAX_ERROR, SourceSpan(c, c), message).raise();
}

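/// Skip whitespace (and embedded NUL) characters starting from @ptr, returning the
/// first location that is neither, or the end of the source file.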
SrcLoc skip_whitespace(SourceFile *src, SrcLoc ptr) {
  const auto end = src->end();
  while (ptr < end && (std::isspace(*ptr) || *ptr == '\0')) {
    ++ptr;
  }
  return ptr;
}

/**
 * \note For all tokenize_xx functions, @start is at least one token before the end.
 * \note tokenize_keyword must be called before tokenize_id.
 */
Token *tokenize_id(SourceFile *src, SrcLoc &start) {
  Token *ret = nullptr;
  auto forward = start;
  const auto end = src->end();
  while (forward != end) {
    if (std::isalnum(*forward) || *forward == '_') {
      ++forward;
    } else if (start == forward) {
      return nullptr;
    } else {
      ret = new Token(TokenType::ID, start.l, start.c, src->substr(start, forward), src);
      break;
    }
  }
  start = forward;
  return ret;
}

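/// Tokenize a keyword. Reuses tokenize_id() and only accepts the result if the lexeme
/// appears in KEYWORDS; otherwise no token is produced and @start is left untouched.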
Token *tokenize_keyword(SourceFile *src, SrcLoc &start) {
  // check whether the lexeme returned by tokenize_id() is in KEYWORDS (see token.h/token.cpp)
  SrcLoc forward = start;
  auto *t = tokenize_id(src, forward);
  if (t) {
    if (std::find(KEYWORDS.begin(), KEYWORDS.end(), t->get_value()) != KEYWORDS.end()) {
      t->set_type(TokenType::KEYWORD);
      start = forward;
    } else {
      delete t;
      t = nullptr;
    }
  }
  return t;
}

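/// Tokenize a line comment ("// ...") or a block comment ("/* ... */").
/// @start must point at the leading '/'; on success it is advanced past the comment.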
Token *tokenize_comments(SourceFile *src, SrcLoc &start) {
  Token *t = nullptr;
  auto next = src->forward(start);
  if (*next == '/') { /// line comments
    auto value = src->substr(src->forward(next));
    t = new Token(TokenType::COMMENTS, start.l, start.c, value, src);
    start.c = (uint32_t)src->get_line(start.l).length();
    ++start;
  } else if (*next == '*') { /// block comments
    auto forward = start = src->forward(next); /// forward now points to the character after "/*"

    /// trying to find "*/"
    auto re = std::regex(R"(\*\/)");
    while ((size_t)forward.l < src->size()) {
      auto s = src->get_line(forward.l);
      std::smatch result;
      if (std::regex_search(s, result, re)) {
        forward.c = (uint32_t)result.position(0); // forward is the position of "*/"
        str comment_val = src->substr(start, forward);
        t = new Token(TokenType::COMMENTS, start.l, start.c, comment_val, src);
        forward.c += 2;
        start = forward;
        break;
      }
      ++forward.l;
      forward.c = 0;
    }
    if (!t) {
      report_error(start, "Invalid comments");
    }
  } else {
    report_error(start, "Invalid comments");
  }
  return t;
}

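/// Tokenize an integer or floating-point literal. Accepts decimal digits, 0x/0X hex
/// digits when the literal starts with '0', a 'u' suffix for unsigned integers, and a
/// '.' for floats; stops at the first delimiter.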
// we don't check here whether the number literal is actually well-formed
Token *tokenize_number(SourceFile *src, SrcLoc &start) {
  auto forward = start;
  const auto end = src->end();
  bool is_float = false;
  bool is_unsigned = false;
  bool contains_digit = false;
  auto start_digit_i = start;
  while (forward < end) {
    const char ch = *forward;

    if (std::isdigit(ch)) {
      contains_digit = true;

    } else if (*start_digit_i == '0' &&
               ((ch <= 'F' && ch >= 'A') || (ch <= 'f' && ch >= 'a') || ch == 'x' || ch == 'X')) {
      // hex digits are accepted only when the literal starts with '0' (as in 0xFF)

    } else if (contains_digit && !is_float && ch == 'u') { // explicitly unsigned
      is_unsigned = true;

    } else if (contains_digit && ch == '.') {
      is_float = true;

    } else if (IS_DELIMITER(ch)) {
      break;

    } else {
      report_error(forward, "Unexpected character within a number literal");
    }

    ++forward;
  }

  auto *t =
      new Token(is_float ? TokenType::FLOAT : TokenType::INT, start.l, start.c, src->substr(start, forward), src);
  t->set_is_unsigned(is_unsigned);
  start = forward;
  return t;
}

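/// Map the character following a backslash to the escape it denotes (e.g. 'n' -> '\n').
/// Returns -1 if @c does not name a known escape sequence.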
char escape_char(char c) {
  /// https://en.cppreference.com/w/cpp/language/escape
  switch (c) {
  case '\'':
    return '\'';
  case '\"':
    return '\"';
  case '\\':
    return '\\';
  case '?':
    return '\?';
  case 'a':
    return '\a';
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  default:
    return -1;
  }
}

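/// Tokenize a character literal. @start must point at the opening single quote; the
/// stored value is the character itself (with any escape sequence resolved), without
/// the surrounding quotes.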
Token *tokenize_char(SourceFile *src, SrcLoc &start) {
  Token *t = nullptr;
  auto forward = src->forward(start);
  const auto end = src->end();

  while (forward < end && *forward != '\'') {
    if (*forward == '\\') {
      ++forward;
    }
    ++forward;
  }

  if (end <= forward) {
    auto lineno = src->size() - 1;
    auto line = src->get_line(lineno);
    Error(ErrorType::SYNTAX_ERROR,
          SourceSpan(src, (uint32_t)lineno, (uint32_t)line.length() - 1),
          "Incomplete character literal")
        .raise();
  } else {
    str value = src->substr(src->forward(start), forward); // not including the single quotes
    if (value[0] == '\\') {
      if (value.length() != 2) {
        report_error(forward, "Invalid character literal");
      }
      value = str(1, escape_char(value[1]));
    } else if (value.length() != 1) {
      report_error(forward, "Invalid character literal");
    }
    t = new Token(TokenType::CHAR, start.l, start.c, value, src);
    start = src->forward(forward);
  }
  return t;
}

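/// Tokenize a string literal. @start must point at the opening double quote; escape
/// sequences inside the literal are resolved, and the stored value excludes the quotes.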
Token *tokenize_string(SourceFile *src, SrcLoc &start) {
  Token *t = nullptr;
  auto forward = src->forward(start);
  const auto end = src->end();

  while (forward < end && *forward != '"') {
    if (*forward == '\\') { // escape
      ++forward;
    }
    ++forward;
  }

  if (end <= forward) {
    auto lineno = src->size() - 1;
    auto line = src->get_line(lineno);
    Error(ErrorType::SYNTAX_ERROR,
          SourceSpan(src, (uint32_t)lineno, (uint32_t)line.length() - 1),
          "Incomplete string literal")
        .raise();
  } else {
    str value = src->substr(src->forward(start), forward); // not including the double quotes
    str escaped = "";
    size_t l = value.length();
    size_t start_i = 0;
    size_t i = 0;
    while (i < l) {
      char c = value[i];
      if (c == '\\') { // resolve the escape sequence and skip both of its characters
        escaped += value.substr(start_i, i - start_i);
        escaped += escape_char(value[i + 1]);
        start_i = i + 2;
        ++i;
      }
      ++i;
    }
    escaped += value.substr(start_i, l - start_i);
    t = new Token(TokenType::STRING, start.l, start.c, escaped, src);
    start = src->forward(forward);
  }
  return t;
}

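/// Dispatch on the punctuation character at @start: comments, char/string literals,
/// operators (longest match of up to three characters), or a plain punctuation token.
/// Returns nullptr if the character is not recognized.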
Token *tokenize_punctuation(SourceFile *src, SrcLoc &start) {
  Token *t = nullptr;
  auto next = src->forward(start);

  if (*start == '/' && (*next == '/' || *next == '*')) { /// line comment or block comment
    t = tokenize_comments(src, start);

  } else if (*start == '\'') { /// char literal
    t = tokenize_char(src, start);

  } else if (*start == '"') { /// string literal
    t = tokenize_string(src, start);

  } else if (std::find(OP.begin(), OP.end(), *start) != OP.end()) { /// operators
    str value;
    SrcLoc orig_start = start;

    {
      SrcLoc nnext = src->forward(next);
      SrcLoc nnnext = src->forward(nnext);
      SrcLoc back_ptr = src->end();
      str two = src->substr(start, nnext);
      str three = src->substr(start, src->forward(nnext));

      if (next < back_ptr && nnext < back_ptr &&
          std::find(OP_ALL.begin(), OP_ALL.end(), three) != OP_ALL.end()) { /// three-character operator
        value = src->substr(start, nnnext);
        start = nnnext;
      } else if (next < back_ptr && std::find(OP_ALL.begin(), OP_ALL.end(), two) != OP_ALL.end()) {
        /// two-character operator
        value = src->substr(start, nnext);
        if (OPERATION_VALUE_TYPE_MAP.find(value) != OPERATION_VALUE_TYPE_MAP.end()) {
          start = nnext;
        }
      } else {
        /// one-character operator
        value = str{*start};
        TAN_ASSERT(OPERATION_VALUE_TYPE_MAP.find(value) != OPERATION_VALUE_TYPE_MAP.end());
        start = next;
      }
    }

    // create a new token from the matched operator
    TokenType type = OPERATION_VALUE_TYPE_MAP[value];
    t = new Token(type, orig_start.l, orig_start.c, value, src);

  } else if (std::find(PUNCTUATIONS.begin(), PUNCTUATIONS.end(), *start) != PUNCTUATIONS.end()) { /// other punctuation
    t = new Token(TokenType::PUNCTUATION, start.l, start.c, str(1, *start), src);
    start = next;

  } else {
    t = nullptr;
  }

  return t;
}

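/// Entry point of the lexer: split @src into a flat list of heap-allocated tokens.
/// A minimal usage sketch (how the SourceFile is created and loaded is assumed to be
/// handled elsewhere, and ownership of the returned tokens presumably stays with the
/// caller):
///
///   SourceFile *src = /* load a .tan file */;
///   vector<Token *> tokens = tokenize(src);
///   for (Token *tok : tokens) {
///     // feed tok to the parser ...
///   }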
vector<Token *> tokenize(SourceFile *src) {
  SrcLoc start = src->begin();
  if (src->size() == 0) {
    return {};
  }
  vector<Token *> tokens;
  const auto end = src->end();
  while (start < end) {
    /// starts with a letter: a keyword or an identifier
    if (std::isalpha(*start)) {
      auto *new_token = tokenize_keyword(src, start);
      if (!new_token) {
        /// not a keyword; try an identifier
        new_token = tokenize_id(src, start);
        if (!new_token) {
          report_error(start, "Invalid identifier");
        }
      }
      tokens.emplace_back(new_token);
    } else if (*start == '_') {
      /// starts with an underscore: must be an identifier
      auto *new_token = tokenize_id(src, start);
      if (!new_token) {
        report_error(start, "Invalid identifier");
      }
      tokens.emplace_back(new_token);
    } else if (std::isdigit(*start)) {
      /// number literal
      auto *new_token = tokenize_number(src, start);
      if (!new_token) {
        report_error(start, "Invalid number literal");
      }
      tokens.emplace_back(new_token);
    } else if (std::find(PUNCTUATIONS.begin(), PUNCTUATIONS.end(), *start) != PUNCTUATIONS.end()) {
      /// punctuation
      auto *new_token = tokenize_punctuation(src, start);
      if (!new_token) {
        report_error(start, "Invalid symbol(s)");
      }
      tokens.emplace_back(new_token);
    } else {
      report_error(start, "Invalid symbol(s)");
    }
    start = skip_whitespace(src, start);
  }
  return tokens;
}

} // namespace tanlang