1 #include "lexer/lexer.h"
3 #include "source_file/token.h"
4 #include "source_file/source_file.h"
/// True iff character \p x terminates an identifier/keyword/number token.
/// Fixes vs. the previous version:
///  - the macro argument is parenthesized everywhere (precedence safety),
///  - the duplicated `x == ';'` comparison was removed,
///  - the char is cast to unsigned char before std::isspace, since calling
///    std::isspace with a negative plain-char value is undefined behavior.
/// NOTE: the argument is still evaluated multiple times — do not pass an
/// expression with side effects (e.g. IS_DELIMITER(*p++)).
#define IS_DELIMITER(x) \
  ((x) == ';' || std::isspace(static_cast<unsigned char>(x)) || (x) == ',' || (x) == '.' || (x) == '!' || \
   (x) == '@' || (x) == '#' || (x) == '$' || (x) == '%' || (x) == '^' || (x) == '&' || (x) == '*' || \
   (x) == '(' || (x) == ')' || (x) == '-' || (x) == '+' || (x) == '=' || (x) == '<' || (x) == '>' || \
   (x) == '/' || (x) == '?' || (x) == '\\' || (x) == '|' || (x) == '{' || (x) == '}' || (x) == '[' || \
   (x) == ']' || (x) == '\'' || (x) == '"' || (x) == ':')
/// Raise a SYNTAX_ERROR at the single source location \p c with \p message.
/// [[noreturn]]: relies on Error::raise() never returning — TODO confirm
/// raise() throws/aborts in all build configurations.
[[noreturn]] static void report_error(SrcLoc c, const str &message) {
  // Span covers exactly one location (c, c).
  Error(ErrorType::SYNTAX_ERROR, SourceSpan(c, c), message).raise();
/// Advance \p ptr past whitespace characters in \p src and return the
/// first location that is neither whitespace nor '\0' (or src->end()).
SrcLoc skip_whitespace(SourceFile *src, SrcLoc ptr) {
  const auto end = src->end();
  // '\0' is skipped as well — presumably padding/sentinel bytes inserted by
  // SourceFile; verify against SourceFile's storage.
  // NOTE(review): std::isspace on a plain char is UB for negative values;
  // consider casting *ptr to unsigned char.
  while (ptr < end && (std::isspace(*ptr) || *ptr == '\0')) {
/// Tokenize an identifier ([A-Za-z0-9_]+) starting at \p start.
/// On success builds an ID token covering [start, forward); the behavior
/// when the first character is not an identifier character (start ==
/// forward) is not fully visible here — TODO confirm it returns null/err.
Token *tokenize_id(SourceFile *src, SrcLoc &start) {
  const auto end = src->end();
  // Extend `forward` while characters are alphanumeric or '_'.
  while (forward != end) {
    if (std::isalnum(*forward) || *forward == '_') {
    // start == forward means not even one identifier character matched.
    } else if (start == forward) {
  // Token text is the half-open span [start, forward).
  ret = new Token(TokenType::ID, start.l, start.c, src->substr(start, forward), src);
/// Tokenize an identifier and upgrade its type to KEYWORD when its text
/// appears in the global KEYWORDS list.
Token *tokenize_keyword(SourceFile *src, SrcLoc &start) {
  // find whether the value is in KEYWORDS (in token.h/token.cpp) based on
  // returned value of tokenize_id()
  SrcLoc forward = start;
  auto *t = tokenize_id(src, forward);
  // Linear scan of KEYWORDS; fine for a small fixed keyword set.
  if (std::find(KEYWORDS.begin(), KEYWORDS.end(), t->get_value()) != KEYWORDS.end()) {
    t->set_type(TokenType::KEYWORD);
/// Tokenize a comment starting at \p start. The caller (tokenize_punctuation)
/// has already verified that *start is '/' followed by '/' or '*'.
Token *tokenize_comments(SourceFile *src, SrcLoc &start) {
  auto next = src->forward(start);
  if (*next == '/') { /// line comments
    // Comment value is the remainder of the line after "//".
    auto value = src->substr(src->forward(next));
    t = new Token(TokenType::COMMENTS, start.l, start.c, value, src);
    // Move start to end-of-line so the caller resumes on the next line.
    start.c = (uint32_t)src->get_line(start.l).length();
  } else if (*next == '*') { /// block comments
    auto forward = start = src->forward(next); /// forward now points to the character after "
    // Scan line by line until the closing "*/" is found.
    while ((size_t)forward.l < src->size()) {
      auto re = std::regex(R"(\*\/)");
      auto s = src->get_line(forward.l);
      if (std::regex_search(s, result, re)) {
        // NOTE(review): the regex searches the WHOLE line, so a "*/" that
        // appears before the comment opener on the same line could match —
        // confirm against the elided control flow.
        forward.c = (uint32_t)result.position(0); // forward is the position of */
        str comment_val = src->substr(start, forward);
        t = new Token(TokenType::COMMENTS, start.l, start.c, comment_val, src);
    // Unterminated block comment reaches here.
    report_error(start, "Invalid comments");
    // '/' followed by something other than '/' or '*'.
    report_error(start, "Invalid comments");
/// Tokenize an integer or float literal starting at \p start.
/// Accepts: decimal digits; hex digits and 'x'/'X' when the literal begins
/// with '0'; a 'u' suffix marking the integer unsigned; a '.' switching the
/// literal to a float. Stops at a delimiter (IS_DELIMITER).
Token *tokenize_number(SourceFile *src, SrcLoc &start) {
  auto forward = start;
  const auto end = src->end();
  bool is_float = false;
  bool is_unsigned = false;
  bool contains_digit = false;
  auto start_digit_i = start;
  while (forward < end) {
    const char ch = *forward;
    if (std::isdigit(ch)) {
      contains_digit = true;
    // Hex letters / 'x' are only legal when the literal starts with '0'.
    } else if (*start_digit_i == '0' &&
               ((ch <= 'F' && ch >= 'A') || (ch <= 'f' && ch >= 'a') || ch == 'x' || ch == 'X')) {
    // 'u' suffix: unsigned integer (not allowed after a '.').
    } else if (contains_digit && !is_float && ch == 'u') {
    // '.' after at least one digit: literal becomes a float.
    } else if (contains_digit && ch == '.') {
    // A delimiter terminates the literal — presumably exits the loop.
    } else if (IS_DELIMITER(ch)) {
    // Anything else inside the literal is an error.
    report_error(forward, "Unexpected character within a number literal");
  // Token text is the half-open span [start, forward).
  auto *t = new Token(is_float ? TokenType::FLOAT : TokenType::INT, start.l, start.c, src->substr(start, forward), src);
  t->set_is_unsigned(is_unsigned);
/// Translate the character following a backslash in a char/string literal
/// into the character it denotes (presumably 'n' -> '\n', 't' -> '\t',
/// etc. — TODO confirm the full mapping in the implementation).
char escape_char(char c) {
/// Tokenize a character literal; \p start points at the opening '\''.
/// On success, \p start is advanced past the closing quote.
Token *tokenize_char(SourceFile *src, SrcLoc &start) {
  auto forward = src->forward(start);
  const auto end = src->end();
  // Scan for the closing quote; a backslash escapes the following char.
  while (forward < end && *forward != '\'') {
    if (*forward == '\\') {
  // Reached end of file without a closing quote.
  if (end <= forward) {
    auto lineno = src->size() - 1;
    auto line = src->get_line(lineno);
    Error(ErrorType::SYNTAX_ERROR,
          SourceSpan(src, (uint32_t)lineno, (uint32_t)line.length() - 1),
          "Incomplete character literal")
  // Text between the quotes (half-open span).
  // NOTE(review): value[0] assumes the literal is non-empty ('') — confirm
  // the empty case is rejected by the elided code above.
  str value = src->substr(src->forward(start), forward);
  if (value[0] == '\\') {
    // An escape sequence must be exactly backslash + one character.
    if (value.length() != 2) {
      report_error(forward, "Invalid character literal");
    value = str(1, escape_char(value[1]));
  // Unescaped literal must be exactly one character.
  } else if (value.length() != 1) {
    report_error(forward, "Invalid character literal");
  t = new Token(TokenType::CHAR, start.l, start.c, value, src);
  // Resume after the closing quote.
  start = src->forward(forward);
/// Tokenize a string literal; \p start points at the opening '"'.
/// Escape sequences inside the literal are resolved via escape_char().
/// On success, \p start is advanced past the closing quote.
Token *tokenize_string(SourceFile *src, SrcLoc &start) {
  auto forward = src->forward(start);
  const auto end = src->end();
  // Scan for the closing quote; a backslash escapes the following char.
  while (forward < end && *forward != '"') {
    if (*forward == '\\') {
  // Reached end of file without a closing quote.
  if (end <= forward) {
    auto lineno = src->size() - 1;
    auto line = src->get_line(lineno);
    Error(ErrorType::SYNTAX_ERROR,
          SourceSpan(src, (uint32_t)lineno, (uint32_t)line.length() - 1),
          "Incomplete string literal")
  // Raw text between the quotes (escapes not yet resolved).
  str value = src->substr(src->forward(start), forward);
  size_t l = value.length();
  // Copy the chunk before the escape, then the resolved escape char.
  escaped += value.substr(start_i, i - start_i);
  escaped += escape_char(value[i + 1]);
  // Append the tail after the last escape sequence.
  escaped += value.substr(start_i, l - start_i);
  t = new Token(TokenType::STRING, start.l, start.c, escaped, src);
  // Resume after the closing quote.
  start = (*src).forward(forward);
/// Dispatch on the punctuation character at \p start: comments, char
/// literals, string literals, multi-character operators (longest match
/// first), then plain single-character punctuation.
Token *tokenize_punctuation(SourceFile *src, SrcLoc &start) {
  auto next = src->forward(start);
  // "//" or "/*" begins a comment.
  if (*start == '/' && (*next == '/' || *next == '*')) {
    t = tokenize_comments(src, start);
  // '\'' begins a character literal.
  } else if (*start == '\'') {
    t = tokenize_char(src, start);
  // '"' begins a string literal.
  } else if (*start == '"') {
    t = tokenize_string(src, start);
  // First char of an operator: try 3-char, then 2-char, then 1-char forms.
  } else if (std::find(OP.begin(), OP.end(), *start) != OP.end()) {
    SrcLoc orig_start = start;
    SrcLoc nnext = src->forward(next);
    SrcLoc nnnext = src->forward(nnext);
    SrcLoc back_ptr = src->end();
    str two = src->substr(start, nnext);
    str three = src->substr(start, src->forward(nnext));
    // Longest-match: a three-character operator wins over shorter ones.
    if (next < back_ptr && nnext < back_ptr &&
        std::find(OP_ALL.begin(), OP_ALL.end(), three) != OP_ALL.end()) {
      value = src->substr(start, nnnext);
    } else if (next < back_ptr && std::find(OP_ALL.begin(), OP_ALL.end(), two) != OP_ALL.end()) {
      value = src->substr(start, nnext);
      if (OPERATION_VALUE_TYPE_MAP.find(value) != OPERATION_VALUE_TYPE_MAP.end()) {
    // Every matched operator string must have a mapped TokenType.
    TAN_ASSERT(OPERATION_VALUE_TYPE_MAP.find(value) != OPERATION_VALUE_TYPE_MAP.end());
    TokenType type = OPERATION_VALUE_TYPE_MAP[value];
    t = new Token(type, orig_start.l, orig_start.c, value, src);
  // Plain single-character punctuation.
  } else if (std::find(PUNCTUATIONS.begin(), PUNCTUATIONS.end(), *start) != PUNCTUATIONS.end()) {
    t = new Token(TokenType::PUNCTUATION, start.l, start.c, str(1, *start), src);
/// Main entry point: tokenize the whole source file into a stream of
/// heap-allocated Token pointers. Dispatches on the first character of
/// each token: letter -> keyword/identifier, '_' -> identifier,
/// digit -> number, punctuation -> tokenize_punctuation.
/// NOTE(review): tokens are raw owning pointers — confirm who frees them.
vector<Token *> tokenize(SourceFile *src) {
  SrcLoc start = src->begin();
  // Empty file: nothing to do (early-out path elided here).
  if (src->size() == 0) {
  vector<Token *> tokens;
  const auto end = src->end();
  while (start < end) {
    if (std::isalpha(*start)) {
      // Try a keyword first; fall back to a plain identifier.
      auto *new_token = tokenize_keyword(src, start);
      new_token = tokenize_id(src, start);
      report_error(start, "Invalid identifier");
      tokens.emplace_back(new_token);
    // Identifiers may also begin with an underscore.
    } else if (*start == '_') {
      auto *new_token = tokenize_id(src, start);
      report_error(start, "Invalid identifier");
      tokens.emplace_back(new_token);
    // Leading digit: number literal.
    } else if (std::isdigit(*start)) {
      auto *new_token = tokenize_number(src, start);
      report_error(start, "Invalid number literal");
      tokens.emplace_back(new_token);
    // Punctuation covers operators, comments, char/string literals.
    } else if (std::find(PUNCTUATIONS.begin(), PUNCTUATIONS.end(), *start) != PUNCTUATIONS.end()) {
      auto *new_token = tokenize_punctuation(src, start);
      report_error(start, "Invalid symbol(s)");
      tokens.emplace_back(new_token);
    // Unrecognized leading character.
    report_error(start, "Invalid symbol(s)");
    // Skip whitespace between tokens before the next iteration.
    start = skip_whitespace(src, start);