From 5d66c96a190a396a1535c89bed4e33c2a005fe8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yaroslav=20de=20la=20Pe=C3=B1a=20Smirnov?= Date: Thu, 24 Mar 2022 01:04:02 +0300 Subject: Initial commit Basically it works, just needs some polishing and maybe a couple of features that I could actually use. Also probably better docs. Not sure if it will be of use to anybody besides me. --- src/lexer.c | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 src/lexer.c (limited to 'src/lexer.c') diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..1ba9912 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,264 @@ +#include "lexer.h" +#include "token.h" + +#include +#include +#include +#include + +static bool +isidentc(char c) +{ + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'; +} + +static void +set_token(struct token *token, enum token_type t, const struct slice *s) +{ + token->type = t; + if (s == NULL) { + token->literal.str = ""; + token->literal.start = 0; + token->literal.end = 0; + } else { + slice_cpy(&token->literal, s); + } +} + +static char +lexer_peek_prev_char(struct lexer *lexer) +{ + if (lexer->word.start <= 1) { + return 0; + } + return lexer->input[lexer->word.start - 1]; +} + +static char +lexer_peek_char(struct lexer *lexer) +{ + if (lexer->word.start >= lexer->len) { + return 0; + } + return lexer->input[lexer->word.start + 1]; +} + +static inline void +lexer_read_char(struct lexer *lexer) +{ + lexer->word.start = lexer->word.end; + if (lexer->word.end > lexer->len) { + lexer->word.end = 0; + return; + } + char prevc = lexer_peek_prev_char(lexer); + if (prevc == '\n') { + lexer->line++; + lexer->column = 0; + } + lexer->column++; + lexer->word.end++; +} + +static void +lexer_read_ident(struct lexer *lexer, struct token *token) +{ + size_t start = lexer->word.start; + token->literal.str = lexer->input; + while (isidentc(lexer->input[lexer->word.start]) + || isdigit(lexer->input[lexer->word.start])) { + lexer_read_char(lexer); + } + token->literal.start = start; + token->literal.end = lexer->word.start; +} + +static void +lexer_read_num(struct lexer *lexer, struct token *token) +{ + size_t start = lexer->word.start; + token->literal.str = lexer->input; + while (isdigit(lexer->input[lexer->word.start])) { + lexer_read_char(lexer); + } + token->literal.start = start; + token->literal.end = lexer->word.start; +} + +static void +lexer_read_string(struct lexer *lexer, struct token *token) +{ + size_t start = lexer->word.start; + token->literal.str = lexer->input; + lexer_read_char(lexer); + while(lexer->input[lexer->word.start] != '"' && + lexer->input[lexer->word.start] != '\0') { + lexer_read_char(lexer); + } + lexer_read_char(lexer); + token->literal.start = start; + token->literal.end = lexer->word.start; +} + +static void +lexer_read_content(struct lexer *lexer, struct token *token) +{ + size_t start = lexer->word.start; + token->literal.str = lexer->input; + while(lexer->input[lexer->word.start] != '{' && + lexer->input[lexer->word.start] != '\0') { + lexer_read_char(lexer); + } + token->literal.start = start; + token->literal.end = lexer->word.start; +} + +static void +lexer_eatspace(struct lexer *lexer) +{ + while(isspace(lexer->input[lexer->word.start])) { + lexer_read_char(lexer); + } +} + +struct lexer * +lexer_new(const char *input) +{ + struct lexer *lexer = malloc(sizeof(*lexer)); + lexer->input = input; + lexer->len = strlen(lexer->input); + lexer->word.str = lexer->input; + lexer->word.start = 0; + lexer->word.end = 0; + lexer->in_content = true; + lexer->line = 1; + lexer->column = 0; + lexer_read_char(lexer); + + return lexer; +} + +struct token +lexer_next_token(struct lexer *lexer) +{ + struct token token = { .line = lexer->line, .column = lexer->column }; + char c = lexer->input[lexer->word.start]; + + if (c == '\0') { + set_token(&token, TOKEN_EOF, NULL); + return token; + } + + if (lexer->in_content && c != '{') { + lexer_read_content(lexer, &token); + token.type = TOKEN_CONTENT; + return token; + } + + lexer_eatspace(lexer); + c = lexer->input[lexer->word.start]; + switch (c) { + case '=': + if (lexer_peek_char(lexer) == '=') { + lexer->word.end++; + set_token(&token, TOKEN_EQ, &lexer->word); + } else { + set_token(&token, TOKEN_ILLEGAL, &lexer->word); + } + break; + case '+': + set_token(&token, TOKEN_PLUS, &lexer->word); + break; + case '-': + set_token(&token, TOKEN_MINUS, &lexer->word); + break; + case '!': + if (lexer_peek_char(lexer) == '=') { + lexer->word.end++; + set_token(&token, TOKEN_NOTEQ, &lexer->word); + } else { + set_token(&token, TOKEN_BANG, &lexer->word); + } + break; + case '/': + set_token(&token, TOKEN_SLASH, &lexer->word); + break; + case '*': + set_token(&token, TOKEN_ASTERISK, &lexer->word); + break; + case '<': + if (lexer_peek_char(lexer) == '=') { + lexer->word.end++; + set_token(&token, TOKEN_LTE, &lexer->word); + } else { + set_token(&token, TOKEN_LT, &lexer->word); + } + break; + case '>': + if (lexer_peek_char(lexer) == '=') { + lexer->word.end++; + set_token(&token, TOKEN_GTE, &lexer->word); + } else { + set_token(&token, TOKEN_GT, &lexer->word); + } + break; + case '(': + set_token(&token, TOKEN_LPAREN, &lexer->word); + break; + case ')': + set_token(&token, TOKEN_RPAREN, &lexer->word); + break; + case '.': + set_token(&token, TOKEN_DOT, &lexer->word); + break; + case ',': + set_token(&token, TOKEN_COMMA, &lexer->word); + break; + case '[': + set_token(&token, TOKEN_LBRACKET, &lexer->word); + break; + case ']': + set_token(&token, TOKEN_RBRACKET, &lexer->word); + break; + case '{': + lexer->in_content = false; + set_token(&token, TOKEN_LBRACE, &lexer->word); + break; + case '}':{ + char prevc = lexer_peek_prev_char(lexer); + if (prevc == '}' || prevc == '%') { + lexer->in_content = true; + } + set_token(&token, TOKEN_RBRACE, &lexer->word); + break; + } + case '%': + set_token(&token, TOKEN_PERCENT, &lexer->word); + break; + default: + if (c == '"') { + lexer_read_string(lexer, &token); + token.type = TOKEN_STRING; + return token; + } else if (isidentc(c)) { + lexer_read_ident(lexer, &token); + token.type = token_lookup_ident(&token.literal); + return token; + } else if (isdigit(c)) { + lexer_read_num(lexer, &token); + token.type = TOKEN_INT; + return token; + } + set_token(&token, TOKEN_ILLEGAL, &lexer->word); + } + + lexer_read_char(lexer); + + return token; +} + +void +lexer_destroy(struct lexer *lexer) +{ + free(lexer); +} -- cgit v1.2.3