harec

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 5bab90a0b147434119ec607cff42664cee3c15e1
parent 5706f05c670e8b6e70e372f018ecfed7a033ed46
Author: Drew DeVault <sir@cmpwn.com>
Date:   Wed,  7 Oct 2020 17:02:33 -0400

lex: flesh out a bit more

Diffstat:
M include/lex.h | 5 +++++
M src/lex.c | 271 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
M src/main.c | 20 ++++++++++++++++++--
3 files changed, 274 insertions(+), 22 deletions(-)

diff --git a/include/lex.h b/include/lex.h @@ -48,6 +48,7 @@ enum lexical_token { T_USE, T_VOID, T_WHILE, + T_LAST_KEYWORD = T_WHILE, // Operators T_ANDEQ, @@ -93,7 +94,9 @@ enum lexical_token { T_SLICE, T_TIMES, T_TIMESEQ, + T_XOR, T_XOREQ, + T_LAST_OPERATOR = T_XOREQ, // Tokens with additional information T_NAME, @@ -115,9 +118,11 @@ struct lexer { FILE *in; char *buf; size_t bufsz, buflen; + int c; }; void lex_init(struct lexer *lexer, FILE *f); +void lex_finish(struct lexer *lexer); int lex(struct lexer *lexer, struct token *out); void token_finish(struct token *tok); diff --git a/src/lex.c b/src/lex.c @@ -1,71 +1,290 @@ #include <assert.h> #include <ctype.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "lex.h" -// Gets the next non-whitespace character +static const char *tokens[] = { + // Must be alpha sorted + [T_AS] = "as", + [T_ASSERT] = "assert", + [T_BOOL] = "bool", + [T_BREAK] = "break", + [T_CHAR] = "char", + [T_CONST] = "const", + [T_CONTINUE] = "continue", + [T_DEF] = "def", + [T_ELSE] = "else", + [T_ENUM] = "enum", + [T_EXPORT] = "export", + [T_F32] = "f32", + [T_F64] = "f64", + [T_FALSE] = "false", + [T_FN] = "fn", + [T_FOR] = "for", + [T_I16] = "i16", + [T_I32] = "i32", + [T_I64] = "i64", + [T_I8] = "i8", + [T_IF] = "if", + [T_INT] = "int", + [T_IS] = "is", + [T_LEN] = "len", + [T_LET] = "let", + [T_MATCH] = "match", + [T_NULL] = "null", + [T_NULLABLE] = "nullable", + [T_RETURN] = "return", + [T_SIZE] = "size", + [T_STATIC] = "static", + [T_STR] = "str", + [T_STRUCT] = "struct", + [T_SWITCH] = "switch", + [T_TRUE] = "true", + [T_U16] = "u16", + [T_U32] = "u32", + [T_U64] = "u64", + [T_U8] = "u8", + [T_UINT] = "uint", + [T_UINTPTR] = "uintptr", + [T_UNION] = "union", + [T_USE] = "use", + [T_VOID] = "void", + [T_WHILE] = "while", + + // Operators + [T_ANDEQ] = "&=", + [T_BAND] = "&", + [T_BNOT] = "~", + [T_BOR] = "|", + [T_CASE] = "=>", + [T_COLON] = ":", + [T_COMMA] = ",", + [T_DIV] = "/", + [T_DIVEQ] = 
"/=", + [T_DOT] = ".", + [T_DOUBLE_COLON] = "::", + [T_ELLIPSIS] = "...", + [T_EQUAL] = "=", + [T_GREATER] = ">", + [T_GTR_EQL] = ">=", + [T_LAND] = "&&", + [T_LBRACE] = "{", + [T_LBRACKET] = "[", + [T_LEQUAL] = "==", + [T_LESS] = "<", + [T_LESS_EQL] = "<=", + [T_LNOT] = "!", + [T_LOR] = "||", + [T_LPAREN] = "(", + [T_LSHIFT] = "<<", + [T_LSHIFTEQ] = "<<=", + [T_MINUS] = "-", + [T_MINUSEQ] = "-=", + [T_MODEQ] = "%=", + [T_MODULO] = "%", + [T_NEQUAL] = "!=", + [T_OREQ] = "|=", + [T_PLUS] = "+", + [T_PLUSEQ] = "+=", + [T_RBRACE] = "}", + [T_RBRACKET] = "]", + [T_RPAREN] = ")", + [T_RSHIFT] = ">>", + [T_RSHIFTEQ] = ">>=", + [T_SEMICOLON] = ";", + [T_SLICE] = "..", + [T_TIMES] = "*", + [T_TIMESEQ] = "*=", + [T_XOR] = "^=", + [T_XOREQ] = "^=", +}; + +void +lex_init(struct lexer *lexer, FILE *f) +{ + memset(lexer, 0, sizeof(*lexer)); + lexer->in = f; + lexer->bufsz = 256; + lexer->buf = calloc(1, lexer->bufsz); +} + +void +lex_finish(struct lexer *lexer) +{ + fclose(lexer->in); + free(lexer->buf); +} + static int -lwgetc(struct lexer *lexer) +next(struct lexer *lexer, bool buffer) { int c; - while ((c = fgetc(lexer->in)) != EOF && isspace(c)) ; + if (lexer->c != 0) { + c = lexer->c; + lexer->c = 0; + } else { + c = fgetc(lexer->in); + } + if (c == EOF || !buffer) { + return c; + } + if (lexer->buflen + 1 >= lexer->bufsz) { + lexer->bufsz *= 2; + lexer->buf = realloc(lexer->buf, lexer->bufsz); + assert(lexer->buf); + } + lexer->buf[lexer->buflen++] = c; + lexer->buf[lexer->buflen] = '\0'; return c; } static int -lex_name(struct lexer *lexer, struct token *out) +wgetc(struct lexer *lexer) { - assert(0); // TODO + int c; + while ((c = next(lexer, false)) != EOF && isspace(c)) ; + return c; +} + +static void +consume(struct lexer *lexer, ssize_t n) +{ + if (n == -1) { + lexer->buflen = 0; + lexer->buf[0] = 0; + return; + } + memmove(lexer->buf, &lexer->buf[lexer->buflen], lexer->buflen - n); + lexer->buflen -= n; +} + +static void +push(struct lexer *lexer, int c, bool 
buffer) +{ + lexer->c = c; + if (buffer) { + lexer->buf[--lexer->buflen] = 0; + } } static int -lex_literal(struct lexer *lexer, struct token *out) +cmp_keyword(const void *va, const void *vb) { - assert(0); // TODO + return strcmp(*(const char **)va, *(const char **)vb); } -void -lex_init(struct lexer *lexer, FILE *f) +static int +lex_name(struct lexer *lexer, struct token *out) { - memset(lexer, 0, sizeof(*lexer)); - lexer->in = f; + int c = next(lexer, true); + if (c == EOF || (!isalpha(c) && c != '_')) { + out->token = T_ERROR; + return EOF; + } + while ((c = next(lexer, true)) != EOF) { + if (!isalnum(c) && c != '_') { + push(lexer, c, true); + goto lookup; + } + } + out->token = T_EOF; + return c; + +lookup:; + void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1, + sizeof(tokens[0]), cmp_keyword); + if (!token) { + out->token = T_NAME; + out->name = strdup(lexer->buf); + } else { + out->token = (const char **)token - tokens; + } + consume(lexer, -1); + return c; +} + +static int +lex_literal(struct lexer *lexer, struct token *out) +{ + assert(0); // TODO } int lex(struct lexer *lexer, struct token *out) { - int c = lwgetc(lexer); + int c = wgetc(lexer); if (c == EOF) { out->token = T_EOF; return c; } if (isalpha(c)) { - // TODO: internal buffering - ungetc(c, lexer->in); + push(lexer, c, false); return lex_name(lexer, out); } if (isdigit(c)) { - // TODO: internal buffering - ungetc(c, lexer->in); + push(lexer, c, false); return lex_literal(lexer, out); } switch (c) { - case '.': case '"': case '\'': - // TODO: internal buffering - ungetc(c, lexer->in); + push(lexer, c, false); return lex_literal(lexer, out); + case '.': // . .. ... + case '&': // & && &= + case '|': // | || |= + case '<': // < << <= + case '>': // > >> >= + case '=': // = == => + assert(0); // TODO + case '^': // ^ ^= + case '*': // * *= + case '%': // % %= + case '/': // / /= + case '+': // + += + case '-': // - -= + case ':': // : :: + case '!': // ! 
!= + assert(0); // TODO + case '~': + out->token = T_BNOT; + break; + case ',': + out->token = T_COMMA; + break; + case '{': + out->token = T_LBRACE; + break; + case '[': + out->token = T_LBRACKET; + break; + case '(': + out->token = T_LPAREN; + break; + case '}': + out->token = T_RBRACE; + break; + case ']': + out->token = T_RBRACKET; + break; + case ')': + out->token = T_RPAREN; + break; + case ';': + out->token = T_SEMICOLON; + break; default: assert(0); // TODO: Operators } - assert(0); // Unreachable + consume(lexer, 1); + return c; } void @@ -79,3 +298,15 @@ token_finish(struct token *tok) break; } } + +const char * +token_str(const struct token *tok) +{ + switch (tok->token) { + case T_NAME: + return tok->name; + default: + assert(tok->token < sizeof(tokens) / sizeof(tokens[0])); + return tokens[tok->token]; + } +} diff --git a/src/main.c b/src/main.c @@ -4,11 +4,27 @@ int main(int argc, char *argv[]) { - struct token tok; struct lexer lexer; lex_init(&lexer, stdin); + + struct token tok; while (lex(&lexer, &tok) != EOF) { - // TODO + switch (tok.token) { + case T_NAME: + fprintf(stderr, "'%s'\n", tok.name); + break; + case T_ERROR: + fprintf(stderr, "ERROR\n"); + break; + case T_EOF: + fprintf(stderr, "EOF\n"); + break; + default: + fprintf(stderr, "%s\n", token_str(&tok)); + break; + } }; + + lex_finish(&lexer); return 0; }