commit 5bab90a0b147434119ec607cff42664cee3c15e1
parent 5706f05c670e8b6e70e372f018ecfed7a033ed46
Author: Drew DeVault <sir@cmpwn.com>
Date: Wed, 7 Oct 2020 17:02:33 -0400
lex: flesh out a bit more
Diffstat:
3 files changed, 274 insertions(+), 22 deletions(-)
diff --git a/include/lex.h b/include/lex.h
@@ -48,6 +48,7 @@ enum lexical_token {
T_USE,
T_VOID,
T_WHILE,
+ T_LAST_KEYWORD = T_WHILE,
// Operators
T_ANDEQ,
@@ -93,7 +94,9 @@ enum lexical_token {
T_SLICE,
T_TIMES,
T_TIMESEQ,
+ T_XOR,
T_XOREQ,
+ T_LAST_OPERATOR = T_XOREQ,
// Tokens with additional information
T_NAME,
@@ -115,9 +118,11 @@ struct lexer {
FILE *in;
char *buf;
size_t bufsz, buflen;
+ int c;
};
void lex_init(struct lexer *lexer, FILE *f);
+void lex_finish(struct lexer *lexer);
int lex(struct lexer *lexer, struct token *out);
void token_finish(struct token *tok);
diff --git a/src/lex.c b/src/lex.c
@@ -1,71 +1,290 @@
#include <assert.h>
#include <ctype.h>
+#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"
-// Gets the next non-whitespace character
+// Source spelling of every keyword and operator token, indexed by
+// enum lexical_token. lex_name() runs bsearch() over the first
+// T_LAST_KEYWORD + 1 entries, so the keyword entries must stay in strcmp()
+// order.
+// NOTE(review): the bsearch() also relies on the keyword enumerators being
+// contiguous from index 0 in this same order - confirm against lex.h.
+static const char *tokens[] = {
+	// Keywords: must be alpha sorted (strcmp order)
+	[T_AS] = "as",
+	[T_ASSERT] = "assert",
+	[T_BOOL] = "bool",
+	[T_BREAK] = "break",
+	[T_CHAR] = "char",
+	[T_CONST] = "const",
+	[T_CONTINUE] = "continue",
+	[T_DEF] = "def",
+	[T_ELSE] = "else",
+	[T_ENUM] = "enum",
+	[T_EXPORT] = "export",
+	[T_F32] = "f32",
+	[T_F64] = "f64",
+	[T_FALSE] = "false",
+	[T_FN] = "fn",
+	[T_FOR] = "for",
+	[T_I16] = "i16",
+	[T_I32] = "i32",
+	[T_I64] = "i64",
+	[T_I8] = "i8",
+	[T_IF] = "if",
+	[T_INT] = "int",
+	[T_IS] = "is",
+	[T_LEN] = "len",
+	[T_LET] = "let",
+	[T_MATCH] = "match",
+	[T_NULL] = "null",
+	[T_NULLABLE] = "nullable",
+	[T_RETURN] = "return",
+	[T_SIZE] = "size",
+	[T_STATIC] = "static",
+	[T_STR] = "str",
+	[T_STRUCT] = "struct",
+	[T_SWITCH] = "switch",
+	[T_TRUE] = "true",
+	[T_U16] = "u16",
+	[T_U32] = "u32",
+	[T_U64] = "u64",
+	[T_U8] = "u8",
+	[T_UINT] = "uint",
+	[T_UINTPTR] = "uintptr",
+	[T_UNION] = "union",
+	[T_USE] = "use",
+	[T_VOID] = "void",
+	[T_WHILE] = "while",
+
+	// Operators
+	[T_ANDEQ] = "&=",
+	[T_BAND] = "&",
+	[T_BNOT] = "~",
+	[T_BOR] = "|",
+	[T_CASE] = "=>",
+	[T_COLON] = ":",
+	[T_COMMA] = ",",
+	[T_DIV] = "/",
+	[T_DIVEQ] = "/=",
+	[T_DOT] = ".",
+	[T_DOUBLE_COLON] = "::",
+	[T_ELLIPSIS] = "...",
+	[T_EQUAL] = "=",
+	[T_GREATER] = ">",
+	[T_GTR_EQL] = ">=",
+	[T_LAND] = "&&",
+	[T_LBRACE] = "{",
+	[T_LBRACKET] = "[",
+	[T_LEQUAL] = "==",
+	[T_LESS] = "<",
+	[T_LESS_EQL] = "<=",
+	[T_LNOT] = "!",
+	[T_LOR] = "||",
+	[T_LPAREN] = "(",
+	[T_LSHIFT] = "<<",
+	[T_LSHIFTEQ] = "<<=",
+	[T_MINUS] = "-",
+	[T_MINUSEQ] = "-=",
+	[T_MODEQ] = "%=",
+	[T_MODULO] = "%",
+	[T_NEQUAL] = "!=",
+	[T_OREQ] = "|=",
+	[T_PLUS] = "+",
+	[T_PLUSEQ] = "+=",
+	[T_RBRACE] = "}",
+	[T_RBRACKET] = "]",
+	[T_RPAREN] = ")",
+	[T_RSHIFT] = ">>",
+	[T_RSHIFTEQ] = ">>=",
+	[T_SEMICOLON] = ";",
+	[T_SLICE] = "..",
+	[T_TIMES] = "*",
+	[T_TIMESEQ] = "*=",
+	// Plain xor is "^"; only the compound assignment carries the '='.
+	[T_XOR] = "^",
+	[T_XOREQ] = "^=",
+};
+
+// Resets *lexer to all zeroes, attaches the input stream f and allocates the
+// initial 256-byte, zero-filled token buffer.
+void
+lex_init(struct lexer *lexer, FILE *f)
+{
+	memset(lexer, 0, sizeof(*lexer));
+	lexer->in = f;
+	lexer->bufsz = 256;
+	lexer->buf = calloc(1, lexer->bufsz);
+	// next() and consume() write through buf unconditionally, so fail here
+	// (as next() does after realloc) instead of on a later NULL dereference.
+	assert(lexer->buf);
+}
+
+// Tears a lexer down: frees its token buffer and closes the input stream it
+// was given in lex_init().
+void
+lex_finish(struct lexer *lexer)
+{
+	free(lexer->buf);
+	fclose(lexer->in);
+}
+
+// Returns the next input character: the one stashed in lexer->c by push() if
+// there is one, otherwise a fresh fgetc() from lexer->in. When buffer is true
+// and the character is not EOF it is also appended to lexer->buf, which is
+// doubled in size when full and kept NUL-terminated.
+// NOTE(review): lexer->c == 0 doubles as "nothing pushed back", so a
+// pushed-back NUL byte would be silently dropped.
static int
-lwgetc(struct lexer *lexer)
+next(struct lexer *lexer, bool buffer)
{
int c;
- while ((c = fgetc(lexer->in)) != EOF && isspace(c)) ;
+ if (lexer->c != 0) {
+ c = lexer->c;
+ lexer->c = 0;
+ } else {
+ c = fgetc(lexer->in);
+ }
+ if (c == EOF || !buffer) {
+ return c;
+ }
+ // Grow first so there is room for both c and the terminator written below.
+ if (lexer->buflen + 1 >= lexer->bufsz) {
+ lexer->bufsz *= 2;
+ lexer->buf = realloc(lexer->buf, lexer->bufsz);
+ assert(lexer->buf);
+ }
+ lexer->buf[lexer->buflen++] = c;
+ lexer->buf[lexer->buflen] = '\0';
return c;
}
+// Skips whitespace and returns the first non-whitespace character, or EOF.
+// Characters are read with buffering disabled, so nothing is appended to
+// lexer->buf here.
static int
-lex_name(struct lexer *lexer, struct token *out)
+wgetc(struct lexer *lexer)
{
- assert(0); // TODO
+ int c;
+ while ((c = next(lexer, false)) != EOF && isspace(c)) ;
+ return c;
+}
+
+// Drops the first n bytes of the token buffer and shifts the remainder to
+// the front. Passing -1 (or any n that covers the whole buffer) empties it.
+// The buffer is left NUL-terminated in every case.
+static void
+consume(struct lexer *lexer, ssize_t n)
+{
+	// The size_t cast also routes any negative n to the "clear everything"
+	// path, so the subtraction below can never underflow.
+	if (n == -1 || (size_t)n >= lexer->buflen) {
+		lexer->buflen = 0;
+		lexer->buf[0] = 0;
+		return;
+	}
+	// Keep the tail: the surviving bytes start at offset n, not at buflen.
+	memmove(lexer->buf, &lexer->buf[n], lexer->buflen - n);
+	lexer->buflen -= n;
+	lexer->buf[lexer->buflen] = 0;
+}
+
+// Stashes c in lexer->c so that the following next() call returns it again.
+// When buffer is true, the last byte of lexer->buf is removed as well; callers
+// pass true only right after a buffered next(), so that byte is c.
+static void
+push(struct lexer *lexer, int c, bool buffer)
+{
+	if (buffer) {
+		lexer->buflen -= 1;
+		lexer->buf[lexer->buflen] = '\0';
+	}
+	lexer->c = c;
}
+// bsearch() comparator: each argument points at a `const char *`, and the two
+// pointed-to strings are ordered with strcmp().
+// NOTE(review): a NULL table entry inside the searched range would be passed
+// to strcmp() - confirm the keyword range of the tokens table has no gaps.
static int
-lex_literal(struct lexer *lexer, struct token *out)
+cmp_keyword(const void *va, const void *vb)
{
- assert(0); // TODO
+ return strcmp(*(const char **)va, *(const char **)vb);
}
+// Lexes one name: a letter or '_' followed by any run of letters, digits and
+// '_'. If the text matches an entry in the keyword part of the tokens table,
+// out->token is that keyword; otherwise out->token is T_NAME and out->name is
+// a strdup()ed copy.
+//
+// Returns EOF with out->token = T_ERROR when the next character cannot start
+// a name. Otherwise returns a non-EOF value: the character that ended the
+// name (which is pushed back for the next call), or the name's first
+// character when the name ran up to end of input.
-void
-lex_init(struct lexer *lexer, FILE *f)
+static int
+lex_name(struct lexer *lexer, struct token *out)
{
-	memset(lexer, 0, sizeof(*lexer));
-	lexer->in = f;
+	int first = next(lexer, true);
+	if (first == EOF || (!isalpha(first) && first != '_')) {
+		// Do not leave the rejected character in front of the next token.
+		consume(lexer, -1);
+		out->token = T_ERROR;
+		return EOF;
+	}
+	int c;
+	while ((c = next(lexer, true)) != EOF) {
+		if (!isalnum(c) && c != '_') {
+			push(lexer, c, true);
+			break;
+		}
+	}
+	if (c == EOF) {
+		// A name that runs up to end of input is still a complete token.
+		// EOF was never buffered, so stash it without touching the buffer;
+		// the next lex() call then reports T_EOF.
+		push(lexer, c, false);
+	}
+
+	void *token = bsearch(&lexer->buf, tokens, T_LAST_KEYWORD + 1,
+		sizeof(tokens[0]), cmp_keyword);
+	if (!token) {
+		out->token = T_NAME;
+		out->name = strdup(lexer->buf);
+		assert(out->name);
+	} else {
+		out->token = (const char **)token - tokens;
+	}
+	consume(lexer, -1);
+	// Never return EOF for a successfully lexed name, or callers looping on
+	// "!= EOF" would drop the final token of the input.
+	return c != EOF ? c : first;
+}
+
+// Placeholder for literal lexing: not implemented yet, so the body is only a
+// failing assert and no value is returned.
+static int
+lex_literal(struct lexer *lexer, struct token *out)
+{
+ assert(0); // TODO
}
+// Reads the next token from the lexer into *out.
+//
+// Leading whitespace is skipped. Names and keywords are delegated to
+// lex_name(), numeric/string/character literals to lex_literal(), and
+// single-character punctuation is handled inline. Multi-character operators
+// are still TODO and trip an assert.
+//
+// Returns EOF (with out->token = T_EOF) at end of input; for inline
+// punctuation returns the character that was read.
int
lex(struct lexer *lexer, struct token *out)
{
-	int c = lwgetc(lexer);
+	int c = wgetc(lexer);
	if (c == EOF) {
		out->token = T_EOF;
		return c;
	}
-	if (isalpha(c)) {
-		// TODO: internal buffering
-		ungetc(c, lexer->in);
+	// lex_name() accepts a leading '_', so route it there as well.
+	if (isalpha(c) || c == '_') {
+		push(lexer, c, false);
		return lex_name(lexer, out);
	}
	if (isdigit(c)) {
-		// TODO: internal buffering
-		ungetc(c, lexer->in);
+		push(lexer, c, false);
		return lex_literal(lexer, out);
	}
	switch (c) {
-	case '.':
	case '"':
	case '\'':
-		// TODO: internal buffering
-		ungetc(c, lexer->in);
+		push(lexer, c, false);
		return lex_literal(lexer, out);
+	case '.': // . .. ...
+	case '&': // & && &=
+	case '|': // | || |=
+	case '<': // < << <=
+	case '>': // > >> >=
+	case '=': // = == =>
+		assert(0); // TODO
+	case '^': // ^ ^=
+	case '*': // * *=
+	case '%': // % %=
+	case '/': // / /=
+	case '+': // + +=
+	case '-': // - -=
+	case ':': // : ::
+	case '!': // ! !=
+		assert(0); // TODO
+	case '~':
+		out->token = T_BNOT;
+		break;
+	case ',':
+		out->token = T_COMMA;
+		break;
+	case '{':
+		out->token = T_LBRACE;
+		break;
+	case '[':
+		out->token = T_LBRACKET;
+		break;
+	case '(':
+		out->token = T_LPAREN;
+		break;
+	case '}':
+		out->token = T_RBRACE;
+		break;
+	case ']':
+		out->token = T_RBRACKET;
+		break;
+	case ')':
+		out->token = T_RPAREN;
+		break;
+	case ';':
+		out->token = T_SEMICOLON;
+		break;
	default:
		assert(0); // TODO: Operators
	}
-	assert(0); // Unreachable
+	// wgetc() reads without buffering, so lexer->buf is empty here and there
+	// is nothing to consume; consume(lexer, 1) on an empty buffer would make
+	// the memmove length wrap around.
+	return c;
}
void
@@ -79,3 +298,15 @@ token_finish(struct token *tok)
break;
}
}
+
+// Returns the text of a token: the stored identifier for T_NAME, otherwise
+// the tokens table entry for that token kind (asserting the kind is within
+// the table).
+// NOTE(review): the lex.h hunk of this change adds no prototype for
+// token_str() although main() calls it - confirm it is declared there.
+const char *
+token_str(const struct token *tok)
+{
+	if (tok->token == T_NAME) {
+		return tok->name;
+	}
+	assert(tok->token < sizeof(tokens) / sizeof(tokens[0]));
+	return tokens[tok->token];
+}
diff --git a/src/main.c b/src/main.c
@@ -4,11 +4,27 @@
+// Debug driver: lexes stdin until lex() returns EOF and prints one line per
+// token to stderr.
int
main(int argc, char *argv[])
{
-	struct token tok;
	struct lexer lexer;
	lex_init(&lexer, stdin);
+
+	struct token tok;
	while (lex(&lexer, &tok) != EOF) {
-		// TODO
+		switch (tok.token) {
+		case T_NAME:
+			fprintf(stderr, "'%s'\n", tok.name);
+			break;
+		case T_ERROR:
+			fprintf(stderr, "ERROR\n");
+			break;
+		case T_EOF:
+			fprintf(stderr, "EOF\n");
+			break;
+		default:
+			fprintf(stderr, "%s\n", token_str(&tok));
+			break;
+		}
+		// T_NAME tokens own a strdup()ed name; release per-token storage
+		// before tok is overwritten by the next lex() call.
+		// NOTE(review): assumes token_finish() frees tok.name and is a
+		// no-op for other kinds - its body is not visible in this change.
+		token_finish(&tok);
-	};
+	}
+
+	lex_finish(&lexer);
	return 0;
}