commit 37f47c120bf9cf72a226eabf12462e22df275c20
parent f62b8cb02d3991e2bc51e8d064f3d85b2b8da6d4
Author: Drew DeVault <sir@cmpwn.com>
Date: Sat, 13 Feb 2021 14:27:09 -0500
hare::lex: lex1
Diffstat:
A | bufio/fixed.ha | | | 32 | ++++++++++++++++++++++++++++++++ |
M | hare/lex/lex.ha | | | 139 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- |
M | hare/lex/token.ha | | | 130 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- |
3 files changed, 280 insertions(+), 21 deletions(-)
diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary
+export type fixed_stream = struct {
+ stream: io::stream,
+ buf: []u8,
+};
+
+export fn fixed(in: []u8) *io::stream = {
+ let s = alloc(*fixed_stream, fixed_stream {
+ stream = io::stream {
+ name = "<bufio::fixed>",
+ reader = &fixed_read,
+ ...
+ },
+ buf = in,
+ });
+ return &s.stream;
+};
+
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = {
+ let stream = s: *fixed_stream;
+ if (len(stream.buf) == 0) {
+ return io::EOF;
+ };
+ const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf);
+ // TODO: Fix me up once slice copying is in
+ rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+ stream.buf = stream.buf[n..];
+ return n;
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -1,4 +1,6 @@
// hare::lex provides a lexer for Hare source code.
+use ascii;
+use bufio;
use io;
use strings;
use types;
@@ -7,22 +9,35 @@ use types;
export type lexer = struct {
in: *io::stream,
path: str,
- loc: linecol,
+ loc: (uint, uint),
un: ((token, location) | void),
rb: [2](rune | io::EOF | void),
};
+// A syntax error
+export type syntax = location;
+
+// All possible lexer errors
+export type error = (io::error | syntax);
+
+export fn errstr(err: error) const str = {
+ return match (err) {
+ err: io::error => io::errstr(err),
+ syntax => "Syntax error", // TODO: add line info
+ };
+};
+
// Initializes a new lexer for the given input stream. The path is borrowed.
export fn lexer_init(in: *io::stream, path: str) lexer = lexer {
in = in,
path = path,
- loc = linecol { line = 1, col = 1 },
+ loc = (1, 1),
un = void,
rb = [void...],
};
// Returns the next token from the lexer.
-export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
match (lex.un) {
tok: (token, location) => {
lex.un = void;
@@ -31,6 +46,50 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
void => void,
};
+ let loc = mkloc(lex);
+ let r: rune = match (next(lex)) {
+ e: io::error => return e,
+ io::EOF => return io::EOF,
+ r: rune => r,
+ };
+
+ if (ascii::isalpha(r) || r == '_' || r == '@') {
+ unget(lex, r);
+ abort(); // TODO: Keywords/names
+ };
+ if (ascii::isdigit(r)) {
+ unget(lex, r);
+ abort(); // TODO: Literals
+ };
+
+ let tok: token = switch (r) {
+ * => return mkloc(lex),
+ '"', '\'' => abort(), // TODO: Strings/runes
+ '.', '<', '>' => return lex3(lex, r),
+ '^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
+ return lex2(lex, r);
+ },
+ '~' => btoken::BNOT,
+ ',' => btoken::COMMA,
+ '{' => btoken::LBRACE,
+ '[' => btoken::LBRACKET,
+ '(' => btoken::LPAREN,
+ '}' => btoken::RBRACE,
+ ']' => btoken::RBRACKET,
+ ')' => btoken::RPAREN,
+ ';' => btoken::SEMICOLON,
+ };
+
+ return (tok, loc);
+};
+
+fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
+ abort();
+ return io::EOF; // TODO
+};
+
+fn lex2(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
+ abort();
return io::EOF; // TODO
};
@@ -51,10 +110,34 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = {
return r;
},
};
- return match (io::getrune(lex.in)) {
- r: rune => r,
- io::EOF => io::EOF,
- err: io::error => err,
+
+ for (true) {
+ return match (io::getrune(lex.in)) {
+ io::EOF => io::EOF,
+ err: io::error => err,
+ r: rune => {
+ lexloc(lex, r);
+ if (ascii::isspace(r)) continue;
+ r;
+ },
+ };
+ };
+
+ abort("unreachable");
+};
+
+fn lexloc(lex: *lexer, r: rune) void = {
+ switch (r) {
+ '\n' => {
+ lex.loc.0 += 1;
+ lex.loc.1 = 1;
+ },
+ '\t' => {
+ lex.loc.1 += 8;
+ },
+ * => {
+ lex.loc.1 += 1;
+ },
};
};
@@ -66,6 +149,12 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
lex.rb[0] = r;
};
+fn mkloc(lex: *lexer) location = location {
+ path = lex.path,
+ line = lex.loc.0,
+ col = lex.loc.1,
+};
+
@test fn unget() void = {
let lexer = lexer_init(io::empty, "<test>");
unget(&lexer, 'x');
@@ -77,15 +166,37 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
@test fn unlex() void = {
let lexer = lexer_init(io::empty, "<test>");
- unlex(&lexer, (base_token::IF, location {
+ unlex(&lexer, (btoken::IF, location {
path = "<test>",
- start = linecol { line = 1234, col = 1234 },
- end = linecol { line = 1234, col = 1234 },
+ line = 1234,
+ col = 1234,
}));
let t = lex(&lexer) as (token, location);
- assert(t.0 is base_token);
- assert(t.0 as base_token == base_token::IF);
+ assert(t.0 is btoken);
+ assert(t.0 as btoken == btoken::IF);
assert(t.1.path == "<test>");
- assert(t.1.start.line == 1234 && t.1.start.col == 1234);
- assert(t.1.end.line == 1234 && t.1.end.col == 1234);
+ assert(t.1.line == 1234 && t.1.col == 1234);
+};
+
+@test fn lex1() void = {
+ const in = "~,{[(}]);";
+ const expected = [
+ btoken::BNOT,
+ btoken::COMMA,
+ btoken::LBRACE,
+ btoken::LBRACKET,
+ btoken::LPAREN,
+ btoken::RBRACE,
+ btoken::RBRACKET,
+ btoken::RPAREN,
+ btoken::SEMICOLON,
+ ];
+ let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>");
+ for (let i = 0z; i < len(expected); i += 1) {
+ let tl = lex(&lexer) as (token, location);
+ let tok = tl.0, loc = tl.1;
+ assert(tok as btoken == expected[i]);
+ assert(loc.path == "<test>");
+ assert(loc.line == 1 && loc.col == i + 1);
+ };
};
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -1,5 +1,6 @@
// A token with no additional context, such as '+'
-export type base_token = enum {
+export type btoken = enum {
+ // Keep ordered with bmap
// Alpha sorted
ATTR_FINI,
ATTR_INIT,
@@ -111,6 +112,116 @@ export type base_token = enum {
BXOREQ,
};
+const bmap: [_]str = [
+ // Keep ordered with btoken
+ "@fini",
+ "@init",
+ "@noreturn",
+ "@symbol",
+ "@test",
+ "_",
+ "abort",
+ "alloc",
+ "append",
+ "as",
+ "assert",
+ "bool",
+ "break",
+ "char",
+ "const",
+ "continue",
+ "def",
+ "defer",
+ "else",
+ "enum",
+ "export",
+ "f32",
+ "f64",
+ "false",
+ "fn",
+ "for",
+ "free",
+ "i16",
+ "i32",
+ "i64",
+ "i8",
+ "if",
+ "int",
+ "is",
+ "len",
+ "let",
+ "match",
+ "null",
+ "nullable",
+ "offset",
+ "return",
+ "rune",
+ "size",
+ "static",
+ "str",
+ "struct",
+ "switch",
+ "true",
+ "type",
+ "u16",
+ "u32",
+ "u64",
+ "u8",
+ "uint",
+ "uintptr",
+ "union",
+ "use",
+ "void",
+ "&=",
+ "&",
+ "~",
+ "|",
+ "case",
+ ":",
+ ",",
+ "/",
+ "/=",
+ ".",
+ "::",
+ "...",
+ "=",
+ ">",
+ ">=",
+ "&&",
+ "{",
+ "[",
+ "==",
+ "<",
+ "<=",
+ "!",
+ "||",
+ "(",
+ "<<",
+ "<<=",
+ "^^",
+ "-",
+ "-=",
+ "--",
+ "%=",
+ "%",
+ "!=",
+ "|=",
+ "+",
+ "+=",
+ "++",
+ "}",
+ "]",
+ ")",
+ ">>",
+ ">>=",
+ ";",
+ "..",
+ "*",
+ "*=",
+ "^",
+ "^=",
+];
+
// A loop label, such as ':example'
export type label = str;
@@ -147,15 +258,20 @@ export type literal = struct {
},
};
-// A tuple of a line number and column number, counting from 1.
-export type linecol = struct { line: uint, col: uint };
-
// A location within a source file.
export type location = struct {
path: str,
- start: linecol,
- end: linecol,
+ line: uint,
+ col: uint
};
// A single lexical token.
-export type token = (base_token | label | name | literal);
+export type token = (btoken | label | name | literal);
+
+// Converts a token to its string representation
+export fn tokstr(tok: token) const str = {
+ return match (tok) {
+ b: btoken => bmap[b: int],
+ * => abort(), // TODO
+ };
+};