hare::lex: lex1 - hare - The Hare programming language

commit 37f47c120bf9cf72a226eabf12462e22df275c20
parent f62b8cb02d3991e2bc51e8d064f3d85b2b8da6d4
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat, 13 Feb 2021 14:27:09 -0500

hare::lex: lex1

Diffstat:
A bufio/fixed.ha  | 32 ++++++++++++++++++++++++++++++++
M hare/lex/lex.ha  | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M hare/lex/token.ha  | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----

3 files changed, 280 insertions(+), 21 deletions(-)
diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary. A stream that reads from an in-memory byte slice.
+export type fixed_stream = struct {
+	stream: io::stream, // first field; fixed_read casts the *io::stream it is given back to *fixed_stream
+	buf: []u8, // bytes not yet read; fixed_read re-slices this as data is consumed
+};
+
+export fn fixed(in: []u8) *io::stream = { // Wraps "in" in a heap-allocated stream whose reader is fixed_read
+	let s = alloc(*fixed_stream, fixed_stream {
+		stream = io::stream {
+			name = "<bufio::fixed>",
+			reader = &fixed_read,
+			... // only name and reader are set explicitly
+		},
+		buf = in, // the slice itself is stored; no copy of the bytes is made here
+	});
+	return &s.stream; // NOTE(review): nothing in this file frees the allocation - confirm ownership
+};
+
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = { // reader callback installed by fixed()
+	let stream = s: *fixed_stream; // recover the enclosing fixed_stream from the stream pointer
+	if (len(stream.buf) == 0) {
+		return io::EOF; // source slice fully consumed
+	};
+	const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf); // smaller of the two lengths
+	// TODO: Fix me up once slice copying is in
+	rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+	stream.buf = stream.buf[n..]; // drop the bytes just handed to the caller
+	return n; // number of bytes copied into buf
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -1,4 +1,6 @@
 // hare::lex provides a lexer for Hare source code.
+use ascii;
+use bufio;
 use io;
 use strings;
 use types;
@@ -7,22 +9,35 @@ use types;
 export type lexer = struct {
 	in: *io::stream,
 	path: str,
-	loc: linecol,
+	loc: (uint, uint),
 	un: ((token, location) | void),
 	rb: [2](rune | io::EOF | void),
 };
 
+// A syntax error, represented by a source location
+export type syntax = location;
+
+// All possible lexer errors: an I/O error or a syntax error
+export type error = (io::error | syntax);
+
+export fn errstr(err: error) const str = { // Returns a human-readable description of a lexer error
+	return match (err) {
+		err: io::error => io::errstr(err), // I/O errors are described by io::errstr
+		syntax => "Syntax error", // TODO: add line info
+	};
+};
+
 // Initializes a new lexer for the given input stream. The path is borrowed.
 export fn lexer_init(in: *io::stream, path: str) lexer = lexer {
 	in = in,
 	path = path,
-	loc = linecol { line = 1, col = 1 },
+	loc = (1, 1),
 	un = void,
 	rb = [void...],
 };
 
 // Returns the next token from the lexer.
-export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 	match (lex.un) {
 		tok: (token, location) => {
 			lex.un = void;
@@ -31,6 +46,50 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
 		void => void,
 	};
 
+	let loc = mkloc(lex);
+	let r: rune = match (next(lex)) {
+		e: io::error => return e,
+		io::EOF => return io::EOF,
+		r: rune => r,
+	};
+
+	if (ascii::isalpha(r) || r == '_' || r == '@') {
+		unget(lex, r);
+		abort(); // TODO: Keywords/names
+	};
+	if (ascii::isdigit(r)) {
+		unget(lex, r);
+		abort(); // TODO: Literals
+	};
+
+	let tok: token = switch (r) {
+		* => return mkloc(lex),
+		'"', '\'' => abort(), // TODO: Strings/runes
+		'.', '<', '>' => return lex3(lex, r),
+		'^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
+			return lex2(lex, r);
+		},
+		'~' => btoken::BNOT,
+		',' => btoken::COMMA,
+		'{' => btoken::LBRACE,
+		'[' => btoken::LBRACKET,
+		'(' => btoken::LPAREN,
+		'}' => btoken::RBRACE,
+		']' => btoken::RBRACKET,
+		')' => btoken::RPAREN,
+		';' => btoken::SEMICOLON,
+	};
+
+	return (tok, loc);
+};
+
+fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = { // stub; lex() sends '.', '<' and '>' here
+	abort(); // not implemented yet: always aborts
+	return io::EOF; // TODO
+};
+
+fn lex2(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = { // stub; lex() sends the runes in its '^' ... '=' case here
+	abort(); // not implemented yet: always aborts
 	return io::EOF; // TODO
 };
 
@@ -51,10 +110,34 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = {
 			return r;
 		},
 	};
-	return match (io::getrune(lex.in)) {
-		r: rune => r,
-		io::EOF => io::EOF,
-		err: io::error => err,
+
+	for (true) {
+		return match (io::getrune(lex.in)) {
+			io::EOF => io::EOF,
+			err: io::error => err,
+			r: rune => {
+				lexloc(lex, r);
+				if (ascii::isspace(r)) continue;
+				r;
+			},
+		};
+	};
+
+	abort("unreachable");
+};
+
+fn lexloc(lex: *lexer, r: rune) void = { // Advances lex.loc, a (line, column) pair, past the rune r
+	switch (r) {
+		'\n' => {
+			lex.loc.0 += 1; // move to the next line
+			lex.loc.1 = 1; // columns restart at 1
+		},
+		'\t' => {
+			lex.loc.1 += 8; // a tab always adds 8 columns; it is not aligned to a tab stop
+		},
+		* => {
+			lex.loc.1 += 1; // any other rune is one column wide
+		},
 	};
 };
 
@@ -66,6 +149,12 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
 	lex.rb[0] = r;
 };
 
+fn mkloc(lex: *lexer) location = location { // Builds a location from the lexer's path and current position
+	path = lex.path,
+	line = lex.loc.0, // first element of loc is the line
+	col = lex.loc.1, // second element of loc is the column
+};
+
 @test fn unget() void = {
 	let lexer = lexer_init(io::empty, "<test>");
 	unget(&lexer, 'x');
@@ -77,15 +166,37 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
 
 @test fn unlex() void = {
 	let lexer = lexer_init(io::empty, "<test>");
-	unlex(&lexer, (base_token::IF, location {
+	unlex(&lexer, (btoken::IF, location {
 		path = "<test>",
-		start = linecol { line = 1234, col = 1234 },
-		end = linecol { line = 1234, col = 1234 },
+		line = 1234,
+		col = 1234,
 	}));
 	let t = lex(&lexer) as (token, location);
-	assert(t.0 is base_token);
-	assert(t.0 as base_token == base_token::IF);
+	assert(t.0 is btoken);
+	assert(t.0 as btoken == btoken::IF);
 	assert(t.1.path == "<test>");
-	assert(t.1.start.line == 1234 && t.1.start.col == 1234);
-	assert(t.1.end.line == 1234 && t.1.end.col == 1234);
+	assert(t.1.line == 1234 && t.1.col == 1234);
+};
+
+@test fn lex1() void = { // Lexes each token that lex() resolves from a single rune
+	const in = "~,{[(}]);";
+	const expected = [
+		btoken::BNOT,
+		btoken::COMMA,
+		btoken::LBRACE,
+		btoken::LBRACKET,
+		btoken::LPAREN,
+		btoken::RBRACE,
+		btoken::RBRACKET,
+		btoken::RPAREN,
+		btoken::SEMICOLON,
+	];
+	let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>"); // lex from the in-memory string
+	for (let i = 0z; i < len(expected); i += 1) {
+		let tl = lex(&lexer) as (token, location);
+		let tok = tl.0, loc = tl.1;
+		assert(tok as btoken == expected[i]);
+		assert(loc.path == "<test>");
+		assert(loc.line == 1 && loc.col == i + 1); // tokens are adjacent on one line, so columns count up from 1
+	};
 };
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -1,5 +1,6 @@
 // A token with no additional context, such as '+'
-export type base_token = enum {
+export type btoken = enum {
+	// Keep ordered with bmap
 	// Alpha sorted
 	ATTR_FINI,
 	ATTR_INIT,
@@ -111,6 +112,116 @@ export type base_token = enum {
 	BXOREQ,
 };
 
+const bmap: [_]str = [
+	// Keep ordered with btoken: tokstr indexes this table by the enum's integer value
+	"@fini",
+	"@init",
+	"@noreturn",
+	"@symbol",
+	"@test",
+	"_",
+	"abort",
+	"alloc",
+	"append",
+	"as",
+	"assert",
+	"bool",
+	"break",
+	"char",
+	"const",
+	"continue",
+	"def",
+	"defer",
+	"else",
+	"enum",
+	"export",
+	"f32",
+	"f64",
+	"false",
+	"fn",
+	"for",
+	"free",
+	"i16",
+	"i32",
+	"i64",
+	"i8",
+	"if",
+	"int",
+	"is",
+	"len",
+	"let",
+	"match",
+	"null",
+	"nullable",
+	"offset",
+	"return",
+	"rune",
+	"size",
+	"static",
+	"str",
+	"struct",
+	"switch",
+	"true",
+	"type",
+	"u16",
+	"u32",
+	"u64",
+	"u8",
+	"uint",
+	"uintptr",
+	"union",
+	"use",
+	"void",
+	"&=",
+	"&",
+	"~",
+	"|",
+	"case", // NOTE(review): the only word among the operator entries; if this slot is the "=>" token the string is wrong - confirm against btoken
+	":",
+	",",
+	"/",
+	"/=",
+	".",
+	"::",
+	"...",
+	"=",
+	">",
+	">=",
+	"&&",
+	"{",
+	"[",
+	"==",
+	"<",
+	"<=",
+	"!",
+	"||",
+	"(",
+	"<<",
+	"<<=",
+	"^^",
+	"-",
+	"-=",
+	"--",
+	"%=",
+	"%",
+	"!=",
+	"|=",
+	"+",
+	"+=",
+	"++",
+	"}",
+	"]",
+	")",
+	">>",
+	">>=",
+	";",
+	"..",
+	"*",
+	"*=",
+	"^",
+	"^=",
+];
+
 // A loop label, such as ':example'
 export type label = str;
 
@@ -147,15 +258,20 @@ export type literal = struct {
 	},
 };
 
-// A tuple of a line number and column number, counting from 1.
-export type linecol = struct { line: uint, col: uint };
-
 // A location within a source file.
 export type location = struct {
 	path: str,
-	start: linecol,
-	end: linecol,
+	line: uint,
+	col: uint
 };
 
 // A single lexical token.
-export type token = (base_token | label | name | literal);
+export type token = (btoken | label | name | literal);
+
+// Converts a token to its string representation. Only btoken is handled so far; any other token aborts.
+export fn tokstr(tok: token) const str = {
+	return match (tok) {
+		b: btoken => bmap[b: int], // look the string up by the enum's integer value
+		* => abort(), // TODO
+	};
+};

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	bufio/fixed.ha	\|	32	++++++++++++++++++++++++++++++++
M	hare/lex/lex.ha	\|	139	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M	hare/lex/token.ha	\|	130	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----