hare::lex: lex2 - hare - The Hare programming language

commit 1bdaa5ffcccd5de266f7837ba7b7009ebbeb9886
parent 6085c10bebe42fd7dcb41734c86d122e5b8736fe
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sun, 14 Feb 2021 16:46:26 -0500

hare::lex: lex2

Diffstat:
A hare/lex/+test.ha  | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M hare/lex/lex.ha  | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------

2 files changed, 207 insertions(+), 57 deletions(-)
diff --git a/hare/lex/+test.ha b/hare/lex/+test.ha
@@ -0,0 +1,101 @@
+use bufio;
+use fmt;
+use io;
+use strings;
+
+@test fn unget() void = {
+	let lexer = lexer_init(bufio::fixed(strings::to_utf8("z")), "<test>");
+	unget(&lexer, 'x');
+	unget(&lexer, 'y'); // ungot runes come back last-in, first-out
+	assert(next(&lexer) as rune == 'y');
+	assert(next(&lexer) as rune == 'x');
+	assert(next(&lexer) as rune == 'z'); // then the underlying input
+	assert(next(&lexer) is io::EOF);
+	unget(&lexer, io::EOF); // EOF can be ungot as well
+	assert(next(&lexer) is io::EOF);
+};
+
+@test fn unlex() void = {
+	let lexer = lexer_init(io::empty, "<test>");
+	unlex(&lexer, (btoken::IF, location {
+		path = "<test>",
+		line = 1234,
+		col = 1234,
+	}));
+	let t = lex(&lexer) as (token, location); // must be the unlexed pair
+	assert(t.0 is btoken);
+	assert(t.0 as btoken == btoken::IF);
+	assert(t.1.path == "<test>");
+	assert(t.1.line == 1234 && t.1.col == 1234);
+};
+
+fn lextest(in: str, expected: [](uint, uint, token)) void = {
+	let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>");
+	for (let i = 0z; i < len(expected); i += 1) {
+		let eline = expected[i].0, ecol = expected[i].1,
+			etok = expected[i].2;
+		let tl = lex(&lexer) as (token, location);
+		let tok = tl.0, loc = tl.1;
+		match (tok) {
+			b: btoken => if (etok as btoken != b) {
+				fmt::errorln("bad token at {}: got {}, wanted {}",
+					i, tokstr(tok), tokstr(etok));
+				abort();
+			},
+			* => abort("TODO"), // only btoken results are compared
+		};
+		assert(loc.path == "<test>");
+		if (loc.line != eline || loc.col != ecol) {
+			fmt::errorln("bad line/col at {}: got {},{}; wanted {},{}",
+				i, loc.line, loc.col, eline, ecol);
+			abort();
+		};
+	};
+	assert(lex(&lexer) is io::EOF); // nothing may be left over
+};
+
+@test fn lex1() void = {
+	const in = "~,{[(}]);"; // one-rune tokens, no whitespace
+	const expected: [_](uint, uint, token) = [
+		(1, 1, btoken::BNOT),
+		(1, 2, btoken::COMMA),
+		(1, 3, btoken::LBRACE),
+		(1, 4, btoken::LBRACKET),
+		(1, 5, btoken::LPAREN),
+		(1, 6, btoken::RBRACE),
+		(1, 7, btoken::RBRACKET),
+		(1, 8, btoken::RPAREN),
+		(1, 9, btoken::SEMICOLON),
+	];
+	lextest(in, expected);
+};
+
+@test fn lex2() void = {
+	// The input ends with a lone '=' to cover '=' followed by EOF
+	const in = "^ ^^ ^= * *= % %= + += - -= : :: & && &= | || |= = == =";
+	const expected: [_](uint, uint, token) = [
+		(1, 1,  btoken::BXOR),
+		(1, 3,  btoken::LXOR),
+		(1, 6,  btoken::BXOREQ),
+		(1, 9,  btoken::TIMES),
+		(1, 11,  btoken::TIMESEQ),
+		(1, 14, btoken::MODULO),
+		(1, 16, btoken::MODEQ),
+		(1, 19,  btoken::PLUS),
+		(1, 21,  btoken::PLUSEQ),
+		(1, 24,  btoken::MINUS),
+		(1, 26,  btoken::MINUSEQ),
+		(1, 29,  btoken::COLON),
+		(1, 31,  btoken::DOUBLE_COLON),
+		(1, 34,  btoken::BAND),
+		(1, 36,  btoken::LAND),
+		(1, 39,  btoken::ANDEQ),
+		(1, 42,  btoken::BOR),
+		(1, 44,  btoken::LOR),
+		(1, 47,  btoken::OREQ),
+		(1, 50,  btoken::EQUAL),
+		(1, 52,  btoken::LEQUAL),
+		(1, 55,  btoken::EQUAL),
+	];
+	lextest(in, expected);
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -1,9 +1,8 @@
 // hare::lex provides a lexer for Hare source code.
 use ascii;
-use bufio;
 use io;
 use strings;
-use types;
+use fmt;
 
 // State associated with a lexer.
 export type lexer = struct {
@@ -46,11 +45,14 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 		void => void,
 	};
 
-	let loc = mkloc(lex);
-	let r: rune = match (next(lex)) {
+	let loc = location { ... };
+	let r: rune = match (nextw(lex)) {
 		e: io::error => return e,
 		io::EOF => return io::EOF,
-		r: rune => r,
+		r: (rune, location) => {
+			loc = r.1;
+			r.0;
+		},
 	};
 
 	if (ascii::isalpha(r) || r == '_' || r == '@') {
@@ -63,11 +65,11 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 	};
 
 	let tok: token = switch (r) {
-		* => return mkloc(lex),
+		* => return syntaxerr(loc),
 		'"', '\'' => abort(), // TODO: Strings/runes
 		'.', '<', '>' => return lex3(lex, r),
 		'^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
-			return lex2(lex, r);
+			return lex2(lex, loc, r);
 		},
 		'~' => btoken::BNOT,
 		',' => btoken::COMMA,
@@ -82,12 +84,90 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 	return (tok, loc);
 };
 
-fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
-	abort();
-	return io::EOF; // TODO
+fn lex2(
+	lex: *lexer,
+	loc: location,
+	r: rune,
+) ((token, location) | io::EOF | error) = {
+	let n = match (next(lex)) { // one rune of lookahead after r
+		err: io::error => return err,
+		io::EOF => io::EOF, // kept: a one-rune token may end the input
+		r: rune => r,
+	};
+	let tok: token = switch (r) {
+		'^' => match (n) {
+			r: rune => switch (r) {
+				'^' => return (btoken::LXOR: token, loc),
+				'=' => return (btoken::BXOREQ: token, loc),
+				*   => btoken::BXOR,
+			},
+			io::EOF => btoken::BXOR,
+		},
+		'*' => match (n) {
+			r: rune => switch (r) {
+				'=' => return (btoken::TIMESEQ: token, loc),
+				*   => btoken::TIMES,
+			},
+			io::EOF => btoken::TIMES,
+		},
+		'%' => match (n) {
+			r: rune => switch (r) {
+				'=' => return (btoken::MODEQ: token, loc),
+				*   => btoken::MODULO,
+			},
+			io::EOF => btoken::MODULO,
+		},
+		'+' => match (n) {
+			r: rune => switch (r) {
+				'=' => return (btoken::PLUSEQ: token, loc),
+				*   => btoken::PLUS,
+			},
+			io::EOF => btoken::PLUS,
+		},
+		'-' => match (n) {
+			r: rune => switch (r) {
+				'=' => return (btoken::MINUSEQ: token, loc),
+				*   => btoken::MINUS,
+			},
+			io::EOF => btoken::MINUS,
+		},
+		':' => match (n) {
+			r: rune => switch (r) {
+				':' => return (btoken::DOUBLE_COLON: token, loc),
+				*   => btoken::COLON,
+			},
+			io::EOF => btoken::COLON,
+		},
+		'&' => match (n) {
+			r: rune => switch (r) {
+				'&' => return (btoken::LAND: token, loc),
+				'=' => return (btoken::ANDEQ: token, loc),
+				*   => btoken::BAND,
+			},
+			io::EOF => btoken::BAND,
+		},
+		'|' => match (n) {
+			r: rune => switch (r) {
+				'|' => return (btoken::LOR: token, loc),
+				'=' => return (btoken::OREQ: token, loc),
+				*   => btoken::BOR,
+			},
+			io::EOF => btoken::BOR,
+		},
+		'=' => match (n) {
+			r: rune => switch (r) {
+				'=' => return (btoken::LEQUAL: token, loc),
+				*   => btoken::EQUAL,
+			},
+			io::EOF => btoken::EQUAL,
+		},
+		* => return syntaxerr(loc), // '/' and '!' from lex() fall here
+	};
+	unget(lex, n); // one-rune token: give the lookahead back
+	return (tok, loc);
 };
 
-fn lex2(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
+fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
 	abort();
 	return io::EOF; // TODO
 };
@@ -116,7 +196,6 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = {
 			err: io::error => err,
 			r: rune => {
 				lexloc(lex, r);
-				if (ascii::isspace(r)) continue;
 				r;
 			},
 		};
@@ -125,6 +204,20 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = {
 	abort("unreachable");
 };
 
+fn nextw(lex: *lexer) ((rune, location) | io::EOF | io::error) = {
+	for (true) {
+		let loc = mkloc(lex); // taken before the rune is consumed
+		match (next(lex)) {
+			err: io::error => return err,
+			io::EOF => return io::EOF,
+			r: rune => if (!ascii::isspace(r)) {
+				return (r, loc);
+			},
+		};
+	};
+	abort(); // unreachable: the loop only exits via return
+};
+
 fn lexloc(lex: *lexer, r: rune) void = {
 	switch (r) {
 		'\n' => {
@@ -154,48 +247,4 @@ fn mkloc(lex: *lexer) location = location {
 	col = lex.loc.1,
 };
 
-@test fn unget() void = {
-	let lexer = lexer_init(io::empty, "<test>");
-	unget(&lexer, 'x');
-	unget(&lexer, 'y');
-	assert(next(&lexer) as rune == 'y');
-	assert(next(&lexer) as rune == 'x');
-	assert(next(&lexer) is io::EOF);
-};
-
-@test fn unlex() void = {
-	let lexer = lexer_init(io::empty, "<test>");
-	unlex(&lexer, (btoken::IF, location {
-		path = "<test>",
-		line = 1234,
-		col = 1234,
-	}));
-	let t = lex(&lexer) as (token, location);
-	assert(t.0 is btoken);
-	assert(t.0 as btoken == btoken::IF);
-	assert(t.1.path == "<test>");
-	assert(t.1.line == 1234 && t.1.col == 1234);
-};
-
-@test fn lex1() void = {
-	const in = "~,{[(}]);";
-	const expected = [
-		btoken::BNOT,
-		btoken::COMMA,
-		btoken::LBRACE,
-		btoken::LBRACKET,
-		btoken::LPAREN,
-		btoken::RBRACE,
-		btoken::RBRACKET,
-		btoken::RPAREN,
-		btoken::SEMICOLON,
-	];
-	let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>");
-	for (let i = 0z; i < len(expected); i += 1) {
-		let tl = lex(&lexer) as (token, location);
-		let tok = tl.0, loc = tl.1;
-		assert(tok as btoken == expected[i]);
-		assert(loc.path == "<test>");
-		assert(loc.line == 1 && loc.col == i + 1);
-	};
-};
+fn syntaxerr(loc: location) error = loc: syntax: error;

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	hare/lex/+test.ha	\|	101	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	hare/lex/lex.ha	\|	163	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------