hare::lex: initial riggings - hare - The Hare programming language

commit 86d5d5c432fd86fa4ccebdfecdbebcf27820f4a5
parent 159dbb469fbefb328a0adf5cb1de1fb2d199115c
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat, 13 Feb 2021 11:35:33 -0500

hare::lex: initial riggings

Diffstat:
A bufio/fixed.ha  | 32 ++++++++++++++++++++++++++++++++
A hare/lex/lex.ha  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A hare/lex/token.ha  | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 271 insertions(+), 0 deletions(-)
diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary
+export type fixed_stream = struct {
+	stream: io::stream,
+	buf: []u8,
+};
+
+export fn fixed(in: []u8) *io::stream = {
+	let s = alloc(*fixed_stream, fixed_stream {
+		stream = io::stream {
+			name = "<bufio::fixed>",
+			reader = &fixed_read,
+			...
+		},
+		buf = in,
+	});
+	return &s.stream;
+};
+
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = {
+	let stream = s: *fixed_stream;
+	if (len(stream.buf) == 0) {
+		return io::EOF;
+	};
+	const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf);
+	// TODO: Fix me up once slice copying is in
+	rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+	stream.buf = stream.buf[n..];
+	return n;
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -0,0 +1,78 @@
+// hare::lex provides a lexer for Hare source code.
+use bufio;
+use io;
+use strings;
+use types;
+use fmt;
+
+// State associated with a lexer.
+export type lexer = struct {
+	in: *io::stream,
+	path: str,
+	loc: linecol,
+	un: ((token, location) | void),
+	rb: [2](rune | io::EOF | void),
+};
+
+// Initializes a new lexer for the given input stream. The path is borrowed.
+export fn lexer_init(in: *io::stream, path: str) lexer = lexer {
+	in = in,
+	path = path,
+	loc = (1, 1),
+	un = void,
+	rb = [void...],
+};
+
+// Returns the next token from the lexer.
+export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+	match (lex.un) {
+		tok: (token, location) => {
+			lex.un = void;
+			return tok;
+		},
+		void => void,
+	};
+
+	return io::EOF; // TODO
+};
+
+// Unlex a single token. The next call to [lex] will return this token, location
+// pair. Only one unlex is supported at a time; you must call [lex] before
+// calling [unlex] again.
+export fn unlex(lex: *lexer, tok: (token, location)) void = {
+	assert(lex.un is void, "attempted to unlex more than one token");
+	lex.un = tok;
+};
+
+fn next(lex: *lexer) (rune | io::EOF | io::error) = {
+	match (lex.rb[0]) {
+		void => void,
+		r: (rune | io::EOF) => {
+			lex.rb[0] = lex.rb[1];
+			lex.rb[1] = void;
+			return r;
+		},
+	};
+	return match (io::getrune(lex.in)) {
+		r: rune => r,
+		io::EOF => io::EOF,
+		err: io::error => err,
+	};
+};
+
+fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
+	if (!(lex.rb[0] is void)) {
+		assert(lex.rb[1] is void, "ungot too many runes");
+		lex.rb[1] = lex.rb[0];
+	};
+	lex.rb[0] = r;
+};
+
+@test fn unget() void = {
+	let lexer = lexer_init(bufio::fixed([]), "<test>");
+	unget(&lexer, 'x');
+	unget(&lexer, 'y');
+	assert(next(&lexer) as rune == 'y');
+	assert(next(&lexer) as rune == 'x');
+	assert(next(&lexer) is io::EOF);
+};
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -0,0 +1,161 @@
+// A token with no additional context, such as '+'
+export type base_token = enum {
+	// Alpha sorted
+	ATTR_FINI,
+	ATTR_INIT,
+	ATTR_NORETURN,
+	ATTR_SYMBOL,
+	ATTR_TEST,
+	UNDERSCORE,
+	ABORT,
+	ALLOC,
+	APPEND,
+	AS,
+	ASSERT,
+	BOOL,
+	BREAK,
+	CHAR,
+	CONST,
+	CONTINUE,
+	DEF,
+	DEFER,
+	ELSE,
+	ENUM,
+	EXPORT,
+	F32,
+	F64,
+	FALSE,
+	FN,
+	FOR,
+	FREE,
+	I16,
+	I32,
+	I64,
+	I8,
+	IF,
+	INT,
+	IS,
+	LEN,
+	LET,
+	MATCH,
+	NULL,
+	NULLABLE,
+	OFFSET,
+	RETURN,
+	RUNE,
+	SIZE,
+	STATIC,
+	STR,
+	STRUCT,
+	SWITCH,
+	TRUE,
+	TYPE,
+	U16,
+	U32,
+	U64,
+	U8,
+	UINT,
+	UINTPTR,
+	UNION,
+	USE,
+	VOID,
+
+	// Operators
+	ANDEQ,
+	BAND,
+	BNOT,
+	BOR,
+	CASE,
+	COLON,
+	COMMA,
+	DIV,
+	DIVEQ,
+	DOT,
+	DOUBLE_COLON,
+	ELLIPSIS,
+	EQUAL,
+	GREATER,
+	GREATEREQ,
+	LAND,
+	LBRACE,
+	LBRACKET,
+	LEQUAL,
+	LESS,
+	LESSEQ,
+	LNOT,
+	LOR,
+	LPAREN,
+	LSHIFT,
+	LSHIFTEQ,
+	LXOR,
+	MINUS,
+	MINUSEQ,
+	MINUSMINUS,
+	MODEQ,
+	MODULO,
+	NEQUAL,
+	OREQ,
+	PLUS,
+	PLUSEQ,
+	PLUSPLUS,
+	RBRACE,
+	RBRACKET,
+	RPAREN,
+	RSHIFT,
+	RSHIFTEQ,
+	SEMICOLON,
+	SLICE,
+	TIMES,
+	TIMESEQ,
+	BXOR,
+	BXOREQ,
+};
+
+// A loop label, such as ':example'
+export type label = str;
+
+// A name, such as 'example'
+export type name = str;
+
+// The type of a literal token, such as '1337u32' (U32)
+export type literal_type = enum {
+	U8,
+	U16,
+	U32,
+	U64,
+	UINT,
+	UINTPTR,
+	I8,
+	I16,
+	I32,
+	I64,
+	INT,
+	F32,
+	F64,
+	VOID,
+};
+
+// A token for a literal value, such as '1337u32'
+export type literal = struct {
+	storage: literal_type,
+	union {
+		string: str,
+		_rune: rune,
+		_int: i64,
+		_uint: u64,
+		float: f64,
+	},
+};
+
+// A tuple of a line number and column number, counting from 1.
+export type linecol = (uint, uint);
+
+// A location within a source file.
+export type location = struct {
+	path: str,
+	start: linecol,
+	end: linecol,
+};
+
+// A single lexical token.
+export type token = (base_token | label | name | literal);

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	bufio/fixed.ha	\|	32	++++++++++++++++++++++++++++++++
A	hare/lex/lex.ha	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	hare/lex/token.ha	\|	161	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++