commit 86d5d5c432fd86fa4ccebdfecdbebcf27820f4a5
parent 159dbb469fbefb328a0adf5cb1de1fb2d199115c
Author: Drew DeVault <sir@cmpwn.com>
Date: Sat, 13 Feb 2021 11:35:33 -0500
hare::lex: initial riggings
Diffstat:
A | bufio/fixed.ha | | | 32 | ++++++++++++++++++++++++++++++++ |
A | hare/lex/lex.ha | | | 78 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | hare/lex/token.ha | | | 161 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 271 insertions(+), 0 deletions(-)
diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary
+//
+// A stream which reads from a caller-supplied byte slice. The io::stream is
+// the first member because [fixed] hands out a pointer to it and fixed_read
+// casts that pointer back to *fixed_stream.
+export type fixed_stream = struct {
+ stream: io::stream,
+ // Bytes not yet consumed; fixed_read re-slices this as data is read.
+ buf: []u8,
+};
+
+// Allocates a [fixed_stream] which reads from the given slice and returns a
+// pointer to its embedded io::stream. The slice is stored as given, not copied.
+// Only the name and reader are set explicitly; the remaining io::stream members
+// are left to the "..." default.
+export fn fixed(in: []u8) *io::stream = {
+	const base = io::stream {
+		name = "<bufio::fixed>",
+		reader = &fixed_read,
+		...
+	};
+	let fs = alloc(*fixed_stream, fixed_stream {
+		stream = base,
+		buf = in,
+	});
+	return &fs.stream;
+};
+
+// Reader for streams created by [fixed]. Copies as many unread bytes as fit
+// into buf, advances past them and returns the number of bytes copied. Returns
+// io::EOF once no unread bytes remain.
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = {
+	let fs = s: *fixed_stream;
+	const avail = len(fs.buf);
+	if (avail == 0) {
+		return io::EOF;
+	};
+	// Never copy more than the caller's buffer can hold.
+	const count = if (avail < len(buf)) avail else len(buf);
+	// TODO: Fix me up once slice copying is in
+	rt::memcpy(buf: *[*]u8, fs.buf: *[*]u8, count);
+	fs.buf = fs.buf[count..];
+	return count;
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -0,0 +1,78 @@
+// hare::lex provides a lexer for Hare source code.
+use bufio;
+use io;
+use strings;
+use types;
+use fmt;
+
+// State associated with a lexer.
+export type lexer = struct {
+ // Input stream the runes are read from.
+ in: *io::stream,
+ // Path of the input; borrowed from the caller of [lexer_init].
+ path: str,
+ // Line/column position; [lexer_init] starts it at (1, 1).
+ loc: linecol,
+ // Token pushed back by [unlex], or void when there is none.
+ un: ((token, location) | void),
+ // Runes pushed back by unget; rb[0] is handed out first by next, and a void
+ // slot is empty.
+ rb: [2](rune | io::EOF | void),
+};
+
+// Creates a lexer which reads from the given input stream. The path is borrowed
+// from the caller. The new lexer starts at line 1, column 1, with no unlexed
+// token and no ungot runes.
+export fn lexer_init(in: *io::stream, path: str) lexer = {
+	return lexer {
+		in = in,
+		path = path,
+		loc = (1, 1),
+		un = void,
+		rb = [void...],
+	};
+};
+
+// Returns the next token from the lexer. A token previously handed to [unlex]
+// is returned first and the unlex slot is cleared. Actual lexing is not
+// implemented yet, so every other call reports io::EOF.
+export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+	if (!(lex.un is void)) {
+		const pending = lex.un as (token, location);
+		lex.un = void;
+		return pending;
+	};
+
+	return io::EOF; // TODO
+};
+
+// Unlex a single token. The next call to [lex] will return this token, location
+// pair. Only one unlex is supported at a time; you must call [lex] before
+// calling [unlex] again.
+export fn unlex(lex: *lexer, tok: (token, location)) void = {
+ // A second unlex without an intervening [lex] trips this assertion.
+ assert(lex.un is void, "attempted to unlex more than one token");
+ // Stash the pair; [lex] hands it back and resets lex.un to void.
+ lex.un = tok;
+};
+
+// Returns the next rune (or io::EOF) for the lexer. Anything pushed back with
+// unget is returned first, most recently ungot first; otherwise a rune is read
+// from the input stream.
+fn next(lex: *lexer) (rune | io::EOF | io::error) = {
+	match (lex.rb[0]) {
+		pending: (rune | io::EOF) => {
+			// Move the older ungot entry (if any) to the front slot.
+			lex.rb[0] = lex.rb[1];
+			lex.rb[1] = void;
+			return pending;
+		},
+		void => void,
+	};
+	const got = io::getrune(lex.in);
+	return match (got) {
+		r: rune => r,
+		io::EOF => io::EOF,
+		e: io::error => e,
+	};
+};
+
+// Pushes a rune (or io::EOF) back so that the following call to next returns
+// it. At most two entries can be pending at once; a third trips the assertion.
+fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
+	if (lex.rb[0] is void) {
+		lex.rb[0] = r;
+	} else {
+		assert(lex.rb[1] is void, "ungot too many runes");
+		// Keep the older entry behind the new one.
+		lex.rb[1] = lex.rb[0];
+		lex.rb[0] = r;
+	};
+};
+
+// Checks that ungot runes come back from next in reverse order, followed by
+// io::EOF from the empty backing stream.
+@test fn unget() void = {
+	let lx = lexer_init(bufio::fixed([]), "<test>");
+	unget(&lx, 'x');
+	unget(&lx, 'y');
+	const first = next(&lx) as rune;
+	assert(first == 'y');
+	const second = next(&lx) as rune;
+	assert(second == 'x');
+	assert(next(&lx) is io::EOF);
+};
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -0,0 +1,161 @@
+// A token with no additional context, such as '+'
+export type base_token = enum {
+ // Alpha sorted
+ ATTR_FINI,
+ ATTR_INIT,
+ ATTR_NORETURN,
+ ATTR_SYMBOL,
+ ATTR_TEST,
+ UNDERSCORE,
+ ABORT,
+ ALLOC,
+ APPEND,
+ AS,
+ ASSERT,
+ BOOL,
+ BREAK,
+ CHAR,
+ CONST,
+ CONTINUE,
+ DEF,
+ DEFER,
+ ELSE,
+ ENUM,
+ EXPORT,
+ F32,
+ F64,
+ FALSE,
+ FN,
+ FOR,
+ FREE,
+ I16,
+ I32,
+ I64,
+ I8,
+ IF,
+ INT,
+ IS,
+ LEN,
+ LET,
+ MATCH,
+ NULL,
+ NULLABLE,
+ OFFSET,
+ RETURN,
+ RUNE,
+ SIZE,
+ STATIC,
+ STR,
+ STRUCT,
+ SWITCH,
+ TRUE,
+ TYPE,
+ U16,
+ U32,
+ U64,
+ U8,
+ UINT,
+ UINTPTR,
+ UNION,
+ USE,
+ VOID,
+
+ // Operators
+ // NOTE(review): BXOR and BXOREQ sit after TIMESEQ, unlike the otherwise
+ // name-ordered entries in this group - confirm whether that is intended.
+ ANDEQ,
+ BAND,
+ BNOT,
+ BOR,
+ CASE,
+ COLON,
+ COMMA,
+ DIV,
+ DIVEQ,
+ DOT,
+ DOUBLE_COLON,
+ ELLIPSIS,
+ EQUAL,
+ GREATER,
+ GREATEREQ,
+ LAND,
+ LBRACE,
+ LBRACKET,
+ LEQUAL,
+ LESS,
+ LESSEQ,
+ LNOT,
+ LOR,
+ LPAREN,
+ LSHIFT,
+ LSHIFTEQ,
+ LXOR,
+ MINUS,
+ MINUSEQ,
+ MINUSMINUS,
+ MODEQ,
+ MODULO,
+ NEQUAL,
+ OREQ,
+ PLUS,
+ PLUSEQ,
+ PLUSPLUS,
+ RBRACE,
+ RBRACKET,
+ RPAREN,
+ RSHIFT,
+ RSHIFTEQ,
+ SEMICOLON,
+ SLICE,
+ TIMES,
+ TIMESEQ,
+ BXOR,
+ BXOREQ,
+};
+
+// A loop label, such as ':example'. One of the alternatives of [token];
+// presumably a separate type so it stays distinct from [name] there - confirm.
+export type label = str;
+
+// A name, such as 'example'. One of the alternatives of [token]; presumably a
+// separate type so it stays distinct from [label] there - confirm.
+export type name = str;
+
+// The type of a literal token, such as '1337u32' (U32)
+//
+// NOTE(review): [literal] also has string and _rune members, but there is no
+// STR or RUNE entry here - confirm how those literals are meant to be tagged.
+export type literal_type = enum {
+ U8,
+ U16,
+ U32,
+ U64,
+ UINT,
+ UINTPTR,
+ I8,
+ I16,
+ I32,
+ I64,
+ INT,
+ F32,
+ F64,
+ VOID,
+};
+
+// A token for a literal value, such as '1337u32'
+export type literal = struct {
+ // The type of the literal, e.g. U32 for '1337u32'.
+ storage: literal_type,
+ // The literal's value. Presumably storage determines which member is
+ // meaningful - TODO confirm against the code that fills this in.
+ union {
+ string: str,
+ _rune: rune,
+ _int: i64,
+ _uint: u64,
+ float: f64,
+ },
+};
+
+// A tuple of a line number and column number, counting from 1. The line number
+// is the first element and the column number the second.
+export type linecol = (uint, uint);
+
+// A location within a source file.
+export type location = struct {
+ // Path of the source file.
+ path: str,
+ // Line/column where the location begins.
+ start: linecol,
+ // Line/column where the location ends.
+ // NOTE(review): whether this is inclusive is not shown here - confirm.
+ end: linecol,
+};
+
+// A single lexical token: a context-free [base_token], a [label], a [name] or
+// a [literal].
+export type token = (base_token | label | name | literal);