hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 86d5d5c432fd86fa4ccebdfecdbebcf27820f4a5
parent 159dbb469fbefb328a0adf5cb1de1fb2d199115c
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat, 13 Feb 2021 11:35:33 -0500

hare::lex: initial riggings

Diffstat:
A bufio/fixed.ha | 32 ++++++++++++++++++++++++++++++++
A hare/lex/lex.ha | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A hare/lex/token.ha | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 271 insertions(+), 0 deletions(-)

diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary. A stream that reads from a fixed buffer.
+export type fixed_stream = struct {
+	stream: io::stream,
+	buf: []u8,
+};
+
+export fn fixed(in: []u8) *io::stream = {
+	let s = alloc(*fixed_stream, fixed_stream {
+		stream = io::stream {
+			name = "<bufio::fixed>",
+			reader = &fixed_read,
+			...
+		},
+		buf = in,
+	});
+	return &s.stream;
+};
+
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = {
+	let stream = s: *fixed_stream; // assumes s was created by fixed()
+	if (len(stream.buf) == 0) {
+		return io::EOF;
+	};
+	const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf);
+	// TODO: Fix me up once slice copying is in
+	rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+	stream.buf = stream.buf[n..]; // drop the bytes just handed out
+	return n;
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -0,0 +1,78 @@
+// hare::lex provides a lexer for Hare source code.
+use bufio;
+use io;
+use strings;
+use types;
+use fmt;
+
+// State associated with a lexer.
+export type lexer = struct {
+	in: *io::stream,
+	path: str,
+	loc: linecol,
+	un: ((token, location) | void), // token pushed back by unlex, if any
+	rb: [2](rune | io::EOF | void), // runes pushed back by unget
+};
+
+// Initializes a new lexer for the given input stream. The path is borrowed.
+export fn lexer_init(in: *io::stream, path: str) lexer = lexer {
+	in = in,
+	path = path,
+	loc = (1, 1),
+	un = void,
+	rb = [void...],
+};
+
+// Returns the next token from the lexer.
+export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+	match (lex.un) {
+		tok: (token, location) => {
+			lex.un = void;
+			return tok;
+		},
+		void => void,
+	};
+
+	return io::EOF; // TODO: only unlexed tokens are returned so far
+};
+
+// Unlex a single token. The next call to [lex] will return this token, location
+// pair. Only one unlex is supported at a time; you must call [lex] before
+// calling [unlex] again.
+export fn unlex(lex: *lexer, tok: (token, location)) void = {
+	assert(lex.un is void, "attempted to unlex more than one token");
+	lex.un = tok;
+};
+
+fn next(lex: *lexer) (rune | io::EOF | io::error) = {
+	match (lex.rb[0]) {
+		void => void, // nothing pushed back; read from the stream below
+		r: (rune | io::EOF) => {
+			lex.rb[0] = lex.rb[1];
+			lex.rb[1] = void;
+			return r;
+		},
+	};
+	return match (io::getrune(lex.in)) {
+		r: rune => r,
+		io::EOF => io::EOF,
+		err: io::error => err,
+	};
+};
+
+fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
+	if (!(lex.rb[0] is void)) {
+		assert(lex.rb[1] is void, "ungot too many runes");
+		lex.rb[1] = lex.rb[0]; // keep the earlier rune behind the new one
+	};
+	lex.rb[0] = r;
+};
+
+@test fn unget() void = {
+	let lexer = lexer_init(bufio::fixed([]), "<test>");
+	unget(&lexer, 'x');
+	unget(&lexer, 'y');
+	assert(next(&lexer) as rune == 'y');
+	assert(next(&lexer) as rune == 'x');
+	assert(next(&lexer) is io::EOF);
+};
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -0,0 +1,161 @@
+// A token with no additional context, such as '+'
+export type base_token = enum {
+	// Alpha sorted
+	ATTR_FINI,
+	ATTR_INIT,
+	ATTR_NORETURN,
+	ATTR_SYMBOL,
+	ATTR_TEST,
+	UNDERSCORE,
+	ABORT,
+	ALLOC,
+	APPEND,
+	AS,
+	ASSERT,
+	BOOL,
+	BREAK,
+	CHAR,
+	CONST,
+	CONTINUE,
+	DEF,
+	DEFER,
+	ELSE,
+	ENUM,
+	EXPORT,
+	F32,
+	F64,
+	FALSE,
+	FN,
+	FOR,
+	FREE,
+	I16,
+	I32,
+	I64,
+	I8,
+	IF,
+	INT,
+	IS,
+	LEN,
+	LET,
+	MATCH,
+	NULL,
+	NULLABLE,
+	OFFSET,
+	RETURN,
+	RUNE,
+	SIZE,
+	STATIC,
+	STR,
+	STRUCT,
+	SWITCH,
+	TRUE,
+	TYPE,
+	U16,
+	U32,
+	U64,
+	U8,
+	UINT,
+	UINTPTR,
+	UNION,
+	USE,
+	VOID,
+
+	// Operators
+	ANDEQ,
+	BAND,
+	BNOT,
+	BOR,
+	CASE,
+	COLON,
+	COMMA,
+	DIV,
+	DIVEQ,
+	DOT,
+	DOUBLE_COLON,
+	ELLIPSIS,
+	EQUAL,
+	GREATER,
+	GREATEREQ,
+	LAND,
+	LBRACE,
+	LBRACKET,
+	LEQUAL,
+	LESS,
+	LESSEQ,
+	LNOT,
+	LOR,
+	LPAREN,
+	LSHIFT,
+	LSHIFTEQ,
+	LXOR,
+	MINUS,
+	MINUSEQ,
+	MINUSMINUS,
+	MODEQ,
+	MODULO,
+	NEQUAL,
+	OREQ,
+	PLUS,
+	PLUSEQ,
+	PLUSPLUS,
+	RBRACE,
+	RBRACKET,
+	RPAREN,
+	RSHIFT,
+	RSHIFTEQ,
+	SEMICOLON,
+	SLICE,
+	TIMES,
+	TIMESEQ,
+	BXOR,
+	BXOREQ,
+};
+
+// A loop label, such as ':example'
+export type label = str;
+
+// A name, such as 'example'
+export type name = str;
+
+// The type of a literal token, such as '1337u32' (U32)
+export type literal_type = enum {
+	U8,
+	U16,
+	U32,
+	U64,
+	UINT,
+	UINTPTR,
+	I8,
+	I16,
+	I32,
+	I64,
+	INT,
+	F32,
+	F64,
+	VOID,
+};
+
+// A token for a literal value, such as '1337u32'
+export type literal = struct {
+	storage: literal_type,
+	union {
+		string: str,
+		_rune: rune,
+		_int: i64,
+		_uint: u64,
+		float: f64,
+	},
+};
+
+// A tuple of a line number and column number, counting from 1.
+export type linecol = (uint, uint);
+
+// A location within a source file.
+export type location = struct {
+	path: str,
+	start: linecol,
+	end: linecol,
+};
+
+// A single lexical token.
+export type token = (base_token | label | name | literal);