hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 37f47c120bf9cf72a226eabf12462e22df275c20
parent f62b8cb02d3991e2bc51e8d064f3d85b2b8da6d4
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat, 13 Feb 2021 14:27:09 -0500

hare::lex: lex1

Diffstat:
A bufio/fixed.ha    |  32 ++++++++++++++++++++++++++++++++
M hare/lex/lex.ha   | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
M hare/lex/token.ha | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 280 insertions(+), 21 deletions(-)

diff --git a/bufio/fixed.ha b/bufio/fixed.ha
@@ -0,0 +1,32 @@
+use io;
+use rt;
+
+// XXX: All of this is temporary
+export type fixed_stream = struct {
+	stream: io::stream,
+	buf: []u8,
+};
+
+export fn fixed(in: []u8) *io::stream = {
+	let s = alloc(*fixed_stream, fixed_stream {
+		stream = io::stream {
+			name = "<bufio::fixed>",
+			reader = &fixed_read,
+			...
+		},
+		buf = in,
+	});
+	return &s.stream;
+};
+
+fn fixed_read(s: *io::stream, buf: []u8) (size | io::error | io::EOF) = {
+	let stream = s: *fixed_stream;
+	if (len(stream.buf) == 0) {
+		return io::EOF;
+	};
+	const n = if (len(buf) > len(stream.buf)) len(stream.buf) else len(buf);
+	// TODO: Fix me up once slice copying is in
+	rt::memcpy(buf: *[*]u8, stream.buf: *[*]u8, n);
+	stream.buf = stream.buf[n..];
+	return n;
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -1,4 +1,6 @@
 // hare::lex provides a lexer for Hare source code.
+use ascii;
+use bufio;
 use io;
 use strings;
 use types;
@@ -7,22 +9,35 @@ use types;
 export type lexer = struct {
 	in: *io::stream,
 	path: str,
-	loc: linecol,
+	loc: (uint, uint),
 	un: ((token, location) | void),
 	rb: [2](rune | io::EOF | void),
 };
 
+// A syntax error
+export type syntax = location;
+
+// All possible lexer errors
+export type error = (io::error | syntax);
+
+export fn errstr(err: error) const str = {
+	return match (err) {
+		err: io::error => io::errstr(err),
+		syntax => "Syntax error", // TODO: add line info
+	};
+};
+
 // Initializes a new lexer for the given input stream. The path is borrowed.
 export fn lexer_init(in: *io::stream, path: str) lexer = lexer {
 	in = in,
 	path = path,
-	loc = linecol { line = 1, col = 1 },
+	loc = (1, 1),
 	un = void,
 	rb = [void...],
 };
 
 // Returns the next token from the lexer.
-export fn lex(lex: *lexer) ((token, location) | io::EOF | io::error) = {
+export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 	match (lex.un) {
 		tok: (token, location) => {
 			lex.un = void;
@@ -31,6 +46,50 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
 		void => void,
 	};
 
+	let loc = mkloc(lex);
+	let r: rune = match (next(lex)) {
+		e: io::error => return e,
+		io::EOF => return io::EOF,
+		r: rune => r,
+	};
+
+	if (ascii::isalpha(r) || r == '_' || r == '@') {
+		unget(lex, r);
+		abort(); // TODO: Keywords/names
+	};
+	if (ascii::isdigit(r)) {
+		unget(lex, r);
+		abort(); // TODO: Literals
+	};
+
+	let tok: token = switch (r) {
+		* => return mkloc(lex),
+		'"', '\'' => abort(), // TODO: Strings/runes
+		'.', '<', '>' => return lex3(lex, r),
+		'^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
+			return lex2(lex, r);
+		},
+		'~' => btoken::BNOT,
+		',' => btoken::COMMA,
+		'{' => btoken::LBRACE,
+		'[' => btoken::LBRACKET,
+		'(' => btoken::LPAREN,
+		'}' => btoken::RBRACE,
+		']' => btoken::RBRACKET,
+		')' => btoken::RPAREN,
+		';' => btoken::SEMICOLON,
+	};
+
+	return (tok, loc);
+};
+
+fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
+	abort();
+	return io::EOF; // TODO
+};
+
+fn lex2(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = {
+	abort();
 	return io::EOF; // TODO
 };
 
@@ -51,10 +110,34 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = {
 			return r;
 		},
 	};
-	return match (io::getrune(lex.in)) {
-		r: rune => r,
-		io::EOF => io::EOF,
-		err: io::error => err,
+
+	for (true) {
+		return match (io::getrune(lex.in)) {
+			io::EOF => io::EOF,
+			err: io::error => err,
+			r: rune => {
+				lexloc(lex, r);
+				if (ascii::isspace(r)) continue;
+				r;
+			},
+		};
+	};
+
+	abort("unreachable");
+};
+
+fn lexloc(lex: *lexer, r: rune) void = {
+	switch (r) {
+		'\n' => {
+			lex.loc.0 += 1;
+			lex.loc.1 = 1;
+		},
+		'\t' => {
+			lex.loc.1 += 8;
+		},
+		* => {
+			lex.loc.1 += 1;
+		},
 	};
 };
 
@@ -66,6 +149,12 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
 	lex.rb[0] = r;
 };
 
+fn mkloc(lex: *lexer) location = location {
+	path = lex.path,
+	line = lex.loc.0,
+	col = lex.loc.1,
+};
+
 @test fn unget() void = {
 	let lexer = lexer_init(io::empty, "<test>");
 	unget(&lexer, 'x');
@@ -77,15 +166,37 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = {
 
 @test fn unlex() void = {
 	let lexer = lexer_init(io::empty, "<test>");
-	unlex(&lexer, (base_token::IF, location {
+	unlex(&lexer, (btoken::IF, location {
 		path = "<test>",
-		start = linecol { line = 1234, col = 1234 },
-		end = linecol { line = 1234, col = 1234 },
+		line = 1234,
+		col = 1234,
 	}));
 	let t = lex(&lexer) as (token, location);
-	assert(t.0 is base_token);
-	assert(t.0 as base_token == base_token::IF);
+	assert(t.0 is btoken);
+	assert(t.0 as btoken == btoken::IF);
 	assert(t.1.path == "<test>");
-	assert(t.1.start.line == 1234 && t.1.start.col == 1234);
-	assert(t.1.end.line == 1234 && t.1.end.col == 1234);
+	assert(t.1.line == 1234 && t.1.col == 1234);
+};
+
+@test fn lex1() void = {
+	const in = "~,{[(}]);";
+	const expected = [
+		btoken::BNOT,
+		btoken::COMMA,
+		btoken::LBRACE,
+		btoken::LBRACKET,
+		btoken::LPAREN,
+		btoken::RBRACE,
+		btoken::RBRACKET,
+		btoken::RPAREN,
+		btoken::SEMICOLON,
+	];
+	let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>");
+	for (let i = 0z; i < len(expected); i += 1) {
+		let tl = lex(&lexer) as (token, location);
+		let tok = tl.0, loc = tl.1;
+		assert(tok as btoken == expected[i]);
+		assert(loc.path == "<test>");
+		assert(loc.line == 1 && loc.col == i + 1);
+	};
 };
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -1,5 +1,6 @@
 // A token with no additional context, such as '+'
-export type base_token = enum {
+export type btoken = enum {
+	// Keep ordered with bmap
 	// Alpha shorted
 	ATTR_FINI,
 	ATTR_INIT,
@@ -111,6 +112,116 @@ export type base_token = enum {
 	BXOREQ,
 };
 
+const bmap: [_]str = [
+	// Keep ordered with btoken
+	"@fini",
+	"@init",
+	"@noreturn",
+	"@symbol",
+	"@test",
+	"_",
+	"abort",
+	"alloc",
+	"append",
+	"as",
+	"assert",
+	"bool",
+	"break",
+	"char",
+	"const",
+	"continue",
+	"def",
+	"defer",
+	"else",
+	"enum",
+	"export",
+	"f32",
+	"f64",
+	"false",
+	"fn",
+	"for",
+	"free",
+	"i16",
+	"i32",
+	"i64",
+	"i8",
+	"if",
+	"int",
+	"is",
+	"len",
+	"let",
+	"match",
+	"null",
+	"nullable",
+	"offset",
+	"return",
+	"rune",
+	"size",
+	"static",
+	"str",
+	"struct",
+	"switch",
+	"true",
+	"type",
+	"u16",
+	"u32",
+	"u64",
+	"u8",
+	"uint",
+	"uintptr",
+	"union",
+	"use",
+	"void",
+	"&=",
+	"&",
+	"~",
+	"|",
+	"case",
+	":",
+	",",
+	"/",
+	"/=",
+	".",
+	"::",
+	"...",
+	"=",
+	">",
+	">=",
+	"&&",
+	"{",
+	"[",
+	"==",
+	"<",
+	"<=",
+	"!",
+	"||",
+	"(",
+	"<<",
+	"<<=",
+	"^^",
+	"-",
+	"-=",
+	"--",
+	"%=",
+	"%",
+	"!=",
+	"|=",
+	"+",
+	"+=",
+	"++",
+	"}",
+	"]",
+	")",
+	">>",
+	">>=",
+	";",
+	"..",
+	"*",
+	"*=",
+	"^",
+	"^=",
+];
+
 // A loop label, such as ':example'
 export type label = str;
 
@@ -147,15 +258,20 @@ export type literal = struct {
 	},
 };
 
-// A tuple of a line number and column number, counting from 1.
-export type linecol = struct { line: uint, col: uint };
-
 // A location within a source file.
 export type location = struct {
 	path: str,
-	start: linecol,
-	end: linecol,
+	line: uint,
+	col: uint
 };
 
 // A single lexical token.
-export type token = (base_token | label | name | literal);
+export type token = (btoken | label | name | literal);
+
+// Converts a token to its string representation
+export fn tokstr(tok: token) const str = {
+	return match (tok) {
+		b: btoken => bmap[b: int],
+		* => abort(), // TODO
+	};
+};
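
A non-authoritative usage sketch, not part of the commit: the new lex1 test exercises the lexer from inside the module, and an external caller could do roughly the same through the exported API (lexer_init, lex, tokstr, bufio::fixed). The use hare::lex import path, the lex:: qualifiers, and the "<sketch>" path string are assumptions for illustration; at this point only the single-rune tokens handled by lex1 work, since lex2 and lex3 still abort().

	use bufio;
	use hare::lex;
	use strings;

	// Assumed example driver: wrap an in-memory string in the temporary
	// bufio::fixed stream added by this commit, then read one token back.
	fn lex_sketch() void = {
		const src = ";~";
		let lexer = lex::lexer_init(bufio::fixed(strings::to_utf8(src)), "<sketch>");
		let tl = lex::lex(&lexer) as (lex::token, lex::location);
		assert(tl.0 as lex::btoken == lex::btoken::SEMICOLON);
		assert(lex::tokstr(tl.0) == ";");
		// The location is captured before the rune is consumed, so the
		// first token reports line 1, column 1.
		assert(tl.1.line == 1 && tl.1.col == 1);
	};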