hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 1bdaa5ffcccd5de266f7837ba7b7009ebbeb9886
parent 6085c10bebe42fd7dcb41734c86d122e5b8736fe
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sun, 14 Feb 2021 16:46:26 -0500

hare::lex: lex2

Diffstat:
A hare/lex/+test.ha | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M hare/lex/lex.ha | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
2 files changed, 207 insertions(+), 57 deletions(-)

diff --git a/hare/lex/+test.ha b/hare/lex/+test.ha @@ -0,0 +1,101 @@ +use bufio; +use fmt; +use io; +use strings; + +@test fn unget() void = { + let lexer = lexer_init(bufio::fixed(strings::to_utf8("z")), "<test>"); + unget(&lexer, 'x'); + unget(&lexer, 'y'); + assert(next(&lexer) as rune == 'y'); + assert(next(&lexer) as rune == 'x'); + assert(next(&lexer) as rune == 'z'); + assert(next(&lexer) is io::EOF); + unget(&lexer, io::EOF); + assert(next(&lexer) is io::EOF); +}; + +@test fn unlex() void = { + let lexer = lexer_init(io::empty, "<test>"); + unlex(&lexer, (btoken::IF, location { + path = "<test>", + line = 1234, + col = 1234, + })); + let t = lex(&lexer) as (token, location); + assert(t.0 is btoken); + assert(t.0 as btoken == btoken::IF); + assert(t.1.path == "<test>"); + assert(t.1.line == 1234 && t.1.col == 1234); +}; + +fn lextest(in: str, expected: [](uint, uint, token)) void = { + let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>"); + for (let i = 0z; i < len(expected); i += 1) { + let eline = expected[i].0, ecol = expected[i].1, + etok = expected[i].2; + let tl = lex(&lexer) as (token, location); + let tok = tl.0, loc = tl.1; + match (tok) { + b: btoken => if (etok as btoken != b) { + fmt::errorln("bad token at {}: got {}, wanted {}", + i, tokstr(tok), tokstr(etok)); + abort(); + }, + * => abort("TODO"), + }; + assert(loc.path == "<test>"); + if (loc.line != eline || loc.col != ecol) { + fmt::errorln("bad line/col at {}: got {},{}; wanted {},{}", + i, loc.line, loc.col, eline, ecol); + abort(); + }; + }; + assert(lex(&lexer) is io::EOF); +}; + +@test fn lex1() void = { + const in = "~,{[(}]);"; + const expected: [_](uint, uint, token) = [ + (1, 1, btoken::BNOT), + (1, 2, btoken::COMMA), + (1, 3, btoken::LBRACE), + (1, 4, btoken::LBRACKET), + (1, 5, btoken::LPAREN), + (1, 6, btoken::RBRACE), + (1, 7, btoken::RBRACKET), + (1, 8, btoken::RPAREN), + (1, 9, btoken::SEMICOLON), + ]; + lextest(in, expected); +}; + +@test fn lex2() void = { + 
// Ends with = to test =, EOF + const in = "^ ^^ ^= * *= % %= + += - -= : :: & && &= | || |= = == ="; + const expected: [_](uint, uint, token) = [ + (1, 1, btoken::BXOR), + (1, 3, btoken::LXOR), + (1, 6, btoken::BXOREQ), + (1, 9, btoken::TIMES), + (1, 11, btoken::TIMESEQ), + (1, 14, btoken::MODULO), + (1, 16, btoken::MODEQ), + (1, 19, btoken::PLUS), + (1, 21, btoken::PLUSEQ), + (1, 24, btoken::MINUS), + (1, 26, btoken::MINUSEQ), + (1, 29, btoken::COLON), + (1, 31, btoken::DOUBLE_COLON), + (1, 34, btoken::BAND), + (1, 36, btoken::LAND), + (1, 39, btoken::ANDEQ), + (1, 42, btoken::BOR), + (1, 44, btoken::LOR), + (1, 47, btoken::OREQ), + (1, 50, btoken::EQUAL), + (1, 52, btoken::LEQUAL), + (1, 55, btoken::EQUAL), + ]; + lextest(in, expected); +}; diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha @@ -1,9 +1,8 @@ // hare::lex provides a lexer for Hare source code. use ascii; -use bufio; use io; use strings; -use types; +use fmt; // State associated with a lexer. export type lexer = struct { @@ -46,11 +45,14 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = { void => void, }; - let loc = mkloc(lex); - let r: rune = match (next(lex)) { + let loc = location { ... 
}; + let r: rune = match (nextw(lex)) { e: io::error => return e, io::EOF => return io::EOF, - r: rune => r, + r: (rune, location) => { + loc = r.1; + r.0; + }, }; if (ascii::isalpha(r) || r == '_' || r == '@') { @@ -63,11 +65,11 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = { }; let tok: token = switch (r) { - * => return mkloc(lex), + * => return syntaxerr(loc), '"', '\'' => abort(), // TODO: Strings/runes '.', '<', '>' => return lex3(lex, r), '^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => { - return lex2(lex, r); + return lex2(lex, loc, r); }, '~' => btoken::BNOT, ',' => btoken::COMMA, @@ -82,12 +84,90 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = { return (tok, loc); }; -fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = { - abort(); - return io::EOF; // TODO +fn lex2( + lex: *lexer, + loc: location, + r: rune, +) ((token, location) | io::EOF | error) = { + let n = match (next(lex)) { + err: io::error => return err, + io::EOF => io::EOF, + r: rune => r, + }; + let tok: token = switch (r) { + '^' => match (n) { + r: rune => switch (r) { + '^' => return (btoken::LXOR: token, loc), + '=' => return (btoken::BXOREQ: token, loc), + * => btoken::BXOR, + }, + io::EOF => btoken::BXOR, + }, + '*' => match (n) { + r: rune => switch (r) { + '=' => return (btoken::TIMESEQ: token, loc), + * => btoken::TIMES, + }, + io::EOF => btoken::TIMES, + }, + '%' => match (n) { + r: rune => switch (r) { + '=' => return (btoken::MODEQ: token, loc), + * => btoken::MODULO, + }, + io::EOF => btoken::MODULO, + }, + '+' => match (n) { + r: rune => switch (r) { + '=' => return (btoken::PLUSEQ: token, loc), + * => btoken::PLUS, + }, + io::EOF => btoken::PLUS, + }, + '-' => match (n) { + r: rune => switch (r) { + '=' => return (btoken::MINUSEQ: token, loc), + * => btoken::MINUS, + }, + io::EOF => btoken::MINUS, + }, + ':' => match (n) { + r: rune => switch (r) { + ':' => return (btoken::DOUBLE_COLON: token, loc), + * => 
btoken::COLON, + }, + io::EOF => btoken::COLON, + }, + '&' => match (n) { + r: rune => switch (r) { + '&' => return (btoken::LAND: token, loc), + '=' => return (btoken::ANDEQ: token, loc), + * => btoken::BAND, + }, + io::EOF => btoken::BAND, + }, + '|' => match (n) { + r: rune => switch (r) { + '|' => return (btoken::LOR: token, loc), + '=' => return (btoken::OREQ: token, loc), + * => btoken::BOR, + }, + io::EOF => btoken::BOR, + }, + '=' => match (n) { + r: rune => switch (r) { + '=' => return (btoken::LEQUAL: token, loc), + * => btoken::EQUAL, + }, + io::EOF => btoken::EQUAL, + }, + * => return syntaxerr(loc), + }; + unget(lex, n); + return (tok, loc); }; -fn lex2(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = { +fn lex3(lex: *lexer, r: rune) ((token, location) | io::EOF | error) = { abort(); return io::EOF; // TODO }; @@ -116,7 +196,6 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = { err: io::error => err, r: rune => { lexloc(lex, r); - if (ascii::isspace(r)) continue; r; }, }; @@ -125,6 +204,20 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = { abort("unreachable"); }; +fn nextw(lex: *lexer) ((rune, location) | io::EOF | io::error) = { + for (true) { + let loc = mkloc(lex); + match (next(lex)) { + err: io::error => return err, + io::EOF => return io::EOF, + r: rune => if (!ascii::isspace(r)) { + return (r, loc); + }, + }; + }; + abort(); +}; + fn lexloc(lex: *lexer, r: rune) void = { switch (r) { '\n' => { @@ -154,48 +247,4 @@ fn mkloc(lex: *lexer) location = location { col = lex.loc.1, }; -@test fn unget() void = { - let lexer = lexer_init(io::empty, "<test>"); - unget(&lexer, 'x'); - unget(&lexer, 'y'); - assert(next(&lexer) as rune == 'y'); - assert(next(&lexer) as rune == 'x'); - assert(next(&lexer) is io::EOF); -}; - -@test fn unlex() void = { - let lexer = lexer_init(io::empty, "<test>"); - unlex(&lexer, (btoken::IF, location { - path = "<test>", - line = 1234, - col = 1234, - })); - let t = lex(&lexer) as (token, location); - 
assert(t.0 is btoken); - assert(t.0 as btoken == btoken::IF); - assert(t.1.path == "<test>"); - assert(t.1.line == 1234 && t.1.col == 1234); -}; - -@test fn lex1() void = { - const in = "~,{[(}]);"; - const expected = [ - btoken::BNOT, - btoken::COMMA, - btoken::LBRACE, - btoken::LBRACKET, - btoken::LPAREN, - btoken::RBRACE, - btoken::RBRACKET, - btoken::RPAREN, - btoken::SEMICOLON, - ]; - let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>"); - for (let i = 0z; i < len(expected); i += 1) { - let tl = lex(&lexer) as (token, location); - let tok = tl.0, loc = tl.1; - assert(tok as btoken == expected[i]); - assert(loc.path == "<test>"); - assert(loc.line == 1 && loc.col == i + 1); - }; -}; +fn syntaxerr(loc: location) error = loc: syntax: error;