hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit c9a7e680b20833937ad85a6dc3e020ffdbcf0067
parent a369d1faf2d9115ecfea9d8c87c0555c8c529e33
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Fri, 28 May 2021 11:45:05 -0400

lex::mkloc: fix interaction with unget

And fix some tests which checked for the old (incorrect) behavior.

Signed-off-by: Eyal Sawady <ecs@d2evs.net>

Diffstat:
M hare/lex/+test.ha | 22 ++++++++++++++--------
M hare/lex/lex.ha | 418 ++++++++++++++++++++++++++++---------------------------------------------------
2 files changed, 160 insertions(+), 280 deletions(-)

diff --git a/hare/lex/+test.ha b/hare/lex/+test.ha @@ -8,11 +8,17 @@ use strings; let buf = bufio::fixed(strings::toutf8("z"), mode::READ); defer io::close(buf); let lexer = init(buf, "<test>"); - unget(&lexer, 'x'); - unget(&lexer, 'y'); - assert(next(&lexer) as rune == 'y'); - assert(next(&lexer) as rune == 'x'); - assert(next(&lexer) as rune == 'z'); + unget(&lexer, ('x', location { path = "<test>", line = 1, col = 2 })); + unget(&lexer, ('y', location { path = "<test>", line = 1, col = 3 })); + let r = next(&lexer) as (rune, location); + assert(r.0 == 'y'); + assert(r.1.path == "<test>" && r.1.line == 1 && r.1.col == 3); + r = next(&lexer) as (rune, location); + assert(r.0 == 'x'); + assert(r.1.path == "<test>" && r.1.line == 1 && r.1.col == 2); + r = next(&lexer) as (rune, location); + assert(r.0 == 'z'); + assert(r.1.path == "<test>" && r.1.line == 1 && r.1.col == 1); assert(next(&lexer) is io::EOF); unget(&lexer, io::EOF); assert(next(&lexer) is io::EOF); @@ -234,11 +240,11 @@ fn loc(line: uint, col: uint) location = location { const in = "\"hello world\", \"こんにちは\", \"return\", \"foo\""; const expected: [_]token = [ (ltok::LIT_STR, "hello world", loc(1, 1)), - (ltok::COMMA, void, loc(1, 15)), + (ltok::COMMA, void, loc(1, 14)), (ltok::LIT_STR, "こんにちは", loc(1, 16)), - (ltok::COMMA, void, loc(1, 24)), + (ltok::COMMA, void, loc(1, 23)), (ltok::LIT_STR, "return", loc(1, 25)), - (ltok::COMMA, void, loc(1, 34)), + (ltok::COMMA, void, loc(1, 33)), (ltok::LIT_STR, "foo", loc(1, 35)), ]; lextest(in, expected); diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha @@ -14,7 +14,7 @@ export type lexer = struct { path: str, loc: (uint, uint), un: (token | void), - rb: [2](rune | io::EOF | void), + rb: [2]((rune, location) | io::EOF | void), flags: flags, comment: str, }; @@ -72,33 +72,33 @@ export fn lex(lex: *lexer) (token | error) = { void => void, }; - let loc = location { ... }; - let r: rune = match (nextw(lex)?) { + let r = match (nextw(lex)?) 
{ io::EOF => return (ltok::EOF, void, mkloc(lex)), - r: (rune, location) => { - loc = r.1; - r.0; - }, + r: (rune, location) => r, }; - if (is_name(r, false)) { + if (is_name(r.0, false)) { unget(lex, r); - return lex_name(lex, loc, true); + return lex_name(lex, r.1, false); }; - if (ascii::isdigit(r)) { + if (ascii::isdigit(r.0)) { unget(lex, r); - return lex_literal(lex, loc); + return lex_literal(lex); }; - let tok: ltok = switch (r) { - * => return syntaxerr(loc, "invalid character"), + let tok = switch (r.0) { + * => return syntaxerr(r.1, "invalid character"), '"', '\'' => { unget(lex, r); - return lex_rn_str(lex, loc); + return lex_rn_str(lex); + }, + '.', '<', '>', '&', '|', '^' => { + unget(lex, r); + return lex3(lex); }, - '.', '<', '>', '&', '|', '^' => return lex3(lex, loc, r), '*', '%', '/', '+', '-', ':', '!', '=' => { - return lex2(lex, loc, r); + unget(lex, r); + return lex2(lex); }, '~' => ltok::BNOT, ',' => ltok::COMMA, @@ -111,7 +111,7 @@ export fn lex(lex: *lexer) (token | error) = { ';' => ltok::SEMICOLON, '?' => ltok::QUESTION, }; - return (tok, void, loc); + return (tok, void, r.1); }; fn is_name(r: rune, num: bool) bool = @@ -132,7 +132,7 @@ fn lex_unicode(lex: *lexer, loc: location, n: size) (rune | error) = { let r = match (next(lex)?) { io::EOF => return syntaxerr(loc, "unexpected EOF scanning for escape"), - r: rune => r, + r: (rune, location) => r.0, }; if (!ascii::isxdigit(r)) { return syntaxerr(loc, @@ -148,7 +148,7 @@ fn lex_rune(lex: *lexer, loc: location) (rune | error) = { let r = match (next(lex)?) { io::EOF => return syntaxerr(loc, "unexpected EOF scanning for rune"), - r: rune => r, + r: (rune, location) => r.0, }; if (r != '\\') { return r; @@ -156,7 +156,7 @@ fn lex_rune(lex: *lexer, loc: location) (rune | error) = { r = match (next(lex)?) 
{ io::EOF => return syntaxerr(loc, "unexpected EOF scanning for escape"), - r: rune => r, + r: (rune, location) => r.0, }; return switch (r) { '\\' => '\\', @@ -180,19 +180,18 @@ fn lex_string(lex: *lexer, loc: location) (token | error) = { let buf = strio::dynamic(); for (true) match (next(lex)?) { io::EOF => return syntaxerr(loc, "unexpected EOF scanning string literal"), - r: rune => - if (r == '"') break + r: (rune, location) => + if (r.0 == '"') break else { unget(lex, r); - r = lex_rune(lex, loc)?; + let r = lex_rune(lex, loc)?; strio::appendrune(buf, r)?; }, }; match (nextw(lex)?) { io::EOF => void, r: (rune, location) => { - const r = r.0; - if (r == '"') { + if (r.0 == '"') { const tok = lex_string(lex, loc)?; const next = tok.1 as str; strio::concat(buf, next)!; @@ -205,9 +204,10 @@ fn lex_string(lex: *lexer, loc: location) (token | error) = { return (ltok::LIT_STR, strio::finish(buf), loc); }; -fn lex_rn_str(lex: *lexer, loc: location) (token | error) = { +fn lex_rn_str(lex: *lexer) (token | error) = { + const loc = mkloc(lex); let r = match (next(lex)) { - r: rune => r, + r: (rune, location) => r.0, (io::EOF | io::error) => abort(), }; switch (r) { @@ -221,36 +221,36 @@ fn lex_rn_str(lex: *lexer, loc: location) (token | error) = { match (next(lex)?) { io::EOF => return syntaxerr(loc, "unexpected EOF"), - n: rune => if (n != '\'') - return syntaxerr(loc, "expected \"\'\""), + n: (rune, location) => if (n.0 != '\'') + return syntaxerr(n.1, "expected \"\'\""), }; return ret; }; -fn lex_name(lex: *lexer, loc: location, keyword: bool) (token | error) = { +fn lex_name(lex: *lexer, loc: location, label: bool) (token | error) = { let buf = strio::dynamic(); match (next(lex)) { - r: rune => { - assert(is_name(r, false)); - strio::appendrune(buf, r)!; + r: (rune, location) => { + assert(is_name(r.0, false)); + strio::appendrune(buf, r.0)!; }, (io::EOF | io::error) => abort(), }; for (true) match (next(lex)?) 
{ io::EOF => break, - r: rune => { - if (!is_name(r, true)) { + r: (rune, location) => { + if (!is_name(r.0, true)) { unget(lex, r); break; }; - strio::appendrune(buf, r)?; + strio::appendrune(buf, r.0)?; }, }; let n = strio::finish(buf); - if (!keyword) { - return (ltok::NAME, n, loc); + if (label) { + return (ltok::LABEL, n, loc); }; return match (sort::search(bmap[..ltok::LAST_KEYWORD+1], @@ -265,11 +265,11 @@ fn lex_name(lex: *lexer, loc: location, keyword: bool) (token | error) = { }; }; -fn lex_comment(lexr: *lexer, loc: location) (token | error) = { +fn lex_comment(lexr: *lexer) (token | error) = { if (lexr.flags & flags::COMMENTS != flags::COMMENTS) { for (true) match (next(lexr)?) { io::EOF => break, - r: rune => if (r == '\n') break, + r: (rune, location) => if (r.0 == '\n') break, }; return lex(lexr); }; @@ -278,9 +278,9 @@ fn lex_comment(lexr: *lexer, loc: location) (token | error) = { defer io::close(buf); for (true) match (next(lexr)?) { io::EOF => break, - r: rune => { - strio::appendrune(buf, r)!; - if (r == '\n') break; + r: (rune, location) => { + strio::appendrune(buf, r.0)!; + if (r.0 == '\n') break; }, }; let new = strings::concat(lexr.comment, strio::string(buf)); @@ -289,28 +289,29 @@ fn lex_comment(lexr: *lexer, loc: location) (token | error) = { return lex(lexr); }; -fn lex_literal(lex: *lexer, loc: location) (token | error) = { +fn lex_literal(lex: *lexer) (token | error) = { + const loc = mkloc(lex); let chars: []u8 = []; let r = match (next(lex)?) { io::EOF => return (ltok::EOF, void, loc), - r: rune => r, + r: (rune, location) => r, }; - if (r == '-') { - append(chars, utf8::encoderune(r)...); + if (r.0 == '-') { + append(chars, utf8::encoderune(r.0)...); r = match (next(lex)?) { io::EOF => return (ltok::EOF, void, loc), - r: rune => r, + r: (rune, location) => r, }; }; let base = 10u; - if (r == '0') { - append(chars, utf8::encoderune(r)...); + if (r.0 == '0') { + append(chars, utf8::encoderune(r.0)...); r = match (next(lex)?) 
{ io::EOF => return (ltok::LIT_ICONST, 0i64, loc), - r: rune => r, + r: (rune, location) => r, }; - switch (r) { + switch (r.0) { 'b' => base = 2, 'o' => base = 8, 'x' => base = 16, @@ -331,9 +332,9 @@ fn lex_literal(lex: *lexer, loc: location) (token | error) = { for (true) { r = match (next(lex)?) { io::EOF => break, - r: rune => r, + r: (rune, location) => r, }; - if (!strings::contains(basechrs, r)) switch (r) { + if (!strings::contains(basechrs, r.0)) switch (r.0) { '.' => if (float || exp is size || suff is size || base != 10) { unget(lex, r); @@ -341,11 +342,15 @@ fn lex_literal(lex: *lexer, loc: location) (token | error) = { } else { r = match (next(lex)?) { io::EOF => break, - r: rune => r, + r: (rune, location) => r, }; - if (!strings::contains(basechrs, r)) { + if (!strings::contains(basechrs, r.0)) { unget(lex, r); - unget(lex, '.'); + unget(lex, ('.', location { + path = r.1.path, + line = r.1.line, + col = r.1.col - 1, + })); break; }; unget(lex, r); @@ -357,15 +362,15 @@ fn lex_literal(lex: *lexer, loc: location) (token | error) = { break; } else { if (end == 0) end = len(chars); - append(chars, utf8::encoderune(r)...); + append(chars, utf8::encoderune(r.0)...); exp = len(chars); r = match (next(lex)?) 
{ io::EOF => break, - r: rune => r, + r: (rune, location) => r, }; - switch (r) { + switch (r.0) { '+', '-' => append(chars, - utf8::encoderune(r)...), + utf8::encoderune(r.0)...), * => unget(lex, r), }; basechrs = "0123456789"; @@ -376,14 +381,14 @@ fn lex_literal(lex: *lexer, loc: location) (token | error) = { } else { suff = len(chars); if (end == 0) end = len(chars); - append(chars, utf8::encoderune(r)...); + append(chars, utf8::encoderune(r.0)...); basechrs = "0123456789"; }, * => { unget(lex, r); break; }, - } else append(chars, utf8::encoderune(r)...); + } else append(chars, utf8::encoderune(r.0)...); }; if (end == 0) end = len(chars); @@ -476,9 +481,15 @@ fn lex_literal(lex: *lexer, loc: location) (token | error) = { return (suff, val, loc); }; -fn lex2(lexr: *lexer, loc: location, r: rune) (token | error) = { - let n = next(lexr)?; - let tok: ltok = switch (r) { +fn lex2(lex: *lexer) (token | error) = { + let first = next(lex)? as (rune, location); + let second = next(lex)?; + let loc = first.1; + let n = match (second) { + n: (rune, location) => n.0, + io::EOF => io::EOF, + }; + let tok: ltok = switch (first.0) { '^' => match (n) { r: rune => switch (r) { '^' => return (ltok::LXOR, void, loc), @@ -497,7 +508,7 @@ fn lex2(lexr: *lexer, loc: location, r: rune) (token | error) = { '/' => match (n) { r: rune => switch (r) { '=' => return (ltok::DIVEQ, void, loc), - '/' => return lex_comment(lexr, loc), + '/' => return lex_comment(lex), * => ltok::DIV, }, io::EOF => ltok::DIV, @@ -520,23 +531,21 @@ fn lex2(lexr: *lexer, loc: location, r: rune) (token | error) = { r: rune => switch (r) { '=' => return (ltok::MINUSEQ, void, loc), * => if (ascii::isdigit(r)) { - unget(lexr, r); - unget(lexr, '-'); - return lex_literal(lexr, loc); + unget(lex, second); + unget(lex, first); + return lex_literal(lex); } else { ltok::MINUS; }, }, io::EOF => ltok::MINUS, }, - ':' => match (n) { - r: rune => switch (r) { + ':' => match (second) { + r: (rune, location) => switch (r.0) { 
':' => return (ltok::DOUBLE_COLON, void, loc), - * => if (is_name(r, false)) { - unget(lexr, r); - let tok = lex_name(lexr, loc, false)?; - tok.0 = ltok::LABEL; - return tok; + * => if (is_name(r.0, false)) { + unget(lex, second); + return lex_name(lex, first.1, true)?; } else ltok::COLON, }, io::EOF => ltok::COLON, @@ -558,190 +567,40 @@ fn lex2(lexr: *lexer, loc: location, r: rune) (token | error) = { }, * => return syntaxerr(loc, "unknown token sequence"), }; - unget(lexr, n); + unget(lex, second); return (tok, void, loc); }; -fn lex3(lex: *lexer, loc: location, r: rune) (token | error) = { - let n = match (next(lex)?) { - io::EOF => return switch (r) { - '.' => (ltok::DOT, void, loc), - '<' => (ltok::LESS, void, loc), - '>' => (ltok::GREATER, void, loc), - '&' => (ltok::BAND, void, loc), - '|' => (ltok::BOR, void, loc), - '^' => (ltok::BXOR, void, loc), - * => abort(), // Invariant - }, - r: rune => r, - }; - return switch (r) { - '.' => lex3dot(lex, loc, n), - '<' => lex3lt(lex, loc, n), - '>' => lex3gt(lex, loc, n), - '&' => lex3and(lex, loc, n), - '|' => lex3or(lex, loc, n), - '^' => lex3xor(lex, loc, n), - * => syntaxerr(loc, "unknown token sequence"), - }; -}; - -fn lex3dot(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { +fn lex3(lex: *lexer) (token | error) = { + let r = next(lex)? as (rune, location); + let toks = switch (r.0) { '.' => { - let q = match (next(lex)?) { - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '.' => return (ltok::ELLIPSIS, void, loc), - * => ltok::SLICE, - }, - io::EOF => ltok::SLICE, - }; - unget(lex, q); - t; - }, - * => { - unget(lex, n); - ltok::DOT; - } - }; - return (tok, void, loc); -}; - -fn lex3lt(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { - '<' => { - let q = match (next(lex)?) 
{ - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '=' => return (ltok::LSHIFTEQ, void, loc), - * => ltok::LSHIFT, - }, - io::EOF => ltok::LSHIFT, - }; - unget(lex, q); - t; - }, - '=' => ltok::LESSEQ, - * => { - unget(lex, n); - ltok::LESS; - } - }; - return (tok, void, loc); -}; - -fn lex3gt(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { - '>' => { - let q = match (next(lex)?) { - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '=' => return (ltok::RSHIFTEQ, void, loc), - * => ltok::RSHIFT, - }, - io::EOF => ltok::RSHIFT, - }; - unget(lex, q); - t; - }, - '=' => ltok::GREATEREQ, - * => { - unget(lex, n); - ltok::GREATER; - } - }; - return (tok, void, loc); -}; - -fn lex3and(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { - '&' => { - let q = match (next(lex)?) { - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '=' => return (ltok::LANDEQ, void, loc), - * => ltok::LAND, - }, - io::EOF => ltok::LAND, - }; - unget(lex, q); - t; - }, - '=' => ltok::BANDEQ, - * => { - unget(lex, n); - ltok::BAND; - } - }; - return (tok, void, loc); -}; - -fn lex3or(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { - '|' => { - let q = match (next(lex)?) { - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '=' => return (ltok::LOREQ, void, loc), - * => ltok::LOR, - }, - io::EOF => ltok::LOR, - }; - unget(lex, q); - t; - }, - '=' => ltok::BOREQ, - * => { - unget(lex, n); - ltok::BOR; - } - }; - return (tok, void, loc); -}; - -fn lex3xor(lex: *lexer, loc: location, n: rune) (token | error) = { - let tok: ltok = switch (n) { - '^' => { - let q = match (next(lex)?) 
{ - io::EOF => io::EOF, - r: rune => r, - }; - let t = match (q) { - r: rune => switch (r) { - '=' => return (ltok::LXOREQ, void, loc), - * => ltok::LXOR, - }, - io::EOF => ltok::LXOR, - }; - unget(lex, q); - t; + let tok = if (try(lex, '.') is void) ltok::DOT + else if (try(lex, '.') is void) ltok::SLICE + else ltok::ELLIPSIS; + return (tok, void, r.1); + }, + '<' => [ltok::LESS, ltok::LESSEQ, ltok::LSHIFT, ltok::LSHIFTEQ], + '>' => [ltok::GREATER, ltok::GREATEREQ, ltok::RSHIFT, + ltok::RSHIFTEQ], + '&' => [ltok::BAND, ltok::BANDEQ, ltok::LAND, ltok::LANDEQ], + '|' => [ltok::BOR, ltok::BOREQ, ltok::LOR, ltok::LOREQ], + '^' => [ltok::BXOR, ltok::BXOREQ, ltok::LXOR, ltok::LXOREQ], + * => return syntaxerr(r.1, "unknown token sequence"), + }; + let idx = match (try(lex, r.0, '=')?) { + void => 0, // X + n: (rune, location) => switch (n.0) { + '=' => 1, // X= + * => match (try(lex, '=')?) { + void => 2, // XX + (rune, location) => 3, // XX= + }, }, - '=' => ltok::BXOREQ, - * => { - unget(lex, n); - ltok::BXOR; - } }; - return (tok, void, loc); + return (toks[idx], void, r.1); }; - // Unlex a single token. The next call to [[lex]] will return this token. Only one // unlex is supported at a time; you must call [[lex]] before calling [[unlex]] // again. 
@@ -750,10 +609,10 @@ export fn unlex(lex: *lexer, tok: token) void = { lex.un = tok; }; -fn next(lex: *lexer) (rune | io::EOF | io::error) = { +fn next(lex: *lexer) ((rune, location) | io::EOF | io::error) = { match (lex.rb[0]) { void => void, - r: (rune | io::EOF) => { + r: ((rune, location) | io::EOF) => { lex.rb[0] = lex.rb[1]; lex.rb[1] = void; return r; @@ -763,28 +622,40 @@ fn next(lex: *lexer) (rune | io::EOF | io::error) = { return match (bufio::scanrune(lex.in)) { e: (io::EOF | io::error) => e, r: rune => { + const loc = mkloc(lex); lexloc(lex, r); - return r; + return (r, loc); }, }; }; fn nextw(lex: *lexer) ((rune, location) | io::EOF | io::error) = { - for (true) { - let loc = mkloc(lex); - match (next(lex)) { - e: (io::error | io::EOF) => return e, - r: rune => if (!ascii::isspace(r)) { - return (r, loc); - } else { - free(lex.comment); - lex.comment = ""; - }, - }; + for (true) match (next(lex)) { + e: (io::error | io::EOF) => return e, + r: (rune, location) => if (!ascii::isspace(r.0)) { + return r; + } else { + free(lex.comment); + lex.comment = ""; + }, }; abort(); }; +fn try(lex: *lexer, want: rune...) ((rune, location) | void | io::error) = { + let r = match (next(lex)?) 
{ + io::EOF => return void, + r: (rune, location) => r, + }; + assert(len(want) > 0); + for (let i = 0z; i < len(want); i += 1) { + if (r.0 == want[i]) { + return r; + }; + }; + unget(lex, r); +}; + fn lexloc(lex: *lexer, r: rune) void = { switch (r) { '\n' => { @@ -796,7 +667,7 @@ fn lexloc(lex: *lexer, r: rune) void = { }; }; -fn unget(lex: *lexer, r: (rune | io::EOF)) void = { +fn unget(lex: *lexer, r: ((rune, location) | io::EOF)) void = { if (!(lex.rb[0] is void)) { assert(lex.rb[1] is void, "ungot too many runes"); lex.rb[1] = lex.rb[0]; @@ -804,10 +675,13 @@ fn unget(lex: *lexer, r: (rune | io::EOF)) void = { lex.rb[0] = r; }; -export fn mkloc(lex: *lexer) location = location { - path = lex.path, - line = lex.loc.0, - col = lex.loc.1, +export fn mkloc(lex: *lexer) location = match (lex.rb[0]) { + r: (rune, location) => r.1, + void => location { + path = lex.path, + line = lex.loc.0, + col = lex.loc.1, + }, }; fn syntaxerr(loc: location, why: str) error = (loc, why);