commit 025646da5faa323edd3ccd29abf786cc2664ac75
parent 88cb6c68e9cb248aac3bd54bf543b9f8a08ac607
Author: Drew DeVault <sir@cmpwn.com>
Date: Fri, 19 Feb 2021 16:08:53 -0500
hare::lex: lex rune literals
Diffstat:
3 files changed, 159 insertions(+), 10 deletions(-)
diff --git a/hare/lex/+test.ha b/hare/lex/+test.ha
@@ -29,6 +29,30 @@ use strings;
assert(t.1.line == 1234 && t.1.col == 1234);
};
+// Test helper: reports whether two literals are equal. Literals whose storage
+// types differ are never equal; otherwise only the payload field selected by
+// the storage type (_uint, _int, float or _rune) is compared.
+fn liteq(expected: literal, actual: literal) bool = {
+ if (expected.storage != actual.storage) {
+ return false;
+ };
+ // Both sides share the same storage type from here on, so switching on
+ // expected.storage picks the right field for both operands.
+ return switch (expected.storage) {
+ literal_type::U8,
+ literal_type::U16,
+ literal_type::U32,
+ literal_type::U64,
+ literal_type::UINT,
+ literal_type::UINTPTR => expected._uint == actual._uint,
+ literal_type::I8,
+ literal_type::I16,
+ literal_type::I32,
+ literal_type::I64,
+ literal_type::INT,
+ literal_type::ICONST => expected._int == actual._int,
+ literal_type::F32,
+ literal_type::F64,
+ literal_type::FCONST => expected.float == actual.float,
+ literal_type::RUNE => expected._rune == actual._rune,
+ };
+};
+
fn lextest(in: str, expected: [](uint, uint, token)) void = {
let lexer = lexer_init(bufio::fixed(strings::to_utf8(in)), "<test>");
for (let i = 0z; i < len(expected); i += 1) {
@@ -36,7 +60,14 @@ fn lextest(in: str, expected: [](uint, uint, token)) void = {
etok = expected[i].2;
let tl = match (lex(&lexer)) {
tl: (token, location) => tl,
- * => abort(),
+ io::EOF => {
+ fmt::errorln("unexpected EOF at {}", i);
+ abort();
+ },
+ err: error => {
+ fmt::errorln("{}: {}", i, errstr(err));
+ abort();
+ },
};
let tok = tl.0, loc = tl.1;
match (tok) {
@@ -50,6 +81,18 @@ fn lextest(in: str, expected: [](uint, uint, token)) void = {
i, tokstr(tok), tokstr(etok));
abort();
},
+ l: literal => if (!(etok is literal)) {
+ fmt::errorln("bad token at {}: got {}, wanted {}",
+ i, tokstr(tok), tokstr(etok));
+ abort();
+ } else {
+ let e = etok as literal;
+ if (!liteq(l, e)) {
+ fmt::errorln("bad token at {}: got '{}', wanted '{}'",
+ i, tokstr(tok), tokstr(etok));
+ abort();
+ };
+ },
* => abort("TODO"),
};
assert(loc.path == "<test>");
@@ -150,3 +193,24 @@ fn lextest(in: str, expected: [](uint, uint, token)) void = {
];
lextest(in, expected);
};
+
+// Lexes a run of space-separated rune literals (two plain runes followed by
+// the single-character escapes) and hands the expected tokens to lextest.
+@test fn runes() void = {
+ // Backslashes are doubled because the input is itself a string literal;
+ // the lexer sees e.g. '\a' and '\\'.
+ const in = "'a' 'b' '\\a' '\\b' '\\f' '\\n' '\\r' '\\t' '\\v' '\\0' "
+ "'\\\\' '\\\''";
+ // Each entry is presumably (line, column, token), with the column being
+ // where the literal's opening quote sits -- confirm against lextest.
+ const expected: [_](uint, uint, token) = [
+ (1, 1, literal { storage = literal_type::RUNE, _rune = 'a' }),
+ (1, 5, literal { storage = literal_type::RUNE, _rune = 'b' }),
+ (1, 9, literal { storage = literal_type::RUNE, _rune = '\a' }),
+ (1, 14, literal { storage = literal_type::RUNE, _rune = '\b' }),
+ (1, 19, literal { storage = literal_type::RUNE, _rune = '\f' }),
+ (1, 24, literal { storage = literal_type::RUNE, _rune = '\n' }),
+ (1, 29, literal { storage = literal_type::RUNE, _rune = '\r' }),
+ (1, 34, literal { storage = literal_type::RUNE, _rune = '\t' }),
+ (1, 39, literal { storage = literal_type::RUNE, _rune = '\v' }),
+ (1, 44, literal { storage = literal_type::RUNE, _rune = '\0' }),
+ (1, 49, literal { storage = literal_type::RUNE, _rune = '\\' }),
+ (1, 54, literal { storage = literal_type::RUNE, _rune = '\'' }),
+ ];
+ // TODO: test \x and \u and \U
+ lextest(in, expected);
+};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -15,7 +15,7 @@ export type lexer = struct {
};
// A syntax error
-export type syntax = location;
+export type syntax = (location, str);
// All possible lexer errors
export type error = (io::error | syntax);
@@ -23,7 +23,7 @@ export type error = (io::error | syntax);
export fn errstr(err: error) const str = {
return match (err) {
err: io::error => io::errstr(err),
- syntax => "Syntax error", // TODO: add line info
+ s: syntax => s.1, // TODO: format me
};
};
@@ -66,8 +66,11 @@ export fn lex(lex: *lexer) ((token, location) | io::EOF | error) = {
};
let tok: token = switch (r) {
- * => return syntaxerr(loc),
- '"', '\'' => abort(), // TODO: Strings/runes
+ * => return syntaxerr(loc, "invalid character"),
+ '"', '\'' => {
+ unget(lex, r);
+ return lex_string(lex, loc);
+ },
'.', '<', '>' => return lex3(lex, loc, r),
'^', '*', '%', '/', '+', '-', ':', '!', '&', '|', '=' => {
return lex2(lex, loc, r);
@@ -96,7 +99,82 @@ fn ncmp(a: const *void, b: const *void) int = {
};
};
-fn lex_name(lex: *lexer, loc: location) ((token, location) | io::EOF | error) = {
+// Lexes the remainder of a rune literal; lex_string has already consumed the
+// opening "'". On success the rune's value is returned and the closing "'"
+// has been consumed as well.
+//
+// A syntax error (carrying loc) is returned if the input ends early, if the
+// closing quote is missing, or if a backslash is followed by a character that
+// is not a known escape. I/O errors from the underlying reader are passed
+// through unchanged. The \x, \u and \U escapes are not implemented yet and
+// abort.
+fn lex_rune(lex: *lexer, loc: location) (rune | error) = {
+ let r = match (next(lex)) {
+ io::EOF => return syntaxerr(loc,
+ "unexpected EOF scanning for rune"),
+ err: io::error => return err,
+ r: rune => r,
+ };
+ // Plain (unescaped) rune: the only thing left to read is the closing quote.
+ if (r != '\\') {
+ match (next(lex)) {
+ io::EOF => return syntaxerr(loc,
+ "unexpected EOF scanning rune, expected \"\'\""),
+ err: io::error => return err,
+ r: rune => if (r != '\'')
+ return syntaxerr(loc, "expected \"\'\" after rune"),
+ };
+ return r;
+ };
+ // Escape sequence: read the character that follows the backslash.
+ r = match (next(lex)) {
+ io::EOF => return syntaxerr(loc,
+ "unexpected EOF scanning for escape"),
+ err: io::error => return err,
+ r: rune => r,
+ };
+ let r = switch (r) {
+ '\\' => '\\',
+ '\'' => '\'',
+ '0' => '\0',
+ 'a' => '\a',
+ 'b' => '\b',
+ 'f' => '\f',
+ 'n' => '\n',
+ 'r' => '\r',
+ 't' => '\t',
+ 'v' => '\v',
+ '"' => '\"',
+ 'x' => abort(), // TODO
+ 'u' => abort(), // TODO
+ 'U' => abort(), // TODO
+ // Anything else comes straight from the input being lexed, so report
+ // it as a syntax error like every other malformed-literal case here.
+ * => return syntaxerr(loc, "unknown escape sequence"),
+ };
+ match (next(lex)) {
+ io::EOF => return syntaxerr(loc,
+ "unexpected EOF scanning escape sequence, expected \"\'\""),
+ err: io::error => return err,
+ r: rune => if (r != '\'') return syntaxerr(loc,
+ "expected \"\'\" after escape sequence"),
+ };
+ return r;
+};
+
+// Lexes a rune or string literal starting at its opening quote. lex() puts the
+// quote character back with unget before calling this, so the first read is
+// expected to yield that quote again.
+//
+// For a rune literal the result is a RUNE literal token paired with loc, or
+// the error produced by lex_rune. String literals are not implemented yet and
+// abort.
+fn lex_string(
+ lex: *lexer,
+ loc: location,
+) ((token, location) | io::EOF | error) = {
+ let r = match (next(lex)) {
+ r: rune => r,
+ // The caller has just ungot a rune, so EOF or an I/O error here is
+ // treated as impossible rather than reported.
+ (io::EOF | io::error) => abort(),
+ };
+ return switch (r) {
+ '\'' => match (lex_rune(lex, loc)) {
+ err: error => err,
+ r: rune => (literal {
+ storage = literal_type::RUNE,
+ _rune = r,
+ }: token, loc),
+ },
+ '\"' => {
+ let chars: []u8 = [];
+ abort(); // TODO
+ },
+ * => abort(), // Invariant
+ };
+};
+
+fn lex_name(
+ lex: *lexer,
+ loc: location,
+) ((token, location) | io::EOF | error) = {
let chars: []u8 = [];
match (next(lex)) {
r: rune => {
@@ -226,7 +304,7 @@ fn lex2(
},
io::EOF => btoken::EQUAL,
},
- * => return syntaxerr(loc),
+ * => return syntaxerr(loc, "unknown token sequence"),
};
unget(lexr, n);
return (tok, loc);
@@ -250,7 +328,7 @@ fn lex3(
'.' => lex3dot(lex, loc, n),
'<' => lex3lt(lex, loc, n),
'>' => lex3gt(lex, loc, n),
- * => syntaxerr(loc),
+ * => syntaxerr(loc, "unknown token sequence"),
};
};
@@ -419,4 +497,4 @@ fn mkloc(lex: *lexer) location = location {
col = lex.loc.1,
};
-fn syntaxerr(loc: location) error = loc: syntax: error;
+// Builds a syntax error from the location it occurred at and a short
+// explanation (why), cast to the general lexer error type.
+fn syntaxerr(loc: location, why: str) error = (loc, why): syntax: error;
diff --git a/hare/lex/token.ha b/hare/lex/token.ha
@@ -1,3 +1,6 @@
+use encoding::utf8;
+use strings;
+
// A token with no additional context, such as '+'
export type btoken = enum {
// Keep ordered with bmap
@@ -248,7 +251,7 @@ export type literal_type = enum {
F32,
F64,
FCONST,
- VOID,
+ RUNE,
};
// A token for a literal value, such as '1337u32'
@@ -278,6 +281,10 @@ export fn tokstr(tok: token) const str = {
return match (tok) {
b: btoken => bmap[b: int],
n: name => n: str,
+ l: literal => switch (l.storage) {
+ literal_type::RUNE => "rune",
+ * => abort(), // TODO
+ },
* => abort(), // TODO
};
};