commit 4b014b2b24b381bd8d264005e7b5a717595f36bd
parent acfd06899d3a8622da57627c7cf0b2785fd562ef
Author: Mallory Adams <malloryadams@fastmail.com>
Date: Thu, 16 May 2024 09:46:21 -0400
hare::lex: allow digit separators
This change brings hare::lex up to date with the current specification
and harec.
Signed-off-by: Mallory Adams <malloryadams@fastmail.com>
Diffstat:
M | hare/lex/+test.ha | | | 84 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- |
M | hare/lex/lex.ha | | | 154 | +++++++++++++++++++++++++++++++++++++++++++++++-------------------------------- |
2 files changed, 166 insertions(+), 72 deletions(-)
diff --git a/hare/lex/+test.ha b/hare/lex/+test.ha
@@ -303,6 +303,39 @@ fn loc(line: uint, col: uint) location = location {
lextest(in, expected);
};
+// Lexes numeric literals written with '_' digit separators: decimal, octal,
+// binary and hexadecimal integers (with and without type suffixes), decimal
+// floats, exponents, and a hexadecimal float. The expected token list is
+// handed to lextest together with the input.
+//
+// Each expected value is the same literal spelled without separators, and a
+// leading '-' in the input is expected as its own MINUS token ahead of the
+// literal.
+@test fn literals_underscores() void = {
+ const in = "1_0e5 -1_0i32 9_223_372_036_854_775_809 1_0e2z 2_55u8 0o4_2u16\n"
+ "0b100_0101u32 0xDE_AD_BE_EFu64 -0b1_0i8 -5_0e0i16 -0o1_6i32\n"
+ "0b0000_0010_0000_0110_0000_0111_0000_1111_0000_0010_0000_0110_0000_0111i64\n"
+ "1_3.3_7 1_3.3_7f32 1_3.3_7f64 6.0_2_2e23 1.6_1_6_2_5_5e-35f64 1_0e-1 0x0_1p-2";
+ // loc(line, col) below is the 1-based line and column at which each
+ // token starts within `in`; the separators count towards the column.
+ const expected: [_]token = [
+ (ltok::LIT_ICONST, 10e5u64, loc(1, 1)),
+ (ltok::MINUS, void, loc(1, 7)),
+ (ltok::LIT_I32, 10u64, loc(1, 8)),
+ (ltok::LIT_ICONST, 9223372036854775809u64, loc(1, 15)),
+ (ltok::LIT_SIZE, 10e2u64, loc(1, 41)),
+ (ltok::LIT_U8, 255u64, loc(1, 48)),
+ (ltok::LIT_U16, 0o42u64, loc(1, 55)),
+ (ltok::LIT_U32, 0b1000101u64, loc(2, 1)),
+ (ltok::LIT_U64, 0xDEADBEEFu64, loc(2, 15)),
+ (ltok::MINUS, void, loc(2, 32)),
+ (ltok::LIT_I8, 0b10u64, loc(2, 33)),
+ (ltok::MINUS, void, loc(2, 41)),
+ (ltok::LIT_I16, 50e0u64, loc(2, 42)),
+ (ltok::MINUS, void, loc(2, 51)),
+ (ltok::LIT_I32, 0o16u64, loc(2, 52)),
+ (ltok::LIT_I64, 0b00000010000001100000011100001111000000100000011000000111u64, loc(3, 1)),
+ (ltok::LIT_FCONST, 13.37, loc(4, 1)),
+ (ltok::LIT_F32, 13.37, loc(4, 9)),
+ (ltok::LIT_F64, 13.37, loc(4, 20)),
+ (ltok::LIT_FCONST, 6.022e23, loc(4, 31)),
+ (ltok::LIT_F64, 1.616255e-35, loc(4, 42)),
+ (ltok::LIT_FCONST, 10e-1, loc(4, 63)),
+ (ltok::LIT_FCONST, 0x1p-2, loc(4, 70)),
+ ];
+ lextest(in, expected);
+};
+
@test fn invalid() void = {
// Using \x80 within a string literal will cause this to output an
// empty string
@@ -316,16 +349,49 @@ fn loc(line: uint, col: uint) location = location {
const s = lex(&lexer) as error as syntax;
assert(s.1 == "Source file is not valid UTF-8");
- // Regression: invalid escape sequences such as "\^" used to casue a
- // crash
- let lexer = initstr(`"\^"`);
- const s = lex(&lexer) as error as syntax;
- assert(s.1 == "unknown escape sequence");
+ const invalid_tokens: [](str, str) = [
+ // Regression: invalid escape sequences such as "\^" used to
+ // cause a crash
+ (`"\^"`, "unknown escape sequence"),
- // Regression: <X>e followed by another token used to cause a crash
- let lexer = initstr("0e)");
- const s = lex(&lexer) as error as syntax;
- assert(s.1 == "expected exponent");
+ // Regression: <X>e followed by another token used to cause a
+ // crash
+ ("0e)", "expected exponent"),
+
+ // Invalid digit separators
+ ("1_", "Expected digit after separator"),
+ ("100_", "Expected digit after separator"),
+ ("1_000_", "Expected digit after separator"),
+ ("1__0", "Expected digit after separator"),
+ ("1__000_0", "Expected digit after separator"),
+ ("1_000__0", "Expected digit after separator"),
+ ("1___0", "Expected digit after separator"),
+ ("2e_8", "Exponents may not contain separators"),
+ ("2_e8", "Expected digit after separator"),
+ ("2e8_", "Exponents may not contain separators"),
+ ("3e1__1", "Exponents may not contain separators"),
+ ("2e+_5", "Exponents may not contain separators"),
+ ("2e_+5", "Exponents may not contain separators"),
+ ("0x_FFFF", "Expected integer literal"),
+ ("0b_1010", "Expected integer literal"),
+ ("0b1111_0000_", "Expected digit after separator"),
+ ("0o6__6", "Expected digit after separator"),
+ ("0_b1010", "Expected digit after separator"),
+ ("0_o77", "Expected digit after separator"),
+ ("0_xFF", "Expected digit after separator"),
+ ("2e1_6", "Exponents may not contain separators"),
+ ("0x2p1_0", "Exponents may not contain separators"),
+ ("2e-1_0", "Exponents may not contain separators"),
+ ("100u3_2", "Suffixes may not contain separators"),
+ ("100u32_", "Suffixes may not contain separators"),
+ ("100u_32", "Suffixes may not contain separators"),
+ ("100_u32", "Expected digit after separator"),
+ ];
+ for (const invalid_token .. invalid_tokens) {
+ let lexer = initstr(invalid_token.0);
+ const s = lex(&lexer) as error as syntax;
+ assert(s.1 == invalid_token.1);
+ };
};
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -447,90 +447,118 @@ fn lex_literal(lex: *lexer) (token | error) = {
let exp: (size | void) = void;
let end = 0z;
let float = false;
+ let last_rune_was_separator = false;
for (true) {
r = match (next(lex)?) {
case io::EOF =>
+ if (last_rune_was_separator) {
+ return syntaxerr(loc,
+ "Expected digit after separator");
+ };
break;
case let r: (rune, location) =>
yield r;
};
- if (!strings::contains(basechrs, r.0)) switch (r.0) {
- case '.' =>
- if (!started) {
+ if (!strings::contains(basechrs, r.0)) {
+ if (last_rune_was_separator) {
return syntaxerr(loc,
- "Expected integer literal");
+ "Expected digit after separator");
};
- if (float || exp is size || suff is size
- || lex.require_int) {
- unget(lex, r.0);
- break;
- } else {
- r = match (next(lex)?) {
- case io::EOF =>
- break;
- case let r: (rune, location) =>
- yield r;
+ switch (r.0) {
+ case '.' =>
+ if (!started) {
+ return syntaxerr(loc,
+ "Expected integer literal");
};
- if (!strings::contains(basechrs, r.0)) {
+ if (float || exp is size || suff is size
+ || lex.require_int) {
unget(lex, r.0);
- unget(lex, '.');
break;
+ } else {
+ r = match (next(lex)?) {
+ case io::EOF =>
+ break;
+ case let r: (rune, location) =>
+ yield r;
+ };
+ if (!strings::contains(basechrs, r.0)) {
+ unget(lex, r.0);
+ unget(lex, '.');
+ break;
+ };
+ unget(lex, r.0);
+ float = true;
+ append(chars, utf8::encoderune('.')...);
};
- unget(lex, r.0);
- float = true;
- append(chars, utf8::encoderune('.')...);
- };
- case 'e', 'E', 'p', 'P' =>
- if (!started) {
- return syntaxerr(loc,
- "Expected integer literal");
- };
- if ((r.0 == 'e' || r.0 == 'E') !=
- (base == strconv::base::DEC)) {
- unget(lex, r.0);
- break;
- };
- if (exp is size || suff is size) {
- unget(lex, r.0);
- break;
- } else {
- if (end == 0) end = len(chars);
- append(chars, utf8::encoderune(r.0)...);
- exp = len(chars);
- r = match (next(lex)?) {
- case io::EOF =>
+ case 'e', 'E', 'p', 'P' =>
+ if (!started) {
+ return syntaxerr(loc,
+ "Expected integer literal");
+ };
+ if ((r.0 == 'e' || r.0 == 'E') !=
+ (base == strconv::base::DEC)) {
+ unget(lex, r.0);
break;
- case let r: (rune, location) =>
- yield r;
};
- switch (r.0) {
- case '+', '-' =>
+ if (exp is size || suff is size) {
+ unget(lex, r.0);
+ break;
+ } else {
+ if (end == 0) end = len(chars);
append(chars, utf8::encoderune(r.0)...);
- case =>
+ exp = len(chars);
+ r = match (next(lex)?) {
+ case io::EOF =>
+ break;
+ case let r: (rune, location) =>
+ yield r;
+ };
+ switch (r.0) {
+ case '+', '-' =>
+ append(chars, utf8::encoderune(r.0)...);
+ case =>
+ unget(lex, r.0);
+ };
+ basechrs = "0123456789";
+ };
+ case 'i', 'u', 'f', 'z' =>
+ if (!started) {
+ return syntaxerr(loc,
+ "Expected integer literal");
+ };
+ if (suff is size || r.0 != 'f' && float
+ || r.0 == 'f'
+ && base != strconv::base::DEC) {
unget(lex, r.0);
+ break;
+ } else {
+ suff = len(chars);
+ if (end == 0) end = len(chars);
+ append(chars, utf8::encoderune(r.0)...);
+ basechrs = "0123456789";
};
- basechrs = "0123456789";
- };
- case 'i', 'u', 'f', 'z' =>
- if (!started) {
- return syntaxerr(loc,
- "Expected integer literal");
- };
- if (suff is size || r.0 != 'f' && float
- || r.0 == 'f'
- && base != strconv::base::DEC) {
+ case '_' =>
+ if (!started) {
+ return syntaxerr(loc,
+ "Expected integer literal");
+ };
+ if (exp is size) {
+ return syntaxerr(loc,
+ "Exponents may not contain separators");
+ };
+ if (suff is size) {
+ return syntaxerr(loc,
+ "Suffixes may not contain separators");
+ };
+ last_rune_was_separator = true;
+ case =>
unget(lex, r.0);
break;
- } else {
- suff = len(chars);
- if (end == 0) end = len(chars);
- append(chars, utf8::encoderune(r.0)...);
- basechrs = "0123456789";
};
- case =>
- unget(lex, r.0);
- break;
- } else append(chars, utf8::encoderune(r.0)...);
+ } else {
+ last_rune_was_separator = false;
+ append(chars, utf8::encoderune(r.0)...);
+ };
started = true;
};
if (!started) {