hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 413a5e43a6bfa6f7d7e0fca27a367e92c0ede2da
parent d1cf40891bc75c036e9ce60ab5bc9cfad32baf74
Author: Max Schillinger <max@mxsr.de>
Date:   Sun, 14 Apr 2024 22:44:24 +0200

regex: implement whole-expression alternation

When a pattern contains a `|` outside of any capture group, treat that `|` as
the jump position of a whole-expression alternation.

Signed-off-by: Max Schillinger <max@mxsr.de>
Implements: https://todo.sr.ht/~sircmpwn/hare/695

Diffstat:
M regex/+test.ha | 7 ++++---
M regex/regex.ha | 14 +++++++++++++-
2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha
@@ -577,9 +577,10 @@ fn run_rawreplace_case(
 (`(a|ab)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
 (`(ab|a)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
 (`(ab|a)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
- // TODO: whole-expression alternation
- // (`ab|cd`, "abc", matchres::MATCH, 0, -1),
- // (`ab|cd`, "abcd", matchres::MATCH, 0, -1),
+ // whole-expression alternation
+ (`ab|cd`, "cd", matchres::MATCH, 0, 2),
+ (`ab|cd`, "abc", matchres::MATCH, 0, 2),
+ (`ab|cd`, "abcd", matchres::MATCH, 0, 2),
 // TODO: multiple alternation
 // (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
 // (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -300,7 +300,12 @@ export fn compile(expr: str) (regex | error) = {
 };
 case '|' =>
 append(insts, types::SIZE_MAX: inst_jump);
- const origin = find_last_groupstart(&insts)? + 1;
+ const origin = match (find_last_groupstart(&insts)) {
+ case error =>
+ yield 0z;
+ case let sz: size =>
+ yield sz + 1;
+ };
 const newinst = (len(insts) + 1): inst_split;
 insert(insts[origin], newinst);
 curr_alt_jump_idx = (len(insts) - 1): int;
@@ -394,6 +399,13 @@ export fn compile(expr: str) (regex | error) = {
 r_idx += 1;
 };
 
+ // handle whole expression alternation
+ if (curr_alt_jump_idx != -1) {
+ assert(insts[curr_alt_jump_idx] is inst_jump);
+ insts[curr_alt_jump_idx] = len(insts): inst_jump;
+ curr_alt_jump_idx = -1;
+ };
+
 append(insts, anchored: inst_match);
 
 return regex {