hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit bf1e316aced9e60aeb4d85bc38fa3158544eaf6c
parent 51d4feb9fa969343789246b5566f1b85ddbb6013
Author: Max Schillinger <max@mxsr.de>
Date:   Mon, 15 Apr 2024 17:47:28 +0200

regex: implement multiple alternation

Example: The regex pattern `(ha|py|sh)` is implemented using the
following Hare Regular Expression Engine Virtual Machine NFA
Representation:

 0 groupstart
 1 split → 5 (split)
 2 lit h
 3 lit a
 4 jump → 11 (groupend)
 5 split → 9 (s)
 6 lit p
 7 lit y
 8 jump → 11 (groupend)
 9 lit s
10 lit h
11 groupend

Implements: https://todo.sr.ht/~sircmpwn/hare/696
Signed-off-by: Max Schillinger <max@mxsr.de>

Diffstat:
Mregex/+test.ha | 6+++---
Mregex/regex.ha | 26++++++++++++++------------
2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -581,9 +581,9 @@ fn run_rawreplace_case( (`ab|cd`, "cd", matchres::MATCH, 0, 2), (`ab|cd`, "abc", matchres::MATCH, 0, 2), (`ab|cd`, "abcd", matchres::MATCH, 0, 2), - // TODO: multiple alternation - // (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1), - // (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1), + // multiple alternation + (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1), + (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1), // TODO: nested capture groups (`((a))`, "abc", matchres::ERROR, 0, -1), // (`((a))`, "abc", matchres::MATCH, 0, -1), diff --git a/regex/regex.ha b/regex/regex.ha @@ -223,7 +223,7 @@ export fn compile(expr: str) (regex | error) = { let iter = strings::iter(expr); let r_idx = 0z; let anchored = false; - let curr_alt_jump_idx = -1; + let jump_idxs: []size = []; let in_bracket = false; let skip_charclass_rest = false; let bracket_idx = -1; @@ -292,12 +292,11 @@ export fn compile(expr: str) (regex | error) = { }; n_groupstarts -= 1; append(insts, void: inst_groupend); - if (curr_alt_jump_idx != -1) { - assert(insts[curr_alt_jump_idx] is inst_jump); - insts[curr_alt_jump_idx] = - (len(insts) - 1): inst_jump; - curr_alt_jump_idx = -1; + for (let jump_idx .. jump_idxs) { + assert(insts[jump_idx] is inst_jump); + insts[jump_idx] = (len(insts) - 1): inst_jump; }; + jump_idxs = []; case '|' => append(insts, types::SIZE_MAX: inst_jump); const origin = match (find_last_groupstart(&insts)) { @@ -307,8 +306,11 @@ export fn compile(expr: str) (regex | error) = { yield sz + 1; }; const newinst = (len(insts) + 1): inst_split; - insert(insts[origin], newinst); - curr_alt_jump_idx = (len(insts) - 1): int; + // add split after last jump (if any) or at origin + const split_idx = if (len(jump_idxs) > 0) + jump_idxs[len(jump_idxs) - 1] + 1 else origin; + insert(insts[split_idx], newinst); + append(jump_idxs, len(insts) - 1); case '{' => let origin = len(insts) - 1; if (insts[origin] is inst_groupend) { @@ -400,11 +402,11 @@ export fn compile(expr: str) (regex | error) = { }; // handle whole expression alternation - if (curr_alt_jump_idx != -1) { - assert(insts[curr_alt_jump_idx] is inst_jump); - insts[curr_alt_jump_idx] = len(insts): inst_jump; - curr_alt_jump_idx = -1; + for (let jump_idx .. jump_idxs) { + assert(insts[jump_idx] is inst_jump); + insts[jump_idx] = len(insts): inst_jump; }; + jump_idxs = []; append(insts, anchored: inst_match);