commit 7e8af1e3d8c12d584016d1498fc4d26978e7fd54
parent e942a8e5fcbc642c352ca9371014da9ab5cc9613
Author: Sebastian <sebastian@sebsite.pw>
Date: Sun, 1 Dec 2024 20:23:22 -0500
regex: implement nested capture groups
Implements: https://todo.sr.ht/~sircmpwn/hare/697
Signed-off-by: Sebastian <sebastian@sebsite.pw>
Diffstat:
M | regex/+test.ha | | | 40 | +++++++++++++++++++--------------------- |
M | regex/regex.ha | | | 140 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ |
2 files changed, 116 insertions(+), 64 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -559,7 +559,7 @@ fn run_rawreplace_case(
(`a[bcd]+dcdcde`, "adcdcde", matchres::NOMATCH, 0, 0),
(`(ab|a)b*c`, "abc", matchres::MATCH, 0, -1),
(`[a-zA-Z_][a-zA-Z0-9_]*`, "alpha", matchres::MATCH, 0, -1),
- (`^a(bc+|b[eh])g|.h$`, "abh", matchres::MATCH, 0, -1),
+ (`^a(bc+|b[eh])g|.h$`, "abh", matchres::MATCH, 1, -1),
(`multiple words of text`, "uh-uh", matchres::NOMATCH, 0, 0),
(`multiple words`, "multiple words, yeah", matchres::MATCH, 0, 14),
(`(.*)c(.*)`, "abcde", matchres::MATCH, 0, -1),
@@ -608,26 +608,24 @@ fn run_rawreplace_case(
(`a|^b|^c|^d|e`, "xa", matchres::MATCH, 1, 2),
(`a|^b|^c|^d|e`, "xc", matchres::NOMATCH, 0, 0),
(`a|^b|^c|^d|e`, "xe", matchres::MATCH, 1, 2),
- // TODO: nested capture groups
- (`((a))`, "abc", matchres::ERROR, 0, -1),
- // (`((a))`, "abc", matchres::MATCH, 0, -1),
- // (`((a)(b)c)(d)`, "abcd", matchres::MATCH, 0, -1),
- // (`(bc+d$|ef*g.|h?i(j|k))`, "effgz", matchres::MATCH, 0, -1),
- // (`(bc+d$|ef*g.|h?i(j|k))`, "ij", matchres::MATCH, 0, -1),
- // (`(bc+d$|ef*g.|h?i(j|k))`, "effg", matchres::NOMATCH, 0, 0),
- // (`(bc+d$|ef*g.|h?i(j|k))`, "bcdd", matchres::NOMATCH, 0, 0),
- // (`(bc+d$|ef*g.|h?i(j|k))`, "reffgz", matchres::MATCH, 0, -1),
- // (`((((((((((a))))))))))`, "a", matchres::MATCH, 0, -1),
- // (`(((((((((a)))))))))`, "a", matchres::MATCH, 0, -1),
- // (`(([a-z]+):)?([a-z]+)$`, "smil", matchres::MATCH, 0, -1),
- // (`^((a)c)?(ab)$`, "ab", matchres::MATCH, 0, -1),
- // TODO: multiple simultaneous capture groups
- // (`(a+|b)*`, "ab", matchres::MATCH, 0, -1),
- // (`(a+|b){0,}`, "ab", matchres::MATCH, 0, -1),
- // (`(a+|b)+`, "ab", matchres::MATCH, 0, -1),
- // (`(a+|b){1,}`, "ab", matchres::MATCH, 0, -1),
- // (`(a+|b)?`, "ab", matchres::MATCH, 0, -1),
- // (`(a+|b){0,1}`, "ab", matchres::MATCH, 0, -1),
+ (`((a))`, "abc", matchres::MATCH, 0, 1),
+ (`((a)(b)c)(d)`, "abcd", matchres::MATCH, 0, -1),
+ // TODO: anchor in capture groups
+ //(`(bc+d$|ef*g.|h?i(j|k))`, "effgz", matchres::MATCH, 0, -1),
+ //(`(bc+d$|ef*g.|h?i(j|k))`, "ij", matchres::MATCH, 0, -1),
+ //(`(bc+d$|ef*g.|h?i(j|k))`, "effg", matchres::NOMATCH, 0, 0),
+ //(`(bc+d$|ef*g.|h?i(j|k))`, "bcdd", matchres::NOMATCH, 0, 0),
+ //(`(bc+d$|ef*g.|h?i(j|k))`, "reffgz", matchres::MATCH, 0, -1),
+ (`((((((((((a))))))))))`, "a", matchres::MATCH, 0, -1),
+ (`(((((((((a)))))))))`, "a", matchres::MATCH, 0, -1),
+ (`(([a-z]+):)?([a-z]+)$`, "smil", matchres::MATCH, 0, -1),
+ (`^((a)c)?(ab)$`, "ab", matchres::MATCH, 0, -1),
+ (`(a+|b)*`, "ab", matchres::MATCH, 0, -1),
+ (`(a+|b){0,}`, "ab", matchres::MATCH, 0, -1),
+ (`(a+|b)+`, "ab", matchres::MATCH, 0, -1),
+ (`(a+|b){1,}`, "ab", matchres::MATCH, 0, -1),
+ (`(a+|b)?`, "ab", matchres::MATCH, 0, 1),
+ (`(a+|b){0,1}`, "ab", matchres::MATCH, 0, 1),
// NOTE: character sequences not currently supported
// (`\0`, "\0", matchres::MATCH, 0, -1),
// (`[\0a]`, "\0", matchres::MATCH, 0, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -20,7 +20,7 @@ export type inst_lit = rune,
inst_jump = size,
inst_skip = void,
inst_match = bool,
- inst_groupstart = void,
+ inst_groupstart = size,
inst_groupend = void,
inst_repeat = struct {
id: size,
@@ -58,8 +58,6 @@ type thread = struct {
start_bytesize: size,
root_capture: capture,
captures: []capture,
- curr_capture: capture,
- curr_capture_inited: bool,
rep_counters: []size,
matched: bool,
failed: bool,
@@ -107,15 +105,37 @@ export fn finish(re: *regex) void = {
free(re.charsets);
};
-fn find_last_groupstart(insts: *[]inst) (size | error) = {
+fn find_last_groupstart(insts: []inst) (size | error) = {
+ let nested = 0u;
for (let i = len(insts); i > 0; i -= 1) {
- if (insts[i - 1] is inst_groupstart) {
- return i - 1;
+ match (insts[i - 1]) {
+ case inst_groupstart =>
+ if (nested == 0) {
+ return i - 1;
+ };
+ nested -= 1;
+ case inst_groupend =>
+ nested += 1;
+ case => void;
};
};
return `Unmatched ')'`: error;
};
+// increments all [[inst_jump]] and [[inst_split]] instructions to account for a
+// newly inserted instruction before the given slice
+fn shift(sl: []inst) void = {
+ for (let inst &.. sl) {
+ match (*inst) {
+ case let z: inst_jump =>
+ *inst = (z + 1): inst_jump;
+ case let z: inst_split =>
+ *inst = (z + 1): inst_split;
+ case => void;
+ };
+ };
+};
+
fn handle_bracket(
insts: *[]inst,
r: rune,
@@ -230,7 +250,8 @@ export fn compile(expr: str) (regex | error) = {
let is_charset_positive = true;
let was_prev_rune_pipe = false;
let n_reps = 0z;
- let n_groupstarts = 0;
+ let group_level = 0z;
+ let capture_idx = 0z;
for (true) {
const next = strings::next(&iter);
@@ -253,7 +274,7 @@ export fn compile(expr: str) (regex | error) = {
const r = match (next) {
case done =>
- if (n_groupstarts > 0) {
+ if (group_level > 0) {
return `Unmatched '('`: error;
};
break;
@@ -269,14 +290,14 @@ export fn compile(expr: str) (regex | error) = {
r_idx += 1;
};
case '^' =>
- if (n_groupstarts > 0) {
+ if (group_level > 0) {
return `Anchor '^' in capture groups is unsupported`: error;
};
if (!(r_idx == 0 || was_prev_rune_pipe)) {
return `Anchor '^' not at start of whole pattern or alternation`: error;
};
case '$' =>
- if (n_groupstarts > 0) {
+ if (group_level > 0) {
return `Anchor '$' in capture groups is unsupported`: error;
};
const peek1 = strings::next(&iter);
@@ -292,16 +313,14 @@ export fn compile(expr: str) (regex | error) = {
case ']' =>
append(insts, r: inst_lit);
case '(' =>
- if (n_groupstarts > 0) {
- return `Nested capture groups are unsupported`: error;
- };
- append(insts, inst_groupstart);
- n_groupstarts += 1;
+ append(insts, capture_idx: inst_groupstart);
+ group_level += 1;
+ capture_idx += 1;
case ')' =>
- if (n_groupstarts == 0) {
+ if (group_level == 0) {
return `Unmatched ')'`: error;
};
- n_groupstarts -= 1;
+ group_level -= 1;
append(insts, inst_groupend);
for (let jump_idx .. jump_idxs) {
assert(insts[jump_idx] is inst_jump);
@@ -310,7 +329,7 @@ export fn compile(expr: str) (regex | error) = {
delete(jump_idxs[..]);
case '|' =>
append(insts, types::SIZE_MAX: inst_jump);
- const origin = match (find_last_groupstart(&insts)) {
+ const origin = match (find_last_groupstart(insts)) {
case error =>
yield 0z;
case let sz: size =>
@@ -321,6 +340,7 @@ export fn compile(expr: str) (regex | error) = {
const split_idx = if (len(jump_idxs) > 0)
jump_idxs[len(jump_idxs) - 1] + 1 else origin;
insert(insts[split_idx], newinst);
+ shift(insts[split_idx + 1..]);
append(jump_idxs, len(insts) - 1);
// add skip if it's a whole-expression alternation
if (origin == 0) {
@@ -335,7 +355,7 @@ export fn compile(expr: str) (regex | error) = {
case '{' =>
let origin = len(insts) - 1;
if (insts[origin] is inst_groupend) {
- origin = find_last_groupstart(&insts)?;
+ origin = find_last_groupstart(insts[..origin])?;
};
const rest = strings::iterstr(&iter);
const rep_parts = parse_repetition(rest)?;
@@ -352,6 +372,7 @@ export fn compile(expr: str) (regex | error) = {
// advance to len(insts) + 2 to make space for the `inst_repeat`
insert(insts[origin],
len(insts) + 2: inst_split);
+ shift(insts[origin + 1..]);
origin += 1;
};
const newinst = inst_repeat {
@@ -374,7 +395,8 @@ export fn compile(expr: str) (regex | error) = {
match (insts[term_start_idx]) {
case (inst_lit | inst_charset | inst_any) => void;
case inst_groupend =>
- term_start_idx = find_last_groupstart(&insts)?;
+ term_start_idx = find_last_groupstart(
+ insts[..term_start_idx])?;
case inst_groupstart =>
return `Unused '?'`: error;
case =>
@@ -382,6 +404,7 @@ export fn compile(expr: str) (regex | error) = {
};
const after_idx = len(insts) + 1;
insert(insts[term_start_idx], after_idx: inst_split);
+ shift(insts[term_start_idx + 1..]);
case '*' =>
if (r_idx == 0 || len(insts) == 0) {
return `Unused '*'`: error;
@@ -393,7 +416,8 @@ export fn compile(expr: str) (regex | error) = {
match (insts[term_start_idx]) {
case (inst_lit | inst_charset | inst_any) => void;
case inst_groupend =>
- term_start_idx = find_last_groupstart(&insts)?;
+ term_start_idx = find_last_groupstart(
+ insts[..term_start_idx])?;
case inst_groupstart =>
return `Unused '*'`: error;
case =>
@@ -402,6 +426,7 @@ export fn compile(expr: str) (regex | error) = {
const split_idx = term_start_idx;
term_start_idx += new_inst_offset;
insert(insts[split_idx], after_idx: inst_split);
+ shift(insts[split_idx + 1..]);
append(insts, split_idx: inst_jump);
case '+' =>
if (r_idx == 0 || len(insts) == 0) {
@@ -411,7 +436,8 @@ export fn compile(expr: str) (regex | error) = {
match (insts[term_start_idx]) {
case (inst_lit | inst_charset | inst_any) => void;
case inst_groupend =>
- term_start_idx = find_last_groupstart(&insts)?;
+ term_start_idx = find_last_groupstart(
+ insts[..term_start_idx])?;
case inst_groupstart =>
return `Unused '+'`: error;
case =>
@@ -530,8 +556,6 @@ fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
pc = new_pc,
start_idx = threads[parent_idx].start_idx,
start_bytesize = threads[parent_idx].start_bytesize,
- curr_capture = threads[parent_idx].curr_capture,
- curr_capture_inited = threads[parent_idx].curr_capture_inited,
matched = threads[parent_idx].matched,
failed = threads[parent_idx].failed,
captures = alloc(threads[parent_idx].captures...),
@@ -587,23 +611,38 @@ fn run_thread(
};
threads[i].matched = true;
return newmatch;
- case inst_groupstart =>
- assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported");
- threads[i].curr_capture.start = str_idx;
- threads[i].curr_capture.start_bytesize = str_bytesize;
- threads[i].curr_capture_inited = true;
+ case let idx: inst_groupstart =>
+ if (idx >= len(threads[i].captures)) {
+ append(threads[i].captures,
+ [capture { ... }...],
+ idx - len(threads[i].captures) + 1);
+ };
+ assert(threads[i].captures[idx].end != types::SIZE_MAX);
+ threads[i].captures[idx] = capture {
+ content = "",
+ start = str_idx,
+ start_bytesize = str_bytesize,
+ // end=types::SIZE_MAX indicates that the
+ // capture group hasn't ended yet
+ end = types::SIZE_MAX,
+ end_bytesize = types::SIZE_MAX,
+ };
threads[i].pc += 1;
case inst_groupend =>
- assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
- threads[i].curr_capture.end = str_idx;
- threads[i].curr_capture.end_bytesize = str_bytesize;
- threads[i].curr_capture.content =
- strings::fromutf8_unsafe(str_bytes[
- threads[i].curr_capture.start_bytesize..
- threads[i].curr_capture.end_bytesize]);
- append(threads[i].captures, threads[i].curr_capture);
- threads[i].curr_capture = capture { ... };
- threads[i].curr_capture_inited = false;
+ let curr_capture = len(threads[i].captures);
+ for (curr_capture > 0; curr_capture -= 1) {
+ // find inner-most unclosed capture group
+ if (threads[i].captures[curr_capture - 1].end
+ == types::SIZE_MAX) {
+ break;
+ };
+ };
+ assert(curr_capture > 0, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
+ let capture = &threads[i].captures[curr_capture - 1];
+ capture.end = str_idx;
+ capture.end_bytesize = str_bytesize;
+ capture.content = strings::fromutf8_unsafe(str_bytes[
+ capture.start_bytesize..capture.end_bytesize]);
threads[i].pc += 1;
case let ir: inst_repeat =>
assert(ir.id < len(threads[i].rep_counters));
@@ -741,10 +780,25 @@ fn search(
best_n_captures = len(threads[i].captures);
};
};
- let res: []capture = alloc([],
- len(threads[best_idx].captures) + 1);
- append(res, threads[best_idx].root_capture);
- append(res, threads[best_idx].captures...);
+
+ // length = number of captures (index of final group +
+ // 1) + root capture
+ let length = 1z;
+ for (let i = len(re.insts); i > 0; i -= 1) {
+ match (re.insts[i - 1]) {
+ case let z: inst_groupstart =>
+ length = z + 2;
+ break;
+ case => void;
+ };
+ };
+ let res: result = alloc([], length);
+ static append(res, threads[best_idx].root_capture);
+ static append(res, threads[best_idx].captures...);
+ if (length != len(res)) {
+ static append(res, [capture { ... }...],
+ length - len(res));
+ };
return res;
};