hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit be3ec0a58e15438edd82638608a6e5f11f1a1692
parent 18c3d7e05bb3ca4ce759450264116d671fcaf396
Author: Vlad-Stefan Harbuz <vlad@vladh.net>
Date:   Fri, 13 May 2022 16:41:19 +0100

regex: rename "matchgroup" to "capture"

Signed-off-by: Vlad-Stefan Harbuz <vlad@vladh.net>

Diffstat:
M regex/+test.ha | 8 ++++----
M regex/README | 41 ++++++++++++++++++++++-------------------
M regex/regex.ha | 84 ++++++++++++++++++++++++++++++++++++++++----------------------------------------
3 files changed, 68 insertions(+), 65 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -41,7 +41,7 @@ fn run_find_case( expr, string); }; - case let m: []matchgroup => + case let m: []capture => if (expected == matchres::NOMATCH) { fmt::fatalf("Expected expression /{}/ to not match string \"{}\", but it did", expr, string); @@ -51,11 +51,11 @@ fn run_find_case( expr, string); }; if (start: size != m[0].start) { - fmt::fatalf("Expected start of main match group to be {} but it was {}", + fmt::fatalf("Expected start of main capture to be {} but it was {}", start, m[0].start); }; if (end: size != m[0].end) { - fmt::fatalf("Expected end of main match group to be {} but it was {}", + fmt::fatalf("Expected end of main capture to be {} but it was {}", end, m[0].end); }; @@ -104,7 +104,7 @@ fn run_findall_case( expr, string); }; - case let groupsets: [][]matchgroup => + case let groupsets: [][]capture => if (expected == matchres::NOMATCH) { fmt::fatalf("Expected expression /{}/ to not match string \"{}\", but it did", expr, string); diff --git a/regex/README b/regex/README @@ -6,12 +6,13 @@ By default, matches will be found anywhere in the given string. The ^ and $ characters can be used to anchor the match to the beginning or end of the string. -find() returns a slice of [[matchgroup]]s for the first match. The first -[[matchgroup]] represents the entire match, while the rest represent the -submatches, specified in the expression using (parens). +find() returns a slice of [[capture]]s for the first match. The first +[[capture]] represents the entire matching string, while the rest represent the +matching substrings for the subexpressions, specified in the regular expression +using parentheses. findall() finds all non-overlapping matches in the given string and returns -a slice of slices of [[matchgroup]]s. +a slice of slices of [[capture]]s. This module implements the POSIX match disambiguation rules by returning the longest match among the leftmost matches. 
@@ -22,26 +23,28 @@ the longest match among the leftmost matches. const first_match = regex::find(&re, "Hello Hare, hello Hare.")!; match (first_match) { case void => void; - case let groups: []regex::matchgroup => - defer free(groups); - // The match groups provide the content, start index and end - // index of the main match, as well as all submatches. - fmt::printfln("{} ({}, {})", groups[0].content, - groups[0].start, - groups[0].end)!; + case let captures: []regex::capture => + defer free(captures); + // captures[0]: The full matching string. + // captures[1...]: A capture for every capture group. + fmt::printfln("{} ({}, {})", captures[0].content, + captures[0].start, + captures[0].end)!; }; const all_matches = regex::findall(&re, "Hello Hare, hello Hare.")!; match (all_matches) { case void => void; - case let groupsets: [][]regex::matchgroup => - defer regex::freeall(groupsets); - // A slice of multiple match group sets, which can be used - // similarly to the find() example. - for (let i = 0z; i < len(groupsets); i += 1) { - fmt::printfln("{} ({}, {})", groupsets[i][0].content, - groupsets[i][0].start, - groupsets[i][0].end)!; + case let matches: [][]regex::capture => + defer regex::freeall(matches); + // matches[0]: All captures for the first match. + // matches[0][0]: The full matching string for the first match. + // matches[0][1...]: A capture for every capture group in the + // first match. + for (let i = 0z; i < len(matches); i += 1) { + fmt::printfln("{} ({}, {})", matches[i][0].content, + matches[i][0].start, + matches[i][0].end)!; }; }; diff --git a/regex/regex.ha b/regex/regex.ha @@ -31,7 +31,7 @@ export type inst = (inst_lit | inst_any | inst_split | inst_jump | inst_repeat); // A (sub)match found as a result of matching a certain string against a regex. 
-export type matchgroup = struct { +export type capture = struct { content: str, start: size, end: size, @@ -40,10 +40,10 @@ export type matchgroup = struct { type thread = struct { pc: size, start_idx: size, - root_group: matchgroup, - groups: []matchgroup, - curr_group: matchgroup, - curr_group_inited: bool, + root_capture: capture, + captures: []capture, + curr_capture: capture, + curr_capture_inited: bool, rep_counters: []size, matched: bool, failed: bool, @@ -468,7 +468,7 @@ fn parse_repetition( }; fn delete_thread(i: size, threads: *[]thread) void = { - free(threads[i].groups); + free(threads[i].captures); free(threads[i].rep_counters); delete(threads[i]); }; @@ -492,11 +492,11 @@ fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = { append(threads, thread { pc = new_pc, start_idx = threads[parent_idx].start_idx, - curr_group = threads[parent_idx].curr_group, - curr_group_inited = threads[parent_idx].curr_group_inited, + curr_capture = threads[parent_idx].curr_capture, + curr_capture_inited = threads[parent_idx].curr_capture_inited, matched = threads[parent_idx].matched, failed = threads[parent_idx].failed, - groups = alloc(threads[parent_idx].groups...), + captures = alloc(threads[parent_idx].captures...), rep_counters = alloc(threads[parent_idx].rep_counters...), ... 
}); @@ -535,7 +535,7 @@ fn run_thread( threads[i].failed = true; return; }; - threads[i].root_group = matchgroup { + threads[i].root_capture = capture { start = threads[i].start_idx, end = str_idx: size, // TODO: This is a perf issue for large strings @@ -546,24 +546,24 @@ fn run_thread( threads[i].matched = true; return newmatch; case inst_groupstart => - if (threads[i].curr_group_inited) { + if (threads[i].curr_capture_inited) { return "Found nested capture groups in expression, which are not supported": error; }; - threads[i].curr_group.start = str_idx: size; - threads[i].curr_group_inited = true; + threads[i].curr_capture.start = str_idx: size; + threads[i].curr_capture_inited = true; threads[i].pc += 1; case inst_groupend => - if (!threads[i].curr_group_inited) { + if (!threads[i].curr_capture_inited) { return `Found a groupend token ")" without having previously seen a groupstart token "("`: error; }; - threads[i].curr_group.end = str_idx: size; + threads[i].curr_capture.end = str_idx: size; // TODO: This is a perf issue for large strings - threads[i].curr_group.content = strings::sub(string, - threads[i].curr_group.start, - threads[i].curr_group.end); - append(threads[i].groups, threads[i].curr_group); - threads[i].curr_group = matchgroup { ... }; - threads[i].curr_group_inited = false; + threads[i].curr_capture.content = strings::sub(string, + threads[i].curr_capture.start, + threads[i].curr_capture.end); + append(threads[i].captures, threads[i].curr_capture); + threads[i].curr_capture = capture { ... }; + threads[i].curr_capture_inited = false; threads[i].pc += 1; case let ir: inst_repeat => assert(ir.id < len(threads[i].rep_counters)); @@ -652,16 +652,16 @@ fn search( string: str, str_iter: *strings::iterator, str_idx: *int -) (void | []matchgroup | error) = { +) (void | []capture | error) = { let threads: []thread = alloc([ - thread { groups = [], ... } + thread { captures = [], ... 
} ]); if (re.n_reps > 0) { threads[0].rep_counters = alloc([0...], re.n_reps); }; defer { for (let i = 0z; i < len(threads); i += 1) { - free(threads[i].groups); + free(threads[i].captures); free(threads[i].rep_counters); }; free(threads); @@ -684,25 +684,25 @@ fn search( if (all_matched) { let best_len = 0z; - let best_n_groups = 0z; + let best_n_captures = 0z; let best_idx = 0z; for (let i = 0z; i < len(threads); i += 1) { - let match_len = threads[i].root_group.end - - threads[i].root_group.start; + let match_len = threads[i].root_capture.end + - threads[i].root_capture.start; const is_better = match_len > best_len || match_len == best_len - && len(threads[i].groups) - > best_n_groups; + && len(threads[i].captures) + > best_n_captures; if (is_better) { best_len = match_len; best_idx = i; - best_n_groups = len(threads[i].groups); + best_n_captures = len(threads[i].captures); }; }; - let res: []matchgroup = alloc([], - len(threads[best_idx].groups) + 1); - append(res, threads[best_idx].root_group); - append(res, threads[best_idx].groups...); + let res: []capture = alloc([], + len(threads[best_idx].captures) + 1); + append(res, threads[best_idx].root_capture); + append(res, threads[best_idx].captures...); return res; }; @@ -712,8 +712,8 @@ fn search( for (let i = 0z; i < len(threads); i += 1) { const res = run_thread(i, re, string, &threads, r_or_end, *str_idx)?; - const matchlen = threads[i].root_group.end - - threads[i].root_group.start; + const matchlen = threads[i].root_capture.end + - threads[i].root_capture.start; const is_better = res is newmatch && matchlen > 0 && (first_match_idx is void || threads[i].start_idx @@ -771,7 +771,7 @@ fn search( // Attempts to match a regular expression against a string and returns the // longest leftmost match, or void if there is no match. 
-export fn find(re: *regex, string: str) (void | []matchgroup | error) = { +export fn find(re: *regex, string: str) (void | []capture | error) = { let str_idx = -1; let str_iter = strings::iter(string); return search(re, string, &str_iter, &str_idx); @@ -779,14 +779,14 @@ export fn find(re: *regex, string: str) (void | []matchgroup | error) = { // Attempts to match a regular expression against a string and returns all // non-overlapping matches, or void if there are no matches. -export fn findall(re: *regex, string: str) (void | [][]matchgroup | error) = { - let res: [][]matchgroup = []; +export fn findall(re: *regex, string: str) (void | [][]capture | error) = { + let res: [][]capture = []; let str_idx = -1; let str_iter = strings::iter(string); for (true) { const findres = search(re, string, &str_iter, &str_idx)?; match (findres) { - case let m: []matchgroup => + case let m: []capture => append(res, m); assert(str_idx: size >= m[0].end); for (str_idx: size > m[0].end) { @@ -805,8 +805,8 @@ export fn findall(re: *regex, string: str) (void | [][]matchgroup | error) = { return res; }; -// Frees all the matches in a slice and the slice itself. -export fn freeall(s: [][]matchgroup) void = { +// Frees each match in a slice of matches, as well as the slice itself. +export fn freeall(s: [][]capture) void = { for (let i = 0z; i < len(s); i += 1) { free(s[i]); };