regex: rename "matchgroup" to "capture" - hare - [hare] The Hare programming language

commit be3ec0a58e15438edd82638608a6e5f11f1a1692
parent 18c3d7e05bb3ca4ce759450264116d671fcaf396
Author: Vlad-Stefan Harbuz <vlad@vladh.net>
Date:   Fri, 13 May 2022 16:41:19 +0100

regex: rename "matchgroup" to "capture"

Signed-off-by: Vlad-Stefan Harbuz <vlad@vladh.net>

Diffstat:
M regex/+test.ha  | 8 ++++----
M regex/README  | 41 ++++++++++++++++++++++-------------------
M regex/regex.ha  | 84 ++++++++++++++++++++++++++++++++++++++++----------------------------------------

3 files changed, 68 insertions(+), 65 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -41,7 +41,7 @@ fn run_find_case(
 				expr, string);
 		};
 
-	case let m: []matchgroup =>
+	case let m: []capture =>
 		if (expected == matchres::NOMATCH) {
 			fmt::fatalf("Expected expression /{}/ to not match string \"{}\", but it did",
 				expr, string);
@@ -51,11 +51,11 @@ fn run_find_case(
 				expr, string);
 		};
 		if (start: size != m[0].start) {
-			fmt::fatalf("Expected start of main match group to be {} but it was {}",
+			fmt::fatalf("Expected start of main capture to be {} but it was {}",
 				start, m[0].start);
 		};
 		if (end: size != m[0].end) {
-			fmt::fatalf("Expected end of main match group to be {} but it was {}",
+			fmt::fatalf("Expected end of main capture to be {} but it was {}",
 				end, m[0].end);
 		};
 
@@ -104,7 +104,7 @@ fn run_findall_case(
 				expr, string);
 		};
 
-	case let groupsets: [][]matchgroup =>
+	case let groupsets: [][]capture =>
 		if (expected == matchres::NOMATCH) {
 			fmt::fatalf("Expected expression /{}/ to not match string \"{}\", but it did",
 				expr, string);
diff --git a/regex/README b/regex/README
@@ -6,12 +6,13 @@ By default, matches will be found anywhere in the given string. The ^ and $
 characters can be used to anchor the match to the beginning or end of the
 string.
 
-find() returns a slice of [[matchgroup]]s for the first match. The first
-[[matchgroup]] represents the entire match, while the rest represent the
-submatches, specified in the expression using (parens).
+find() returns a slice of [[capture]]s for the first match. The first
+[[capture]] represents the entire matching string, while the rest represent the
+matching substrings for the subexpressions, specified in the regular expression
+using parentheses.
 
 findall() finds all non-overlapping matches in the given string and returns
-a slice of slices of [[matchgroup]]s.
+a slice of slices of [[capture]]s.
 
 This module implements the POSIX match disambiguation rules by returning
 the longest match among the leftmost matches.
@@ -22,26 +23,28 @@ the longest match among the leftmost matches.
 	const first_match = regex::find(&re, "Hello Hare, hello Hare.")!;
 	match (first_match) {
 	case void => void;
-	case let groups: []regex::matchgroup =>
-		defer free(groups);
-		// The match groups provide the content, start index and end
-		// index of the main match, as well as all submatches.
-		fmt::printfln("{} ({}, {})", groups[0].content,
-			groups[0].start,
-			groups[0].end)!;
+	case let captures: []regex::capture =>
+		defer free(captures);
+		// captures[0]: The full matching string.
+		// captures[1...]: A capture for every capture group.
+		fmt::printfln("{} ({}, {})", captures[0].content,
+			captures[0].start,
+			captures[0].end)!;
 	};
 
 	const all_matches = regex::findall(&re, "Hello Hare, hello Hare.")!;
 	match (all_matches) {
 	case void => void;
-	case let groupsets: [][]regex::matchgroup =>
-		defer regex::freeall(groupsets);
-		// A slice of multiple match group sets, which can be used
-		// similarly to the find() example.
-		for (let i = 0z; i < len(groupsets); i += 1) {
-			fmt::printfln("{} ({}, {})", groupsets[i][0].content,
-				groupsets[i][0].start,
-				groupsets[i][0].end)!;
+	case let matches: [][]regex::capture =>
+		defer regex::freeall(matches);
+		// matches[0]: All captures for the first match.
+		// matches[0][0]: The full matching string for the first match.
+		// matches[0][1...]: A capture for every capture group in the
+		//     first match.
+		for (let i = 0z; i < len(matches); i += 1) {
+			fmt::printfln("{} ({}, {})", matches[i][0].content,
+				matches[i][0].start,
+				matches[i][0].end)!;
 		};
 	};
 
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -31,7 +31,7 @@ export type inst = (inst_lit | inst_any | inst_split | inst_jump |
 	inst_repeat);
 
 // A (sub)match found as a result of matching a certain string against a regex.
-export type matchgroup = struct {
+export type capture = struct {
 	content: str,
 	start: size,
 	end: size,
@@ -40,10 +40,10 @@ export type matchgroup = struct {
 type thread = struct {
 	pc: size,
 	start_idx: size,
-	root_group: matchgroup,
-	groups: []matchgroup,
-	curr_group: matchgroup,
-	curr_group_inited: bool,
+	root_capture: capture,
+	captures: []capture,
+	curr_capture: capture,
+	curr_capture_inited: bool,
 	rep_counters: []size,
 	matched: bool,
 	failed: bool,
@@ -468,7 +468,7 @@ fn parse_repetition(
 };
 
 fn delete_thread(i: size, threads: *[]thread) void = {
-	free(threads[i].groups);
+	free(threads[i].captures);
 	free(threads[i].rep_counters);
 	delete(threads[i]);
 };
@@ -492,11 +492,11 @@ fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
 	append(threads, thread {
 		pc = new_pc,
 		start_idx = threads[parent_idx].start_idx,
-		curr_group = threads[parent_idx].curr_group,
-		curr_group_inited = threads[parent_idx].curr_group_inited,
+		curr_capture = threads[parent_idx].curr_capture,
+		curr_capture_inited = threads[parent_idx].curr_capture_inited,
 		matched = threads[parent_idx].matched,
 		failed = threads[parent_idx].failed,
-		groups = alloc(threads[parent_idx].groups...),
+		captures = alloc(threads[parent_idx].captures...),
 		rep_counters = alloc(threads[parent_idx].rep_counters...),
 		...
 	});
@@ -535,7 +535,7 @@ fn run_thread(
 				threads[i].failed = true;
 				return;
 			};
-			threads[i].root_group = matchgroup {
+			threads[i].root_capture = capture {
 				start = threads[i].start_idx,
 				end = str_idx: size,
 				// TODO: This is a perf issue for large strings
@@ -546,24 +546,24 @@ fn run_thread(
 			threads[i].matched = true;
 			return newmatch;
 		case inst_groupstart =>
-			if (threads[i].curr_group_inited) {
+			if (threads[i].curr_capture_inited) {
 				return "Found nested capture groups in expression, which are not supported": error;
 			};
-			threads[i].curr_group.start = str_idx: size;
-			threads[i].curr_group_inited = true;
+			threads[i].curr_capture.start = str_idx: size;
+			threads[i].curr_capture_inited = true;
 			threads[i].pc += 1;
 		case inst_groupend =>
-			if (!threads[i].curr_group_inited) {
+			if (!threads[i].curr_capture_inited) {
 				return `Found a groupend token ")" without having previously seen a groupstart token "("`: error;
 			};
-			threads[i].curr_group.end = str_idx: size;
+			threads[i].curr_capture.end = str_idx: size;
 			// TODO: This is a perf issue for large strings
-			threads[i].curr_group.content = strings::sub(string,
-				threads[i].curr_group.start,
-				threads[i].curr_group.end);
-			append(threads[i].groups, threads[i].curr_group);
-			threads[i].curr_group = matchgroup { ... };
-			threads[i].curr_group_inited = false;
+			threads[i].curr_capture.content = strings::sub(string,
+				threads[i].curr_capture.start,
+				threads[i].curr_capture.end);
+			append(threads[i].captures, threads[i].curr_capture);
+			threads[i].curr_capture = capture { ... };
+			threads[i].curr_capture_inited = false;
 			threads[i].pc += 1;
 		case let ir: inst_repeat =>
 			assert(ir.id < len(threads[i].rep_counters));
@@ -652,16 +652,16 @@ fn search(
 	string: str,
 	str_iter: *strings::iterator,
 	str_idx: *int
-) (void | []matchgroup | error) = {
+) (void | []capture | error) = {
 	let threads: []thread = alloc([
-		thread { groups = [], ... }
+		thread { captures = [], ... }
 	]);
 	if (re.n_reps > 0) {
 		threads[0].rep_counters = alloc([0...], re.n_reps);
 	};
 	defer {
 		for (let i = 0z; i < len(threads); i += 1) {
-			free(threads[i].groups);
+			free(threads[i].captures);
 			free(threads[i].rep_counters);
 		};
 		free(threads);
@@ -684,25 +684,25 @@ fn search(
 
 		if (all_matched) {
 			let best_len = 0z;
-			let best_n_groups = 0z;
+			let best_n_captures = 0z;
 			let best_idx = 0z;
 			for (let i = 0z; i < len(threads); i += 1) {
-				let match_len = threads[i].root_group.end
-					- threads[i].root_group.start;
+				let match_len = threads[i].root_capture.end
+					- threads[i].root_capture.start;
 				const is_better = match_len > best_len
 					|| match_len == best_len
-					&& len(threads[i].groups)
-					> best_n_groups;
+					&& len(threads[i].captures)
+					> best_n_captures;
 				if (is_better) {
 					best_len = match_len;
 					best_idx = i;
-					best_n_groups = len(threads[i].groups);
+					best_n_captures = len(threads[i].captures);
 				};
 			};
-			let res: []matchgroup = alloc([],
-				len(threads[best_idx].groups) + 1);
-			append(res, threads[best_idx].root_group);
-			append(res, threads[best_idx].groups...);
+			let res: []capture = alloc([],
+				len(threads[best_idx].captures) + 1);
+			append(res, threads[best_idx].root_capture);
+			append(res, threads[best_idx].captures...);
 			return res;
 		};
 
@@ -712,8 +712,8 @@ fn search(
 		for (let i = 0z; i < len(threads); i += 1) {
 			const res = run_thread(i, re, string, &threads,
 				r_or_end, *str_idx)?;
-			const matchlen = threads[i].root_group.end
-				- threads[i].root_group.start;
+			const matchlen = threads[i].root_capture.end
+				- threads[i].root_capture.start;
 			const is_better = res is newmatch && matchlen > 0
 				&& (first_match_idx is void
 					|| threads[i].start_idx
@@ -771,7 +771,7 @@ fn search(
 
 // Attempts to match a regular expression against a string and returns the
 // longest leftmost match, or void if there is no match.
-export fn find(re: *regex, string: str) (void | []matchgroup | error) = {
+export fn find(re: *regex, string: str) (void | []capture | error) = {
 	let str_idx = -1;
 	let str_iter = strings::iter(string);
 	return search(re, string, &str_iter, &str_idx);
@@ -779,14 +779,14 @@ export fn find(re: *regex, string: str) (void | []matchgroup | error) = {
 
 // Attempts to match a regular expression against a string and returns all
 // non-overlapping matches, or void if there are no matches.
-export fn findall(re: *regex, string: str) (void | [][]matchgroup | error) = {
-	let res: [][]matchgroup = [];
+export fn findall(re: *regex, string: str) (void | [][]capture | error) = {
+	let res: [][]capture = [];
 	let str_idx = -1;
 	let str_iter = strings::iter(string);
 	for (true) {
 		const findres = search(re, string, &str_iter, &str_idx)?;
 		match (findres) {
-		case let m: []matchgroup =>
+		case let m: []capture =>
 			append(res, m);
 			assert(str_idx: size >= m[0].end);
 			for (str_idx: size > m[0].end) {
@@ -805,8 +805,8 @@ export fn findall(re: *regex, string: str) (void | [][]matchgroup | error) = {
 	return res;
 };
 
-// Frees all the matches in a slice and the slice itself.
-export fn freeall(s: [][]matchgroup) void = {
+// Frees each match in a slice of matches, as well as the slice itself.
+export fn freeall(s: [][]capture) void = {
 	for (let i = 0z; i < len(s); i += 1) {
 		free(s[i]);
 	};

	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log | Files | Refs | README | LICENSE

M	regex/+test.ha	|	8	++++----
M	regex/README	|	41	++++++++++++++++++++++-------------------
M	regex/regex.ha	|	84	++++++++++++++++++++++++++++++++++++++++----------------------------------------