add regex - hare - The Hare programming language

commit 79e76b127206608c7c55f6b791bcd9844c9bdcdd
parent b3d613259690b701f50e2a03384d59304bc16bd7
Author: Vlad-Stefan Harbuz <vlad@vladh.net>
Date:   Fri,  8 Apr 2022 14:03:42 +0200

add regex

Signed-off-by: Vlad-Stefan Harbuz <vlad@vladh.net>

Diffstat:
A regex/+test.ha  | 598 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A regex/README  | 41 +++++++++++++++++++++++++++++++++++++++++
A regex/regex.ha  | 813 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M scripts/gen-stdlib  | 11 +++++++++++
M stdlib.mk  | 33 +++++++++++++++++++++++++++++++++

5 files changed, 1496 insertions(+), 0 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -0,0 +1,598 @@
+// License: MPL-2.0
+// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
+use fmt;
+use io;
+use os;
+use strings;
+
+def PERFTEST_ON: bool = false;
+type matchres = enum { MATCH, NOMATCH, ERROR };
+
+fn run_find_case(
+	expr: str,
+	string: str,
+	expected: matchres,
+	start: int,
+	end: int
+) void = {
+	const re = match (compile(expr)) {
+	case let re: regex => yield re;
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr, string);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr, string);
+		};
+		return;
+	};
+
+	match (find(re, string)) {
+	case void =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ failed to match, but should have errored",
+				expr, string);
+		};
+
+	case let m: []matchgroup =>
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ matched, but should have errored",
+				expr, string);
+		};
+		if (start: size != m[0].start) {
+			fmt::fatal("Expected start of main match group to be {} but it was {}",
+				start, m[0].start);
+		};
+		if (end: size != m[0].end) {
+			fmt::fatal("Expected end of main match group to be {} but it was {}",
+				end, m[0].end);
+		};
+
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr, string);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr, string);
+		};
+	};
+};
+
+fn run_findall_case(
+	expr: str,
+	string: str,
+	expected: matchres,
+	count: int
+) void = {
+	const re = match (compile(expr)) {
+	case let re: regex => yield re;
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr, string);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr, string);
+		};
+		return;
+	};
+
+	match (findall(re, string)) {
+	case void =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ failed to match, but should have errored",
+				expr, string);
+		};
+
+	case let groupsets: [][]matchgroup =>
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ matched, but should have errored",
+				expr, string);
+		};
+		if (count: size != len(groupsets)) {
+			fmt::fatal("Expected to find {} matches but found {}",
+				count, len(groupsets));
+		};
+
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr, string);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr, string);
+		};
+	};
+};
+
+@test fn find() void = {
+	const cases = [
+		// literals
+		(`^$`, "", matchres::MATCH, 0, 0),
+		(``, "", matchres::MATCH, 0, -1),
+		(`abcd`, "abcd", matchres::MATCH, 0, -1),
+		(`abc`, "abcd", matchres::MATCH, 0, 3),
+		(`bcd`, "abcd", matchres::MATCH, 1, 4),
+		(`^abc$`, "abc", matchres::MATCH, 0, -1),
+		(`^abc$`, "axc", matchres::NOMATCH, 0, -1),
+		// .
+		(`^.$`, "x", matchres::MATCH, 0, 1),
+		(`^.$`, "y", matchres::MATCH, 0, 1),
+		(`^.$`, "", matchres::NOMATCH, 0, 1),
+		// +
+		(`^a+$`, "a", matchres::MATCH, 0, 1),
+		(`^a+$`, "aaa", matchres::MATCH, 0, 3),
+		(`^a+$`, "", matchres::NOMATCH, 0, 0),
+		(`^(abc)+$`, "abc", matchres::MATCH, 0, 3),
+		(`^(abc)+$`, "abcabc", matchres::MATCH, 0, 6),
+		(`^(abc)+$`, "", matchres::NOMATCH, 0, 0),
+		// *
+		(`^a*$`, "", matchres::MATCH, 0, 0),
+		(`^a*$`, "aaaa", matchres::MATCH, 0, 4),
+		(`^a*$`, "b", matchres::NOMATCH, 0, 0),
+		(`^(abc)*$`, "", matchres::MATCH, 0, 0),
+		(`^(abc)*$`, "abc", matchres::MATCH, 0, 3),
+		(`^(abc)*$`, "abcabc", matchres::MATCH, 0, 6),
+		(`^(abc)*$`, "bbb", matchres::NOMATCH, 0, 3),
+		// ?
+		(`^a?$`, "", matchres::MATCH, 0, 0),
+		(`^a?$`, "a", matchres::MATCH, 0, 1),
+		(`^a?$`, "b", matchres::NOMATCH, 0, 0),
+		(`^(abc)?$`, "", matchres::MATCH, 0, 0),
+		(`^(abc)?$`, "abc", matchres::MATCH, 0, 3),
+		(`^(abc)?$`, "bbb", matchres::NOMATCH, 0, 0),
+		// ^ and $
+		(`^a*`, "aaaa", matchres::MATCH, 0, 4),
+		(`a*$`, "aaaa", matchres::MATCH, 0, 4),
+		(`^a*$`, "aaaa", matchres::MATCH, 0, 4),
+		(`a*`, "aaaa", matchres::MATCH, 0, 4),
+		(`b*`, "aaaabbbb", matchres::MATCH, 4, 8),
+		(`^b*`, "aaaabbbb", matchres::MATCH, 0, 0),
+		(`b*$`, "aaaabbbb", matchres::MATCH, 4, 8),
+		// (a|b)
+		(`^(cafe|b)x$`, "cafex", matchres::MATCH, 0, 5),
+		(`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2),
+		(`^(cafe|b)x$`, "XXXx", matchres::NOMATCH, 0, 0),
+		(`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2),
+		(
+			`^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+			"Jagdhaftpflichtversicherungsbetrag",
+			matchres::MATCH, 0, -1
+		),
+		(
+			`^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+			"Jagdhaftpflichtversicherungsbetrug",
+			matchres::NOMATCH, 0, -1
+		),
+		(
+			`^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+			"Jagdversicherungspolice",
+			matchres::NOMATCH, 0, -1
+		),
+		(`)`, "", matchres::ERROR, 0, 0),
+		// [abc]
+		(`^test[abc]$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[abc]$`, "testb", matchres::MATCH, 0, -1),
+		(`^test[abc]$`, "testc", matchres::MATCH, 0, -1),
+		(`^test[abc]$`, "testd", matchres::NOMATCH, 0, -1),
+		(`^test[abc]*$`, "test", matchres::MATCH, 0, -1),
+		(`^test[abc]*$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[abc]*$`, "testaaa", matchres::MATCH, 0, -1),
+		(`^test[abc]*$`, "testabc", matchres::MATCH, 0, -1),
+		(`^test[abc]?$`, "test", matchres::MATCH, 0, -1),
+		(`^test[abc]?$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[abc]+$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[abc]+$`, "test", matchres::NOMATCH, 0, -1),
+		(`^test[]abc]$`, "test]", matchres::MATCH, 0, -1),
+		(`^test[[abc]$`, "test[", matchres::MATCH, 0, -1),
+		(`^test[^abc]$`, "testd", matchres::MATCH, 0, -1),
+		(`^test[^abc]$`, "test!", matchres::MATCH, 0, -1),
+		(`^test[^abc]$`, "testa", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]$`, "testb", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]$`, "testc", matchres::NOMATCH, 0, -1),
+		(`^test[^]abc]$`, "test]", matchres::NOMATCH, 0, -1),
+		(`^test[^abc[]$`, "test[", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]*$`, "testd", matchres::MATCH, 0, -1),
+		(`^test[^abc]*$`, "testqqqqq", matchres::MATCH, 0, -1),
+		(`^test[^abc]*$`, "test", matchres::MATCH, 0, -1),
+		(`^test[^abc]*$`, "testc", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]?$`, "test", matchres::MATCH, 0, -1),
+		(`^test[^abc]?$`, "testd", matchres::MATCH, 0, -1),
+		(`^test[^abc]?$`, "testc", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]+$`, "testd", matchres::MATCH, 0, -1),
+		(`^test[^abc]+$`, "testddd", matchres::MATCH, 0, -1),
+		(`^test[^abc]+$`, "testc", matchres::NOMATCH, 0, -1),
+		(`^test[^abc]+$`, "testcccc", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[a-c]$`, "testb", matchres::MATCH, 0, -1),
+		(`^test[a-c]$`, "testc", matchres::MATCH, 0, -1),
+		(`^test[a-c]$`, "testd", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]$`, "test!", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]$`, "test-", matchres::NOMATCH, 0, -1),
+		(`^test[-a-c]$`, "test-", matchres::MATCH, 0, -1),
+		(`^test[a-c-]$`, "test-", matchres::MATCH, 0, -1),
+		(`^test[a-c]*$`, "test", matchres::MATCH, 0, -1),
+		(`^test[a-c]*$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[a-c]*$`, "testabb", matchres::MATCH, 0, -1),
+		(`^test[a-c]*$`, "testddd", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]?$`, "test", matchres::MATCH, 0, -1),
+		(`^test[a-c]?$`, "testb", matchres::MATCH, 0, -1),
+		(`^test[a-c]?$`, "testd", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]+$`, "test", matchres::NOMATCH, 0, -1),
+		(`^test[a-c]+$`, "testbcbc", matchres::MATCH, 0, -1),
+		(`^test[a-c]+$`, "testd", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c]$`, "testa", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c]$`, "testb", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c]$`, "testc", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c]$`, "testd", matchres::MATCH, 0, -1),
+		(`^test[^a-c]$`, "test!", matchres::MATCH, 0, -1),
+		(`^test[^a-c]$`, "test-", matchres::MATCH, 0, -1),
+		(`^test[^-a-c]$`, "test-", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]$`, "test-", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]*$`, "test", matchres::MATCH, 0, -1),
+		(`^test[^a-c-]*$`, "test--", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]*$`, "testq", matchres::MATCH, 0, -1),
+		(`^test[^a-c-]?$`, "test", matchres::MATCH, 0, -1),
+		(`^test[^a-c-]?$`, "testq", matchres::MATCH, 0, -1),
+		(`^test[^a-c-]?$`, "test-", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]+$`, "test", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]+$`, "testb", matchres::NOMATCH, 0, -1),
+		(`^test[^a-c-]+$`, "testddd", matchres::MATCH, 0, -1),
+		(`([a-z][a-z0-9]*,)+`, "a5,b7,c9,", matchres::MATCH, 0, -1),
+		// [:alpha:] etc.
+		(`^test[[:alnum:]]+$`, "testaA1", matchres::MATCH, 0, -1),
+		(`^test[[:alnum:]]+$`, "testa_1", matchres::NOMATCH, 0, -1),
+		(`^test[[:alpha:]]+$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[[:alpha:]]+$`, "testa1", matchres::NOMATCH, 0, -1),
+		(`^test[[:blank:]]+$`, "testa", matchres::NOMATCH, 0, -1),
+		(`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1),
+		(`^test[^[:blank:]]+$`, "testx", matchres::MATCH, 0, -1),
+		(`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1),
+		(`^test[^[:cntrl:]]+$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[[:digit:]]$`, "test1", matchres::MATCH, 0, -1),
+		(`^test[[:digit:]]$`, "testa", matchres::NOMATCH, 0, -1),
+		(`^test[[:graph:]]+$`, "test\t", matchres::NOMATCH, 0, -1),
+		(`^test[[:lower:]]+$`, "testa", matchres::MATCH, 0, -1),
+		(`^test[[:lower:]]+$`, "testA", matchres::NOMATCH, 0, -1),
+		(`^test[[:print:]]+$`, "test\t", matchres::NOMATCH, 0, -1),
+		(`^test[[:punct:]]+$`, "testA", matchres::NOMATCH, 0, -1),
+		(`^test[[:punct:]]+$`, "test!", matchres::MATCH, 0, -1),
+		(`^test[[:space:]]+$`, "test ", matchres::MATCH, 0, -1),
+		(`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1),
+		(`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1),
+		(`^test[[:word:]]+$`, "test!2", matchres::NOMATCH, 0, -1),
+		(`^test[[:word:]]+$`, "test_2", matchres::MATCH, 0, -1),
+		(`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1),
+		// [:alpha:] etc. plus extra characters
+		(`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1),
+		(`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1),
+		(`^test[[:alnum:]!]+$`, "testa!1", matchres::MATCH, 0, -1),
+		(`^test[@[:alnum:]!]+$`, "testa!@1", matchres::MATCH, 0, -1),
+		// Escaped characters such as \+
+		(`^a\+b$`, "a+b", matchres::MATCH, 0, -1),
+		(`^a\?b$`, "a?b", matchres::MATCH, 0, -1),
+		(`^a\*b$`, "a*b", matchres::MATCH, 0, -1),
+		(`^a\^b$`, "a^b", matchres::MATCH, 0, -1),
+		(`^a\$b$`, "a$b", matchres::MATCH, 0, -1),
+		(`^a\[b$`, "a[b", matchres::MATCH, 0, -1),
+		(`^a\]b$`, "a]b", matchres::MATCH, 0, -1),
+		(`^a\(b$`, "a(b", matchres::MATCH, 0, -1),
+		(`^a\)b$`, "a)b", matchres::MATCH, 0, -1),
+		(`^a\|b$`, "a|b", matchres::MATCH, 0, -1),
+		(`^a\.b$`, "a.b", matchres::MATCH, 0, -1),
+		(`^a\\b$`, "a\\b", matchres::MATCH, 0, -1),
+		// {m,n}
+		(`^x(abc){1,2}$`, "xabc", matchres::MATCH, 0, -1),
+		(`^x(abc){1,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+		(`^x(abc){1,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+		(`^x(abc){,2}$`, "xabc", matchres::MATCH, 0, -1),
+		(`^x(abc){,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+		(`^x(abc){,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+		(`^x(abc){1,}$`, "xabc", matchres::MATCH, 0, -1),
+		(`^x(abc){1,}$`, "xabcabc", matchres::MATCH, 0, -1),
+		(`^x(abc){3,}$`, "xabcabc", matchres::NOMATCH, 0, -1),
+		(`^x(abc){3,}$`, "xabcabcabc", matchres::MATCH, 0, -1),
+		(`^x(abc){2,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+		(`^x(abc){2,2}$`, "xabc", matchres::NOMATCH, 0, -1),
+		(`^x(abc){2,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+		(`^x(abc){-1,2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+		(`^x(abc){x,2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+		(`^x(abc){0,-2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+		// various
+		(
+			`^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`,
+			"X1024facefacecaaaaafffcafebabebabe",
+			matchres::MATCH, 0, -1,
+		),
+		(
+			`.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`,
+			"X1024facefacecaaaaafffcafebabebabe",
+			matchres::MATCH, 0, -1,
+		),
+		(
+			`^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`,
+			"1024facefacecaaaaafffcafebabebabe",
+			matchres::NOMATCH, 0, 0,
+		),
+		(
+			`.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`,
+			"1024facefacecaaaaafffcafebabebabe",
+			matchres::MATCH, 3, -1,
+		),
+		(
+			`^([a-zA-Z]{1,2}[[:digit:]]{1,2})[[:space:]]*([[:digit:]][a-zA-Z]{2})$`,
+			"M15 4QN",
+			matchres::MATCH, 0, -1
+		),
+		// tests from perl
+		(`abc`, "abc", matchres::MATCH, 0, -1),
+		(`abc`, "xbc", matchres::NOMATCH, 0, 0),
+		(`abc`, "axc", matchres::NOMATCH, 0, 0),
+		(`abc`, "abx", matchres::NOMATCH, 0, 0),
+		(`abc`, "xabcy", matchres::MATCH, 1, 4),
+		(`abc`, "ababc", matchres::MATCH, 2, -1),
+		(`ab*c`, "abc", matchres::MATCH, 0, -1),
+		(`ab*bc`, "abc", matchres::MATCH, 0, -1),
+		(`ab*bc`, "abbc", matchres::MATCH, 0, -1),
+		(`ab*bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab{0,}bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab+bc`, "abbc", matchres::MATCH, 0, -1),
+		(`ab+bc`, "abc", matchres::NOMATCH, 0, 0),
+		(`ab+bc`, "abq", matchres::NOMATCH, 0, 0),
+		(`ab{1,}bc`, "abq", matchres::NOMATCH, 0, 0),
+		(`ab+bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab{1,}bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab{1,3}bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab{3,4}bc`, "abbbbc", matchres::MATCH, 0, -1),
+		(`ab{4,5}bc`, "abbbbc", matchres::NOMATCH, 0, 0),
+		(`ab?bc`, "abbc", matchres::MATCH, 0, -1),
+		(`ab?bc`, "abc", matchres::MATCH, 0, -1),
+		(`ab{0,1}bc`, "abc", matchres::MATCH, 0, -1),
+		(`ab?bc`, "abbbbc", matchres::NOMATCH, 0, 0),
+		(`ab?c`, "abc", matchres::MATCH, 0, -1),
+		(`ab{0,1}c`, "abc", matchres::MATCH, 0, -1),
+		(`^abc$`, "abc", matchres::MATCH, 0, -1),
+		(`^abc$`, "abcc", matchres::NOMATCH, 0, 0),
+		(`^abc`, "abcc", matchres::MATCH, 0, 3),
+		(`^abc$`, "aabc", matchres::NOMATCH, 0, 0),
+		(`abc$`, "aabc", matchres::MATCH, 1, -1),
+		(`^`, "abc", matchres::MATCH, 0, 0),
+		(`$`, "abc", matchres::MATCH, 3, 3),
+		(`a.c`, "abc", matchres::MATCH, 0, -1),
+		(`a.c`, "axc", matchres::MATCH, 0, -1),
+		(`a.*c`, "axyzc", matchres::MATCH, 0, -1),
+		(`a.*c`, "axyzd", matchres::NOMATCH, 0, 0),
+		(`a[bc]d`, "abc", matchres::NOMATCH, 0, 0),
+		(`a[bc]d`, "abd", matchres::MATCH, 0, -1),
+		(`a[b-d]e`, "abd", matchres::NOMATCH, 0, 0),
+		(`a[b-d]e`, "ace", matchres::MATCH, 0, -1),
+		(`a[b-d]`, "aac", matchres::MATCH, 1, -1),
+		(`a[-b]`, "a-", matchres::MATCH, 0, -1),
+		(`a[b-]`, "a-", matchres::MATCH, 0, -1),
+		(`a[b-a]`, "-", matchres::ERROR, 0, 0),
+		(`a[]b`, "-", matchres::ERROR, 0, 0),
+		(`a[`, "-", matchres::ERROR, 0, 0),
+		(`a]`, "a]", matchres::MATCH, 0, -1),
+		(`a[]]b`, "a]b", matchres::MATCH, 0, -1),
+		(`a[^bc]d`, "aed", matchres::MATCH, 0, -1),
+		(`a[^bc]d`, "abd", matchres::NOMATCH, 0, 0),
+		(`a[^-b]c`, "adc", matchres::MATCH, 0, -1),
+		(`a[^-b]c`, "a-c", matchres::NOMATCH, 0, 0),
+		(`a[^]b]c`, "a]c", matchres::NOMATCH, 0, 0),
+		(`a[^]b]c`, "adc", matchres::MATCH, 0, -1),
+		(`()ef`, "def", matchres::MATCH, 1, -1),
+		(`*a`, "-", matchres::ERROR, 0, 0),
+		(`(*)b`, "-", matchres::ERROR, 0, 0),
+		(`$b`, "b", matchres::ERROR, 0, 0),
+		(`a\`, "-", matchres::ERROR, 0, 0),
+		(`a\(b`, "a(b", matchres::MATCH, 0, -1),
+		(`a\(*b`, "ab", matchres::MATCH, 0, -1),
+		(`a\(*b`, "a((b", matchres::MATCH, 0, -1),
+		(`a\\b`, `a\b`, matchres::MATCH, 0, -1),
+		(`abc)`, "-", matchres::ERROR, 0, 0),
+		(`(abc`, "-", matchres::ERROR, 0, 0),
+		(`(a)b(c)`, "abc", matchres::MATCH, 0, -1),
+		(`a+b+c`, "aabbabc", matchres::MATCH, 4, -1),
+		(`a{1,}b{1,}c`, "aabbabc", matchres::MATCH, 4, -1),
+		(`a**`, "-", matchres::ERROR, 0, 0),
+		(`)(`, "-", matchres::ERROR, 0, 0),
+		(`[^ab]*`, "cde", matchres::MATCH, 0, -1),
+		(`abc`, "", matchres::NOMATCH, 0, 0),
+		(`a*`, "", matchres::MATCH, 0, -1),
+		(`([abc])*d`, "abbbcd", matchres::MATCH, 0, -1),
+		(`([abc])*bcd`, "abcd", matchres::MATCH, 0, -1),
+		(`abcd*efg`, "abcdefg", matchres::MATCH, 0, -1),
+		(`ab*`, "xabyabbbz", matchres::MATCH, 1, 3),
+		(`ab*`, "xayabbbz", matchres::MATCH, 1, 2),
+		(`(ab|cd)e`, "abcde", matchres::MATCH, 2, -1),
+		(`[abhgefdc]ij`, "hij", matchres::MATCH, 0, -1),
+		(`^(ab|cd)e`, "abcde", matchres::NOMATCH, 0, 0),
+		(`(abc|)ef`, "abcdef", matchres::MATCH, 4, -1),
+		(`(a|b)c*d`, "abcd", matchres::MATCH, 1, -1),
+		(`(ab|ab*)bc`, "abc", matchres::MATCH, 0, -1),
+		(`a([bc]*)c*`, "abc", matchres::MATCH, 0, -1),
+		(`a([bc]*)(c*d)`, "abcd", matchres::MATCH, 0, -1),
+		(`a([bc]+)(c*d)`, "abcd", matchres::MATCH, 0, -1),
+		(`a([bc]*)(c+d)`, "abcd", matchres::MATCH, 0, -1),
+		(`a[bcd]*dcdcde`, "adcdcde", matchres::MATCH, 0, -1),
+		(`a[bcd]+dcdcde`, "adcdcde", matchres::NOMATCH, 0, 0),
+		(`(ab|a)b*c`, "abc", matchres::MATCH, 0, -1),
+		(`[a-zA-Z_][a-zA-Z0-9_]*`, "alpha", matchres::MATCH, 0, -1),
+		(`^a(bc+|b[eh])g|.h$`, "abh", matchres::MATCH, 0, -1),
+		(`multiple words of text`, "uh-uh", matchres::NOMATCH, 0, 0),
+		(`multiple words`, "multiple words, yeah", matchres::MATCH, 0, 14),
+		(`(.*)c(.*)`, "abcde", matchres::MATCH, 0, -1),
+		(`\((.*), (.*)\)`, "(a, b)", matchres::MATCH, 0, -1),
+		(`[k]`, "ab", matchres::NOMATCH, 0, 0),
+		(`a[-]?c`, "ac", matchres::MATCH, 0, -1),
+		(`.*d`, "abc\nabd", matchres::MATCH, 0, -1),
+		(`(`, "", matchres::ERROR, 0, 0),
+		(`(x?)?`, "x", matchres::MATCH, 0, -1),
+		(`^*`, "", matchres::ERROR, 0, 0),
+		// Submatch handling
+		(`(a|ab)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(a|ab)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(ab|a)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(ab|a)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+		(`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+		(`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+		(`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+		(`(a|ab)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(a|ab)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(ab|a)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		(`(ab|a)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+		// TODO: whole-expression alternation
+		// (`ab|cd`, "abc", matchres::MATCH, 0, -1),
+		// (`ab|cd`, "abcd", matchres::MATCH, 0, -1),
+		// TODO: multiple alternation
+		// (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
+		// (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
+		// TODO: nested capture groups
+		// (`((a))`, "abc", matchres::MATCH, 0, -1),
+		// (`((a)(b)c)(d)`, "abcd", matchres::MATCH, 0, -1),
+		// (`(bc+d$|ef*g.|h?i(j|k))`, "effgz", matchres::MATCH, 0, -1),
+		// (`(bc+d$|ef*g.|h?i(j|k))`, "ij", matchres::MATCH, 0, -1),
+		// (`(bc+d$|ef*g.|h?i(j|k))`, "effg", matchres::NOMATCH, 0, 0),
+		// (`(bc+d$|ef*g.|h?i(j|k))`, "bcdd", matchres::NOMATCH, 0, 0),
+		// (`(bc+d$|ef*g.|h?i(j|k))`, "reffgz", matchres::MATCH, 0, -1),
+		// (`((((((((((a))))))))))`, "a", matchres::MATCH, 0, -1),
+		// (`(((((((((a)))))))))`, "a", matchres::MATCH, 0, -1),
+		// (`(([a-z]+):)?([a-z]+)$`, "smil", matchres::MATCH, 0, -1),
+		// (`^((a)c)?(ab)$`, "ab", matchres::MATCH, 0, -1),
+		// TODO: multiple simultaneous capture groups
+		// (`(a+|b)*`, "ab", matchres::MATCH, 0, -1),
+		// (`(a+|b){0,}`, "ab", matchres::MATCH, 0, -1),
+		// (`(a+|b)+`, "ab", matchres::MATCH, 0, -1),
+		// (`(a+|b){1,}`, "ab", matchres::MATCH, 0, -1),
+		// (`(a+|b)?`, "ab", matchres::MATCH, 0, -1),
+		// (`(a+|b){0,1}`, "ab", matchres::MATCH, 0, -1),
+		// NOTE: character sequences not currently supported
+		// (`\0`, "\0", matchres::MATCH, 0, -1),
+		// (`[\0a]`, "\0", matchres::MATCH, 0, -1),
+		// (`[a\0]`, "\0", matchres::MATCH, 0, -1),
+		// (`[^a\0]`, "\0", matchres::NOMATCH, 0, 0),
+		// NOTE: octal sequences not currently supported
+		// (`[\1]`, "\1", matchres::MATCH, 0, -1),
+		// (`\09`, "\0(separate-me)9", matchres::MATCH, 0, -1),
+		// (`\141`, "a", matchres::MATCH, 0, -1),
+		// (`[\41]`, "!", matchres::MATCH, 0, -1),
+		// NOTE: hex sequences not currently supported
+		// (`\xff`, "\377", matchres::MATCH, 0, -1),
+		// NOTE: non-greedy matching not currently supported
+		// (`a.+?c`, "abcabc", matchres::MATCH, 0, -1),
+		// (`.*?\S *:`, "xx:", matchres::MATCH, 0, -1),
+		// (`a[ ]*?\ (\d+).*`, "a   10", matchres::MATCH, 0, -1),
+		// (`a[ ]*?\ (\d+).*`, "a    10", matchres::MATCH, 0, -1),
+		// (`"(\\"|[^"])*?"`, `"\""`, matchres::MATCH, 0, -1),
+		// (`^.*?$`, "one\ntwo\nthree\n", matchres::NOMATCH, 0, 0),
+		// (`a[^>]*?b`, "a>b", matchres::NOMATCH, 0, 0),
+		// (`^a*?$`, "foo", matchres::NOMATCH, 0, 0),
+		// (`^([ab]*?)(?=(b)?)c`, "abc", matchres::MATCH, 0, -1),
+		// (`^([ab]*?)(?!(b))c`, "abc", matchres::MATCH, 0, -1),
+		// (`^([ab]*?)(?<!(a))c`, "abc", matchres::MATCH, 0, -1),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const expr = cases[i].0;
+		const string = cases[i].1;
+		const should_match = cases[i].2;
+		const start = cases[i].3;
+		const end = if (cases[i].4 == -1) {
+			yield len(string): int;
+		} else {
+			yield cases[i].4;
+		};
+		run_find_case(expr, string, should_match, start, end);
+	};
+
+	if (PERFTEST_ON) {
+		const f = os::open("compress/zlib/data+test.ha")!;
+		const buf: [2000000]u8 = [0...];
+		io::read(f, buf)!;
+		const test_data = strings::fromutf8(buf);
+
+		const perf_cases = [
+			(
+				`^a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*aaaaaaaaaaaaaaaa$`,
+				"aaaaaaaaaaaaaaaa",
+				matchres::MATCH,
+				0, -1
+			),
+			(`.*.*.*rand_out`, test_data, matchres::MATCH, 0, 1211255),
+			(`0x`, test_data, matchres::MATCH, 86, 88),
+		];
+
+		for (let i = 0z; i < len(perf_cases); i += 1) {
+			const expr = perf_cases[i].0;
+			const string = perf_cases[i].1;
+			const should_match = perf_cases[i].2;
+			const start = perf_cases[i].3;
+			const end = if (perf_cases[i].4 == -1) {
+				yield len(string): int;
+			} else {
+				yield perf_cases[i].4;
+			};
+			run_find_case(expr, string, should_match, start, end);
+		};
+	};
+};
+
+@test fn findall() void = {
+	const cases = [
+		(`ab.`, "hello abc and abz test abq thanks", matchres::MATCH, 3),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const expr = cases[i].0;
+		const string = cases[i].1;
+		const should_match = cases[i].2;
+		const count = cases[i].3;
+		run_findall_case(expr, string, should_match, count);
+	};
+
+	if (PERFTEST_ON) {
+		const f = os::open("compress/zlib/data+test.ha")!;
+		const buf: [2000000]u8 = [0...];
+		io::read(f, buf)!;
+		const test_data = strings::fromutf8(buf);
+
+		const perf_cases = [
+			(`0x`, test_data, matchres::MATCH, 199053),
+		];
+
+		for (let i = 0z; i < len(perf_cases); i += 1) {
+			const expr = perf_cases[i].0;
+			const string = perf_cases[i].1;
+			const should_match = perf_cases[i].2;
+			const count = perf_cases[i].3;
+			run_findall_case(expr, string, should_match, count);
+		};
+	};
+};
diff --git a/regex/README b/regex/README
@@ -0,0 +1,41 @@
+This is a NFA-based regex implementation. It closely adheres to the POSIX
+Extended Regular Expressions specification [0]. Matching is guaranteed to run in
+linear time, and various optimisations have been implemented to ensure good
+performance on most inputs.
+
+By default, matches will be found anywhere in the given string. The ^ and $
+characters can be used to anchor the match to the beginning or end of the
+string.
+
+find() returns a slice of [[regex::matchgroup]]s for the first match. The
+first [[regex::matchgroup]] represents the entire match, while the rest
+represent the submatches, specified in the expression using (parens).
+
+findall() finds all non-overlapping matches in the given string and returns
+a slice of slices of [[regex::matchgroup]]s.
+
+This module implements the POSIX match disambiguation rules by returning
+the longest match among the leftmost matches.
+
+	const re = regex::compile(`[Hh]are`)!;
+	defer regex::regex_free(re);
+
+	const first_match = regex::find(re, "Hello Hare, hello Hare.")!;
+	match (first_match) {
+	case void => void;
+	case let groups: []regex::matchgroup =>
+		defer free(groups);
+		// The match groups provide the content, start index and end index of
+		// the main match, as well as all submatches.
+	};
+
+	const all_matches = regex::findall(re, "Hello hare, hello hare.")!;
+	match (all_matches) {
+	case void => void;
+	case let groupsets: [][]regex::matchgroup =>
+		defer regex::freeall(groupsets);
+		// A slice of multiple match group sets, which can be used similarly
+		// to the find() example.
+	};
+
+[0]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -0,0 +1,813 @@
+// License: MPL-2.0
+// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
+use ascii;
+use encoding::utf8;
+use errors;
+use strconv;
+use strings;
+
+// A string describing the error the occurred.
+export type error = !str;
+
+export type match_type = enum { ANCHORED, FLOATING };
+export type inst_lit = rune,
+	inst_charset = struct { idx: size, is_positive: bool },
+	inst_any = void,
+	inst_split = size,
+	inst_jump = size,
+	inst_skip = void,
+	inst_match = match_type,
+	inst_groupstart = void,
+	inst_groupend = void,
+	inst_repeat = struct {
+		id: size,
+		origin: size,
+		min: (void | size),
+		max: (void | size),
+	};
+export type inst = (inst_lit | inst_any | inst_split | inst_jump |
+	inst_skip | inst_match | inst_charset |
+	inst_groupstart | inst_groupend |
+	inst_repeat);
+
+// A (sub)match found as a result of matching a certain string against a regex.
+export type matchgroup = struct {
+	content: str,
+	start: size,
+	end: size,
+};
+
+type thread = struct {
+	pc: size,
+	start_idx: size,
+	root_group: matchgroup,
+	groups: []matchgroup,
+	curr_group: matchgroup,
+	curr_group_inited: bool,
+	rep_counters: []size,
+	matched: bool,
+	failed: bool,
+};
+
+type newmatch = void;
+
+export type charclass = enum {
+	ALNUM, ALPHA, ASCII, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT,
+	SPACE, UPPER, WORD, XDIGIT,
+};
+export type charset = [](charset_lit_item | charset_range_item |
+	charset_class_item),
+	charset_lit_item = rune,
+	charset_range_item = (u8, u8),
+	charset_class_item = charclass;
+const charclass_names: [](charclass, str) = [
+	(charclass::ALNUM, ":alnum:]"),
+	(charclass::ALPHA, ":alpha:]"),
+	(charclass::ASCII, ":ascii:]"),
+	(charclass::BLANK, ":blank:]"),
+	(charclass::CNTRL, ":cntrl:]"),
+	(charclass::DIGIT, ":digit:]"),
+	(charclass::GRAPH, ":graph:]"),
+	(charclass::LOWER, ":lower:]"),
+	(charclass::PRINT, ":print:]"),
+	(charclass::PUNCT, ":punct:]"),
+	(charclass::SPACE, ":space:]"),
+	(charclass::UPPER, ":upper:]"),
+	(charclass::WORD, ":word:]"),
+	(charclass::XDIGIT, ":xdigit:]"),
+];
+const charclass_fns: [](charclass, *fn(c: rune) bool) = [
+	(charclass::ALNUM, &ascii::isalnum),
+	(charclass::ALPHA, &ascii::isalpha),
+	(charclass::ASCII, &ascii::isascii),
+	(charclass::BLANK, &ascii::isblank),
+	(charclass::CNTRL, &ascii::iscntrl),
+	(charclass::DIGIT, &ascii::isdigit),
+	(charclass::GRAPH, &ascii::isgraph),
+	(charclass::LOWER, &ascii::islower),
+	(charclass::PRINT, &ascii::isprint),
+	(charclass::PUNCT, &ascii::ispunct),
+	(charclass::SPACE, &ascii::isspace),
+	(charclass::UPPER, &ascii::isupper),
+	(charclass::WORD, &isword),
+	(charclass::XDIGIT, &ascii::isxdigit),
+];
+const multibyte_err: error = "Character ranges do not support characters larger than one byte.";
+
+export type regex = struct {
+	insts: []inst,
+	charsets: []charset,
+	n_reps: size,
+};
+
+// Frees the memory used by a regex.
+export fn regex_free(re: regex) void = {
+	free(re.insts);
+	for (let i = 0z; i < len(re.charsets); i += 1) {
+		free(re.charsets[i]);
+	};
+	free(re.charsets);
+};
+
+fn find_last_groupstart(insts: *[]inst) (size | error) = {
+	for (let i = len(insts); i > 0; i -= 1) {
+		if (insts[i - 1] is inst_groupstart) {
+			return i - 1;
+		};
+	};
+	return `Encountered ")" token without matching "("`: error;
+};
+
+fn isword(c: rune) bool = ascii::isalnum(c) || c == '_';
+
+fn handle_bracket(
+	insts: *[]inst,
+	r: rune,
+	r_idx: *size,
+	bracket_idx: *int,
+	iter: *strings::iterator,
+	charsets: *[]charset,
+	skip_charclass_rest: *bool,
+	is_charset_positive: *bool,
+	in_bracket: *bool
+) (void | error) = {
+	const peek1 = strings::next(iter);
+	const peek2 = strings::next(iter);
+	const peek3 = strings::next(iter);
+	if (!(peek1 is void)) {
+		strings::prev(iter);
+	};
+	if (!(peek2 is void)) {
+		strings::prev(iter);
+	};
+	if (!(peek3 is void)) {
+		strings::prev(iter);
+	};
+
+	if (*bracket_idx == -1) {
+		append(charsets, alloc([]));
+	};
+	*bracket_idx += 1;
+
+	if (*skip_charclass_rest) {
+		if (r == ']') {
+			*skip_charclass_rest = false;
+		};
+		*r_idx += 1;
+		return;
+	};
+
+	const is_range = peek1 is rune && peek1 as rune == '-' &&
+		!(peek2 is void) && !(peek3 is void);
+	const range_end = peek2;
+	const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 &&
+		!*is_charset_positive;
+	if (r == ']' && !is_first_char) {
+		const newinst = inst_charset {
+			idx = len(charsets) - 1,
+			is_positive = *is_charset_positive,
+		};
+		append(insts, newinst);
+		*in_bracket = false;
+		*bracket_idx = -1;
+		*is_charset_positive = true;
+	} else if (r == '^' && *bracket_idx == 0) {
+		*is_charset_positive = false;
+	} else if (r == '[' && !(peek1 is void) &&
+			peek1 as rune == ':') {
+		const rest = strings::iterstr(iter);
+		const n_cc = len(charclass_names);
+		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
+			const cc = charclass_names[cc_idx];
+			if (strings::hasprefix(rest, cc.1)) {
+				append(charsets[len(charsets) - 1],
+					cc.0: charset_class_item);
+				*skip_charclass_rest = true;
+				break;
+			};
+		};
+		if (!*skip_charclass_rest) {
+			return `Found "[:" in bracket expression and expected a character class such as [:alpha:], but none was found. If you did not mean to use a charclass, try ":["`: error;
+		};
+	} else if (is_range) {
+		const start_enc = utf8::encoderune(r);
+		if (len(start_enc) > 1) {
+			return multibyte_err;
+		};
+		const start_b = start_enc[0];
+
+		const end_enc = utf8::encoderune(range_end as rune);
+		if (len(end_enc) > 1) {
+			return multibyte_err;
+		};
+		const end_b = end_enc[0];
+
+		if (end_b < start_b) {
+			return `Found range in bracket expression where end character was before start character, e.g. "[b-a]"`: error;
+		};
+
+		append(charsets[len(charsets) - 1],
+			(start_b, end_b): charset_range_item);
+		strings::next(iter);
+		strings::next(iter);
+		*r_idx += 2;
+	} else {
+		append(charsets[len(charsets) - 1],
+			r: charset_lit_item);
+	};
+
+	*r_idx += 1;
+};
+
+// Compiles a string containing a regular expression into a regex struct.
+export fn compile(expr: str) (regex | error) = {
+	let insts: []inst = alloc([]);
+	let charsets: []charset = alloc([]);
+	let iter = strings::iter(expr);
+	let r_idx = 0z;
+	let match_type = match_type::FLOATING;
+	let curr_alt_jump_idx = -1;
+	let in_bracket = false;
+	let skip_charclass_rest = false;
+	let bracket_idx = -1;
+	let is_charset_positive = true;
+	let n_reps = 0z;
+	let n_groupstarts = 0;
+
+	for (true) {
+		const next = strings::next(&iter);
+
+		if (r_idx == 0 && next is rune && next: rune != '^') {
+				append(insts, void: inst_skip);
+		};
+
+		if (in_bracket) {
+			if (next is void) {
+				return `Found unterminated bracket expression, are you missing a closing "]"?`: error;
+			};
+			const r = next: rune;
+			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
+				&charsets, &skip_charclass_rest,
+				&is_charset_positive,
+				&in_bracket)?;
+			continue;
+		};
+
+		const r = match (next) {
+		case void =>
+			if (n_groupstarts > 0) {
+				return "Expression ended, but there were still unclosed groups": error;
+			};
+			break;
+		case let r: rune => yield r;
+		};
+		switch (r) {
+		case '\\' =>
+			const peek1 = strings::next(&iter);
+			if (peek1 is void) {
+				return "Found an escaping backslash, but there was nothing to escape": error;
+			} else {
+				append(insts, (peek1 as rune): inst_lit);
+				r_idx += 1;
+			};
+		case '^' =>
+			if (r_idx != 0) {
+				return `Anchor character "^" may only occur at the start of the expression`: error;
+			};
+		case '$' =>
+			if (r_idx != len(expr) - 1) {
+				return `Anchor character "$" may only occur at the end of the expression`: error;
+			};
+			match_type = match_type::ANCHORED;
+		case '[' =>
+			in_bracket = true;
+		case ']' =>
+			if (in_bracket) {
+				in_bracket = false;
+			} else {
+				append(insts, r: inst_lit);
+			};
+		case '(' =>
+			append(insts, void: inst_groupstart);
+			n_groupstarts += 1;
+		case ')' =>
+			if (n_groupstarts == 0) {
+				return "Tried to close group but none was open": error;
+			};
+			n_groupstarts -= 1;
+			append(insts, void: inst_groupend);
+			if (curr_alt_jump_idx != -1) {
+				assert(insts[curr_alt_jump_idx] is inst_jump);
+				insts[curr_alt_jump_idx] =
+					(len(insts) - 1): inst_jump;
+				curr_alt_jump_idx = -1;
+			};
+		case '|' =>
+			append(insts, 9999999: inst_jump);
+			const origin = find_last_groupstart(&insts)? + 1;
+			const newinst = (len(insts) + 1): inst_split;
+			insert(insts[origin], newinst);
+			curr_alt_jump_idx = (len(insts) - 1): int;
+		case '{' =>
+			let origin = len(insts) - 1;
+			if (insts[origin] is inst_groupend) {
+				origin = find_last_groupstart(&insts)?;
+			};
+			const rest = strings::iterstr(&iter);
+			const rep_parts = parse_repetition(rest)?;
+			const can_skip = rep_parts.0 == 0;
+			const min = if (rep_parts.0 == 0) {
+				yield 1z;
+			} else {
+				yield rep_parts.0;
+			};
+			if (can_skip) {
+				insert(insts[origin],
+					len(insts) + 2: inst_split);
+				origin += 1;
+			};
+			const newinst = inst_repeat {
+				id = n_reps,
+				origin = origin,
+				min = min,
+				max = rep_parts.1,
+			};
+			for (let i = 0z; i <= rep_parts.2; i += 1) {
+				strings::next(&iter);
+				r_idx += 1;
+			};
+			append(insts, newinst);
+			n_reps += 1;
+		case '?' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "?" but there was nothing before it`: error;
+			};
+			let term_start_idx = len(insts) - 1;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "?" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "?"`: error;
+			};
+			const after_idx = len(insts) + 1;
+			insert(insts[term_start_idx], after_idx: inst_split);
+		case '*' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "*" but there was nothing before it`: error;
+			};
+			const new_inst_offset = 1z;
+			const jump_idx = len(insts) + new_inst_offset;
+			const after_idx = jump_idx + 1z;
+			let term_start_idx = len(insts) - 1z;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "*" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "*"`: error;
+			};
+			const split_idx = term_start_idx;
+			term_start_idx += new_inst_offset;
+			insert(insts[split_idx], after_idx: inst_split);
+			append(insts, split_idx: inst_jump);
+		case '+' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "+" but there was nothing before it`: error;
+			};
+			let term_start_idx = len(insts) - 1;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "+" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "+"`: error;
+			};
+			append(insts, term_start_idx: inst_split);
+		case '.' =>
+			append(insts, void: inst_any);
+		case =>
+			append(insts, r: inst_lit);
+		};
+		r_idx += 1;
+	};
+
+	append(insts, match_type: inst_match);
+
+	return regex {
+		insts = insts,
+		charsets = charsets,
+		n_reps = n_reps,
+	};
+};
+
+fn parse_repetition(
+	s: str
+) (((void | size), (void | size), size) | error) = {
+	const first_comma = strings::index(s, ",");
+	const first_endbrace = strings::index(s, "}");
+	if (first_endbrace is void) {
+		return "Invalid repetition value": error;
+	};
+	const first_endbrace = first_endbrace as size;
+
+	let min_str = "";
+	let max_str = "";
+	let is_single_arg = false;
+	if (first_comma is void || first_endbrace < first_comma as size) {
+		const cut = strings::cut(s, "}");
+		min_str = cut.0;
+		max_str = cut.0;
+		is_single_arg = true;
+	} else {
+		const cut = strings::cut(s, ",");
+		min_str = cut.0;
+		max_str = strings::cut(cut.1, "}").0;
+	};
+
+	let min: (void | size) = void;
+	let max: (void | size) = void;
+
+	if (len(min_str) > 0) {
+		min = match (strconv::stoi(min_str)) {
+		case let res: int =>
+			yield if (res < 0) {
+				return `Only positive integers are allowed inside "{}"`: error;
+			} else {
+				yield res: size;
+			};
+		case => return "Invalid repetition minimum value": error;
+		};
+	};
+
+	if (len(max_str) > 0) {
+		max = match (strconv::stoi(max_str)) {
+		case let res: int =>
+			yield if (res < 0) {
+				return `Only positive integers are allowed inside "{}"`: error;
+			} else {
+				yield res: size;
+			};
+		case => return "Invalid repetition maximum value": error;
+		};
+	};
+
+	const rep_len = if (is_single_arg) {
+		yield len(min_str);
+	} else {
+		yield len(min_str) + 1 + len(max_str);
+	};
+	return (min, max, rep_len);
+};
+
+fn delete_thread(i: size, threads: *[]thread) void = {
+	free(threads[i].groups);
+	free(threads[i].rep_counters);
+	delete(threads[i]);
+};
+
+fn is_consuming_inst(a: inst) bool = {
+	return a is (inst_lit | inst_any | inst_charset);
+};
+
+fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
+	// Do not add this thread if there is already another thread with
+	// the same PC
+	for (let i = 0z; i < len(threads); i += 1) {
+		if (threads[i].pc == new_pc &&
+				!threads[i].matched &&
+				threads[i].start_idx <
+					threads[parent_idx].start_idx) {
+			return;
+		};
+	};
+
+	append(threads, thread {
+		pc = new_pc,
+		start_idx = threads[parent_idx].start_idx,
+		curr_group = threads[parent_idx].curr_group,
+		curr_group_inited = threads[parent_idx].curr_group_inited,
+		matched = threads[parent_idx].matched,
+		failed = threads[parent_idx].failed,
+		groups = alloc(threads[parent_idx].groups...),
+		rep_counters = alloc(threads[parent_idx].rep_counters...),
+		...
+	});
+};
+
+fn run_thread(
+	i: size,
+	re: regex,
+	string: str,
+	threads: *[]thread,
+	r_or_end: (rune | void),
+	str_idx: int
+) (void | error | newmatch) = {
+	if (threads[i].matched) {
+		return;
+	};
+	for (!is_consuming_inst(re.insts[threads[i].pc])) {
+		match (re.insts[threads[i].pc]) {
+		case inst_lit => abort();
+		case inst_any => abort();
+		case inst_split =>
+			const new_pc = re.insts[threads[i].pc]: inst_split: size;
+			add_thread(threads, i, new_pc);
+			threads[i].pc += 1;
+		case inst_jump =>
+			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
+		case inst_skip =>
+			const new_pc = threads[i].pc + 1;
+			threads[i].start_idx = str_idx: size;
+			add_thread(threads, i, new_pc);
+			break;
+		case inst_match =>
+			// Do not match if we need an end-anchored match, but we
+			// have not exhausted our string
+			const mt = re.insts[threads[i].pc]: inst_match: match_type;
+			if (mt == match_type::ANCHORED && !(r_or_end is void)) {
+				threads[i].failed = true;
+				return;
+			};
+			threads[i].root_group = matchgroup {
+				start = threads[i].start_idx,
+				end = str_idx: size,
+				// TODO: This is a perf issue for large strings
+				content = strings::sub(string,
+					threads[i].start_idx,
+					str_idx: size),
+			};
+			threads[i].matched = true;
+			return newmatch;
+		case inst_groupstart =>
+			if (threads[i].curr_group_inited) {
+				return "Found nested capture groups in expression, which are not supported": error;
+			};
+			threads[i].curr_group.start = str_idx: size;
+			threads[i].curr_group_inited = true;
+			threads[i].pc += 1;
+		case inst_groupend =>
+			if (!threads[i].curr_group_inited) {
+				return `Found a groupend token ")" without having previously seen a groupstart token "("`: error;
+			};
+			threads[i].curr_group.end = str_idx: size;
+			// TODO: This is a perf issue for large strings
+			threads[i].curr_group.content = strings::sub(string,
+				threads[i].curr_group.start,
+				threads[i].curr_group.end);
+			append(threads[i].groups, threads[i].curr_group);
+			threads[i].curr_group = matchgroup { ... };
+			threads[i].curr_group_inited = false;
+			threads[i].pc += 1;
+		case let ir: inst_repeat =>
+			assert(ir.id < len(threads[i].rep_counters));
+			threads[i].rep_counters[ir.id] += 1;
+			if (ir.max is size &&
+					threads[i].rep_counters[ir.id] >
+					ir.max as size) {
+				threads[i].failed = true;
+				return;
+			};
+			const new_pc = threads[i].pc + 1;
+			threads[i].pc = ir.origin;
+			if (ir.min is void ||
+					threads[i].rep_counters[ir.id] >=
+					ir.min as size) {
+				add_thread(threads, i, new_pc);
+			};
+		};
+	};
+
+	// From now on, we're only matching consuming instructions, and these
+	// can't do anything without another rune.
+	if (r_or_end is void) {
+		threads[i].failed = true;
+		return;
+	};
+
+	const r = r_or_end as rune;
+
+	match (re.insts[threads[i].pc]) {
+	case inst_skip => return;
+	case let lit: inst_lit =>
+		if (r != lit) {
+			threads[i].failed = true;
+		};
+	case inst_any => void;
+	case let cs: inst_charset =>
+		const charset = re.charsets[cs.idx];
+		// Disprove the match if we're looking for a negative match
+		// Prove the match if we're looking for a positive match
+		let matched = !cs.is_positive;
+		for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
+		case let lit: charset_lit_item =>
+			if (r == lit) {
+				// Succeeded if positive match
+				// Failed if negative match
+				matched = cs.is_positive;
+				break;
+			};
+		case let range: charset_range_item =>
+			const r_enc = utf8::encoderune(r);
+			if (len(r_enc) > 1) {
+				return multibyte_err;
+			};
+			const r_b = r_enc[0];
+			if (r_b >= range.0 && r_b <= range.1) {
+				// Succeeded if positive match
+				// Failed if negative match
+				matched = cs.is_positive;
+				break;
+			};
+		case let class: charset_class_item =>
+			const n_cc = len(charclass_fns);
+			for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
+				const cc = charclass_fns[cc_idx];
+				if (cc.0 == class: charclass && cc.1(r)) {
+					// Succeeded if positive match
+					// Failed if negative match
+					matched = cs.is_positive;
+					break;
+				};
+			};
+		};
+		if (!matched) {
+			threads[i].failed = true;
+		};
+	};
+
+	threads[i].pc += 1;
+};
+
+// Attempts to match a regular expression against a string and returns the
+// either the longest leftmost match or all matches.
+fn search(
+	re: regex,
+	string: str,
+	str_iter: *strings::iterator,
+	str_idx: *int
+) (void | []matchgroup | error) = {
+	let threads: []thread = alloc([
+		thread { groups = alloc([]), ... }
+	]);
+	if (re.n_reps > 0) {
+		threads[0].rep_counters = alloc([0...], re.n_reps);
+	};
+	defer {
+		for (let i = 0z; i < len(threads); i += 1) {
+			free(threads[i].groups);
+			free(threads[i].rep_counters);
+		};
+		free(threads);
+	};
+
+	let first_match_idx: (void | size) = void;
+
+	for (true) {
+		if (len(threads) == 0) {
+			return void;
+		};
+
+		let all_matched = true;
+		for (let i = 0z; i < len(threads); i += 1) {
+			if (!threads[i].matched) {
+				all_matched = false;
+				break;
+			};
+		};
+
+		if (all_matched) {
+			let best_len = 0z;
+			let best_n_groups = 0z;
+			let best_idx = 0z;
+			for (let i = 0z; i < len(threads); i += 1) {
+				let match_len = threads[i].root_group.end -
+					threads[i].root_group.start;
+				const is_better = match_len > best_len ||
+					match_len == best_len &&
+					len(threads[i].groups) > best_n_groups;
+				if (is_better) {
+					best_len = match_len;
+					best_idx = i;
+					best_n_groups = len(threads[i].groups);
+				};
+			};
+			let res: []matchgroup = alloc([],
+				len(threads[best_idx].groups) + 1);
+			append(res, threads[best_idx].root_group);
+			append(res, threads[best_idx].groups...);
+			return res;
+		};
+
+		const r_or_end = strings::next(str_iter);
+		*str_idx += 1;
+
+		for (let i = 0z; i < len(threads); i += 1) {
+			const res = run_thread(i, re, string, &threads,
+				r_or_end, *str_idx)?;
+			const matchlen = threads[i].root_group.end -
+				threads[i].root_group.start;
+			const is_better = res is newmatch && matchlen > 0 &&
+				(first_match_idx is void ||
+					threads[i].start_idx <
+						first_match_idx as size);
+			if (is_better) {
+				first_match_idx = threads[i].start_idx;
+			};
+		};
+
+		// When we only want the leftmost match, delete all threads that
+		// start after the earliest non-zero-length matched thread
+		if (first_match_idx is size) {
+			for (let i = 0z; i < len(threads); i += 1) {
+				if (threads[i].start_idx >
+						first_match_idx as size) {
+					threads[i].failed = true;
+				};
+			};
+		};
+
+		// Delete threads that have a PC that has already been
+		// encountered in previous threads. Prioritise threads that
+		// have an earlier start_idx, and threads that were added
+		// earlier.
+		for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
+			for (let j = i + 1; j < len(threads): i64; j += 1) {
+				const same_pc = threads[i].pc == threads[j].pc;
+				const none_matched = !threads[j].matched &&
+					!threads[i].matched;
+				if (same_pc && none_matched) {
+					if (threads[i].start_idx <=
+							threads[j].start_idx) {
+						delete_thread(j: size, &threads);
+						j -= 1;
+					} else {
+						delete_thread(i: size, &threads);
+						i -= 1;
+						break;
+					};
+				};
+			};
+		};
+
+		for (let i = 0z; i < len(threads); i += 1) {
+			if (threads[i].failed) {
+				delete_thread(i, &threads);
+				i -= 1;
+			};
+		};
+	};
+
+	return void;
+};
+
+
+// Attempts to match a regular expression against a string and returns the
+// either the longest leftmost match or all matches.
+export fn find(re: regex, string: str) (void | []matchgroup | error) = {
+	let str_idx = -1;
+	let str_iter = strings::iter(string);
+	return search(re, string, &str_iter, &str_idx);
+};
+
+// Attempts to match a regular expression against a string and returns all
+// non-overlapping matches.
+export fn findall(re: regex, string: str) (void | [][]matchgroup | error) = {
+	let res: [][]matchgroup = alloc([]);
+	let str_idx = -1;
+	let str_iter = strings::iter(string);
+	for (true) {
+		const findres = search(re, string, &str_iter, &str_idx)?;
+		match (findres) {
+		case let m: []matchgroup =>
+			append(res, m);
+			assert(str_idx: size >= m[0].end);
+			for (str_idx: size > m[0].end) {
+				strings::prev(&str_iter);
+				str_idx -= 1;
+			};
+			if (str_idx: size >= len(string)) {
+				break;
+			};
+		case void => break;
+		};
+	};
+	if (len(res) == 0) {
+		return void;
+	};
+	return res;
+};
+
+// Frees all the matches in a slice and the slice itself.
+export fn freeall(s: [][]matchgroup) void = {
+	for (let i = 0z; i < len(s); i += 1) {
+		free(s[i]);
+	};
+	free(s);
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -1025,6 +1025,16 @@ path() {
 	gen_ssa path strings bufio bytes io
 }
 
+regex() {
+	if [ $testing -eq 0 ]; then
+		gen_srcs regex regex.ha
+		gen_ssa regex encoding::utf8 errors strconv strings
+	else
+		gen_srcs regex regex.ha +test.ha
+		gen_ssa regex encoding::utf8 errors strconv strings fmt io os
+	fi
+}
+
 gensrcs_strconv() {
 	gen_srcs strconv \
 		types.ha \
@@ -1298,6 +1308,7 @@ net::uri
 os				linux freebsd
 os::exec		linux freebsd
 path
+regex
 shlex
 slices
 sort
diff --git a/stdlib.mk b/stdlib.mk
@@ -548,6 +548,12 @@ stdlib_deps_any+=$(stdlib_path_any)
 stdlib_path_linux=$(stdlib_path_any)
 stdlib_path_freebsd=$(stdlib_path_any)
 
+# gen_lib regex (any)
+stdlib_regex_any=$(HARECACHE)/regex/regex-any.o
+stdlib_deps_any+=$(stdlib_regex_any)
+stdlib_regex_linux=$(stdlib_regex_any)
+stdlib_regex_freebsd=$(stdlib_regex_any)
+
 # gen_lib shlex (any)
 stdlib_shlex_any=$(HARECACHE)/shlex/shlex-any.o
 stdlib_deps_any+=$(stdlib_shlex_any)
@@ -1612,6 +1618,16 @@ $(HARECACHE)/path/path-any.ssa: $(stdlib_path_any_srcs) $(stdlib_rt) $(stdlib_st
 	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Npath \
 		-t$(HARECACHE)/path/path.td $(stdlib_path_any_srcs)
 
+# regex (+any)
+stdlib_regex_any_srcs= \
+	$(STDLIB)/regex/regex.ha
+
+$(HARECACHE)/regex/regex-any.ssa: $(stdlib_regex_any_srcs) $(stdlib_rt) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/regex
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nregex \
+		-t$(HARECACHE)/regex/regex.td $(stdlib_regex_any_srcs)
+
 # shlex (+any)
 stdlib_shlex_any_srcs= \
 	$(STDLIB)/shlex/split.ha
@@ -2429,6 +2445,12 @@ testlib_deps_any+=$(testlib_path_any)
 testlib_path_linux=$(testlib_path_any)
 testlib_path_freebsd=$(testlib_path_any)
 
+# gen_lib regex (any)
+testlib_regex_any=$(TESTCACHE)/regex/regex-any.o
+testlib_deps_any+=$(testlib_regex_any)
+testlib_regex_linux=$(testlib_regex_any)
+testlib_regex_freebsd=$(testlib_regex_any)
+
 # gen_lib shlex (any)
 testlib_shlex_any=$(TESTCACHE)/shlex/shlex-any.o
 testlib_deps_any+=$(testlib_shlex_any)
@@ -3533,6 +3555,17 @@ $(TESTCACHE)/path/path-any.ssa: $(testlib_path_any_srcs) $(testlib_rt) $(testlib
 	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Npath \
 		-t$(TESTCACHE)/path/path.td $(testlib_path_any_srcs)
 
+# regex (+any)
+testlib_regex_any_srcs= \
+	$(STDLIB)/regex/regex.ha \
+	$(STDLIB)/regex/+test.ha
+
+$(TESTCACHE)/regex/regex-any.ssa: $(testlib_regex_any_srcs) $(testlib_rt) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_os_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/regex
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nregex \
+		-t$(TESTCACHE)/regex/regex.td $(testlib_regex_any_srcs)
+
 # shlex (+any)
 testlib_shlex_any_srcs= \
 	$(STDLIB)/shlex/split.ha \

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	regex/+test.ha	\|	598	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	regex/README	\|	41	+++++++++++++++++++++++++++++++++++++++++
A	regex/regex.ha	\|	813	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	scripts/gen-stdlib	\|	11	+++++++++++
M	stdlib.mk	\|	33	+++++++++++++++++++++++++++++++++