bytes::index: implement two-way algorithm - hare

commit d2e1e4657daa217b5730c84ff36a82fbcb1b0acb
parent aada50172c781e8360dec73f65f893a9a89cd47e
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date:   Sun, 18 Apr 2021 22:55:31 +0200

bytes::index: implement two-way algorithm

This change implements a faster substring search algorithm for
bytes::index.  Instead of the naive solution we use the same main
algorithm that is also used in glibc and musl's strstr implementations.
There are some tweaks musl uses that this implementation lacks at the
moment, and we should probably implement another, simpler algorithm for
short needles.  Nevertheless, this is a significant step forward in
speeding up bytes::index.

Diffstat:
M bytes/index.ha  | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
A bytes/two_way.ha  | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M scripts/gen-stdlib  | 3 ++-
M stdlib.mk  | 6 ++++--

4 files changed, 197 insertions(+), 55 deletions(-)
diff --git a/bytes/index.ha b/bytes/index.ha
@@ -15,15 +15,13 @@ fn index_byte(haystack: []u8, needle: u8) (size | void) = {
 	};
 };
 
-fn index_slice(haystack: []u8, needle: []u8) (size | void) = {
-	for (let i = 0z; i + len(needle) <= len(haystack); i += 1) {
-		if (equal(haystack[i..i + len(needle)], needle)) {
-			return i;
-		};
-	};
+fn index_slice(haystack: []u8, b: []u8) (size | void) = switch (len(b)) {
+	0 => 0,
+	1 => index_byte(haystack, b[0]),
+	// TODO special-case 2, 3, 4
+	* => index_tw(haystack, b),
 };
 
-
 // Returns the offset of the last instance of 'needle' in a 'haystack' of
 // bytes, or void if it is not found.
 export fn rindex(haystack: []u8, needle: (u8 | []u8)) (size | void) = {
@@ -50,59 +48,108 @@ fn rindex_slice(haystack: []u8, needle: []u8) (size | void) = {
 	};
 };
 
-
 @test fn index() void = {
 	// Bytes
 	const a: [4]u8 = [1, 3, 3, 7];
-	match (index(a, 7)) {
-		n: size => assert(n == 3),
-		void => abort(),
-	};
-	match (index(a, 42)) {
-		size => abort(),
-		void => void,
-	};
-	match (index([], 42)) {
-		size => abort(),
-		void => void,
-	};
-
-	match (rindex(a, 3)) {
-		n: size => assert(n == 2),
-		void => abort(),
-	};
-	match (rindex(a, 42)) {
-		n: size => abort(),
-		void => void,
-	};
-	match (rindex([], 42)) {
-		size => abort(),
-		void => void,
-	};
+	assert(index(a, 7) as size == 3);
+	assert(index(a, 42) is void);
+	assert(index([], 42) is void);
 
+	assert(rindex(a, 3) as size == 2);
+	assert(rindex(a, 42) is void);
+	assert(rindex([], 42) is void);
 
 	// Slices
-	match (index(a, [3, 3])) {
-		n: size => assert(n == 1),
-		void => abort(),
-	};
-	match (index(a, [])) {
-		n: size => assert(n == 0),
-		void => abort(),
-	};
-	match(index(a, [4, 2])) {
-		size => abort(),
-		void => void,
-	};
+	const a: [4]u8 = [1, 1, 1, 2];
+	assert(rindex(a, [1, 1]) as size == 1);
+	assert(rindex(a, [1, 2]) as size == 2);
+
+	// len(haystack) < len(needle)
+	assert(index([], [1, 2, 3]) is void);
+	assert(index([1, 2, 3], [1, 2, 3, 4]) is void);
+
+	// len(haystack) == len(needle)
+	assert(index([42, 20], [42, 20]) as size == 0);
+	assert(index([1, 1, 1, 2], [1, 1, 1, 3]) is void);
+
+	assert(index([1, 42, 24], [42, 24]) as size == 1);
+	assert(index([1, 3, 3, 7], [3, 3]) as size == 1);
+	assert(index([1, 1, 1, 2], [1, 1, 2]) as size == 1);
+	assert(index([1, 1, 1, 3, 2], [1, 1, 1, 2]) is void);
 
-	const special: []u8 = [1, 1, 1, 2];
-	match (index(special, [1, 1, 2])) {
-		n: size => assert(n == 1),
-		void => abort(),
+	const tests: [_](str, str) = [
+		("3214567844", "0123456789"),
+		("3214567889012345", "0123456789012345"),
+		("32145678890123456789", "01234567890123456789"),
+		("32145678890123456789012345678901234567890211", "01234567890123456789012345678901"),
+		("321456788901234567890123456789012345678911", "0123456789012345678901234567890"),
+		("abc", "x"),
+		("oxoxoxoxoxoxoxoxoxoxoxox", "oy"),
+		("x", "a"),
+		("x01234567x89", "0123456789"),
+		("xabcqxq", "abcqq"),
+		("xabxc", "abc"),
+		("xabxcqq", "abcqq"),
+		("xbc", "abc"),
+		("xbcd", "abcd"),
+		("xx012345678901234567890123456789012345678901234567890123456789012", "0123456789012345678901234567890123456xxx"),
+		("xyz012345678", "0123456789"),
+		("xyz0123456789012345678", "01234567890123456789"),
+		("xyz0123456789012345678901234567890", "01234567890123456789012345678901"),
+	];
+	for (let i = 0z; i < len(tests); i += 1) {
+		let haystack = *(&tests[i].0: *[]u8);
+		let needle = *(&tests[i].1: *[]u8);
+		index(haystack, needle) as void;
 	};
 
-	match (rindex(special, [1, 1])) {
-		n: size => assert(n == 1),
-		void => abort(),
+	const tests: [_](str, str, size) = [
+		("", "", 0),
+		("01234567", "01234567", 0),
+		("0123456789", "0123456789", 0),
+		("0123456789012345", "0123456789012345", 0),
+		("01234567890123456789", "01234567890123456789", 0),
+		("0123456789012345678901234567890", "0123456789012345678901234567890", 0),
+		("01234567890123456789012345678901", "01234567890123456789012345678901", 0),
+		("ab", "ab", 0),
+		("abc", "a", 0),
+		("abc", "abc", 0),
+		("abc", "b", 1),
+		("abc", "c", 2),
+		("abcABCabc", "A", 3),
+		("abcd", "abcd", 0),
+		("abcqq", "abcqq", 0),
+		("barfoobarfoo", "foo", 3),
+		("foo", "", 0),
+		("foo", "foo", 0),
+		("foo", "o", 1),
+		("oofofoofooo", "f", 2),
+		("oofofoofooo", "foo", 4),
+		("oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22),
+		("x", "x", 0),
+		("x01234567", "01234567", 1),
+		("x0123456789", "0123456789", 1),
+		("x0123456789012345", "0123456789012345", 1),
+		("x01234567890123456789", "01234567890123456789", 1),
+		("x0123456789012345678901234567890", "0123456789012345678901234567890", 1),
+		("x01234567890123456789012345678901", "01234567890123456789012345678901", 1),
+		("x0123456789012345678901234567890x01234567890123456789012345678901", "01234567890123456789012345678901", 33),
+		("x012345678901234567890123456789x0123456789012345678901234567890", "0123456789012345678901234567890", 32),
+		("x0123456789012345678x01234567890123456789", "01234567890123456789", 21),
+		("x012345678901234x0123456789012345", "0123456789012345", 17),
+		("x012345678x0123456789", "0123456789", 11),
+		("x0123456x01234567", "01234567", 9),
+		("xab", "ab", 1),
+		("xabc", "abc", 1),
+		("xabcd", "abcd", 1),
+		("xabcqq", "abcqq", 1),
+		("xx012345678901234567890123456789012345678901234567890123456789012", "0123456789012345678901234567890123456789", 2),
+		("xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx", "0123456789012345678901234567890123456xxx", 65),
+		("xxxxxx012345678901234567890123456789012345678901234567890123456789012", "012345678901234567890123456789012345678901234567890123456789012", 6),
+	];
+	for (let i = 0z; i < len(tests); i += 1) {
+		let haystack = *(&tests[i].0: *[]u8);
+		let needle = *(&tests[i].1: *[]u8);
+		assert(index(haystack, needle) as size == tests[i].2);
 	};
 };
diff --git a/bytes/two_way.ha b/bytes/two_way.ha
@@ -0,0 +1,92 @@
+use types;
+
+// Implements the so called Two-way string matching algorithm by Crochemore and
+// Perrin. It preprocesses the needle in O(len(needle)) steps and performs the
+// matching in O(len(haystack)) steps. It does so without any space overhead.
+fn index_tw(haystack: []u8, needle: []u8) (size | void) = {
+	const n = len(haystack);
+	const m = len(needle);
+	if (n < m) {
+		return void;
+	};
+	if (n == m) {
+		return if (equal(haystack, needle)) 0 else void;
+	};
+
+	// preprocessing
+	let tup1 = max_suf(needle, false);
+	let i = tup1.0, p = tup1.1;
+	let tup2 = max_suf(needle, true);
+	let j = tup2.0, q = tup2.1;
+	let ms = i;
+	let per = p;
+	if (i + 1 < j + 1) {
+		ms = j;
+		per = q;
+	};
+
+	let mem0 = 0z, mem = 0z;
+	if (equal(haystack[..ms + 1], haystack[per..per + ms + 1])) {
+		if (ms + 1 > m - ms - 1) {
+			per = ms + 2;
+		} else {
+			per = m - ms;
+		};
+		mem0 = m - per;
+	};
+
+
+	j = 0;
+	for (j <= n - m) {
+		// right half
+		i = if (ms + 1 > mem) ms + 1 else mem;
+		for (i < m && needle[i] == haystack[j + i]) {
+			i += 1;
+		};
+		if (i < m) {
+			j += i - ms;
+			mem = 0;
+			continue;
+		};
+
+		// left half
+		i = ms + 1;
+		for (i > mem && needle[i - 1] == haystack[j + i - 1]) {
+			i -= 1;
+		};
+		if (i <= mem) {
+			return j;
+		};
+		j += per;
+		mem = mem0;
+	};
+};
+
+fn max_suf(x: []u8, inv: bool) (size, size) = {
+	let i = types::SIZE_MAX;
+	let j = 0z;
+	let k = 1z, p = 1z;
+	let m = len(x);
+	for (j + k < m) {
+		let a = x[j + k];
+		let b = x[i + k];
+		if (a == b) {
+			if (k == p) {
+				j += p;
+				k = 1;
+			} else {
+				k += 1;
+			};
+		} else if (a < b ^^ inv) {
+			j += k;
+			k = 1;
+			p = j - i;
+		} else {
+			i = j;
+			j += 1;
+			k = 1;
+			p = 1;
+	 	};
+	};
+	return (i, p);
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -120,7 +120,8 @@ bytes() {
 		equal.ha \
 		index.ha \
 		reverse.ha \
-		tokenize.ha
+		tokenize.ha \
+		two_way.ha
 	gen_ssa bytes types
 }
 
diff --git a/stdlib.mk b/stdlib.mk
@@ -255,7 +255,8 @@ stdlib_bytes_srcs= \
 	$(STDLIB)/bytes/equal.ha \
 	$(STDLIB)/bytes/index.ha \
 	$(STDLIB)/bytes/reverse.ha \
-	$(STDLIB)/bytes/tokenize.ha
+	$(STDLIB)/bytes/tokenize.ha \
+	$(STDLIB)/bytes/two_way.ha
 
 $(HARECACHE)/bytes/bytes.ssa: $(stdlib_bytes_srcs) $(stdlib_rt) $(stdlib_types)
 	@printf 'HAREC \t$@\n'
@@ -1108,7 +1109,8 @@ testlib_bytes_srcs= \
 	$(STDLIB)/bytes/equal.ha \
 	$(STDLIB)/bytes/index.ha \
 	$(STDLIB)/bytes/reverse.ha \
-	$(STDLIB)/bytes/tokenize.ha
+	$(STDLIB)/bytes/tokenize.ha \
+	$(STDLIB)/bytes/two_way.ha
 
 $(TESTCACHE)/bytes/bytes.ssa: $(testlib_bytes_srcs) $(testlib_rt) $(testlib_types)
 	@printf 'HAREC \t$@\n'

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

M	bytes/index.ha	\|	151	++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
A	bytes/two_way.ha	\|	92	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	scripts/gen-stdlib	\|	3	++-
M	stdlib.mk	\|	6	++++--