commit 90f29aff6a24db536b39fcd4b1796eb336782596
parent 36c9c7c1dbd41dcdcf9448ea45d39339856acf5b
Author: Autumn! <autumnull@posteo.net>
Date: Sun, 12 Feb 2023 21:09:47 +0000
strings: Add rindex, rbyteindex, rtokenize, rsplitn, rcut
Signed-off-by: Autumn! <autumnull@posteo.net>
Diffstat:
 M strings/index.ha    | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 M strings/tokenize.ha | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 145 insertions(+), 5 deletions(-)
diff --git a/strings/index.ha b/strings/index.ha
@@ -17,6 +17,18 @@ export fn index(haystack: str, needle: (str | rune)) (size | void) = {
};
};
+// Returns the index of the last occurrence of 'needle' in the 'haystack', or
+// void if not present. The index returned is the rune-wise index, not the
+// byte-wise index.
+export fn rindex(haystack: str, needle: (str | rune)) (size | void) = {
+ match (needle) {
+ case let r: rune =>
+ return rindex_rune(haystack, r);
+ case let s: str =>
+ return rindex_string(haystack, s);
+ };
+};
+
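A rough usage sketch of both needle overloads, in the style of the @test blocks further down (illustrative values, ASCII input only):

	assert(rindex("hello world", 'o') as size == 7);	// rune needle
	assert(rindex("hello world", "lo") as size == 3);	// string needle
	assert(rindex("hello world", 'z') is void);		// not present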
fn index_rune(s: str, r: rune) (size | void) = {
let iter = iter(s);
for (let i = 0z; true; i += 1) {
@@ -31,6 +43,20 @@ fn index_rune(s: str, r: rune) (size | void) = {
};
};
+fn rindex_rune(s: str, r: rune) (size | void) = {
+ let iter = riter(s);
+ for (let i = len(s) - 1; true; i -= 1) {
+ match (next(&iter)) {
+ case let n: rune =>
+ if (r == n) {
+ return i;
+ };
+ case void =>
+ break;
+ };
+ };
+};
+
fn index_string(s: str, needle: str) (size | void) = {
let s_iter = iter(s);
for (let i = 0z; true; i += 1) {
@@ -55,6 +81,31 @@ fn index_string(s: str, needle: str) (size | void) = {
};
};
+fn rindex_string(s: str, needle: str) (size | void) = {
+ let s_iter = riter(s);
+ for (let i = len(s); true; i -= 1) {
+ let rest_iter = s_iter;
+ let needle_iter = riter(needle);
+ for (true) {
+ const rest_rune = next(&rest_iter);
+ const needle_rune = next(&needle_iter);
+ if (rest_rune is void && !(needle_rune is void)) {
+ break;
+ };
+ if (needle_rune is void) {
+ return i - len(needle);
+ };
+ if ((rest_rune as rune) != (needle_rune as rune)) {
+ break;
+ };
+ };
+ if (next(&s_iter) is void) {
+ break;
+ };
+ };
+};
+
+
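To illustrate how the reverse scan above pairs the two iterators, a short worked trace (illustrative, ASCII input so byte and rune counts coincide):

	// rindex("abcabc", "ab"):
	//   i=6: rest_iter yields 'c', needle riter yields 'b': mismatch, step s_iter
	//   i=5: rest_iter yields 'b' then 'a', matching "ab" in reverse;
	//        needle exhausted, so return i - len("ab") = 5 - 2 = 3
	assert(rindex("abcabc", "ab") as size == 3);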
@test fn index() void = {
assert(index("hello world", 'w') as size == 6);
assert(index("こんにちは", 'ち') as size == 3);
@@ -67,6 +118,9 @@ fn index_string(s: str, needle: str) (size | void) = {
assert(index("hello world!", "word") is void);
assert(index("こんにちは", "ちは") as size == 3);
assert(index("こんにちは", "きょうは") is void);
+
+ assert(index("hello world!", "o") as size == 4);
+ assert(rindex("hello world!", "o") as size == 7);
};
// Returns the byte-wise index of the first occurrence of 'needle' in the
@@ -80,6 +134,17 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
});
};
+// Returns the byte-wise index of the last occurrence of 'needle' in the
+// 'haystack', or void if not present.
+export fn rbyteindex(haystack: str, needle: (str | rune)) (size | void) = {
+ return bytes::rindex(toutf8(haystack), match (needle) {
+ case let s: str =>
+ yield toutf8(s);
+ case let r: rune =>
+ yield if (r: u32 <= 0x7f) r: u32: u8 else utf8::encoderune(r);
+ });
+};
+
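A small sketch of the two needle paths (a single-byte rune is passed to bytes::rindex as a u8; a multi-byte rune is encoded with utf8::encoderune first); values follow the byte-wise convention of the tests below:

	assert(rbyteindex("hello world", 'o') as size == 7);	// single-byte rune path
	assert(rbyteindex("こんにちは", 'ん') as size == 3);	// encoderune path
	assert(rbyteindex("hello world", 'z') is void);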
@test fn byteindex() void = {
assert(byteindex("hello world", 'w') as size == 6);
assert(byteindex("こんにちは", 'ち') as size == 9);
@@ -92,4 +157,7 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
assert(byteindex("hello world!", "word") is void);
assert(byteindex("こんにちは", "ちは") as size == 9);
assert(byteindex("こんにちは", "きょうは") is void);
+
+ assert(byteindex("またあったね", "た") as size == 3);
+ assert(rbyteindex("またあったね", "た") as size == 12);
};
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -9,7 +9,8 @@ use types;
// The state for a tokenizer.
export type tokenizer = bytes::tokenizer;
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter.
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
+// starting at the beginning of the string.
//
// let tok = strings::tokenize("hello, my name is drew", " ");
// assert(strings::next_token(&tok) as str == "hello,");
@@ -21,6 +22,20 @@ export type tokenizer = bytes::tokenizer;
export fn tokenize(s: str, delim: str) tokenizer =
bytes::tokenize(toutf8(s), toutf8(delim));
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
+// starting at the end of the string and moving backwards with each call
+// to [[next_token]].
+//
+// let tok = strings::rtokenize("hello, my name is drew", " ");
+// assert(strings::next_token(&tok) as str == "drew");
+// assert(strings::next_token(&tok) as str == "is");
+// assert(strings::next_token(&tok) as str == "name");
+// assert(strings::remaining_tokens(&tok) == "hello, my");
+//
+// The caller must ensure that 'delimiter' is not an empty string.
+export fn rtokenize(s: str, delim: str) tokenizer =
+ bytes::rtokenize(toutf8(s), toutf8(delim));
+
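As a sketch, the same reverse tokenizer can also be drained in a loop (same example string as the doc comment; the tokens come out end-first):

	let tok = rtokenize("hello, my name is drew", " ");
	for (true) {
		match (next_token(&tok)) {
		case str =>
			// yields "drew", "is", "name", "my", "hello,"
			void;
		case void =>
			break;
		};
	};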
// Returns the next string from a tokenizer, and advances the cursor. Returns
// void if there are no tokens left.
export fn next_token(s: *tokenizer) (str | void) = {
@@ -67,12 +82,20 @@ export fn remaining_tokens(s: *tokenizer) str = {
let tok = tokenize("", "foo");
assert(peek_token(&tok) is void);
assert(next_token(&tok) is void);
+
+ let tok = rtokenize("Hello, my name is drew", " ");
+ assert(next_token(&tok) as str == "drew");
+ assert(next_token(&tok) as str == "is");
+ assert(next_token(&tok) as str == "name");
+ assert(remaining_tokens(&tok) == "Hello, my");
+ assert(peek_token(&tok) as str == "my");
+ assert(remaining_tokens(&tok) == "Hello, my");
};
-// Splits a string into tokens delimited by 'delim', returning a slice of up to
-// N tokens. The caller must free this slice. The strings within the slice are
-// borrowed from 'in', and needn't be freed - but should be [[dupall]]'d if they
-// should outlive 'in'.
+// Splits a string into tokens delimited by 'delim', starting at the beginning
+// of the string, and returning a slice of up to N tokens. The caller must free
+// this slice. The strings within the slice are borrowed from 'in', and needn't
+// be freed - but should be [[dupall]]'d if they should outlive 'in'.
//
// The caller must ensure that 'delimiter' is not an empty string.
export fn splitn(in: str, delim: str, n: size) []str = {
@@ -94,6 +117,31 @@ export fn splitn(in: str, delim: str, n: size) []str = {
return toks;
};
+// Splits a string into tokens delimited by 'delim', starting at the end
+// of the string, and returning a slice of up to N tokens. The caller must free
+// this slice. The strings within the slice are borrowed from 'in', and needn't
+// be freed - but should be [[dupall]]'d if they should outlive 'in'.
+//
+// The caller must ensure that 'delimiter' is not an empty string.
+export fn rsplitn(in: str, delim: str, n: size) []str = {
+ let toks: []str = [];
+ let tok = rtokenize(in, delim);
+ for (let i = 0z; i < n - 1z; i += 1) {
+ match (next_token(&tok)) {
+ case let s: str =>
+ insert(toks[0], s);
+ case void =>
+ return toks;
+ };
+ };
+ match (peek_token(&tok)) {
+ case void => void;
+ case let s: str =>
+ insert(toks[0], remaining_tokens(&tok));
+ };
+ return toks;
+};
+
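An illustrative sketch of how the un-split remainder ends up in the first slot (same sentence as the test below; the caller frees the slice):

	let parts = rsplitn("Hello, my name is Drew", " ", 2z);
	// parts == ["Hello, my name is", "Drew"]
	free(parts);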
// Splits a string into tokens delimited by 'delim'. The caller must free the
// returned slice. The strings within the slice are borrowed from 'in', and
// needn't be freed - but must be [[dupall]]'d if they should outlive 'in'.
@@ -122,6 +170,13 @@ export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);
for (let i = 0z; i < len(expected3); i += 1) {
assert(expected3[i] == actual3[i]);
};
+
+ const expected4 = ["Hello, my", "name", "is", "Drew"];
+ const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
+ assert(len(expected4) == len(actual4));
+ for (let i = 0z; i < len(expected4); i += 1) {
+ assert(expected4[i] == actual4[i]);
+ };
};
// Returns a string "cut" along the first instance of a delimiter, returning
@@ -138,6 +193,20 @@ export fn cut(in: str, delim: str) (str, str) = {
return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
};
+// Returns a string "cut" along the last instance of a delimiter, returning
+// everything up to the delimiter, and everything after the delimiter, in a
+// tuple.
+//
+// strings::rcut("hello=world=foobar", "=") // ("hello=world", "foobar")
+// strings::rcut("hello world", "=") // ("hello world", "")
+//
+// The return value is borrowed from the 'in' parameter. The caller must ensure
+// that 'delimiter' is not an empty string.
+export fn rcut(in: str, delim: str) (str, str) = {
+ let c = bytes::rcut(toutf8(in), toutf8(delim));
+ return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
+};
+
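One more illustrative use, peeling a suffix off the end of a string (the filename is only an example value), in the style of the @test block below:

	const c = rcut("archive.tar.gz", ".");
	assert(c.0 == "archive.tar" && c.1 == "gz");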
@test fn cut() void = {
const sample = cut("hello=world", "=");
assert(sample.0 == "hello" && sample.1 == "world");
@@ -147,4 +216,7 @@ export fn cut(in: str, delim: str) (str, str) = {
assert(sample.0 == "hello world" && sample.1 == "");
const sample = cut("", "=");
assert(sample.0 == "" && sample.1 == "");
+
+ const sample = rcut("hello=world=foobar", "=");
+ assert(sample.0 == "hello=world" && sample.1 == "foobar");
};