commit 04eda2e65fa49b4e7af0aedb55cc41131a97b9a5
parent 9409fd01c1140b3786e2e7538ba0ee350be4010d
Author: Drew DeVault <sir@cmpwn.com>
Date: Sun, 2 Jun 2024 12:13:23 +0200

bytes, strings::tokenize: support multiple delimiters

This updates both functions to accept more than one delimiter, with the
caveat that each delimiter cannot be greater than one byte in length.
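
For example, callers can now split on any of several single-byte
delimiters in one pass. A rough sketch, adapted from the updated doc
comments and tests rather than copied verbatim:

    // strings: the delimiter set is given as a string of ASCII characters
    const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
    assert(strings::next_token(&tok) as str == "Hello");
    assert(strings::next_token(&tok) as str == "world!");

    // bytes: delimiters are now passed as variadic u8 arguments
    const input: [_]u8 = [1, 2, 0, 3, 4, 42, 5, 6];
    const btok = bytes::tokenize(input, 0, 42);
    assert(bytes::equal(bytes::next_token(&btok) as []u8, [1, 2]));
    assert(bytes::equal(bytes::next_token(&btok) as []u8, [3, 4]));
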
Signed-off-by: Drew DeVault <sir@cmpwn.com>
Diffstat:
 bytes/tokenize.ha   | 234 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
 strings/tokenize.ha | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
2 files changed, 254 insertions(+), 157 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -9,11 +9,20 @@ export type tokenizer = struct {
p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
};
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
-// at the beginning of the slice. The caller must ensure that 'delim' is not an
-// empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
-export fn tokenize(in: []u8, delim: []u8) tokenizer = {
+// Tokenizes a byte slice, returning an iterator that yields tokens from the
+// slice delimited by one of any number of delimiter bytes. If the input slice
+// begins or ends with a delimiter, an empty slice is returned by the first or
+// last call to [[next_token]], respectively.
+//
+// The variadic argument slice is borrowed from the caller, who should take care
+// to ensure that it is valid for the lifetime of the tokenizer.
+//
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the slice is less than [[types::I64_MAX]].
+export fn tokenize(in: []u8, delim: u8...) tokenizer = {
assert(len(delim) > 0, "bytes::tokenize called with empty slice");
+ assert(len(in) < types::I64_MAX: size,
+ "bytes::tokenize: input length exceeds I64_MAX");
if (len(in) == 0) {
delim = [];
};
@@ -25,12 +34,13 @@ export fn tokenize(in: []u8, delim: []u8) tokenizer = {
};
};
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
-// the end of the slice and moving backwards with each call to [[next_token]]. The
-// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
-// of length less than [[types::I64_MAX]].
-export fn rtokenize(in: []u8, delim: []u8) tokenizer = {
+// Like [[tokenize]], but tokenizes the slice in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(in: []u8, delim: u8...) tokenizer = {
assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+ assert(len(in) < types::I64_MAX: size,
+ "bytes::rtokenize: input length exceeds I64_MAX");
if (len(in) == 0) {
delim = [];
};
@@ -45,10 +55,7 @@ export fn rtokenize(in: []u8, delim: []u8) tokenizer = {
};
};
-// Returns the next slice from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left and on all subsequent invocations. If a
-// string starts with, or ends with, a token, an empty slice is returned at the
-// beginning or end of the sequence, respectively.
+// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) ([]u8 | done) = {
const b = match (peek_token(s)) {
case let b: []u8 =>
@@ -57,14 +64,13 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
};
const slen = len(s.in): i64;
- const dlen = len(s.delim);
const reverse = s.p < 0;
if (reverse) {
if (slen + s.p + 1 == 0) {
s.delim = s.delim[..0];
s.in = s.in[..0];
} else {
- const end = (slen + s.p + 1): size - dlen;
+ const end = (slen + s.p + 1): size - 1;
s.in = s.in[..end];
};
s.p = types::I64_MIN;
@@ -73,7 +79,7 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
s.delim = s.delim[..0];
s.in = s.in[..0];
} else {
- s.in = s.in[s.p: size + dlen..];
+ s.in = s.in[s.p: size + 1..];
};
s.p = types::I64_MAX;
};
@@ -81,7 +87,7 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
return b;
};
-// Same as [[next_token]], but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) ([]u8 | done) = {
if (len(s.delim) == 0) {
return done;
@@ -93,16 +99,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | done) = {
const known = ((reverse && s.p != types::I64_MIN) ||
(!reverse && s.p != types::I64_MAX));
if (!known) {
- let i = 0i64;
+ let i = if (reverse) types::I64_MIN else types::I64_MAX;
let dlen = 0i64;
const slen = len(s.in): i64;
- match (ifunc(s.in, s.delim)) {
- case let ix: size =>
- dlen = len(s.delim): i64;
- i = ix: i64;
- case void =>
- i = slen;
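+ // For each candidate delimiter, keep the match nearest the cursor: the
+ // lowest index when scanning forward, the highest when scanning in reverse.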
+ for (let d .. s.delim) {
+ match (ifunc(s.in, d)) {
+ case let ix: size =>
+ if (!reverse && ix: i64 < i) {
+ i = ix: i64;
+ dlen = 1;
+ } else if (reverse && ix: i64 > i) {
+ i = ix: i64;
+ dlen = 1;
+ };
+ case void =>
+ if (!reverse && slen < i: i64) {
+ i = slen;
+ } else if (reverse && 0 > i: i64) {
+ i = 0;
+ };
+ };
};
if (reverse) {
@@ -123,88 +140,103 @@ export fn peek_token(s: *tokenizer) ([]u8 | done) = {
};
};
-
-// Returns the remainder of the slice associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input slice from a [[tokenizer]] ahead of the
+// token cursor.
export fn remaining_tokens(s: *tokenizer) []u8 = {
return s.in;
};
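+// Test helper: runs "func" over "in" with the given delimiters, asserting that
+// peek_token/next_token yield exactly "tokens". If "iters" is given, iteration
+// stops early after that many tokens and the tokenizer is returned for further
+// assertions by the caller.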
+fn tokenize_test(
+ func: *fn([]u8, u8...) tokenizer,
+ testcase: str,
+ in: []u8,
+ delim: []u8,
+ tokens: [][]u8,
+ iters: size = types::SIZE_MAX,
+) tokenizer = {
+ const tok = func(in, delim...);
+ let n = 0z;
+ for (const want .. tokens) {
+ if (n >= iters) {
+ return tok;
+ };
+ n += 1;
+
+ const p = peek_token(&tok) as []u8;
+ const n = next_token(&tok) as []u8;
+ assert(equal(p, n), testcase);
+ assert(equal(n, want), testcase);
+ };
+
+ if (n >= iters) {
+ return tok;
+ };
+
+ assert(peek_token(&tok) is done, testcase);
+ assert(next_token(&tok) is done, testcase);
+ return tok;
+};
+
@test fn tokenize() void = {
- const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
- let t = tokenize(input, [24, 42]);
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([1, 2], n));
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([3, 24], n));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([4, 5], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [24, 42, 1, 24, 42];
- t = tokenize(input, [24, 42]);
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
- t = tokenize(input, [1, 2]);
- assert(equal([1, 1], next_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal([2], next_token(&t) as []u8));
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2];
- t = tokenize(input, [1, 2]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
- t = tokenize(input, [24, 42]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal(remaining_tokens(&t), [2, 3, 4]));
- assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
- assert(equal(remaining_tokens(&t), [2, 3, 4]));
-
- t = tokenize([], [42]);
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
- let t = rtokenize(input, [24, 42]);
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([4, 5], n));
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([3, 24], n));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([1, 2], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
- t = rtokenize(input, [24, 42]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([4], next_token(&t) as []u8));
- assert(equal(remaining_tokens(&t), [1, 2, 3]));
- assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
- assert(equal(remaining_tokens(&t), [1, 2, 3]));
+ tokenize_test(&tokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+ [1, 2],
+ [3, 4],
+ ]);
+
+ tokenize_test(&tokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+ [1, 2],
+ [3, 4],
+ [5, 6],
+ ]);
+
+ tokenize_test(&tokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+ [1, 2],
+ [],
+ [],
+ [3, 4],
+ ]);
+
+ tokenize_test(&tokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+ [],
+ [1, 2, 3],
+ [],
+ ]);
+
+ const tok = tokenize_test(&tokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+ [1, 2],
+ ], 1);
+ assert(equal(remaining_tokens(&tok), [3, 4]));
+};
+
+@test fn rtokenize() void = {
+ tokenize_test(&rtokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+ [3, 4],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+ [5, 6],
+ [3, 4],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+ [3, 4],
+ [],
+ [],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+ [],
+ [1, 2, 3],
+ [],
+ ]);
+
+ const tok = tokenize_test(&rtokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+ [3, 4],
+ ], 1);
+ assert(equal(remaining_tokens(&tok), [1, 2]));
};
// Returns the input slice "cut" along the first instance of a delimiter,
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -4,38 +4,53 @@
use bytes;
use types;
-// The state for a tokenizer.
export type tokenizer = bytes::tokenizer;
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the beginning of the string.
+// Tokenizes a string, returning an iterator that yields substrings separated by
+// one or more delimiters, such that the string will be split along any of the
+// characters found in "delim". If the string begins or ends with a delimiter,
+// an empty string is returned by the first or last call to [[next_token]],
+// respectively.
//
-// let tok = strings::tokenize("hello, my name is drew", " ");
-// assert(strings::next_token(&tok) as str == "hello,");
-// assert(strings::next_token(&tok) as str == "my");
-// assert(strings::next_token(&tok) as str == "name");
-// assert(strings::remaining_tokens(&tok) == "is drew");
+// Each character of the delimiter string must be an ASCII character (see
+// [[ascii::valid]]).
//
-// The caller must ensure that 'delim' is not an empty string.
-export fn tokenize(s: str, delim: str) tokenizer =
- bytes::tokenize(toutf8(s), toutf8(delim));
-
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the end of the string and moving backwards with each call
-// to [[next_token]].
+// The input string and delimiter string are borrowed from the caller for the
+// lifetime of the tokenizer.
//
-// let tok = strings::rtokenize("hello, my name is drew", " ");
-// assert(strings::next_token(&tok) as str == "drew");
-// assert(strings::next_token(&tok) as str == "is");
-// assert(strings::next_token(&tok) as str == "name");
-// assert(strings::remaining_tokens(&tok) == "hello, my");
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the input string is less than [[types::I64_MAX]].
//
-// The caller must ensure that 'delim' is not an empty string.
-export fn rtokenize(s: str, delim: str) tokenizer =
- bytes::rtokenize(toutf8(s), toutf8(delim));
+// const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
+// assert(next_token(&tok) as str == "Hello");
+// assert(next_token(&tok) as str == "world!");
+// assert(next_token(&tok) as str == "My");
+// assert(next_token(&tok) as str == "name");
+// assert(next_token(&tok) as str == "is");
+// assert(next_token(&tok) as str == "Harriet");
+// assert(next_token(&tok) is done);
+export fn tokenize(s: str, delim: str) tokenizer = {
+ const in = toutf8(s);
+ const delim = toutf8(delim);
+ for (let ch .. delim) {
+ assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ };
+ return bytes::tokenize(in, delim...);
+};
-// Returns the next string from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left.
+// Like [[tokenize]], but tokenizes the string in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(s: str, delim: str) tokenizer = {
+ const in = toutf8(s);
+ const delim = toutf8(delim);
+ for (let ch .. delim) {
+ assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ };
+ return bytes::rtokenize(in, delim...);
+};
+
+// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) (str | done) = {
let s = s: *bytes::tokenizer;
match (bytes::next_token(s)) {
@@ -45,7 +60,7 @@ export fn next_token(s: *tokenizer) (str | done) = {
};
};
-// Same as next_token(), but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) (str | done) = {
let s = s: *bytes::tokenizer;
return match (bytes::peek_token(s)) {
@@ -56,42 +71,92 @@ export fn peek_token(s: *tokenizer) (str | done) = {
};
};
-// Returns the remainder of the string associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input string from a [[tokenizer]] ahead of the
+// token cursor.
export fn remaining_tokens(s: *tokenizer) str = {
let s = s: *bytes::tokenizer;
return fromutf8_unsafe(bytes::remaining_tokens(s));
};
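+// Test helper: tokenizes "in" on "delim" and asserts that peek_token/next_token
+// yield exactly "tokens"; with "iters" set, it stops early and returns the
+// tokenizer for further assertions.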
+fn tokenize_test(
+ testcase: str,
+ in: str,
+ delim: str,
+ tokens: []str,
+ iters: size = types::SIZE_MAX,
+) tokenizer = {
+ const tok = tokenize(in, delim);
+ let n = 0z;
+ for (const want .. tokens) {
+ if (n >= iters) {
+ return tok;
+ };
+ n += 1;
+
+ const p = peek_token(&tok) as str;
+ const n = next_token(&tok) as str;
+ assert(p == n, testcase);
+ assert(n == want, testcase);
+ };
+
+ if (n >= iters) {
+ return tok;
+ };
+
+ assert(peek_token(&tok) is done, testcase);
+ assert(next_token(&tok) is done, testcase);
+ return tok;
+};
+
@test fn tokenize() void = {
- let tok = tokenize("Hello, my name is drew", " ");
- assert(next_token(&tok) as str == "Hello,");
- assert(next_token(&tok) as str == "my");
- assert(peek_token(&tok) as str == "name");
- assert(next_token(&tok) as str == "name");
- assert(remaining_tokens(&tok) == "is drew");
- assert(peek_token(&tok) as str == "is");
- assert(remaining_tokens(&tok) == "is drew");
-
- let tok = tokenize("foo", "foo");
- assert(peek_token(&tok) as str == "");
- assert(next_token(&tok) as str == "");
- assert(peek_token(&tok) as str == "");
- assert(next_token(&tok) as str == "");
- assert(peek_token(&tok) is done);
- assert(next_token(&tok) is done);
-
- let tok = tokenize("", "foo");
- assert(peek_token(&tok) is done);
- assert(next_token(&tok) is done);
-
- let tok = rtokenize("Hello, my name is drew", " ");
- assert(next_token(&tok) as str == "drew");
- assert(next_token(&tok) as str == "is");
- assert(next_token(&tok) as str == "name");
- assert(remaining_tokens(&tok) == "Hello, my");
- assert(peek_token(&tok) as str == "my");
- assert(remaining_tokens(&tok) == "Hello, my");
+ tokenize_test("simple case",
+ "Hello world! My name is Harriet.", " ",
+ [
+ "Hello",
+ "world!",
+ "My",
+ "name",
+ "is",
+ "Harriet.",
+ ]);
+
+ tokenize_test("multiple delimiters",
+ "/dev/sda1\t/ ext4 rw,relatime\t0 0", " \t",
+ [
+ "/dev/sda1",
+ "/",
+ "ext4",
+ "rw,relatime",
+ "0",
+ "0",
+ ]);
+
+ tokenize_test("consecutive delimiters",
+ "hello world", " ",
+ [
+ "hello",
+ "",
+ "",
+ "",
+ "world",
+ ]);
+
+ tokenize_test("leading delimiters",
+ " hello world ", " ",
+ [
+ "",
+ "hello",
+ "world",
+ "",
+ ]);
+
+ const tok = tokenize_test("remaining_tokens",
+ "Hello world! My name is Harriet.", " ",
+ [
+ "Hello",
+ "world!",
+ ], 2);
+ assert(remaining_tokens(&tok) == "My name is Harriet.");
};
// Splits a string into tokens delimited by 'delim', starting at the beginning