commit 4ec74642362c0cef067d231e851550a374e91e9a
parent fd0359ea21f152be1f82f33db5d3f3b2b297458b
Author: Drew DeVault <sir@cmpwn.com>
Date: Sat, 6 Feb 2021 15:18:02 -0500
bytes, strings: add tokenize, split
Diffstat:
3 files changed, 218 insertions(+), 0 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -0,0 +1,119 @@
+// The state for a tokenizer.
+export type tokenizer = struct { s: []u8, d: []u8 };
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter.
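+//
+// For example (a minimal sketch of intended usage):
+//
+// let tok = bytes::tokenize([1u8, 0u8, 2u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [2u8]));
+// assert(bytes::next_token(&tok) is void);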
+export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
+ s = s,
+ d = delim,
+};
+
+// Returns the length of the initial run of bytes in 'a' which are equal to
+// the corresponding bytes in 'b' (i.e. the length of their common prefix).
+fn nequal(a: []u8, b: []u8) size = {
+ let i = 0z;
+ for (i < len(a) && i < len(b); i += 1z) {
+ if (a[i] != b[i]) {
+ break;
+ };
+ };
+ return i;
+};
+
+// Returns the next slice from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left. If the slice starts with, or ends with, a
+// delimiter, an empty slice is returned at the beginning or end of the
+// sequence, respectively.
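+//
+// For example, a leading delimiter yields a leading empty slice:
+//
+// let tok = bytes::tokenize([0u8, 1u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, []));
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));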
+export fn next_token(s: *tokenizer) ([]u8 | void) = {
+ let i = 0z;
+ for (i < len(s.s)) {
+ let n = nequal(s.s[i..], s.d);
+ if (n == len(s.d)) {
+ let tok = s.s[..i];
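+ // If this token runs up to a single trailing delimiter at
+ // the very end of the input, leave the delimiter in place so
+ // that the next call yields the final empty slice. The
+ // length check prevents this case from repeating forever
+ // once only the delimiter itself remains.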
+ if (len(tok) + len(s.d) == len(s.s) && len(tok) != 0z) {
+ s.s = s.s[i..];
+ } else {
+ s.s = s.s[i+len(s.d)..];
+ };
+ return tok;
+ } else {
+ // Advance one byte at a time; skipping the length of a
+ // partial match could step over a delimiter which
+ // overlaps its own prefix (e.g. [1u8, 1u8, 2u8] within
+ // [1u8, 1u8, 1u8, 2u8]).
+ i += 1z;
+ };
+ };
+
+ if (len(s.s) != 0z) {
+ let tok = s.s[..];
+ s.s = s.s[..0];
+ return tok;
+ };
+
+ return void;
+};
+
+// Returns the remainder of the slice associated with a tokenizer, without doing
+// any further tokenization.
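+//
+// For example:
+//
+// let tok = bytes::tokenize([1u8, 0u8, 2u8, 0u8, 3u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));
+// assert(bytes::equal(bytes::remaining_tokens(&tok), [2u8, 0u8, 3u8]));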
+export fn remaining_tokens(s: *tokenizer) []u8 = {
+ return s.s;
+};
+
+@test fn tokenize() void = {
+ const input = [1u8, 2u8, 24u8, 42u8, 3u8, 24u8, 24u8, 42u8, 4u8, 5u8];
+ let t = tokenize(input, [24u8, 42u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8, 2u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([3u8, 24u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([4u8, 5u8], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+
+ const input2 = [24u8, 42u8, 1u8, 24u8, 42u8];
+ t = tokenize(input2, [24u8, 42u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+
+ const input3 = [1u8, 1u8, 1u8, 2u8, 1u8, 1u8, 2u8, 2u8];
+ t = tokenize(input3, [1u8, 2u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8, 1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([2u8], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+};
diff --git a/strings/dup.ha b/strings/dup.ha
@@ -17,3 +17,11 @@ export fn dup(s: const str) str = {
};
return *(&out: *str);
};
+
+// Duplicates every string in a slice in place, replacing each member of the
+// slice with a copy of itself.
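+//
+// For example, to keep tokens from [strings::split] alive after their source
+// string (here a hypothetical heap-allocated 'source') is freed:
+//
+// let toks = strings::split(source, " ");
+// strings::dup_all(toks);
+// free(source);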
+export fn dup_all(s: []str) void = {
+ for (let i = 0z; i < len(s); i += 1z) {
+ s[i] = dup(s[i]);
+ };
+};
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -0,0 +1,91 @@
+use bytes;
+use types;
+
+// The state for a tokenizer.
+type tokenizer = bytes::tokenizer;
+
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter.
+//
+// let tok = strings::tokenize("hello, my name is drew", " ");
+// assert(strings::next_token(&tok) as str == "hello,");
+// assert(strings::next_token(&tok) as str == "my");
+// assert(strings::next_token(&tok) as str == "name");
+// assert(strings::remaining_tokens(&tok) == "is drew");
+export fn tokenize(s: str, delim: str) tokenizer =
+ bytes::tokenize(to_utf8(s), to_utf8(delim));
+
+// Returns the next string from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left.
+export fn next_token(s: *tokenizer) (str | void) = {
+ return match (bytes::next_token(s)) {
+ b: []u8 => from_utf8(b),
+ void => void,
+ };
+};
+
+// Returns the remainder of the string associated with a tokenizer, without doing
+// any further tokenization.
+export fn remaining_tokens(s: *tokenizer) str = {
+ return from_utf8(bytes::remaining_tokens(s));
+};
+
+@test fn tokenize() void = {
+ let tok = tokenize("Hello, my name is drew", " ");
+ match (next_token(&tok)) {
+ s: str => assert(s == "Hello,"),
+ void => abort(),
+ };
+
+ match (next_token(&tok)) {
+ s: str => assert(s == "my"),
+ void => abort(),
+ };
+
+ match (next_token(&tok)) {
+ s: str => assert(s == "name"),
+ void => abort(),
+ };
+
+ assert(remaining_tokens(&tok) == "is drew");
+};
+
+// Splits a string into tokens delimited by 'delim', returning a slice of up
+// to 'n' tokens. The final token contains the unsplit remainder of the string.
+// The caller must free this slice. The strings within the slice are borrowed
+// from 'in', and needn't be freed - but should be [strings::dup_all]'d if they
+// should outlive 'in'.
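+//
+// For example:
+//
+// let toks = strings::splitN("one two three four", " ", 2z);
+// // toks = ["one", "two three four"]
+// free(toks);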
+export fn splitN(in: str, delim: str, n: size) []str = {
+ let toks = alloc([]str, [], 8z); // TODO: Drop explicit capacity
+ let tok = tokenize(in, delim);
+ if (n == 0z) {
+ // Avoid underflow in 'n - 1z' below when no tokens were
+ // requested
+ return toks;
+ };
+ for (let i = 0z; i < n - 1z; i += 1z) {
+ match (next_token(&tok)) {
+ s: str => {
+ append(toks, s);
+ },
+ void => return toks,
+ };
+ };
+ append(toks, remaining_tokens(&tok));
+ return toks;
+};
+
+// Splits a string into tokens delimited by 'delim'. The caller must free the
+// returned slice. The strings within the slice are borrowed from 'in', and
+// needn't be freed - but must be [strings::dup_all]'d if they should outlive
+// 'in'.
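+//
+// For example:
+//
+// let toks = strings::split("one two three", " ");
+// // toks = ["one", "two", "three"]
+// free(toks);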
+export fn split(in: str, delim: str) []str = splitN(in, delim, types::SIZE_MAX);
+
+@test fn split() void = {
+ const expected = ["Hello,", "my", "name", "is Drew"];
+ const actual = splitN("Hello, my name is Drew", " ", 4z);
+ assert(len(expected) == len(actual));
+ for (let i = 0z; i < len(expected); i += 1z) {
+ assert(expected[i] == actual[i]);
+ };
+
+ const expected2 = ["Hello,", "my", "name", "is", "Drew"];
+ const actual2 = split("Hello, my name is Drew", " ");
+ assert(len(expected2) == len(actual2));
+ for (let i = 0z; i < len(expected2); i += 1z) {
+ assert(expected2[i] == actual2[i]);
+ };
+};