commit 04eda2e65fa49b4e7af0aedb55cc41131a97b9a5
parent 9409fd01c1140b3786e2e7538ba0ee350be4010d
Author: Drew DeVault <sir@cmpwn.com>
Date: Sun, 2 Jun 2024 12:13:23 +0200

bytes, strings::tokenize: support multiple delimiters

This updates both functions to accept more than one delimiter, with the
caveat that each delimiter cannot be greater than one byte in length.
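
For example, callers can now split on any of several single-byte
delimiters in one pass. A rough sketch, adapted from the updated doc
comments and tests rather than copied verbatim:

    // strings: the delimiter set is given as a string of ASCII characters
    const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
    assert(strings::next_token(&tok) as str == "Hello");
    assert(strings::next_token(&tok) as str == "world!");

    // bytes: delimiters are now passed as variadic u8 arguments
    const input: [_]u8 = [1, 2, 0, 3, 4, 42, 5, 6];
    const btok = bytes::tokenize(input, 0, 42);
    assert(bytes::equal(bytes::next_token(&btok) as []u8, [1, 2]));
    assert(bytes::equal(bytes::next_token(&btok) as []u8, [3, 4]));
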
Signed-off-by: Drew DeVault <sir@cmpwn.com>
Diffstat:
 bytes/tokenize.ha   | 234 +++++++++++++++++++++++++++++++++++++++++++++----------------------------------
 strings/tokenize.ha | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
2 files changed, 254 insertions(+), 157 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -9,11 +9,20 @@ export type tokenizer = struct {
p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
};
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
-// at the beginning of the slice. The caller must ensure that 'delim' is not an
-// empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
-export fn tokenize(in: []u8, delim: []u8) tokenizer = {
+// Tokenizes a byte slice, returning an iterator that yields tokens from the
+// slice delimited by one of any number of delimiter bytes. If the input slice
+// begins or ends with a delimiter, an empty slice is returned by the first or
+// last call to [[next_token]], respectively.
+//
+// The variadic argument slice is borrowed from the caller, who should take care
+// to ensure that it is valid for the lifetime of the tokenizer.
+//
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the slice is less than [[types::I64_MAX]].
+export fn tokenize(in: []u8, delim: u8...) tokenizer = {
assert(len(delim) > 0, "bytes::tokenize called with empty slice");
+ assert(len(in) < types::I64_MAX: size,
+ "bytes::tokenize: input length exceeds I64_MAX");
if (len(in) == 0) {
delim = [];
};
@@ -25,12 +34,13 @@ export fn tokenize(in: []u8, delim: []u8) tokenizer = {
};
};
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
-// the end of the slice and moving backwards with each call to [[next_token]]. The
-// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
-// of length less than [[types::I64_MAX]].
-export fn rtokenize(in: []u8, delim: []u8) tokenizer = {
+// Like [[tokenize]], but tokenizes the slice in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(in: []u8, delim: u8...) tokenizer = {
assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+ assert(len(in) < types::I64_MAX: size,
+ "bytes::rtokenize: input length exceeds I64_MAX");
if (len(in) == 0) {
delim = [];
};
@@ -45,10 +55,7 @@ export fn rtokenize(in: []u8, delim: []u8) tokenizer = {
};
};
-// Returns the next slice from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left and on all subsequent invocations. If a
-// string starts with, or ends with, a token, an empty slice is returned at the
-// beginning or end of the sequence, respectively.
+// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) ([]u8 | done) = {
const b = match (peek_token(s)) {
case let b: []u8 =>
@@ -57,14 +64,13 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
};
const slen = len(s.in): i64;
- const dlen = len(s.delim);
const reverse = s.p < 0;
if (reverse) {
if (slen + s.p + 1 == 0) {
s.delim = s.delim[..0];
s.in = s.in[..0];
} else {
- const end = (slen + s.p + 1): size - dlen;
+ const end = (slen + s.p + 1): size - 1;
s.in = s.in[..end];
};
s.p = types::I64_MIN;
@@ -73,7 +79,7 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
s.delim = s.delim[..0];
s.in = s.in[..0];
} else {
- s.in = s.in[s.p: size + dlen..];
+ s.in = s.in[s.p: size + 1..];
};
s.p = types::I64_MAX;
};
@@ -81,7 +87,7 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
return b;
};
-// Same as [[next_token]], but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) ([]u8 | done) = {
if (len(s.delim) == 0) {
return done;
@@ -93,16 +99,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | done) = {
const known = ((reverse && s.p != types::I64_MIN) ||
(!reverse && s.p != types::I64_MAX));
if (!known) {
- let i = 0i64;
+ let i = if (reverse) types::I64_MIN else types::I64_MAX;
let dlen = 0i64;
const slen = len(s.in): i64;
- match (ifunc(s.in, s.delim)) {
- case let ix: size =>
- dlen = len(s.delim): i64;
- i = ix: i64;
- case void =>
- i = slen;
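+ // For each candidate delimiter, keep the match nearest the cursor: the
+ // lowest index when scanning forward, the highest when scanning in reverse.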
+ for (let d .. s.delim) {
+ match (ifunc(s.in, d)) {
+ case let ix: size =>
+ if (!reverse && ix: i64 < i) {
+ i = ix: i64;
+ dlen = 1;
+ } else if (reverse && ix: i64 > i) {
+ i = ix: i64;
+ dlen = 1;
+ };
+ case void =>
+ if (!reverse && slen < i: i64) {
+ i = slen;
+ } else if (reverse && 0 > i: i64) {
+ i = 0;
+ };
+ };
};
if (reverse) {
@@ -123,88 +140,103 @@ export fn peek_token(s: *tokenizer) ([]u8 | done) = {
};
};
-
-// Returns the remainder of the slice associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input slice from a [[tokenizer]] ahead of the
+// token cursor.
export fn remaining_tokens(s: *tokenizer) []u8 = {
return s.in;
};
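+// Test helper: runs "func" over "in" with the given delimiters, asserting that
+// peek_token/next_token yield exactly "tokens". If "iters" is given, iteration
+// stops early after that many tokens and the tokenizer is returned for further
+// assertions by the caller.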
+fn tokenize_test(
+ func: *fn([]u8, u8...) tokenizer,
+ testcase: str,
+ in: []u8,
+ delim: []u8,
+ tokens: [][]u8,
+ iters: size = types::SIZE_MAX,
+) tokenizer = {
+ const tok = func(in, delim...);
+ let n = 0z;
+ for (const want .. tokens) {
+ if (n >= iters) {
+ return tok;
+ };
+ n += 1;
+
+ const p = peek_token(&tok) as []u8;
+ const n = next_token(&tok) as []u8;
+ assert(equal(p, n), testcase);
+ assert(equal(n, want), testcase);
+ };
+
+ if (n >= iters) {
+ return tok;
+ };
+
+ assert(peek_token(&tok) is done, testcase);
+ assert(next_token(&tok) is done, testcase);
+ return tok;
+};
+
@test fn tokenize() void = {
- const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
- let t = tokenize(input, [24, 42]);
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([1, 2], n));
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([3, 24], n));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([4, 5], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [24, 42, 1, 24, 42];
- t = tokenize(input, [24, 42]);
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
- t = tokenize(input, [1, 2]);
- assert(equal([1, 1], next_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal([2], next_token(&t) as []u8));
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2];
- t = tokenize(input, [1, 2]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
- t = tokenize(input, [24, 42]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([1], next_token(&t) as []u8));
- assert(equal(remaining_tokens(&t), [2, 3, 4]));
- assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
- assert(equal(remaining_tokens(&t), [2, 3, 4]));
-
- t = tokenize([], [42]);
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
- let t = rtokenize(input, [24, 42]);
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([4, 5], n));
- let p = peek_token(&t) as []u8;
- let n = next_token(&t) as []u8;
- assert(equal(p, n));
- assert(equal([3, 24], n));
- assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
- assert(equal([1, 2], next_token(&t) as []u8));
- assert(peek_token(&t) is done);
- assert(next_token(&t) is done);
-
- const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
- t = rtokenize(input, [24, 42]);
- assert(equal([], next_token(&t) as []u8));
- assert(equal([4], next_token(&t) as []u8));
- assert(equal(remaining_tokens(&t), [1, 2, 3]));
- assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
- assert(equal(remaining_tokens(&t), [1, 2, 3]));
+ tokenize_test(&tokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+ [1, 2],
+ [3, 4],
+ ]);
+
+ tokenize_test(&tokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+ [1, 2],
+ [3, 4],
+ [5, 6],
+ ]);
+
+ tokenize_test(&tokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+ [1, 2],
+ [],
+ [],
+ [3, 4],
+ ]);
+
+ tokenize_test(&tokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+ [],
+ [1, 2, 3],
+ [],
+ ]);
+
+ const tok = tokenize_test(&tokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+ [1, 2],
+ ], 1);
+ assert(equal(remaining_tokens(&tok), [3, 4]));
+};
+
+@test fn rtokenize() void = {
+ tokenize_test(&rtokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+ [3, 4],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+ [5, 6],
+ [3, 4],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+ [3, 4],
+ [],
+ [],
+ [1, 2],
+ ]);
+
+ tokenize_test(&rtokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+ [],
+ [1, 2, 3],
+ [],
+ ]);
+
+ const tok = tokenize_test(&rtokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+ [3, 4],
+ ], 1);
+ assert(equal(remaining_tokens(&tok), [1, 2]));
};
// Returns the input slice "cut" along the first instance of a delimiter,
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -4,38 +4,53 @@
use bytes;
use types;
-// The state for a tokenizer.
export type tokenizer = bytes::tokenizer;
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the beginning of the string.
+// Tokenizes a string, returning an iterator that yields substrings separated by
+// one or more delimiters, such that the string will be split along any of the
+// characters found in "delim". If the string begins or ends with a delimiter,
+// an empty string is returned by the first or last call to [[next_token]],
+// respectively.
//
-// let tok = strings::tokenize("hello, my name is drew", " ");
-// assert(strings::next_token(&tok) as str == "hello,");
-// assert(strings::next_token(&tok) as str == "my");
-// assert(strings::next_token(&tok) as str == "name");
-// assert(strings::remaining_tokens(&tok) == "is drew");
+// Each character of the delimiter string must be an ASCII character (see
+// [[ascii::valid]]).
//
-// The caller must ensure that 'delim' is not an empty string.
-export fn tokenize(s: str, delim: str) tokenizer =
- bytes::tokenize(toutf8(s), toutf8(delim));
-
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the end of the string and moving backwards with each call
-// to [[next_token]].
+// The input string and delimiter string are borrowed from the caller for the
+// lifetime of the tokenizer.
//
-// let tok = strings::rtokenize("hello, my name is drew", " ");
-// assert(strings::next_token(&tok) as str == "drew");
-// assert(strings::next_token(&tok) as str == "is");
-// assert(strings::next_token(&tok) as str == "name");
-// assert(strings::remaining_tokens(&tok) == "hello, my");
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the input string is less than [[types::I64_MAX]].
//
-// The caller must ensure that 'delim' is not an empty string.
-export fn rtokenize(s: str, delim: str) tokenizer =
- bytes::rtokenize(toutf8(s), toutf8(delim));
+// const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
+// assert(next_token(&tok) as str == "Hello");
+// assert(next_token(&tok) as str == "world!");
+// assert(next_token(&tok) as str == "My");
+// assert(next_token(&tok) as str == "name");
+// assert(next_token(&tok) as str == "is");
+// assert(next_token(&tok) as str == "Harriet");
+// assert(next_token(&tok) is done);
+export fn tokenize(s: str, delim: str) tokenizer = {
+ const in = toutf8(s);
+ const delim = toutf8(delim);
+ for (let ch .. delim) {
+ assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ };
+ return bytes::tokenize(in, delim...);
+};
-// Returns the next string from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left.
+// Like [[tokenize]], but tokenizes the string in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(s: str, delim: str) tokenizer = {
+ const in = toutf8(s);
+ const delim = toutf8(delim);
+ for (let ch .. delim) {
+ assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+ };
+ return bytes::rtokenize(in, delim...);
+};
+
+// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) (str | done) = {
let s = s: *bytes::tokenizer;
match (bytes::next_token(s)) {
@@ -45,7 +60,7 @@ export fn next_token(s: *tokenizer) (str | done) = {
};
};
-// Same as next_token(), but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) (str | done) = {
let s = s: *bytes::tokenizer;
return match (bytes::peek_token(s)) {
@@ -56,42 +71,92 @@ export fn peek_token(s: *tokenizer) (str | done) = {
};
};
-// Returns the remainder of the string associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input string from a [[tokenizer]] ahead of the
+// token cursor.
export fn remaining_tokens(s: *tokenizer) str = {
let s = s: *bytes::tokenizer;
return fromutf8_unsafe(bytes::remaining_tokens(s));
};
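+// Test helper: tokenizes "in" on "delim" and asserts that peek_token/next_token
+// yield exactly "tokens"; with "iters" set, it stops early and returns the
+// tokenizer for further assertions.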
+fn tokenize_test(
+ testcase: str,
+ in: str,
+ delim: str,
+ tokens: []str,
+ iters: size = types::SIZE_MAX,
+) tokenizer = {
+ const tok = tokenize(in, delim);
+ let n = 0z;
+ for (const want .. tokens) {
+ if (n >= iters) {
+ return tok;
+ };
+ n += 1;
+
+ const p = peek_token(&tok) as str;
+ const n = next_token(&tok) as str;
+ assert(p == n, testcase);
+ assert(n == want, testcase);
+ };
+
+ if (n >= iters) {
+ return tok;
+ };
+
+ assert(peek_token(&tok) is done, testcase);
+ assert(next_token(&tok) is done, testcase);
+ return tok;
+};
+
@test fn tokenize() void = {
- let tok = tokenize("Hello, my name is drew", " ");
- assert(next_token(&tok) as str == "Hello,");
- assert(next_token(&tok) as str == "my");
- assert(peek_token(&tok) as str == "name");
- assert(next_token(&tok) as str == "name");
- assert(remaining_tokens(&tok) == "is drew");
- assert(peek_token(&tok) as str == "is");
- assert(remaining_tokens(&tok) == "is drew");
-
- let tok = tokenize("foo", "foo");
- assert(peek_token(&tok) as str == "");
- assert(next_token(&tok) as str == "");
- assert(peek_token(&tok) as str == "");
- assert(next_token(&tok) as str == "");
- assert(peek_token(&tok) is done);
- assert(next_token(&tok) is done);
-
- let tok = tokenize("", "foo");
- assert(peek_token(&tok) is done);
- assert(next_token(&tok) is done);
-
- let tok = rtokenize("Hello, my name is drew", " ");
- assert(next_token(&tok) as str == "drew");
- assert(next_token(&tok) as str == "is");
- assert(next_token(&tok) as str == "name");
- assert(remaining_tokens(&tok) == "Hello, my");
- assert(peek_token(&tok) as str == "my");
- assert(remaining_tokens(&tok) == "Hello, my");
+ tokenize_test("simple case",
+ "Hello world! My name is Harriet.", " ",
+ [
+ "Hello",
+ "world!",
+ "My",
+ "name",
+ "is",
+ "Harriet.",
+ ]);
+
+ tokenize_test("multiple delimiters",
+ "/dev/sda1\t/ ext4 rw,relatime\t0 0", " \t",
+ [
+ "/dev/sda1",
+ "/",
+ "ext4",
+ "rw,relatime",
+ "0",
+ "0",
+ ]);
+
+ tokenize_test("consecutive delimiters",
+ "hello world", " ",
+ [
+ "hello",
+ "",
+ "",
+ "",
+ "world",
+ ]);
+
+ tokenize_test("leading delimiters",
+ " hello world ", " ",
+ [
+ "",
+ "hello",
+ "world",
+ "",
+ ]);
+
+ const tok = tokenize_test("remaining_tokens",
+ "Hello world! My name is Harriet.", " ",
+ [
+ "Hello",
+ "world!",
+ ], 2);
+ assert(remaining_tokens(&tok) == "My name is Harriet.");
};
// Splits a string into tokens delimited by 'delim', starting at the beginning