commit 4ec74642362c0cef067d231e851550a374e91e9a
parent fd0359ea21f152be1f82f33db5d3f3b2b297458b
Author: Drew DeVault <sir@cmpwn.com>
Date: Sat, 6 Feb 2021 15:18:02 -0500
bytes, strings: add tokenize, split
Diffstat:
3 files changed, 218 insertions(+), 0 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -0,0 +1,119 @@
+// The state for a tokenizer.
+export type tokenizer = struct { s: []u8, d: []u8 };
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter.
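+//
+// For example (a minimal sketch of intended usage):
+//
+// let tok = bytes::tokenize([1u8, 0u8, 2u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [2u8]));
+// assert(bytes::next_token(&tok) is void);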
+export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
+ s = s,
+ d = delim,
+};
+
+// Returns the length of the initial run of bytes in 'a' which are equal to
+// the corresponding bytes in 'b' (i.e. the length of their common prefix).
+fn nequal(a: []u8, b: []u8) size = {
+ let i = 0z;
+ for (i < len(a) && i < len(b); i += 1z) {
+ if (a[i] != b[i]) {
+ break;
+ };
+ };
+ return i;
+};
+
+// Returns the next slice from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left. If the slice starts with, or ends with, a
+// delimiter, an empty slice is returned at the beginning or end of the
+// sequence, respectively.
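+//
+// For example, a leading delimiter yields a leading empty slice:
+//
+// let tok = bytes::tokenize([0u8, 1u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, []));
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));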
+export fn next_token(s: *tokenizer) ([]u8 | void) = {
+ let i = 0z;
+ for (i < len(s.s)) {
+ let n = nequal(s.s[i..], s.d);
+ if (n == len(s.d)) {
+ let tok = s.s[..i];
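+ // If this token runs up to a single trailing delimiter at
+ // the very end of the input, leave the delimiter in place so
+ // that the next call yields the final empty slice. The
+ // length check prevents this case from repeating forever
+ // once only the delimiter itself remains.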
+ if (len(tok) + len(s.d) == len(s.s) && len(tok) != 0z) {
+ s.s = s.s[i..];
+ } else {
+ s.s = s.s[i+len(s.d)..];
+ };
+ return tok;
+ } else {
+ // Advance one byte at a time; skipping the length of a
+ // partial match could step over a delimiter which
+ // overlaps its own prefix (e.g. [1u8, 1u8, 2u8] within
+ // [1u8, 1u8, 1u8, 2u8]).
+ i += 1z;
+ };
+ };
+
+ if (len(s.s) != 0z) {
+ let tok = s.s[..];
+ s.s = s.s[..0];
+ return tok;
+ };
+
+ return void;
+};
+
+// Returns the remainder of the slice associated with a tokenizer, without doing
+// any further tokenization.
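+//
+// For example:
+//
+// let tok = bytes::tokenize([1u8, 0u8, 2u8, 0u8, 3u8], [0u8]);
+// assert(bytes::equal(bytes::next_token(&tok) as []u8, [1u8]));
+// assert(bytes::equal(bytes::remaining_tokens(&tok), [2u8, 0u8, 3u8]));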
+export fn remaining_tokens(s: *tokenizer) []u8 = {
+ return s.s;
+};
+
+@test fn tokenize() void = {
+ const input = [1u8, 2u8, 24u8, 42u8, 3u8, 24u8, 24u8, 42u8, 4u8, 5u8];
+ let t = tokenize(input, [24u8, 42u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8, 2u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([3u8, 24u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([4u8, 5u8], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+
+ const input2 = [24u8, 42u8, 1u8, 24u8, 42u8];
+ t = tokenize(input2, [24u8, 42u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+
+ const input3 = [1u8, 1u8, 1u8, 2u8, 1u8, 1u8, 2u8, 2u8];
+ t = tokenize(input3, [1u8, 2u8]);
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8, 1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([1u8], b)),
+ void => abort(),
+ };
+
+ match (next_token(&t)) {
+ b: []u8 => assert(equal([2u8], b)),
+ void => abort(),
+ };
+
+ assert(next_token(&t) is void);
+};
diff --git a/strings/dup.ha b/strings/dup.ha
@@ -17,3 +17,11 @@ export fn dup(s: const str) str = {
};
return *(&out: *str);
};
+
+// Duplicates every string in a slice in place, replacing each member of the
+// slice with a copy of itself.
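+//
+// For example, to keep tokens from [strings::split] alive after their source
+// string (here a hypothetical heap-allocated 'source') is freed:
+//
+// let toks = strings::split(source, " ");
+// strings::dup_all(toks);
+// free(source);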
+export fn dup_all(s: []str) void = {
+ for (let i = 0z; i < len(s); i += 1z) {
+ s[i] = dup(s[i]);
+ };
+};
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -0,0 +1,91 @@
+use bytes;
+use types;
+
+// The state for a tokenizer.
+type tokenizer = bytes::tokenizer;
+
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter.
+//
+// let tok = strings::tokenize("hello, my name is drew", " ");
+// assert(strings::next_token(&tok) as str == "hello,");
+// assert(strings::next_token(&tok) as str == "my");
+// assert(strings::next_token(&tok) as str == "name");
+// assert(strings::remaining_tokens(&tok) == "is drew");
+export fn tokenize(s: str, delim: str) tokenizer =
+ bytes::tokenize(to_utf8(s), to_utf8(delim));
+
+// Returns the next string from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left.
+export fn next_token(s: *tokenizer) (str | void) = {
+ return match (bytes::next_token(s)) {
+ b: []u8 => from_utf8(b),
+ void => void,
+ };
+};
+
+// Returns the remainder of the string associated with a tokenizer, without doing
+// any further tokenization.
+export fn remaining_tokens(s: *tokenizer) str = {
+ return from_utf8(bytes::remaining_tokens(s));
+};
+
+@test fn tokenize() void = {
+ let tok = tokenize("Hello, my name is drew", " ");
+ match (next_token(&tok)) {
+ s: str => assert(s == "Hello,"),
+ void => abort(),
+ };
+
+ match (next_token(&tok)) {
+ s: str => assert(s == "my"),
+ void => abort(),
+ };
+
+ match (next_token(&tok)) {
+ s: str => assert(s == "name"),
+ void => abort(),
+ };
+
+ assert(remaining_tokens(&tok) == "is drew");
+};
+
+// Splits a string into tokens delimited by 'delim', returning a slice of up
+// to 'n' tokens. The final token contains the unsplit remainder of the string.
+// The caller must free this slice. The strings within the slice are borrowed
+// from 'in', and needn't be freed - but should be [strings::dup_all]'d if they
+// should outlive 'in'.
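+//
+// For example:
+//
+// let toks = strings::splitN("one two three four", " ", 2z);
+// // toks = ["one", "two three four"]
+// free(toks);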
+export fn splitN(in: str, delim: str, n: size) []str = {
+ let toks = alloc([]str, [], 8z); // TODO: Drop explicit capacity
+ let tok = tokenize(in, delim);
+ if (n == 0z) {
+ // Avoid underflow in 'n - 1z' below when no tokens were
+ // requested
+ return toks;
+ };
+ for (let i = 0z; i < n - 1z; i += 1z) {
+ match (next_token(&tok)) {
+ s: str => {
+ append(toks, s);
+ },
+ void => return toks,
+ };
+ };
+ append(toks, remaining_tokens(&tok));
+ return toks;
+};
+
+// Splits a string into tokens delimited by 'delim'. The caller must free the
+// returned slice. The strings within the slice are borrowed from 'in', and
+// needn't be freed - but must be [strings::dup_all]'d if they should outlive
+// 'in'.
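+//
+// For example:
+//
+// let toks = strings::split("one two three", " ");
+// // toks = ["one", "two", "three"]
+// free(toks);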
+export fn split(in: str, delim: str) []str = splitN(in, delim, types::SIZE_MAX);
+
+@test fn split() void = {
+ const expected = ["Hello,", "my", "name", "is Drew"];
+ const actual = splitN("Hello, my name is Drew", " ", 4z);
+ assert(len(expected) == len(actual));
+ for (let i = 0z; i < len(expected); i += 1z) {
+ assert(expected[i] == actual[i]);
+ };
+
+ const expected2 = ["Hello,", "my", "name", "is", "Drew"];
+ const actual2 = split("Hello, my name is Drew", " ");
+ assert(len(expected2) == len(actual2));
+ for (let i = 0z; i < len(expected2); i += 1z) {
+ assert(expected2[i] == actual2[i]);
+ };
+};