hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 04eda2e65fa49b4e7af0aedb55cc41131a97b9a5
parent 9409fd01c1140b3786e2e7538ba0ee350be4010d
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sun,  2 Jun 2024 12:13:23 +0200

bytes, strings::tokenize: support multiple delimiters

This updates both functions to accept more than one delimiter, with the
caveat that each delimiter cannot be greater than one byte in length.

Signed-off-by: Drew DeVault <sir@cmpwn.com>

Diffstat:
Mbytes/tokenize.ha | 234+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mstrings/tokenize.ha | 177++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
2 files changed, 254 insertions(+), 157 deletions(-)
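
The commit message above summarizes the interface change; as a rough usage
sketch (illustrative only, not part of the commit), the updated functions can
be called along these lines, assuming the new signatures shown in the diff
below (bytes::tokenize/rtokenize take variadic single-byte delimiters, and
strings::tokenize/rtokenize split on any ASCII character of the delimiter
string; the sample program, its data, and the fmt output are hypothetical):

	use bytes;
	use fmt;
	use strings;

	export fn main() void = {
		// bytes::tokenize now accepts one or more single-byte delimiters
		// as variadic arguments (the new "delim: u8..." parameter).
		const data: [_]u8 = [1, 2, 0, 3, 4, 42, 5, 6];
		let t = bytes::tokenize(data, 0, 42);
		for (true) {
			match (bytes::next_token(&t)) {
			case let tok: []u8 =>
				fmt::printfln("token of {} byte(s)", len(tok))!;
			case done =>
				break;
			};
		};

		// strings::tokenize still takes a delimiter string, but now splits
		// on any of its (ASCII) characters rather than treating the whole
		// string as a single delimiter.
		let st = strings::tokenize("a b\tc", " \t");
		for (true) {
			match (strings::next_token(&st)) {
			case let tok: str =>
				fmt::println(tok)!;
			case done =>
				break;
			};
		};
	};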

diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -9,11 +9,20 @@ export type tokenizer = struct {
 	p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
 };
 
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
-// at the beginning of the slice. The caller must ensure that 'delim' is not an
-// empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
-export fn tokenize(in: []u8, delim: []u8) tokenizer = {
+// Tokenizes a byte slice, returning an iterator that yields tokens from the
+// slice delimited by one of any number of delimiter bytes. If the input slice
+// begins with or ends with a delimiter, an empty slice is returned respectively
+// as the first and last call to [[next_token]].
+//
+// The variadic argument slice is borrowed from the caller, who should take care
+// to ensure that it is valid for the lifetime of the tokenizer.
+//
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the slice is less than [[types::I64_MAX]].
+export fn tokenize(in: []u8, delim: u8...) tokenizer = {
 	assert(len(delim) > 0, "bytes::tokenize called with empty slice");
+	assert(len(in) < types::I64_MAX: size,
+		"bytes::tokenize: input length exceeds I64_MAX");
 	if (len(in) == 0) {
 		delim = [];
 	};
@@ -25,12 +34,13 @@ export fn tokenize(in: []u8, delim: []u8) tokenizer = {
 	};
 };
 
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
-// the end of the slice and moving backwards with each call to [[next_token]]. The
-// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
-// of length less than [[types::I64_MAX]].
-export fn rtokenize(in: []u8, delim: []u8) tokenizer = {
+// Like [[tokenize]], but tokenizes the slice in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(in: []u8, delim: u8...) tokenizer = {
 	assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+	assert(len(in) < types::I64_MAX: size,
+		"bytes::rtokenize: input length exceeds I64_MAX");
 	if (len(in) == 0) {
 		delim = [];
 	};
@@ -45,10 +55,7 @@
 	};
 };
 
-// Returns the next slice from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left and on all subsequent invocations. If a
-// string starts with, or ends with, a token, an empty slice is returned at the
-// beginning or end of the sequence, respectively.
+// Returns the next token from a [[tokenizer]] and advances the cursor.
 export fn next_token(s: *tokenizer) ([]u8 | done) = {
 	const b = match (peek_token(s)) {
 	case let b: []u8 =>
@@ -57,14 +64,13 @@ export fn next_token(s: *tokenizer) ([]u8 | done) = {
 	};
 
 	const slen = len(s.in): i64;
-	const dlen = len(s.delim);
 	const reverse = s.p < 0;
 	if (reverse) {
 		if (slen + s.p + 1 == 0) {
 			s.delim = s.delim[..0];
 			s.in = s.in[..0];
 		} else {
-			const end = (slen + s.p + 1): size - dlen;
+			const end = (slen + s.p + 1): size - 1;
 			s.in = s.in[..end];
 		};
 		s.p = types::I64_MIN;
@@ -73,7 +79,7 @@
 			s.delim = s.delim[..0];
 			s.in = s.in[..0];
 		} else {
-			s.in = s.in[s.p: size + dlen..];
+			s.in = s.in[s.p: size + 1..];
 		};
 		s.p = types::I64_MAX;
 	};
@@ -81,7 +87,7 @@
 	return b;
 };
 
-// Same as [[next_token]], but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
 export fn peek_token(s: *tokenizer) ([]u8 | done) = {
 	if (len(s.delim) == 0) {
 		return done;
@@ -93,16 +99,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | done) = {
 	const known = ((reverse && s.p != types::I64_MIN) ||
 		(!reverse && s.p != types::I64_MAX));
 	if (!known) {
-		let i = 0i64;
+		let i = if (reverse) types::I64_MIN else types::I64_MAX;
 		let dlen = 0i64;
 		const slen = len(s.in): i64;
 
-		match (ifunc(s.in, s.delim)) {
-		case let ix: size =>
-			dlen = len(s.delim): i64;
-			i = ix: i64;
-		case void =>
-			i = slen;
+		for (let d .. s.delim) {
+			match (ifunc(s.in, d)) {
+			case let ix: size =>
+				if (!reverse && ix: i64 < i) {
+					i = ix: i64;
+					dlen = 1;
+				} else if (reverse && ix: i64 > i) {
+					i = ix: i64;
+					dlen = 1;
+				};
+			case void =>
+				if (!reverse && slen < i: i64) {
+					i = slen;
+				} else if (reverse && 0 > i: i64) {
+					i = 0;
+				};
+			};
 		};
 
 		if (reverse) {
@@ -123,88 +140,103 @@
 	};
 };
 
-
-// Returns the remainder of the slice associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input slice from a [[tokenizer]] ahead of the
+// token cursor.
 export fn remaining_tokens(s: *tokenizer) []u8 = {
 	return s.in;
 };
 
+fn tokenize_test(
+	func: *fn([]u8, u8...) tokenizer,
+	testcase: str,
+	in: []u8,
+	delim: []u8,
+	tokens: [][]u8,
+	iters: size = types::SIZE_MAX,
+) tokenizer = {
+	const tok = func(in, delim...);
+	let n = 0z;
+	for (const want .. tokens) {
+		if (n >= iters) {
+			return tok;
+		};
+		n += 1;
+
+		const p = peek_token(&tok) as []u8;
+		const n = next_token(&tok) as []u8;
+		assert(equal(p, n), testcase);
+		assert(equal(n, want), testcase);
+	};
+
+	if (n >= iters) {
+		return tok;
+	};
+
+	assert(peek_token(&tok) is done, testcase);
+	assert(next_token(&tok) is done, testcase);
+	return tok;
+};
+
 @test fn tokenize() void = {
-	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
-	let t = tokenize(input, [24, 42]);
-	let p = peek_token(&t) as []u8;
-	let n = next_token(&t) as []u8;
-	assert(equal(p, n));
-	assert(equal([1, 2], n));
-	let p = peek_token(&t) as []u8;
-	let n = next_token(&t) as []u8;
-	assert(equal(p, n));
-	assert(equal([3, 24], n));
-	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
-	assert(equal([4, 5], next_token(&t) as []u8));
-	assert(peek_token(&t) is done);
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [24, 42, 1, 24, 42];
-	t = tokenize(input, [24, 42]);
-	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
-	assert(equal([], next_token(&t) as []u8));
-	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
-	assert(equal([1], next_token(&t) as []u8));
-	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
-	assert(equal([], next_token(&t) as []u8));
-	assert(peek_token(&t) is done);
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
-	t = tokenize(input, [1, 2]);
-	assert(equal([1, 1], next_token(&t) as []u8));
-	assert(equal([1], next_token(&t) as []u8));
-	assert(equal([2], next_token(&t) as []u8));
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [1, 2];
-	t = tokenize(input, [1, 2]);
-	assert(equal([], next_token(&t) as []u8));
-	assert(equal([], next_token(&t) as []u8));
-	assert(peek_token(&t) is done);
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
-	t = tokenize(input, [24, 42]);
-	assert(equal([], next_token(&t) as []u8));
-	assert(equal([1], next_token(&t) as []u8));
-	assert(equal(remaining_tokens(&t), [2, 3, 4]));
-	assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
-	assert(equal(remaining_tokens(&t), [2, 3, 4]));
-
-	t = tokenize([], [42]);
-	assert(peek_token(&t) is done);
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
-	let t = rtokenize(input, [24, 42]);
-	let p = peek_token(&t) as []u8;
-	let n = next_token(&t) as []u8;
-	assert(equal(p, n));
-	assert(equal([4, 5], n));
-	let p = peek_token(&t) as []u8;
-	let n = next_token(&t) as []u8;
-	assert(equal(p, n));
-	assert(equal([3, 24], n));
-	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
-	assert(equal([1, 2], next_token(&t) as []u8));
-	assert(peek_token(&t) is done);
-	assert(next_token(&t) is done);
-
-	const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
-	t = rtokenize(input, [24, 42]);
-	assert(equal([], next_token(&t) as []u8));
-	assert(equal([4], next_token(&t) as []u8));
-	assert(equal(remaining_tokens(&t), [1, 2, 3]));
-	assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
-	assert(equal(remaining_tokens(&t), [1, 2, 3]));
+	tokenize_test(&tokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+		[1, 2],
+		[3, 4],
+	]);
+
+	tokenize_test(&tokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+		[1, 2],
+		[3, 4],
+		[5, 6],
+	]);
+
+	tokenize_test(&tokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+		[1, 2],
+		[],
+		[],
+		[3, 4],
+	]);
+
+	tokenize_test(&tokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+		[],
+		[1, 2, 3],
+		[],
+	]);
+
+	const tok = tokenize_test(&tokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+		[1, 2],
+	], 1);
+	assert(equal(remaining_tokens(&tok), [3, 4]));
+};
+
+@test fn rtokenize() void = {
+	tokenize_test(&rtokenize, "simple case", [1, 2, 0, 3, 4], [0], [
+		[3, 4],
+		[1, 2],
+	]);
+
+	tokenize_test(&rtokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
+		[5, 6],
+		[3, 4],
+		[1, 2],
+	]);
+
+	tokenize_test(&rtokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
+		[3, 4],
+		[],
+		[],
+		[1, 2],
+	]);
+
+	tokenize_test(&rtokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
+		[],
+		[1, 2, 3],
+		[],
+	]);
+
+	const tok = tokenize_test(&rtokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
+		[3, 4],
+	], 1);
+	assert(equal(remaining_tokens(&tok), [1, 2]));
 };
 
 // Returns the input slice "cut" along the first instance of a delimiter,
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -4,38 +4,53 @@
 use bytes;
 use types;
 
-// The state for a tokenizer.
 export type tokenizer = bytes::tokenizer;
 
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the beginning of the string.
+// Tokenizes a string, returning an iterator that yields substrings separated by
+// one or more delimiters, such that the string will be split along any of the
+// characters found in "delim". If the string begins with or ends with a
+// delimiter, an empty string is returned respectively as the first and last
+// call to [[next_token]].
 //
-//	let tok = strings::tokenize("hello, my name is drew", " ");
-//	assert(strings::next_token(&tok) as str == "hello,");
-//	assert(strings::next_token(&tok) as str == "my");
-//	assert(strings::next_token(&tok) as str == "name");
-//	assert(strings::remaining_tokens(&tok) == "is drew");
+// Each character of the delimiter string must be an ASCII character (see
+// [[ascii::valid]]).
 //
-// The caller must ensure that 'delim' is not an empty string.
-export fn tokenize(s: str, delim: str) tokenizer =
-	bytes::tokenize(toutf8(s), toutf8(delim));
-
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
-// starting at the end of the string and moving backwards with each call
-// to [[next_token]].
+// The input string and delimiter string are borrowed from the caller for the
+// lifetime of the tokenizer.
 //
-//	let tok = strings::rtokenize("hello, my name is drew", " ");
-//	assert(strings::next_token(&tok) as str == "drew");
-//	assert(strings::next_token(&tok) as str == "is");
-//	assert(strings::next_token(&tok) as str == "name");
-//	assert(strings::remaining_tokens(&tok) == "hello, my");
+// The caller must ensure that at least one delimiter is provided and that the
+// length of the input string is less than [[types::I64_MAX]].
 //
-// The caller must ensure that 'delim' is not an empty string.
-export fn rtokenize(s: str, delim: str) tokenizer =
-	bytes::rtokenize(toutf8(s), toutf8(delim));
+//	const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
+//	assert(next_token(&tok) as str == "Hello");
+//	assert(next_token(&tok) as str == "world!");
+//	assert(next_token(&tok) as str == "My");
+//	assert(next_token(&tok) as str == "name");
+//	assert(next_token(&tok) as str == "is");
+//	assert(next_token(&tok) as str == "Harriet");
+//	assert(next_token(&tok) is done);
+export fn tokenize(s: str, delim: str) tokenizer = {
+	const in = toutf8(s);
+	const delim = toutf8(delim);
+	for (let ch .. delim) {
+		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+	};
+	return bytes::tokenize(in, delim...);
+};
 
-// Returns the next string from a tokenizer, and advances the cursor. Returns
-// done if there are no tokens left.
+// Like [[tokenize]], but tokenizes the string in reverse, such that the first
+// call to [[next_token]] returns the last token and the last call returns the
+// first token.
+export fn rtokenize(s: str, delim: str) tokenizer = {
+	const in = toutf8(s);
+	const delim = toutf8(delim);
+	for (let ch .. delim) {
+		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
+	};
+	return bytes::rtokenize(in, delim...);
+};
+
+// Returns the next token from a [[tokenizer]] and advances the cursor.
 export fn next_token(s: *tokenizer) (str | done) = {
 	let s = s: *bytes::tokenizer;
 	match (bytes::next_token(s)) {
@@ -45,7 +60,7 @@ export fn next_token(s: *tokenizer) (str | done) = {
 	};
 };
 
-// Same as next_token(), but does not advance the cursor
+// Returns the next token from a [[tokenizer]] without advancing the cursor.
 export fn peek_token(s: *tokenizer) (str | done) = {
 	let s = s: *bytes::tokenizer;
 	return match (bytes::peek_token(s)) {
@@ -56,42 +71,92 @@ export fn peek_token(s: *tokenizer) (str | done) = {
 	};
 };
 
-// Returns the remainder of the string associated with a tokenizer, without doing
-// any further tokenization.
+// Returns the remainder of the input string from a [[tokenizer]] ahead of the
+// token cursor.
 export fn remaining_tokens(s: *tokenizer) str = {
 	let s = s: *bytes::tokenizer;
 	return fromutf8_unsafe(bytes::remaining_tokens(s));
 };
 
+fn tokenize_test(
+	testcase: str,
+	in: str,
+	delim: str,
+	tokens: []str,
+	iters: size = types::SIZE_MAX,
+) tokenizer = {
+	const tok = tokenize(in, delim);
+	let n = 0z;
+	for (const want .. tokens) {
+		if (n >= iters) {
+			return tok;
+		};
+		n += 1;
+
+		const p = peek_token(&tok) as str;
+		const n = next_token(&tok) as str;
+		assert(p == n, testcase);
+		assert(n == want, testcase);
+	};
+
+	if (n >= iters) {
+		return tok;
+	};
+
+	assert(peek_token(&tok) is done, testcase);
+	assert(next_token(&tok) is done, testcase);
+	return tok;
+};
+
 @test fn tokenize() void = {
-	let tok = tokenize("Hello, my name is drew", " ");
-	assert(next_token(&tok) as str == "Hello,");
-	assert(next_token(&tok) as str == "my");
-	assert(peek_token(&tok) as str == "name");
-	assert(next_token(&tok) as str == "name");
-	assert(remaining_tokens(&tok) == "is drew");
-	assert(peek_token(&tok) as str == "is");
-	assert(remaining_tokens(&tok) == "is drew");
-
-	let tok = tokenize("foo", "foo");
-	assert(peek_token(&tok) as str == "");
-	assert(next_token(&tok) as str == "");
-	assert(peek_token(&tok) as str == "");
-	assert(next_token(&tok) as str == "");
-	assert(peek_token(&tok) is done);
-	assert(next_token(&tok) is done);
-
-	let tok = tokenize("", "foo");
-	assert(peek_token(&tok) is done);
-	assert(next_token(&tok) is done);
-
-	let tok = rtokenize("Hello, my name is drew", " ");
-	assert(next_token(&tok) as str == "drew");
-	assert(next_token(&tok) as str == "is");
-	assert(next_token(&tok) as str == "name");
-	assert(remaining_tokens(&tok) == "Hello, my");
-	assert(peek_token(&tok) as str == "my");
-	assert(remaining_tokens(&tok) == "Hello, my");
+	tokenize_test("simple case",
+		"Hello world! My name is Harriet.", " ",
+		[
+			"Hello",
+			"world!",
+			"My",
+			"name",
+			"is",
+			"Harriet.",
+		]);
+
+	tokenize_test("multiple delimiters",
+		"/dev/sda1\t/ ext4 rw,relatime\t0 0", " \t",
+		[
+			"/dev/sda1",
+			"/",
+			"ext4",
+			"rw,relatime",
+			"0",
+			"0",
+		]);
+
+	tokenize_test("consecutive delimiters",
+		"hello    world", " ",
+		[
+			"hello",
+			"",
+			"",
+			"",
+			"world",
+		]);
+
+	tokenize_test("leading delimiters",
+		" hello world ", " ",
+		[
+			"",
+			"hello",
+			"world",
+			"",
+		]);
+
+	const tok = tokenize_test("remaining_tokens",
+		"Hello world! My name is Harriet.", " ",
+		[
+			"Hello",
+			"world!",
+		], 2);
+	assert(remaining_tokens(&tok) == "My name is Harriet.");
 };
 
 // Splits a string into tokens delimited by 'delim', starting at the beginning