hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 4ec74642362c0cef067d231e851550a374e91e9a
parent fd0359ea21f152be1f82f33db5d3f3b2b297458b
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat,  6 Feb 2021 15:18:02 -0500

bytes, strings: add tokenize, split

Diffstat:
A bytes/tokenize.ha   | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M strings/dup.ha      |   8 ++++++++
A strings/tokenize.ha |  91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 218 insertions(+), 0 deletions(-)

diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -0,0 +1,119 @@
+// The state for a tokenizer.
+export type tokenizer = struct { s: []u8, d: []u8 };
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter.
+export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
+	s = s,
+	d = delim,
+};
+
+// Returns the number of bytes in a which are equal to bytes in b.
+fn nequal(a: []u8, b: []u8) size = {
+	let i = 0z;
+	for (i < len(a) && i < len(b); i += 1z) {
+		if (a[i] != b[i]) {
+			break;
+		};
+	};
+	return i;
+};
+
+// Returns the next slice from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left. If a string starts with, or ends with, a
+// token, an empty slice is returned at the beginning or end of the sequence,
+// respectively.
+export fn next_token(s: *tokenizer) ([]u8 | void) = {
+	let i = 0z;
+	for (i < len(s.s)) {
+		let n = nequal(s.s[i..], s.d);
+		if (n == len(s.d)) {
+			let tok = s.s[..i];
+			if (len(tok) + len(s.d) == len(s.s) && len(tok) != 0z) {
+				s.s = s.s[i..];
+			} else {
+				s.s = s.s[i+len(s.d)..];
+			};
+			return tok;
+		} else if (n != 0z) {
+			i += n;
+		} else {
+			i += 1z;
+		};
+	};
+
+	if (len(s.s) != 0z) {
+		let tok = s.s[..];
+		s.s = s.s[..0];
+		return tok;
+	};
+
+	return void;
+};
+
+// Returns the remainder of the slice associated with a tokenizer, without doing
+// any further tokenization.
+export fn remaining_tokens(s: *tokenizer) []u8 = {
+	return s.s;
+};
+
+@test fn tokenize() void = {
+	const input = [1u8, 2u8, 24u8, 42u8, 3u8, 24u8, 24u8, 42u8, 4u8, 5u8];
+	let t = tokenize(input, [24u8, 42u8]);
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([1u8, 2u8], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([3u8, 24u8], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([4u8, 5u8], b)),
+		void => abort(),
+	};
+
+	assert(next_token(&t) is void);
+
+	const input2 = [24u8, 42u8, 1u8, 24u8, 42u8];
+	t = tokenize(input2, [24u8, 42u8]);
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([1u8], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([], b)),
+		void => abort(),
+	};
+
+	assert(next_token(&t) is void);
+
+	const input3 = [1u8, 1u8, 1u8, 2u8, 1u8, 1u8, 2u8, 2u8];
+	t = tokenize(input3, [1u8, 2u8]);
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([1u8, 1u8], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([1u8], b)),
+		void => abort(),
+	};
+
+	match (next_token(&t)) {
+		b: []u8 => assert(equal([2u8], b)),
+		void => abort(),
+	};
+
+	assert(next_token(&t) is void);
+};
diff --git a/strings/dup.ha b/strings/dup.ha
@@ -17,3 +17,11 @@ export fn dup(s: const str) str = {
 	};
 	return *(&out: *str);
 };
+
+// Duplicates every string of a slice in place, returning the same slice with
+// new strings.
+export fn dup_all(s: []str) void = {
+	for (let i = 0z; i < len(s); i += 1z) {
+		s[i] = strings::dup(s[i]);
+	};
+};
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -0,0 +1,91 @@
+use bytes;
+use types;
+
+// The state for a tokenizer.
+type tokenizer = bytes::tokenizer;
+
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter.
+//
+// let tok = strings::tokenize("hello, my name is drew", " ");
+// assert(strings::token(tok) == "hello,");
+// assert(strings::token(tok) == "my");
+// assert(strings::token(tok) == "name");
+// assert(strings::remaining_tokens(tok) == "is drew");
+export fn tokenize(s: str, delim: str) tokenizer =
+	bytes::tokenize(to_utf8(s), to_utf8(delim));
+
+// Returns the next string from a tokenizer, and advances the cursor. Returns
+// void if there are no tokens left.
+export fn next_token(s: *tokenizer) (str | void) = {
+	return match (bytes::next_token(s)) {
+		b: []u8 => from_utf8(b),
+		void => void,
+	};
+};
+
+// Returns the remainder of the string associated with a tokenizer, without doing
+// any further tokenization.
+export fn remaining_tokens(s: *tokenizer) str = {
+	return from_utf8(bytes::remaining_tokens(s));
+};
+
+@test fn tokenize() void = {
+	let tok = tokenize("Hello, my name is drew", " ");
+	match (next_token(&tok)) {
+		s: str => assert(s == "Hello,"),
+		void => abort(),
+	};
+
+	match (next_token(&tok)) {
+		s: str => assert(s == "my"),
+		void => abort(),
+	};
+
+	match (next_token(&tok)) {
+		s: str => assert(s == "name"),
+		void => abort(),
+	};
+
+	assert(remaining_tokens(&tok) == "is drew");
+};
+
+// Splits a string into tokens delimited by 'delim', returning a slice of up to
+// N tokens. The caller must free this slice. The strings within the slice are
+// borrowed from 'in', and needn't be freed - but should be [strings::dup_all]'d
+// if they should outlive 'in'.
+export fn splitN(in: str, delim: str, n: size) []str = {
+	let toks = alloc([]str, [], 8z); // TODO: Drop explicit capacity
+	let tok = tokenize(in, delim);
+	for (let i = 0z; i < n - 1z; i += 1z) {
+		match (next_token(&tok)) {
+			s: str => {
+				append(toks, s);
+			},
+			void => return toks,
+		};
+	};
+	append(toks, remaining_tokens(&tok));
+	return toks;
+};
+
+// Splits a string into tokens delimited by 'delim'. The caller must free the
+// returned slice. The strings within the slice are borrowed from 'in', and
+// needn't be freed - but must be [strings::dup_all]'d if they should outlive
+// 'in'.
+export fn split(in: str, delim: str) []str = splitN(in, delim, types::SIZE_MAX);
+
+@test fn split() void = {
+	const expected = ["Hello,", "my", "name", "is Drew"];
+	const actual = splitN("Hello, my name is Drew", " ", 4z);
+	assert(len(expected) == len(actual));
+	for (let i = 0z; i < len(expected); i += 1z) {
+		assert(expected[i] == actual[i]);
+	};
+
+	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
+	const actual2 = split("Hello, my name is Drew", " ");
+	assert(len(expected2) == len(actual2));
+	for (let i = 0z; i < len(expected2); i += 1z) {
+		assert(expected2[i] == actual2[i]);
+	};
+};
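
A rough usage sketch of the API introduced by this commit (not part of the diff above). It assumes the match syntax of this era of Hare as shown in the tests, and that the slice returned by strings::split is released with the free builtin, as the doc comments require:

use strings;

export fn main() void = {
	// Lazy tokenization: pull tokens one at a time from the tokenizer.
	let tok = strings::tokenize("Hello, my name is Drew", " ");
	match (strings::next_token(&tok)) {
		s: str => assert(s == "Hello,"),
		void => abort(),
	};
	// The untokenized remainder of the input can be taken in one piece.
	assert(strings::remaining_tokens(&tok) == "my name is Drew");

	// Eager splitting: the returned slice must be freed by the caller,
	// while the strings inside it are borrowed from the input.
	const parts = strings::split("Hello, my name is Drew", " ");
	assert(len(parts) == 5z);
	free(parts);
};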