hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

tokenize.ha (8549B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use bytes;
      5 use types;
      6 
      7 export type tokenizer = bytes::tokenizer;
      8 
      9 // Tokenizes a string, returning an iterator that yields substrings separated by
     10 // one or more delimiters, such that the string will be split along any of the
     11 // characters found in "delim". If the string begins with or ends with a
     12 // delimiter, an empty string is returned respectively as the first and last
     13 // call to [[next_token]].
     14 //
     15 // Each character of the delimiter string must be an ASCII character (see
     16 // [[ascii::valid]]).
     17 //
     18 // The input string and delimiter string are borrowed from the caller for the
     19 // lifetime of the tokenizer.
     20 //
     21 // The caller must ensure that at least one delimiter is provided and that the
     22 // length of the input string is less than [[types::I64_MAX]].
     23 //
     24 // 	const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
     25 // 	assert(next_token(&tok) as str == "Hello");
     26 // 	assert(next_token(&tok) as str == "world!");
     27 // 	assert(next_token(&tok) as str == "My");
     28 // 	assert(next_token(&tok) as str == "name");
     29 // 	assert(next_token(&tok) as str == "is");
     30 // 	assert(next_token(&tok) as str == "Harriet");
     31 // 	assert(next_token(&tok) is done);
     32 export fn tokenize(s: str, delim: str) tokenizer = {
     33 	const in = toutf8(s);
     34 	const delim = toutf8(delim);
     35 	for (let ch .. delim) {
     36 		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
     37 	};
     38 	return bytes::tokenize(in, delim...);
     39 };
     40 
     41 // Like [[tokenize]], but tokenizes the string in reverse, such that the first
     42 // call to [[next_token]] returns the last token and the last call returns the
     43 // first token.
     44 export fn rtokenize(s: str, delim: str) tokenizer = {
     45 	const in = toutf8(s);
     46 	const delim = toutf8(delim);
     47 	for (let ch .. delim) {
     48 		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
     49 	};
     50 	return bytes::rtokenize(in, delim...);
     51 };
     52 
     53 // Returns the next token from a [[tokenizer]] and advances the cursor.
     54 export fn next_token(s: *tokenizer) (str | done) = {
     55 	let s = s: *bytes::tokenizer;
     56 	match (bytes::next_token(s)) {
     57 	case let b: []u8 =>
     58 		return fromutf8_unsafe(b);
     59 	case done => return done;
     60 	};
     61 };
     62 
     63 // Returns the next token from a [[tokenizer]] without advancing the cursor.
     64 export fn peek_token(s: *tokenizer) (str | done) = {
     65 	let s = s: *bytes::tokenizer;
     66 	return match (bytes::peek_token(s)) {
     67 	case let b: []u8 =>
     68 		yield fromutf8_unsafe(b);
     69 	case done =>
     70 		return done;
     71 	};
     72 };
     73 
     74 // Returns the remainder of the input string from a [[tokenizer]] ahead of the
     75 // token cursor.
     76 export fn remaining_tokens(s: *tokenizer) str = {
     77 	let s = s: *bytes::tokenizer;
     78 	return fromutf8_unsafe(bytes::remaining_tokens(s));
     79 };
     80 
     81 fn tokenize_test(
     82 	testcase: str,
     83 	in: str,
     84 	delim: str,
     85 	tokens: []str,
     86 	iters: size = types::SIZE_MAX,
     87 ) tokenizer = {
     88 	const tok = tokenize(in, delim);
     89 	let n = 0z;
     90 	for (const want .. tokens) {
     91 		if (n >= iters) {
     92 			return tok;
     93 		};
     94 		n += 1;
     95 
     96 		const p = peek_token(&tok) as str;
     97 		const n = next_token(&tok) as str;
     98 		assert(p == n, testcase);
     99 		assert(n == want, testcase);
    100 	};
    101 
    102 	if (n >= iters) {
    103 		return tok;
    104 	};
    105 
    106 	assert(peek_token(&tok) is done, testcase);
    107 	assert(next_token(&tok) is done, testcase);
    108 	return tok;
    109 };
    110 
// Exercises [[tokenize]] via [[tokenize_test]]: plain splitting, multiple
// delimiter characters, consecutive delimiters (which yield empty tokens),
// leading/trailing delimiters, and a capped run that leaves input behind for
// [[remaining_tokens]].
@test fn tokenize() void = {
	tokenize_test("simple case",
		"Hello world! My name is Harriet.", " ",
		[
			"Hello",
			"world!",
			"My",
			"name",
			"is",
			"Harriet.",
		]);

	// Any of the characters in the delimiter set splits the string.
	tokenize_test("multiple delimiters",
		"/dev/sda1\t/ ext4 rw,relatime\t0 0", " \t",
		[
			"/dev/sda1",
			"/",
			"ext4",
			"rw,relatime",
			"0",
			"0",
		]);

	// Runs of delimiters are NOT collapsed: each adjacent pair yields an
	// empty token.
	tokenize_test("consecutive delimiters",
		"hello    world", " ",
		[
			"hello",
			"",
			"",
			"",
			"world",
		]);

	// A leading/trailing delimiter produces an empty first/last token.
	tokenize_test("leading delimiters",
		" hello world ", " ",
		[
			"",
			"hello",
			"world",
			"",
		]);

	// Stop after two tokens; the untouched tail is still available.
	const tok = tokenize_test("remaining_tokens",
		"Hello world! My name is Harriet.", " ",
		[
			"Hello",
			"world!",
		], 2);
	assert(remaining_tokens(&tok) == "My name is Harriet.");
};
    161 
    162 // Splits a string into tokens delimited by 'delim', starting at the beginning
    163 // of the string, and returning a slice of up to N tokens. The caller must free
    164 // this slice. The strings within the slice are borrowed from 'in'.
    165 //
    166 // The caller must ensure that 'delim' is not an empty string.
    167 export fn splitn(in: str, delim: str, n: size) []str = {
    168 	let toks: []str = [];
    169 	let tok = tokenize(in, delim);
    170 	for (let i = 0z; i < n - 1z; i += 1) {
    171 		match (next_token(&tok)) {
    172 		case let s: str =>
    173 			append(toks, s)!;
    174 		case done =>
    175 			return toks;
    176 		};
    177 	};
    178 	match(peek_token(&tok)) {
    179 	case done => void;
    180 	case let s: str =>
    181 		append(toks, remaining_tokens(&tok))!;
    182 	};
    183 	return toks;
    184 };
    185 
    186 // Splits a string into tokens delimited by 'delim', starting at the end
    187 // of the string, and returning a slice of up to N tokens. The caller must free
    188 // this slice. The strings within the slice are borrowed from 'in'.
    189 //
    190 // The caller must ensure that 'delim' is not an empty string.
    191 export fn rsplitn(in: str, delim: str, n: size) []str = {
    192 	let toks: []str = [];
    193 	let tok = rtokenize(in, delim);
    194 	for (let i = 0z; i < n - 1z; i += 1) {
    195 		match (next_token(&tok)) {
    196 		case let s: str =>
    197 			append(toks, s)!;
    198 		case done =>
    199 			return toks;
    200 		};
    201 	};
    202 	match(peek_token(&tok)) {
    203 	case done => void;
    204 	case let s: str =>
    205 		append(toks, remaining_tokens(&tok))!;
    206 	};
    207 
    208 	for (let i = 0z; i < len(toks) / 2; i += 1) {
    209 		const tmp = toks[i];
    210 		toks[i] = toks[len(toks) - i - 1];
    211 		toks[len(toks) - i - 1] = tmp;
    212 	};
    213 
    214 	return toks;
    215 };
    216 
    217 // Splits a string into tokens delimited by 'delim'. The caller must free the
    218 // returned slice. The strings within the slice are borrowed from 'in'.
    219 //
    220 // The caller must ensure that 'delim' is not an empty string.
    221 export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);
    222 
// Exercises [[splitn]], [[split]], and [[rsplitn]]: a capped forward split
// (remainder kept whole in the last slot), an unbounded split, a no-delimiter
// input, and a capped reverse split (remainder kept whole in the FIRST slot).
@test fn split() void = {
	// splitn with n=4: the fourth slot holds the untokenized remainder.
	const expected = ["Hello,", "my", "name", "is Drew"];
	const actual = splitn("Hello, my name is Drew", " ", 4z);
	assert(len(expected) == len(actual));
	for (let i = 0z; i < len(expected); i += 1) {
		assert(expected[i] == actual[i]);
	};
	free(actual);

	// Unbounded split: every delimiter splits.
	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
	const actual2 = split("Hello, my name is Drew", " ");
	assert(len(expected2) == len(actual2));
	for (let i = 0z; i < len(expected2); i += 1) {
		assert(expected2[i] == actual2[i]);
	};
	free(actual2);

	// No delimiter present: the whole input is the single token.
	const expected3 = ["one"];
	const actual3 = splitn("one", "=", 2z);
	assert(len(expected3) == len(actual3));
	for (let i = 0z; i < len(expected3); i += 1) {
		assert(expected3[i] == actual3[i]);
	};
	free(actual3);

	// rsplitn with n=4: splits from the end; the first slot holds the
	// untokenized head of the string.
	const expected4 = ["Hello, my", "name", "is", "Drew"];
	const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
	assert(len(expected4) == len(actual4));
	for (let i = 0z; i < len(expected4); i += 1) {
		assert(expected4[i] == actual4[i]);
	};
	free(actual4);
};
    256 
    257 // Returns a string "cut" along the first instance of a delimiter, returning
    258 // everything up to the delimiter, and everything after the delimiter, in a
    259 // tuple.
    260 //
    261 // 	strings::cut("hello=world=foobar", "=")	// ("hello", "world=foobar")
    262 // 	strings::cut("hello world", "=")	// ("hello world", "")
    263 //
    264 // The return value is borrowed from the 'in' parameter. The caller must ensure
    265 // that 'delim' is not an empty string.
    266 export fn cut(in: str, delim: str) (str, str) = {
    267 	let c = bytes::cut(toutf8(in), toutf8(delim));
    268 	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
    269 };
    270 
    271 // Returns a string "cut" along the last instance of a delimiter, returning
    272 // everything up to the delimiter, and everything after the delimiter, in a
    273 // tuple.
    274 //
    275 // 	strings::rcut("hello=world=foobar", "=")	// ("hello=world", "foobar")
    276 // 	strings::rcut("hello world", "=")	// ("hello world", "")
    277 //
    278 // The return value is borrowed from the 'in' parameter. The caller must ensure
    279 // that 'delim' is not an empty string.
    280 export fn rcut(in: str, delim: str) (str, str) = {
    281 	let c = bytes::rcut(toutf8(in), toutf8(delim));
    282 	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
    283 };
    284 
// Exercises [[cut]] and [[rcut]]: first vs. last delimiter instance, a
// missing delimiter (whole input, empty tail), and empty input.
@test fn cut() void = {
	const sample = cut("hello=world", "=");
	assert(sample.0 == "hello" && sample.1 == "world");
	// Only the FIRST '=' splits; the rest stays in the tail.
	const sample = cut("hello=world=foobar", "=");
	assert(sample.0 == "hello" && sample.1 == "world=foobar");
	// Delimiter absent: whole input in .0, empty .1.
	const sample = cut("hello world", "=");
	assert(sample.0 == "hello world" && sample.1 == "");
	const sample = cut("", "=");
	assert(sample.0 == "" && sample.1 == "");

	// rcut splits on the LAST '='.
	const sample = rcut("hello=world=foobar", "=");
	assert(sample.0 == "hello=world" && sample.1 == "foobar");
};