hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

tokenize.ha (7729B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use bytes;
      5 use types;
      6 
// The state for a tokenizer. This is a transparent alias of
// [[bytes::tokenizer]], operating on the string's UTF-8 bytes; obtain one
// via [[tokenize]] or [[rtokenize]] rather than constructing it directly.
export type tokenizer = bytes::tokenizer;
      9 
     10 // Returns a tokenizer which yields sub-strings tokenized by a delimiter,
     11 // starting at the beginning of the string.
     12 //
     13 // 	let tok = strings::tokenize("hello, my name is drew", " ");
     14 // 	assert(strings::next_token(&tok) as str == "hello,");
     15 // 	assert(strings::next_token(&tok) as str == "my");
     16 // 	assert(strings::next_token(&tok) as str == "name");
     17 // 	assert(strings::remaining_tokens(&tok) == "is drew");
     18 //
     19 // The caller must ensure that 'delim' is not an empty string.
     20 export fn tokenize(s: str, delim: str) tokenizer =
     21 	bytes::tokenize(toutf8(s), toutf8(delim));
     22 
     23 // Returns a tokenizer which yields sub-strings tokenized by a delimiter,
     24 // starting at the end of the string and moving backwards with each call
     25 // to [[next_token]].
     26 //
     27 // 	let tok = strings::rtokenize("hello, my name is drew", " ");
     28 // 	assert(strings::next_token(&tok) as str == "drew");
     29 // 	assert(strings::next_token(&tok) as str == "is");
     30 // 	assert(strings::next_token(&tok) as str == "name");
     31 // 	assert(strings::remaining_tokens(&tok) == "hello, my");
     32 //
     33 // The caller must ensure that 'delim' is not an empty string.
     34 export fn rtokenize(s: str, delim: str) tokenizer =
     35 	bytes::rtokenize(toutf8(s), toutf8(delim));
     36 
     37 // Returns the next string from a tokenizer, and advances the cursor. Returns
     38 // done if there are no tokens left.
     39 export fn next_token(s: *tokenizer) (str | done) = {
     40 	let s = s: *bytes::tokenizer;
     41 	return match (bytes::next_token(s)) {
     42 	case let b: []u8 =>
     43 		yield fromutf8_unsafe(b);
     44 	case void => yield done;
     45 	};
     46 };
     47 
     48 // Same as next_token(), but does not advance the cursor
     49 export fn peek_token(s: *tokenizer) (str | void) = {
     50 	let s = s: *bytes::tokenizer;
     51 	return match (bytes::peek_token(s)) {
     52 	case let b: []u8 =>
     53 		yield fromutf8_unsafe(b);
     54 	case void => void;
     55 	};
     56 };
     57 
     58 // Returns the remainder of the string associated with a tokenizer, without doing
     59 // any further tokenization.
     60 export fn remaining_tokens(s: *tokenizer) str = {
     61 	let s = s: *bytes::tokenizer;
     62 	return fromutf8_unsafe(bytes::remaining_tokens(s));
     63 };
     64 
     65 @test fn tokenize() void = {
     66 	let tok = tokenize("Hello, my name is drew", " ");
     67 	assert(next_token(&tok) as str == "Hello,");
     68 	assert(next_token(&tok) as str == "my");
     69 	assert(peek_token(&tok) as str == "name");
     70 	assert(next_token(&tok) as str == "name");
     71 	assert(remaining_tokens(&tok) == "is drew");
     72 	assert(peek_token(&tok) as str == "is");
     73 	assert(remaining_tokens(&tok) == "is drew");
     74 
     75 	let tok = tokenize("foo", "foo");
     76 	assert(peek_token(&tok) as str == "");
     77 	assert(next_token(&tok) as str == "");
     78 	assert(peek_token(&tok) as str == "");
     79 	assert(next_token(&tok) as str == "");
     80 	assert(peek_token(&tok) is void);
     81 	assert(next_token(&tok) is done);
     82 
     83 	let tok = tokenize("", "foo");
     84 	assert(peek_token(&tok) is void);
     85 	assert(next_token(&tok) is done);
     86 
     87 	let tok = rtokenize("Hello, my name is drew", " ");
     88 	assert(next_token(&tok) as str == "drew");
     89 	assert(next_token(&tok) as str == "is");
     90 	assert(next_token(&tok) as str == "name");
     91 	assert(remaining_tokens(&tok) == "Hello, my");
     92 	assert(peek_token(&tok) as str == "my");
     93 	assert(remaining_tokens(&tok) == "Hello, my");
     94 };
     95 
     96 // Splits a string into tokens delimited by 'delim', starting at the beginning
     97 // of the string, and returning a slice of up to N tokens. The caller must free
     98 // this slice. The strings within the slice are borrowed from 'in'.
     99 //
    100 // The caller must ensure that 'delim' is not an empty string.
    101 export fn splitn(in: str, delim: str, n: size) []str = {
    102 	let toks: []str = [];
    103 	let tok = tokenize(in, delim);
    104 	for (let i = 0z; i < n - 1z; i += 1) {
    105 		match (next_token(&tok)) {
    106 		case let s: str =>
    107 			append(toks, s);
    108 		case done =>
    109 			return toks;
    110 		};
    111 	};
    112 	match(peek_token(&tok)) {
    113 	case void => void;
    114 	case let s: str =>
    115 		append(toks, remaining_tokens(&tok));
    116 	};
    117 	return toks;
    118 };
    119 
    120 // Splits a string into tokens delimited by 'delim', starting at the end
    121 // of the string, and returning a slice of up to N tokens. The caller must free
    122 // this slice. The strings within the slice are borrowed from 'in'.
    123 //
    124 // The caller must ensure that 'delim' is not an empty string.
    125 export fn rsplitn(in: str, delim: str, n: size) []str = {
    126 	let toks: []str = [];
    127 	let tok = rtokenize(in, delim);
    128 	for (let i = 0z; i < n - 1z; i += 1) {
    129 		match (next_token(&tok)) {
    130 		case let s: str =>
    131 			append(toks, s);
    132 		case done =>
    133 			return toks;
    134 		};
    135 	};
    136 	match(peek_token(&tok)) {
    137 	case void => void;
    138 	case let s: str =>
    139 		append(toks, remaining_tokens(&tok));
    140 	};
    141 
    142 	for (let i = 0z; i < len(toks) / 2; i += 1) {
    143 		const tmp = toks[i];
    144 		toks[i] = toks[len(toks) - i - 1];
    145 		toks[len(toks) - i - 1] = tmp;
    146 	};
    147 
    148 	return toks;
    149 };
    150 
    151 // Splits a string into tokens delimited by 'delim'. The caller must free the
    152 // returned slice. The strings within the slice are borrowed from 'in'.
    153 //
    154 // The caller must ensure that 'delim' is not an empty string.
    155 export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);
    156 
    157 @test fn split() void = {
    158 	const expected = ["Hello,", "my", "name", "is Drew"];
    159 	const actual = splitn("Hello, my name is Drew", " ", 4z);
    160 	assert(len(expected) == len(actual));
    161 	for (let i = 0z; i < len(expected); i += 1) {
    162 		assert(expected[i] == actual[i]);
    163 	};
    164 
    165 	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
    166 	const actual2 = split("Hello, my name is Drew", " ");
    167 	assert(len(expected2) == len(actual2));
    168 	for (let i = 0z; i < len(expected2); i += 1) {
    169 		assert(expected2[i] == actual2[i]);
    170 	};
    171 
    172 	const expected3 = ["one"];
    173 	const actual3 = splitn("one", "=", 2z);
    174 	assert(len(expected3) == len(actual3));
    175 	for (let i = 0z; i < len(expected3); i += 1) {
    176 		assert(expected3[i] == actual3[i]);
    177 	};
    178 
    179 	const expected4 = ["Hello, my", "name", "is", "Drew"];
    180 	const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
    181 	assert(len(expected4) == len(actual4));
    182 	for (let i = 0z; i < len(expected4); i += 1) {
    183 		assert(expected4[i] == actual4[i]);
    184 	};
    185 };
    186 
    187 // Returns a string "cut" along the first instance of a delimiter, returning
    188 // everything up to the delimiter, and everything after the delimiter, in a
    189 // tuple.
    190 //
    191 // 	strings::cut("hello=world=foobar", "=")	// ("hello", "world=foobar")
    192 // 	strings::cut("hello world", "=")	// ("hello world", "")
    193 //
    194 // The return value is borrowed from the 'in' parameter. The caller must ensure
    195 // that 'delim' is not an empty string.
    196 export fn cut(in: str, delim: str) (str, str) = {
    197 	let c = bytes::cut(toutf8(in), toutf8(delim));
    198 	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
    199 };
    200 
    201 // Returns a string "cut" along the last instance of a delimiter, returning
    202 // everything up to the delimiter, and everything after the delimiter, in a
    203 // tuple.
    204 //
    205 // 	strings::rcut("hello=world=foobar", "=")	// ("hello=world", "foobar")
    206 // 	strings::rcut("hello world", "=")	// ("hello world", "")
    207 //
    208 // The return value is borrowed from the 'in' parameter. The caller must ensure
    209 // that 'delim' is not an empty string.
    210 export fn rcut(in: str, delim: str) (str, str) = {
    211 	let c = bytes::rcut(toutf8(in), toutf8(delim));
    212 	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
    213 };
    214 
    215 @test fn cut() void = {
    216 	const sample = cut("hello=world", "=");
    217 	assert(sample.0 == "hello" && sample.1 == "world");
    218 	const sample = cut("hello=world=foobar", "=");
    219 	assert(sample.0 == "hello" && sample.1 == "world=foobar");
    220 	const sample = cut("hello world", "=");
    221 	assert(sample.0 == "hello world" && sample.1 == "");
    222 	const sample = cut("", "=");
    223 	assert(sample.0 == "" && sample.1 == "");
    224 
    225 	const sample = rcut("hello=world=foobar", "=");
    226 	assert(sample.0 == "hello=world" && sample.1 == "foobar");
    227 };