tokenize.ha (8549B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use bytes;
use types;

// A string tokenizer; a thin alias of [[bytes::tokenizer]] operating on the
// string's underlying UTF-8 representation.
export type tokenizer = bytes::tokenizer;

// Tokenizes a string, returning an iterator that yields substrings separated by
// one or more delimiters, such that the string will be split along any of the
// characters found in "delim". If the string begins with or ends with a
// delimiter, an empty string is returned respectively as the first and last
// call to [[next_token]].
//
// Each character of the delimiter string must be an ASCII character (see
// [[ascii::valid]]).
//
// The input string and delimiter string are borrowed from the caller for the
// lifetime of the tokenizer.
//
// The caller must ensure that at least one delimiter is provided and that the
// length of the input string is less than [[types::I64_MAX]].
//
// 	const tok = strings::tokenize("Hello world!\tMy name is Harriet.", " \t");
// 	assert(next_token(&tok) as str == "Hello");
// 	assert(next_token(&tok) as str == "world!");
// 	assert(next_token(&tok) as str == "My");
// 	assert(next_token(&tok) as str == "name");
// 	assert(next_token(&tok) as str == "is");
// 	assert(next_token(&tok) as str == "Harriet.");
// 	assert(next_token(&tok) is done);
export fn tokenize(s: str, delim: str) tokenizer = {
	const in = toutf8(s);
	const delim = toutf8(delim);
	// Delimiters are matched byte-wise; a non-ASCII delimiter byte could
	// split a multi-byte UTF-8 sequence, so reject anything >= 0x80.
	for (let ch .. delim) {
		assert(ch & 0x80 == 0, "strings::tokenize cannot tokenize on non-ASCII delimiters");
	};
	return bytes::tokenize(in, delim...);
};

// Like [[tokenize]], but tokenizes the string in reverse, such that the first
// call to [[next_token]] returns the last token and the last call returns the
// first token.
export fn rtokenize(s: str, delim: str) tokenizer = {
	const in = toutf8(s);
	const delim = toutf8(delim);
	// Same ASCII restriction as [[tokenize]]; see the comment there.
	for (let ch .. delim) {
		assert(ch & 0x80 == 0, "strings::rtokenize cannot tokenize on non-ASCII delimiters");
	};
	return bytes::rtokenize(in, delim...);
};

// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) (str | done) = {
	let s = s: *bytes::tokenizer;
	match (bytes::next_token(s)) {
	case let b: []u8 =>
		// Safe: the tokenizer only splits on ASCII bytes, so every
		// token is itself valid UTF-8.
		return fromutf8_unsafe(b);
	case done =>
		return done;
	};
};

// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) (str | done) = {
	let s = s: *bytes::tokenizer;
	match (bytes::peek_token(s)) {
	case let b: []u8 =>
		return fromutf8_unsafe(b);
	case done =>
		return done;
	};
};

// Returns the remainder of the input string from a [[tokenizer]] ahead of the
// token cursor.
export fn remaining_tokens(s: *tokenizer) str = {
	let s = s: *bytes::tokenizer;
	return fromutf8_unsafe(bytes::remaining_tokens(s));
};

// Test helper: tokenizes "in" on "delim" and asserts that the yielded tokens
// match "tokens" exactly, checking that peek and next agree at every step. If
// "iters" is provided, stops after that many tokens and returns the tokenizer
// mid-iteration (used to exercise [[remaining_tokens]]).
fn tokenize_test(
	testcase: str,
	in: str,
	delim: str,
	tokens: []str,
	iters: size = types::SIZE_MAX,
) tokenizer = {
	const tok = tokenize(in, delim);
	let n = 0z;
	for (const want .. tokens) {
		if (n >= iters) {
			return tok;
		};
		n += 1;

		// Peek must agree with the subsequent next, and both must
		// match the expected token.
		const p = peek_token(&tok) as str;
		const got = next_token(&tok) as str;
		assert(p == got, testcase);
		assert(got == want, testcase);
	};

	if (n >= iters) {
		return tok;
	};

	// All expected tokens consumed; the tokenizer must be exhausted.
	assert(peek_token(&tok) is done, testcase);
	assert(next_token(&tok) is done, testcase);
	return tok;
};

@test fn tokenize() void = {
	tokenize_test("simple case",
		"Hello world! My name is Harriet.", " ",
		[
			"Hello",
			"world!",
			"My",
			"name",
			"is",
			"Harriet.",
		]);

	tokenize_test("multiple delimiters",
		"/dev/sda1\t/ ext4 rw,relatime\t0 0", " \t",
		[
			"/dev/sda1",
			"/",
			"ext4",
			"rw,relatime",
			"0",
			"0",
		]);

	tokenize_test("consecutive delimiters",
		"hello    world", " ",
		[
			"hello",
			"",
			"",
			"",
			"world",
		]);

	tokenize_test("leading delimiters",
		" hello world ", " ",
		[
			"",
			"hello",
			"world",
			"",
		]);

	const tok = tokenize_test("remaining_tokens",
		"Hello world! My name is Harriet.", " ",
		[
			"Hello",
			"world!",
		], 2);
	assert(remaining_tokens(&tok) == "My name is Harriet.");
};

// Splits a string into tokens delimited by 'delim', starting at the beginning
// of the string, and returning a slice of up to N tokens. The caller must free
// this slice. The strings within the slice are borrowed from 'in'.
//
// If N is zero, an empty slice is returned.
//
// The caller must ensure that 'delim' is not an empty string.
export fn splitn(in: str, delim: str, n: size) []str = {
	let toks: []str = [];
	// Guard n == 0: "n - 1z" below would otherwise wrap around to
	// SIZE_MAX and return every token instead of none.
	if (n == 0) {
		return toks;
	};
	let tok = tokenize(in, delim);
	for (let i = 0z; i < n - 1z; i += 1) {
		match (next_token(&tok)) {
		case let s: str =>
			append(toks, s)!;
		case done =>
			return toks;
		};
	};
	match (peek_token(&tok)) {
	case done => void;
	case str =>
		// Fold everything past the (N-1)th delimiter into one final,
		// unsplit token.
		append(toks, remaining_tokens(&tok))!;
	};
	return toks;
};

// Splits a string into tokens delimited by 'delim', starting at the end
// of the string, and returning a slice of up to N tokens. The caller must free
// this slice. The strings within the slice are borrowed from 'in'.
//
// The caller must ensure that 'delim' is not an empty string.
export fn rsplitn(in: str, delim: str, n: size) []str = {
	let toks: []str = [];
	// Guard n == 0: "n - 1z" below would otherwise wrap around to
	// SIZE_MAX and return every token instead of none.
	if (n == 0) {
		return toks;
	};
	let tok = rtokenize(in, delim);
	for (let i = 0z; i < n - 1z; i += 1) {
		match (next_token(&tok)) {
		case let s: str =>
			append(toks, s)!;
		case done =>
			return toks;
		};
	};
	match (peek_token(&tok)) {
	case done => void;
	case str =>
		// Everything ahead of the cursor (the start of the input)
		// becomes the final, unsplit token.
		append(toks, remaining_tokens(&tok))!;
	};

	// Tokens were collected back-to-front; reverse the slice in place so
	// the result reads in string order.
	for (let i = 0z; i < len(toks) / 2; i += 1) {
		const tmp = toks[i];
		toks[i] = toks[len(toks) - i - 1];
		toks[len(toks) - i - 1] = tmp;
	};

	return toks;
};

// Splits a string into tokens delimited by 'delim'. The caller must free the
// returned slice. The strings within the slice are borrowed from 'in'.
//
// The caller must ensure that 'delim' is not an empty string.
export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);

@test fn split() void = {
	const expected = ["Hello,", "my", "name", "is Drew"];
	const actual = splitn("Hello, my name is Drew", " ", 4z);
	assert(len(expected) == len(actual));
	for (let i = 0z; i < len(expected); i += 1) {
		assert(expected[i] == actual[i]);
	};
	free(actual);

	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
	const actual2 = split("Hello, my name is Drew", " ");
	assert(len(expected2) == len(actual2));
	for (let i = 0z; i < len(expected2); i += 1) {
		assert(expected2[i] == actual2[i]);
	};
	free(actual2);

	const expected3 = ["one"];
	const actual3 = splitn("one", "=", 2z);
	assert(len(expected3) == len(actual3));
	for (let i = 0z; i < len(expected3); i += 1) {
		assert(expected3[i] == actual3[i]);
	};
	free(actual3);

	const expected4 = ["Hello, my", "name", "is", "Drew"];
	const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
	assert(len(expected4) == len(actual4));
	for (let i = 0z; i < len(expected4); i += 1) {
		assert(expected4[i] == actual4[i]);
	};
	free(actual4);
};

// Returns a string "cut" along the first instance of a delimiter, returning
// everything up to the delimiter, and everything after the delimiter, in a
// tuple.
//
// 	strings::cut("hello=world=foobar", "=") // ("hello", "world=foobar")
// 	strings::cut("hello world", "=") // ("hello world", "")
//
// The return value is borrowed from the 'in' parameter. The caller must ensure
// that 'delim' is not an empty string.
export fn cut(in: str, delim: str) (str, str) = {
	// Safe: bytes::cut splits on a whole UTF-8 delimiter string, so both
	// halves remain valid UTF-8.
	let c = bytes::cut(toutf8(in), toutf8(delim));
	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
};

// Returns a string "cut" along the last instance of a delimiter, returning
// everything up to the delimiter, and everything after the delimiter, in a
// tuple.
//
// 	strings::rcut("hello=world=foobar", "=") // ("hello=world", "foobar")
// 	strings::rcut("hello world", "=") // ("hello world", "")
//
// The return value is borrowed from the 'in' parameter. The caller must ensure
// that 'delim' is not an empty string.
export fn rcut(in: str, delim: str) (str, str) = {
	let c = bytes::rcut(toutf8(in), toutf8(delim));
	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
};

@test fn cut() void = {
	const sample = cut("hello=world", "=");
	assert(sample.0 == "hello" && sample.1 == "world");
	const sample = cut("hello=world=foobar", "=");
	assert(sample.0 == "hello" && sample.1 == "world=foobar");
	const sample = cut("hello world", "=");
	assert(sample.0 == "hello world" && sample.1 == "");
	const sample = cut("", "=");
	assert(sample.0 == "" && sample.1 == "");

	const sample = rcut("hello=world=foobar", "=");
	assert(sample.0 == "hello=world" && sample.1 == "foobar");
};