tokenize.ha (7729B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use bytes;
use types;

// Tokenizer state. This is an alias of [[bytes::tokenizer]].
export type tokenizer = bytes::tokenizer;

// Creates a tokenizer over 's' which yields the sub-strings separated by
// 'delim', starting at the beginning of the string.
//
// 	let tok = strings::tokenize("hello, my name is drew", " ");
// 	assert(strings::next_token(&tok) as str == "hello,");
// 	assert(strings::next_token(&tok) as str == "my");
// 	assert(strings::next_token(&tok) as str == "name");
// 	assert(strings::remaining_tokens(&tok) == "is drew");
//
// The caller must ensure that 'delim' is not an empty string.
export fn tokenize(s: str, delim: str) tokenizer = {
	const haystack = toutf8(s);
	const needle = toutf8(delim);
	return bytes::tokenize(haystack, needle);
};

// Creates a tokenizer over 's' which yields the sub-strings separated by
// 'delim', starting at the end of the string and moving backwards with each
// call to [[next_token]].
//
// 	let tok = strings::rtokenize("hello, my name is drew", " ");
// 	assert(strings::next_token(&tok) as str == "drew");
// 	assert(strings::next_token(&tok) as str == "is");
// 	assert(strings::next_token(&tok) as str == "name");
// 	assert(strings::remaining_tokens(&tok) == "hello, my");
//
// The caller must ensure that 'delim' is not an empty string.
export fn rtokenize(s: str, delim: str) tokenizer = {
	const haystack = toutf8(s);
	const needle = toutf8(delim);
	return bytes::rtokenize(haystack, needle);
};

// Returns the next string from a tokenizer, and advances the cursor. Returns
// done if there are no tokens left.
export fn next_token(s: *tokenizer) (str | done) = {
	// The work is delegated to the bytes tokenizer; each byte slice it
	// produces is converted back to a str with fromutf8_unsafe.
	const inner = s: *bytes::tokenizer;
	match (bytes::next_token(inner)) {
	case let token: []u8 =>
		return fromutf8_unsafe(token);
	case void =>
		return done;
	};
};

// Same as [[next_token]], but does not advance the cursor. Returns void if
// there are no tokens left.
export fn peek_token(s: *tokenizer) (str | void) = {
	const inner = s: *bytes::tokenizer;
	match (bytes::peek_token(inner)) {
	case let token: []u8 =>
		return fromutf8_unsafe(token);
	case void =>
		return void;
	};
};

// Returns the part of the string which the tokenizer has not consumed yet,
// without doing any further tokenization.
export fn remaining_tokens(s: *tokenizer) str = {
	const inner = s: *bytes::tokenizer;
	const rest = bytes::remaining_tokens(inner);
	return fromutf8_unsafe(rest);
};

@test fn tokenize() void = {
	// Forward tokenization; peeking must not consume anything.
	let fwd = tokenize("Hello, my name is drew", " ");
	assert(next_token(&fwd) as str == "Hello,");
	assert(next_token(&fwd) as str == "my");
	assert(peek_token(&fwd) as str == "name");
	assert(next_token(&fwd) as str == "name");
	assert(remaining_tokens(&fwd) == "is drew");
	assert(peek_token(&fwd) as str == "is");
	assert(remaining_tokens(&fwd) == "is drew");

	// Input equal to the delimiter: an empty token on either side of it.
	let only_delim = tokenize("foo", "foo");
	assert(peek_token(&only_delim) as str == "");
	assert(next_token(&only_delim) as str == "");
	assert(peek_token(&only_delim) as str == "");
	assert(next_token(&only_delim) as str == "");
	assert(peek_token(&only_delim) is void);
	assert(next_token(&only_delim) is done);

	// Empty input yields no tokens at all.
	let empty = tokenize("", "foo");
	assert(peek_token(&empty) is void);
	assert(next_token(&empty) is done);

	// Reverse tokenization walks from the end of the string.
	let rev = rtokenize("Hello, my name is drew", " ");
	assert(next_token(&rev) as str == "drew");
	assert(next_token(&rev) as str == "is");
	assert(next_token(&rev) as str == "name");
	assert(remaining_tokens(&rev) == "Hello, my");
	assert(peek_token(&rev) as str == "my");
	assert(remaining_tokens(&rev) == "Hello, my");
};

// Splits a string into tokens delimited by 'delim', starting at the
// beginning of the string, and returning a slice of up to N tokens. The caller
// must free this slice. The strings within the slice are borrowed from 'in'.
// If N is zero, an empty slice is returned.
//
// The caller must ensure that 'delim' is not an empty string.
export fn splitn(in: str, delim: str, n: size) []str = {
	let toks: []str = [];
	// Guard against n == 0: 'n - 1z' below would underflow and the limit
	// would no longer be honoured.
	if (n == 0) {
		return toks;
	};

	let tok = tokenize(in, delim);
	// Collect the first n - 1 tokens one at a time...
	for (let i = 0z; i < n - 1z; i += 1) {
		match (next_token(&tok)) {
		case let s: str =>
			append(toks, s);
		case done =>
			return toks;
		};
	};
	// ...then, if anything is left, add the unsplit remainder as the final
	// token.
	match (peek_token(&tok)) {
	case void => void;
	case str =>
		append(toks, remaining_tokens(&tok));
	};
	return toks;
};

// Splits a string into tokens delimited by 'delim', starting at the end
// of the string, and returning a slice of up to N tokens. The caller must free
// this slice. The strings within the slice are borrowed from 'in'. If N is
// zero, an empty slice is returned.
//
// The caller must ensure that 'delim' is not an empty string.
export fn rsplitn(in: str, delim: str, n: size) []str = {
	let toks: []str = [];
	// Guard against n == 0: 'n - 1z' below would underflow and the limit
	// would no longer be honoured.
	if (n == 0) {
		return toks;
	};

	let tok = rtokenize(in, delim);
	// Collect the last n - 1 tokens, walking backwards from the end...
	for (let i = 0z; i < n - 1z; i += 1) {
		match (next_token(&tok)) {
		case let s: str =>
			append(toks, s);
		case done =>
			return toks;
		};
	};
	// ...then, if anything is left, add the unsplit remainder (the front
	// of the string).
	match (peek_token(&tok)) {
	case void => void;
	case str =>
		append(toks, remaining_tokens(&tok));
	};

	// Tokens were gathered back to front; reverse the slice in place.
	for (let i = 0z; i < len(toks) / 2; i += 1) {
		const tmp = toks[i];
		toks[i] = toks[len(toks) - i - 1];
		toks[len(toks) - i - 1] = tmp;
	};

	return toks;
};

// Splits a string into tokens delimited by 'delim'. The caller must free the
// returned slice. The strings within the slice are borrowed from 'in'.
//
// The caller must ensure that 'delim' is not an empty string.
export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);

@test fn split() void = {
	// The slices returned by splitn, split and rsplitn are owned by the
	// caller, so each one is freed when the test returns.
	const expected = ["Hello,", "my", "name", "is Drew"];
	const actual = splitn("Hello, my name is Drew", " ", 4z);
	defer free(actual);
	assert(len(expected) == len(actual));
	for (let i = 0z; i < len(expected); i += 1) {
		assert(expected[i] == actual[i]);
	};

	const expected2 = ["Hello,", "my", "name", "is", "Drew"];
	const actual2 = split("Hello, my name is Drew", " ");
	defer free(actual2);
	assert(len(expected2) == len(actual2));
	for (let i = 0z; i < len(expected2); i += 1) {
		assert(expected2[i] == actual2[i]);
	};

	// Delimiter absent from the input: a single token comes back.
	const expected3 = ["one"];
	const actual3 = splitn("one", "=", 2z);
	defer free(actual3);
	assert(len(expected3) == len(actual3));
	for (let i = 0z; i < len(expected3); i += 1) {
		assert(expected3[i] == actual3[i]);
	};

	// Splitting from the end leaves the unsplit remainder at the front.
	const expected4 = ["Hello, my", "name", "is", "Drew"];
	const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
	defer free(actual4);
	assert(len(expected4) == len(actual4));
	for (let i = 0z; i < len(expected4); i += 1) {
		assert(expected4[i] == actual4[i]);
	};
};

// Returns a string "cut" along the first instance of a delimiter, returning
// everything up to the delimiter, and everything after the delimiter, in a
// tuple.
//
// 	strings::cut("hello=world=foobar", "=") // ("hello", "world=foobar")
// 	strings::cut("hello world", "=") // ("hello world", "")
//
// The return value is borrowed from the 'in' parameter. The caller must ensure
// that 'delim' is not an empty string.
export fn cut(in: str, delim: str) (str, str) = {
	// The cut is done on the UTF-8 bytes; both halves are then converted
	// back to strings.
	let c = bytes::cut(toutf8(in), toutf8(delim));
	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
};

// Returns a string "cut" along the last instance of a delimiter, returning
// everything up to the delimiter, and everything after the delimiter, in a
// tuple.
//
// 	strings::rcut("hello=world=foobar", "=") // ("hello=world", "foobar")
// 	strings::rcut("hello world", "=") // ("hello world", "")
//
// Both halves are borrowed from the 'in' parameter. The caller must ensure
// that 'delim' is not an empty string.
export fn rcut(in: str, delim: str) (str, str) = {
	const halves = bytes::rcut(toutf8(in), toutf8(delim));
	const before = fromutf8_unsafe(halves.0);
	const after = fromutf8_unsafe(halves.1);
	return (before, after);
};

@test fn cut() void = {
	// Single delimiter.
	const single = cut("hello=world", "=");
	assert(single.0 == "hello");
	assert(single.1 == "world");

	// Several delimiters: cut splits on the first one only.
	const several = cut("hello=world=foobar", "=");
	assert(several.0 == "hello");
	assert(several.1 == "world=foobar");

	// Delimiter absent: the whole input, then an empty string.
	const absent = cut("hello world", "=");
	assert(absent.0 == "hello world");
	assert(absent.1 == "");

	// Empty input.
	const empty = cut("", "=");
	assert(empty.0 == "");
	assert(empty.1 == "");

	// rcut splits on the last delimiter.
	const last = rcut("hello=world=foobar", "=");
	assert(last.0 == "hello=world");
	assert(last.1 == "foobar");
};