scanner.ha (10623B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use bytes;
use encoding::utf8;
use errors;
use io;
use strings;
use types;

// Size, in bytes, of the initial read buffer allocated by [[newscanner]], and
// the base increment used when the buffer has to be expanded.
def BUFSZ: size = 4096;

// Stream vtable for [[scanner]]; only reading is implemented.
const scanner_vtable = io::vtable {
	reader = &scan_read,
	...
};

// A buffered reader which splits the data read from an [[io::handle]] into
// bytes, runes, or delimited tokens. Create one with [[newscanner]] or
// [[newscanner_static]].
export type scanner = struct {
	stream: io::stream,
	src: io::handle,
	buffer: []u8,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
	// Change some scanning behaviors
	opts: scan_options,
};

// Options which fine-tune the behavior of a [[scanner]].
export type scan_options = enum uint {
	DEFAULT = EOF_DISCARD,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are discarded and EOF is returned
	// immediately.
	//
	// This option is recommended for use-cases where the user is
	// scanning over a file or buffer which may contain partial
	// content, and the user wishes to consume as many tokens as
	// possible and assume that additional data may follow EOF
	// before a new delimiter is written.
	//
	// This is the default behavior. Note that on Unix, text files
	// are always terminated with a new line, and [[scan_line]] will
	// enumerate all well-formed lines in a file with this flag --
	// however, when scanning ill-formed text files which include
	// text following the final line feed, this additional text will
	// be discarded.
	EOF_DISCARD = 0,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are treated as a token and returned to
	// the caller before returning EOF.
	//
	// This is recommended for use-cases where EOF is effectively
	// considered an additional delimiter between tokens, or where
	// the remainder of the file following the final delimiter is
	// meaningful.
	EOF_GREEDY = 1 << 0,
};

// Creates a new [[scanner]] which will allocate and maintain a read buffer for
// efficient reading of a handle. The scanner will read ahead only up to maxread
// bytes, which defaults to [[types::SIZE_MAX]] if no limit is required. The
// user must free resources associated with the scanner using [[finish]] after
// use.
//
// Reads from the scanner will return [[errors::overflow]] if maxread is
// reached.
export fn newscanner(
	src: io::handle,
	maxread: size = types::SIZE_MAX,
	opts: scan_options = scan_options::DEFAULT,
) scanner = {
	return scanner {
		stream = &scanner_vtable,
		src = src,
		buffer = alloc([0...], BUFSZ)!,
		maxread = maxread,
		start = 0,
		pending = [],
		opts = opts,
	};
};

// Creates a new [[scanner]] using a user-provided buffer. The scanner will
// return [[errors::overflow]] if the buffer length is reached, but will not
// perform any allocations. The user should not call [[finish]] after use unless
// they wish to free the underlying buffer through bufio.
export fn newscanner_static(
	src: io::handle,
	buffer: []u8,
	opts: scan_options = scan_options::DEFAULT,
) scanner = {
	return scanner {
		stream = &scanner_vtable,
		src = src,
		buffer = buffer,
		// Capping maxread at the buffer length is what prevents
		// scan_readahead from ever trying to grow a static buffer.
		maxread = len(buffer),
		start = 0,
		pending = [],
		opts = opts,
	};
};

// Frees resources associated with a [[scanner]]. Does not close the underlying
// I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};

// [[io::vtable]] reader for [[scanner]]. Copies up to len(buf) pending bytes
// into buf, performing one read-ahead first if nothing is pending. Returns the
// number of bytes copied, or [[io::EOF]] if the read-ahead hit end of file.
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
		case size => void;
		};
	};

	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};

// Fills up the scanner buffer with data from the underlying I/O handle. If no
// space remains at the end of the read buffer, pending data is first shifted to
// the front; if the buffer is completely full, it is expanded, but never beyond
// maxread ([[errors::overflow]] is returned once maxread bytes are pending).
// Then, one read from the underlying I/O handle is performed and scan.pending
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	let start = scan.start;
	const pending = len(scan.pending);

	if (start + pending == len(scan.buffer)) {
		if (start > 0) {
			// Shift buffer to the left to free space at the end
			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
			scan.pending = scan.buffer[..pending];
			start = 0;
			scan.start = 0;
		} else {
			// Buffer is full, expand it
			let readahead = pending + BUFSZ;
			if (readahead > scan.maxread) {
				readahead = scan.maxread;
			};
			if (pending >= readahead) {
				return errors::overflow;
			};
			// append() adds this many bytes on top of the existing
			// ones, so clamp the growth to what maxread still
			// allows; otherwise the buffer would exceed the limit.
			// Here pending < readahead <= maxread, so room >= 1.
			const room = scan.maxread - pending;
			const grow = if (readahead > room) room else readahead;
			append(scan.buffer, [0...], grow)!;
		};
	};

	match (io::read(scan.src, scan.buffer[start + pending..])?) {
	case let z: size =>
		scan.pending = scan.buffer[start..start + pending + z];
		return pending;
	case io::EOF =>
		return io::EOF;
	};
};

// Consumes N bytes from the buffer. The returned slice aliases the scanner
// buffer; N must not exceed the number of pending bytes.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.pending) >= n);
	scan.start += n;
	// The return expression is evaluated before pending is advanced
	defer scan.pending = scan.pending[n..];
	return scan.pending[..n];
};

// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	if (len(scan.pending) == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
		case size => void;
		};
	};

	return scan_consume(scan, 1)[0];
};

// Reads the next token from a [[scanner]], delimited by delim. The delimiter is
// read from the source handle but not included in the returned slice. The
// return value is borrowed from the internal scanner buffer, which is
// invalidated during subsequent operations which use this scanner.
//
// If EOF is reached before a delimiter is found, the result depends on the
// scanner options: with [[scan_options::EOF_DISCARD]], [[io::EOF]] is
// returned; otherwise any remaining pending bytes are returned as a final
// token, and [[io::EOF]] is returned once nothing is pending.
export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	const ndelim = match (delim) {
	case u8 =>
		yield 1z;
	case let u: []u8 =>
		yield len(u);
	};
	// A multi-byte delimiter may be split across two reads: up to
	// ndelim - 1 of its leading bytes can sit at the tail of the data
	// which was already searched, so the search must back up that far.
	const overlap = if (ndelim > 0) ndelim - 1 else 0z;

	let i = 0z;
	for (true) {
		match (bytes::index(scan.pending[i..], delim)) {
		case let ix: size =>
			i += ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.opts == scan_options::EOF_DISCARD) {
				return io::EOF;
			};
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
			return scan_consume(scan, len(scan.pending));
		case let prevpending: size =>
			// No need to re-index the earlier part of the buffer,
			// except for a possible partial delimiter at its end
			i = if (prevpending > overlap) prevpending - overlap else 0z;
		};
	};

	const nconsume = i + ndelim;
	return scan_consume(scan, nconsume)[..i];
};

// Reads one rune from a [[scanner]]. If fewer than four bytes are pending, a
// single read-ahead is attempted first. Returns [[utf8::invalid]] if the
// pending data is shorter than the encoded size announced by its first byte.
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	if (len(scan.pending) < 4) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			// Bytes which are still pending get decoded below
			if (len(scan.pending) == 0) {
				return io::EOF;
			};
		case size => void;
		};
	};
	const sz = utf8::utf8sz(scan.pending[0])?;
	if (len(scan.pending) < sz) {
		return utf8::invalid;
	};
	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case done =>
		return io::EOF;
	case utf8::more =>
		return utf8::invalid;
	};
};

// Scans a string of text from a [[scanner]] up to some delimiter. The delimiter
// is read from the source handle but not included in the returned string. The
// return value is borrowed from the internal scanner buffer, which is
// invalidated during subsequent operations which use this scanner.
export fn scan_string(
	scan: *scanner,
	delim: str,
) (const str | io::EOF | io::error | utf8::invalid) = {
	const token = match (scan_bytes(scan, strings::toutf8(delim))?) {
	case let token: []u8 =>
		yield token;
	case io::EOF =>
		return io::EOF;
	};
	return strings::fromutf8(token)?;
};

// Scans the next line of text from a [[scanner]]. The return value is borrowed
// from the internal scanner buffer, which is invalidated during subsequent
// operations which use this scanner.
export fn scan_line(
	scan: *scanner,
) (const str | io::EOF | io::error | utf8::invalid) = {
	return scan_string(scan, "\n");
};

// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
	// The slice aliases the scanner's own storage; no copy is made
	return scan.pending[..];
};

// Pushes buf back in front of the scanner's pending data, so that it becomes
// the next data to be consumed. If there is enough room before scan.start, the
// bytes are written there in place; otherwise the pending data is moved to make
// room at the beginning of the buffer. Aborts if buf does not fit into the
// buffer alongside the pending data.
fn scan_unread(scan: *scanner, buf: []u8) void = {
	if (len(buf) == 0) {
		return;
	};
	if (len(buf) <= scan.start) {
		const pending_end = scan.start + len(scan.pending);
		scan.buffer[scan.start - len(buf)..scan.start] = buf;
		scan.start -= len(buf);
		scan.pending = scan.buffer[scan.start..pending_end];
	} else {
		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
			"Attempted to unread more data than buffer has available");
		// Shift buffer to the right to free space at the beginning
		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
			scan.buffer[scan.start..scan.start + len(scan.pending)];
		scan.buffer[..len(buf)] = buf;
		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
		scan.start = 0;
	};
};

// Reads a single byte from an [[io::handle]].
export fn read_byte(h: io::handle) (u8 | io::EOF | io::error) = {
	let buf: [1]u8 = [0...];

	match (io::readall(h, buf)?) {
	case size =>
		return buf[0];
	case io::EOF =>
		return io::EOF;
	};
};

// Reads a slice of bytes until the delimiter. Delimiter is not included but
// it is read from the handle. The return value must be freed by the caller.
//
// If EOF is reached after at least one byte was read, the bytes read so far
// are returned; [[io::EOF]] is returned only if nothing was read. If an I/O
// error occurs, the partially-read token is freed and the error is returned.
export fn read_tok(h: io::handle, delim: u8...) ([]u8 | io::EOF | io::error) = {
	let buf: []u8 = [];

	for (true) {
		match (read_byte(h)) {
		case let res: u8 =>
			if (bytes::contains(delim, res)) {
				break;
			};
			append(buf, res)!;
		case io::EOF =>
			if (len(buf) == 0) {
				return io::EOF;
			};
			break;
		case let err: io::error =>
			// The caller never sees buf on this path, so release
			// it here rather than leaking it
			free(buf);
			return err;
		};
	};

	return buf;
};

// Reads a slice of bytes until a newline character (\n, 0x0A). Newline itself
// is not included but it is read from the handle. The return value must be
// freed by the caller.
export fn read_line(h: io::handle) ([]u8 | io::EOF | io::error) = {
	// A line is simply a token terminated by a line feed
	return read_tok(h, '\n');
};

// Decodes a single rune from a UTF-8 encoded [[io::handle]]. The leading byte
// is read first to learn the length of the sequence, then the remaining bytes
// are read and the whole sequence is decoded. Returns [[utf8::invalid]] if the
// leading byte is not valid, if the handle under-reads the continuation bytes,
// or if the sequence cannot be decoded.
export fn read_rune(
	h: io::handle,
) (rune | utf8::invalid | io::EOF | io::error) = {
	let raw: [4]u8 = [0...];

	// Leading byte
	if (io::readall(h, raw[..1])? is io::EOF) {
		return io::EOF;
	};

	const width = utf8::utf8sz(raw[0])?;
	if (width == 1) {
		// Single-byte sequence; nothing more to read
		return raw[0]: rune;
	};

	// Continuation bytes
	match (io::readall(h, raw[1..width])) {
	case size => void;
	case io::EOF =>
		return io::EOF;
	case let err: io::error =>
		// A short read in the middle of a sequence means the input is
		// truncated, which is reported as invalid UTF-8
		if (err is io::underread) {
			return utf8::invalid;
		};
		return err;
	};

	let decoder = utf8::decode(raw[..width]);
	match (utf8::next(&decoder)?) {
	case utf8::more =>
		return utf8::invalid;
	case done =>
		return io::EOF;
	case let r: rune =>
		return r;
	};
};