hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

scanner.ha (10623B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use bytes;
      5 use encoding::utf8;
      6 use errors;
      7 use io;
      8 use strings;
      9 use types;
     10 
// Size in bytes of the buffer allocated by newscanner, and the increment
// scan_readahead uses when computing how far a full buffer may grow.
def BUFSZ: size = 4096;

// I/O vtable installed into every [[scanner]]. Only the reader is set (to
// scan_read); the remaining operations are left at their defaults.
const scanner_vtable = io::vtable {
	reader = &scan_read,
	...
};
     17 
// State of a buffered scanner over an [[io::handle]]. scan_read casts the
// *io::stream it is given back to a *scanner, which relies on stream being the
// first field.
export type scanner = struct {
	// Stream interface; the constructors point it at scanner_vtable
	stream: io::stream,
	// Underlying handle which scan_readahead reads from
	src: io::handle,
	// Backing storage for read-ahead data
	buffer: []u8,
	// Index of start of pending bytes in buffer
	start: size,
	// Sub-slice with pending bytes in buffer
	pending: []u8,
	// User-confirmed maximum size of read buffer
	maxread: size,
	// Change some scanning behaviors
	opts: scan_options,
};
     31 
// Options which fine-tune the behavior of a [[scanner]]. They select what
// happens to data found between the final delimiter and EOF.
export type scan_options = enum uint {
	// Alias for EOF_DISCARD.
	DEFAULT = EOF_DISCARD,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are discarded and EOF is returned
	// immediately.
	//
	// This option is recommended for use-cases where the user is
	// scanning over a file or buffer which may contain partial
	// content, and the user wishes to consume as many tokens as
	// possible and assume that additional data may follow EOF
	// before a new delimiter is written.
	//
	// This is the default behavior. Note that on Unix, well-formed
	// text files are terminated with a line feed, and [[scan_line]]
	// will enumerate all well-formed lines in a file with this flag
	// -- however, when scanning ill-formed text files which include
	// text following the final line feed, this additional text will
	// be discarded.
	EOF_DISCARD = 0,
	// Upon encountering EOF, all bytes or characters between the
	// final token and EOF are treated as a token and returned to
	// the caller before returning EOF.
	//
	// This is recommended for use-cases where EOF is effectively
	// considered an additional delimiter between tokens, or where
	// the remainder of the file following the final delimiter is
	// meaningful.
	EOF_GREEDY = 1 << 0,
};
     62 
     63 // Creates a new [[scanner]] which will allocate and maintain a read buffer for
     64 // efficient reading of a handle. The scanner will read ahead only up to maxread
     65 // bytes, which defaults to [[types::SIZE_MAX]] if no limit is required. The
     66 // user must free resources associated with the scanner using [[finish]] after
     67 // use.
     68 //
     69 // Reads from the scanner will return [[errors::overflow]] if maxread is
     70 // reached.
     71 export fn newscanner(
     72 	src: io::handle,
     73 	maxread: size = types::SIZE_MAX,
     74 	opts: scan_options = scan_options::DEFAULT,
     75 ) scanner = {
     76 	return scanner {
     77 		stream = &scanner_vtable,
     78 		src = src,
     79 		buffer = alloc([0...], BUFSZ)!,
     80 		maxread = maxread,
     81 		start = 0,
     82 		pending = [],
     83 		opts = opts,
     84 	};
     85 };
     86 
     87 // Creates a new [[scanner]] using a user-provided buffer. The scanner will
     88 // return [[errors::overflow]] if the buffer length is reached, but will not
     89 // perform any allocations. The user should not call [[finish]] after use unless
     90 // they wish to free the underlying buffer through bufio.
     91 export fn newscanner_static(
     92 	src: io::handle,
     93 	buffer: []u8,
     94 	opts: scan_options = scan_options::DEFAULT,
     95 ) scanner = {
     96 	return scanner {
     97 		stream = &scanner_vtable,
     98 		src = src,
     99 		buffer = buffer,
    100 		maxread = len(buffer),
    101 		start = 0,
    102 		pending = [],
    103 		opts = opts,
    104 	};
    105 };
    106 
    107 // Frees resources associated with a [[scanner]]. Does not close the underlying
    108 // I/O handle.
    109 export fn finish(scan: *scanner) void = {
    110 	free(scan.buffer);
    111 };
    112 
    113 fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
    114 	let scan = s: *scanner;
    115 
    116 	if (len(scan.pending) == 0) {
    117 		match (scan_readahead(scan)?) {
    118 		case io::EOF =>
    119 			return io::EOF;
    120 		case size => void;
    121 		};
    122 	};
    123 
    124 	const n = if (len(buf) > len(scan.pending)) len(scan.pending) else len(buf);
    125 	buf[..n] = scan_consume(scan, n)[..];
    126 	return n;
    127 };
    128 
    129 // Fills up the scanner buffer with data from the underlying I/O handle. If no
    130 // space remains in the read buffer, it is expanded by BUFSZ (up to maxread).
    131 // Then, one read from the underlying I/O handle is performed and scan.pending
    132 // is updated accordingly. Returns the number of bytes which had been available
    133 // prior to the call.
    134 fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
    135 	let start = scan.start;
    136 	const pending = len(scan.pending);
    137 
    138 	if (start + pending == len(scan.buffer)) {
    139 		if (start > 0) {
    140 			// Shift buffer to the left to free space at the end
    141 			scan.buffer[..len(scan.buffer) - start] = scan.buffer[start..];
    142 			scan.pending = scan.buffer[..pending];
    143 			start = 0;
    144 			scan.start = 0;
    145 		} else {
    146 			// Buffer is full, expand it
    147 			let readahead = pending + BUFSZ;
    148 			if (readahead > scan.maxread) {
    149 				readahead = scan.maxread;
    150 			};
    151 			if (pending >= readahead) {
    152 				return errors::overflow;
    153 			};
    154 			append(scan.buffer, [0...], readahead)!;
    155 		};
    156 	};
    157 
    158 	match (io::read(scan.src, scan.buffer[start + pending..])?) {
    159 	case let z: size =>
    160 		scan.pending = scan.buffer[start..start + pending + z];
    161 		return pending;
    162 	case io::EOF =>
    163 		return io::EOF;
    164 	};
    165 };
    166 
    167 // Consumes N bytes from the buffer.
    168 fn scan_consume(scan: *scanner, n: size) []u8 = {
    169 	assert(len(scan.pending) >= n);
    170 	scan.start += n;
    171 	defer scan.pending = scan.pending[n..];
    172 	return scan.pending[..n];
    173 };
    174 
    175 // Reads one byte from a [[scanner]].
    176 export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
    177 	if (len(scan.pending) == 0) {
    178 		match (scan_readahead(scan)?) {
    179 		case io::EOF =>
    180 			return io::EOF;
    181 		case size => void;
    182 		};
    183 	};
    184 
    185 	return scan_consume(scan, 1)[0];
    186 };
    187 
    188 // Reads the next token from a [[scanner]], delimited by delim. The delimiter is
    189 // read from the source handle but not included in the returned slice. The
    190 // return value is borrowed from the internal scanner buffer, which is
    191 // invalidated during subsequent operations which use this scanner.
    192 export fn scan_bytes(
    193 	scan: *scanner,
    194 	delim: (u8 | []u8),
    195 ) ([]u8 | io::EOF | io::error) = {
    196 	let i = 0z;
    197 	for (true) {
    198 		match (bytes::index(scan.pending[i..], delim)) {
    199 		case let ix: size =>
    200 			i += ix;
    201 			break;
    202 		case void => void;
    203 		};
    204 
    205 		match (scan_readahead(scan)?) {
    206 		case io::EOF =>
    207 			if (scan.opts == scan_options::EOF_DISCARD) {
    208 				return io::EOF;
    209 			};
    210 			if (len(scan.pending) == 0) {
    211 				return io::EOF;
    212 			};
    213 			return scan_consume(scan, len(scan.pending));
    214 		case let prevpending: size =>
    215 			// No need to re-index the earlier part of the buffer
    216 			i = prevpending;
    217 		};
    218 	};
    219 
    220 	const ndelim = match (delim) {
    221 	case u8 =>
    222 		yield 1z;
    223 	case let u: []u8 =>
    224 		yield len(u);
    225 	};
    226 	const nconsume = i + ndelim;
    227 	return scan_consume(scan, nconsume)[..i];
    228 };
    229 
    230 // Reads one rune from a [[scanner]].
    231 export fn scan_rune(
    232 	scan: *scanner,
    233 ) (rune | io::EOF | io::error | utf8::invalid) = {
    234 	if (len(scan.pending) < 4) {
    235 		match (scan_readahead(scan)?) {
    236 		case io::EOF =>
    237 			if (len(scan.pending) == 0) {
    238 				return io::EOF;
    239 			};
    240 		case size => void;
    241 		};
    242 	};
    243 	const sz = utf8::utf8sz(scan.pending[0])?;
    244 	if (len(scan.pending) < sz) {
    245 		return utf8::invalid;
    246 	};
    247 	const buf = scan_consume(scan, sz);
    248 	const dec = utf8::decode(buf[..sz]);
    249 	match (utf8::next(&dec)?) {
    250 	case let r: rune =>
    251 		return r;
    252 	case done =>
    253 		return io::EOF;
    254 	case utf8::more =>
    255 		return utf8::invalid;
    256 	};
    257 };
    258 
    259 // Scans a string of text from a [[scanner]] up to some delimiter. The delimiter
    260 // is read from the source handle but not included in the returned string. The
    261 // return value is borrowed from the internal scanner buffer, which is
    262 // invalidated during subsequent operations which use this scanner.
    263 export fn scan_string(
    264 	scan: *scanner,
    265 	delim: str,
    266 ) (const str | io::EOF | io::error | utf8::invalid) = {
    267 	const token = match (scan_bytes(scan, strings::toutf8(delim))?) {
    268 	case let token: []u8 =>
    269 		yield token;
    270 	case io::EOF =>
    271 		return io::EOF;
    272 	};
    273 	return strings::fromutf8(token)?;
    274 };
    275 
    276 // Scans the next line of text from a [[scanner]]. The return value is borrowed
    277 // from the internal scanner buffer, which is invalidated during subsequent
    278 // operations which use this scanner.
    279 export fn scan_line(
    280 	scan: *scanner,
    281 ) (const str | io::EOF | io::error | utf8::invalid) = {
    282 	return scan_string(scan, "\n");
    283 };
    284 
    285 // Returns the internal scanner buffer, which contains all bytes read ahead by
    286 // the scanner up to this point.
    287 export fn scan_buffer(scan: *scanner) []u8 = {
    288 	return scan.pending[..];
    289 };
    290 
    291 fn scan_unread(scan: *scanner, buf: []u8) void = {
    292 	if (len(buf) == 0) {
    293 		return;
    294 	};
    295 	if (len(buf) <= scan.start) {
    296 		const pending_end = scan.start + len(scan.pending);
    297 		scan.buffer[scan.start - len(buf)..scan.start] = buf;
    298 		scan.start -= len(buf);
    299 		scan.pending = scan.buffer[scan.start..pending_end];
    300 	} else {
    301 		assert(len(buf) <= len(scan.buffer) - len(scan.pending),
    302 			"Attempted to unread more data than buffer has available");
    303 		// Shift buffer to the right to free space at the beginning
    304 		scan.buffer[len(buf)..len(buf) + len(scan.pending)] =
    305 			scan.buffer[scan.start..scan.start + len(scan.pending)];
    306 		scan.buffer[..len(buf)] = buf;
    307 		scan.pending = scan.buffer[..len(scan.pending) + len(buf)];
    308 		scan.start = 0;
    309 	};
    310 };
    311 
    312 // Reads a single byte from an [[io::handle]].
    313 export fn read_byte(h: io::handle) (u8 | io::EOF | io::error) = {
    314 	let buf: [1]u8 = [0...];
    315 
    316 	match (io::readall(h, buf)?) {
    317 	case size =>
    318 		return buf[0];
    319 	case io::EOF =>
    320 		return io::EOF;
    321 	};
    322 };
    323 
    324 // Reads a slice of bytes until the delimiter. Delimiter is not included but
    325 // it is read from the handle. The return value must be freed by the caller.
    326 export fn read_tok(h: io::handle, delim: u8...) ([]u8 | io::EOF | io::error) = {
    327 	let buf: []u8 = [];
    328 
    329 	for (true) {
    330 		match (read_byte(h)?) {
    331 		case let res: u8 =>
    332 			if (bytes::contains(delim, res)) {
    333 				break;
    334 			};
    335 			append(buf, res)!;
    336 		case io::EOF =>
    337 			if (len(buf) == 0) {
    338 				return io::EOF;
    339 			};
    340 			break;
    341 		};
    342 	};
    343 
    344 	return buf;
    345 };
    346 
    347 // Reads a slice of bytes until a newline character (\n, 0x0A). Newline itself
    348 // is not included but it is read from the handle. The return value must be
    349 // freed by the caller.
    350 export fn read_line(h: io::handle) ([]u8 | io::EOF | io::error) =
    351 	read_tok(h, '\n');
    352 
    353 // Reads a rune from a UTF-8 stream.
    354 export fn read_rune(
    355 	h: io::handle,
    356 ) (rune | utf8::invalid | io::EOF | io::error) = {
    357 	let b: [4]u8 = [0...];
    358 	match (io::readall(h, b[..1])?) {
    359 	case let n: size => void;
    360 	case io::EOF =>
    361 		return io::EOF;
    362 	};
    363 
    364 	const sz = utf8::utf8sz(b[0])?;
    365 
    366 	if (sz == 1) {
    367 		return b[0]: rune;
    368 	};
    369 
    370 	match (io::readall(h, b[1..sz])) {
    371 	case let n: size => void;
    372 	case io::EOF =>
    373 		return io::EOF;
    374 	case let err: io::error =>
    375 		return if (err is io::underread) utf8::invalid else err;
    376 	};
    377 
    378 	let dec = utf8::decode(b[..sz]);
    379 	match (utf8::next(&dec)?) {
    380 	case let r: rune =>
    381 		return r;
    382 	case done =>
    383 		return io::EOF;
    384 	case utf8::more =>
    385 		return utf8::invalid;
    386 	};
    387 };