commit 56e37055a8d5f2f5458a0b2fa23645204651605a
parent 937503d916d7d1aabf928be9ad7a2667b18f2f98
Author: Willow Barraco <contact@willowbarraco.fr>
Date: Fri, 30 Aug 2024 09:52:13 +0200
bufio::scanner: add options to fine-tune the behavior of a scanner
EOF_GREEDY is the previous scanner behavior. Now with the new default
value, the scanner will return EOF if it reaches the source EOF without
encountering the token.
This new option is necessary to parse incomplete buffers in a
non-blocking way (ex: in hare-ev). It is also necessary to parse
ill-formed data in general.
This is a breaking change. Programs that rely on the previous behavior
may now have a confusing scanner result that might be difficult to
debug.
The changes to the tests correctly represent the change of behavior:
unix::hosts::next needs line breaks, because /etc/hosts-formatted lines
end with line breaks. "127" is not invalid, it is an incomplete line.
The same conclusion for format::ini.
Signed-off-by: Willow Barraco <contact@willowbarraco.fr>
Diffstat:
4 files changed, 67 insertions(+), 4 deletions(-)
diff --git a/bufio/scanner.ha b/bufio/scanner.ha
@@ -25,6 +25,39 @@ export type scanner = struct {
pending: []u8,
// User-confirmed maximum size of read buffer
maxread: size,
+ // Change some scanning behaviors
+ opts: scan_options,
+};
+
+// Options which fine-tune the behavior of a [[scanner]].
+export type scan_options = enum uint {
+ // The options used when the caller does not pass any explicitly.
+ // This is an alias of EOF_DISCARD.
+ DEFAULT = EOF_DISCARD,
+ // Upon encountering EOF, all bytes or characters between the
+ // final token and EOF are discarded and EOF is returned
+ // immediately.
+ //
+ // This option is recommended for use-cases where the user is
+ // scanning over a file or buffer which may contain partial
+ // content, and the user wishes to consume as many tokens as
+ // possible and assume that additional data may follow EOF
+ // before a new delimiter is written.
+ //
+ // This is the default behavior. Note that on Unix, text files
+ // are always terminated with a new line, and [[scan_line]] will
+ // enumerate all well-formed lines in a file with this flag --
+ // however, when scanning ill-formed text files which include
+ // text following the final line feed, this additional text will
+ // be discarded.
+ //
+ // This value is 0 (no bits set), so it cannot be detected with a
+ // bitwise AND: masking any options with it always yields 0. Check
+ // for the absence of EOF_GREEDY instead.
+ EOF_DISCARD = 0,
+ // Upon encountering EOF, all bytes or characters between the
+ // final token and EOF are treated as a token and returned to
+ // the caller before returning EOF.
+ //
+ // This is recommended for use-cases where EOF is effectively
+ // considered an additional delimiter between tokens, or where
+ // the remainder of the file following the final delimiter is
+ // meaningful.
+ EOF_GREEDY = 1 << 0,
};
// Creates a new [[scanner]] which will allocate and maintain a read buffer for
@@ -38,6 +71,7 @@ export type scanner = struct {
export fn newscanner(
src: io::handle,
maxread: size = types::SIZE_MAX,
+ opts: scan_options = scan_options::DEFAULT,
) scanner = {
return scanner {
stream = &scanner_vtable,
@@ -46,6 +80,7 @@ export fn newscanner(
maxread = maxread,
start = 0,
pending = [],
+ opts = opts,
};
};
@@ -53,7 +88,11 @@ export fn newscanner(
// return [[errors::overflow]] if the buffer length is reached, but will not
// perform any allocations. The user should not call [[finish]] after use unless
// they wish to free the underlying buffer through bufio.
-export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
+export fn newscanner_static(
+ src: io::handle,
+ buffer: []u8,
+ opts: scan_options = scan_options::DEFAULT,
+) scanner = {
return scanner {
stream = &scanner_vtable,
src = src,
@@ -61,6 +100,7 @@ export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
maxread = len(buffer),
start = 0,
pending = [],
+ opts = opts,
};
};
@@ -164,6 +204,9 @@ export fn scan_bytes(
match (scan_readahead(scan)?) {
case io::EOF =>
+ // EOF_DISCARD is 0, so masking the options with it always yields 0
+ // and cannot distinguish the two modes. Test the EOF_GREEDY bit
+ // instead: unless the scanner is greedy, report EOF right away and
+ // keep any pending bytes buffered rather than returning them as a
+ // final token.
+ if (scan.opts & scan_options::EOF_GREEDY == 0) {
+ return io::EOF;
+ };
if (len(scan.pending) == 0) {
return io::EOF;
};
diff --git a/bufio/scanner_test+test.ha b/bufio/scanner_test+test.ha
@@ -6,6 +6,7 @@ use encoding::utf8;
use io;
use memio;
use strings;
+use types;
@test fn read_byte() void = {
let buf = memio::fixed([1, 3, 3, 7]);
@@ -180,3 +181,21 @@ use strings;
assert(scan_line(&scanner) is io::EOF);
};
+
+// Checks that a scanner created with the default options withholds a line
+// that has not been terminated yet (returning io::EOF), and returns that line
+// once its line feed has been written to the source.
+@test fn scan_uncomplete_line() void = {
+ let buf = memio::dynamic();
+ // No options are passed, so the scanner uses scan_options::DEFAULT.
+ let scan = newscanner(&buf);
+
+ // Nothing has been written yet: the source is empty.
+ assert(scan_line(&scan) is io::EOF);
+
+ // Write a line without its terminator, then seek back to the start
+ // of the buffer so the scanner can read what was just written.
+ io::write(&buf, strings::toutf8("hello"))!;
+ io::seek(&buf, 0, io::whence::SET)!;
+
+ // "hello" is not terminated by a line feed, so no line is returned.
+ assert(scan_line(&scan) is io::EOF);
+
+ // Write the missing line feed and step back one byte so that it is
+ // the next byte read from the buffer.
+ // NOTE(review): presumably the previous scan left the buffer
+ // position just after "hello" -- confirm against memio.
+ io::write(&buf, strings::toutf8("\n"))!;
+ io::seek(&buf, -1, io::whence::CUR)!;
+
+ // The line is now complete and is returned without its terminator.
+ let line = scan_line(&scan) as const str;
+ assert(strings::compare(line, "hello") == 0);
+};
diff --git a/format/ini/+test.ha b/format/ini/+test.ha
@@ -13,7 +13,8 @@ name=Sourcehut
description=The hacker's forge
[harelang.org]
name=Hare
-description=The Hare programming language"));
+description=The Hare programming language
+"));
const sc = scan(&buf);
defer finish(&sc);
diff --git a/unix/hosts/test+test.ha b/unix/hosts/test+test.ha
@@ -48,10 +48,10 @@ def HOSTS_FILE = `
};
@test fn errors() void = {
- const s = "127";
+ const s = "127\n";
assert(next(&read(&memio::fixed(strings::toutf8(s))))
is ip::invalid);
- const s = "127.0.0.1";
+ const s = "127.0.0.1\n";
assert(next(&read(&memio::fixed(strings::toutf8(s))))
is invalid);
};