encoding::json: new module - hare - [hare] The Hare programming language

commit a112f1369a2f5e01f7bbe170dddabc06df571ca2
parent dbf510799d20337552e0e66b8a48193b7c538505
Author: Drew DeVault <sir@cmpwn.com>
Date:   Mon,  9 May 2022 15:08:44 +0200

encoding::json: new module

Signed-off-by: Drew DeVault <sir@cmpwn.com>

Diffstat:
A encoding/json/+test/lexer.ha  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A encoding/json/lex.ha  | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A encoding/json/types.ha  | 32 ++++++++++++++++++++++++++++++++
M scripts/gen-stdlib  | 17 +++++++++++++++++
M stdlib.mk  | 35 +++++++++++++++++++++++++++++++++++

5 files changed, 408 insertions(+), 0 deletions(-)
diff --git a/encoding/json/+test/lexer.ha b/encoding/json/+test/lexer.ha
@@ -0,0 +1,60 @@
+use bufio;
+use strings;
+use io;
+
+@test fn lex() void = {
+	const cases: [_](str, []token) = [ // (input text, tokens expected in order)
+		("true", [true]),
+		("false", [false]),
+		("null", [_null]),
+		("1234", [1234.0]),
+		("12.34", [12.34]),
+		("12.34e5", [12.34e5]),
+		("12.34E5", [12.34e5]),
+		("12e5", [12.0e5]),
+		("-1234", [-1234.0]),
+		(`"hello world"`, ["hello world"]),
+		(`"\"\\\/\b\f\n\r\t\u0020"`, ["\"\\/\b\f\n\r\t\u0020"]),
+		("[ null, null ]", [arraystart, _null, comma, _null, arrayend]),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const src = strings::toutf8(cases[i].0);
+		const src = bufio::fixed(src, io::mode::READ); // shadows the byte slice above
+		const lexer = lex(&src);
+		defer finish(&lexer);
+
+		for (let j = 0z; j < len(cases[i].1); j += 1) {
+			const want = cases[i].1[j];
+			const have = next(&lexer)! as token; // aborts on an error or early EOF
+			assert(tokeq(want, have));
+		};
+
+		assert(next(&lexer) is io::EOF); // no tokens may remain after the expected ones
+	};
+};
+
+fn tokeq(want: token, have: token) bool = { // test helper: compares two tokens
+	match (want) {
+	case _null => // the void token kinds only need a type check
+		return have is _null;
+	case comma =>
+		return have is comma;
+	case colon =>
+		return have is colon;
+	case arraystart =>
+		return have is arraystart;
+	case arrayend =>
+		return have is arrayend;
+	case objstart =>
+		return have is objstart;
+	case objend =>
+		return have is objend;
+	case let b: bool => // value-carrying kinds: "as" asserts the type, then values are compared
+		return have as bool == b;
+	case let f: f64 =>
+		return have as f64 == f; // exact floating-point comparison
+	case let s: str =>
+		return have as str == s;
+	};
+};
diff --git a/encoding/json/lex.ha b/encoding/json/lex.ha
@@ -0,0 +1,264 @@
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use os;
+use strconv;
+use strings;
+use strio;
+
+export type lexer = struct {
+	src: bufio::bufstream,
+	buffer: []u8,
+	strbuf: strio::stream,
+};
+
+// Creates a new JSON lexer. The caller can obtain tokens with [[next]] and
+// should pass the result to [[finish]] when they're done with it.
+export fn lex(src: io::handle) lexer = {
+	let buf: []u8 = alloc([0...], os::BUFSIZ); // heap buffer of os::BUFSIZ zeroed bytes
+	return lexer {
+		src = bufio::buffered(src, buf, []), // buffered stream over src, backed by buf
+		buffer = buf, // kept so that finish can free it
+		strbuf = strio::dynamic(), // scratch text buffer reused by the scan_* helpers
+	};
+};
+
+// Frees state associated with a JSON lexer.
+export fn finish(lex: *lexer) void = {
+	free(lex.buffer); // the buffer allocated by lex()
+	io::close(&lex.strbuf)!; // only strbuf is closed; the handle given to lex() is untouched
+};
+
+// Returns the next token from a JSON lexer. The return value is borrowed from
+// the lexer and will be overwritten on subsequent calls.
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
+	};
+
+	switch (rn) {
+	case '[' =>
+		return arraystart;
+	case ']' =>
+		return arrayend;
+	case '{' =>
+		return objstart;
+	case '}' =>
+		return objend;
+	case ',' =>
+		return comma;
+	case ':' =>
+		return colon;
+	case '"' =>
+		return scan_str(lex)?;
+	case =>
+		// Not punctuation and not a string: continue with the number
+		// and keyword scanners below.
+		yield;
+	};
+
+	if (ascii::isdigit(rn) || rn == '+' || rn == '-') {
+		bufio::unreadrune(&lex.src, rn);
+		return scan_number(lex)?;
+	};
+
+	bufio::unreadrune(&lex.src, rn);
+	const word = scan_word(lex)?;
+	switch (word) {
+	case "true" =>
+		return true;
+	case "false" =>
+		return false;
+	case "null" =>
+		return _null;
+	case =>
+		return invalid;
+	};
+};
+
+// Scans until encountering a non-alphabetical character, returning the
+// resulting word.
+fn scan_word(lex: *lexer) (str | error) = {
+	strio::reset(&lex.strbuf); // discard text left by the previous scan
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break; // end of input also ends the word
+		};
+		if (!ascii::isalpha(rn)) {
+			bufio::unreadrune(&lex.src, rn); // leave the terminator for the next token
+			break;
+		};
+		strio::appendrune(&lex.strbuf, rn)!;
+	};
+	return strio::string(&lex.strbuf); // may be empty if the first rune was not alphabetic
+};
+
+type numstate = enum {
+	INTEGER,
+	FRACTION,
+	EXPONENT,
+};
+
+// Scans a number into lex.strbuf and converts it with strconv::stof64. The
+// scan stops at EOF or at the first rune which cannot continue the number;
+// that rune is pushed back so that it is seen by the next token. Returns
+// invalid if the collected text is not accepted by strconv::stof64.
+fn scan_number(lex: *lexer) (token | error) = {
+	strio::reset(&lex.strbuf);
+
+	let state = numstate::INTEGER;
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+
+		switch (state) {
+		case numstate::INTEGER =>
+			switch (rn) {
+			case '.' =>
+				state = numstate::FRACTION;
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case '+', '-' =>
+				void;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					// A delimiter such as ',' or ']' ends
+					// the number; handle it the same way
+					// as the other states instead of
+					// rejecting the whole token.
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::FRACTION =>
+			switch (rn) {
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::EXPONENT =>
+			// NOTE(review): only digits are accepted here, so a
+			// signed exponent such as "1e-5" ends the scan at the
+			// sign. Confirm whether that is intended.
+			if (!ascii::isdigit(rn)) {
+				bufio::unreadrune(&lex.src, rn);
+				break;
+			};
+		};
+
+		strio::appendrune(&lex.strbuf, rn)!;
+	};
+
+	match (strconv::stof64(strio::string(&lex.strbuf))) {
+	case let f: f64 =>
+		return f;
+	case =>
+		return invalid;
+	};
+};
+
+fn scan_str(lex: *lexer) (token | error) = { // called by next after the opening quote was read
+	strio::reset(&lex.strbuf); // discard text left by the previous scan
+
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return invalid; // input ended before the closing quote
+		};
+
+		switch (rn) {
+		case '"' =>
+			break; // closing quote: consumed, not stored
+		case '\\' =>
+			const rn = scan_escape(lex)?; // the rune the escape sequence stands for
+			strio::appendrune(&lex.strbuf, rn)!;
+		case =>
+			strio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+
+	return strio::string(&lex.strbuf); // contents of strbuf, which the next scan_* call resets
+};
+
+// Decodes the escape sequence which follows a backslash inside a string and
+// returns the rune it stands for. Returns invalid if the input ends early, if
+// the character after the backslash is not a recognized escape, or if the four
+// characters after \u are not a hexadecimal number.
+fn scan_escape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return invalid;
+	};
+
+	switch (rn) {
+	case '\"' =>
+		return '\"';
+	case '\\' =>
+		return '\\';
+	case '/' =>
+		return '/';
+	case 'b' =>
+		return '\b';
+	case 'f' =>
+		return '\f';
+	case 'n' =>
+		return '\n';
+	case 'r' =>
+		return '\r';
+	case 't' =>
+		return '\t';
+	case 'u' =>
+		// \uXXXX: read exactly four bytes and parse them as hex.
+		let buf: [4]u8 = [0...];
+		match (io::readall(&lex.src, buf)?) {
+		case io::EOF =>
+			return invalid;
+		case size =>
+			yield;
+		};
+		const s = match (strings::try_fromutf8(buf)) {
+		case let s: str =>
+			yield s;
+		case =>
+			return invalid;
+		};
+		match (strconv::stou32b(s, strconv::base::HEX)) {
+		case let u: u32 =>
+			return u: rune;
+		case =>
+			return invalid;
+		};
+	case =>
+		// Any other character after the backslash is not an escape
+		// sequence handled above.
+		return invalid;
+	};
+};
+
+// Gets the next rune from the I/O source
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+	match (bufio::scanrune(&lex.src)) { // maps scanrune's results onto this module's types
+	case let err: io::error =>
+		return err; // I/O failures are passed through unchanged
+	case utf8::invalid =>
+		return invalid; // malformed UTF-8 is reported as this module's invalid error
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		return rn;
+	};
+};
+
+// Like nextrune but skips whitespace tokens
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		match (nextrune(lex)?) {
+		case let rn: rune =>
+			if (ascii::isspace(rn)) {
+				continue; // whitespace is dropped, keep reading
+			};
+			return rn; // first non-whitespace rune
+		case io::EOF =>
+			return io::EOF; // input ended while skipping whitespace
+		};
+	};
+	abort(); // Unreachable
+};
diff --git a/encoding/json/types.ha b/encoding/json/types.ha
@@ -0,0 +1,32 @@
+use io;
+
+// An invalid JSON token was encountered.
+export type invalid = !void;
+
+// A tagged union of all possible errors returned from this module.
+export type error = !(invalid | io::error);
+
+// The JSON null value.
+export type _null = void;
+
+// The '[' token, signaling the start of a JSON array.
+export type arraystart = void;
+
+// The ']' token, signaling the end of a JSON array.
+export type arrayend = void;
+
+// The '{' token, signaling the start of a JSON object.
+export type objstart = void;
+
+// The '}' token, signaling the end of a JSON object.
+export type objend = void;
+
+// The ':' token.
+export type colon = void;
+
+// The ',' token.
+export type comma = void;
+
+// All tokens which can be returned from the JSON tokenizer.
+export type token = (arraystart | arrayend | objstart |
+	objend | colon | comma | str | f64 | bool | _null);
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -509,6 +509,22 @@ encoding_hex() {
 	gen_ssa encoding::hex ascii bytes fmt io strconv strio strings
 }
 
+encoding_json() {
+	if [ $testing -eq 0 ]
+	then
+		gen_srcs encoding::json \
+			types.ha \
+			lex.ha
+	else
+		gen_srcs encoding::json \
+			types.ha \
+			lex.ha \
+			+test/lexer.ha
+	fi
+	gen_ssa encoding::json ascii bufio io strio os encoding::utf8 strings \
+		strconv
+}
+
 encoding_pem() {
 	if [ $testing -eq 0 ]
 	then
@@ -1363,6 +1379,7 @@ dirs
 encoding::base64
 encoding::base32
 encoding::hex
+encoding::json
 encoding::pem
 encoding::utf8
 endian
diff --git a/stdlib.mk b/stdlib.mk
@@ -308,6 +308,12 @@ stdlib_deps_any += $(stdlib_encoding_hex_any)
 stdlib_encoding_hex_linux = $(stdlib_encoding_hex_any)
 stdlib_encoding_hex_freebsd = $(stdlib_encoding_hex_any)
 
+# gen_lib encoding::json (any)
+stdlib_encoding_json_any = $(HARECACHE)/encoding/json/encoding_json-any.o
+stdlib_deps_any += $(stdlib_encoding_json_any)
+stdlib_encoding_json_linux = $(stdlib_encoding_json_any)
+stdlib_encoding_json_freebsd = $(stdlib_encoding_json_any)
+
 # gen_lib encoding::pem (any)
 stdlib_encoding_pem_any = $(HARECACHE)/encoding/pem/encoding_pem-any.o
 stdlib_deps_any += $(stdlib_encoding_pem_any)
@@ -1051,6 +1057,17 @@ $(HARECACHE)/encoding/hex/encoding_hex-any.ssa: $(stdlib_encoding_hex_any_srcs) 
 	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::hex \
 		-t$(HARECACHE)/encoding/hex/encoding_hex.td $(stdlib_encoding_hex_any_srcs)
 
+# encoding::json (+any)
+stdlib_encoding_json_any_srcs = \
+	$(STDLIB)/encoding/json/types.ha \
+	$(STDLIB)/encoding/json/lex.ha
+
+$(HARECACHE)/encoding/json/encoding_json-any.ssa: $(stdlib_encoding_json_any_srcs) $(stdlib_rt) $(stdlib_ascii_$(PLATFORM)) $(stdlib_bufio_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_strio_$(PLATFORM)) $(stdlib_os_$(PLATFORM)) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_strings_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/encoding/json
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::json \
+		-t$(HARECACHE)/encoding/json/encoding_json.td $(stdlib_encoding_json_any_srcs)
+
 # encoding::pem (+any)
 stdlib_encoding_pem_any_srcs = \
 	$(STDLIB)/encoding/pem/pem.ha
@@ -2370,6 +2387,12 @@ testlib_deps_any += $(testlib_encoding_hex_any)
 testlib_encoding_hex_linux = $(testlib_encoding_hex_any)
 testlib_encoding_hex_freebsd = $(testlib_encoding_hex_any)
 
+# gen_lib encoding::json (any)
+testlib_encoding_json_any = $(TESTCACHE)/encoding/json/encoding_json-any.o
+testlib_deps_any += $(testlib_encoding_json_any)
+testlib_encoding_json_linux = $(testlib_encoding_json_any)
+testlib_encoding_json_freebsd = $(testlib_encoding_json_any)
+
 # gen_lib encoding::pem (any)
 testlib_encoding_pem_any = $(TESTCACHE)/encoding/pem/encoding_pem-any.o
 testlib_deps_any += $(testlib_encoding_pem_any)
@@ -3133,6 +3156,18 @@ $(TESTCACHE)/encoding/hex/encoding_hex-any.ssa: $(testlib_encoding_hex_any_srcs)
 	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::hex \
 		-t$(TESTCACHE)/encoding/hex/encoding_hex.td $(testlib_encoding_hex_any_srcs)
 
+# encoding::json (+any)
+testlib_encoding_json_any_srcs = \
+	$(STDLIB)/encoding/json/types.ha \
+	$(STDLIB)/encoding/json/lex.ha \
+	$(STDLIB)/encoding/json/+test/lexer.ha
+
+$(TESTCACHE)/encoding/json/encoding_json-any.ssa: $(testlib_encoding_json_any_srcs) $(testlib_rt) $(testlib_ascii_$(PLATFORM)) $(testlib_bufio_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_strio_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_strconv_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/encoding/json
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::json \
+		-t$(TESTCACHE)/encoding/json/encoding_json.td $(testlib_encoding_json_any_srcs)
+
 # encoding::pem (+any)
 testlib_encoding_pem_any_srcs = \
 	$(STDLIB)/encoding/pem/pem.ha \

	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	encoding/json/+test/lexer.ha	\|	60	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	encoding/json/lex.ha	\|	264	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	encoding/json/types.ha	\|	32	++++++++++++++++++++++++++++++++
M	scripts/gen-stdlib	\|	17	+++++++++++++++++
M	stdlib.mk	\|	35	+++++++++++++++++++++++++++++++++++