commit a112f1369a2f5e01f7bbe170dddabc06df571ca2
parent dbf510799d20337552e0e66b8a48193b7c538505
Author: Drew DeVault <sir@cmpwn.com>
Date: Mon, 9 May 2022 15:08:44 +0200
encoding::json: new module
Signed-off-by: Drew DeVault <sir@cmpwn.com>
Diffstat:
5 files changed, 408 insertions(+), 0 deletions(-)
diff --git a/encoding/json/+test/lexer.ha b/encoding/json/+test/lexer.ha
@@ -0,0 +1,60 @@
+use bufio;
+use strings;
+use io;
+
+// Feeds each (input, expected-token-list) pair through the lexer and checks
+// that the produced tokens match (via tokeq) and that the stream then ends
+// with io::EOF.
+@test fn lex() void = {
+ const cases: [_](str, []token) = [
+ ("true", [true]),
+ ("false", [false]),
+ ("null", [_null]),
+ ("1234", [1234.0]),
+ ("12.34", [12.34]),
+ ("12.34e5", [12.34e5]),
+ ("12.34E5", [12.34e5]),
+ ("12e5", [12.0e5]),
+ ("-1234", [-1234.0]),
+ (`"hello world"`, ["hello world"]),
+ (`"\"\\\/\b\f\n\r\t\u0020"`, ["\"\\/\b\f\n\r\t\u0020"]),
+ ("[ null, null ]", [arraystart, _null, comma, _null, arrayend]),
+ ];
+
+ for (let i = 0z; i < len(cases); i += 1) {
+ // Lex from an in-memory, read-only stream over the input bytes;
+ // shadowing `src` (bytes -> stream) is deliberate.
+ const src = strings::toutf8(cases[i].0);
+ const src = bufio::fixed(src, io::mode::READ);
+ const lexer = lex(&src);
+ defer finish(&lexer);
+
+ for (let j = 0z; j < len(cases[i].1); j += 1) {
+ const want = cases[i].1[j];
+ const have = next(&lexer)! as token;
+ assert(tokeq(want, have));
+ };
+
+ // After the last expected token, the lexer must report EOF.
+ assert(next(&lexer) is io::EOF);
+ };
+};
+
+// Compares two tokens: the variant (tag) must match, and for the
+// value-carrying variants (bool, f64, str) the payloads must be equal.
+// NOTE(review): for the value cases, `have as T` is a type assertion that
+// aborts if `have` holds a different variant instead of returning false —
+// acceptable in a test helper, where any mismatch is a failure anyway.
+fn tokeq(want: token, have: token) bool = {
+ match (want) {
+ case _null =>
+ return have is _null;
+ case comma =>
+ return have is comma;
+ case colon =>
+ return have is colon;
+ case arraystart =>
+ return have is arraystart;
+ case arrayend =>
+ return have is arrayend;
+ case objstart =>
+ return have is objstart;
+ case objend =>
+ return have is objend;
+ case let b: bool =>
+ return have as bool == b;
+ case let f: f64 =>
+ return have as f64 == f;
+ case let s: str =>
+ return have as str == s;
+ };
+};
diff --git a/encoding/json/lex.ha b/encoding/json/lex.ha
@@ -0,0 +1,264 @@
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use os;
+use strconv;
+use strings;
+use strio;
+
+// JSON lexer state: a buffered wrapper over the caller's I/O source, the
+// byte buffer backing it, and a reusable string buffer that holds the text
+// of the most recent str token.
+export type lexer = struct {
+ src: bufio::bufstream,
+ buffer: []u8, // owned read buffer; freed by [[finish]]
+ strbuf: strio::stream, // reused per token; closed by [[finish]]
+};
+
+// Creates a new JSON lexer. The caller can obtain tokens with [[next]] and
+// should pass the result to [[finish]] when they're done with it.
+export fn lex(src: io::handle) lexer = {
+ // Allocate an os::BUFSIZ-byte read buffer; ownership stays with the
+ // lexer so [[finish]] can free it.
+ let buf: []u8 = alloc([0...], os::BUFSIZ);
+ return lexer {
+ src = bufio::buffered(src, buf, []),
+ buffer = buf,
+ strbuf = strio::dynamic(),
+ };
+};
+
+// Frees state associated with a JSON lexer.
+export fn finish(lex: *lexer) void = {
+ free(lex.buffer);
+ // '!' asserts the close cannot fail — presumably safe for an
+ // in-memory strio stream; confirm against strio documentation.
+ io::close(&lex.strbuf)!;
+};
+
+// Returns the next token from a JSON lexer. The return value is borrowed from
+// the lexer and will be overwritten on subsequent calls.
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
+	};
+
+	// Structural tokens and strings are identified by their first rune.
+	switch (rn) {
+	case '[' =>
+		return arraystart;
+	case ']' =>
+		return arrayend;
+	case '{' =>
+		return objstart;
+	case '}' =>
+		return objend;
+	case ',' =>
+		return comma;
+	case ':' =>
+		return colon;
+	case '"' =>
+		return scan_str(lex)?;
+	case =>
+		// Fall through to number/keyword scanning below. (A previous
+		// duplicate default case returned [[invalid]] here, making
+		// everything past this switch unreachable.)
+		yield;
+	};
+
+	// Numbers: push the rune back and let scan_number consume the whole
+	// lexeme. NOTE(review): '+' is not a valid leading character for a
+	// JSON number (RFC 8259); accepting it here is lenient, and
+	// strconv::stof64 in scan_number is the backstop.
+	if (ascii::isdigit(rn) || rn == '+' || rn == '-') {
+		bufio::unreadrune(&lex.src, rn);
+		return scan_number(lex)?;
+	};
+
+	// Keywords: true, false, null. Any other word is invalid.
+	bufio::unreadrune(&lex.src, rn);
+	const word = scan_word(lex)?;
+	switch (word) {
+	case "true" =>
+		return true;
+	case "false" =>
+		return false;
+	case "null" =>
+		return _null;
+	case =>
+		return invalid;
+	};
+};
+
+// Scans until encountering a non-alphabetical character, returning the
+// resulting word. The terminating rune (if any) is unread so the caller can
+// process it; the returned string is borrowed from the lexer's buffer and
+// is only valid until the next scan.
+fn scan_word(lex: *lexer) (str | error) = {
+ strio::reset(&lex.strbuf);
+ for (true) {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ break;
+ };
+ if (!ascii::isalpha(rn)) {
+ bufio::unreadrune(&lex.src, rn);
+ break;
+ };
+ strio::appendrune(&lex.strbuf, rn)!;
+ };
+ return strio::string(&lex.strbuf);
+};
+
+// State machine for [[scan_number]]: which part of the number literal is
+// currently being consumed.
+type numstate = enum {
+ INTEGER, // digits before '.' or exponent
+ FRACTION, // digits after '.'
+ EXPONENT, // digits after 'e'/'E'
+};
+
+// Scans a JSON number into the lexer's string buffer and parses it with
+// strconv::stof64, returning the value as an f64 token. The number ends at
+// EOF or at the first rune that cannot continue it; that terminating rune
+// is unread so the caller sees it as the start of the next token.
+fn scan_number(lex: *lexer) (token | error) = {
+	strio::reset(&lex.strbuf);
+
+	let state = numstate::INTEGER;
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+
+		switch (state) {
+		case numstate::INTEGER =>
+			switch (rn) {
+			case '.' =>
+				state = numstate::FRACTION;
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case '+', '-' =>
+				// Sign runes; their position is not validated
+				// here — strconv::stof64 below rejects
+				// misplaced signs.
+				void;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					// Terminator (e.g. ',' or ']'): unread
+					// it and stop, matching the FRACTION
+					// and EXPONENT states. (Previously
+					// this returned invalid, breaking any
+					// number followed by another token.)
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::FRACTION =>
+			switch (rn) {
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::EXPONENT =>
+			if (!ascii::isdigit(rn)) {
+				bufio::unreadrune(&lex.src, rn);
+				break;
+			};
+		};
+
+		strio::appendrune(&lex.strbuf, rn)!;
+	};
+
+	// Let strconv validate the collected lexeme; anything it rejects
+	// (e.g. "1.", "1e", "+-2") is an invalid token.
+	match (strconv::stof64(strio::string(&lex.strbuf))) {
+	case let f: f64 =>
+		return f;
+	case =>
+		return invalid;
+	};
+};
+
+// Scans a quoted JSON string; the opening '"' has already been consumed.
+// Backslash escapes are decoded via [[scan_escape]]. Returns the decoded
+// text, borrowed from the lexer's string buffer.
+fn scan_str(lex: *lexer) (token | error) = {
+ strio::reset(&lex.strbuf);
+
+ for (true) {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ // EOF before the closing quote: malformed input.
+ return invalid;
+ };
+
+ switch (rn) {
+ case '"' =>
+ break;
+ case '\\' =>
+ const rn = scan_escape(lex)?;
+ strio::appendrune(&lex.strbuf, rn)!;
+ case =>
+ strio::appendrune(&lex.strbuf, rn)!;
+ };
+ };
+
+ return strio::string(&lex.strbuf);
+};
+
+// Decodes one escape sequence following a backslash inside a string.
+// NOTE(review): the \uXXXX arm converts the parsed code point directly to a
+// rune; UTF-16 surrogate pairs (e.g. \uD834\uDD1E) are not combined, and
+// values in the surrogate range are not rejected — confirm whether full
+// RFC 8259 string semantics are required here.
+fn scan_escape(lex: *lexer) (rune | error) = {
+ const rn = match (nextrune(lex)?) {
+ case let rn: rune =>
+ yield rn;
+ case io::EOF =>
+ return invalid;
+ };
+
+ switch (rn) {
+ case '\"' =>
+ return '\"';
+ case '\\' =>
+ return '\\';
+ case '/' =>
+ return '/';
+ case 'b' =>
+ return '\b';
+ case 'f' =>
+ return '\f';
+ case 'n' =>
+ return '\n';
+ case 'r' =>
+ return '\r';
+ case 't' =>
+ return '\t';
+ case 'u' =>
+ // Exactly four hex digits must follow; a short read is invalid.
+ let buf: [4]u8 = [0...];
+ match (io::readall(&lex.src, buf)?) {
+ case io::EOF =>
+ return invalid;
+ case size =>
+ yield;
+ };
+ const s = match (strings::try_fromutf8(buf)) {
+ case let s: str =>
+ yield s;
+ case =>
+ return invalid;
+ };
+ // Parse the four digits as base-16 and cast to a rune.
+ match (strconv::stou32b(s, strconv::base::HEX)) {
+ case let u: u32 =>
+ return u: rune;
+ case =>
+ return invalid;
+ };
+ };
+};
+
+// Reads one rune from the buffered source. UTF-8 decode failures are mapped
+// to [[invalid]]; EOF and I/O errors are passed through unchanged.
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+	match (bufio::scanrune(&lex.src)) {
+	case let rn: rune =>
+		return rn;
+	case io::EOF =>
+		return io::EOF;
+	case utf8::invalid =>
+		return invalid;
+	case let err: io::error =>
+		return err;
+	};
+};
+
+// Like nextrune but skips whitespace tokens
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+ for (true) {
+ match (nextrune(lex)?) {
+ case let rn: rune =>
+ // Insignificant whitespace between tokens is discarded.
+ if (ascii::isspace(rn)) {
+ continue;
+ };
+ return rn;
+ case io::EOF =>
+ return io::EOF;
+ };
+ };
+ abort(); // Unreachable
+};
diff --git a/encoding/json/types.ha b/encoding/json/types.ha
@@ -0,0 +1,32 @@
+use io;
+
+// An invalid JSON token was encountered.
+export type invalid = !void;
+
+// A tagged union of all possible errors returned from this module.
+export type error = !(invalid | io::error);
+
+// The JSON null value.
+export type _null = void;
+
+// The '[' token, signaling the start of a JSON array.
+export type arraystart = void;
+
+// The ']' token, signaling the end of a JSON array.
+export type arrayend = void;
+
+// The '{' token, signaling the start of a JSON object.
+export type objstart = void;
+
+// The '}' token, signaling the end of a JSON object.
+export type objend = void;
+
+// The ':' token.
+export type colon = void;
+
+// The ',' token.
+export type comma = void;
+
+// All tokens which can be returned from the JSON tokenizer. Note that str
+// tokens are borrowed from the lexer and remain valid only until the next
+// call to next().
+export type token = (arraystart | arrayend | objstart |
+ objend | colon | comma | str | f64 | bool | _null);
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -509,6 +509,22 @@ encoding_hex() {
gen_ssa encoding::hex ascii bytes fmt io strconv strio strings
}
+# Source and dependency lists for the encoding::json module; test builds
+# additionally compile the in-tree lexer tests.
+encoding_json() {
+ if [ $testing -eq 0 ]
+ then
+ gen_srcs encoding::json \
+ types.ha \
+ lex.ha
+ else
+ gen_srcs encoding::json \
+ types.ha \
+ lex.ha \
+ +test/lexer.ha
+ fi
+ gen_ssa encoding::json ascii bufio io strio os encoding::utf8 strings \
+ strconv
+}
+
encoding_pem() {
if [ $testing -eq 0 ]
then
@@ -1363,6 +1379,7 @@ dirs
encoding::base64
encoding::base32
encoding::hex
+encoding::json
encoding::pem
encoding::utf8
endian
diff --git a/stdlib.mk b/stdlib.mk
@@ -308,6 +308,12 @@ stdlib_deps_any += $(stdlib_encoding_hex_any)
stdlib_encoding_hex_linux = $(stdlib_encoding_hex_any)
stdlib_encoding_hex_freebsd = $(stdlib_encoding_hex_any)
+# gen_lib encoding::json (any)
+# NOTE(review): stanza mirrors the encoding_json() entry in scripts/gen-stdlib
+# — presumably regenerated by that script; edit there. TODO confirm.
+stdlib_encoding_json_any = $(HARECACHE)/encoding/json/encoding_json-any.o
+stdlib_deps_any += $(stdlib_encoding_json_any)
+stdlib_encoding_json_linux = $(stdlib_encoding_json_any)
+stdlib_encoding_json_freebsd = $(stdlib_encoding_json_any)
+
# gen_lib encoding::pem (any)
stdlib_encoding_pem_any = $(HARECACHE)/encoding/pem/encoding_pem-any.o
stdlib_deps_any += $(stdlib_encoding_pem_any)
@@ -1051,6 +1057,17 @@ $(HARECACHE)/encoding/hex/encoding_hex-any.ssa: $(stdlib_encoding_hex_any_srcs)
@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::hex \
-t$(HARECACHE)/encoding/hex/encoding_hex.td $(stdlib_encoding_hex_any_srcs)
+# encoding::json (+any)
+# Non-test build: compiles types.ha + lex.ha; the prerequisite list must
+# match the dependencies passed to gen_ssa in scripts/gen-stdlib.
+stdlib_encoding_json_any_srcs = \
+ $(STDLIB)/encoding/json/types.ha \
+ $(STDLIB)/encoding/json/lex.ha
+
+$(HARECACHE)/encoding/json/encoding_json-any.ssa: $(stdlib_encoding_json_any_srcs) $(stdlib_rt) $(stdlib_ascii_$(PLATFORM)) $(stdlib_bufio_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_strio_$(PLATFORM)) $(stdlib_os_$(PLATFORM)) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_strings_$(PLATFORM))
+ @printf 'HAREC \t$@\n'
+ @mkdir -p $(HARECACHE)/encoding/json
+ @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::json \
+ -t$(HARECACHE)/encoding/json/encoding_json.td $(stdlib_encoding_json_any_srcs)
+
# encoding::pem (+any)
stdlib_encoding_pem_any_srcs = \
$(STDLIB)/encoding/pem/pem.ha
@@ -2370,6 +2387,12 @@ testlib_deps_any += $(testlib_encoding_hex_any)
testlib_encoding_hex_linux = $(testlib_encoding_hex_any)
testlib_encoding_hex_freebsd = $(testlib_encoding_hex_any)
+# gen_lib encoding::json (any)
+# Test-library counterpart of the stdlib_encoding_json_* stanza above.
+testlib_encoding_json_any = $(TESTCACHE)/encoding/json/encoding_json-any.o
+testlib_deps_any += $(testlib_encoding_json_any)
+testlib_encoding_json_linux = $(testlib_encoding_json_any)
+testlib_encoding_json_freebsd = $(testlib_encoding_json_any)
+
# gen_lib encoding::pem (any)
testlib_encoding_pem_any = $(TESTCACHE)/encoding/pem/encoding_pem-any.o
testlib_deps_any += $(testlib_encoding_pem_any)
@@ -3133,6 +3156,18 @@ $(TESTCACHE)/encoding/hex/encoding_hex-any.ssa: $(testlib_encoding_hex_any_srcs)
@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::hex \
-t$(TESTCACHE)/encoding/hex/encoding_hex.td $(testlib_encoding_hex_any_srcs)
+# encoding::json (+any)
+# Test build: same sources as the stdlib rule plus +test/lexer.ha.
+testlib_encoding_json_any_srcs = \
+ $(STDLIB)/encoding/json/types.ha \
+ $(STDLIB)/encoding/json/lex.ha \
+ $(STDLIB)/encoding/json/+test/lexer.ha
+
+$(TESTCACHE)/encoding/json/encoding_json-any.ssa: $(testlib_encoding_json_any_srcs) $(testlib_rt) $(testlib_ascii_$(PLATFORM)) $(testlib_bufio_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_strio_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_strings_$(PLATFORM))
+ @printf 'HAREC \t$@\n'
+ @mkdir -p $(TESTCACHE)/encoding/json
+ @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::json \
+ -t$(TESTCACHE)/encoding/json/encoding_json.td $(testlib_encoding_json_any_srcs)
+
# encoding::pem (+any)
testlib_encoding_pem_any_srcs = \
$(STDLIB)/encoding/pem/pem.ha \