hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit a112f1369a2f5e01f7bbe170dddabc06df571ca2
parent dbf510799d20337552e0e66b8a48193b7c538505
Author: Drew DeVault <sir@cmpwn.com>
Date:   Mon,  9 May 2022 15:08:44 +0200

encoding::json: new module

Signed-off-by: Drew DeVault <sir@cmpwn.com>

Diffstat:
Aencoding/json/+test/lexer.ha | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aencoding/json/lex.ha | 264+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aencoding/json/types.ha | 32++++++++++++++++++++++++++++++++
Mscripts/gen-stdlib | 17+++++++++++++++++
Mstdlib.mk | 35+++++++++++++++++++++++++++++++++++
5 files changed, 408 insertions(+), 0 deletions(-)

diff --git a/encoding/json/+test/lexer.ha b/encoding/json/+test/lexer.ha
@@ -0,0 +1,60 @@
+use bufio;
+use strings;
+use io;
+
+// Feeds each source string through the lexer and checks that it yields exactly
+// the expected tokens, in order, followed by io::EOF.
+@test fn lex() void = {
+	// Each case pairs a JSON source text with the tokens it should produce.
+	const cases: [_](str, []token) = [
+		("true", [true]),
+		("false", [false]),
+		("null", [_null]),
+		("1234", [1234.0]),
+		("12.34", [12.34]),
+		("12.34e5", [12.34e5]),
+		("12.34E5", [12.34e5]),
+		("12e5", [12.0e5]),
+		("-1234", [-1234.0]),
+		(`"hello world"`, ["hello world"]),
+		(`"\"\\\/\b\f\n\r\t\u0020"`, ["\"\\/\b\f\n\r\t\u0020"]),
+		("[ null, null ]", [arraystart, _null, comma, _null, arrayend]),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		// Expose the case's bytes to the lexer as a read-only stream.
+		const src = strings::toutf8(cases[i].0);
+		const src = bufio::fixed(src, io::mode::READ);
+		const lexer = lex(&src);
+		defer finish(&lexer);
+
+		for (let j = 0z; j < len(cases[i].1); j += 1) {
+			const want = cases[i].1[j];
+			const have = next(&lexer)! as token;
+			assert(tokeq(want, have));
+		};
+
+		// Nothing may remain after the expected tokens.
+		assert(next(&lexer) is io::EOF);
+	};
+};
+
+// Reports whether two tokens are equal. Tokens without a payload compare by
+// type alone; bool, f64 and str tokens are also compared by value. The "as"
+// casts assume that "have" holds the same type as "want".
+fn tokeq(want: token, have: token) bool = {
+	match (want) {
+	case _null =>
+		return have is _null;
+	case comma =>
+		return have is comma;
+	case colon =>
+		return have is colon;
+	case arraystart =>
+		return have is arraystart;
+	case arrayend =>
+		return have is arrayend;
+	case objstart =>
+		return have is objstart;
+	case objend =>
+		return have is objend;
+	case let b: bool =>
+		return have as bool == b;
+	case let f: f64 =>
+		return have as f64 == f;
+	case let s: str =>
+		return have as str == s;
+	};
+};
diff --git a/encoding/json/lex.ha b/encoding/json/lex.ha
@@ -0,0 +1,264 @@
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use os;
+use strconv;
+use strings;
+use strio;
+
+// State for a JSON lexer, created by [[lex]] and released with [[finish]].
+export type lexer = struct {
+	// Buffered reader over the caller's source handle.
+	src: bufio::bufstream,
+	// Storage handed to src as its read buffer; freed by [[finish]].
+	buffer: []u8,
+	// Scratch stream in which words, numbers and strings are accumulated.
+	strbuf: strio::stream,
+};
+
+// Creates a new JSON lexer. The caller can obtain tokens with [[next]] and
+// should pass the result to [[finish]] when they're done with it.
+export fn lex(src: io::handle) lexer = {
+	// The read buffer is heap-allocated here and kept in lexer.buffer so
+	// that finish can free it.
+	let buf: []u8 = alloc([0...], os::BUFSIZ);
+	return lexer {
+		src = bufio::buffered(src, buf, []),
+		buffer = buf,
+		strbuf = strio::dynamic(),
+	};
+};
+
+// Frees state associated with a JSON lexer.
+export fn finish(lex: *lexer) void = {
+	free(lex.buffer);
+	io::close(&lex.strbuf)!;
+};
+
+// Returns the next token from a JSON lexer. The return value is borrowed from
+// the lexer and will be overwritten on subsequent calls.
+//
+// Leading whitespace is skipped. Returns io::EOF once the source is exhausted,
+// and [[invalid]] if the input does not begin a recognized token.
+export fn next(lex: *lexer) (token | io::EOF | error) = {
+	const rn = match (nextrunews(lex)?) {
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		yield rn;
+	};
+
+	switch (rn) {
+	case '[' =>
+		return arraystart;
+	case ']' =>
+		return arrayend;
+	case '{' =>
+		return objstart;
+	case '}' =>
+		return objend;
+	case ',' =>
+		return comma;
+	case ':' =>
+		return colon;
+	case '"' =>
+		return scan_str(lex)?;
+	case =>
+		// Not a single-rune token. Fall through so that numbers and
+		// the true/false/null keywords can be scanned below; returning
+		// invalid here would make them unreachable.
+		yield;
+	};
+
+	// The number and word scanners read from the stream themselves, so
+	// hand the rune we consumed back to them first.
+	bufio::unreadrune(&lex.src, rn);
+	if (ascii::isdigit(rn) || rn == '+' || rn == '-') {
+		return scan_number(lex)?;
+	};
+
+	const word = scan_word(lex)?;
+	switch (word) {
+	case "true" =>
+		return true;
+	case "false" =>
+		return false;
+	case "null" =>
+		return _null;
+	case =>
+		return invalid;
+	};
+};
+
+// Scans until encountering a non-alphabetical character, returning the
+// resulting word. The terminating character is left unread, and the returned
+// string is borrowed from the lexer's string buffer.
+fn scan_word(lex: *lexer) (str | error) = {
+	strio::reset(&lex.strbuf);
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+		if (!ascii::isalpha(rn)) {
+			bufio::unreadrune(&lex.src, rn);
+			break;
+		};
+		strio::appendrune(&lex.strbuf, rn)!;
+	};
+	return strio::string(&lex.strbuf);
+};
+
+// The part of a number literal which scan_number is currently reading.
+type numstate = enum {
+	INTEGER,
+	FRACTION,
+	EXPONENT,
+};
+
+// Accumulates the characters of a number in the lexer's string buffer, tracking
+// which part of the number is being read, then converts the collected text
+// with strconv::stof64. Returns [[invalid]] if the conversion fails.
+fn scan_number(lex: *lexer) (token | error) = {
+	strio::reset(&lex.strbuf);
+
+	let state = numstate::INTEGER;
+	for (true) {
+		const rn = match (nextrune(lex)?)
{
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			break;
+		};
+
+		switch (state) {
+		case numstate::INTEGER =>
+			switch (rn) {
+			case '.' =>
+				state = numstate::FRACTION;
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case '+', '-' =>
+				void;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					// End of the number (e.g. ',' or ']'):
+					// leave the rune for the next token
+					// rather than rejecting the number.
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::FRACTION =>
+			switch (rn) {
+			case 'e', 'E' =>
+				state = numstate::EXPONENT;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		case numstate::EXPONENT =>
+			switch (rn) {
+			case '+', '-' =>
+				// JSON permits a signed exponent ("1e-5").
+				// As with signs in the integer part, the
+				// placement is left to strconv::stof64 to
+				// validate.
+				void;
+			case =>
+				if (!ascii::isdigit(rn)) {
+					bufio::unreadrune(&lex.src, rn);
+					break;
+				};
+			};
+		};
+
+		strio::appendrune(&lex.strbuf, rn)!;
+	};
+
+	// Anything strconv cannot parse as a float is not a valid number token.
+	match (strconv::stof64(strio::string(&lex.strbuf))) {
+	case let f: f64 =>
+		return f;
+	case =>
+		return invalid;
+	};
+};
+
+// Scans the remainder of a string token up to its closing quote, decoding
+// escape sequences with scan_escape. Reaching EOF before the closing quote
+// yields [[invalid]]. The returned string is borrowed from the lexer's string
+// buffer.
+fn scan_str(lex: *lexer) (token | error) = {
+	strio::reset(&lex.strbuf);
+
+	for (true) {
+		const rn = match (nextrune(lex)?) {
+		case let rn: rune =>
+			yield rn;
+		case io::EOF =>
+			return invalid;
+		};
+
+		switch (rn) {
+		case '"' =>
+			break;
+		case '\\' =>
+			const rn = scan_escape(lex)?;
+			strio::appendrune(&lex.strbuf, rn)!;
+		case =>
+			strio::appendrune(&lex.strbuf, rn)!;
+		};
+	};
+
+	return strio::string(&lex.strbuf);
+};
+
+// Decodes the escape sequence which follows a backslash inside a string and
+// returns the rune it denotes. Reaching EOF yields [[invalid]].
+fn scan_escape(lex: *lexer) (rune | error) = {
+	const rn = match (nextrune(lex)?) {
+	case let rn: rune =>
+		yield rn;
+	case io::EOF =>
+		return invalid;
+	};
+
+	switch (rn) {
+	case '\"' =>
+		return '\"';
+	case '\\' =>
+		return '\\';
+	case '/' =>
+		return '/';
+	case 'b' =>
+		return '\b';
+	case 'f' =>
+		return '\f';
+	case 'n' =>
+		return '\n';
+	case 'r' =>
+		return '\r';
+	case 't' =>
+		return '\t';
+	case 'u' =>
+		// \uXXXX: the four hex digits are read as raw bytes.
+		let buf: [4]u8 = [0...];
+		match (io::readall(&lex.src, buf)?)
{
+		case io::EOF =>
+			return invalid;
+		case size =>
+			yield;
+		};
+		const s = match (strings::try_fromutf8(buf)) {
+		case let s: str =>
+			yield s;
+		case =>
+			return invalid;
+		};
+		// NOTE(review): the value is cast to a rune as-is; UTF-16
+		// surrogate pairs are not combined here.
+		match (strconv::stou32b(s, strconv::base::HEX)) {
+		case let u: u32 =>
+			return u: rune;
+		case =>
+			return invalid;
+		};
+	case =>
+		// Any other character after a backslash is not an escape
+		// sequence defined by JSON.
+		return invalid;
+	};
+};
+
+// Gets the next rune from the I/O source. Input which is not valid UTF-8 is
+// reported as [[invalid]].
+fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
+	match (bufio::scanrune(&lex.src)) {
+	case let err: io::error =>
+		return err;
+	case utf8::invalid =>
+		return invalid;
+	case io::EOF =>
+		return io::EOF;
+	case let rn: rune =>
+		return rn;
+	};
+};
+
+// Like nextrune but skips whitespace tokens
+fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
+	for (true) {
+		match (nextrune(lex)?) {
+		case let rn: rune =>
+			if (ascii::isspace(rn)) {
+				continue;
+			};
+			return rn;
+		case io::EOF =>
+			return io::EOF;
+		};
+	};
+	abort(); // Unreachable
+};
diff --git a/encoding/json/types.ha b/encoding/json/types.ha
@@ -0,0 +1,32 @@
+use io;
+
+// An invalid JSON token was encountered.
+export type invalid = !void;
+
+// A tagged union of all possible errors returned from this module.
+export type error = !(invalid | io::error);
+
+// The JSON null value.
+export type _null = void;
+
+// The '[' token, signaling the start of a JSON array.
+export type arraystart = void;
+
+// The ']' token, signaling the end of a JSON array.
+export type arrayend = void;
+
+// The '{' token, signaling the start of a JSON object.
+export type objstart = void;
+
+// The '}' token, signaling the end of a JSON object.
+export type objend = void;
+
+// The ':' token.
+export type colon = void;
+
+// The ',' token.
+export type comma = void;
+
+// All tokens which can be returned from the JSON tokenizer.
+export type token = (arraystart | arrayend | objstart |
+	objend | colon | comma | str | f64 | bool | _null);
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -509,6 +509,22 @@ encoding_hex() {
 	gen_ssa encoding::hex ascii bytes fmt io strconv strio strings
 }
 
+encoding_json() {
+	# Test builds additionally compile the lexer tests in +test/lexer.ha.
+	if [ $testing -eq 0 ]
+	then
+		gen_srcs encoding::json \
+			types.ha \
+			lex.ha
+	else
+		gen_srcs encoding::json \
+			types.ha \
+			lex.ha \
+			+test/lexer.ha
+	fi
+	gen_ssa encoding::json ascii bufio io strio os encoding::utf8 strings \
+		strconv
+}
+
 encoding_pem() {
 	if [ $testing -eq 0 ]
 	then
@@ -1363,6 +1379,7 @@ dirs
 encoding::base64
 encoding::base32
 encoding::hex
+encoding::json
 encoding::pem
 encoding::utf8
 endian
diff --git a/stdlib.mk b/stdlib.mk
@@ -308,6 +308,12 @@ stdlib_deps_any += $(stdlib_encoding_hex_any)
 stdlib_encoding_hex_linux = $(stdlib_encoding_hex_any)
 stdlib_encoding_hex_freebsd = $(stdlib_encoding_hex_any)
 
+# gen_lib encoding::json (any)
+stdlib_encoding_json_any = $(HARECACHE)/encoding/json/encoding_json-any.o
+stdlib_deps_any += $(stdlib_encoding_json_any)
+stdlib_encoding_json_linux = $(stdlib_encoding_json_any)
+stdlib_encoding_json_freebsd = $(stdlib_encoding_json_any)
+
 # gen_lib encoding::pem (any)
 stdlib_encoding_pem_any = $(HARECACHE)/encoding/pem/encoding_pem-any.o
 stdlib_deps_any += $(stdlib_encoding_pem_any)
@@ -1051,6 +1057,17 @@ $(HARECACHE)/encoding/hex/encoding_hex-any.ssa: $(stdlib_encoding_hex_any_srcs)
 	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::hex \
 		-t$(HARECACHE)/encoding/hex/encoding_hex.td $(stdlib_encoding_hex_any_srcs)
 
+# encoding::json (+any)
+stdlib_encoding_json_any_srcs = \
+	$(STDLIB)/encoding/json/types.ha \
+	$(STDLIB)/encoding/json/lex.ha
+
+$(HARECACHE)/encoding/json/encoding_json-any.ssa: $(stdlib_encoding_json_any_srcs) $(stdlib_rt) $(stdlib_ascii_$(PLATFORM)) $(stdlib_bufio_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_strio_$(PLATFORM)) $(stdlib_os_$(PLATFORM)) $(stdlib_encoding_utf8_$(PLATFORM))
$(stdlib_strings_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/encoding/json
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::json \
+		-t$(HARECACHE)/encoding/json/encoding_json.td $(stdlib_encoding_json_any_srcs)
+
 # encoding::pem (+any)
 stdlib_encoding_pem_any_srcs = \
 	$(STDLIB)/encoding/pem/pem.ha
@@ -2370,6 +2387,12 @@ testlib_deps_any += $(testlib_encoding_hex_any)
 testlib_encoding_hex_linux = $(testlib_encoding_hex_any)
 testlib_encoding_hex_freebsd = $(testlib_encoding_hex_any)
 
+# gen_lib encoding::json (any)
+testlib_encoding_json_any = $(TESTCACHE)/encoding/json/encoding_json-any.o
+testlib_deps_any += $(testlib_encoding_json_any)
+testlib_encoding_json_linux = $(testlib_encoding_json_any)
+testlib_encoding_json_freebsd = $(testlib_encoding_json_any)
+
 # gen_lib encoding::pem (any)
 testlib_encoding_pem_any = $(TESTCACHE)/encoding/pem/encoding_pem-any.o
 testlib_deps_any += $(testlib_encoding_pem_any)
@@ -3133,6 +3156,18 @@ $(TESTCACHE)/encoding/hex/encoding_hex-any.ssa: $(testlib_encoding_hex_any_srcs)
 	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::hex \
 		-t$(TESTCACHE)/encoding/hex/encoding_hex.td $(testlib_encoding_hex_any_srcs)
 
+# encoding::json (+any)
+testlib_encoding_json_any_srcs = \
+	$(STDLIB)/encoding/json/types.ha \
+	$(STDLIB)/encoding/json/lex.ha \
+	$(STDLIB)/encoding/json/+test/lexer.ha
+
+$(TESTCACHE)/encoding/json/encoding_json-any.ssa: $(testlib_encoding_json_any_srcs) $(testlib_rt) $(testlib_ascii_$(PLATFORM)) $(testlib_bufio_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_strio_$(PLATFORM)) $(testlib_os_$(PLATFORM)) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_strconv_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/encoding/json
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::json \
+		-t$(TESTCACHE)/encoding/json/encoding_json.td $(testlib_encoding_json_any_srcs)
+
 # encoding::pem (+any)
 testlib_encoding_pem_any_srcs = \
$(STDLIB)/encoding/pem/pem.ha \