hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit d396fa03d53033441b5bfb2fd97c81835e09f57c
parent d0c9cd5f8380963cf9e5775a41caf9aea0393a7c
Author: Drew DeVault <sir@cmpwn.com>
Date:   Tue,  6 Apr 2021 11:03:50 -0400

encoding::xml: parse element start

Diffstat:
A encoding/xml/chars.ha | 30 ++++++++++++++++++++++++++++++
M encoding/xml/parser.ha | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M encoding/xml/types.ha | 46 ++++++++++++++++++++++++++++++++++++++++------
M scripts/gen-stdlib | 5 +++--
M stdlib.mk | 10 ++++++----
5 files changed, 137 insertions(+), 14 deletions(-)

diff --git a/encoding/xml/chars.ha b/encoding/xml/chars.ha @@ -0,0 +1,30 @@ +use ascii; + +fn isnamestart(rn: rune) bool = { + if (rn == ':' || rn == '_' || ascii::isalpha(rn)) return true; + let rn = rn: u32; + return + (rn >= 0xC0 && rn <= 0xD6) || + (rn >= 0xD8 && rn <= 0xF6) || + (rn >= 0xF8 && rn <= 0x2FF) || + (rn >= 0x370 && rn <= 0x37D) || + (rn >= 0x37F && rn <= 0x1FFF) || + (rn >= 0x200C && rn <= 0x200D) || + (rn >= 0x2070 && rn <= 0x218F) || + (rn >= 0x2C00 && rn <= 0x2FEF) || + (rn >= 0x3001 && rn <= 0xD7FF) || + (rn >= 0xF900 && rn <= 0xFDCF) || + (rn >= 0xFDF0 && rn <= 0xFFFD) || + (rn >= 0x10000 && rn <= 0xEFFFF); +}; + +fn isname(rn: rune) bool = { + if (isnamestart(rn) || rn == '-' || rn == '.' || ascii::isdigit(rn)) { + return true; + }; + let rn = rn: u32; + return + (rn == 0xB7) || + (rn >= 0x300 && rn <= 0x36F) || + (rn >= 0x203F && rn <= 0x2040); +}; diff --git a/encoding/xml/parser.ha b/encoding/xml/parser.ha @@ -3,6 +3,7 @@ use bufio; use encoding::utf8; use io; use strings; +use strio; // Returns an XML parser which reads from a stream. The caller must call // [parser_free] when they are finished with it. @@ -36,10 +37,65 @@ export fn parser_free(par: *parser) void = { free(par); }; +// Scans for and returns the next [token]. The caller must pass the returned +// token to [token_free] when they're done with it. +export fn scan(par: *parser) (token | void | error) = { + want(par, OPTWS)?; + let rn: rune = match (bufio::scanrune(par.in)?) 
{ + io::EOF => return void, + rn: rune => rn, + }; + bufio::unreadrune(par.in, rn); + return switch (par.state) { + state::ELEMENT => switch (rn) { + '<' => { + let el = scan_element(par); + par.state = state::ATTRS; + el; + }, + * => syntaxerr, + }, + state::ATTRS => { + abort(); // TODO + }, + }; +}; + +fn scan_element(par: *parser) (token | error) = { + want(par, '<')?; + let name = scan_name(par)?; + return name: elementstart; +}; + +fn scan_name(par: *parser) (str | error) = { + let buf = strio::dynamic(); + + const rn = match (bufio::scanrune(par.in)?) { + io::EOF => return syntaxerr, + rn: rune => rn, + }; + if (!isnamestart(rn)) { + return syntaxerr; + }; + strio::appendrune(buf, rn); + + for (true) match (bufio::scanrune(par.in)?) { + io::EOF => return syntaxerr, + rn: rune => if (isname(rn)) { + strio::appendrune(buf, rn); + } else { + bufio::unreadrune(par.in, rn); + break; + }, + }; + + return strio::finish(buf); +}; + fn prolog(par: *parser) (void | error) = { - want(par, "<?xml", WS); + want(par, "<?xml", WS)?; - want(par, "version", OPTWS, '=', OPTWS); + want(par, "version", OPTWS, '=', OPTWS)?; let quot = quote(par)?; want(par, OPTWS, "1.")?; for (true) match (bufio::scanrune(par.in)?) { diff --git a/encoding/xml/types.ha b/encoding/xml/types.ha @@ -1,19 +1,40 @@ use encoding::utf8; use io; -// A syntax error was encountered in the document. -export type syntaxerr = void!; // TODO: Add line number? - -// Any error which can occur during XML parsing. -export type error = (syntaxerr | utf8::invalid | io::error)!; - // Represents the state for an XML parser. export type parser = struct { orig: *io::stream, in: *io::stream, buf: [4096]u8, + state: state, }; +export type state = enum { + ELEMENT, + ATTRS, +}; + +// The start of an XML element, e.g. <example +export type elementstart = str; + +// The end of an XML element, e.g. /> or </example> +export type elementend = void; + +// An attribute of an XML element, e.g. 
foo="bar" +export type attribute = (str, str); + +// Text content of an XML element, e.g. baz or <![CDATA[baz]]> +export type text = str; + +// Any valid XML token +export type token = (elementstart | elementend | attribute | text); + +// A syntax error was encountered in the document. +export type syntaxerr = void!; // TODO: Add line number? + +// Any error which can occur during XML parsing. +export type error = (syntaxerr | utf8::invalid | io::error)!; + // Converts an [error] to a user-friendly string representation. export fn strerror(err: error) const str = { return match (err) { @@ -22,3 +43,16 @@ export fn strerror(err: error) const str = { err: io::error => io::strerror(err), }; }; + +// Frees resources associated with a [token]. +export fn token_free(tok: token) void = { + match (tok) { + el: elementstart => free(el), + attr: attribute => { + free(attr.0); + free(attr.1); + }, + tx: text => free(tx), + elementend => void, + }; +}; diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib @@ -216,8 +216,9 @@ encoding_utf8() { encoding_xml() { gen_srcs encoding::xml \ types.ha \ - parser.ha - gen_ssa encoding::xml io bufio strings + parser.ha \ + chars.ha + gen_ssa encoding::xml io bufio strings ascii strio } endian() { diff --git a/stdlib.mk b/stdlib.mk @@ -351,9 +351,10 @@ $(HARECACHE)/encoding/utf8/encoding_utf8.ssa: $(stdlib_encoding_utf8_srcs) $(std # encoding::xml stdlib_encoding_xml_srcs= \ $(STDLIB)/encoding/xml/types.ha \ - $(STDLIB)/encoding/xml/parser.ha + $(STDLIB)/encoding/xml/parser.ha \ + $(STDLIB)/encoding/xml/chars.ha -$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings) +$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings) $(stdlib_ascii) $(stdlib_strio) @printf 'HAREC \t$@\n' @mkdir -p $(HARECACHE)/encoding/xml @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::xml \ @@ 
-1167,9 +1168,10 @@ $(TESTCACHE)/encoding/utf8/encoding_utf8.ssa: $(testlib_encoding_utf8_srcs) $(te # encoding::xml testlib_encoding_xml_srcs= \ $(STDLIB)/encoding/xml/types.ha \ - $(STDLIB)/encoding/xml/parser.ha + $(STDLIB)/encoding/xml/parser.ha \ + $(STDLIB)/encoding/xml/chars.ha -$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings) +$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings) $(testlib_ascii) $(testlib_strio) @printf 'HAREC \t$@\n' @mkdir -p $(TESTCACHE)/encoding/xml @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::xml \