hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit c6fbfa5519948400676271027e801400732d7bdd
parent 235bdab44ca526567e112b23cd1aaff6dce042e4
Author: Drew DeVault <sir@cmpwn.com>
Date:   Mon,  5 Apr 2021 15:30:50 -0400

encoding::xml: initial skeleton

Diffstat:
A encoding/xml/parser.ha | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A encoding/xml/types.ha | 24 ++++++++++++++++++++++++
M scripts/gen-stdlib | 8 ++++++++
M stdlib.mk | 28 ++++++++++++++++++++++++++++
4 files changed, 204 insertions(+), 0 deletions(-)

diff --git a/encoding/xml/parser.ha b/encoding/xml/parser.ha
@@ -0,0 +1,144 @@
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use strings;
+
+// Returns an XML parser which reads from a stream. The caller must call
+// [parser_free] when they are finished with it.
+//
+// Hare's XML parser only supports UTF-8 encoded input files.
+//
+// This function will attempt to read the XML prologue before returning, and
+// will return an error if it is not valid.
+export fn parse(in: *io::stream) (*parser | error) = {
+	// XXX: The main reason we allocate this instead of returning it on the
+	// stack is so that we have a consistent address for the bufio buffer.
+	// This is kind of lame, maybe we can avoid that.
+	let par = alloc(parser {
+		orig = in,
+		in = in,
+		...
+	});
+	if (!bufio::isbuffered(in)) { // wrap unbuffered input; par.buf is the read buffer
+		par.in = bufio::buffered(par.in, par.buf[..], []);
+	};
+	prolog(par)?; // NOTE(review): an error returns here without parser_free(par); looks like par is leaked - confirm
+	return par;
+};
+
+// Frees the resources associated with this parser. Does not close the
+// underlying stream.
+export fn parser_free(par: *parser) void = {
+	if (par.in != par.orig) { // differs only when parse() installed a bufio wrapper
+		io::close(par.in);
+	};
+	free(par);
+};
+
+fn prolog(par: *parser) (void | error) = { // parses the declaration: <?xml version=... [encoding=...] [standalone=...] ?>
+	want(par, "<?xml", WS); // NOTE(review): (bool | error) result is discarded; the later want() calls use `?` - confirm intent
+
+	want(par, "version", OPTWS, '=', OPTWS); // NOTE(review): result discarded here as well (no `?`)
+	let quot = quote(par)?; // remember which quote rune opened the value
+	want(par, OPTWS, "1.")?; // NOTE(review): OPTWS accepts whitespace between the opening quote and "1."
+	for (true) match (bufio::scanrune(par.in)?) { // skip the digits following "1."
+		io::EOF => break,
+		rn: rune => if (!ascii::isdigit(rn)) {
+			bufio::unreadrune(par.in, rn); // push the first non-digit back
+			break;
+		},
+	};
+	want(par, quot)?; // closing quote must equal the opening one
+
+	// TODO: Replace this with attribute() when it's written
+	let hadws = want(par, OPTWS)?;
+	let encoding = match (bufio::scanrune(par.in)) { // NOTE(review): no `?` and only EOF/rune arms; other scanrune calls use `?` - confirm
+		io::EOF => false,
+		rn: rune => {
+			bufio::unreadrune(par.in, rn); // peek only: the rune is pushed back
+			hadws && rn == 'e'; // attempt "encoding" only if whitespace preceded an 'e'
+		},
+	};
+	if (encoding) {
+		want(par, "encoding", OPTWS, '=', OPTWS)?;
+		let quot = quote(par)?;
+		match (want(par, "UTF-8")) {
+			syntaxerr => return utf8::invalid, // any value other than "UTF-8" is reported as utf8::invalid
+			err: error => return err,
+			bool => void,
+		};
+		want(par, quot)?;
+	};
+
+	let hadws = want(par, OPTWS)?;
+	let standalone = match (bufio::scanrune(par.in)) { // NOTE(review): same unpropagated scanrune result as the encoding peek
+		io::EOF => false,
+		rn: rune => {
+			bufio::unreadrune(par.in, rn); // peek only: the rune is pushed back
+			hadws && rn == 's'; // attempt "standalone" only if whitespace preceded an 's'
+		},
+	};
+	if (standalone) {
+		want(par, "standalone", OPTWS, '=', OPTWS)?;
+		let quot = quote(par)?;
+		// TODO: Should we support standalone="no"?
+		want(par, "yes", quot)?;
+	};
+
+	want(par, OPTWS, "?>")?;
+	// TODO: Parse doctypedecl & misc
+	return;
+};
+
+// A whitespace token for [want]. Mandatory if true (WS), optional if false.
+type whitespace = bool;
+def WS: whitespace = true;
+def OPTWS: whitespace = false;
+
+fn quote(par: *parser) (rune | error) = { // reads one rune and returns it if it is '"' or '\''
+	return match (bufio::scanrune(par.in)?) {
+		* => return syntaxerr, // anything that is not a rune (e.g. io::EOF)
+		rn: rune => switch (rn) {
+			'"', '\'' => rn,
+			* => return syntaxerr,
+		},
+	};
+};
+
+fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = { // consumes each token in order, or returns syntaxerr
+	let hadws = false; // set by each whitespace token; the final value is returned
+	for (let i = 0z; i < len(tok); i += 1) match (tok[i]) {
+		x: rune => {
+			let have = match (bufio::scanrune(par.in)?) {
+				* => return syntaxerr, // non-rune result (e.g. io::EOF) where a rune was required
+				rn: rune => rn,
+			};
+			if (have != x) {
+				return syntaxerr;
+			};
+		},
+		x: str => {
+			let iter = strings::iter(x);
+			for (true) match (strings::next(&iter)) { // match the string one rune at a time
+				rn: rune => want(par, rn)?,
+				void => break,
+			};
+		},
+		ws: whitespace => {
+			let n = 0; // number of whitespace runes consumed
+			for (true; n += 1) match (bufio::scanrune(par.in)?) {
+				io::EOF => break,
+				rn: rune => if (!ascii::isspace(rn)) {
+					bufio::unreadrune(par.in, rn); // leave the first non-space rune for the next token
+					break;
+				},
+			};
+			if (ws && n < 1) { // WS (true) requires at least one whitespace rune
+				return syntaxerr;
+			};
+			hadws = n >= 1;
+		},
+	};
+	return hadws;
+};
diff --git a/encoding/xml/types.ha b/encoding/xml/types.ha
@@ -0,0 +1,24 @@
+use encoding::utf8;
+use io;
+
+// A syntax error was encountered in the document.
+export type syntaxerr = void!; // TODO: Add line number?
+
+// Any error which can occur during XML parsing.
+export type error = (syntaxerr | utf8::invalid | io::error)!;
+
+// Represents the state for an XML parser.
+export type parser = struct {
+	orig: *io::stream, // the stream handed to parse()
+	in: *io::stream, // the stream actually read from: orig, or a bufio wrapper around it
+	buf: [4096]u8, // backing storage passed to bufio::buffered by parse()
+};
+
+// Converts an [error] to a user-friendly string representation.
+export fn strerror(err: error) const str = {
+	return match (err) {
+		syntaxerr => "Syntax error",
+		utf8::invalid => "Document is not valid UTF-8",
+		err: io::error => io::strerror(err), // I/O errors defer to io::strerror
+	};
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -213,6 +213,13 @@ encoding_utf8() {
 	gen_ssa encoding::utf8 types
 }
 
+encoding_xml() {
+	gen_srcs encoding::xml \
+		types.ha \
+		parser.ha
+	gen_ssa encoding::xml io bufio strings # NOTE(review): parser.ha also uses ascii and encoding::utf8 - confirm whether they belong in this list
+}
+
 endian() {
 	gen_srcs endian \
 		big.ha \
@@ -605,6 +612,7 @@ crypto_sha512
 dirs
 encoding_hex
 encoding_utf8
+encoding_xml
 endian
 errors
 fmt
diff --git a/stdlib.mk b/stdlib.mk
@@ -105,6 +105,9 @@ hare_stdlib_deps+=$(stdlib_encoding_hex)
 
 stdlib_encoding_utf8=$(HARECACHE)/encoding/utf8/encoding_utf8.o
 hare_stdlib_deps+=$(stdlib_encoding_utf8)
 
+stdlib_encoding_xml=$(HARECACHE)/encoding/xml/encoding_xml.o
+hare_stdlib_deps+=$(stdlib_encoding_xml)
+
 stdlib_endian=$(HARECACHE)/endian/endian.o
 hare_stdlib_deps+=$(stdlib_endian)
@@ -345,6 +348,17 @@ $(HARECACHE)/encoding/utf8/encoding_utf8.ssa: $(stdlib_encoding_utf8_srcs) $(std
 	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::utf8 \
 		-t$(HARECACHE)/encoding/utf8/encoding_utf8.td $(stdlib_encoding_utf8_srcs)
 
+# encoding::xml
+stdlib_encoding_xml_srcs= \
+	$(STDLIB)/encoding/xml/types.ha \
+	$(STDLIB)/encoding/xml/parser.ha
+
+$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/encoding/xml
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::xml \
+		-t$(HARECACHE)/encoding/xml/encoding_xml.td $(stdlib_encoding_xml_srcs)
+
 # endian
 stdlib_endian_srcs= \
 	$(STDLIB)/endian/big.ha \
@@ -900,6 +914,9 @@ hare_testlib_deps+=$(testlib_encoding_hex)
 
 testlib_encoding_utf8=$(TESTCACHE)/encoding/utf8/encoding_utf8.o
 hare_testlib_deps+=$(testlib_encoding_utf8)
 
+testlib_encoding_xml=$(TESTCACHE)/encoding/xml/encoding_xml.o
+hare_testlib_deps+=$(testlib_encoding_xml)
+
 testlib_endian=$(TESTCACHE)/endian/endian.o
 hare_testlib_deps+=$(testlib_endian)
@@ -1144,6 +1161,17 @@ $(TESTCACHE)/encoding/utf8/encoding_utf8.ssa: $(testlib_encoding_utf8_srcs) $(te
 	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::utf8 \
 		-t$(TESTCACHE)/encoding/utf8/encoding_utf8.td $(testlib_encoding_utf8_srcs)
 
+# encoding::xml
+testlib_encoding_xml_srcs= \
+	$(STDLIB)/encoding/xml/types.ha \
+	$(STDLIB)/encoding/xml/parser.ha
+
+$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/encoding/xml
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::xml \
+		-t$(TESTCACHE)/encoding/xml/encoding_xml.td $(testlib_encoding_xml_srcs)
+
 # endian
 testlib_endian_srcs= \
 	$(STDLIB)/endian/big.ha \