hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 5c0d392f2678b91416aaf3f330dc769e0d5d3086
parent 469f041b43a7c6996b39dd6b916699b2d933af91
Author: Eyal Sawady <ecs@d2evs.net>
Date:   Mon, 29 Mar 2021 21:59:54 -0400

compress::flate: new module

Diffstat:
Acompress/flate/inflate.ha | 425+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mscripts/gen-stdlib | 7+++++++
Mstdlib.mk | 26++++++++++++++++++++++++++
3 files changed, 458 insertions(+), 0 deletions(-)

diff --git a/compress/flate/inflate.ha b/compress/flate/inflate.ha @@ -0,0 +1,425 @@ +use bufio; +use bytes; +use endian; +use errors; +use fmt; +use io; + +def BUFSIZE: size = 1 << 16; +def MAXBITS: size = 15; +def MAXCODES: size = 288; + +const fixed_len: huffman = huffman { + counts = [0...], + symbols = [0...], +}; + +const fixed_dist: huffman = huffman { + counts = [0...], + symbols = [0...], +}; + +type huffman = struct { + counts: [MAXBITS]u16, + symbols: [MAXCODES]u16, +}; + +type inflate_err = enum u8 { + EOF, + BTYPE, + LEN, + DISTANCE, + CODE, + HUFFMAN, + TABLE, +}; + +type decompressor = struct { + stream: io::stream, + in: *io::stream, + bitbuf: u32, + cnt: u32, + final: bool, + state: *fn(d: *decompressor) (void | io::EOF | io::error), + left: size, + hl: huffman, + hd: huffman, + + // History ring buffer + hist: []u8, + head: size, + + // Output buffer + buf: []u8, + bufstart: []u8, +}; + +@init fn init_fixed() void = { + // See section 3.2.6 of RFC 1951 + let lens: [288]u16 = [0...]; + let i = 0z; + for (i < 144; i += 1) { + lens[i] = 8; + }; + for (i < 256; i += 1) { + lens[i] = 9; + }; + for (i < 280; i += 1) { + lens[i] = 7; + }; + for (i < 288; i += 1) { + lens[i] = 8; + }; + construct(&fixed_len, lens[..]) as void; + + let dists: [30]u16 = [5...]; + construct(&fixed_dist, dists[..]) as void; +}; + +// Creates a stream which decompresses Deflate (RFC 1951) data. +export fn inflate(s: *io::stream) *io::stream = alloc(decompressor { + stream = io::stream { + reader = &read, + closer = &close, + ... + }, + in = s, + bitbuf = 0, + cnt = 0, + final = false, + state = &uncompressed_read, + left = 0, + + hist = [], + head = 0, + + buf = [], + bufstart = [], +}): *io::stream; + +// Read [want] bits from the decompressor +fn bits(d: *decompressor, want: u32) (u32 | io::error) = { + assert(want <= 32); + let val = d.bitbuf; + for (d.cnt < want) { + let buf: [_]u8 = [0]; + match (io::read(d.in, buf)?) { + io::EOF => return wraperror(inflate_err::EOF), + z: size => if (z < 1) { + continue; // Short read, retry + }, + }; + val |= buf[0] << d.cnt; + d.cnt += 8; + }; + d.bitbuf = val >> want; + d.cnt -= want; + assert(d.cnt <= 7); + return val & (1 << want) - 1; +}; + +fn put(d: *decompressor, b: u8...) void = { + assert(len(d.buf) == 0 || d.buf: *[*]u8 == d.bufstart: *[*]u8); + assert(len(d.hist) <= BUFSIZE); + if (len(d.buf) == 0) { + d.bufstart = d.bufstart[..0]; + }; + append(d.bufstart, ...b); + d.buf = d.bufstart; + let n = BUFSIZE - len(d.hist); + if (n > len(b)) { + n = len(b); + }; + append(d.hist, ...b[..n]); + b = b[n..]; + d.head += n; + for (let s = b; len(s) != 0) { + assert(len(d.buf) == BUFSIZE); + const toput = if (d.head + len(s) > len(d.hist)) + len(d.hist) - d.head else len(s); + d.hist[d.head..d.head + toput] = s[..toput]; + d.head = (d.head + toput) % BUFSIZE; + s = s[toput..]; + }; +}; + +fn copy(d: *decompressor, back: size, n: size) (void | io::error) = { + if (len(d.hist) < back) { + return wraperror(inflate_err::DISTANCE); + }; + for (let i = 0z; i < n; i += 1) { + put(d, d.hist[(d.head - back + BUFSIZE) % BUFSIZE]); + }; +}; + +fn decode(d: *decompressor, h: huffman) (u16 | io::error) = { + let code = 0u32, index = 0u32, first = 0u32; + for (let i = 1z; i < MAXBITS; i += 1) { + code |= bits(d, 1)?; + if (code < first + h.counts[i]) { + return h.symbols[index + (code - first)]; + }; + index += h.counts[i]; + first += h.counts[i]; + first <<= 1; + code <<= 1; + }; + return wraperror(inflate_err::CODE); +}; + +fn construct(h: *huffman, lens: []u16) (void | io::error) = { + h.counts = [0...]; + for (let i = 0z; i < len(lens); i += 1) { + h.counts[lens[i]] += 1; + }; + if (h.counts[0] == len(lens)) return; + + for (let left = 1z, i = 1z; i < MAXBITS; i += 1) { + left <<= 1; + left -= h.counts[i]; + if (left < 0) return wraperror(inflate_err::HUFFMAN); + }; + + let offs: [MAXBITS + 1]u16 = [0...]; + for (let i = 1z; i < MAXBITS; i += 1) { + offs[i + 1] = offs[i] + h.counts[i]; + }; + + for (let i = 0u16; i < len(lens); i += 1) if (lens[i] != 0) { + let off = offs[lens[i]]; + offs[lens[i]] += 1; + h.symbols[off] = i; + }; +}; + +fn compressed_read(d: *decompressor) (void | io::EOF | io::error) = { + static const lbases: [_]u32 = [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, + 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, + ]; + static const lexts: [_]u32 = [ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 0, + ]; + static const dbases: [_]u32 = [ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, + 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, + 16385, 24577, + ]; + static const dexts: [_]u32 = [ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, + 9, 10, 10, 11, 11, 12, 12, 13, 13, + ]; + + let c = decode(d, d.hl)?; + if (c < 256) { + put(d, c: u8); + return; + }; + if (c == 256) { + d.left = 0; + return; + }; + + const n = lbases[c - 257] + bits(d, lexts[c - 257])?; + c = decode(d, d.hd)?; + const back = dbases[c] + bits(d, dexts[c])?; + copy(d, back, n)?; +}; + +fn uncompressed_read(d: *decompressor) (void | io::EOF | io::error) = { + static let _buf: [1024]u8 = [0...]; + let buf = if (len(_buf) > d.left: size) _buf[..d.left] else _buf[..]; + let z = match (io::read(d.in, buf)?) { + z: size => { + d.left -= z; + z; + }, + io::EOF => return wraperror(inflate_err::EOF), + }; + put(d, buf[..z]...); +}; + +fn opaque_strerror( + data: *errors::opaque_data +) const str = switch (*(data: *inflate_err)) { + inflate_err::EOF => "Unexpected EOF", + inflate_err::BTYPE => "Invalid block type", + inflate_err::LEN => "Invalid block length", + inflate_err::DISTANCE => "Invalid distance", + inflate_err::CODE => "Invalid Huffman code", + inflate_err::HUFFMAN => "Oversubscribed Huffman code", + inflate_err::TABLE => "Invalid dynamic Huffman code table", +}; + +fn wraperror(err: inflate_err) errors::opaque = { + static assert(size(inflate_err) <= size(error::opaque_data)); + let wrapped = errors::opaque { strerror = &opaque_strerror, ... }; + let myptr = &wrapped.data: *inflate_err; + *myptr = err; + return wrapped; +}; + +fn dynamic(d: *decompressor) (void | io::EOF | io::error) = { + const hlit = bits(d, 5)? + 257; + const hdist = bits(d, 5)? + 1; + const hclen = bits(d, 4)? + 4; + + if (hlit > 286 || hdist > 30) return wraperror(inflate_err::TABLE); + let codes: [19]u16 = [0...]; + static const order = [ + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, + 15, + ]; + for (let i = 0z; i < hclen; i += 1) { + codes[order[i]] = bits(d, 3)?: u16; + }; + let hcl = huffman { + counts = [0...], + symbols = [0...], + }; + construct(&hcl, codes[..])?; + let lens: []u16 = []; + defer free(lens); + for (len(lens) < hlit + hdist) { + let c = decode(d, hcl)?; + if (c < 16) { + append(lens, c); + continue; + }; + let n = 0u16; + let times = switch (c) { + 16 => { + if (len(lens) == 0) { + return wraperror(inflate_err::TABLE); + }; + n = lens[len(lens) - 1]; + 3 + bits(d, 2)?; + }, + 17 => 3 + bits(d, 3)?, + 18 => 11 + bits(d, 7)?, + * => abort(), + }; + for (times > 0; times -= 1) { + append(lens, n); + }; + }; + if (len(lens) < 256 || lens[256] == 0) { + return wraperror(inflate_err::TABLE); + }; + construct(&d.hl, lens[..hlit])?; + construct(&d.hd, lens[hlit..hlit + hdist])?; + d.state = &compressed_read; + d.left = 1; +}; + +fn next(d: *decompressor) (void | io::EOF | io::error) = { + if (d.final) { + return io::EOF; + }; + if (bits(d, 1)? == 1) { + d.final = true; + }; + return switch (bits(d, 2)?) { + 0b00 => { + // Skip to next byte boundary + d.cnt = 0; + d.bitbuf = 0; + let buf: [4]u8 = [0...]; + for (let n = 0z; n < 4) { + match (io::read(d.in, buf[n..])?) { + io::EOF => return wraperror(inflate_err::EOF), + z: size => n += z, + }; + }; + const length = endian::legetu16(buf[..2]); + if (length != ~endian::legetu16(buf[2..])) { + return wraperror(inflate_err::LEN); + }; + d.state = &uncompressed_read; + d.left = length; + }, + 0b01 => { + d.hl = fixed_len; + d.hd = fixed_dist; + d.state = &compressed_read; + d.left = 1; + }, + 0b10 => dynamic(d), + 0b11 => wraperror(inflate_err::BTYPE), + * => abort(), + }; +}; + +fn read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = { + const s = s: *decompressor; + let n = 0z; + let a = buf; + for (n < len(buf)) { + if (len(s.buf) == 0) { + if (s.left == 0) match (next(s)?) { + void => void, + io::EOF => return if (n == 0) io::EOF else n, + }; + match (s.state(s)?) { + void => void, + io::EOF => return if (n == 0) io::EOF else n, + }; + }; + const toread = + if (len(s.buf) > len(buf)) len(buf) else len(s.buf); + buf[..toread] = s.buf[..toread]; + s.buf = s.buf[toread..]; + buf = buf[toread..]; + n += toread; + }; + return n; +}; + +fn close(s: *io::stream) void = { + const s = s: *decompressor; + free(s.bufstart); + free(s.hist); +}; + +// Further testing is done in compress::zlib +@test fn inflate() void = { + const in: []u8 = [ + // Non-final block, uncompressed + 0b00000000, + // 4 bytes long + 0b00000100, + 0b00000000, + // ~LEN + 0b11111011, + 0b11111111, + // Data + 0xDE, 0xAD, 0xBE, 0xEF, + // Final block, uncompressed + 0b00000001, + // 2 bytes long + 0b00000010, + 0b00000000, + // ~LEN + 0b11111101, + 0b11111111, + // Data + 0x42, 0x69, + ]; + let expected: []u8 = [0xDE, 0xAD, 0xBE, 0xEF, 0x42, 0x69]; + + let ins = bufio::fixed(in, io::mode::READ); + let outs = bufio::dynamic(io::mode::WRITE); + let s = inflate(ins); + defer io::close(s); + match (io::copy(outs, s)) { + size => void, + e: io::error => { + fmt::errorln(io::strerror(e)); + abort(); + }, + }; + let out = bufio::finish(outs); + defer free(out); + assert(bytes::equal(expected, out)); +}; diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib @@ -125,6 +125,12 @@ bytes() { gen_ssa bytes types } +compress_flate() { + gen_srcs compress::flate \ + inflate.ha + gen_ssa compress::flate bufio bytes endian errors io fmt +} + crypto_math() { gen_srcs crypto::math \ bits.ha @@ -607,6 +613,7 @@ printf '# This file is generated by the gen-stdlib script, do not edit it by han modules="ascii bufio bytes +compress_flate crypto_math crypto_random crypto_md5 diff --git a/stdlib.mk b/stdlib.mk @@ -78,6 +78,9 @@ hare_stdlib_deps+=$(stdlib_bufio) stdlib_bytes=$(HARECACHE)/bytes/bytes.o hare_stdlib_deps+=$(stdlib_bytes) +stdlib_compress_flate=$(HARECACHE)/compress/flate/compress_flate.o +hare_stdlib_deps+=$(stdlib_compress_flate) + stdlib_crypto_math=$(HARECACHE)/crypto/math/crypto_math.o hare_stdlib_deps+=$(stdlib_crypto_math) @@ -255,6 +258,16 @@ $(HARECACHE)/bytes/bytes.ssa: $(stdlib_bytes_srcs) $(stdlib_rt) $(stdlib_types) @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nbytes \ -t$(HARECACHE)/bytes/bytes.td $(stdlib_bytes_srcs) +# compress::flate +stdlib_compress_flate_srcs= \ + $(STDLIB)/compress/flate/inflate.ha + +$(HARECACHE)/compress/flate/compress_flate.ssa: $(stdlib_compress_flate_srcs) $(stdlib_rt) $(stdlib_bufio) $(stdlib_bytes) $(stdlib_endian) $(stdlib_errors) $(stdlib_io) $(stdlib_fmt) + @printf 'HAREC \t$@\n' + @mkdir -p $(HARECACHE)/compress/flate + @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Ncompress::flate \ + -t$(HARECACHE)/compress/flate/compress_flate.td $(stdlib_compress_flate_srcs) + # crypto::math stdlib_crypto_math_srcs= \ $(STDLIB)/crypto/math/bits.ha @@ -891,6 +904,9 @@ hare_testlib_deps+=$(testlib_bufio) testlib_bytes=$(TESTCACHE)/bytes/bytes.o hare_testlib_deps+=$(testlib_bytes) +testlib_compress_flate=$(TESTCACHE)/compress/flate/compress_flate.o +hare_testlib_deps+=$(testlib_compress_flate) + testlib_crypto_math=$(TESTCACHE)/crypto/math/crypto_math.o hare_testlib_deps+=$(testlib_crypto_math) @@ -1068,6 +1084,16 @@ $(TESTCACHE)/bytes/bytes.ssa: $(testlib_bytes_srcs) $(testlib_rt) $(testlib_type @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nbytes \ -t$(TESTCACHE)/bytes/bytes.td $(testlib_bytes_srcs) +# compress::flate +testlib_compress_flate_srcs= \ + $(STDLIB)/compress/flate/inflate.ha + +$(TESTCACHE)/compress/flate/compress_flate.ssa: $(testlib_compress_flate_srcs) $(testlib_rt) $(testlib_bufio) $(testlib_bytes) $(testlib_endian) $(testlib_errors) $(testlib_io) $(testlib_fmt) + @printf 'HAREC \t$@\n' + @mkdir -p $(TESTCACHE)/compress/flate + @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Ncompress::flate \ + -t$(TESTCACHE)/compress/flate/compress_flate.td $(testlib_compress_flate_srcs) + # crypto::math testlib_crypto_math_srcs= \ $(STDLIB)/crypto/math/bits.ha