hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit b2d01b32f15688f225c4b8a01fb879c33ce4419e
parent 9c2d4ba4dafa4679127edcb26078a9ddb8695010
Author: Autumn! <autumnull@posteo.net>
Date:   Thu,  7 Sep 2023 14:47:24 +0000

rt: copy from musl in memset

Signed-off-by: Autumn! <autumnull@posteo.net>

Diffstat:
Mrt/memset.ha | 75+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/rt/memset.ha b/rt/memset.ha @@ -1,9 +1,76 @@ // License: MPL-2.0 // (c) 2021 Drew DeVault <sir@cmpwn.com> -export fn memset(dest: *opaque, val: u8, amt: size) void = { - let a = dest: *[*]u8; - for (let i = 0z; i < amt; i += 1) { - a[i] = val; +export fn memset(dest: *opaque, val: u8, n: size) void = { + // implementation adapted from musl libc + + let d = memfunc_ptr { byte = dest: *[*]u8 }; + + // fill 4 bytes of head and tail with minimal branching. the head/tail + // regions may overlap, in which case we return early, and if not + // then we infer that we can fill twice as much on the next round. + if (n == 0) return; + d.byte[0] = val; + d.byte[n-1] = val; + if (n <= 2) return; + d.byte[1] = val; + d.byte[2] = val; + d.byte[n-2] = val; + d.byte[n-3] = val; + if (n <= 6) return; + d.byte[3] = val; + d.byte[n-4] = val; + // NOTE: we could do more here but the work would be duplicated later + if (n <= 8) return; + + // advance pointer to align it at a 4-byte boundary, + // and truncate n to a multiple of 4. the previous code + // already took care of any head/tail that get cut off + // by the alignment + let diff = -d.uptr & 0b11; + d.uptr = d.uptr + diff; + n -= diff; + // convert length in u8 to u32, truncating it in the process + n >>= 2; + + // 4-byte copy of val + let val32 = 0x01010101u32 * val; + + // fill 7 u32s (28 bytes) of head and tail, using the same process + // as before. we don't need to check for n == 0 because we advanced <4 + // bytes out of more than 8, so there's at least one u32 left. + d.quad[0] = val32; + d.quad[n-1] = val32; + if (n <= 2) return; + d.quad[1] = val32; + d.quad[2] = val32; + d.quad[n-2] = val32; + d.quad[n-3] = val32; + if (n <= 6) return; + d.quad[3] = val32; + d.quad[4] = val32; + d.quad[5] = val32; + d.quad[6] = val32; + d.quad[n-4] = val32; + d.quad[n-5] = val32; + d.quad[n-6] = val32; + d.quad[n-7] = val32; + + // align to a multiple of 8 so we can copy as u64. + // NOTE: the 24 here skips over most of the head we just filled, + // while making sure that diff <= 28 + diff = 24 + (d.uptr & 4); + d.uptr = d.uptr + diff; + n -= diff >> 2; + + // 28 tail bytes have already been filled, so any remainder + // when n <= 7 (28 bytes) can be safely ignored + const val64 = (val32: u64 << 32) | val32; + for (8 <= n; n -= 8) { + d.octs[0] = val64; + d.octs[1] = val64; + d.octs[2] = val64; + d.octs[3] = val64; + d.uptr += 32; }; };