hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

memset.ha (2179B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 export fn memset(dest: *opaque, val: u8, n: size) void = {
        	// Stores val into each of the n bytes starting at dest.
        	//   dest: start of the region to fill
        	//   val:  byte value to store
        	//   n:    number of bytes to fill; 0 stores nothing
        	// The parameter n is reused below as a working counter, first
        	// in bytes and, after the shift by 2, in u32 units.
        	//
      5 	// implementation adapted from musl libc
      6 
        	// NOTE(review): memfunc_ptr is declared outside this view. It is
        	// used here as if byte, quad, octs and uptr were overlapping
        	// views of one address (u8 / u32 / u64 arrays plus an integer
        	// address) -- confirm against its definition.
      7 	let d = memfunc_ptr { byte = dest: *[*]u8 };
      8 
      9 	// fill 4 bytes of head and tail with minimal branching. the head/tail
     10 	// regions may overlap, in which case we return early, and if not
     11 	// then we infer that we can fill twice as much on the next round.
     12 	if (n == 0) return;
     13 	d.byte[0] = val;
     14 	d.byte[n-1] = val;
        	// n is 1 or 2: bytes 0 and n-1 already cover the whole region
     15 	if (n <= 2) return;
     16 	d.byte[1] = val;
     17 	d.byte[2] = val;
     18 	d.byte[n-2] = val;
     19 	d.byte[n-3] = val;
        	// n is 3..6: bytes 0..2 and n-3..n-1 cover the whole region
     20 	if (n <= 6) return;
     21 	d.byte[3] = val;
     22 	d.byte[n-4] = val;
     23 	// NOTE: we could do more here but the work would be duplicated later
        	// n is 7 or 8: bytes 0..3 and n-4..n-1 cover the whole region
     24 	if (n <= 8) return;
     25 
     26 	// advance pointer to align it at a 4-byte boundary,
     27 	// and truncate n to a multiple of 4. the previous code
     28 	// already took care of any head/tail that get cut off
     29 	// by the alignment
        	// diff is the distance to the next 4-byte boundary (0 to 3)
     30 	let diff = -d.uptr & 0b11;
     31 	d.uptr = d.uptr + diff;
     32 	n -= diff;
     33 	// convert length in u8 to u32, truncating it in the process
     34 	n >>= 2;
     35 
     36 	// 4-byte copy of val
        	// (multiplying by 0x01010101 places val in each of the 4 bytes)
     37 	let val32 = 0x01010101u32 * val;
     38 
     39 	// fill 7 u32s (28 bytes) of head and tail, using the same process
     40 	// as before. we don't need to check for n == 0 because we advanced <4
     41 	// bytes out of more than 8, so there's at least one u32 left.
     42 	d.quad[0] = val32;
     43 	d.quad[n-1] = val32;
        	// n is 1 or 2 u32s: the first and last stores cover everything
     44 	if (n <= 2) return;
     45 	d.quad[1] = val32;
     46 	d.quad[2] = val32;
     47 	d.quad[n-2] = val32;
     48 	d.quad[n-3] = val32;
        	// n is 3..6 u32s: quads 0..2 and n-3..n-1 cover everything
     49 	if (n <= 6) return;
     50 	d.quad[3] = val32;
     51 	d.quad[4] = val32;
     52 	d.quad[5] = val32;
     53 	d.quad[6] = val32;
     54 	d.quad[n-4] = val32;
     55 	d.quad[n-5] = val32;
     56 	d.quad[n-6] = val32;
     57 	d.quad[n-7] = val32;
     58 
     59 	// align to a multiple of 8 so we can copy as u64.
     60 	// NOTE: the 24 here skips over most of the head we just filled,
     61 	// while making sure that diff <= 28
        	// d.uptr is a multiple of 4 here, so (d.uptr & 4) is 0 or 4;
        	// diff is therefore 24 or 28 and d.uptr + diff is a multiple
        	// of 8. n > 6 at this point, so subtracting diff >> 2 (6 or 7
        	// u32s) cannot underflow.
     62 	diff = 24 + (d.uptr & 4);
     63 	d.uptr = d.uptr + diff;
     64 	n -= diff >> 2;
     65 
     66 	// 28 tail bytes have already been filled, so any remainder
     67 	// when n <= 7 (28 bytes) can be safely ignored
     68 	const val64 = (val32: u64 << 32) | val32;
        	// each pass stores four u64s (32 bytes, i.e. 8 of the u32 units
        	// counted by n) and advances the pointer by the same 32 bytes
     69 	for (8 <= n; n -= 8) {
     70 		d.octs[0] = val64;
     71 		d.octs[1] = val64;
     72 		d.octs[2] = val64;
     73 		d.octs[3] = val64;
     74 		d.uptr += 32;
     75 	};
     76 };