memset.ha (2179B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

// Stores val into each of the n bytes starting at dest. Returns immediately
// without touching memory when n is zero.
export fn memset(dest: *opaque, val: u8, n: size) void = {
	// Adapted from the musl libc implementation.
	//
	// memfunc_ptr is declared outside this file. The code below relies on
	// its byte, quad, octs and uptr fields all viewing the same address, so
	// that bumping uptr moves every typed view at once.
	let p = memfunc_ptr { byte = dest: *[*]u8 };

	// Stage 1: byte stores at both ends. Up to four bytes of head and four
	// bytes of tail are written with as few branches as possible. For small
	// n the two ends overlap and we are done; otherwise each guard that
	// falls through proves there is room for the next, larger batch.
	if (n == 0) {
		return;
	};
	p.byte[0] = val;
	p.byte[n - 1] = val;
	if (n <= 2) {
		return;
	};
	p.byte[1] = val;
	p.byte[2] = val;
	p.byte[n - 3] = val;
	p.byte[n - 2] = val;
	if (n <= 6) {
		return;
	};
	p.byte[3] = val;
	p.byte[n - 4] = val;
	// Writing more single bytes here would only repeat what the word
	// stores below do anyway.
	if (n <= 8) {
		return;
	};

	// Stage 2: move up to the next 4-byte boundary (0..3 bytes). The bytes
	// skipped at the front, and the ones dropped at the back by the
	// truncating shift, were already written in stage 1.
	const misalign = -p.uptr & 0b11;
	p.uptr += misalign;
	n -= misalign;
	// From here on n counts u32 words, not bytes.
	n >>= 2;

	// val replicated into all four bytes of a word.
	const word = 0x01010101u32 * val;

	// Stage 3: same head/tail scheme as stage 1, now with up to seven words
	// (28 bytes) per end. n cannot be zero here: more than 8 bytes remained
	// and fewer than 4 were skipped, so at least one whole word is left.
	p.quad[0] = word;
	p.quad[n - 1] = word;
	if (n <= 2) {
		return;
	};
	p.quad[1] = word;
	p.quad[2] = word;
	p.quad[n - 3] = word;
	p.quad[n - 2] = word;
	if (n <= 6) {
		return;
	};
	p.quad[3] = word;
	p.quad[4] = word;
	p.quad[5] = word;
	p.quad[6] = word;
	p.quad[n - 7] = word;
	p.quad[n - 6] = word;
	p.quad[n - 5] = word;
	p.quad[n - 4] = word;

	// Stage 4: step to an 8-byte boundary for the u64 stores. Skipping 24
	// bytes (plus 4 more if the address is only 4-byte aligned) jumps past
	// most of the head written above while never exceeding the 28 bytes
	// that are known to be filled.
	const realign = 24 + (p.uptr & 4);
	p.uptr += realign;
	n -= realign >> 2;

	// Stage 5: bulk fill, 32 bytes (eight words) per iteration. Whatever is
	// left once fewer than eight words remain lies inside the 28 tail bytes
	// already written in stage 3, so no cleanup pass is needed.
	const wide = word: u64;
	const dword = (wide << 32) | wide;
	for (n >= 8; n -= 8) {
		p.octs[0] = dword;
		p.octs[1] = dword;
		p.octs[2] = dword;
		p.octs[3] = dword;
		p.uptr += 32;
	};
};