encode.ha (1733B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

// The value of this rune is not a valid Unicode codepoint.
export type invalidcodepoint = !rune;

// Returns true if cp is at most 0x10FFFF and lies outside the surrogate range
// 0xD800-0xDFFF.
fn isvalidcodepoint(cp: u32) bool = {
	return (cp < 0xD800 || cp > 0xDFFF) && cp <= 0x10FFFF;
};

// Encodes a rune as UTF-8 and returns the result as a slice. The return value
// is statically allocated, and will not be consistent after subsequent calls to
// encoderune. Returns [[invalidcodepoint]] if the rune is a surrogate
// (0xD800-0xDFFF) or is greater than 0x10FFFF.
export fn encoderune(r: rune) ([]u8 | invalidcodepoint) = {
	let ch = r: u32, n = 0z, first = 0u8;
	if (!isvalidcodepoint(ch)) {
		return r: invalidcodepoint;
	};

	// Select the encoded length (n) and the marker bits of the leading
	// byte (first) from the magnitude of the codepoint.
	if (ch < 0x80) {
		first = 0;
		n = 1;
	} else if (ch < 0x800) {
		first = 0xC0;
		n = 2;
	} else if (ch < 0x10000) {
		first = 0xE0;
		n = 3;
	} else {
		first = 0xF0;
		n = 4;
	};

	static let buf: [4]u8 = [0...];
	// Fill the continuation bytes from last to first; each one carries the
	// low six bits of what remains of ch, tagged with 0x80.
	for (let i = n - 1; i > 0; i -= 1) {
		buf[i] = ch: u8 & 0x3F | 0x80;
		ch >>= 6;
	};
	// The remaining high bits are combined with the leading byte marker.
	buf[0] = ch: u8 | first;
	return buf[..n];
};

@test fn encode() void = {
	const testcases: [](rune, bool, []u8) = [
		// input rune
		// |               expects error
		// |               |      expected encoding
		('\0',             false, [0]),
		('%',              false, [0x25]),
		('こ',             false, [0xE3, 0x81, 0x93]),
		// Boundaries between the 1-, 2-, 3- and 4-byte forms
		(0x7F: rune,       false, [0x7F]),
		(0x80: rune,       false, [0xC2, 0x80]),
		(0x7FF: rune,      false, [0xDF, 0xBF]),
		(0x800: rune,      false, [0xE0, 0xA0, 0x80]),
		(0xFFFF: rune,     false, [0xEF, 0xBF, 0xBF]),
		(0x10000: rune,    false, [0xF0, 0x90, 0x80, 0x80]),
		(0x10FFFF: rune,   false, [0xF4, 0x8F, 0xBF, 0xBF]),
		// Codepoints adjacent to the surrogate range
		(0xD7FF: rune,     false, [0xED, 0x9F, 0xBF]),
		(0xE000: rune,     false, [0xEE, 0x80, 0x80]),
		// Invalid codepoints
		(0xD800: rune,     true,  []),
		(0xDF00: rune,     true,  []),
		(0xDFFF: rune,     true,  []),
		(0x110000: rune,   true,  []),
	];

	for (let i = 0z; i < len(testcases); i += 1) {
		const tc = testcases[i];
		const input = tc.0;
		const want = tc.2;

		match (encoderune(input)) {
		case invalidcodepoint =>
			assert(tc.1, "want []u8, got invalidcodepoint");
		case let got: []u8 =>
			assert(!tc.1, "want invalidcodepoint, got []u8");
			// Compare lengths first so that an encoding with extra
			// trailing bytes cannot pass the byte-wise check below.
			assert(len(got) == len(want), "[]u8 length mismatch");
			for (let j = 0z; j < len(want); j += 1) {
				assert(got[j] == want[j],
					"[]u8 mismatch");
			};
		};
	};
};