hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit fa8d5b5f83d66760c833d887e9ada0f300bcbfc7
parent 080d6d83136b1a7955e35f6f2bff74a0b3760cb7
Author: Armin Preiml <apreiml@strohwolke.at>
Date:   Fri, 26 Nov 2021 13:34:05 +0100

implement AES-NI for x86_64

Signed-off-by: Armin Preiml <apreiml@strohwolke.at>
Signed-off-by: Drew DeVault <sir@cmpwn.com>

Diffstat:
A crypto/aes/+test/ni+x86_64.ha | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crypto/aes/+x86_64/ni.ha | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crypto/aes/+x86_64/ni_native.s | 500 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M scripts/gen-stdlib | 9 ++++++---
M stdlib.mk | 4 ++--
5 files changed, 852 insertions(+), 5 deletions(-)

diff --git a/crypto/aes/+test/ni+x86_64.ha b/crypto/aes/+test/ni+x86_64.ha @@ -0,0 +1,280 @@ +use bytes; +use crypto::cipher; + +const zero_rk: [RKLEN_256]u8 = [0...]; + +// taken from fips-197.pdf Section A.1 +@test fn ni_enc_key_expand_128() void = { + const key: [16]u8 = [ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, + 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c + ]; + + const expected_rounds: [_]u8 = [ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, + 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c, + 0xa0, 0xfa, 0xfe, 0x17, 0x88, 0x54, 0x2c, 0xb1, + 0x23, 0xa3, 0x39, 0x39, 0x2a, 0x6c, 0x76, 0x05, + 0xf2, 0xc2, 0x95, 0xf2, 0x7a, 0x96, 0xb9, 0x43, + 0x59, 0x35, 0x80, 0x7a, 0x73, 0x59, 0xf6, 0x7f, + 0x3d, 0x80, 0x47, 0x7d, 0x47, 0x16, 0xfe, 0x3e, + 0x1e, 0x23, 0x7e, 0x44, 0x6d, 0x7a, 0x88, 0x3b, + 0xef, 0x44, 0xa5, 0x41, 0xa8, 0x52, 0x5b, 0x7f, + 0xb6, 0x71, 0x25, 0x3b, 0xdb, 0x0b, 0xad, 0x00, + 0xd4, 0xd1, 0xc6, 0xf8, 0x7c, 0x83, 0x9d, 0x87, + 0xca, 0xf2, 0xb8, 0xbc, 0x11, 0xf9, 0x15, 0xbc, + 0x6d, 0x88, 0xa3, 0x7a, 0x11, 0x0b, 0x3e, 0xfd, + 0xdb, 0xf9, 0x86, 0x41, 0xca, 0x00, 0x93, 0xfd, + 0x4e, 0x54, 0xf7, 0x0e, 0x5f, 0x5f, 0xc9, 0xf3, + 0x84, 0xa6, 0x4f, 0xb2, 0x4e, 0xa6, 0xdc, 0x4f, + 0xea, 0xd2, 0x73, 0x21, 0xb5, 0x8d, 0xba, 0xd2, + 0x31, 0x2b, 0xf5, 0x60, 0x7f, 0x8d, 0x29, 0x2f, + 0xac, 0x77, 0x66, 0xf3, 0x19, 0xfa, 0xdc, 0x21, + 0x28, 0xd1, 0x29, 0x41, 0x57, 0x5c, 0x00, 0x6e, + 0xd0, 0x14, 0xf9, 0xa8, 0xc9, 0xee, 0x25, 0x89, + 0xe1, 0x3f, 0x0c, 0xc8, 0xb6, 0x63, 0x0c, 0xa6, + ]; + + let block = x86ni(); + x86ni_init(&block, key[..]); + + assert(block.rklen == 176); + assert(bytes::equal(expected_rounds[..], block.enc_rk[..RKLEN_128])); + + cipher::finish(&block); + assert(bytes::equal(zero_rk[..], block.enc_rk[..])); + assert(bytes::equal(zero_rk[..], block.dec_rk[..])); +}; + +// taken from fips-197.pdf Section A.2 +@test fn ni_enc_key_expand_192() void = { + const key: [24]u8 = [ + 0x8e, 0x73, 0xb0, 0xf7, 0xda, 0x0e, 0x64, 0x52, + 0xc8, 0x10, 
0xf3, 0x2b, 0x80, 0x90, 0x79, 0xe5, + 0x62, 0xf8, 0xea, 0xd2, 0x52, 0x2c, 0x6b, 0x7b + ]; + + const expected_rounds: [_]u8 = [ + 0x8e, 0x73, 0xb0, 0xf7, 0xda, 0x0e, 0x64, 0x52, + 0xc8, 0x10, 0xf3, 0x2b, 0x80, 0x90, 0x79, 0xe5, + 0x62, 0xf8, 0xea, 0xd2, 0x52, 0x2c, 0x6b, 0x7b, + 0xfe, 0x0c, 0x91, 0xf7, 0x24, 0x02, 0xf5, 0xa5, + 0xec, 0x12, 0x06, 0x8e, 0x6c, 0x82, 0x7f, 0x6b, + 0x0e, 0x7a, 0x95, 0xb9, 0x5c, 0x56, 0xfe, 0xc2, + 0x4d, 0xb7, 0xb4, 0xbd, 0x69, 0xb5, 0x41, 0x18, + 0x85, 0xa7, 0x47, 0x96, 0xe9, 0x25, 0x38, 0xfd, + 0xe7, 0x5f, 0xad, 0x44, 0xbb, 0x09, 0x53, 0x86, + 0x48, 0x5a, 0xf0, 0x57, 0x21, 0xef, 0xb1, 0x4f, + 0xa4, 0x48, 0xf6, 0xd9, 0x4d, 0x6d, 0xce, 0x24, + 0xaa, 0x32, 0x63, 0x60, 0x11, 0x3b, 0x30, 0xe6, + 0xa2, 0x5e, 0x7e, 0xd5, 0x83, 0xb1, 0xcf, 0x9a, + 0x27, 0xf9, 0x39, 0x43, 0x6a, 0x94, 0xf7, 0x67, + 0xc0, 0xa6, 0x94, 0x07, 0xd1, 0x9d, 0xa4, 0xe1, + 0xec, 0x17, 0x86, 0xeb, 0x6f, 0xa6, 0x49, 0x71, + 0x48, 0x5f, 0x70, 0x32, 0x22, 0xcb, 0x87, 0x55, + 0xe2, 0x6d, 0x13, 0x52, 0x33, 0xf0, 0xb7, 0xb3, + 0x40, 0xbe, 0xeb, 0x28, 0x2f, 0x18, 0xa2, 0x59, + 0x67, 0x47, 0xd2, 0x6b, 0x45, 0x8c, 0x55, 0x3e, + 0xa7, 0xe1, 0x46, 0x6c, 0x94, 0x11, 0xf1, 0xdf, + 0x82, 0x1f, 0x75, 0x0a, 0xad, 0x07, 0xd7, 0x53, + 0xca, 0x40, 0x05, 0x38, 0x8f, 0xcc, 0x50, 0x06, + 0x28, 0x2d, 0x16, 0x6a, 0xbc, 0x3c, 0xe7, 0xb5, + 0xe9, 0x8b, 0xa0, 0x6f, 0x44, 0x8c, 0x77, 0x3c, + 0x8e, 0xcc, 0x72, 0x04, 0x01, 0x00, 0x22, 0x02, + ]; + + let block = x86ni(); + x86ni_init(&block, key[..]); + + assert(block.rklen == 208); + assert(bytes::equal(expected_rounds[..], block.enc_rk[..RKLEN_192])); +}; + + +// taken from fips-197.pdf Section A.3 +@test fn ni_enc_key_expand_256() void = { + const key: [32]u8 = [ + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, + 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4, + ]; + + const expected_rounds: [_]u8 = [ + 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 
+ 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4, + 0x9b, 0xa3, 0x54, 0x11, 0x8e, 0x69, 0x25, 0xaf, + 0xa5, 0x1a, 0x8b, 0x5f, 0x20, 0x67, 0xfc, 0xde, + 0xa8, 0xb0, 0x9c, 0x1a, 0x93, 0xd1, 0x94, 0xcd, + 0xbe, 0x49, 0x84, 0x6e, 0xb7, 0x5d, 0x5b, 0x9a, + 0xd5, 0x9a, 0xec, 0xb8, 0x5b, 0xf3, 0xc9, 0x17, + 0xfe, 0xe9, 0x42, 0x48, 0xde, 0x8e, 0xbe, 0x96, + 0xb5, 0xa9, 0x32, 0x8a, 0x26, 0x78, 0xa6, 0x47, + 0x98, 0x31, 0x22, 0x29, 0x2f, 0x6c, 0x79, 0xb3, + 0x81, 0x2c, 0x81, 0xad, 0xda, 0xdf, 0x48, 0xba, + 0x24, 0x36, 0x0a, 0xf2, 0xfa, 0xb8, 0xb4, 0x64, + 0x98, 0xc5, 0xbf, 0xc9, 0xbe, 0xbd, 0x19, 0x8e, + 0x26, 0x8c, 0x3b, 0xa7, 0x09, 0xe0, 0x42, 0x14, + 0x68, 0x00, 0x7b, 0xac, 0xb2, 0xdf, 0x33, 0x16, + 0x96, 0xe9, 0x39, 0xe4, 0x6c, 0x51, 0x8d, 0x80, + 0xc8, 0x14, 0xe2, 0x04, 0x76, 0xa9, 0xfb, 0x8a, + 0x50, 0x25, 0xc0, 0x2d, 0x59, 0xc5, 0x82, 0x39, + 0xde, 0x13, 0x69, 0x67, 0x6c, 0xcc, 0x5a, 0x71, + 0xfa, 0x25, 0x63, 0x95, 0x96, 0x74, 0xee, 0x15, + 0x58, 0x86, 0xca, 0x5d, 0x2e, 0x2f, 0x31, 0xd7, + 0x7e, 0x0a, 0xf1, 0xfa, 0x27, 0xcf, 0x73, 0xc3, + 0x74, 0x9c, 0x47, 0xab, 0x18, 0x50, 0x1d, 0xda, + 0xe2, 0x75, 0x7e, 0x4f, 0x74, 0x01, 0x90, 0x5a, + 0xca, 0xfa, 0xaa, 0xe3, 0xe4, 0xd5, 0x9b, 0x34, + 0x9a, 0xdf, 0x6a, 0xce, 0xbd, 0x10, 0x19, 0x0d, + 0xfe, 0x48, 0x90, 0xd1, 0xe6, 0x18, 0x8d, 0x0b, + 0x04, 0x6d, 0xf3, 0x44, 0x70, 0x6c, 0x63, 0x1e, + ]; + + let block = x86ni(); + x86ni_init(&block, key[..]); + + assert(block.rklen == 240); + assert(bytes::equal(expected_rounds[..], block.enc_rk[..RKLEN_256])); +}; + +@test fn ni_test_encrypt_128() void = { + let key: [_]u8 = [ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, + 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c, + ]; + + let plain: [16]u8 = [ + 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, + 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34, + ]; + + const cipher: [16]u8 = [ + 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb, + 
0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32, + ]; + + let result: [16]u8 = [0...]; + let b = x86ni(); + + x86ni_init(&b, key[..]); + cipher::encrypt(&b, result[..], plain); + + assert(bytes::equal(cipher, result)); +}; + +@test fn ni_test_decrypt_128() void = { + const key: [_]u8 = [ + 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, + 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c, + ]; + + const plain: [16]u8 = [ + 0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, + 0x31, 0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34, + ]; + + const cipher: [16]u8 = [ + 0x39, 0x25, 0x84, 0x1d, 0x02, 0xdc, 0x09, 0xfb, + 0xdc, 0x11, 0x85, 0x97, 0x19, 0x6a, 0x0b, 0x32, + ]; + + let result: [16]u8 = [0...]; + let b = x86ni(); + + x86ni_init(&b, key[..]); + cipher::decrypt(&b, result[..], cipher); + assert(bytes::equal(plain, result)); +}; + +// fips-197.pdf Appendix C.1 +@test fn ni_test_example_vector1() void = { + const key: []u8 = [ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + ]; + + const plain: []u8 = [ + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + ]; + + const cipher: []u8 = [ + 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, + 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a, + ]; + + let result: [16]u8 = [0...]; + let b = x86ni(); + + x86ni_init(&b, key[..]); + + cipher::encrypt(&b, result[..], plain); + assert(bytes::equal(cipher, result)); + + cipher::decrypt(&b, result[..], cipher); + assert(bytes::equal(plain, result)); +}; + +// fips-197.pdf Appendix C.2 +@test fn ni_test_example_vector2() void = { + const key: []u8 = [ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + ]; + + const plain: []u8 = [ + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + ]; + + const cipher: []u8 = [ + 0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 
0xdf, 0xe0, + 0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91, + ]; + + let result: [16]u8 = [0...]; + let b = x86ni(); + + x86ni_init(&b, key[..]); + + cipher::encrypt(&b, result[..], plain); + assert(bytes::equal(cipher, result)); + + cipher::decrypt(&b, result[..], cipher); + assert(bytes::equal(plain, result)); +}; + +// fips-197.pdf Appendix C.3 +@test fn ni_test_example_vector3() void = { + const key: []u8 = [ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + ]; + + const plain: []u8 = [ + 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, + ]; + + const cipher: []u8 = [ + 0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf, + 0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89, + ]; + + let result: [16]u8 = [0...]; + let b = x86ni(); + + x86ni_init(&b, key[..]); + + cipher::encrypt(&b, result[..], plain); + assert(bytes::equal(cipher, result)); + + cipher::decrypt(&b, result[..], cipher); + assert(bytes::equal(plain, result)); +}; diff --git a/crypto/aes/+x86_64/ni.ha b/crypto/aes/+x86_64/ni.ha @@ -0,0 +1,64 @@ +use bytes; +use crypto::cipher; +use rt; + +def RKLEN_128: size = 176; +def RKLEN_192: size = 208; +def RKLEN_256: size = 240; + +export type x86ni_block = struct { + cipher::block, + enc_rk: [RKLEN_256]u8, + dec_rk: [RKLEN_256]u8, + rklen: u8, +}; + +// Checks if the native AES interface is available. +export fn x86ni_available() bool = { + return rt::cpuid_hasflags(0, rt::cpuid_ecxflags::AES); +}; + +// Returns a native AES [[crypto::cipher::block]] implementation for x86_64 +// CPUs supporting AES-NI. +// +// The caller must call [[x86ni_init]] to add a key to the cipher before using +// the cipher, and must call [[crypto::cipher::finish]] when they are finished +// using the cipher to securely erase any secret data stored in the cipher +// state. 
+export fn x86ni() x86ni_block = { + return x86ni_block { + encrypt = &x86ni_encrypt, + decrypt = &x86ni_decrypt, + finish = &x86ni_finish, + ... + }; +}; + +export fn x86ni_init(b: *x86ni_block, key: []u8) void = { + assert(len(key) == 16 || len(key) == 24 || len(key) == 32, + "Invalid aes key length"); + + b.rklen = x86ni_keyexp(key[..], b.enc_rk[..], b.dec_rk[..]); +}; + +fn x86ni_encrypt(block: *cipher::block, dest: []u8, src: []u8) void = { + let b = block: *x86ni_block; + x86ni_asencrypt(b.enc_rk[..b.rklen], dest, src); +}; + +fn x86ni_decrypt(block: *cipher::block, dest: []u8, src: []u8) void = { + let b = block: *x86ni_block; + x86ni_asdecrypt(b.dec_rk[..b.rklen], dest, src); +}; + +fn x86ni_finish(block: *cipher::block) void = { + let b = block: *x86ni_block; + bytes::zero(b.enc_rk[..]); + bytes::zero(b.dec_rk[..]); +}; + +// Expands encryption and decryption key and returns the size of the round keys. +export fn x86ni_keyexp(key: []u8, enc_rk: []u8, dec_rk: []u8) u8; +export fn x86ni_asencrypt(key_exp: []u8, dest: []u8, src: []u8) void; +export fn x86ni_asdecrypt(key_exp: []u8, dest: []u8, src: []u8) void; + diff --git a/crypto/aes/+x86_64/ni_native.s b/crypto/aes/+x86_64/ni_native.s @@ -0,0 +1,500 @@ +.global crypto.aes.x86ni_keyexp +.type crypto.aes.x86ni_keyexp,@function +crypto.aes.x86ni_keyexp: + pushq %rbp + mov %rsp, %rbp + + pushq %rbx + pushq %rcx + pushq %rdx + + movq 0x10(%rbp), %rbx # &key + movq 0x18(%rbp), %rax # keylen + + movq 0x28(%rbp), %rcx # &enk_rk + + mov $0x18, %rdx + cmp %rax, %rdx + je enc_key_192 + jle enc_key_256 + +enc_key_128: + movdqu (%rbx), %xmm1 + movdqu %xmm1, (%rcx) + aeskeygenassist $0x1, %xmm1, %xmm2 + call key_expand_128 + movdqu %xmm1, 0x10(%rcx) + aeskeygenassist $0x2, %xmm1, %xmm2 + call key_expand_128 + movdqu %xmm1, 0x20(%rcx) + aeskeygenassist $0x4, %xmm1, %xmm2 + call key_expand_128 + movdqu %xmm1, 0x30(%rcx) + aeskeygenassist $0x8, %xmm1, %xmm2 + call key_expand_128 + movdqu %xmm1, 0x40(%rcx) + aeskeygenassist 
$0x10, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0x50(%rcx)
+	aeskeygenassist $0x20, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0x60(%rcx)
+	aeskeygenassist $0x40, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0x70(%rcx)
+	aeskeygenassist $0x80, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0x80(%rcx)
+	aeskeygenassist $0x1b, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0x90(%rcx)
+	aeskeygenassist $0x36, %xmm1, %xmm2
+	call key_expand_128
+	movdqu %xmm1, 0xa0(%rcx)
+
+	jmp dec_key
+key_expand_128:			# in: xmm1 = previous round key, xmm2 = aeskeygenassist result; out: xmm1 = next round key; clobbers xmm2, xmm3
+	movdqa %xmm1, %xmm3	# SSE2-only shifts (no VEX/AVX encoding): x86ni_available only checks the AES flag
+	pslldq $0x4, %xmm3	# xmm3 = key << 32
+	pxor %xmm3, %xmm1
+	pslldq $0x4, %xmm3	# xmm3 = key << 64
+	pxor %xmm3, %xmm1
+	pslldq $0x4, %xmm3	# xmm3 = key << 96
+	pxor %xmm3, %xmm1	# each dword of xmm1 is now the xor of itself and all lower dwords
+	pshufd $0xff, %xmm2, %xmm2	# broadcast dword 3 of the aeskeygenassist result
+	pxor %xmm2, %xmm1
+	ret
+
+enc_key_192:
+	movdqu (%rbx), %xmm1
+	movdqu 0x10(%rbx), %xmm3
+
+	movdqu %xmm1, (%rcx)
+	movdqu %xmm3, %xmm5
+
+	aeskeygenassist $0x1, %xmm3, %xmm2
+	call key_expand_192
+	shufpd $0, %xmm1, %xmm5
+	movdqu %xmm5, 0x10(%rcx)
+	movdqu %xmm1, %xmm6
+	shufpd $1, %xmm3, %xmm6
+	movdqu %xmm6, 0x20(%rcx)
+
+	aeskeygenassist $0x2, %xmm3, %xmm2
+	call key_expand_192
+	movdqu %xmm1, 0x30(%rcx)
+	movdqu %xmm3, %xmm5
+
+	aeskeygenassist $0x4, %xmm3, %xmm2
+	call key_expand_192
+	shufpd $0, %xmm1, %xmm5
+	movdqu %xmm5, 0x40(%rcx)
+	movdqu %xmm1, %xmm6
+	shufpd $1, %xmm3, %xmm6
+	movdqu %xmm6, 0x50(%rcx)
+
+	aeskeygenassist $0x8, %xmm3, %xmm2
+	call key_expand_192
+	movdqu %xmm1, 0x60(%rcx)
+	movdqu %xmm3, %xmm5
+
+	aeskeygenassist $0x10, %xmm3, %xmm2
+	call key_expand_192
+	shufpd $0, %xmm1, %xmm5
+	movdqu %xmm5, 0x70(%rcx)
+	movdqu %xmm1, %xmm6
+	shufpd $1, %xmm3, %xmm6
+	movdqu %xmm6, 0x80(%rcx)
+
+	aeskeygenassist $0x20, %xmm3, %xmm2
+	call key_expand_192
+	movdqu %xmm1, 0x90(%rcx)
+	movdqu %xmm3, %xmm5
+
+	aeskeygenassist $0x40, %xmm3, %xmm2
+	call key_expand_192
+	shufpd $0, %xmm1, %xmm5
+	movdqu %xmm5, 0xa0(%rcx)
+	movdqu %xmm1, %xmm6
+	shufpd $1, %xmm3, %xmm6
+	movdqu %xmm6, 0xb0(%rcx)
+
+	aeskeygenassist $0x80, %xmm3, %xmm2
+	call
key_expand_192
+	movdqu %xmm1, 0xc0(%rcx)
+	movdqu %xmm3, %xmm5
+
+	jmp dec_key
+
+key_expand_192:			# in: xmm1, xmm3 = key state, xmm2 = aeskeygenassist result; out: updated xmm1, xmm3; clobbers xmm2, xmm4
+	movdqa %xmm1, %xmm4	# SSE2-only shifts (no VEX/AVX encoding): x86ni_available only checks the AES flag
+	pslldq $0x4, %xmm4	# xmm4 = xmm1 << 32
+	pxor %xmm4, %xmm1
+	pslldq $0x4, %xmm4	# xmm4 = xmm1 << 64
+	pxor %xmm4, %xmm1
+	pslldq $0x4, %xmm4	# xmm4 = xmm1 << 96
+	pxor %xmm4, %xmm1	# each dword of xmm1 is now the xor of itself and all lower dwords
+
+	pshufd $0x55, %xmm2, %xmm2	# broadcast dword 1 of the aeskeygenassist result
+	pxor %xmm2, %xmm1
+
+	pshufd $0xff, %xmm1, %xmm2	# broadcast dword 3 of the freshly computed xmm1
+	movdqa %xmm3, %xmm4
+	pslldq $0x4, %xmm4	# xmm4 = xmm3 << 32
+	pxor %xmm4, %xmm3
+	pxor %xmm2, %xmm3
+	ret
+
+enc_key_256:
+	movdqu (%rbx), %xmm1
+	movdqu 0x10(%rbx), %xmm3
+
+	movdqu %xmm1, (%rcx)
+	movdqu %xmm3, 0x10(%rcx)
+
+	aeskeygenassist $0x1, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0x20(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0x30(%rcx)
+	aeskeygenassist $0x2, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0x40(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0x50(%rcx)
+	aeskeygenassist $0x4, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0x60(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0x70(%rcx)
+	aeskeygenassist $0x8, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0x80(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0x90(%rcx)
+	aeskeygenassist $0x10, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0xa0(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0xb0(%rcx)
+	aeskeygenassist $0x20, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0xc0(%rcx)
+	aeskeygenassist $0x0, %xmm1, %xmm2
+	call key_expand_256_b
+	movdqu %xmm3, 0xd0(%rcx)
+	aeskeygenassist $0x40, %xmm3, %xmm2
+	call key_expand_256_a
+	movdqu %xmm1, 0xe0(%rcx)
+	jmp dec_key
+
+key_expand_256_a:
+	movdqa %xmm1, %xmm4
+	pslldq $4, %xmm4
+	pxor %xmm4, %xmm1
+	pslldq $4, %xmm4
+	pxor %xmm4, %xmm1
+	pslldq $4, %xmm4
+	pxor %xmm4, %xmm1
+
+	pshufd $0xff, %xmm2, %xmm2
+	pxor %xmm2, %xmm1
+
+	ret
+
+key_expand_256_b:
+	movdqa %xmm3, %xmm4
+	pslldq $4, %xmm4
+	pxor %xmm4, %xmm3
+	pslldq
$4, %xmm4
+	pxor %xmm4, %xmm3
+	pslldq $4, %xmm4
+	pxor %xmm4, %xmm3
+
+	pshufd $0xaa, %xmm2, %xmm2
+	pxor %xmm2, %xmm3
+
+	ret
+
+dec_key:			# derives dec_rk from the enc_rk at (%rcx); expects the key length in rax
+	movq 0x40(%rbp), %rdx # &dec_rk
+
+	# store key in reverse order, therefore add rklen to rk pointer
+	mov $0x18, %rbx
+	cmp %rax, %rbx		# compare explicitly; do not rely on flags surviving from the cmp at function entry
+	je rklen_dec_key_192
+	jle rklen_dec_key_256
+	add $160, %rdx		# rdx = last 16-byte slot of dec_rk (176 - 16)
+	jmp dec_key_start
+rklen_dec_key_192:
+	add $192, %rdx		# 208 - 16
+	jmp dec_key_start
+rklen_dec_key_256:
+	add $224, %rdx		# 240 - 16
+
+dec_key_start:
+	movdqu 0x0(%rcx), %xmm1	# first encryption round key is copied without aesimc
+	movdqu %xmm1, 0x0(%rdx)
+
+	movdqu 0x10(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x10(%rdx)
+	movdqu 0x20(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x20(%rdx)
+	movdqu 0x30(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x30(%rdx)
+	movdqu 0x40(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x40(%rdx)
+	movdqu 0x50(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x50(%rdx)
+	movdqu 0x60(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x60(%rdx)
+	movdqu 0x70(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x70(%rdx)
+	movdqu 0x80(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x80(%rdx)
+	movdqu 0x90(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0x90(%rdx)
+
+	cmp %rax, %rbx
+	je dec_key_192
+	jle dec_key_256
+
+	movdqu 0xa0(%rcx), %xmm1
+	movdqu %xmm1, -0xa0(%rdx)
+
+	# return rklen
+	movl $176, %eax
+
+	jmp key_exp_end
+
+dec_key_192:
+	movdqu 0xa0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xa0(%rdx)
+	movdqu 0xb0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xb0(%rdx)
+
+	movdqu 0xc0(%rcx), %xmm1
+	movdqu %xmm1, -0xc0(%rdx)
+
+	# return rklen
+	movl $208, %eax
+	jmp key_exp_end
+dec_key_256:
+	movdqu 0xa0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xa0(%rdx)
+	movdqu 0xb0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xb0(%rdx)
+	movdqu 0xc0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xc0(%rdx)
+	movdqu 0xd0(%rcx), %xmm1
+	aesimc %xmm1, %xmm1
+	movdqu %xmm1, -0xd0(%rdx)
+
+
+	movdqu
0xe0(%rcx), %xmm1 + movdqu %xmm1, -0xe0(%rdx) + + # return rklen + movl $240, %eax + +key_exp_end: + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + + popq %rdx + popq %rcx + popq %rbx + + leave + ret + +.global crypto.aes.x86ni_asencrypt +.type crypto.aes.x86ni_asencrypt,@function +crypto.aes.x86ni_asencrypt: + pushq %rbp + mov %rsp, %rbp + pushq %rbx + pushq %rcx + pushq %rdx + + movq 0x10(%rbp), %rbx # &rk + movq 0x18(%rbp), %rax # rklen + + movq 0x28(%rbp), %rcx # &dest + movq 0x40(%rbp), %rdx # &src + + movdqu (%rdx), %xmm0 + movdqu (%rbx), %xmm1 + pxor %xmm1, %xmm0 + + movdqu 0x10(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x20(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x30(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x40(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x50(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x60(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x70(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x80(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0x90(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + + mov $208, %rdx + cmp %rax, %rdx + jl encrypt_256 + je encrypt_192 + + movdqu 0xa0(%rbx), %xmm1 + aesenclast %xmm1, %xmm0 + jmp encrypt_end + +encrypt_192: + movdqu 0xa0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xb0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xc0(%rbx), %xmm1 + aesenclast %xmm1, %xmm0 + jmp encrypt_end + +encrypt_256: + movdqu 0xa0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xb0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xc0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xd0(%rbx), %xmm1 + aesenc %xmm1, %xmm0 + movdqu 0xe0(%rbx), %xmm1 + aesenclast %xmm1, %xmm0 + jmp encrypt_end + +encrypt_end: + + movdqu %xmm0, (%rcx) + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + popq %rdx + popq %rcx + popq %rbx + + leave + ret + +.global crypto.aes.x86ni_asdecrypt +.type crypto.aes.x86ni_asdecrypt,@function +crypto.aes.x86ni_asdecrypt: + pushq %rbp + mov %rsp, 
%rbp + + pushq %rbx + pushq %rcx + pushq %rdx + + movq 0x10(%rbp), %rbx # &rk + movq 0x18(%rbp), %rax # rklen + + movq 0x28(%rbp), %rcx # &dest + movq 0x40(%rbp), %rdx # &src + + movdqu (%rdx), %xmm0 + movdqu (%rbx), %xmm1 + pxor %xmm1, %xmm0 + + movdqu 0x10(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x20(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x30(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x40(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x50(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x60(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x70(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x80(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0x90(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + + mov $208, %rdx + cmp %rax, %rdx + je decrypt_192 + jl decrypt_256 + + movdqu 0xa0(%rbx), %xmm1 + aesdeclast %xmm1, %xmm0 + jmp decrypt_end + +decrypt_192: + movdqu 0xa0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xb0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xc0(%rbx), %xmm1 + aesdeclast %xmm1, %xmm0 + jmp decrypt_end + +decrypt_256: + movdqu 0xa0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xb0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xc0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xd0(%rbx), %xmm1 + aesdec %xmm1, %xmm0 + movdqu 0xe0(%rbx), %xmm1 + aesdeclast %xmm1, %xmm0 + jmp decrypt_end + +decrypt_end: + movdqu %xmm0, (%rcx) + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + popq %rdx + popq %rcx + popq %rbx + + leave + ret + diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib @@ -231,10 +231,13 @@ crypto_aes() { if [ $testing -eq 0 ] then gensrcs_crypto_aes - gen_ssa crypto::aes crypto::cipher crypto::math endian + gen_ssa crypto::aes bytes crypto::cipher crypto::math endian rt else - gensrcs_crypto_aes ct64+test.ha cbc+test.ha ctr+test.ha - gen_ssa crypto::aes bytes crypto::cipher crypto::math endian + gensrcs_crypto_aes \ + ct64+test.ha \ + cbc+test.ha \ + ctr+test.ha + gen_ssa crypto::aes bytes crypto::cipher crypto::math endian rt fi } diff --git a/stdlib.mk b/stdlib.mk @@ 
-742,7 +742,7 @@ $(HARECACHE)/crypto/crypto-any.ssa: $(stdlib_crypto_any_srcs) $(stdlib_rt) $(std stdlib_crypto_aes_any_srcs= \ $(STDLIB)/crypto/aes/aes_ct64.ha -$(HARECACHE)/crypto/aes/crypto_aes-any.ssa: $(stdlib_crypto_aes_any_srcs) $(stdlib_rt) $(stdlib_crypto_cipher_$(PLATFORM)) $(stdlib_crypto_math_$(PLATFORM)) $(stdlib_endian_$(PLATFORM)) +$(HARECACHE)/crypto/aes/crypto_aes-any.ssa: $(stdlib_crypto_aes_any_srcs) $(stdlib_rt) $(stdlib_bytes_$(PLATFORM)) $(stdlib_crypto_cipher_$(PLATFORM)) $(stdlib_crypto_math_$(PLATFORM)) $(stdlib_endian_$(PLATFORM)) $(stdlib_rt_$(PLATFORM)) @printf 'HAREC \t$@\n' @mkdir -p $(HARECACHE)/crypto/aes @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Ncrypto::aes \ @@ -2654,7 +2654,7 @@ testlib_crypto_aes_any_srcs= \ $(STDLIB)/crypto/aes/cbc+test.ha \ $(STDLIB)/crypto/aes/ctr+test.ha -$(TESTCACHE)/crypto/aes/crypto_aes-any.ssa: $(testlib_crypto_aes_any_srcs) $(testlib_rt) $(testlib_bytes_$(PLATFORM)) $(testlib_crypto_cipher_$(PLATFORM)) $(testlib_crypto_math_$(PLATFORM)) $(testlib_endian_$(PLATFORM)) +$(TESTCACHE)/crypto/aes/crypto_aes-any.ssa: $(testlib_crypto_aes_any_srcs) $(testlib_rt) $(testlib_bytes_$(PLATFORM)) $(testlib_crypto_cipher_$(PLATFORM)) $(testlib_crypto_math_$(PLATFORM)) $(testlib_endian_$(PLATFORM)) $(testlib_rt_$(PLATFORM)) @printf 'HAREC \t$@\n' @mkdir -p $(TESTCACHE)/crypto/aes @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Ncrypto::aes \