diff options
Diffstat (limited to 'src/block/aes_ssse3/aes_ssse3.cpp')
-rw-r--r-- | src/block/aes_ssse3/aes_ssse3.cpp | 156 |
1 files changed, 78 insertions, 78 deletions
diff --git a/src/block/aes_ssse3/aes_ssse3.cpp b/src/block/aes_ssse3/aes_ssse3.cpp index fca77e9a1..be0408897 100644 --- a/src/block/aes_ssse3/aes_ssse3.cpp +++ b/src/block/aes_ssse3/aes_ssse3.cpp @@ -19,33 +19,33 @@ namespace { const __m128i low_nibs = _mm_set1_epi8(0x0F); -const __m128i k_ipt1 = _mm_set_epi64x( - 0xCABAE09052227808, 0xC2B2E8985A2A7000); -const __m128i k_ipt2 = _mm_set_epi64x( - 0xCD80B1FCB0FDCC81, 0x4C01307D317C4D00); +const __m128i k_ipt1 = _mm_set_epi32( + 0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000); +const __m128i k_ipt2 = _mm_set_epi32( + 0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00); -const __m128i k_inv1 = _mm_set_epi64x( - 0x040703090A0B0C02, 0x0E05060F0D080180); -const __m128i k_inv2 = _mm_set_epi64x( - 0x030D0E0C02050809, 0x01040A060F0B0780); +const __m128i k_inv1 = _mm_set_epi32( + 0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180); +const __m128i k_inv2 = _mm_set_epi32( + 0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780); -const __m128i sb1u = _mm_set_epi64x( - 0xA5DF7A6E142AF544, 0xB19BE18FCB503E00); -const __m128i sb1t = _mm_set_epi64x( - 0x3BF7CCC10D2ED9EF, 0x3618D415FAE22300); +const __m128i sb1u = _mm_set_epi32( + 0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00); +const __m128i sb1t = _mm_set_epi32( + 0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300); const __m128i mc_forward[4] = { - _mm_set_epi64x(0x0C0F0E0D080B0A09, 0x0407060500030201), - _mm_set_epi64x(0x000302010C0F0E0D, 0x080B0A0904070605), - _mm_set_epi64x(0x0407060500030201, 0x0C0F0E0D080B0A09), - _mm_set_epi64x(0x080B0A0904070605, 0x000302010C0F0E0D) + _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201), + _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605), + _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09), + _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D) }; const __m128i sr[4] = { - _mm_set_epi64x(0x0F0E0D0C0B0A0908, 0x0706050403020100), - _mm_set_epi64x(0x0B06010C07020D08, 0x030E09040F0A0500), - _mm_set_epi64x(0x070E050C030A0108, 0x0F060D040B020900), - _mm_set_epi64x(0x0306090C0F020508, 0x0B0E0104070A0D00), + _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100), + _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500), + _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900), + _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00), }; #define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z)) @@ -81,14 +81,14 @@ __m128i aes_schedule_mangle(__m128i k, byte round_no) __m128i aes_schedule_mangle_dec(__m128i k, byte round_no) { const __m128i dsk[8] = { - _mm_set_epi64x(0x4AED933482255BFC, 0xB6116FC87ED9A700), - _mm_set_epi64x(0x8BB89FACE9DAFDCE, 0x4576516227143300), - _mm_set_epi64x(0x4622EE8AADC90561, 0x27438FEBCCA86400), - _mm_set_epi64x(0x73AEE13CBD602FF2, 0x815C13CE4F92DD00), - _mm_set_epi64x(0xF83F3EF9FA3D3CFB, 0x03C4C50201C6C700), - _mm_set_epi64x(0xA5526A9D7384BC4B, 0xEE1921D638CFF700), - _mm_set_epi64x(0xA080D3F310306343, 0xE3C390B053732000), - _mm_set_epi64x(0x2F45AEC48CE60D67, 0xA0CA214B036982E8) + _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700), + _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300), + _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400), + _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00), + _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700), + _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700), + _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000), + _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8) }; __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]); @@ -108,10 +108,10 @@ __m128i aes_schedule_mangle_dec(__m128i k, byte round_no) __m128i aes_schedule_mangle_last(__m128i k, byte round_no) { - const __m128i out_tr1 = _mm_set_epi64x( - 0xF7974121DEBE6808, 0xFF9F4929D6B66000); - const __m128i out_tr2 = _mm_set_epi64x( - 0xE10D5DB1B05C0CE0, 0x01EDBD5150BCEC00); + const __m128i out_tr1 = _mm_set_epi32( + 0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000); + const __m128i out_tr2 = _mm_set_epi32( + 0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00); k = _mm_shuffle_epi8(k, sr[round_no % 4]); k = _mm_xor_si128(k, _mm_set1_epi8(0x5B)); @@ -120,10 +120,10 @@ __m128i aes_schedule_mangle_last(__m128i k, byte round_no) __m128i aes_schedule_mangle_last_dec(__m128i k) { - const __m128i deskew1 = _mm_set_epi64x( - 0x1DFEB95A5DBEF91A, 0x07E4A34047A4E300); - const __m128i deskew2 = _mm_set_epi64x( - 0x2841C2ABF49D1E77, 0x5F36B5DC83EA6900); + const __m128i deskew1 = _mm_set_epi32( + 0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300); + const __m128i deskew2 = _mm_set_epi32( + 0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900); k = _mm_xor_si128(k, _mm_set1_epi8(0x5B)); return aes_schedule_transform(k, deskew1, deskew2); @@ -166,21 +166,21 @@ __m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2) __m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, u32bit rounds) { - const __m128i sb2u = _mm_set_epi64x( - 0x5EB7E955BC982FCD, 0xE27A93C60B712400); - const __m128i sb2t = _mm_set_epi64x( - 0xC2A163C8AB82234A, 0x69EB88400AE12900); + const __m128i sb2u = _mm_set_epi32( + 0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400); + const __m128i sb2t = _mm_set_epi32( + 0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900); - const __m128i sbou = _mm_set_epi64x( - 0x15AABF7AC502A878, 0xD0D26D176FBDC700); - const __m128i sbot = _mm_set_epi64x( - 0x8E1E90D1412B35FA, 0xCFE474A55FBB6A00); + const __m128i sbou = _mm_set_epi32( + 0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700); + const __m128i sbot = _mm_set_epi32( + 0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00); const __m128i mc_backward[4] = { - _mm_set_epi64x(0x0E0D0C0F0A09080B, 0x0605040702010003), - _mm_set_epi64x(0x0A09080B06050407, 0x020100030E0D0C0F), - _mm_set_epi64x(0x0605040702010003, 0x0E0D0C0F0A09080B), - _mm_set_epi64x(0x020100030E0D0C0F, 0x0A09080B06050407), + _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003), + _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F), + _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B), + _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407), }; B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)), @@ -235,30 +235,30 @@ __m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, u32bit rounds) __m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, u32bit rounds) { - const __m128i k_dipt1 = _mm_set_epi64x( - 0x154A411E114E451A, 0x0F505B040B545F00); - const __m128i k_dipt2 = _mm_set_epi64x( - 0x12771772F491F194, 0x86E383E660056500); - - const __m128i sb9u = _mm_set_epi64x( - 0xCAD51F504F994CC9, 0x851C03539A86D600); - const __m128i sb9t = _mm_set_epi64x( - 0x725E2C9EB2FBA565, 0xC03B1789ECD74900); - - const __m128i sbeu = _mm_set_epi64x( - 0x2242600464B4F6B0, 0x46F2929626D4D000); - const __m128i sbet = _mm_set_epi64x( - 0x9467F36B98593E32, 0x0C55A6CDFFAAC100); - - const __m128i sbdu = _mm_set_epi64x( - 0xF56E9B13882A4439, 0x7D57CCDFE6B1A200); - const __m128i sbdt = _mm_set_epi64x( - 0x2931180D15DEEFD3, 0x3CE2FAF724C6CB00); - - const __m128i sbbu = _mm_set_epi64x( - 0x602646F6B0F2D404, 0xD022649296B44200); - const __m128i sbbt = _mm_set_epi64x( - 0xF3FF0C3E3255AA6B, 0xC19498A6CD596700); + const __m128i k_dipt1 = _mm_set_epi32( + 0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00); + const __m128i k_dipt2 = _mm_set_epi32( + 0x12771772, 0xF491F194, 0x86E383E6, 0x60056500); + + const __m128i sb9u = _mm_set_epi32( + 0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600); + const __m128i sb9t = _mm_set_epi32( + 0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900); + + const __m128i sbeu = _mm_set_epi32( + 0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000); + const __m128i sbet = _mm_set_epi32( + 0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100); + + const __m128i sbdu = _mm_set_epi32( + 0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200); + const __m128i sbdt = _mm_set_epi32( + 0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00); + + const __m128i sbbu = _mm_set_epi32( + 0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200); + const __m128i sbbt = _mm_set_epi32( + 0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700); __m128i mc = mc_forward[3]; @@ -290,10 +290,10 @@ __m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, u32bit rounds) if(r == rounds) { - const __m128i sbou = _mm_set_epi64x( - 0xC7AA6DB9D4943E2D, 0x1387EA537EF94000); - const __m128i sbot = _mm_set_epi64x( - 0xCA4B8159D8C58E9C, 0x12D7560F93441D00); + const __m128i sbou = _mm_set_epi32( + 0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000); + const __m128i sbot = _mm_set_epi32( + 0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00); __m128i x = _mm_shuffle_epi8(sbou, t5); __m128i y = _mm_shuffle_epi8(sbot, t6); @@ -366,8 +366,8 @@ void AES_128_SSSE3::decrypt_n(const byte in[], byte out[], u32bit blocks) const */ void AES_128_SSSE3::key_schedule(const byte keyb[], u32bit) { - __m128i rcon = _mm_set_epi64x(0x702A98084D7C7D81, - 0x1F8391B9AF9DEEB6); + __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, + 0x1F8391B9, 0xAF9DEEB6); __m128i key = _mm_loadu_si128((const __m128i*)keyb); |