diff options
author | Jack Lloyd <[email protected]> | 2019-09-06 06:12:29 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2019-09-06 07:49:26 -0400 |
commit | 1a8f257609592526fd7a81a52a0562242d93a95a (patch) | |
tree | 5aa2a8b3b9bd746e3752b7fbb04410e89af6a7f1 /src/lib/block | |
parent | 1cfc302453b3b72790ec8c8424f14fdd711435fd (diff) |
In aes_vperm avoid loading from data segment
I do not understand the mechanism but this is slightly faster.
Diffstat (limited to 'src/lib/block')
-rw-r--r-- | src/lib/block/aes/aes_vperm/aes_vperm.cpp | 29 |
1 file changed, 12 insertions, 17 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp index b7e82876c..10e1e5c26 100644 --- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp +++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp @@ -105,17 +105,15 @@ const SIMD_4x32 rcon[10] = { SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000), }; -const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F); -const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); -const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B); - inline SIMD_4x32 low_nibs(SIMD_4x32 x) { + const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F); return lo_nibs_mask & x; } inline SIMD_4x32 high_nibs(SIMD_4x32 x) { + const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0); return (hi_nibs_mask & x).shr<4>(); } @@ -418,7 +416,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no) const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D); const SIMD_4x32 srx(sr[round_no % 4]); - SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0); + SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0); SIMD_4x32 t2 = t; t = shuffle(t, mc_forward0); t2 = t ^ t2 ^ shuffle(t, mc_forward0); @@ -461,7 +459,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no) const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1); k = shuffle(k, sr[round_no % 4]); - k ^= xor_5B; + k ^= SIMD_4x32::splat_u8(0x5B); return aes_schedule_transform(k, out_tr1, out_tr2); } @@ -470,7 +468,7 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k) const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A); const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB); - k ^= xor_5B; + k ^= SIMD_4x32::splat_u8(0x5B); return aes_schedule_transform(k, deskew1, deskew2); } @@ -478,20 +476,17 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2) { SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2); smeared ^= shift_elems_left<2>(smeared); - smeared 
^= xor_5B; - - SIMD_4x32 t = high_nibs(input1); - input1 = low_nibs(input1); + smeared ^= SIMD_4x32::splat_u8(0x5B); - SIMD_4x32 t2 = shuffle(k_inv2, input1); + const SIMD_4x32 Bh = high_nibs(input1); + SIMD_4x32 Bl = low_nibs(input1); - input1 ^= t; + const SIMD_4x32 t2 = shuffle(k_inv2, Bl); - SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t); - SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1); + Bl ^= Bh; - SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3); - SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4); + SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh)); + SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl)); return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6); } |