author    Jack Lloyd <[email protected]>    2019-09-06 06:12:29 -0400
committer Jack Lloyd <[email protected]>    2019-09-06 07:49:26 -0400
commit    1a8f257609592526fd7a81a52a0562242d93a95a
tree      5aa2a8b3b9bd746e3752b7fbb04410e89af6a7f1 /src/lib/block
parent    1cfc302453b3b72790ec8c8424f14fdd711435fd
In aes_vperm avoid loading from data segment
I do not understand the mechanism but this is slightly faster.
Diffstat (limited to 'src/lib/block')
-rw-r--r--   src/lib/block/aes/aes_vperm/aes_vperm.cpp   29
1 file changed, 12 insertions, 17 deletions
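
The patch removes the file-scope lo_nibs_mask/hi_nibs_mask/xor_5B constants and instead splats each constant at its point of use via SIMD_4x32::splat_u8. A minimal standalone sketch of that pattern, written against raw SSE2 intrinsics rather than Botan's SIMD_4x32 wrapper (the names mangle_with_global/mangle_with_local are illustrative, not from the library):

// Illustrative sketch only, not Botan source: the "splat at the point of
// use" pattern shown with raw SSE2 intrinsics instead of SIMD_4x32.
#include <emmintrin.h>

// Before: a file-scope constant. _mm_set1_epi8 is not a constant expression,
// so this object is initialized dynamically and every use of it involves a
// load from the data segment.
static const __m128i xor_5B_global = _mm_set1_epi8(0x5B);

__m128i mangle_with_global(__m128i k)
   {
   return _mm_xor_si128(k, xor_5B_global);
   }

// After: build the constant inside the function, as the patch does with
// SIMD_4x32::splat_u8(0x5B); the compiler is then free to rematerialize it
// near its use instead of reloading a global.
__m128i mangle_with_local(__m128i k)
   {
   return _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   }

Which form is faster is the empirical question the commit message leaves open; the sketch only shows the structural difference between the two.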
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index b7e82876c..10e1e5c26 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -105,17 +105,15 @@ const SIMD_4x32 rcon[10] = {
SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
};
-const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
-const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
-const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B);
-
inline SIMD_4x32 low_nibs(SIMD_4x32 x)
{
+ const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
return lo_nibs_mask & x;
}
inline SIMD_4x32 high_nibs(SIMD_4x32 x)
{
+ const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
return (hi_nibs_mask & x).shr<4>();
}
@@ -418,7 +416,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
const SIMD_4x32 srx(sr[round_no % 4]);
- SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0);
+ SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
SIMD_4x32 t2 = t;
t = shuffle(t, mc_forward0);
t2 = t ^ t2 ^ shuffle(t, mc_forward0);
@@ -461,7 +459,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
k = shuffle(k, sr[round_no % 4]);
- k ^= xor_5B;
+ k ^= SIMD_4x32::splat_u8(0x5B);
return aes_schedule_transform(k, out_tr1, out_tr2);
}
@@ -470,7 +468,7 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
- k ^= xor_5B;
+ k ^= SIMD_4x32::splat_u8(0x5B);
return aes_schedule_transform(k, deskew1, deskew2);
}
@@ -478,20 +476,17 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
{
SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
smeared ^= shift_elems_left<2>(smeared);
- smeared ^= xor_5B;
-
- SIMD_4x32 t = high_nibs(input1);
- input1 = low_nibs(input1);
+ smeared ^= SIMD_4x32::splat_u8(0x5B);
- SIMD_4x32 t2 = shuffle(k_inv2, input1);
+ const SIMD_4x32 Bh = high_nibs(input1);
+ SIMD_4x32 Bl = low_nibs(input1);
- input1 ^= t;
+ const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
- SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
- SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, input1);
+ Bl ^= Bh;
- SIMD_4x32 t5 = input1 ^ shuffle(k_inv1, t3);
- SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+ SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+ SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
}