Some cleanups

author: Jack Lloyd <[email protected]> 2019-09-03 12:02:05 -0400
committer: Jack Lloyd <[email protected]> 2019-09-04 12:16:53 -0400
commit: 54764e302c2488816e6160c32b58de406c47286b (patch)
tree: 58ef8c96c2db2bee4fb09206f74351cc882c94f5
parent: 1ed06b02aee547435d507098824bb96bdb6d3214 (diff)
1 files changed, 52 insertions, 54 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index 23b3c580c..b7e82876c 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -2,10 +2,10 @@
 * AES using vector permutes (SSSE3, NEON)
 * (C) 2010,2016,2019 Jack Lloyd
 *
-* This is more or less a direct translation of public domain x86-64
-* assembly written by Mike Hamburg, described in "Accelerating AES
-* with Vector Permute Instructions" (CHES 2009). His original code is
-* available at https://crypto.stanford.edu/vpaes/
+* Based on public domain x86-64 assembly written by Mike Hamburg,
+* described in "Accelerating AES with Vector Permute Instructions"
+* (CHES 2009). His original code is available at
+* https://crypto.stanford.edu/vpaes/
 *
 * Botan is released under the Simplified BSD License (see license.txt)
 */
@@ -16,8 +16,6 @@
 
 #if defined(BOTAN_SIMD_USE_SSE2)
   #include <tmmintrin.h>
-#elif defined(BOTAN_SIMD_USE_NEON)
-  #include <arm_neon.h>
 #endif
 
 namespace Botan {
@@ -49,33 +47,25 @@ inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b)
    }
 
 template<size_t I>
-inline SIMD_4x32 slli(SIMD_4x32 x)
+inline SIMD_4x32 shift_elems_left(SIMD_4x32 x)
    {
 #if defined(BOTAN_SIMD_USE_SSE2)
    return SIMD_4x32(_mm_slli_si128(x.raw(), 4*I));
 #elif defined(BOTAN_SIMD_USE_NEON)
    return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x.raw()), 16 - 4*I)));
+#else
+   #error "No shift_elems_left implementation available"
 #endif
    }
 
-inline SIMD_4x32 zero_top_half(SIMD_4x32 x)
-   {
-#if defined(BOTAN_SIMD_USE_SSE2)
-   return SIMD_4x32(_mm_slli_si128(_mm_srli_si128(x.raw(), 8), 8));
-#elif defined(BOTAN_SIMD_USE_NEON)
-   // fixme do better ?
-   const SIMD_4x32 mask(0, 0, ~0, ~0);
-   return x & mask;
-#endif
-   }
-
-template<int C>
-inline SIMD_4x32 alignr(SIMD_4x32 a, SIMD_4x32 b)
+inline SIMD_4x32 alignr8(SIMD_4x32 a, SIMD_4x32 b)
    {
 #if defined(BOTAN_SIMD_USE_SSE2)
-   return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), C));
+   return SIMD_4x32(_mm_alignr_epi8(a.raw(), b.raw(), 8));
 #elif defined(BOTAN_SIMD_USE_NEON)
-   return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), C)));
+   return SIMD_4x32(vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(b.raw()), vreinterpretq_u8_u32(a.raw()), 8)));
+#else
+   #error "No alignr8 implementation available"
 #endif
    }
 
@@ -102,10 +92,22 @@ const SIMD_4x32 sr[4] = {
    SIMD_4x32(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C),
 };
 
+const SIMD_4x32 rcon[10] = {
+   SIMD_4x32(0x00000070, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x0000002A, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x00000098, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x0000004D, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x0000007C, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x0000007D, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x00000081, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x0000001F, 0x00000000, 0x00000000, 0x00000000),
+   SIMD_4x32(0x00000083, 0x00000000, 0x00000000, 0x00000000),
+};
+
 const SIMD_4x32 lo_nibs_mask = SIMD_4x32::splat_u8(0x0F);
 const SIMD_4x32 hi_nibs_mask = SIMD_4x32::splat_u8(0xF0);
-
-const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
+const SIMD_4x32 xor_5B = SIMD_4x32::splat_u8(0x5B);
 
 inline SIMD_4x32 low_nibs(SIMD_4x32 x)
    {
@@ -416,7 +418,7 @@ SIMD_4x32 aes_schedule_mangle(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 mc_forward0(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
    const SIMD_4x32 srx(sr[round_no % 4]);
 
-   SIMD_4x32 t = shuffle(k ^ SIMD_4x32::splat_u8(0x5B), mc_forward0);
+   SIMD_4x32 t = shuffle(k ^ xor_5B, mc_forward0);
    SIMD_4x32 t2 = t;
    t = shuffle(t, mc_forward0);
    t2 = t ^ t2 ^ shuffle(t, mc_forward0);
@@ -459,7 +461,7 @@ SIMD_4x32 aes_schedule_mangle_last(SIMD_4x32 k, uint8_t round_no)
    const SIMD_4x32 out_tr2(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);
 
    k = shuffle(k, sr[round_no % 4]);
-   k ^= SIMD_4x32::splat_u8(0x5B);
+   k ^= xor_5B;
    return aes_schedule_transform(k, out_tr1, out_tr2);
    }
 
@@ -468,15 +470,15 @@ SIMD_4x32 aes_schedule_mangle_last_dec(SIMD_4x32 k)
    const SIMD_4x32 deskew1(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
    const SIMD_4x32 deskew2(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);
 
-   k ^= SIMD_4x32::splat_u8(0x5B);
+   k ^= xor_5B;
    return aes_schedule_transform(k, deskew1, deskew2);
    }
 
 SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
    {
-   SIMD_4x32 smeared = input2 ^ slli<1>(input2);
-   smeared ^= slli<2>(smeared);
-   smeared ^= SIMD_4x32::splat_u8(0x5B);
+   SIMD_4x32 smeared = input2 ^ shift_elems_left<1>(input2);
+   smeared ^= shift_elems_left<2>(smeared);
+   smeared ^= xor_5B;
 
    SIMD_4x32 t = high_nibs(input1);
    input1 = low_nibs(input1);
@@ -494,14 +496,11 @@ SIMD_4x32 aes_schedule_round(SIMD_4x32 input1, SIMD_4x32 input2)
    return smeared ^ shuffle(sb1u, t5) ^ shuffle(sb1t, t6);
    }
 
-SIMD_4x32 aes_schedule_round(SIMD_4x32& rcon, SIMD_4x32 input1, SIMD_4x32 input2)
+SIMD_4x32 aes_schedule_round(SIMD_4x32 rc, SIMD_4x32 input1, SIMD_4x32 input2)
    {
-   input2 ^= alignr<15>(SIMD_4x32(), rcon);
-   rcon = alignr<15>(rcon, rcon);
-   input1 = shuffle(input1, shuffle3333);
-   input1 = alignr<1>(input1, input1);
-
-   return aes_schedule_round(input1, input2);
+   // This byte shuffle is equivalent to alignr<1>(shuffle32(input1, (3,3,3,3)));
+   const SIMD_4x32 shuffle3333_15 = SIMD_4x32::splat(0x0C0F0E0D);
+   return aes_schedule_round(shuffle(input1, shuffle3333_15), input2 ^ rc);
    }
 
 SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
@@ -510,6 +509,9 @@ SIMD_4x32 aes_schedule_192_smear(SIMD_4x32 x, SIMD_4x32 y)
       SIMD_4x32(0x0B0A0908, 0x0F0E0D0C, 0x0F0E0D0C, 0x0F0E0D0C);
    const SIMD_4x32 shuffle2000 =
       SIMD_4x32(0x03020100, 0x03020100, 0x03020100, 0x0B0A0908);
+
+   const SIMD_4x32 zero_top_half(0, 0, ~0, ~0);
+   y &= zero_top_half;
    return y ^ shuffle(x, shuffle3332) ^ shuffle(y, shuffle2000);
    }
 
@@ -520,8 +522,6 @@ void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t)
    m_EK.resize(11*4);
    m_DK.resize(11*4);
 
-   SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808);
-
    SIMD_4x32 key = SIMD_4x32::load_le(keyb);
 
    shuffle(key, sr[2]).store_le(&m_DK[4*10]);
@@ -531,14 +531,14 @@ void AES_128::vperm_key_schedule(const uint8_t keyb[], size_t)
 
    for(size_t i = 1; i != 10; ++i)
       {
-      key = aes_schedule_round(rcon, key, key);
+      key = aes_schedule_round(rcon[i-1], key, key);
 
       aes_schedule_mangle(key, (12-i) % 4).store_le(&m_EK[4*i]);
 
       aes_schedule_mangle_dec(key, (10-i)%4).store_le(&m_DK[4*(10-i)]);
       }
 
-   key = aes_schedule_round(rcon, key, key);
+   key = aes_schedule_round(rcon[9], key, key);
    aes_schedule_mangle_last(key, 2).store_le(&m_EK[4*10]);
    aes_schedule_mangle_last_dec(key).store_le(&m_DK[0]);
    }
@@ -548,8 +548,6 @@ void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
    m_EK.resize(13*4);
    m_DK.resize(13*4);
 
-   SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808);
-
    SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
    SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 8);
 
@@ -563,19 +561,19 @@ void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
    for(size_t i = 0; i != 4; ++i)
       {
       // key2 with 8 high bytes masked off
-      SIMD_4x32 t = zero_top_half(key2);
-      key2 = aes_schedule_round(rcon, key2, key1);
+      SIMD_4x32 t = key2;
+      key2 = aes_schedule_round(rcon[2*i], key2, key1);
 
-      // fixme cse
-      aes_schedule_mangle(alignr<8>(key2, t), (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
-      aes_schedule_mangle_dec(alignr<8>(key2, t), (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
+      const SIMD_4x32 key2t = alignr8(key2, t);
+      aes_schedule_mangle(key2t, (i+3)%4).store_le(&m_EK[4*(3*i+1)]);
+      aes_schedule_mangle_dec(key2t, (i+3)%4).store_le(&m_DK[4*(11-3*i)]);
 
       t = aes_schedule_192_smear(key2, t);
 
       aes_schedule_mangle(t, (i+2)%4).store_le(&m_EK[4*(3*i+2)]);
       aes_schedule_mangle_dec(t, (i+2)%4).store_le(&m_DK[4*(10-3*i)]);
 
-      key2 = aes_schedule_round(rcon, t, key2);
+      key2 = aes_schedule_round(rcon[2*i+1], t, key2);
 
       if(i == 3)
          {
@@ -589,7 +587,7 @@ void AES_192::vperm_key_schedule(const uint8_t keyb[], size_t)
          }
 
       key1 = key2;
-      key2 = aes_schedule_192_smear(key2, zero_top_half(t));
+      key2 = aes_schedule_192_smear(key2, t);
       }
    }
 
@@ -598,8 +596,6 @@ void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
    m_EK.resize(15*4);
    m_DK.resize(15*4);
 
-   SIMD_4x32 rcon(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808);
-
    SIMD_4x32 key1 = SIMD_4x32::load_le(keyb);
    SIMD_4x32 key2 = SIMD_4x32::load_le(keyb + 16);
 
@@ -613,10 +609,12 @@ void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
 
    aes_schedule_mangle_dec(key2, 1).store_le(&m_DK[4*13]);
 
+   const SIMD_4x32 shuffle3333 = SIMD_4x32::splat(0x0F0E0D0C);
+
    for(size_t i = 2; i != 14; i += 2)
       {
       const SIMD_4x32 k_t = key2;
-      key1 = key2 = aes_schedule_round(rcon, key2, key1);
+      key1 = key2 = aes_schedule_round(rcon[(i/2)-1], key2, key1);
 
       aes_schedule_mangle(key2, i % 4).store_le(&m_EK[4*i]);
       aes_schedule_mangle_dec(key2, (i+2)%4).store_le(&m_DK[4*(14-i)]);
@@ -627,7 +625,7 @@ void AES_256::vperm_key_schedule(const uint8_t keyb[], size_t)
       aes_schedule_mangle_dec(key2, (i+1)%4).store_le(&m_DK[4*(13-i)]);
       }
 
-   key2 = aes_schedule_round(rcon, key2, key1);
+   key2 = aes_schedule_round(rcon[6], key2, key1);
 
    aes_schedule_mangle_last(key2, 2).store_le(&m_EK[4*14]);
    aes_schedule_mangle_last_dec(key2).store_le(&m_DK[0]);
author	Jack Lloyd <[email protected]>	2019-09-03 12:02:05 -0400
committer	Jack Lloyd <[email protected]>	2019-09-04 12:16:53 -0400
commit	54764e302c2488816e6160c32b58de406c47286b (patch)
tree	58ef8c96c2db2bee4fb09206f74351cc882c94f5
parent	1ed06b02aee547435d507098824bb96bdb6d3214 (diff)