Unroll blocks by 2x

author: Jack Lloyd <[email protected]> 2019-09-01 13:11:29 -0400
committer: Jack Lloyd <[email protected]> 2019-09-03 08:37:00 -0400
commit: 1ed06b02aee547435d507098824bb96bdb6d3214 (patch)
tree: e598a21960bc1ddb14de5d79af431a20ee6a337d
parent: d2ef8f38e6ec5d850eafd8d1e6ca0adc99d047c7 (diff)
2 files changed, 188 insertions, 70 deletions
diff --git a/src/lib/block/aes/aes_vperm/aes_vperm.cpp b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
index a36118cbd..23b3c580c 100644
--- a/src/lib/block/aes/aes_vperm/aes_vperm.cpp
+++ b/src/lib/block/aes/aes_vperm/aes_vperm.cpp
@@ -33,7 +33,7 @@ inline SIMD_4x32 shuffle(SIMD_4x32 a, SIMD_4x32 b)
    const uint8x16_t idx = vreinterpretq_u8_u32(b.raw());
 
 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
-   uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };
+   const uint8x8x2_t tbl2 = { vget_low_u8(tbl), vget_high_u8(tbl) };
 
    return SIMD_4x32(vreinterpretq_u32_u8(
                        vcombine_u8(vtbl2_u8(tbl2, vget_low_u8(idx)),
@@ -64,7 +64,7 @@ inline SIMD_4x32 zero_top_half(SIMD_4x32 x)
    return SIMD_4x32(_mm_slli_si128(_mm_srli_si128(x.raw(), 8), 8));
 #elif defined(BOTAN_SIMD_USE_NEON)
    // fixme do better ?
-   SIMD_4x32 mask(0, 0, ~0, ~0);
+   const SIMD_4x32 mask(0, 0, ~0, ~0);
    return x & mask;
 #endif
    }
@@ -117,14 +117,16 @@ inline SIMD_4x32 high_nibs(SIMD_4x32 x)
    return (hi_nibs_mask & x).shr<4>();
    }
 
-SIMD_4x32 aes_vperm_encrypt(SIMD_4x32 B, const uint32_t* keys, size_t rounds)
+inline SIMD_4x32 aes_enc_first_round(SIMD_4x32 B, SIMD_4x32 K)
+   {
+   return shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ K;
+   }
+
+inline SIMD_4x32 aes_enc_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
    {
    const SIMD_4x32 sb2u = SIMD_4x32(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
    const SIMD_4x32 sb2t = SIMD_4x32(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);
 
-   const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
-   const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
-
    const SIMD_4x32 mc_backward[4] = {
       SIMD_4x32(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F),
       SIMD_4x32(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B),
@@ -132,43 +134,46 @@ SIMD_4x32 aes_vperm_encrypt(SIMD_4x32 B, const uint32_t* keys, size_t rounds)
       SIMD_4x32(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003),
    };
 
-   B = shuffle(k_ipt1, low_nibs(B)) ^ shuffle(k_ipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]);
-
-   for(size_t r = 1; ; ++r)
-      {
-      const SIMD_4x32 K(&keys[4*r]);
-
-      SIMD_4x32 t = high_nibs(B);
-      B = low_nibs(B);
-
-      SIMD_4x32 t2 = shuffle(k_inv2, B);
+   const SIMD_4x32 Bh = high_nibs(B);
+   SIMD_4x32 Bl = low_nibs(B);
+   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
+   Bl ^= Bh;
 
-      B ^= t;
+   const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 
-      SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
-      SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B);
+   const SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
+   const SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
 
-      SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3);
-      SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+   return shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
+   }
 
-      if(r == rounds)
-         {
-         return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]);
-         }
+inline SIMD_4x32 aes_enc_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+   {
+   const SIMD_4x32 sbou = SIMD_4x32(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
+   const SIMD_4x32 sbot = SIMD_4x32(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);
 
-      SIMD_4x32 t7 = shuffle(sb1t, t6) ^ shuffle(sb1u, t5) ^ K;
+   const SIMD_4x32 Bh = high_nibs(B);
+   SIMD_4x32 Bl = low_nibs(B);
+   const SIMD_4x32 t2 = shuffle(k_inv2, Bl);
+   Bl ^= Bh;
 
-      SIMD_4x32 t8 = shuffle(sb2t, t6) ^ shuffle(sb2u, t5) ^ shuffle(t7, mc_forward[r % 4]);
+   const SIMD_4x32 t5 = Bl ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bl));
 
-      B = shuffle(t8, mc_forward[r % 4]) ^ shuffle(t7, mc_backward[r % 4]) ^ t8;
-      }
+   return shuffle(shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K, sr[r % 4]);
    }
 
-SIMD_4x32 aes_vperm_decrypt(SIMD_4x32 B, const uint32_t keys[], size_t rounds)
+inline SIMD_4x32 aes_dec_first_round(SIMD_4x32 B, SIMD_4x32 K)
    {
    const SIMD_4x32 k_dipt1 = SIMD_4x32(0x0B545F00, 0x0F505B04, 0x114E451A, 0x154A411E);
    const SIMD_4x32 k_dipt2 = SIMD_4x32(0x60056500, 0x86E383E6, 0xF491F194, 0x12771772);
 
+   return shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ K;
+   }
+
+inline SIMD_4x32 aes_dec_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+   {
    const SIMD_4x32 sb9u = SIMD_4x32(0x9A86D600, 0x851C0353, 0x4F994CC9, 0xCAD51F50);
    const SIMD_4x32 sb9t = SIMD_4x32(0xECD74900, 0xC03B1789, 0xB2FBA565, 0x725E2C9E);
 
@@ -181,55 +186,90 @@ SIMD_4x32 aes_vperm_decrypt(SIMD_4x32 B, const uint32_t keys[], size_t rounds)
    const SIMD_4x32 sbbu = SIMD_4x32(0x96B44200, 0xD0226492, 0xB0F2D404, 0x602646F6);
    const SIMD_4x32 sbbt = SIMD_4x32(0xCD596700, 0xC19498A6, 0x3255AA6B, 0xF3FF0C3E);
 
-   const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
-   const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
+   const SIMD_4x32 mcx[4] = {
+      SIMD_4x32(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09),
+      SIMD_4x32(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605),
+      SIMD_4x32(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201),
+      SIMD_4x32(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D),
+   };
 
-   SIMD_4x32 mc(mc_forward[3]);
+   const SIMD_4x32 Bh = high_nibs(B);
+   B = low_nibs(B);
+   const SIMD_4x32 t2 = shuffle(k_inv2, B);
 
-   B = shuffle(k_dipt1, low_nibs(B)) ^ shuffle(k_dipt2, high_nibs(B)) ^ SIMD_4x32(&keys[0]);
+   B ^= Bh;
 
-   for(size_t r = 1; ; ++r)
-      {
-      const SIMD_4x32 K(&keys[4*r]);
+   const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
 
-      SIMD_4x32 t = high_nibs(B);
-      B = low_nibs(B);
+   const SIMD_4x32 mc = mcx[(r-1)%4];
 
-      SIMD_4x32 t2 = shuffle(k_inv2, B);
+   const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
+   const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
+   const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
+   return shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
+   }
 
-      B ^= t;
+inline SIMD_4x32 aes_dec_last_round(SIMD_4x32 B, SIMD_4x32 K, size_t r)
+   {
+   const SIMD_4x32 sbou = SIMD_4x32(0x7EF94000, 0x1387EA53, 0xD4943E2D, 0xC7AA6DB9);
+   const SIMD_4x32 sbot = SIMD_4x32(0x93441D00, 0x12D7560F, 0xD8C58E9C, 0xCA4B8159);
 
-      const SIMD_4x32 t3 = t2 ^ shuffle(k_inv1, t);
-      const SIMD_4x32 t4 = t2 ^ shuffle(k_inv1, B);
-      const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t3);
-      const SIMD_4x32 t6 = t ^ shuffle(k_inv1, t4);
+   const uint32_t which_sr = ((((r - 1) << 4) ^ 48) & 48) / 16;
 
-      if(r == rounds)
-         {
-         const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K;
-         const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
-         return shuffle(x, sr[which_sr]);
-         }
+   const SIMD_4x32 Bh = high_nibs(B);
+   B = low_nibs(B);
+   const SIMD_4x32 t2 = shuffle(k_inv2, B);
 
-      const SIMD_4x32 t8 = shuffle(sb9t, t6) ^ shuffle(sb9u, t5) ^ K;
-      const SIMD_4x32 t9 = shuffle(t8, mc) ^ shuffle(sbdu, t5) ^ shuffle(sbdt, t6);
-      const SIMD_4x32 t12 = shuffle(t9, mc) ^ shuffle(sbbu, t5) ^ shuffle(sbbt, t6);
+   B ^= Bh;
 
-      B = shuffle(t12, mc) ^ shuffle(sbeu, t5) ^ shuffle(sbet, t6);
+   const SIMD_4x32 t5 = B ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, Bh));
+   const SIMD_4x32 t6 = Bh ^ shuffle(k_inv1, t2 ^ shuffle(k_inv1, B));
 
-      mc = alignr<12>(mc, mc);
-      }
+   const SIMD_4x32 x = shuffle(sbou, t5) ^ shuffle(sbot, t6) ^ K;
+   return shuffle(x, sr[which_sr]);
    }
 
 void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
-                          const uint32_t keys[], size_t rounds)
+                          const SIMD_4x32 K[], size_t rounds)
    {
    CT::poison(in, blocks * 16);
 
-   BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
+   const size_t blocks2 = blocks - (blocks % 2);
+
+   for(size_t i = 0; i != blocks2; i += 2)
+      {
+      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
+      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
+
+      B0 = aes_enc_first_round(B0, K[0]);
+      B1 = aes_enc_first_round(B1, K[0]);
+
+      for(size_t r = 1; r != rounds; ++r)
+         {
+         B0 = aes_enc_round(B0, K[r], r);
+         B1 = aes_enc_round(B1, K[r], r);
+         }
+
+      B0 = aes_enc_last_round(B0, K[rounds], rounds);
+      B1 = aes_enc_last_round(B1, K[rounds], rounds);
+
+      B0.store_le(out + i*16);
+      B1.store_le(out + (i+1)*16);
+      }
+
+   for(size_t i = blocks2; i < blocks; ++i)
       {
       SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
-      B = aes_vperm_encrypt(B, keys, rounds);
+
+      B = aes_enc_first_round(B, K[0]);
+
+      for(size_t r = 1; r != rounds; ++r)
+         {
+         B = aes_enc_round(B, K[r], r);
+         }
+
+      B = aes_enc_last_round(B, K[rounds], rounds);
       B.store_le(out + i*16);
       }
 
@@ -238,14 +278,45 @@ void vperm_encrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
    }
 
 void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
-                          const uint32_t keys[], size_t rounds)
+                          const SIMD_4x32 K[], size_t rounds)
    {
    CT::poison(in, blocks * 16);
 
-   BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
+   const size_t blocks2 = blocks - (blocks % 2);
+
+   for(size_t i = 0; i != blocks2; i += 2)
+      {
+      SIMD_4x32 B0 = SIMD_4x32::load_le(in + i*16);
+      SIMD_4x32 B1 = SIMD_4x32::load_le(in + (i+1)*16);
+
+      B0 = aes_dec_first_round(B0, K[0]);
+      B1 = aes_dec_first_round(B1, K[0]);
+
+      for(size_t r = 1; r != rounds; ++r)
+         {
+         B0 = aes_dec_round(B0, K[r], r);
+         B1 = aes_dec_round(B1, K[r], r);
+         }
+
+      B0 = aes_dec_last_round(B0, K[rounds], rounds);
+      B1 = aes_dec_last_round(B1, K[rounds], rounds);
+
+      B0.store_le(out + i*16);
+      B1.store_le(out + (i+1)*16);
+      }
+
+   for(size_t i = blocks2; i < blocks; ++i)
       {
       SIMD_4x32 B = SIMD_4x32::load_le(in + i*16); // ???
-      B = aes_vperm_decrypt(B, keys, rounds);
+
+      B = aes_dec_first_round(B, K[0]);
+
+      for(size_t r = 1; r != rounds; ++r)
+         {
+         B = aes_dec_round(B, K[r], r);
+         }
+
+      B = aes_dec_last_round(B, K[rounds], rounds);
       B.store_le(out + i*16);
       }
 
@@ -257,32 +328,78 @@ void vperm_decrypt_blocks(const uint8_t in[], uint8_t out[], size_t blocks,
 
 void AES_128::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 10);
+   const SIMD_4x32 K[11] = {
+      SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+      SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+      SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+      SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]),
+   };
+
+   return vperm_encrypt_blocks(in, out, blocks, K, 10);
    }
 
 void AES_128::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 10);
+   const SIMD_4x32 K[11] = {
+      SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+      SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+      SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+      SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]),
+   };
+
+   return vperm_decrypt_blocks(in, out, blocks, K, 10);
    }
 
 void AES_192::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 12);
+   const SIMD_4x32 K[13] = {
+      SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+      SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+      SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+      SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
+      SIMD_4x32(&m_EK[4*12]),
+   };
+
+   return vperm_encrypt_blocks(in, out, blocks, K, 12);
    }
 
 void AES_192::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 12);
+   const SIMD_4x32 K[13] = {
+      SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+      SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+      SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+      SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
+      SIMD_4x32(&m_DK[4*12]),
+   };
+
+   return vperm_decrypt_blocks(in, out, blocks, K, 12);
    }
 
 void AES_256::vperm_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_encrypt_blocks(in, out, blocks, m_EK.data(), 14);
+   const SIMD_4x32 K[15] = {
+      SIMD_4x32(&m_EK[4* 0]), SIMD_4x32(&m_EK[4* 1]), SIMD_4x32(&m_EK[4* 2]),
+      SIMD_4x32(&m_EK[4* 3]), SIMD_4x32(&m_EK[4* 4]), SIMD_4x32(&m_EK[4* 5]),
+      SIMD_4x32(&m_EK[4* 6]), SIMD_4x32(&m_EK[4* 7]), SIMD_4x32(&m_EK[4* 8]),
+      SIMD_4x32(&m_EK[4* 9]), SIMD_4x32(&m_EK[4*10]), SIMD_4x32(&m_EK[4*11]),
+      SIMD_4x32(&m_EK[4*12]), SIMD_4x32(&m_EK[4*13]), SIMD_4x32(&m_EK[4*14]),
+   };
+
+   return vperm_encrypt_blocks(in, out, blocks, K, 14);
    }
 
 void AES_256::vperm_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
    {
-   vperm_decrypt_blocks(in, out, blocks, m_DK.data(), 14);
+   const SIMD_4x32 K[15] = {
+      SIMD_4x32(&m_DK[4* 0]), SIMD_4x32(&m_DK[4* 1]), SIMD_4x32(&m_DK[4* 2]),
+      SIMD_4x32(&m_DK[4* 3]), SIMD_4x32(&m_DK[4* 4]), SIMD_4x32(&m_DK[4* 5]),
+      SIMD_4x32(&m_DK[4* 6]), SIMD_4x32(&m_DK[4* 7]), SIMD_4x32(&m_DK[4* 8]),
+      SIMD_4x32(&m_DK[4* 9]), SIMD_4x32(&m_DK[4*10]), SIMD_4x32(&m_DK[4*11]),
+      SIMD_4x32(&m_DK[4*12]), SIMD_4x32(&m_DK[4*13]), SIMD_4x32(&m_DK[4*14]),
+   };
+
+   return vperm_decrypt_blocks(in, out, blocks, K, 14);
    }
 
 namespace {
diff --git a/src/lib/block/aes/aes_vperm/info.txt b/src/lib/block/aes/aes_vperm/info.txt
index 5ff0c2aa2..064f5d71d 100644
--- a/src/lib/block/aes/aes_vperm/info.txt
+++ b/src/lib/block/aes/aes_vperm/info.txt
@@ -3,7 +3,8 @@ AES_VPERM -> 20190901
 </defines>
 
 <isa>
-neon
+#neon
+ssse3
 </isa>
 
 <requires>
author	Jack Lloyd <[email protected]>	2019-09-01 13:11:29 -0400
committer	Jack Lloyd <[email protected]>	2019-09-03 08:37:00 -0400
commit	1ed06b02aee547435d507098824bb96bdb6d3214 (patch)
tree	e598a21960bc1ddb14de5d79af431a20ee6a337d
parent	d2ef8f38e6ec5d850eafd8d1e6ca0adc99d047c7 (diff)