aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlloyd <[email protected]>2009-10-23 01:13:56 +0000
committerlloyd <[email protected]>2009-10-23 01:13:56 +0000
commita6fc4c8a57b6a65f1b893b4f6de59b74c66ff84b (patch)
tree4348074c96efa157aea71fdef7fd6cf69cc94f67
parent377a213c32f33d42e66bad1eb7f7c66b63c1249a (diff)
Simply unrolling the loop in XTEA and processing four blocks' worth of data at
a time more than doubles performance (from 38 MB/s to 90 MB/s on a Core2 Q6600). SIMD could probably do even better, but this is fast and easy, and it works everywhere. It will probably hurt on 32-bit x86 because of the register pressure.
-rw-r--r--src/block/xtea/xtea.cpp70
1 files changed, 70 insertions, 0 deletions
diff --git a/src/block/xtea/xtea.cpp b/src/block/xtea/xtea.cpp
index 77543e1e8..83c7d9ce5 100644
--- a/src/block/xtea/xtea.cpp
+++ b/src/block/xtea/xtea.cpp
@@ -8,13 +8,75 @@
#include <botan/xtea.h>
#include <botan/loadstor.h>
+#include <stdio.h>
+
namespace Botan {
+namespace {
+
+/* Encrypt four blocks (two 32-bit words each) from in to out, using all 64 entries of EK */
+void xtea_encrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
+   {
+   u32bit left0 = load_be<u32bit>(in, 0), right0 = load_be<u32bit>(in, 1);
+   u32bit left1 = load_be<u32bit>(in, 2), right1 = load_be<u32bit>(in, 3);
+   u32bit left2 = load_be<u32bit>(in, 4), right2 = load_be<u32bit>(in, 5);
+   u32bit left3 = load_be<u32bit>(in, 6), right3 = load_be<u32bit>(in, 7);
+   // Each round consumes one pair of EK entries and is applied to all four blocks
+   for(u32bit k = 0; k != 64; k += 2)
+      {
+      const u32bit key_l = EK[k], key_r = EK[k+1];
+      left0 += (right0 + ((right0 << 4) ^ (right0 >> 5))) ^ key_l;
+      left1 += (right1 + ((right1 << 4) ^ (right1 >> 5))) ^ key_l;
+      left2 += (right2 + ((right2 << 4) ^ (right2 >> 5))) ^ key_l;
+      left3 += (right3 + ((right3 << 4) ^ (right3 >> 5))) ^ key_l;
+      right0 += (left0 + ((left0 << 4) ^ (left0 >> 5))) ^ key_r;
+      right1 += (left1 + ((left1 << 4) ^ (left1 >> 5))) ^ key_r;
+      right2 += (left2 + ((left2 << 4) ^ (left2 >> 5))) ^ key_r;
+      right3 += (left3 + ((left3 << 4) ^ (left3 >> 5))) ^ key_r;
+      }
+   store_be(out     , left0, right0, left1, right1);
+   store_be(out + 16, left2, right2, left3, right3);
+   }
+
+/* Decrypt four blocks (two 32-bit words each) from in to out, using all 64 entries of EK */
+void xtea_decrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
+   {
+   u32bit left0 = load_be<u32bit>(in, 0), right0 = load_be<u32bit>(in, 1);
+   u32bit left1 = load_be<u32bit>(in, 2), right1 = load_be<u32bit>(in, 3);
+   u32bit left2 = load_be<u32bit>(in, 4), right2 = load_be<u32bit>(in, 5);
+   u32bit left3 = load_be<u32bit>(in, 6), right3 = load_be<u32bit>(in, 7);
+   // EK is walked backwards, from EK[63] down to EK[0], one pair of entries per round
+   for(u32bit k = 64; k != 0; k -= 2)
+      {
+      const u32bit key_r = EK[k-1], key_l = EK[k-2];
+      right0 -= (left0 + ((left0 << 4) ^ (left0 >> 5))) ^ key_r;
+      right1 -= (left1 + ((left1 << 4) ^ (left1 >> 5))) ^ key_r;
+      right2 -= (left2 + ((left2 << 4) ^ (left2 >> 5))) ^ key_r;
+      right3 -= (left3 + ((left3 << 4) ^ (left3 >> 5))) ^ key_r;
+      left0 -= (right0 + ((right0 << 4) ^ (right0 >> 5))) ^ key_l;
+      left1 -= (right1 + ((right1 << 4) ^ (right1 >> 5))) ^ key_l;
+      left2 -= (right2 + ((right2 << 4) ^ (right2 >> 5))) ^ key_l;
+      left3 -= (right3 + ((right3 << 4) ^ (right3 >> 5))) ^ key_l;
+      }
+   store_be(out     , left0, right0, left1, right1);
+   store_be(out + 16, left2, right2, left3, right3);
+   }
+
+}
+
/*
* XTEA Encryption
*/
void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
{
+ while(blocks >= 4)
+ {
+ xtea_encrypt_4(in, out, this->EK);
+ in += 4 * BLOCK_SIZE;
+ out += 4 * BLOCK_SIZE;
+ blocks -= 4;
+ }
+
for(u32bit i = 0; i != blocks; ++i)
{
u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
@@ -37,6 +99,14 @@ void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
*/
void XTEA::decrypt_n(const byte in[], byte out[], u32bit blocks) const
{
+ while(blocks >= 4)
+ {
+ xtea_decrypt_4(in, out, this->EK);
+ in += 4 * BLOCK_SIZE;
+ out += 4 * BLOCK_SIZE;
+ blocks -= 4;
+ }
+
for(u32bit i = 0; i != blocks; ++i)
{
u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);