diff options
author | lloyd <[email protected]> | 2009-10-23 01:13:56 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2009-10-23 01:13:56 +0000 |
commit | a6fc4c8a57b6a65f1b893b4f6de59b74c66ff84b (patch) | |
tree | 4348074c96efa157aea71fdef7fd6cf69cc94f67 | |
parent | 377a213c32f33d42e66bad1eb7f7c66b63c1249a (diff) |
Simply unrolling the loop in XTEA and processing 4 blocks worth of data at
a time more than doubles performance (from 38 MB/s to 90 MB/s on Core2 Q6600).
Could do even better with SIMD, I'm sure, but this is fast and easy, and
works everywhere.
Probably will hurt on 32-bit x86 from the register pressure.
-rw-r--r-- | src/block/xtea/xtea.cpp | 70 |
1 files changed, 70 insertions, 0 deletions
```diff
diff --git a/src/block/xtea/xtea.cpp b/src/block/xtea/xtea.cpp
index 77543e1e8..83c7d9ce5 100644
--- a/src/block/xtea/xtea.cpp
+++ b/src/block/xtea/xtea.cpp
@@ -8,13 +8,75 @@
 #include <botan/xtea.h>
 #include <botan/loadstor.h>
 
+#include <stdio.h>
+
 namespace Botan {
 
+namespace {
+
+void xtea_encrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
+   {
+   u32bit L0 = load_be<u32bit>(in, 0), R0 = load_be<u32bit>(in, 1);
+   u32bit L1 = load_be<u32bit>(in, 2), R1 = load_be<u32bit>(in, 3);
+   u32bit L2 = load_be<u32bit>(in, 4), R2 = load_be<u32bit>(in, 5);
+   u32bit L3 = load_be<u32bit>(in, 6), R3 = load_be<u32bit>(in, 7);
+
+   for(u32bit i = 0; i != 32; ++i)
+      {
+      L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*i];
+      L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*i];
+      L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*i];
+      L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*i];
+
+      R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*i+1];
+      R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*i+1];
+      R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*i+1];
+      R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*i+1];
+      }
+
+   store_be(out     , L0, R0, L1, R1);
+   store_be(out + 16, L2, R2, L3, R3);
+   }
+
+void xtea_decrypt_4(const byte in[32], byte out[32], const u32bit EK[64])
+   {
+   u32bit L0 = load_be<u32bit>(in, 0), R0 = load_be<u32bit>(in, 1);
+   u32bit L1 = load_be<u32bit>(in, 2), R1 = load_be<u32bit>(in, 3);
+   u32bit L2 = load_be<u32bit>(in, 4), R2 = load_be<u32bit>(in, 5);
+   u32bit L3 = load_be<u32bit>(in, 6), R3 = load_be<u32bit>(in, 7);
+
+   for(u32bit i = 0; i != 32; ++i)
+      {
+      R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*i];
+      R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*i];
+      R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*i];
+      R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*i];
+
+      L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*i];
+      L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*i];
+      L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*i];
+      L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*i];
+      }
+
+   store_be(out     , L0, R0, L1, R1);
+   store_be(out + 16, L2, R2, L3, R3);
+   }
+
+}
+
 /*
 * XTEA Encryption
 */
 void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
    {
+   while(blocks >= 4)
+      {
+      xtea_encrypt_4(in, out, this->EK);
+      in += 4 * BLOCK_SIZE;
+      out += 4 * BLOCK_SIZE;
+      blocks -= 4;
+      }
+
    for(u32bit i = 0; i != blocks; ++i)
       {
       u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
@@ -37,6 +99,14 @@ void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const
 */
 void XTEA::decrypt_n(const byte in[], byte out[], u32bit blocks) const
    {
+   while(blocks >= 4)
+      {
+      xtea_decrypt_4(in, out, this->EK);
+      in += 4 * BLOCK_SIZE;
+      out += 4 * BLOCK_SIZE;
+      blocks -= 4;
+      }
+
    for(u32bit i = 0; i != blocks; ++i)
       {
       u32bit L = load_be<u32bit>(in, 0), R = load_be<u32bit>(in, 1);
```