From a6fc4c8a57b6a65f1b893b4f6de59b74c66ff84b Mon Sep 17 00:00:00 2001 From: lloyd Date: Fri, 23 Oct 2009 01:13:56 +0000 Subject: Simply unrolling the loop in XTEA and processing 4 blocks worth of data at a time more than doubles performance (from 38 MB/s to 90 MB/s on Core2 Q6600). Could do even better with SIMD, I'm sure, but this is fast and easy, and works everywhere. Probably will hurt on 32-bit x86 from the register pressure. --- src/block/xtea/xtea.cpp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'src/block/xtea') diff --git a/src/block/xtea/xtea.cpp b/src/block/xtea/xtea.cpp index 77543e1e8..83c7d9ce5 100644 --- a/src/block/xtea/xtea.cpp +++ b/src/block/xtea/xtea.cpp @@ -8,13 +8,75 @@ #include #include +#include + namespace Botan { +namespace { + +void xtea_encrypt_4(const byte in[32], byte out[32], const u32bit EK[64]) + { + u32bit L0 = load_be(in, 0), R0 = load_be(in, 1); + u32bit L1 = load_be(in, 2), R1 = load_be(in, 3); + u32bit L2 = load_be(in, 4), R2 = load_be(in, 5); + u32bit L3 = load_be(in, 6), R3 = load_be(in, 7); + + for(u32bit i = 0; i != 32; ++i) + { + L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*i]; + L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*i]; + L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*i]; + L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*i]; + + R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*i+1]; + R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*i+1]; + R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*i+1]; + R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*i+1]; + } + + store_be(out , L0, R0, L1, R1); + store_be(out + 16, L2, R2, L3, R3); + } + +void xtea_decrypt_4(const byte in[32], byte out[32], const u32bit EK[64]) + { + u32bit L0 = load_be(in, 0), R0 = load_be(in, 1); + u32bit L1 = load_be(in, 2), R1 = load_be(in, 3); + u32bit L2 = load_be(in, 4), R2 = load_be(in, 5); + u32bit L3 = load_be(in, 6), R3 = load_be(in, 7); + + for(u32bit i = 0; i != 32; ++i) + { + R0 -= (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[63 - 2*i]; + R1 -= (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[63 - 2*i]; + R2 -= (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[63 - 2*i]; + R3 -= (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[63 - 2*i]; + + L0 -= (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[62 - 2*i]; + L1 -= (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[62 - 2*i]; + L2 -= (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[62 - 2*i]; + L3 -= (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[62 - 2*i]; + } + + store_be(out , L0, R0, L1, R1); + store_be(out + 16, L2, R2, L3, R3); + } + +} + /* * XTEA Encryption */ void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const { + while(blocks >= 4) + { + xtea_encrypt_4(in, out, this->EK); + in += 4 * BLOCK_SIZE; + out += 4 * BLOCK_SIZE; + blocks -= 4; + } + for(u32bit i = 0; i != blocks; ++i) { u32bit L = load_be(in, 0), R = load_be(in, 1); @@ -37,6 +99,14 @@ void XTEA::encrypt_n(const byte in[], byte out[], u32bit blocks) const */ void XTEA::decrypt_n(const byte in[], byte out[], u32bit blocks) const { + while(blocks >= 4) + { + xtea_decrypt_4(in, out, this->EK); + in += 4 * BLOCK_SIZE; + out += 4 * BLOCK_SIZE; + blocks -= 4; + } + for(u32bit i = 0; i != blocks; ++i) { u32bit L = load_be(in, 0), R = load_be(in, 1); -- cgit v1.2.3