aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorlloyd <[email protected]>2008-09-09 17:21:01 +0000
committerlloyd <[email protected]>2008-09-09 17:21:01 +0000
commit3983d92b673f6eeb5c4d27fbe8aa8aa8fbb62c91 (patch)
treeb562d27515705945c6aaee694ae8fa3c531cf27b /src
parent78eebcfd022891b8b3d71e756da55a9a09dcae35 (diff)
Use individual variables instead of an array inside the Salsa20 function;
this shows a 35% speedup on my Core2 with G++ vs. the previous version.
Diffstat (limited to 'src')
-rw-r--r--src/salsa20.cpp102
1 files changed, 65 insertions, 37 deletions
diff --git a/src/salsa20.cpp b/src/salsa20.cpp
index 710e043f7..d11ba8799 100644
--- a/src/salsa20.cpp
+++ b/src/salsa20.cpp
@@ -18,48 +18,76 @@ namespace {
*************************************************/
void salsa20(byte output[64], u32bit input[16])
{
- u32bit x[16];
-
- copy_mem(x, input, 16);
+ u32bit x00 = input[0];
+ u32bit x01 = input[1];
+ u32bit x02 = input[2];
+ u32bit x03 = input[3];
+ u32bit x04 = input[4];
+ u32bit x05 = input[5];
+ u32bit x06 = input[6];
+ u32bit x07 = input[7];
+ u32bit x08 = input[8];
+ u32bit x09 = input[9];
+ u32bit x10 = input[10];
+ u32bit x11 = input[11];
+ u32bit x12 = input[12];
+ u32bit x13 = input[13];
+ u32bit x14 = input[14];
+ u32bit x15 = input[15];
for(u32bit i = 0; i != 10; ++i)
{
- x[ 4] ^= rotate_left(x[ 0] + x[12], 7);
- x[ 8] ^= rotate_left(x[ 4] + x[ 0], 9);
- x[12] ^= rotate_left(x[ 8] + x[ 4], 13);
- x[ 0] ^= rotate_left(x[12] + x[ 8], 18);
- x[ 9] ^= rotate_left(x[ 5] + x[ 1], 7);
- x[13] ^= rotate_left(x[ 9] + x[ 5], 9);
- x[ 1] ^= rotate_left(x[13] + x[ 9], 13);
- x[ 5] ^= rotate_left(x[ 1] + x[13], 18);
- x[14] ^= rotate_left(x[10] + x[ 6], 7);
- x[ 2] ^= rotate_left(x[14] + x[10], 9);
- x[ 6] ^= rotate_left(x[ 2] + x[14], 13);
- x[10] ^= rotate_left(x[ 6] + x[ 2], 18);
- x[ 3] ^= rotate_left(x[15] + x[11], 7);
- x[ 7] ^= rotate_left(x[ 3] + x[15], 9);
- x[11] ^= rotate_left(x[ 7] + x[ 3], 13);
- x[15] ^= rotate_left(x[11] + x[ 7], 18);
- x[ 1] ^= rotate_left(x[ 0] + x[ 3], 7);
- x[ 2] ^= rotate_left(x[ 1] + x[ 0], 9);
- x[ 3] ^= rotate_left(x[ 2] + x[ 1], 13);
- x[ 0] ^= rotate_left(x[ 3] + x[ 2], 18);
- x[ 6] ^= rotate_left(x[ 5] + x[ 4], 7);
- x[ 7] ^= rotate_left(x[ 6] + x[ 5], 9);
- x[ 4] ^= rotate_left(x[ 7] + x[ 6], 13);
- x[ 5] ^= rotate_left(x[ 4] + x[ 7], 18);
- x[11] ^= rotate_left(x[10] + x[ 9], 7);
- x[ 8] ^= rotate_left(x[11] + x[10], 9);
- x[ 9] ^= rotate_left(x[ 8] + x[11], 13);
- x[10] ^= rotate_left(x[ 9] + x[ 8], 18);
- x[12] ^= rotate_left(x[15] + x[14], 7);
- x[13] ^= rotate_left(x[12] + x[15], 9);
- x[14] ^= rotate_left(x[13] + x[12], 13);
- x[15] ^= rotate_left(x[14] + x[13], 18);
+ x04 ^= rotate_left(x00 + x12, 7);
+ x08 ^= rotate_left(x04 + x00, 9);
+ x12 ^= rotate_left(x08 + x04, 13);
+ x00 ^= rotate_left(x12 + x08, 18);
+ x09 ^= rotate_left(x05 + x01, 7);
+ x13 ^= rotate_left(x09 + x05, 9);
+ x01 ^= rotate_left(x13 + x09, 13);
+ x05 ^= rotate_left(x01 + x13, 18);
+ x14 ^= rotate_left(x10 + x06, 7);
+ x02 ^= rotate_left(x14 + x10, 9);
+ x06 ^= rotate_left(x02 + x14, 13);
+ x10 ^= rotate_left(x06 + x02, 18);
+ x03 ^= rotate_left(x15 + x11, 7);
+ x07 ^= rotate_left(x03 + x15, 9);
+ x11 ^= rotate_left(x07 + x03, 13);
+ x15 ^= rotate_left(x11 + x07, 18);
+
+ x01 ^= rotate_left(x00 + x03, 7);
+ x02 ^= rotate_left(x01 + x00, 9);
+ x03 ^= rotate_left(x02 + x01, 13);
+ x00 ^= rotate_left(x03 + x02, 18);
+ x06 ^= rotate_left(x05 + x04, 7);
+ x07 ^= rotate_left(x06 + x05, 9);
+ x04 ^= rotate_left(x07 + x06, 13);
+ x05 ^= rotate_left(x04 + x07, 18);
+ x11 ^= rotate_left(x10 + x09, 7);
+ x08 ^= rotate_left(x11 + x10, 9);
+ x09 ^= rotate_left(x08 + x11, 13);
+ x10 ^= rotate_left(x09 + x08, 18);
+ x12 ^= rotate_left(x15 + x14, 7);
+ x13 ^= rotate_left(x12 + x15, 9);
+ x14 ^= rotate_left(x13 + x12, 13);
+ x15 ^= rotate_left(x14 + x13, 18);
}
- for(u32bit i = 0; i != 16; ++i)
- store_le(x[i] + input[i], output + 4 * i);
+ store_le(x00 + input[ 0], output + 4 * 0);
+ store_le(x01 + input[ 1], output + 4 * 1);
+ store_le(x02 + input[ 2], output + 4 * 2);
+ store_le(x03 + input[ 3], output + 4 * 3);
+ store_le(x04 + input[ 4], output + 4 * 4);
+ store_le(x05 + input[ 5], output + 4 * 5);
+ store_le(x06 + input[ 6], output + 4 * 6);
+ store_le(x07 + input[ 7], output + 4 * 7);
+ store_le(x08 + input[ 8], output + 4 * 8);
+ store_le(x09 + input[ 9], output + 4 * 9);
+ store_le(x10 + input[10], output + 4 * 10);
+ store_le(x11 + input[11], output + 4 * 11);
+ store_le(x12 + input[12], output + 4 * 12);
+ store_le(x13 + input[13], output + 4 * 13);
+ store_le(x14 + input[14], output + 4 * 14);
+ store_le(x15 + input[15], output + 4 * 15);
++input[8];
if(!input[8])