diff options
author | lloyd <[email protected]> | 2008-09-09 17:21:01 +0000 |
---|---|---|
committer | lloyd <[email protected]> | 2008-09-09 17:21:01 +0000 |
commit | 3983d92b673f6eeb5c4d27fbe8aa8aa8fbb62c91 (patch) | |
tree | b562d27515705945c6aaee694ae8fa3c531cf27b /src | |
parent | 78eebcfd022891b8b3d71e756da55a9a09dcae35 (diff) |
Use individual variables instead of an array inside the Salsa20 function;
this shows a 35% speedup on my Core2 with G++ vs. the previous version.
Diffstat (limited to 'src')
-rw-r--r-- | src/salsa20.cpp | 102 |
1 files changed, 65 insertions, 37 deletions
diff --git a/src/salsa20.cpp b/src/salsa20.cpp index 710e043f7..d11ba8799 100644 --- a/src/salsa20.cpp +++ b/src/salsa20.cpp @@ -18,48 +18,76 @@ namespace { *************************************************/ void salsa20(byte output[64], u32bit input[16]) { - u32bit x[16]; - - copy_mem(x, input, 16); + u32bit x00 = input[0]; + u32bit x01 = input[1]; + u32bit x02 = input[2]; + u32bit x03 = input[3]; + u32bit x04 = input[4]; + u32bit x05 = input[5]; + u32bit x06 = input[6]; + u32bit x07 = input[7]; + u32bit x08 = input[8]; + u32bit x09 = input[9]; + u32bit x10 = input[10]; + u32bit x11 = input[11]; + u32bit x12 = input[12]; + u32bit x13 = input[13]; + u32bit x14 = input[14]; + u32bit x15 = input[15]; for(u32bit i = 0; i != 10; ++i) { - x[ 4] ^= rotate_left(x[ 0] + x[12], 7); - x[ 8] ^= rotate_left(x[ 4] + x[ 0], 9); - x[12] ^= rotate_left(x[ 8] + x[ 4], 13); - x[ 0] ^= rotate_left(x[12] + x[ 8], 18); - x[ 9] ^= rotate_left(x[ 5] + x[ 1], 7); - x[13] ^= rotate_left(x[ 9] + x[ 5], 9); - x[ 1] ^= rotate_left(x[13] + x[ 9], 13); - x[ 5] ^= rotate_left(x[ 1] + x[13], 18); - x[14] ^= rotate_left(x[10] + x[ 6], 7); - x[ 2] ^= rotate_left(x[14] + x[10], 9); - x[ 6] ^= rotate_left(x[ 2] + x[14], 13); - x[10] ^= rotate_left(x[ 6] + x[ 2], 18); - x[ 3] ^= rotate_left(x[15] + x[11], 7); - x[ 7] ^= rotate_left(x[ 3] + x[15], 9); - x[11] ^= rotate_left(x[ 7] + x[ 3], 13); - x[15] ^= rotate_left(x[11] + x[ 7], 18); - x[ 1] ^= rotate_left(x[ 0] + x[ 3], 7); - x[ 2] ^= rotate_left(x[ 1] + x[ 0], 9); - x[ 3] ^= rotate_left(x[ 2] + x[ 1], 13); - x[ 0] ^= rotate_left(x[ 3] + x[ 2], 18); - x[ 6] ^= rotate_left(x[ 5] + x[ 4], 7); - x[ 7] ^= rotate_left(x[ 6] + x[ 5], 9); - x[ 4] ^= rotate_left(x[ 7] + x[ 6], 13); - x[ 5] ^= rotate_left(x[ 4] + x[ 7], 18); - x[11] ^= rotate_left(x[10] + x[ 9], 7); - x[ 8] ^= rotate_left(x[11] + x[10], 9); - x[ 9] ^= rotate_left(x[ 8] + x[11], 13); - x[10] ^= rotate_left(x[ 9] + x[ 8], 18); - x[12] ^= rotate_left(x[15] + x[14], 7); - x[13] ^= 
rotate_left(x[12] + x[15], 9); - x[14] ^= rotate_left(x[13] + x[12], 13); - x[15] ^= rotate_left(x[14] + x[13], 18); + x04 ^= rotate_left(x00 + x12, 7); + x08 ^= rotate_left(x04 + x00, 9); + x12 ^= rotate_left(x08 + x04, 13); + x00 ^= rotate_left(x12 + x08, 18); + x09 ^= rotate_left(x05 + x01, 7); + x13 ^= rotate_left(x09 + x05, 9); + x01 ^= rotate_left(x13 + x09, 13); + x05 ^= rotate_left(x01 + x13, 18); + x14 ^= rotate_left(x10 + x06, 7); + x02 ^= rotate_left(x14 + x10, 9); + x06 ^= rotate_left(x02 + x14, 13); + x10 ^= rotate_left(x06 + x02, 18); + x03 ^= rotate_left(x15 + x11, 7); + x07 ^= rotate_left(x03 + x15, 9); + x11 ^= rotate_left(x07 + x03, 13); + x15 ^= rotate_left(x11 + x07, 18); + + x01 ^= rotate_left(x00 + x03, 7); + x02 ^= rotate_left(x01 + x00, 9); + x03 ^= rotate_left(x02 + x01, 13); + x00 ^= rotate_left(x03 + x02, 18); + x06 ^= rotate_left(x05 + x04, 7); + x07 ^= rotate_left(x06 + x05, 9); + x04 ^= rotate_left(x07 + x06, 13); + x05 ^= rotate_left(x04 + x07, 18); + x11 ^= rotate_left(x10 + x09, 7); + x08 ^= rotate_left(x11 + x10, 9); + x09 ^= rotate_left(x08 + x11, 13); + x10 ^= rotate_left(x09 + x08, 18); + x12 ^= rotate_left(x15 + x14, 7); + x13 ^= rotate_left(x12 + x15, 9); + x14 ^= rotate_left(x13 + x12, 13); + x15 ^= rotate_left(x14 + x13, 18); } - for(u32bit i = 0; i != 16; ++i) - store_le(x[i] + input[i], output + 4 * i); + store_le(x00 + input[ 0], output + 4 * 0); + store_le(x01 + input[ 1], output + 4 * 1); + store_le(x02 + input[ 2], output + 4 * 2); + store_le(x03 + input[ 3], output + 4 * 3); + store_le(x04 + input[ 4], output + 4 * 4); + store_le(x05 + input[ 5], output + 4 * 5); + store_le(x06 + input[ 6], output + 4 * 6); + store_le(x07 + input[ 7], output + 4 * 7); + store_le(x08 + input[ 8], output + 4 * 8); + store_le(x09 + input[ 9], output + 4 * 9); + store_le(x10 + input[10], output + 4 * 10); + store_le(x11 + input[11], output + 4 * 11); + store_le(x12 + input[12], output + 4 * 12); + store_le(x13 + input[13], output + 4 * 
13); + store_le(x14 + input[14], output + 4 * 14); + store_le(x15 + input[15], output + 4 * 15); ++input[8]; if(!input[8]) |