aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Lloyd <[email protected]>2017-10-12 19:43:35 -0400
committerJack Lloyd <[email protected]>2017-10-12 19:43:35 -0400
commitebf147bcf6b84249cf289009ba81c3f3611ea2de (patch)
tree8c7baf608daa54477ea8c1ffadba9819708051c8
parentfa8b83578dcffc394b8449207f60662d7e562728 (diff)
OCB optimizations
From ~5 cpb (cycles per byte) to ~2.5 cpb on Skylake
-rw-r--r--src/lib/block/block_cipher.h39
-rw-r--r--src/lib/modes/aead/ocb/ocb.cpp92
-rw-r--r--src/lib/modes/aead/ocb/ocb.h8
-rw-r--r--src/lib/modes/xts/xts.cpp10
-rw-r--r--src/lib/utils/bit_ops.h11
-rw-r--r--src/lib/utils/mem_ops.cpp61
-rw-r--r--src/lib/utils/mem_ops.h66
7 files changed, 163 insertions, 124 deletions
diff --git a/src/lib/block/block_cipher.h b/src/lib/block/block_cipher.h
index 03e8a5512..f8c6e47b8 100644
--- a/src/lib/block/block_cipher.h
+++ b/src/lib/block/block_cipher.h
@@ -168,6 +168,26 @@ class BOTAN_PUBLIC_API(2,0) BlockCipher : public SymmetricAlgorithm
virtual void decrypt_n(const uint8_t in[], uint8_t out[],
size_t blocks) const = 0;
+ virtual void encrypt_n_xex(uint8_t data[], // in/out buffer, transformed in place
+ const uint8_t mask[], // blocks * block_size() bytes are XORed into data before and after the cipher call
+ size_t blocks) const // number of whole blocks to process
+ {
+ const size_t BS = block_size();
+ xor_buf(data, mask, blocks * BS); // mask the input
+ encrypt_n(data, data, blocks); // must be the encrypt direction: callers use this in place of xor / encrypt_n / xor
+ xor_buf(data, mask, blocks * BS); // mask the output with the same bytes
+ }
+
+ virtual void decrypt_n_xex(uint8_t data[], // in/out buffer, transformed in place
+ const uint8_t mask[], // blocks * block_size() bytes are XORed into data before and after the cipher call
+ size_t blocks) const // number of whole blocks to process
+ {
+ const size_t BS = block_size();
+ xor_buf(data, mask, blocks * BS); // mask the input
+ decrypt_n(data, data, blocks); // must be the decrypt direction: callers use this in place of xor / decrypt_n / xor
+ xor_buf(data, mask, blocks * BS); // mask the output with the same bytes
+ }
+
/**
* @return new object representing the same algorithm as *this
*/
@@ -186,6 +206,25 @@ class Block_Cipher_Fixed_Params : public BlockCipher
enum { BLOCK_SIZE = BS };
size_t block_size() const override { return BS; }
+ // override to take advantage of compile time constant block size
+ void encrypt_n_xex(uint8_t data[], // in/out buffer, transformed in place
+ const uint8_t mask[], // blocks * BS bytes are XORed into data before and after encryption
+ size_t blocks) const override // number of whole blocks to process
+ {
+ xor_buf(data, mask, blocks * BS); // BS is the same value block_size() returns for this class
+ encrypt_n(data, data, blocks); // encrypt the masked blocks in place
+ xor_buf(data, mask, blocks * BS); // apply the same mask to the cipher output
+ }
+
+ void decrypt_n_xex(uint8_t data[], // in/out buffer, transformed in place
+ const uint8_t mask[], // blocks * BS bytes are XORed into data before and after decryption
+ size_t blocks) const override // number of whole blocks to process
+ {
+ xor_buf(data, mask, blocks * BS); // BS is the same value block_size() returns for this class
+ decrypt_n(data, data, blocks); // decrypt the masked blocks in place
+ xor_buf(data, mask, blocks * BS); // apply the same mask to the cipher output
+ }
+
Key_Length_Specification key_spec() const override
{
return Key_Length_Specification(KMIN, KMAX, KMOD);
diff --git a/src/lib/modes/aead/ocb/ocb.cpp b/src/lib/modes/aead/ocb/ocb.cpp
index 9a7448161..3e134a642 100644
--- a/src/lib/modes/aead/ocb/ocb.cpp
+++ b/src/lib/modes/aead/ocb/ocb.cpp
@@ -23,13 +23,22 @@ class L_computer final
cipher.encrypt(m_L_star);
m_L_dollar = poly_double(star());
m_L.push_back(poly_double(dollar()));
+
+ while(m_L.size() < 8)
+ m_L.push_back(poly_double(m_L.back()));
}
const secure_vector<uint8_t>& star() const { return m_L_star; }
const secure_vector<uint8_t>& dollar() const { return m_L_dollar; }
- const secure_vector<uint8_t>& operator()(size_t i) const { return get(i); }
+ const secure_vector<uint8_t>& get(size_t i) const // returns entry i of m_L, extending the table first if needed
+ {
+ while(m_L.size() <= i) // grow until index i exists; m_L is modified from a const method, so presumably declared mutable
+ m_L.push_back(poly_double(m_L.back())); // each new entry is poly_double of the previous last entry
+
+ return m_L[i]; // unchecked indexing; the loop above ensures i < m_L.size()
+ }
const secure_vector<uint8_t>&
compute_offsets(secure_vector<uint8_t>& offset,
@@ -37,11 +46,12 @@ class L_computer final
size_t blocks,
size_t BS) const
{
- m_offset_buf.resize(blocks * BS);
+ if(m_offset_buf.size() < blocks * BS)
+ m_offset_buf.resize(blocks * BS);
for(size_t i = 0; i != blocks; ++i)
{ // could be done in parallel
- offset ^= get(ctz(block_index + 1 + i));
+ offset ^= get(ctz<uint32_t>(block_index + 1 + i));
copy_mem(&m_offset_buf[BS*i], offset.data(), BS);
}
@@ -49,14 +59,6 @@ class L_computer final
}
private:
- const secure_vector<uint8_t>& get(size_t i) const
- {
- while(m_L.size() <= i)
- m_L.push_back(poly_double(m_L.back()));
-
- return m_L.at(i);
- }
-
secure_vector<uint8_t> poly_double(const secure_vector<uint8_t>& in) const
{
secure_vector<uint8_t> out(in.size());
@@ -90,7 +92,7 @@ secure_vector<uint8_t> ocb_hash(const L_computer& L,
for(size_t i = 0; i != ad_blocks; ++i)
{
// this loop could run in parallel
- offset ^= L(ctz(i+1));
+ offset ^= L.get(ctz(i+1));
buf = offset;
xor_buf(buf.data(), &ad[BS*i], BS);
@@ -106,7 +108,7 @@ secure_vector<uint8_t> ocb_hash(const L_computer& L,
buf = offset;
xor_buf(buf.data(), &ad[BS*ad_blocks], ad_remainder);
- buf[ad_len % BS] ^= 0x80;
+ buf[ad_remainder] ^= 0x80;
cipher.encrypt(buf);
@@ -123,9 +125,11 @@ OCB_Mode::OCB_Mode(BlockCipher* cipher, size_t tag_size) :
m_checksum(m_cipher->parallel_bytes()),
m_offset(m_cipher->block_size()),
m_ad_hash(m_cipher->block_size()),
- m_tag_size(tag_size)
+ m_tag_size(tag_size),
+ m_block_size(m_cipher->block_size()),
+ m_par_blocks(m_cipher->parallel_bytes() / m_block_size)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
/*
* draft-krovetz-ocb-wide-d1 specifies OCB for several other block
@@ -162,10 +166,10 @@ bool OCB_Mode::valid_nonce_length(size_t length) const
{
if(length == 0)
return false;
- if(m_cipher->block_size() == 16)
+ if(block_size() == 16)
return length < 16;
else
- return length < (m_cipher->block_size() - 1);
+ return length < (block_size() - 1);
}
std::string OCB_Mode::name() const
@@ -198,7 +202,7 @@ void OCB_Mode::set_associated_data(const uint8_t ad[], size_t ad_len)
secure_vector<uint8_t>
OCB_Mode::update_nonce(const uint8_t nonce[], size_t nonce_len)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
BOTAN_ASSERT(BS == 16 || BS == 24 || BS == 32 || BS == 64,
"OCB block size is supported");
@@ -300,23 +304,20 @@ void OCB_Mode::start_msg(const uint8_t nonce[], size_t nonce_len)
void OCB_Encryption::encrypt(uint8_t buffer[], size_t blocks)
{
- const size_t BS = m_cipher->block_size();
- const size_t par_blocks = m_checksum.size() / BS;
+ const size_t BS = block_size();
+
+ BOTAN_ASSERT(m_L, "A key was set");
while(blocks)
{
- const size_t proc_blocks = std::min(blocks, par_blocks);
+ const size_t proc_blocks = std::min(blocks, par_blocks());
const size_t proc_bytes = proc_blocks * BS;
- BOTAN_ASSERT(m_L, "A key was set");
-
const auto& offsets = m_L->compute_offsets(m_offset, m_block_index, proc_blocks, BS);
xor_buf(m_checksum.data(), buffer, proc_bytes);
- xor_buf(buffer, offsets.data(), proc_bytes);
- m_cipher->encrypt_n(buffer, buffer, proc_blocks);
- xor_buf(buffer, offsets.data(), proc_bytes);
+ m_cipher->encrypt_n_xex(buffer, offsets.data(), proc_blocks);
buffer += proc_bytes;
blocks -= proc_blocks;
@@ -326,7 +327,7 @@ void OCB_Encryption::encrypt(uint8_t buffer[], size_t blocks)
size_t OCB_Encryption::process(uint8_t buf[], size_t sz)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
BOTAN_ASSERT(sz % BS == 0, "Invalid OCB input size");
encrypt(buf, sz / BS);
return sz;
@@ -334,7 +335,7 @@ size_t OCB_Encryption::process(uint8_t buf[], size_t sz)
void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
BOTAN_ASSERT(buffer.size() >= offset, "Offset is sane");
const size_t sz = buffer.size() - offset;
@@ -357,17 +358,19 @@ void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset)
m_offset ^= m_L->star(); // Offset_*
- secure_vector<uint8_t> zeros(BS);
- m_cipher->encrypt(m_offset, zeros);
- xor_buf(remainder, zeros.data(), remainder_bytes);
+ secure_vector<uint8_t> pad(BS);
+ m_cipher->encrypt(m_offset, pad);
+ xor_buf(remainder, pad.data(), remainder_bytes);
}
}
secure_vector<uint8_t> checksum(BS);
// fold checksum
- for(size_t i = 0; i != m_checksum.size(); ++i)
- checksum[i % checksum.size()] ^= m_checksum[i];
+ for(size_t i = 0; i != m_checksum.size(); i += BS)
+ {
+ xor_buf(checksum.data(), m_checksum.data() + i, BS);
+ }
// now compute the tag
secure_vector<uint8_t> mac = m_offset;
@@ -385,23 +388,16 @@ void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset)
void OCB_Decryption::decrypt(uint8_t buffer[], size_t blocks)
{
- const size_t BS = m_cipher->block_size();
- const size_t par_bytes = m_cipher->parallel_bytes();
-
- BOTAN_ASSERT(par_bytes % BS == 0, "Cipher is parallel in full blocks");
-
- const size_t par_blocks = par_bytes / BS;
+ const size_t BS = block_size();
while(blocks)
{
- const size_t proc_blocks = std::min(blocks, par_blocks);
+ const size_t proc_blocks = std::min(blocks, par_blocks());
const size_t proc_bytes = proc_blocks * BS;
const auto& offsets = m_L->compute_offsets(m_offset, m_block_index, proc_blocks, BS);
- xor_buf(buffer, offsets.data(), proc_bytes);
- m_cipher->decrypt_n(buffer, buffer, proc_blocks);
- xor_buf(buffer, offsets.data(), proc_bytes);
+ m_cipher->decrypt_n_xex(buffer, offsets.data(), proc_blocks);
xor_buf(m_checksum.data(), buffer, proc_bytes);
@@ -413,7 +409,7 @@ void OCB_Decryption::decrypt(uint8_t buffer[], size_t blocks)
size_t OCB_Decryption::process(uint8_t buf[], size_t sz)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
BOTAN_ASSERT(sz % BS == 0, "Invalid OCB input size");
decrypt(buf, sz / BS);
return sz;
@@ -421,7 +417,7 @@ size_t OCB_Decryption::process(uint8_t buf[], size_t sz)
void OCB_Decryption::finish(secure_vector<uint8_t>& buffer, size_t offset)
{
- const size_t BS = m_cipher->block_size();
+ const size_t BS = block_size();
BOTAN_ASSERT(buffer.size() >= offset, "Offset is sane");
const size_t sz = buffer.size() - offset;
@@ -459,8 +455,10 @@ void OCB_Decryption::finish(secure_vector<uint8_t>& buffer, size_t offset)
secure_vector<uint8_t> checksum(BS);
// fold checksum
- for(size_t i = 0; i != m_checksum.size(); ++i)
- checksum[i % checksum.size()] ^= m_checksum[i];
+ for(size_t i = 0; i != m_checksum.size(); i += BS)
+ {
+ xor_buf(checksum.data(), m_checksum.data() + i, BS);
+ }
// compute the mac
secure_vector<uint8_t> mac = m_offset;
diff --git a/src/lib/modes/aead/ocb/ocb.h b/src/lib/modes/aead/ocb/ocb.h
index 4188e8574..f4a54ee30 100644
--- a/src/lib/modes/aead/ocb/ocb.h
+++ b/src/lib/modes/aead/ocb/ocb.h
@@ -55,6 +55,10 @@ class BOTAN_PUBLIC_API(2,0) OCB_Mode : public AEAD_Mode
*/
OCB_Mode(BlockCipher* cipher, size_t tag_size);
+ size_t block_size() const { return m_block_size; } // cached in the constructor from m_cipher->block_size()
+ size_t par_blocks() const { return m_par_blocks; } // cached in the constructor as m_cipher->parallel_bytes() / m_block_size
+ size_t par_bytes() const { return m_checksum.size(); } // m_checksum is constructed with m_cipher->parallel_bytes() elements
+
// fixme make these private
std::unique_ptr<BlockCipher> m_cipher;
std::unique_ptr<L_computer> m_L;
@@ -71,7 +75,9 @@ class BOTAN_PUBLIC_API(2,0) OCB_Mode : public AEAD_Mode
secure_vector<uint8_t> update_nonce(const uint8_t nonce[], size_t nonce_len);
- const size_t m_tag_size = 0;
+ const size_t m_tag_size;
+ const size_t m_block_size;
+ const size_t m_par_blocks;
secure_vector<uint8_t> m_last_nonce;
secure_vector<uint8_t> m_stretch;
};
diff --git a/src/lib/modes/xts/xts.cpp b/src/lib/modes/xts/xts.cpp
index 53e959258..496b71c5f 100644
--- a/src/lib/modes/xts/xts.cpp
+++ b/src/lib/modes/xts/xts.cpp
@@ -119,11 +119,8 @@ size_t XTS_Encryption::process(uint8_t buf[], size_t sz)
while(blocks)
{
const size_t to_proc = std::min(blocks, blocks_in_tweak);
- const size_t to_proc_bytes = to_proc * BS;
- xor_buf(buf, tweak(), to_proc_bytes);
- cipher().encrypt_n(buf, buf, to_proc);
- xor_buf(buf, tweak(), to_proc_bytes);
+ cipher().encrypt_n_xex(buf, tweak(), to_proc);
buf += to_proc * BS;
blocks -= to_proc;
@@ -195,11 +192,8 @@ size_t XTS_Decryption::process(uint8_t buf[], size_t sz)
while(blocks)
{
const size_t to_proc = std::min(blocks, blocks_in_tweak);
- const size_t to_proc_bytes = to_proc * BS;
- xor_buf(buf, tweak(), to_proc_bytes);
- cipher().decrypt_n(buf, buf, to_proc);
- xor_buf(buf, tweak(), to_proc_bytes);
+ cipher().decrypt_n_xex(buf, tweak(), to_proc);
buf += to_proc * BS;
blocks -= to_proc;
diff --git a/src/lib/utils/bit_ops.h b/src/lib/utils/bit_ops.h
index a59404c75..2da0e55fb 100644
--- a/src/lib/utils/bit_ops.h
+++ b/src/lib/utils/bit_ops.h
@@ -102,6 +102,17 @@ inline size_t ctz(T n)
return 8*sizeof(T);
}
+#if defined(BOTAN_BUILD_COMPILER_IS_GCC)
+
+template<>
+inline size_t ctz(uint32_t n) // count of trailing zero bits in n, using the compiler builtin
+ {
+ return (n == 0) ? 8*sizeof(uint32_t) : static_cast<size_t>(__builtin_ctz(n)); // __builtin_ctz(0) is undefined; return the generic ctz result for zero
+ }
+
+#endif
+
+
template<typename T>
size_t ceil_log2(T x)
{
diff --git a/src/lib/utils/mem_ops.cpp b/src/lib/utils/mem_ops.cpp
index 29c93eb15..3fd463195 100644
--- a/src/lib/utils/mem_ops.cpp
+++ b/src/lib/utils/mem_ops.cpp
@@ -53,65 +53,4 @@ bool constant_time_compare(const uint8_t x[],
return difference == 0;
}
-void xor_buf(uint8_t x[],
- const uint8_t y[],
- size_t len)
- {
- while(len >= 16)
- {
- x[0] ^= y[0];
- x[1] ^= y[1];
- x[2] ^= y[2];
- x[3] ^= y[3];
- x[4] ^= y[4];
- x[5] ^= y[5];
- x[6] ^= y[6];
- x[7] ^= y[7];
- x[8] ^= y[8];
- x[9] ^= y[9];
- x[10] ^= y[10];
- x[11] ^= y[11];
- x[12] ^= y[12];
- x[13] ^= y[13];
- x[14] ^= y[14];
- x[15] ^= y[15];
- x += 16; y += 16; len -= 16;
- }
-
- for(size_t i = 0; i != len; ++i)
- {
- x[i] ^= y[i];
- }
- }
-
-void xor_buf(uint8_t out[],
- const uint8_t in[],
- const uint8_t in2[],
- size_t length)
- {
- while(length >= 16)
- {
- out[0] = in[0] ^ in2[0];
- out[1] = in[1] ^ in2[1];
- out[2] = in[2] ^ in2[2];
- out[3] = in[3] ^ in2[3];
- out[4] = in[4] ^ in2[4];
- out[5] = in[5] ^ in2[5];
- out[6] = in[6] ^ in2[6];
- out[7] = in[7] ^ in2[7];
- out[8] = in[8] ^ in2[8];
- out[9] = in[9] ^ in2[9];
- out[10] = in[10] ^ in2[10];
- out[11] = in[11] ^ in2[11];
- out[12] = in[12] ^ in2[12];
- out[13] = in[13] ^ in2[13];
- out[14] = in[14] ^ in2[14];
- out[15] = in[15] ^ in2[15];
- in += 16; in2 += 16; out += 16; length -= 16;
- }
-
- for(size_t i = 0; i != length; ++i)
- out[i] = in[i] ^ in2[i];
- }
-
}
diff --git a/src/lib/utils/mem_ops.h b/src/lib/utils/mem_ops.h
index ed4d6cb27..175f38e2f 100644
--- a/src/lib/utils/mem_ops.h
+++ b/src/lib/utils/mem_ops.h
@@ -160,9 +160,36 @@ template<typename T> inline bool same_mem(const T* p1, const T* p2, size_t n)
* @param in the read-only input buffer
* @param length the length of the buffers
*/
-BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[],
- const uint8_t in[],
- size_t length);
+inline void xor_buf(uint8_t out[], // defined inline in the header; this patch removes the out-of-line definition from mem_ops.cpp
+ const uint8_t in[],
+ size_t length)
+ {
+ while(length >= 16) // manually unrolled: 16 bytes per iteration
+ {
+ out[0] ^= in[0];
+ out[1] ^= in[1];
+ out[2] ^= in[2];
+ out[3] ^= in[3];
+ out[4] ^= in[4];
+ out[5] ^= in[5];
+ out[6] ^= in[6];
+ out[7] ^= in[7];
+ out[8] ^= in[8];
+ out[9] ^= in[9];
+ out[10] ^= in[10];
+ out[11] ^= in[11];
+ out[12] ^= in[12];
+ out[13] ^= in[13];
+ out[14] ^= in[14];
+ out[15] ^= in[15];
+ out += 16; in += 16; length -= 16; // advance both pointers past the 16 bytes just processed
+ }
+
+ for(size_t i = 0; i != length; ++i) // remaining tail of fewer than 16 bytes
+ {
+ out[i] ^= in[i];
+ }
+ }
/**
* XOR arrays. Postcondition out[i] = in[i] ^ in2[i] forall i = 0...length
@@ -171,10 +198,35 @@ BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[],
* @param in2 the second output buffer
* @param length the length of the three buffers
*/
-BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[],
- const uint8_t in[],
- const uint8_t in2[],
- size_t length);
+inline void xor_buf(uint8_t out[], // defined inline in the header; this patch removes the out-of-line definition from mem_ops.cpp
+ const uint8_t in[],
+ const uint8_t in2[],
+ size_t length)
+ {
+ while(length >= 16) // manually unrolled: 16 bytes per iteration
+ {
+ out[0] = in[0] ^ in2[0];
+ out[1] = in[1] ^ in2[1];
+ out[2] = in[2] ^ in2[2];
+ out[3] = in[3] ^ in2[3];
+ out[4] = in[4] ^ in2[4];
+ out[5] = in[5] ^ in2[5];
+ out[6] = in[6] ^ in2[6];
+ out[7] = in[7] ^ in2[7];
+ out[8] = in[8] ^ in2[8];
+ out[9] = in[9] ^ in2[9];
+ out[10] = in[10] ^ in2[10];
+ out[11] = in[11] ^ in2[11];
+ out[12] = in[12] ^ in2[12];
+ out[13] = in[13] ^ in2[13];
+ out[14] = in[14] ^ in2[14];
+ out[15] = in[15] ^ in2[15];
+ in += 16; in2 += 16; out += 16; length -= 16; // advance all three pointers past the 16 bytes just processed
+ }
+
+ for(size_t i = 0; i != length; ++i) // remaining tail of fewer than 16 bytes
+ out[i] = in[i] ^ in2[i];
+ }
template<typename Alloc, typename Alloc2>
void xor_buf(std::vector<uint8_t, Alloc>& out,