diff options
author | Jack Lloyd <[email protected]> | 2017-10-12 19:43:35 -0400 |
---|---|---|
committer | Jack Lloyd <[email protected]> | 2017-10-12 19:43:35 -0400 |
commit | ebf147bcf6b84249cf289009ba81c3f3611ea2de (patch) | |
tree | 8c7baf608daa54477ea8c1ffadba9819708051c8 | |
parent | fa8b83578dcffc394b8449207f60662d7e562728 (diff) |
OCB optimizations
From ~5 cpb (cycles per byte) to ~2.5 cpb on Skylake
-rw-r--r-- | src/lib/block/block_cipher.h | 39 | ||||
-rw-r--r-- | src/lib/modes/aead/ocb/ocb.cpp | 92 | ||||
-rw-r--r-- | src/lib/modes/aead/ocb/ocb.h | 8 | ||||
-rw-r--r-- | src/lib/modes/xts/xts.cpp | 10 | ||||
-rw-r--r-- | src/lib/utils/bit_ops.h | 11 | ||||
-rw-r--r-- | src/lib/utils/mem_ops.cpp | 61 | ||||
-rw-r--r-- | src/lib/utils/mem_ops.h | 66 |
7 files changed, 163 insertions, 124 deletions
diff --git a/src/lib/block/block_cipher.h b/src/lib/block/block_cipher.h index 03e8a5512..f8c6e47b8 100644 --- a/src/lib/block/block_cipher.h +++ b/src/lib/block/block_cipher.h @@ -168,6 +168,26 @@ class BOTAN_PUBLIC_API(2,0) BlockCipher : public SymmetricAlgorithm virtual void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const = 0; + virtual void encrypt_n_xex(uint8_t data[], + const uint8_t mask[], + size_t blocks) const + { + const size_t BS = block_size(); + xor_buf(data, mask, blocks * BS); + encrypt_n(data, data, blocks); + xor_buf(data, mask, blocks * BS); + } + + virtual void decrypt_n_xex(uint8_t data[], + const uint8_t mask[], + size_t blocks) const + { + const size_t BS = block_size(); + xor_buf(data, mask, blocks * BS); + decrypt_n(data, data, blocks); + xor_buf(data, mask, blocks * BS); + } + /** * @return new object representing the same algorithm as *this */ @@ -186,6 +206,25 @@ class Block_Cipher_Fixed_Params : public BlockCipher enum { BLOCK_SIZE = BS }; size_t block_size() const override { return BS; } + // override to take advantage of compile time constant block size + void encrypt_n_xex(uint8_t data[], + const uint8_t mask[], + size_t blocks) const override + { + xor_buf(data, mask, blocks * BS); + encrypt_n(data, data, blocks); + xor_buf(data, mask, blocks * BS); + } + + void decrypt_n_xex(uint8_t data[], + const uint8_t mask[], + size_t blocks) const override + { + xor_buf(data, mask, blocks * BS); + decrypt_n(data, data, blocks); + xor_buf(data, mask, blocks * BS); + } + Key_Length_Specification key_spec() const override { return Key_Length_Specification(KMIN, KMAX, KMOD); diff --git a/src/lib/modes/aead/ocb/ocb.cpp b/src/lib/modes/aead/ocb/ocb.cpp index 9a7448161..3e134a642 100644 --- a/src/lib/modes/aead/ocb/ocb.cpp +++ b/src/lib/modes/aead/ocb/ocb.cpp @@ -23,13 +23,22 @@ class L_computer final cipher.encrypt(m_L_star); m_L_dollar = poly_double(star()); m_L.push_back(poly_double(dollar())); + + while(m_L.size() < 8) + 
m_L.push_back(poly_double(m_L.back())); } const secure_vector<uint8_t>& star() const { return m_L_star; } const secure_vector<uint8_t>& dollar() const { return m_L_dollar; } - const secure_vector<uint8_t>& operator()(size_t i) const { return get(i); } + const secure_vector<uint8_t>& get(size_t i) const + { + while(m_L.size() <= i) + m_L.push_back(poly_double(m_L.back())); + + return m_L[i]; + } const secure_vector<uint8_t>& compute_offsets(secure_vector<uint8_t>& offset, @@ -37,11 +46,12 @@ class L_computer final size_t blocks, size_t BS) const { - m_offset_buf.resize(blocks * BS); + if(m_offset_buf.size() < blocks * BS) + m_offset_buf.resize(blocks * BS); for(size_t i = 0; i != blocks; ++i) { // could be done in parallel - offset ^= get(ctz(block_index + 1 + i)); + offset ^= get(ctz<uint32_t>(block_index + 1 + i)); copy_mem(&m_offset_buf[BS*i], offset.data(), BS); } @@ -49,14 +59,6 @@ class L_computer final } private: - const secure_vector<uint8_t>& get(size_t i) const - { - while(m_L.size() <= i) - m_L.push_back(poly_double(m_L.back())); - - return m_L.at(i); - } - secure_vector<uint8_t> poly_double(const secure_vector<uint8_t>& in) const { secure_vector<uint8_t> out(in.size()); @@ -90,7 +92,7 @@ secure_vector<uint8_t> ocb_hash(const L_computer& L, for(size_t i = 0; i != ad_blocks; ++i) { // this loop could run in parallel - offset ^= L(ctz(i+1)); + offset ^= L.get(ctz(i+1)); buf = offset; xor_buf(buf.data(), &ad[BS*i], BS); @@ -106,7 +108,7 @@ secure_vector<uint8_t> ocb_hash(const L_computer& L, buf = offset; xor_buf(buf.data(), &ad[BS*ad_blocks], ad_remainder); - buf[ad_len % BS] ^= 0x80; + buf[ad_remainder] ^= 0x80; cipher.encrypt(buf); @@ -123,9 +125,11 @@ OCB_Mode::OCB_Mode(BlockCipher* cipher, size_t tag_size) : m_checksum(m_cipher->parallel_bytes()), m_offset(m_cipher->block_size()), m_ad_hash(m_cipher->block_size()), - m_tag_size(tag_size) + m_tag_size(tag_size), + m_block_size(m_cipher->block_size()), + m_par_blocks(m_cipher->parallel_bytes() / 
m_block_size) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); /* * draft-krovetz-ocb-wide-d1 specifies OCB for several other block @@ -162,10 +166,10 @@ bool OCB_Mode::valid_nonce_length(size_t length) const { if(length == 0) return false; - if(m_cipher->block_size() == 16) + if(block_size() == 16) return length < 16; else - return length < (m_cipher->block_size() - 1); + return length < (block_size() - 1); } std::string OCB_Mode::name() const @@ -198,7 +202,7 @@ void OCB_Mode::set_associated_data(const uint8_t ad[], size_t ad_len) secure_vector<uint8_t> OCB_Mode::update_nonce(const uint8_t nonce[], size_t nonce_len) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); BOTAN_ASSERT(BS == 16 || BS == 24 || BS == 32 || BS == 64, "OCB block size is supported"); @@ -300,23 +304,20 @@ void OCB_Mode::start_msg(const uint8_t nonce[], size_t nonce_len) void OCB_Encryption::encrypt(uint8_t buffer[], size_t blocks) { - const size_t BS = m_cipher->block_size(); - const size_t par_blocks = m_checksum.size() / BS; + const size_t BS = block_size(); + + BOTAN_ASSERT(m_L, "A key was set"); while(blocks) { - const size_t proc_blocks = std::min(blocks, par_blocks); + const size_t proc_blocks = std::min(blocks, par_blocks()); const size_t proc_bytes = proc_blocks * BS; - BOTAN_ASSERT(m_L, "A key was set"); - const auto& offsets = m_L->compute_offsets(m_offset, m_block_index, proc_blocks, BS); xor_buf(m_checksum.data(), buffer, proc_bytes); - xor_buf(buffer, offsets.data(), proc_bytes); - m_cipher->encrypt_n(buffer, buffer, proc_blocks); - xor_buf(buffer, offsets.data(), proc_bytes); + m_cipher->encrypt_n_xex(buffer, offsets.data(), proc_blocks); buffer += proc_bytes; blocks -= proc_blocks; @@ -326,7 +327,7 @@ void OCB_Encryption::encrypt(uint8_t buffer[], size_t blocks) size_t OCB_Encryption::process(uint8_t buf[], size_t sz) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); BOTAN_ASSERT(sz % BS 
== 0, "Invalid OCB input size"); encrypt(buf, sz / BS); return sz; @@ -334,7 +335,7 @@ size_t OCB_Encryption::process(uint8_t buf[], size_t sz) void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); BOTAN_ASSERT(buffer.size() >= offset, "Offset is sane"); const size_t sz = buffer.size() - offset; @@ -357,17 +358,19 @@ void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset) m_offset ^= m_L->star(); // Offset_* - secure_vector<uint8_t> zeros(BS); - m_cipher->encrypt(m_offset, zeros); - xor_buf(remainder, zeros.data(), remainder_bytes); + secure_vector<uint8_t> pad(BS); + m_cipher->encrypt(m_offset, pad); + xor_buf(remainder, pad.data(), remainder_bytes); } } secure_vector<uint8_t> checksum(BS); // fold checksum - for(size_t i = 0; i != m_checksum.size(); ++i) - checksum[i % checksum.size()] ^= m_checksum[i]; + for(size_t i = 0; i != m_checksum.size(); i += BS) + { + xor_buf(checksum.data(), m_checksum.data() + i, BS); + } // now compute the tag secure_vector<uint8_t> mac = m_offset; @@ -385,23 +388,16 @@ void OCB_Encryption::finish(secure_vector<uint8_t>& buffer, size_t offset) void OCB_Decryption::decrypt(uint8_t buffer[], size_t blocks) { - const size_t BS = m_cipher->block_size(); - const size_t par_bytes = m_cipher->parallel_bytes(); - - BOTAN_ASSERT(par_bytes % BS == 0, "Cipher is parallel in full blocks"); - - const size_t par_blocks = par_bytes / BS; + const size_t BS = block_size(); while(blocks) { - const size_t proc_blocks = std::min(blocks, par_blocks); + const size_t proc_blocks = std::min(blocks, par_blocks()); const size_t proc_bytes = proc_blocks * BS; const auto& offsets = m_L->compute_offsets(m_offset, m_block_index, proc_blocks, BS); - xor_buf(buffer, offsets.data(), proc_bytes); - m_cipher->decrypt_n(buffer, buffer, proc_blocks); - xor_buf(buffer, offsets.data(), proc_bytes); + m_cipher->decrypt_n_xex(buffer, offsets.data(), 
proc_blocks); xor_buf(m_checksum.data(), buffer, proc_bytes); @@ -413,7 +409,7 @@ void OCB_Decryption::decrypt(uint8_t buffer[], size_t blocks) size_t OCB_Decryption::process(uint8_t buf[], size_t sz) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); BOTAN_ASSERT(sz % BS == 0, "Invalid OCB input size"); decrypt(buf, sz / BS); return sz; @@ -421,7 +417,7 @@ size_t OCB_Decryption::process(uint8_t buf[], size_t sz) void OCB_Decryption::finish(secure_vector<uint8_t>& buffer, size_t offset) { - const size_t BS = m_cipher->block_size(); + const size_t BS = block_size(); BOTAN_ASSERT(buffer.size() >= offset, "Offset is sane"); const size_t sz = buffer.size() - offset; @@ -459,8 +455,10 @@ void OCB_Decryption::finish(secure_vector<uint8_t>& buffer, size_t offset) secure_vector<uint8_t> checksum(BS); // fold checksum - for(size_t i = 0; i != m_checksum.size(); ++i) - checksum[i % checksum.size()] ^= m_checksum[i]; + for(size_t i = 0; i != m_checksum.size(); i += BS) + { + xor_buf(checksum.data(), m_checksum.data() + i, BS); + } // compute the mac secure_vector<uint8_t> mac = m_offset; diff --git a/src/lib/modes/aead/ocb/ocb.h b/src/lib/modes/aead/ocb/ocb.h index 4188e8574..f4a54ee30 100644 --- a/src/lib/modes/aead/ocb/ocb.h +++ b/src/lib/modes/aead/ocb/ocb.h @@ -55,6 +55,10 @@ class BOTAN_PUBLIC_API(2,0) OCB_Mode : public AEAD_Mode */ OCB_Mode(BlockCipher* cipher, size_t tag_size); + size_t block_size() const { return m_block_size; } + size_t par_blocks() const { return m_par_blocks; } + size_t par_bytes() const { return m_checksum.size(); } + // fixme make these private std::unique_ptr<BlockCipher> m_cipher; std::unique_ptr<L_computer> m_L; @@ -71,7 +75,9 @@ class BOTAN_PUBLIC_API(2,0) OCB_Mode : public AEAD_Mode secure_vector<uint8_t> update_nonce(const uint8_t nonce[], size_t nonce_len); - const size_t m_tag_size = 0; + const size_t m_tag_size; + const size_t m_block_size; + const size_t m_par_blocks; secure_vector<uint8_t> m_last_nonce; 
secure_vector<uint8_t> m_stretch; }; diff --git a/src/lib/modes/xts/xts.cpp b/src/lib/modes/xts/xts.cpp index 53e959258..496b71c5f 100644 --- a/src/lib/modes/xts/xts.cpp +++ b/src/lib/modes/xts/xts.cpp @@ -119,11 +119,8 @@ size_t XTS_Encryption::process(uint8_t buf[], size_t sz) while(blocks) { const size_t to_proc = std::min(blocks, blocks_in_tweak); - const size_t to_proc_bytes = to_proc * BS; - xor_buf(buf, tweak(), to_proc_bytes); - cipher().encrypt_n(buf, buf, to_proc); - xor_buf(buf, tweak(), to_proc_bytes); + cipher().encrypt_n_xex(buf, tweak(), to_proc); buf += to_proc * BS; blocks -= to_proc; @@ -195,11 +192,8 @@ size_t XTS_Decryption::process(uint8_t buf[], size_t sz) while(blocks) { const size_t to_proc = std::min(blocks, blocks_in_tweak); - const size_t to_proc_bytes = to_proc * BS; - xor_buf(buf, tweak(), to_proc_bytes); - cipher().decrypt_n(buf, buf, to_proc); - xor_buf(buf, tweak(), to_proc_bytes); + cipher().decrypt_n_xex(buf, tweak(), to_proc); buf += to_proc * BS; blocks -= to_proc; diff --git a/src/lib/utils/bit_ops.h b/src/lib/utils/bit_ops.h index a59404c75..2da0e55fb 100644 --- a/src/lib/utils/bit_ops.h +++ b/src/lib/utils/bit_ops.h @@ -102,6 +102,17 @@ inline size_t ctz(T n) return 8*sizeof(T); } +#if defined(BOTAN_BUILD_COMPILER_IS_GCC) + +template<> +inline size_t ctz(uint32_t n) + { + return __builtin_ctz(n); + } + +#endif + + template<typename T> size_t ceil_log2(T x) { diff --git a/src/lib/utils/mem_ops.cpp b/src/lib/utils/mem_ops.cpp index 29c93eb15..3fd463195 100644 --- a/src/lib/utils/mem_ops.cpp +++ b/src/lib/utils/mem_ops.cpp @@ -53,65 +53,4 @@ bool constant_time_compare(const uint8_t x[], return difference == 0; } -void xor_buf(uint8_t x[], - const uint8_t y[], - size_t len) - { - while(len >= 16) - { - x[0] ^= y[0]; - x[1] ^= y[1]; - x[2] ^= y[2]; - x[3] ^= y[3]; - x[4] ^= y[4]; - x[5] ^= y[5]; - x[6] ^= y[6]; - x[7] ^= y[7]; - x[8] ^= y[8]; - x[9] ^= y[9]; - x[10] ^= y[10]; - x[11] ^= y[11]; - x[12] ^= y[12]; - x[13] ^= y[13]; - 
x[14] ^= y[14]; - x[15] ^= y[15]; - x += 16; y += 16; len -= 16; - } - - for(size_t i = 0; i != len; ++i) - { - x[i] ^= y[i]; - } - } - -void xor_buf(uint8_t out[], - const uint8_t in[], - const uint8_t in2[], - size_t length) - { - while(length >= 16) - { - out[0] = in[0] ^ in2[0]; - out[1] = in[1] ^ in2[1]; - out[2] = in[2] ^ in2[2]; - out[3] = in[3] ^ in2[3]; - out[4] = in[4] ^ in2[4]; - out[5] = in[5] ^ in2[5]; - out[6] = in[6] ^ in2[6]; - out[7] = in[7] ^ in2[7]; - out[8] = in[8] ^ in2[8]; - out[9] = in[9] ^ in2[9]; - out[10] = in[10] ^ in2[10]; - out[11] = in[11] ^ in2[11]; - out[12] = in[12] ^ in2[12]; - out[13] = in[13] ^ in2[13]; - out[14] = in[14] ^ in2[14]; - out[15] = in[15] ^ in2[15]; - in += 16; in2 += 16; out += 16; length -= 16; - } - - for(size_t i = 0; i != length; ++i) - out[i] = in[i] ^ in2[i]; - } - } diff --git a/src/lib/utils/mem_ops.h b/src/lib/utils/mem_ops.h index ed4d6cb27..175f38e2f 100644 --- a/src/lib/utils/mem_ops.h +++ b/src/lib/utils/mem_ops.h @@ -160,9 +160,36 @@ template<typename T> inline bool same_mem(const T* p1, const T* p2, size_t n) * @param in the read-only input buffer * @param length the length of the buffers */ -BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[], - const uint8_t in[], - size_t length); +inline void xor_buf(uint8_t out[], + const uint8_t in[], + size_t length) + { + while(length >= 16) + { + out[0] ^= in[0]; + out[1] ^= in[1]; + out[2] ^= in[2]; + out[3] ^= in[3]; + out[4] ^= in[4]; + out[5] ^= in[5]; + out[6] ^= in[6]; + out[7] ^= in[7]; + out[8] ^= in[8]; + out[9] ^= in[9]; + out[10] ^= in[10]; + out[11] ^= in[11]; + out[12] ^= in[12]; + out[13] ^= in[13]; + out[14] ^= in[14]; + out[15] ^= in[15]; + out += 16; in += 16; length -= 16; + } + + for(size_t i = 0; i != length; ++i) + { + out[i] ^= in[i]; + } + } /** * XOR arrays. 
Postcondition out[i] = in[i] ^ in2[i] forall i = 0...length @@ -171,10 +198,35 @@ BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[], * @param in2 the second output buffer * @param length the length of the three buffers */ -BOTAN_PUBLIC_API(2,3) void xor_buf(uint8_t out[], - const uint8_t in[], - const uint8_t in2[], - size_t length); +inline void xor_buf(uint8_t out[], + const uint8_t in[], + const uint8_t in2[], + size_t length) + { + while(length >= 16) + { + out[0] = in[0] ^ in2[0]; + out[1] = in[1] ^ in2[1]; + out[2] = in[2] ^ in2[2]; + out[3] = in[3] ^ in2[3]; + out[4] = in[4] ^ in2[4]; + out[5] = in[5] ^ in2[5]; + out[6] = in[6] ^ in2[6]; + out[7] = in[7] ^ in2[7]; + out[8] = in[8] ^ in2[8]; + out[9] = in[9] ^ in2[9]; + out[10] = in[10] ^ in2[10]; + out[11] = in[11] ^ in2[11]; + out[12] = in[12] ^ in2[12]; + out[13] = in[13] ^ in2[13]; + out[14] = in[14] ^ in2[14]; + out[15] = in[15] ^ in2[15]; + in += 16; in2 += 16; out += 16; length -= 16; + } + + for(size_t i = 0; i != length; ++i) + out[i] = in[i] ^ in2[i]; + } template<typename Alloc, typename Alloc2> void xor_buf(std::vector<uint8_t, Alloc>& out, |