ICP: Improve AES-GCM performance

Currently SIMD-accelerated AES-GCM performance is limited by two factors: a. The need to disable preemption and interrupts and save the FPU state before using it and to do the reverse when done. Due to the way the code is organized (see (b) below) we have to pay this price twice for each 16-byte GCM block processed. b. Most processing is done in C, operating on single GCM blocks. The use of SIMD instructions is limited to the AES encryption of the counter block (AES-NI) and the Galois multiplication (PCLMULQDQ). This leads to the FPU not being fully utilized for crypto operations. To solve (a) we do crypto processing in larger chunks while owning the FPU. An `icp_gcm_avx_chunk_size` module parameter was introduced to make this chunk size tweakable. It defaults to 32 KiB. This step alone roughly doubles performance. (b) is tackled by porting and using the highly optimized OpenSSL AES-GCM assembler routines, which do all the processing (CTR, AES, GMULT) in a single routine. Both steps together result in up to a 32x reduction of the time spent in the en/decryption routines, leading to a throughput increase of up to approximately 12x for large (128 KiB) blocks. Lastly, this commit changes the default encryption algorithm from AES-CCM to AES-GCM when setting the `encryption=on` property. Reviewed-By: Brian Behlendorf <[email protected]> Reviewed-By: Jason King <[email protected]> Reviewed-By: Tom Caputi <[email protected]> Reviewed-By: Richard Laager <[email protected]> Signed-off-by: Attila Fülöp <[email protected]> Closes #9749
author: Attila Fülöp <[email protected]> 2020-02-10 21:59:50 +0100
committer: GitHub <[email protected]> 2020-02-10 12:59:50 -0800
commit: 31b160f0a6c673c8f926233af2ed6d5354808393 (patch)
tree: 185a34810ead2f887d7e4f08f18ba28b1e257c49 /lib
parent: fa3922df75d722724dd51819cb49da0418d054f5 (diff)
2 files changed, 16 insertions, 1 deletions
diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am
index 02dfce22f..fad91e13c 100644
--- a/lib/libicp/Makefile.am
+++ b/lib/libicp/Makefile.am
@@ -15,6 +15,8 @@ ASM_SOURCES_AS = \
 	asm-x86_64/aes/aes_amd64.S \
 	asm-x86_64/aes/aes_aesni.S \
 	asm-x86_64/modes/gcm_pclmulqdq.S \
+	asm-x86_64/modes/aesni-gcm-x86_64.S \
+	asm-x86_64/modes/ghash-x86_64.S \
 	asm-x86_64/sha1/sha1-x86_64.S \
 	asm-x86_64/sha2/sha256_impl.S \
 	asm-x86_64/sha2/sha512_impl.S
diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h
index 32864e153..844abe5c6 100644
--- a/lib/libspl/include/sys/simd.h
+++ b/lib/libspl/include/sys/simd.h
@@ -77,7 +77,8 @@ typedef enum cpuid_inst_sets {
 	AVX512ER,
 	AVX512VL,
 	AES,
-	PCLMULQDQ
+	PCLMULQDQ,
+	MOVBE
 } cpuid_inst_sets_t;
 
 /*
@@ -101,6 +102,7 @@ typedef struct cpuid_feature_desc {
 #define	_AVX512VL_BIT		(1U << 31) /* if used also check other levels */
 #define	_AES_BIT		(1U << 25)
 #define	_PCLMULQDQ_BIT		(1U << 1)
+#define	_MOVBE_BIT		(1U << 22)
 
 /*
  * Descriptions of supported instruction sets
@@ -128,6 +130,7 @@ static const cpuid_feature_desc_t cpuid_features[] = {
 	[AVX512VL]	= {7U, 0U, _AVX512ER_BIT,	EBX	},
 	[AES]		= {1U, 0U, _AES_BIT,		ECX	},
 	[PCLMULQDQ]	= {1U, 0U, _PCLMULQDQ_BIT,	ECX	},
+	[MOVBE]		= {1U, 0U, _MOVBE_BIT,		ECX	},
 };
 
 /*
@@ -200,6 +203,7 @@ CPUID_FEATURE_CHECK(avx512er, AVX512ER);
 CPUID_FEATURE_CHECK(avx512vl, AVX512VL);
 CPUID_FEATURE_CHECK(aes, AES);
 CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ);
+CPUID_FEATURE_CHECK(movbe, MOVBE);
 
 /*
  * Detect register set support
@@ -333,6 +337,15 @@ zfs_pclmulqdq_available(void)
 }
 
 /*
+ * Check if MOVBE instruction is available
+ */
+static inline boolean_t
+zfs_movbe_available(void)
+{
+	return (__cpuid_has_movbe());
+}
+
+/*
  * AVX-512 family of instruction sets:
  *
  * AVX512F	Foundation
author	Attila Fülöp <[email protected]>	2020-02-10 21:59:50 +0100
committer	GitHub <[email protected]>	2020-02-10 12:59:50 -0800
commit	31b160f0a6c673c8f926233af2ed6d5354808393 (patch)
tree	185a34810ead2f887d7e4f08f18ba28b1e257c49 /lib
parent	fa3922df75d722724dd51819cb49da0418d054f5 (diff)