Linux 5.0 compat: SIMD compatibility

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This is accomplished by leveraging the fact that by definition dedicated kernel threads never need to concern themselves with saving and restoring the user FPU state. Therefore, they may use the FPU as long as we can guarantee user tasks always restore their FPU state before context switching back to user space. For the 5.0 and 5.1 kernels disabling preemption and local interrupts is sufficient to allow the FPU to be used. All non-kernel threads will restore the preserved user FPU state. For 5.2 and later kernels the user FPU state restoration will be skipped if the kernel determines the registers have not changed. Therefore, for these kernels we need to perform the additional step of saving and restoring the FPU registers. Invalidating the per-cpu global tracking the FPU state would force a restore but that functionality is private to the core x86 FPU implementation and unavailable. In practice, restricting SIMD to kernel threads is not a major restriction for ZFS. The vast majority of SIMD operations are already performed by the IO pipeline. The remaining cases are relatively infrequent and can be handled by the generic code without significant impact. The two most noteworthy cases are: 1) Decrypting the wrapping key for an encrypted dataset, i.e. `zfs load-key`. All other encryption and decryption operations will use the SIMD optimized implementations. 2) Generating the payload checksums for a `zfs send` stream. In order to avoid making any changes to the higher layers of ZFS all of the `*_get_ops()` functions were updated to take into consideration the calling context. This allows for the fastest implementation to be used as appropriate (see kfpu_allowed()). The only other notable instance of SIMD operations being used outside a kernel thread was at module load time. This code was moved into a taskq in order to accommodate the new kernel thread restriction. 
Finally, a few other modifications were made in order to further harden this code and facilitate testing. They include updating each implementation's operations structure to be declared as a constant, and allowing "cycle" to be set when selecting the preferred ops in the kernel as well as user space. Reviewed-by: Tony Hutter <[email protected]> Signed-off-by: Brian Behlendorf <[email protected]> Closes #8754 Closes #8793 Closes #8965
author: Brian Behlendorf <[email protected]> 2019-07-12 09:31:20 -0700
committer: GitHub <[email protected]> 2019-07-12 09:31:20 -0700
commit: e5db31349484e5e859c7a942eb15b98d68ce5b4d (patch)
tree: 0f1f6ab52249113c3643eb135791287a471f6707 /module/icp/algs
parent: d230a65c3b161d33de3a8f96e78f8a35edce6708 (diff)
4 files changed, 49 insertions, 30 deletions
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index e15050635..457b9e45c 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -27,6 +27,7 @@
 #include <sys/crypto/spi.h>
 #include <modes/modes.h>
 #include <aes/aes_impl.h>
+#include <linux/simd.h>
 
 /*
  * Initialize AES encryption and decryption key schedules.
@@ -40,9 +41,9 @@
 void
 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
 {
-	aes_impl_ops_t	*ops = aes_impl_get_ops();
-	aes_key_t	*newbie = keysched;
-	uint_t		keysize, i, j;
+	const aes_impl_ops_t *ops = aes_impl_get_ops();
+	aes_key_t *newbie = keysched;
+	uint_t keysize, i, j;
 	union {
 		uint64_t	ka64[4];
 		uint32_t	ka32[8];
@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
 
 /*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fallback to the fastest generic implementation.
  */
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
 {
-	aes_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&aes_generic_impl);
+
+	const aes_impl_ops_t *ops = NULL;
 	const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
 
 	switch (impl) {
@@ -266,15 +272,13 @@ aes_impl_get_ops()
 		ops = &aes_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(aes_impl_initialized);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
 		ops = aes_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, aes_supp_impl_cnt);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
@@ -288,13 +292,17 @@ aes_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-aes_impl_init(void)
+aes_impl_init(void *arg)
 {
 	aes_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into aes_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
 		curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
 
diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
index 97f7c3a47..222c176aa 100644
--- a/module/icp/algs/aes/aes_impl_aesni.c
+++ b/module/icp/algs/aes/aes_impl_aesni.c
@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
 static boolean_t
 aes_aesni_will_work(void)
 {
-	return (zfs_aes_available());
+	return (kfpu_allowed() && zfs_aes_available());
 }
 
 const aes_impl_ops_t aes_aesni_impl = {
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 13bceef0f..f6f8434de 100644
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@@ -29,6 +29,7 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 #include <modes/gcm_impl.h>
+#include <linux/simd.h>
 
 #define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;
 
@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
 
 /*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, then
+ * fallback to the fastest generic implementation.
  */
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
 gcm_impl_get_ops()
 {
-	gcm_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&gcm_generic_impl);
+
+	const gcm_impl_ops_t *ops = NULL;
 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
 
 	switch (impl) {
@@ -674,15 +680,13 @@ gcm_impl_get_ops()
 		ops = &gcm_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(gcm_impl_initialized);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
 		ops = gcm_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
@@ -696,13 +700,17 @@ gcm_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
 {
 	gcm_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into gcm_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
 
@@ -711,7 +719,10 @@ gcm_impl_init(void)
 	}
 	gcm_supp_impl_cnt = c;
 
-	/* set fastest implementation. assume hardware accelerated is fastest */
+	/*
+	 * Set the fastest implementation given the assumption that the
+	 * hardware accelerated version is the fastest.
+	 */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
 	if (gcm_pclmulqdq_impl.is_supported())
 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
index be00ba37b..8a43ba33a 100644
--- a/module/icp/algs/modes/gcm_pclmulqdq.c
+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 static boolean_t
 gcm_pclmulqdq_will_work(void)
 {
-	return (zfs_pclmulqdq_available());
+	return (kfpu_allowed() && zfs_pclmulqdq_available());
 }
 
 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
author	Brian Behlendorf <[email protected]>	2019-07-12 09:31:20 -0700
committer	GitHub <[email protected]>	2019-07-12 09:31:20 -0700
commit	e5db31349484e5e859c7a942eb15b98d68ce5b4d (patch)
tree	0f1f6ab52249113c3643eb135791287a471f6707 /module/icp/algs
parent	d230a65c3b161d33de3a8f96e78f8a35edce6708 (diff)