Diffstat (limited to 'module')
-rw-r--r--  module/icp/algs/aes/aes_impl.c                 34
-rw-r--r--  module/icp/algs/aes/aes_impl_aesni.c            2
-rw-r--r--  module/icp/algs/modes/gcm.c                    41
-rw-r--r--  module/icp/algs/modes/gcm_pclmulqdq.c           2
-rw-r--r--  module/icp/include/aes/aes_impl.h               6
-rw-r--r--  module/icp/include/modes/gcm_impl.h             6
-rw-r--r--  module/icp/io/aes.c                            32
-rw-r--r--  module/spl/spl-taskq.c                          2
-rw-r--r--  module/spl/spl-thread.c                         2
-rw-r--r--  module/zcommon/zfs_fletcher.c                  88
-rw-r--r--  module/zcommon/zfs_fletcher_aarch64_neon.c      2
-rw-r--r--  module/zcommon/zfs_fletcher_avx512.c            2
-rw-r--r--  module/zcommon/zfs_fletcher_intel.c             2
-rw-r--r--  module/zcommon/zfs_fletcher_sse.c               5
-rw-r--r--  module/zfs/vdev_raidz_math.c                  105
-rw-r--r--  module/zfs/vdev_raidz_math_aarch64_neon.c       2
-rw-r--r--  module/zfs/vdev_raidz_math_aarch64_neonx2.c     2
-rw-r--r--  module/zfs/vdev_raidz_math_avx2.c               2
-rw-r--r--  module/zfs/vdev_raidz_math_avx512bw.c           5
-rw-r--r--  module/zfs/vdev_raidz_math_avx512f.c            5
-rw-r--r--  module/zfs/vdev_raidz_math_sse2.c               2
-rw-r--r--  module/zfs/vdev_raidz_math_ssse3.c              4
22 files changed, 227 insertions, 126 deletions
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index e15050635..457b9e45c 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -27,6 +27,7 @@
#include <sys/crypto/spi.h>
#include <modes/modes.h>
#include <aes/aes_impl.h>
+#include <linux/simd.h>
/*
* Initialize AES encryption and decryption key schedules.
@@ -40,9 +41,9 @@
void
aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
{
- aes_impl_ops_t *ops = aes_impl_get_ops();
- aes_key_t *newbie = keysched;
- uint_t keysize, i, j;
+ const aes_impl_ops_t *ops = aes_impl_get_ops();
+ aes_key_t *newbie = keysched;
+ uint_t keysize, i, j;
union {
uint64_t ka64[4];
uint32_t ka32[8];
@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
/*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
*/
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
{
- aes_impl_ops_t *ops = NULL;
+ if (!kfpu_allowed())
+ return (&aes_generic_impl);
+
+ const aes_impl_ops_t *ops = NULL;
const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
switch (impl) {
@@ -266,15 +272,13 @@ aes_impl_get_ops()
ops = &aes_fastest_impl;
break;
case IMPL_CYCLE:
- {
+ /* Cycle through supported implementations */
ASSERT(aes_impl_initialized);
ASSERT3U(aes_supp_impl_cnt, >, 0);
- /* Cycle through supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
ops = aes_supp_impl[idx];
- }
- break;
+ break;
default:
ASSERT3U(impl, <, aes_supp_impl_cnt);
ASSERT3U(aes_supp_impl_cnt, >, 0);
@@ -288,13 +292,17 @@ aes_impl_get_ops()
return (ops);
}
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
void
-aes_impl_init(void)
+aes_impl_init(void *arg)
{
aes_impl_ops_t *curr_impl;
int i, c;
- /* move supported impl into aes_supp_impls */
+ /* Move supported implementations into aes_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
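The hunk above makes aes_impl_get_ops() consult kfpu_allowed() on every call rather than choosing an implementation once at module load. A minimal user-space sketch of that per-call selection pattern, with illustrative stand-in names (ops_t, get_ops, fpu_usable are not from the patch):

#include <stdio.h>

typedef struct {
	const char *name;
	int (*encrypt)(int);
} ops_t;

/* Portable path, always safe to call. */
static int generic_encrypt(int x) { return (x ^ 0xff); }
/* Stand-in for an implementation that would need FPU/SIMD state. */
static int simd_encrypt(int x) { return (x ^ 0xff); }

static const ops_t generic_impl = { "generic", generic_encrypt };
static const ops_t simd_impl = { "simd", simd_encrypt };

static int fpu_usable;		/* models kfpu_allowed() */

static const ops_t *
get_ops(void)
{
	/* Checked per call: the answer depends on the caller's context. */
	if (!fpu_usable)
		return (&generic_impl);
	return (&simd_impl);
}

int
main(void)
{
	fpu_usable = 0;
	printf("%s\n", get_ops()->name);	/* generic */
	fpu_usable = 1;
	printf("%s\n", get_ops()->name);	/* simd */
	return (0);
}

Returning a const pointer also lets the ops tables live in read-only memory, which is why the prototypes below gain const.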
diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
index 97f7c3a47..222c176aa 100644
--- a/module/icp/algs/aes/aes_impl_aesni.c
+++ b/module/icp/algs/aes/aes_impl_aesni.c
@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
static boolean_t
aes_aesni_will_work(void)
{
- return (zfs_aes_available());
+ return (kfpu_allowed() && zfs_aes_available());
}
const aes_impl_ops_t aes_aesni_impl = {
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 13bceef0f..f6f8434de 100644
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@@ -29,6 +29,7 @@
#include <sys/crypto/impl.h>
#include <sys/byteorder.h>
#include <modes/gcm_impl.h>
+#include <linux/simd.h>
#define GHASH(c, d, t, o) \
xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
size_t remainder = length;
size_t need = 0;
uint8_t *datap = (uint8_t *)data;
@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
uint8_t *ghash, *macp = NULL;
int i, rv;
@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
size_t pt_len;
size_t remainder;
uint8_t *ghash;
@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint8_t *cb;
ulong_t remainder = iv_len;
ulong_t processed = 0;
@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
void (*copy_block)(uint8_t *, uint8_t *),
void (*xor_block)(uint8_t *, uint8_t *))
{
- gcm_impl_ops_t *gops;
+ const gcm_impl_ops_t *gops;
uint8_t *ghash, *datap, *authp;
size_t remainder, processed;
@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
/*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the fastest generic implementation.
*/
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
gcm_impl_get_ops()
{
- gcm_impl_ops_t *ops = NULL;
+ if (!kfpu_allowed())
+ return (&gcm_generic_impl);
+
+ const gcm_impl_ops_t *ops = NULL;
const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
switch (impl) {
@@ -674,15 +680,13 @@ gcm_impl_get_ops()
ops = &gcm_fastest_impl;
break;
case IMPL_CYCLE:
- {
+ /* Cycle through supported implementations */
ASSERT(gcm_impl_initialized);
ASSERT3U(gcm_supp_impl_cnt, >, 0);
- /* Cycle through supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
ops = gcm_supp_impl[idx];
- }
- break;
+ break;
default:
ASSERT3U(impl, <, gcm_supp_impl_cnt);
ASSERT3U(gcm_supp_impl_cnt, >, 0);
@@ -696,13 +700,17 @@ gcm_impl_get_ops()
return (ops);
}
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
{
gcm_impl_ops_t *curr_impl;
int i, c;
- /* move supported impl into aes_supp_impls */
+ /* Move supported implementations into gcm_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
@@ -711,7 +719,10 @@ gcm_impl_init(void)
}
gcm_supp_impl_cnt = c;
- /* set fastest implementation. assume hardware accelerated is fastest */
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
if (gcm_pclmulqdq_impl.is_supported())
memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
index be00ba37b..8a43ba33a 100644
--- a/module/icp/algs/modes/gcm_pclmulqdq.c
+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
static boolean_t
gcm_pclmulqdq_will_work(void)
{
- return (zfs_pclmulqdq_available());
+ return (kfpu_allowed() && zfs_pclmulqdq_available());
}
const gcm_impl_ops_t gcm_pclmulqdq_impl = {
diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
index 95cfddf9e..9fd9c1bd1 100644
--- a/module/icp/include/aes/aes_impl.h
+++ b/module/icp/include/aes/aes_impl.h
@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
/*
* Initializes fastest implementation
*/
-void aes_impl_init(void);
+void aes_impl_init(void *arg);
/*
- * Get selected aes implementation
+ * Returns optimal allowed AES implementation
*/
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);
#ifdef __cplusplus
}
diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
index cbb904c05..138090487 100644
--- a/module/icp/include/modes/gcm_impl.h
+++ b/module/icp/include/modes/gcm_impl.h
@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
/*
* Initializes fastest implementation
*/
-void gcm_impl_init(void);
+void gcm_impl_init(void *arg);
/*
- * Get selected aes implementation
+ * Returns optimal allowed GCM implementation
*/
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
#ifdef __cplusplus
}
diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
index 53b193693..51538bc60 100644
--- a/module/icp/io/aes.c
+++ b/module/icp/io/aes.c
@@ -206,9 +206,35 @@ aes_mod_init(void)
{
int ret;
- /* find fastest implementations and set any requested implementations */
- aes_impl_init();
- gcm_impl_init();
+#if defined(_KERNEL)
+ /*
+ * Determine the fastest available implementation. The benchmarks
+ * are run in dedicated kernel threads to allow Linux 5.0+ kernels
+ * to use SIMD operations. If for some reason this isn't possible,
+ * fall back to the generic implementations. See the comment in
+ * include/linux/simd_x86.h for additional details. Additionally,
+ * this has the benefit of allowing them to be run in parallel.
+ */
+ taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
+ NULL, TQ_SLEEP);
+ taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
+ NULL, TQ_SLEEP);
+
+ if (aes_id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, aes_id);
+ } else {
+ aes_impl_init(NULL);
+ }
+
+ if (gcm_id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, gcm_id);
+ } else {
+ gcm_impl_init(NULL);
+ }
+#else
+ aes_impl_init(NULL);
+ gcm_impl_init(NULL);
+#endif
if ((ret = mod_install(&modlinkage)) != 0)
return (ret);
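aes_mod_init() now dispatches the two benchmarks to system_taskq and falls back to direct calls only when dispatch fails. A rough pthread-based sketch of that dispatch-then-wait shape (the taskq API is modeled, not reproduced; all names below are illustrative):

#include <pthread.h>
#include <stdio.h>

typedef void (*init_fn_t)(void *);

struct task {
	init_fn_t fn;
	void *arg;
};

static void *
trampoline(void *p)
{
	struct task *t = p;

	t->fn(t->arg);	/* runs in a fresh thread, like a taskq worker */
	return (NULL);
}

static void
impl_init(void *arg)
{
	(void) arg;
	printf("benchmark would run here\n");
}

int
main(void)
{
	struct task t = { impl_init, NULL };
	pthread_t tid;

	/*
	 * Dispatch and wait; on failure call directly, mirroring the
	 * TASKQID_INVALID fallback in the hunk above.
	 */
	if (pthread_create(&tid, NULL, trampoline, &t) == 0)
		pthread_join(&tid, NULL);
	else
		impl_init(NULL);
	return (0);
}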
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index a39f94e4c..69d591ff7 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -28,6 +28,7 @@
#include <sys/taskq.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
+#include <linux/simd.h>
int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
@@ -853,6 +854,7 @@ taskq_thread(void *args)
sigfillset(&blocked);
sigprocmask(SIG_BLOCK, &blocked, NULL);
flush_signals(current);
+ kfpu_initialize();
tsd_set(taskq_tsd, tq);
spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
index d441ad65f..c4977bcf2 100644
--- a/module/spl/spl-thread.c
+++ b/module/spl/spl-thread.c
@@ -27,6 +27,7 @@
#include <sys/thread.h>
#include <sys/kmem.h>
#include <sys/tsd.h>
+#include <linux/simd.h>
/*
* Thread interfaces
@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
args = tp->tp_args;
set_current_state(tp->tp_state);
set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kfpu_initialize();
kmem_free(tp->tp_name, tp->tp_name_size);
kmem_free(tp, sizeof (thread_priv_t));
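Both SPL thread entry points now call kfpu_initialize() before any work runs, so every thread they create starts with usable FPU state. A sketch of that wrapper shape, with a thread-local flag standing in for the real FPU bookkeeping (all names illustrative):

#include <pthread.h>
#include <stdio.h>

static _Thread_local int fpu_ready;	/* models per-thread FPU state */

static void
fpu_initialize(void)
{
	fpu_ready = 1;	/* the real hook prepares kernel FPU context */
}

static void
payload(void)
{
	printf("fpu_ready=%d\n", fpu_ready);
}

static void *
generic_wrapper(void *arg)
{
	void (*body)(void) = *(void (**)(void))arg;

	fpu_initialize();	/* first, before the thread body */
	body();
	return (NULL);
}

int
main(void)
{
	void (*fn)(void) = payload;
	pthread_t tid;

	pthread_create(&tid, NULL, generic_wrapper, &fn);
	pthread_join(&tid, NULL);
	return (0);
}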
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5a991ba60..b75d8ab00 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -140,6 +140,7 @@
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>
+#include <linux/simd.h>
#define FLETCHER_MIN_SIMD_SIZE 64
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
const char *fis_name;
uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
-#endif
{ "fastest", IMPL_FASTEST },
{ "scalar", IMPL_SCALAR }
};
#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
-#endif
static struct fletcher_4_kstat {
uint64_t native;
uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
return (err);
}
+/*
+ * Returns the Fletcher 4 operations for checksums. When a SIMD
+ * implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
- fletcher_4_ops_t *ops = NULL;
- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ if (!kfpu_allowed())
+ return (&fletcher_4_superscalar4_ops);
+
+ const fletcher_4_ops_t *ops = NULL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
switch (impl) {
case IMPL_FASTEST:
ASSERT(fletcher_4_initialized);
ops = &fletcher_4_fastest_impl;
break;
-#if !defined(_KERNEL)
- case IMPL_CYCLE: {
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
ASSERT(fletcher_4_initialized);
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
static uint32_t cycle_count = 0;
uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
ops = fletcher_4_supp_impls[idx];
- }
- break;
-#endif
+ break;
default:
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
ops = fletcher_4_supp_impls[impl];
break;
}
@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
+#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
/* restore original selection */
atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
+#endif /* _KERNEL */
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
{
- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
fletcher_4_ops_t *curr_impl;
- char *databuf;
int i, c;
- /* move supported impl into fletcher_4_supp_impls */
+ /* Move supported implementations into fletcher_4_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
@@ -735,19 +741,10 @@ fletcher_4_init(void)
membar_producer(); /* complete fletcher_4_supp_impls[] init */
fletcher_4_supp_impls_cnt = c; /* number of supported impl */
-#if !defined(_KERNEL)
- /* Skip benchmarking and use last implementation as fastest */
- memcpy(&fletcher_4_fastest_impl,
- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
- sizeof (fletcher_4_fastest_impl));
- fletcher_4_fastest_impl.name = "fastest";
- membar_producer();
+#if defined(_KERNEL)
+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
- fletcher_4_initialized = B_TRUE;
- return;
-#endif
- /* Benchmark all supported implementations */
- databuf = vmem_alloc(data_size, KM_SLEEP);
for (i = 0; i < data_size / sizeof (uint64_t); i++)
((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
@@ -755,9 +752,38 @@ fletcher_4_init(void)
fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
vmem_free(databuf, data_size);
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and used by default.
+ */
+ memcpy(&fletcher_4_fastest_impl,
+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+ sizeof (fletcher_4_fastest_impl));
+ fletcher_4_fastest_impl.name = "fastest";
+ membar_producer();
+#endif /* _KERNEL */
+}
+void
+fletcher_4_init(void)
+{
#if defined(_KERNEL)
- /* install kstats for all implementations */
+ /*
+ * For Linux 5.0 and later kernels the fletcher_4 benchmarks are
+ * run in kernel threads. This is needed to take advantage of the
+ * SIMD functionality; see include/linux/simd_x86.h for details.
+ */
+ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+ NULL, TQ_SLEEP);
+ if (id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, id);
+ } else {
+ fletcher_4_benchmark(NULL);
+ }
+
+ /* Install kstats for all implementations */
fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (fletcher_4_kstat != NULL) {
@@ -769,6 +795,8 @@ fletcher_4_init(void)
fletcher_4_kstat_addr);
kstat_install(fletcher_4_kstat);
}
+#else
+ fletcher_4_benchmark(NULL);
#endif
/* Finish initialization */
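With the #if !defined(_KERNEL) guards removed, the "cycle" selector is now compiled everywhere. Its core is just a function-local static counter; a self-contained sketch (table contents illustrative):

#include <stdio.h>

static const char *supp_impls[] = { "scalar", "superscalar4", "sse2" };
#define	SUPP_CNT (sizeof (supp_impls) / sizeof (supp_impls[0]))

static const char *
impl_get_cycle(void)
{
	/*
	 * Persists across calls; the increment is not atomic, which is
	 * tolerable for a test-only selector where an occasional repeat
	 * or skip is harmless.
	 */
	static unsigned cycle_idx;

	return (supp_impls[(++cycle_idx) % SUPP_CNT]);
}

int
main(void)
{
	for (int i = 0; i < 5; i++)
		printf("%s\n", impl_get_cycle());
	return (0);
}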
diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
index bd2db2b20..3b3c1b52b 100644
--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
static boolean_t fletcher_4_aarch64_neon_valid(void)
{
- return (B_TRUE);
+ return (kfpu_allowed());
}
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 7260a9864..0d4cff21a 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
static boolean_t
fletcher_4_avx512f_valid(void)
{
- return (zfs_avx512f_available());
+ return (kfpu_allowed() && zfs_avx512f_available());
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index 6dac047da..7f12efe6d 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_avx2_valid(void)
{
- return (zfs_avx_available() && zfs_avx2_available());
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const fletcher_4_ops_t fletcher_4_avx2_ops = {
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index a0b42e5f5..e6389d6e5 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_sse2_valid(void)
{
- return (zfs_sse2_available());
+ return (kfpu_allowed() && zfs_sse2_available());
}
const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_ssse3_valid(void)
{
- return (zfs_sse2_available() && zfs_ssse3_available());
+ return (kfpu_allowed() && zfs_sse2_available() &&
+ zfs_ssse3_available());
}
const fletcher_4_ops_t fletcher_4_ssse3_ops = {
diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
index 3ef67768f..ef514e9e1 100644
--- a/module/zfs/vdev_raidz_math.c
+++ b/module/zfs/vdev_raidz_math.c
@@ -27,9 +27,9 @@
#include <sys/zio.h>
#include <sys/debug.h>
#include <sys/zfs_debug.h>
-
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
+#include <linux/simd.h>
extern boolean_t raidz_will_scalar_work(void);
@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
static size_t raidz_supp_impl_cnt = 0;
static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+#if defined(_KERNEL)
/*
* kstats values for supported implementations
* Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
/* kstat for benchmarked implementations */
static kstat_t *raidz_math_kstat = NULL;
+#endif
/*
- * Selects the raidz operation for raidz_map
- * If rm_ops is set to NULL original raidz implementation will be used
+ * Returns the RAIDZ operations for raidz_map() parity calculations. When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
*/
-raidz_impl_ops_t *
-vdev_raidz_math_get_ops()
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
{
+ if (!kfpu_allowed())
+ return (&vdev_raidz_scalar_impl);
+
raidz_impl_ops_t *ops = NULL;
const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
ASSERT(raidz_math_initialized);
ops = &vdev_raidz_fastest_impl;
break;
-#if !defined(_KERNEL)
case IMPL_CYCLE:
- {
+ /* Cycle through all supported implementations */
ASSERT(raidz_math_initialized);
ASSERT3U(raidz_supp_impl_cnt, >, 0);
- /* Cycle through all supported implementations */
static size_t cycle_impl_idx = 0;
size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
ops = raidz_supp_impl[idx];
- }
- break;
-#endif
+ break;
case IMPL_ORIGINAL:
ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
break;
@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
};
+#if defined(_KERNEL)
+
#define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
static int
@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
}
}
}
+#endif
-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void *arg)
{
raidz_impl_ops_t *curr_impl;
- zio_t *bench_zio = NULL;
- raidz_map_t *bench_rm = NULL;
- uint64_t bench_parity;
- int i, c, fn;
+ int i, c;
- /* move supported impl into raidz_supp_impl */
+ /* Move supported impl into raidz_supp_impl */
for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
- /* initialize impl */
if (curr_impl->init)
curr_impl->init();
@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
membar_producer(); /* complete raidz_supp_impl[] init */
raidz_supp_impl_cnt = c; /* number of supported impl */
-#if !defined(_KERNEL)
- /* Skip benchmarking and use last implementation as fastest */
- memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
- sizeof (vdev_raidz_fastest_impl));
- strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
- raidz_math_initialized = B_TRUE;
-
- /* Use 'cycle' math selection method for userspace */
- VERIFY0(vdev_raidz_impl_set("cycle"));
- return;
-#endif
+#if defined(_KERNEL)
+ zio_t *bench_zio = NULL;
+ raidz_map_t *bench_rm = NULL;
+ uint64_t bench_parity;
/* Fake a zio and run the benchmark on a warmed up buffer */
bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
/* Benchmark parity generation methods */
- for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
bench_parity = fn + 1;
/* New raidz_map is needed for each generate_p/q/r */
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
BENCH_COLS, PARITY_PQR);
- for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
vdev_raidz_map_free(bench_rm);
@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
/* cleanup the bench zio */
abd_free(bench_zio->io_abd);
kmem_free(bench_zio, sizeof (zio_t));
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and used by default.
+ */
+ memcpy(&vdev_raidz_fastest_impl,
+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
+ sizeof (vdev_raidz_fastest_impl));
+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
- /* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+#if defined(_KERNEL)
+ /*
+ * For Linux 5.0 and later kernels the RAIDZ benchmarks are run
+ * in kernel threads. This is needed to take advantage of the
+ * SIMD functionality; see include/linux/simd_x86.h for details.
+ */
+ taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
+ NULL, TQ_SLEEP);
+ if (id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, id);
+ } else {
+ benchmark_raidz(NULL);
+ }
+
+ /* Install kstats for all implementations */
raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
if (raidz_math_kstat != NULL) {
raidz_math_kstat->ks_data = NULL;
raidz_math_kstat->ks_ndata = UINT32_MAX;
@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
raidz_math_kstat_addr);
kstat_install(raidz_math_kstat);
}
+#else
+ benchmark_raidz(NULL);
+#endif
/* Finish initialization */
atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
@@ -527,15 +554,15 @@ void
vdev_raidz_math_fini(void)
{
raidz_impl_ops_t const *curr_impl;
- int i;
+#if defined(_KERNEL)
if (raidz_math_kstat != NULL) {
kstat_delete(raidz_math_kstat);
raidz_math_kstat = NULL;
}
+#endif
- /* fini impl */
- for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
curr_impl = raidz_all_maths[i];
if (curr_impl->fini)
curr_impl->fini();
@@ -546,9 +573,7 @@ static const struct {
char *name;
uint32_t sel;
} math_impl_opts[] = {
-#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
-#endif
{ "fastest", IMPL_FASTEST },
{ "original", IMPL_ORIGINAL },
{ "scalar", IMPL_SCALAR }
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index e3ad06776..0a67ceb84 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
static boolean_t
raidz_will_aarch64_neon_work(void)
{
- return (B_TRUE); // __arch64__ requires NEON
+ return (kfpu_allowed());
}
const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index f8688a06a..e072f51cd 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
static boolean_t
raidz_will_aarch64_neonx2_work(void)
{
- return (B_TRUE); // __arch64__ requires NEON
+ return (kfpu_allowed());
}
const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
index 063d29bcd..a12eb6720 100644
--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
static boolean_t
raidz_will_avx2_work(void)
{
- return (zfs_avx_available() && zfs_avx2_available());
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const raidz_impl_ops_t vdev_raidz_avx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index d605653db..2f545c9ec 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
static boolean_t
raidz_will_avx512bw_work(void)
{
- return (zfs_avx_available() &&
- zfs_avx512f_available() &&
- zfs_avx512bw_available());
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx512f_available() && zfs_avx512bw_available());
}
const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
index f4e4560ce..75af7a8ee 100644
--- a/module/zfs/vdev_raidz_math_avx512f.c
+++ b/module/zfs/vdev_raidz_math_avx512f.c
@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
static boolean_t
raidz_will_avx512f_work(void)
{
- return (zfs_avx_available() &&
- zfs_avx2_available() &&
- zfs_avx512f_available());
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx2_available() && zfs_avx512f_available());
}
const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
index 9985da273..5b3a9385c 100644
--- a/module/zfs/vdev_raidz_math_sse2.c
+++ b/module/zfs/vdev_raidz_math_sse2.c
@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
static boolean_t
raidz_will_sse2_work(void)
{
- return (zfs_sse_available() && zfs_sse2_available());
+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
}
const raidz_impl_ops_t vdev_raidz_sse2_impl = {
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
index 047a48d54..62247cf8e 100644
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
static boolean_t
raidz_will_ssse3_work(void)
{
- return (zfs_sse_available() && zfs_sse2_available() &&
- zfs_ssse3_available());
+ return (kfpu_allowed() && zfs_sse_available() &&
+ zfs_sse2_available() && zfs_ssse3_available());
}
const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
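Every will-work/valid predicate in this patch gains the same kfpu_allowed() conjunct, so a SIMD implementation is advertised only when the context permits FPU use and the CPU has the required features. A stubbed sketch of the ssse3 predicate's shape (the probes below are stand-ins, not the real zfs_*_available() functions):

#include <stdbool.h>
#include <stdio.h>

static bool kfpu_allowed_stub(void) { return (true); }
static bool sse_available_stub(void) { return (true); }
static bool sse2_available_stub(void) { return (true); }
static bool ssse3_available_stub(void) { return (false); }

static bool
will_ssse3_work(void)
{
	/* Short-circuits left to right; the context gate comes first. */
	return (kfpu_allowed_stub() && sse_available_stub() &&
	    sse2_available_stub() && ssse3_available_stub());
}

int
main(void)
{
	printf("ssse3 usable: %d\n", will_ssse3_work());
	return (0);
}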