4 files changed, 268 insertions, 191 deletions
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 3ca70db13..355384f50 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -138,17 +138,20 @@
 #include <zfs_fletcher.h>
 
 
-static void fletcher_4_scalar_init(zio_cksum_t *zcp);
-static void fletcher_4_scalar_native(const void *buf, uint64_t size,
-    zio_cksum_t *zcp);
-static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
-    zio_cksum_t *zcp);
+static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
+static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
+static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
+    const void *buf, uint64_t size);
+static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
+    const void *buf, uint64_t size);
 static boolean_t fletcher_4_scalar_valid(void);
 
 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 	.init_native = fletcher_4_scalar_init,
+	.fini_native = fletcher_4_scalar_fini,
 	.compute_native = fletcher_4_scalar_native,
 	.init_byteswap = fletcher_4_scalar_init,
+	.fini_byteswap = fletcher_4_scalar_fini,
 	.compute_byteswap = fletcher_4_scalar_byteswap,
 	.valid = fletcher_4_scalar_valid,
 	.name = "scalar"
@@ -248,22 +251,29 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
 }
 
 static void
-fletcher_4_scalar_init(zio_cksum_t *zcp)
+fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
 {
-	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+	ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
+}
+
+static void
+fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+	memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
 }
 
 static void
-fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 
-	a = zcp->zc_word[0];
-	b = zcp->zc_word[1];
-	c = zcp->zc_word[2];
-	d = zcp->zc_word[3];
+	a = ctx->scalar.zc_word[0];
+	b = ctx->scalar.zc_word[1];
+	c = ctx->scalar.zc_word[2];
+	d = ctx->scalar.zc_word[3];
 
 	for (; ip < ipend; ip++) {
 		a += ip[0];
@@ -272,20 +282,21 @@ fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 		d += c;
 	}
 
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
 }
 
 static void
-fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 	uint64_t a, b, c, d;
 
-	a = zcp->zc_word[0];
-	b = zcp->zc_word[1];
-	c = zcp->zc_word[2];
-	d = zcp->zc_word[3];
+	a = ctx->scalar.zc_word[0];
+	b = ctx->scalar.zc_word[1];
+	c = ctx->scalar.zc_word[2];
+	d = ctx->scalar.zc_word[3];
 
 	for (; ip < ipend; ip++) {
 		a += BSWAP_32(ip[0]);
@@ -294,7 +305,7 @@ fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 		d += c;
 	}
 
-	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+	ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
 }
 
 static boolean_t
@@ -384,13 +395,14 @@ fletcher_4_impl_get(void)
 }
 
 static inline void
-fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
-	uint64_t size, zio_cksum_t *zcp)
+fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 {
-	ops->init_native(zcp);
-	ops->compute_native(buf, size, zcp);
-	if (ops->fini_native != NULL)
-		ops->fini_native(zcp);
+	fletcher_4_ctx_t ctx;
+	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+	ops->init_native(&ctx);
+	ops->compute_native(&ctx, buf, size);
+	ops->fini_native(&ctx, zcp);
 }
 
 /*ARGSUSED*/
@@ -398,40 +410,41 @@ void
 fletcher_4_native(const void *buf, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
-	const fletcher_4_ops_t *ops;
-	uint64_t p2size = P2ALIGN(size, 64);
+	const uint64_t p2size = P2ALIGN(size, 64);
 
 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 
-	if (size == 0) {
+	if (size == 0 || p2size == 0) {
 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-	} else if (p2size == 0) {
-		ops = &fletcher_4_scalar_ops;
-		fletcher_4_native_impl(ops, buf, size, zcp);
+
+		if (size > 0)
+			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+			    buf, size);
 	} else {
-		ops = fletcher_4_impl_get();
-		fletcher_4_native_impl(ops, buf, p2size, zcp);
+		fletcher_4_native_impl(buf, p2size, zcp);
 
 		if (p2size < size)
-			fletcher_4_incremental_native((char *)buf + p2size,
-			    size - p2size, zcp);
+			fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+			    (char *)buf + p2size, size - p2size);
 	}
 }
 
 void
 fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
 {
-	fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp);
+	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+	fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
 }
 
 static inline void
-fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
-	uint64_t size, zio_cksum_t *zcp)
+fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
 {
-	ops->init_byteswap(zcp);
-	ops->compute_byteswap(buf, size, zcp);
-	if (ops->fini_byteswap != NULL)
-		ops->fini_byteswap(zcp);
+	fletcher_4_ctx_t ctx;
+	const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+	ops->init_byteswap(&ctx);
+	ops->compute_byteswap(&ctx, buf, size);
+	ops->fini_byteswap(&ctx, zcp);
 }
 
 /*ARGSUSED*/
@@ -439,28 +452,29 @@ void
 fletcher_4_byteswap(const void *buf, uint64_t size,
     const void *ctx_template, zio_cksum_t *zcp)
 {
-	const fletcher_4_ops_t *ops;
-	uint64_t p2size = P2ALIGN(size, 64);
+	const uint64_t p2size = P2ALIGN(size, 64);
 
 	ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 
-	if (size == 0) {
+	if (size == 0 || p2size == 0) {
 		ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
-	} else if (p2size == 0) {
-		ops = &fletcher_4_scalar_ops;
-		fletcher_4_byteswap_impl(ops, buf, size, zcp);
+
+		if (size > 0)
+			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+			    buf, size);
 	} else {
-		ops = fletcher_4_impl_get();
-		fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
+		fletcher_4_byteswap_impl(buf, p2size, zcp);
 
 		if (p2size < size)
-			fletcher_4_incremental_byteswap((char *)buf + p2size,
-			    size - p2size, zcp);
+			fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+			    (char *)buf + p2size, size - p2size);
 	}
 }
 
 /* Incremental Fletcher 4 */
 
+#define	ZFS_FLETCHER_4_INC_MAX_SIZE	(8ULL << 20)
+
 static inline void
 fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
     const zio_cksum_t *nzcp)
@@ -469,6 +483,13 @@ fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
 	const uint64_t c2 = c1 * (c1 + 1) / 2;
 	const uint64_t c3 = c2 * (c1 + 2) / 3;
 
+	/*
+	 * The value of 'c3' overflows for buffer sizes close to 16MiB, so
+	 * incremental fletcher4 computation of large buffers is split into
+	 * steps no larger than ZFS_FLETCHER_4_INC_MAX_SIZE.
+	 */
+	ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
+
 	zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
 	    c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
 	zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
@@ -481,13 +502,9 @@ static inline void
 fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
     zio_cksum_t *zcp)
 {
-	static const uint64_t FLETCHER_4_INC_MAX = 8ULL << 20;
-	uint64_t len;
-
 	while (size > 0) {
 		zio_cksum_t nzc;
-
-		len = MIN(size, FLETCHER_4_INC_MAX);
+		uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
 
 		if (native)
 			fletcher_4_native(buf, len, NULL, &nzc);
@@ -504,14 +521,22 @@ fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
 void
 fletcher_4_incremental_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 {
-	fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
+	/* Use scalar impl to directly update cksum of small blocks */
+	if (size < SPA_MINBLOCKSIZE)
+		fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
+	else
+		fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
 }
 
 void
 fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
     zio_cksum_t *zcp)
 {
-	fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
+	/* Use scalar impl to directly update cksum of small blocks */
+	if (size < SPA_MINBLOCKSIZE)
+		fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
+	else
+		fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
 }
 
 
@@ -662,9 +687,6 @@ fletcher_4_init(void)
 	membar_producer();
 
 	fletcher_4_initialized = B_TRUE;
-
-	/* Use 'cycle' math selection method for userspace */
-	VERIFY0(fletcher_4_impl_set("cycle"));
 	return;
 #endif
 	/* Benchmark all supported implementations */
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 22e1f410f..2d28ffb11 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -28,31 +28,73 @@
 #include <sys/byteorder.h>
 #include <sys/spa_checksum.h>
 #include <zfs_fletcher.h>
+#include <strings.h>
 
 #define	__asm __asm__ __volatile__
 
-typedef struct {
-	uint64_t v[8] __attribute__((aligned(64)));
-} zfs_avx512_t;
+static void
+fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
+{
+	bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t));
+}
 
 static void
-fletcher_4_avx512f_init(zio_cksum_t *zcp)
+fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
-	kfpu_begin();
+	static const uint64_t
+	CcA[] = {   0,   0,   1,   3,   6,  10,  15,  21 },
+	CcB[] = {  28,  36,  44,  52,  60,  68,  76,  84 },
+	DcA[] = {   0,   0,   0,   1,   4,  10,  20,  35 },
+	DcB[] = {  56,  84, 120, 164, 216, 276, 344, 420 },
+	DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
+
+	uint64_t A, B, C, D;
+	uint64_t i;
 
-	/* clear registers */
-	__asm("vpxorq %zmm0, %zmm0, %zmm0");
-	__asm("vpxorq %zmm1, %zmm1, %zmm1");
-	__asm("vpxorq %zmm2, %zmm2, %zmm2");
-	__asm("vpxorq %zmm3, %zmm3, %zmm3");
+	A = ctx->avx512[0].v[0];
+	B = 8 * ctx->avx512[1].v[0];
+	C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
+	D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
+	    DcB[0] * ctx->avx512[1].v[0];
+
+	for (i = 1; i < 8; i++) {
+		A += ctx->avx512[0].v[i];
+		B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
+		C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
+		    CcA[i] * ctx->avx512[0].v[i];
+		D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
+		    DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
+	}
+
+	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define	FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
+{									\
+	__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0]));	\
+	__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1]));	\
+	__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2]));	\
+	__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3]));	\
+}
+
+#define	FLETCHER_4_AVX512_SAVE_CTX(ctx)					\
+{									\
+	__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0]));	\
+	__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1]));	\
+	__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2]));	\
+	__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3]));	\
 }
 
 static void
-fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
 	for (; ip < ipend; ip += 8) {
 		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
 		__asm("vpaddq %zmm4, %zmm0, %zmm0");
@@ -60,15 +102,24 @@ fletcher_4_avx512f_native(const void *buf, uint64_t size, zio_cksum_t *unused)
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	}
+
+	FLETCHER_4_AVX512_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static void
-fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
 {
 	static const uint64_t byteswap_mask = 0xFFULL;
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
 	__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
 	__asm("vpsllq $8, %zmm8, %zmm9");
 	__asm("vpsllq $16, %zmm8, %zmm10");
@@ -94,49 +145,10 @@ fletcher_4_avx512f_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
 		__asm("vpaddq %zmm1, %zmm2, %zmm2");
 		__asm("vpaddq %zmm2, %zmm3, %zmm3");
 	}
-}
 
-static void
-fletcher_4_avx512f_fini(zio_cksum_t *zcp)
-{
-	static const uint64_t
-	CcA[] = {   0,   0,   1,   3,   6,  10,  15,  21 },
-	CcB[] = {  28,  36,  44,  52,  60,  68,  76,  84 },
-	DcA[] = {   0,   0,   0,   1,   4,  10,  20,  35 },
-	DcB[] = {  56,  84, 120, 164, 216, 276, 344, 420 },
-	DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
-
-	zfs_avx512_t a, b, c, b8, c64, d512;
-	uint64_t A, B, C, D;
-	uint64_t i;
-
-	__asm("vmovdqu64 %%zmm0, %0":"=m" (a));
-	__asm("vmovdqu64 %%zmm1, %0":"=m" (b));
-	__asm("vmovdqu64 %%zmm2, %0":"=m" (c));
-	__asm("vpsllq $3, %zmm1, %zmm1");
-	__asm("vpsllq $6, %zmm2, %zmm2");
-	__asm("vpsllq $9, %zmm3, %zmm3");
-
-	__asm("vmovdqu64 %%zmm1, %0":"=m" (b8));
-	__asm("vmovdqu64 %%zmm2, %0":"=m" (c64));
-	__asm("vmovdqu64 %%zmm3, %0":"=m" (d512));
+	FLETCHER_4_AVX512_SAVE_CTX(ctx)
 
 	kfpu_end();
-
-	A = a.v[0];
-	B = b8.v[0];
-	C = c64.v[0] - CcB[0] * b.v[0];
-	D = d512.v[0] - DcC[0] * c.v[0] + DcB[0] * b.v[0];
-
-	for (i = 1; i < 8; i++) {
-		A += a.v[i];
-		B += b8.v[i] - i * a.v[i];
-		C += c64.v[i] - CcB[i] * b.v[i] + CcA[i] * a.v[i];
-		D += d512.v[i] - DcC[i] * c.v[i] + DcB[i] * b.v[i] -
-		    DcA[i] * a.v[i];
-	}
-
-	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
 }
 
 static boolean_t
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index adc4151c5..a479b9d56 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -45,58 +45,69 @@
 #include <linux/simd_x86.h>
 #include <sys/spa_checksum.h>
 #include <zfs_fletcher.h>
+#include <strings.h>
 
 static void
-fletcher_4_avx2_init(zio_cksum_t *zcp)
+fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
-
-	/* clear avx2 registers */
-	asm volatile("vpxor %ymm0, %ymm0, %ymm0");
-	asm volatile("vpxor %ymm1, %ymm1, %ymm1");
-	asm volatile("vpxor %ymm2, %ymm2, %ymm2");
-	asm volatile("vpxor %ymm3, %ymm3, %ymm3");
+	bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t));
 }
 
 static void
-fletcher_4_avx2_fini(zio_cksum_t *zcp)
+fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
-	uint64_t __attribute__((aligned(32))) a[4];
-	uint64_t __attribute__((aligned(32))) b[4];
-	uint64_t __attribute__((aligned(32))) c[4];
-	uint64_t __attribute__((aligned(32))) d[4];
 	uint64_t A, B, C, D;
 
-	asm volatile("vmovdqu %%ymm0, %0":"=m" (a));
-	asm volatile("vmovdqu %%ymm1, %0":"=m" (b));
-	asm volatile("vmovdqu %%ymm2, %0":"=m" (c));
-	asm volatile("vmovdqu %%ymm3, %0":"=m" (d));
-	asm volatile("vzeroupper");
+	A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
+	    ctx->avx[0].v[2] + ctx->avx[0].v[3];
+	B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
+	    4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
+	    4 * ctx->avx[1].v[3];
+
+	C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
+	    10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
+	    18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
+	    16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
+	    16 * ctx->avx[2].v[3];
+
+	D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
+	    10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
+	    34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
+	    64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
+	    96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
+	    64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
+	    64 * ctx->avx[3].v[3];
 
-	kfpu_end();
-
-	A = a[0] + a[1] + a[2] + a[3];
-	B = 0 - a[1] - 2*a[2] - 3*a[3]
-	    + 4*b[0] + 4*b[1] + 4*b[2] + 4*b[3];
-
-	C = a[2] + 3*a[3]
-	    -  6*b[0] - 10*b[1] - 14*b[2] - 18*b[3]
-	    + 16*c[0] + 16*c[1] + 16*c[2] + 16*c[3];
+	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
 
-	D = 0 - a[3]
-	    +  4*b[0] + 10*b[1] + 20*b[2] + 34*b[3]
-	    - 48*c[0] - 64*c[1] - 80*c[2] - 96*c[3]
-	    + 64*d[0] + 64*d[1] + 64*d[2] + 64*d[3];
+#define	FLETCHER_4_AVX2_RESTORE_CTX(ctx)				\
+{									\
+	asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0]));	\
+	asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1]));	\
+	asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2]));	\
+	asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3]));	\
+}
 
-	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+#define	FLETCHER_4_AVX2_SAVE_CTX(ctx)					\
+{									\
+	asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0]));	\
+	asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1]));	\
+	asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2]));	\
+	asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3]));	\
 }
 
+
 static void
-fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
 	for (; ip < ipend; ip += 2) {
 		asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
 		asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
@@ -104,21 +115,28 @@ fletcher_4_avx2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
 		asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
 	}
+
+	FLETCHER_4_AVX2_SAVE_CTX(ctx);
+	asm volatile("vzeroupper");
+
+	kfpu_end();
 }
 
 static void
-fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
-	static const struct {
-		uint64_t v[4] __attribute__((aligned(32)));
-	} mask = {
+	static const zfs_fletcher_avx_t mask = {
 		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
 		    0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
 	};
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
-	asm volatile("vmovdqa %0, %%ymm5"::"m"(mask));
+	kfpu_begin();
+
+	FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
+	asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));
 
 	for (; ip < ipend; ip += 2) {
 		asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
@@ -129,6 +147,11 @@ fletcher_4_avx2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
 		asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
 	}
+
+	FLETCHER_4_AVX2_SAVE_CTX(ctx);
+	asm volatile("vzeroupper");
+
+	kfpu_end();
 }
 
 static boolean_t fletcher_4_avx2_valid(void)
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index 9bc5f7ab6..ae03f4217 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -45,39 +45,19 @@
 
 #include <linux/simd_x86.h>
 #include <sys/spa_checksum.h>
+#include <sys/byteorder.h>
 #include <zfs_fletcher.h>
-
-struct zfs_fletcher_sse_array {
-	uint64_t v[2] __attribute__((aligned(16)));
-};
+#include <strings.h>
 
 static void
-fletcher_4_sse2_init(zio_cksum_t *zcp)
-{
-	kfpu_begin();
-
-	/* clear sse registers */
-	asm volatile("pxor %xmm0, %xmm0");
-	asm volatile("pxor %xmm1, %xmm1");
-	asm volatile("pxor %xmm2, %xmm2");
-	asm volatile("pxor %xmm3, %xmm3");
+fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) {
+	bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
 }
 
 static void
-fletcher_4_sse2_fini(zio_cksum_t *zcp)
-{
-	struct zfs_fletcher_sse_array a, b, c, d;
+fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) {
 	uint64_t A, B, C, D;
 
-	asm volatile("movdqu %%xmm0, %0":"=m" (a.v));
-	asm volatile("movdqu %%xmm1, %0":"=m" (b.v));
-	asm volatile("psllq $0x2, %xmm2");
-	asm volatile("movdqu %%xmm2, %0":"=m" (c.v));
-	asm volatile("psllq $0x3, %xmm3");
-	asm volatile("movdqu %%xmm3, %0":"=m" (d.v));
-
-	kfpu_end();
-
 	/*
 	 * The mixing matrix for checksum calculation is:
 	 * a = a0 + a1
@@ -88,20 +68,42 @@ fletcher_4_sse2_fini(zio_cksum_t *zcp)
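-	 * c and d are multiplied by 4 and 8, respectively,
-	 * before spilling the vectors out to memory.
+	 * The factors 4 and 8 for c and d are applied below, as the
+	 * saved vector lanes are read back from the context.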
 	 */
-	A = a.v[0] + a.v[1];
-	B = 2*b.v[0] + 2*b.v[1] - a.v[1];
-	C = c.v[0] - b.v[0] + c.v[1] - 3*b.v[1];
-	D = d.v[0] - c.v[0] + d.v[1] - 2*c.v[1] + b.v[1];
+	A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
+	B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
+	C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
+	    3 * ctx->sse[1].v[1];
+	D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
+	    8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
 
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
 }
 
+#define	FLETCHER_4_SSE_RESTORE_CTX(ctx)					\
+{									\
+	asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0]));	\
+	asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1]));	\
+	asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2]));	\
+	asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3]));	\
+}
+
+#define	FLETCHER_4_SSE_SAVE_CTX(ctx)					\
+{									\
+	asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0]));	\
+	asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1]));	\
+	asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2]));	\
+	asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3]));	\
+}
+
 static void
-fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
 	asm volatile("pxor %xmm4, %xmm4");
 
 	for (; ip < ipend; ip += 2) {
@@ -118,27 +120,37 @@ fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static void
-fletcher_4_sse2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
 
-	for (; ip < ipend; ip += 2) {
-		uint32_t scratch;
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
 
-		asm volatile("bswapl %0" : "=r"(scratch) : "0"(*ip));
-		asm volatile("movd %0, %%xmm5" :: "r"(scratch));
-		asm volatile("bswapl %0" : "=r"(scratch) : "0"(*(ip + 1)));
-		asm volatile("movd %0, %%xmm6" :: "r"(scratch));
+	for (; ip < ipend; ip += 2) {
+		uint32_t scratch1 = BSWAP_32(ip[0]);
+		uint32_t scratch2 = BSWAP_32(ip[1]);
+		asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
+		asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
 		asm volatile("punpcklqdq %xmm6, %xmm5");
 		asm volatile("paddq %xmm5, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static boolean_t fletcher_4_sse2_valid(void)
@@ -161,15 +173,19 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
 
 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 static void
-fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
-	static const struct zfs_fletcher_sse_array mask = {
+	static const zfs_fletcher_sse_t mask = {
 		.v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
 	};
 
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
 	asm volatile("movdqu %0, %%xmm7"::"m" (mask));
 	asm volatile("pxor %xmm4, %xmm4");
 
@@ -188,6 +204,10 @@ fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static boolean_t fletcher_4_ssse3_valid(void)