Diffstat (limited to 'module/zcommon/zfs_fletcher_sse.c')
-rw-r--r--  module/zcommon/zfs_fletcher_sse.c  98
1 file changed, 59 insertions, 39 deletions
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index 9bc5f7ab6..ae03f4217 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -45,39 +45,19 @@
 #include <linux/simd_x86.h>
 #include <sys/spa_checksum.h>
+#include <sys/byteorder.h>
 #include <zfs_fletcher.h>
-
-struct zfs_fletcher_sse_array {
-	uint64_t v[2] __attribute__((aligned(16)));
-};
+#include <strings.h>
 
 static void
-fletcher_4_sse2_init(zio_cksum_t *zcp)
+fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
 {
-	kfpu_begin();
-
-	/* clear sse registers */
-	asm volatile("pxor %xmm0, %xmm0");
-	asm volatile("pxor %xmm1, %xmm1");
-	asm volatile("pxor %xmm2, %xmm2");
-	asm volatile("pxor %xmm3, %xmm3");
+	bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
 }
 
 static void
-fletcher_4_sse2_fini(zio_cksum_t *zcp)
+fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
 {
-	struct zfs_fletcher_sse_array a, b, c, d;
 	uint64_t A, B, C, D;
 
-	asm volatile("movdqu %%xmm0, %0":"=m" (a.v));
-	asm volatile("movdqu %%xmm1, %0":"=m" (b.v));
-	asm volatile("psllq $0x2, %xmm2");
-	asm volatile("movdqu %%xmm2, %0":"=m" (c.v));
-	asm volatile("psllq $0x3, %xmm3");
-	asm volatile("movdqu %%xmm3, %0":"=m" (d.v));
-
-	kfpu_end();
-
 	/*
 	 * The mixing matrix for checksum calculation is:
 	 * a = a0 + a1
@@ -88,20 +68,42 @@ fletcher_4_sse2_fini(zio_cksum_t *zcp)
 	 * c and d are multiplied by 4 and 8, respectively,
 	 * before spilling the vectors out to memory.
 	 */
-	A = a.v[0] + a.v[1];
-	B = 2*b.v[0] + 2*b.v[1] - a.v[1];
-	C = c.v[0] - b.v[0] + c.v[1] - 3*b.v[1];
-	D = d.v[0] - c.v[0] + d.v[1] - 2*c.v[1] + b.v[1];
+	A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
+	B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
+	C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
+	    3 * ctx->sse[1].v[1];
+	D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
+	    8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
 
 	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
 }
 
+#define	FLETCHER_4_SSE_RESTORE_CTX(ctx)					\
+{									\
+	asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0]));	\
+	asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1]));	\
+	asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2]));	\
+	asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3]));	\
+}
+
+#define	FLETCHER_4_SSE_SAVE_CTX(ctx)					\
+{									\
+	asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0]));	\
+	asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1]));	\
+	asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2]));	\
+	asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3]));	\
+}
+
 static void
-fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
 	asm volatile("pxor %xmm4, %xmm4");
 
 	for (; ip < ipend; ip += 2) {
@@ -118,27 +120,37 @@ fletcher_4_sse2_native(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static void
-fletcher_4_sse2_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
 	const uint32_t *ip = buf;
 	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
 
-	for (; ip < ipend; ip += 2) {
-		uint32_t scratch;
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
 
-		asm volatile("bswapl %0" : "=r"(scratch) : "0"(*ip));
-		asm volatile("movd %0, %%xmm5" :: "r"(scratch));
-		asm volatile("bswapl %0" : "=r"(scratch) : "0"(*(ip + 1)));
-		asm volatile("movd %0, %%xmm6" :: "r"(scratch));
+	for (; ip < ipend; ip += 2) {
+		uint32_t scratch1 = BSWAP_32(ip[0]);
+		uint32_t scratch2 = BSWAP_32(ip[1]);
+		asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
+		asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
 		asm volatile("punpcklqdq %xmm6, %xmm5");
 		asm volatile("paddq %xmm5, %xmm0");
 		asm volatile("paddq %xmm0, %xmm1");
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static boolean_t fletcher_4_sse2_valid(void)
@@ -161,15 +173,19 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 
 static void
-fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
+fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 {
-	static const struct zfs_fletcher_sse_array mask = {
+	static const zfs_fletcher_sse_t mask = {
 		.v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
 	};
 
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
 
+	kfpu_begin();
+
+	FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
 	asm volatile("movdqu %0, %%xmm7"::"m" (mask));
 	asm volatile("pxor %xmm4, %xmm4");
 
@@ -188,6 +204,10 @@ fletcher_4_ssse3_byteswap(const void *buf, uint64_t size, zio_cksum_t *unused)
 		asm volatile("paddq %xmm1, %xmm2");
 		asm volatile("paddq %xmm2, %xmm3");
 	}
+
+	FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+	kfpu_end();
 }
 
 static boolean_t fletcher_4_ssse3_valid(void)
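
The mixing matrix in fletcher_4_sse2_fini() is easy to sanity-check in plain C. Below is a minimal standalone sketch (not part of the commit; the test array w and its length are arbitrary) that runs the two interleaved lanes the paddq chain maintains, one for even-indexed words and one for odd, then verifies the recombination against a straight scalar Fletcher-4 pass:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	/* arbitrary test words; any even count works */
	const uint32_t w[8] = { 1, 2, 3, 40, 500, 6000, 70000, 800000 };
	uint64_t a = 0, b = 0, c = 0, d = 0;		/* scalar reference */
	uint64_t a0 = 0, b0 = 0, c0 = 0, d0 = 0;	/* lane 0: even words */
	uint64_t a1 = 0, b1 = 0, c1 = 0, d1 = 0;	/* lane 1: odd words */
	int i;

	for (i = 0; i < 8; i++) {
		a += w[i]; b += a; c += b; d += c;
	}

	/* what the vectorized loop computes: two independent streams */
	for (i = 0; i < 8; i += 2) {
		a0 += w[i]; b0 += a0; c0 += b0; d0 += c0;
		a1 += w[i + 1]; b1 += a1; c1 += b1; d1 += c1;
	}

	/* the recombination performed by fletcher_4_sse2_fini() */
	assert(a == a0 + a1);
	assert(b == 2 * b0 + 2 * b1 - a1);
	assert(c == 4 * c0 - b0 + 4 * c1 - 3 * b1);
	assert(d == 8 * d0 - 4 * c0 + 8 * d1 - 8 * c1 + b1);
	return (0);
}

This also shows why the old code could fold the factors of 4 and 8 into psllq shifts on xmm2/xmm3 before spilling: with the shifts gone, those factors move into the scalar expressions, and the saved vectors remain raw running sums that a later call can resume from.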
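
Each compute function now brackets itself with kfpu_begin()/kfpu_end() and a context save/restore precisely so that the running state survives between calls, letting a checksum be built up incrementally. A hypothetical driver sketch follows (these ops are normally invoked through the fletcher_4_ops_t vtable, and the chunked loop here is only illustrative; the chunk size must stay a multiple of the 16-byte stride the SSE loop consumes):

static void
fletcher_4_checksum_chunked(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	fletcher_4_ctx_t ctx;
	const uint8_t *p = buf;
	uint64_t chunk = 64 * 1024;	/* multiple of the 16-byte stride */
	uint64_t off, len;

	fletcher_4_sse2_init(&ctx);	/* zero all four lanes */
	for (off = 0; off < size; off += len) {
		len = (size - off < chunk) ? size - off : chunk;
		/* restores xmm0-3 from ctx, accumulates, saves them back */
		fletcher_4_sse2_native(&ctx, p + off, len);
	}
	fletcher_4_sse2_fini(&ctx, zcp);	/* recombine lanes into zcp */
}

Because FPU state is no longer held live across the whole init/compute/fini span, the kernel FPU is borrowed only for the duration of each chunk, making the context structure, rather than the xmm registers, the unit of persistence.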