diff options
Diffstat (limited to 'module/zcommon')
-rw-r--r-- | module/zcommon/zfs_fletcher.c | 88 | ||||
-rw-r--r-- | module/zcommon/zfs_fletcher_aarch64_neon.c | 2 | ||||
-rw-r--r-- | module/zcommon/zfs_fletcher_avx512.c | 2 | ||||
-rw-r--r-- | module/zcommon/zfs_fletcher_intel.c | 2 | ||||
-rw-r--r-- | module/zcommon/zfs_fletcher_sse.c | 5 |
5 files changed, 64 insertions, 35 deletions
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 5a991ba60..b75d8ab00 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -140,6 +140,7 @@ #include <sys/zio_checksum.h> #include <sys/zfs_context.h> #include <zfs_fletcher.h> +#include <linux/simd.h> #define FLETCHER_MIN_SIMD_SIZE 64 @@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector { const char *fis_name; uint32_t fis_sel; } fletcher_4_impl_selectors[] = { -#if !defined(_KERNEL) { "cycle", IMPL_CYCLE }, -#endif { "fastest", IMPL_FASTEST }, { "scalar", IMPL_SCALAR } }; #if defined(_KERNEL) static kstat_t *fletcher_4_kstat; -#endif static struct fletcher_4_kstat { uint64_t native; uint64_t byteswap; } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; +#endif /* Indicate that benchmark has been completed */ static boolean_t fletcher_4_initialized = B_FALSE; @@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val) return (err); } +/* + * Returns the Fletcher 4 operations for checksums. When a SIMD + * implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. + */ static inline const fletcher_4_ops_t * fletcher_4_impl_get(void) { - fletcher_4_ops_t *ops = NULL; - const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + if (!kfpu_allowed()) + return (&fletcher_4_superscalar4_ops); + + const fletcher_4_ops_t *ops = NULL; + uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); switch (impl) { case IMPL_FASTEST: ASSERT(fletcher_4_initialized); ops = &fletcher_4_fastest_impl; break; -#if !defined(_KERNEL) - case IMPL_CYCLE: { + case IMPL_CYCLE: + /* Cycle through supported implementations */ ASSERT(fletcher_4_initialized); ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); - static uint32_t cycle_count = 0; uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; ops = fletcher_4_supp_impls[idx]; - } - break; -#endif + break; default: ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); - ops = fletcher_4_supp_impls[impl]; break; } @@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); +#if defined(_KERNEL) static void fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) { @@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) /* restore original selection */ atomic_swap_32(&fletcher_4_impl_chosen, sel_save); } +#endif /* _KERNEL */ -void -fletcher_4_init(void) +/* + * Initialize and benchmark all supported implementations. + */ +static void +fletcher_4_benchmark(void *arg) { - static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ fletcher_4_ops_t *curr_impl; - char *databuf; int i, c; - /* move supported impl into fletcher_4_supp_impls */ + /* Move supported implementations into fletcher_4_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; @@ -735,19 +741,10 @@ fletcher_4_init(void) membar_producer(); /* complete fletcher_4_supp_impls[] init */ fletcher_4_supp_impls_cnt = c; /* number of supported impl */ -#if !defined(_KERNEL) - /* Skip benchmarking and use last implementation as fastest */ - memcpy(&fletcher_4_fastest_impl, - fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], - sizeof (fletcher_4_fastest_impl)); - fletcher_4_fastest_impl.name = "fastest"; - membar_producer(); +#if defined(_KERNEL) + static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + char *databuf = vmem_alloc(data_size, KM_SLEEP); - fletcher_4_initialized = B_TRUE; - return; -#endif - /* Benchmark all supported implementations */ - databuf = vmem_alloc(data_size, KM_SLEEP); for (i = 0; i < data_size / sizeof (uint64_t); i++) ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ @@ -755,9 +752,38 @@ fletcher_4_init(void) fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); vmem_free(databuf, data_size); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&fletcher_4_fastest_impl, + fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], + sizeof (fletcher_4_fastest_impl)); + fletcher_4_fastest_impl.name = "fastest"; + membar_producer(); +#endif /* _KERNEL */ +} +void +fletcher_4_init(void) +{ #if defined(_KERNEL) - /* install kstats for all implementations */ + /* + * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are + * run in a kernel threads. This is needed to take advantage of the + * SIMD functionality, see include/linux/simd_x86.h for details. + */ + taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark, + NULL, TQ_SLEEP); + if (id != TASKQID_INVALID) { + taskq_wait_id(system_taskq, id); + } else { + fletcher_4_benchmark(NULL); + } + + /* Install kstats for all implementations */ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); if (fletcher_4_kstat != NULL) { @@ -769,6 +795,8 @@ fletcher_4_init(void) fletcher_4_kstat_addr); kstat_install(fletcher_4_kstat); } +#else + fletcher_4_benchmark(NULL); #endif /* Finish initialization */ diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index bd2db2b20..3b3c1b52b 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16))); static boolean_t fletcher_4_aarch64_neon_valid(void) { - return (B_TRUE); + return (kfpu_allowed()); } const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 7260a9864..0d4cff21a 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); static boolean_t fletcher_4_avx512f_valid(void) { - return (zfs_avx512f_available()); + return (kfpu_allowed() && zfs_avx512f_available()); } const fletcher_4_ops_t fletcher_4_avx512f_ops = { diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index 6dac047da..7f12efe6d 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_avx2_valid(void) { - return (zfs_avx_available() && zfs_avx2_available()); + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); } const fletcher_4_ops_t fletcher_4_avx2_ops = { diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index a0b42e5f5..e6389d6e5 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_sse2_valid(void) { - return (zfs_sse2_available()); + return (kfpu_allowed() && zfs_sse2_available()); } const fletcher_4_ops_t fletcher_4_sse2_ops = { @@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) static boolean_t fletcher_4_ssse3_valid(void) { - return (zfs_sse2_available() && zfs_ssse3_available()); + return (kfpu_allowed() && zfs_sse2_available() && + zfs_ssse3_available()); } const fletcher_4_ops_t fletcher_4_ssse3_ops = { |