aboutsummaryrefslogtreecommitdiffstats
path: root/module/zcommon
diff options
context:
space:
mode:
Diffstat (limited to 'module/zcommon')
-rw-r--r--module/zcommon/zfs_fletcher.c88
-rw-r--r--module/zcommon/zfs_fletcher_aarch64_neon.c2
-rw-r--r--module/zcommon/zfs_fletcher_avx512.c2
-rw-r--r--module/zcommon/zfs_fletcher_intel.c2
-rw-r--r--module/zcommon/zfs_fletcher_sse.c5
5 files changed, 64 insertions, 35 deletions
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5a991ba60..b75d8ab00 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -140,6 +140,7 @@
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>
+#include <linux/simd.h>
#define FLETCHER_MIN_SIMD_SIZE 64
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
const char *fis_name;
uint32_t fis_sel;
} fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
{ "cycle", IMPL_CYCLE },
-#endif
{ "fastest", IMPL_FASTEST },
{ "scalar", IMPL_SCALAR }
};
#if defined(_KERNEL)
static kstat_t *fletcher_4_kstat;
-#endif
static struct fletcher_4_kstat {
uint64_t native;
uint64_t byteswap;
} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
/* Indicate that benchmark has been completed */
static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
return (err);
}
+/*
+ * Returns the Fletcher 4 operations for checksums. When a SIMD
+ * implementation is not allowed in the current context, then fallback
+ * to the fastest generic implementation.
+ */
static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)
{
- fletcher_4_ops_t *ops = NULL;
- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ if (!kfpu_allowed())
+ return (&fletcher_4_superscalar4_ops);
+
+ const fletcher_4_ops_t *ops = NULL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
switch (impl) {
case IMPL_FASTEST:
ASSERT(fletcher_4_initialized);
ops = &fletcher_4_fastest_impl;
break;
-#if !defined(_KERNEL)
- case IMPL_CYCLE: {
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
ASSERT(fletcher_4_initialized);
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
static uint32_t cycle_count = 0;
uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
ops = fletcher_4_supp_impls[idx];
- }
- break;
-#endif
+ break;
default:
ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
ops = fletcher_4_supp_impls[impl];
break;
}
@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
zio_cksum_t *);
+#if defined(_KERNEL)
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
{
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
/* restore original selection */
atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
}
+#endif /* _KERNEL */
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
{
- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
fletcher_4_ops_t *curr_impl;
- char *databuf;
int i, c;
- /* move supported impl into fletcher_4_supp_impls */
+ /* Move supported implementations into fletcher_4_supp_impls */
for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
@@ -735,19 +741,10 @@ fletcher_4_init(void)
membar_producer(); /* complete fletcher_4_supp_impls[] init */
fletcher_4_supp_impls_cnt = c; /* number of supported impl */
-#if !defined(_KERNEL)
- /* Skip benchmarking and use last implementation as fastest */
- memcpy(&fletcher_4_fastest_impl,
- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
- sizeof (fletcher_4_fastest_impl));
- fletcher_4_fastest_impl.name = "fastest";
- membar_producer();
+#if defined(_KERNEL)
+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
- fletcher_4_initialized = B_TRUE;
- return;
-#endif
- /* Benchmark all supported implementations */
- databuf = vmem_alloc(data_size, KM_SLEEP);
for (i = 0; i < data_size / sizeof (uint64_t); i++)
((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
@@ -755,9 +752,38 @@ fletcher_4_init(void)
fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
vmem_free(databuf, data_size);
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and used by default.
+ */
+ memcpy(&fletcher_4_fastest_impl,
+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+ sizeof (fletcher_4_fastest_impl));
+ fletcher_4_fastest_impl.name = "fastest";
+ membar_producer();
+#endif /* _KERNEL */
+}
+void
+fletcher_4_init(void)
+{
#if defined(_KERNEL)
- /* install kstats for all implementations */
+ /*
+ * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
+ * run in a kernel threads. This is needed to take advantage of the
+ * SIMD functionality, see include/linux/simd_x86.h for details.
+ */
+ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+ NULL, TQ_SLEEP);
+ if (id != TASKQID_INVALID) {
+ taskq_wait_id(system_taskq, id);
+ } else {
+ fletcher_4_benchmark(NULL);
+ }
+
+ /* Install kstats for all implementations */
fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (fletcher_4_kstat != NULL) {
@@ -769,6 +795,8 @@ fletcher_4_init(void)
fletcher_4_kstat_addr);
kstat_install(fletcher_4_kstat);
}
+#else
+ fletcher_4_benchmark(NULL);
#endif
/* Finish initialization */
diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
index bd2db2b20..3b3c1b52b 100644
--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
static boolean_t fletcher_4_aarch64_neon_valid(void)
{
- return (B_TRUE);
+ return (kfpu_allowed());
}
const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 7260a9864..0d4cff21a 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
static boolean_t
fletcher_4_avx512f_valid(void)
{
- return (zfs_avx512f_available());
+ return (kfpu_allowed() && zfs_avx512f_available());
}
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index 6dac047da..7f12efe6d 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_avx2_valid(void)
{
- return (zfs_avx_available() && zfs_avx2_available());
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
}
const fletcher_4_ops_t fletcher_4_avx2_ops = {
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index a0b42e5f5..e6389d6e5 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_sse2_valid(void)
{
- return (zfs_sse2_available());
+ return (kfpu_allowed() && zfs_sse2_available());
}
const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
static boolean_t fletcher_4_ssse3_valid(void)
{
- return (zfs_sse2_available() && zfs_ssse3_available());
+ return (kfpu_allowed() && zfs_sse2_available() &&
+ zfs_ssse3_available());
}
const fletcher_4_ops_t fletcher_4_ssse3_ops = {