From 24cdeaf12e9e546621902449699fc6d664aeac2b Mon Sep 17 00:00:00 2001
From: Romain Dolbeau <romain.github@dolbeau.name>
Date: Fri, 21 Oct 2016 19:55:49 +0200
Subject: Fletcher4 algorithm implemented in pure NEON for Aarch64 / ARMv8 64
 bits

This is not useful on micro-architectures with a weak NEON
implementation (only 64 bits wide); there, the native version is
slower and the byteswap version is barely faster than scalar.  On
A53 or A57, it's a small improvement over scalar for the native
case and a larger one for byteswap.

Results from an A53 system:
0 0 0x01 -1 0 1499068294333000 1499101101878000
implementation   native         byteswap
scalar           1008227510     755880264
aarch64_neon     1198098720     1044818671
fastest          aarch64_neon   aarch64_neon

Results from an A57 system:
0 0 0x01 -1 0 4407214734807033 4407233933777404
implementation   native         byteswap
scalar           2302071241     1124873346
aarch64_neon     2542214946     2245570352
fastest          aarch64_neon   aarch64_neon

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net>
Closes #5248
---
 include/zfs_fletcher.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/zfs_fletcher.h')

diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h
index 85c2b5a7e..1f113ae2f 100644
--- a/include/zfs_fletcher.h
+++ b/include/zfs_fletcher.h
@@ -77,6 +77,10 @@ typedef struct zfs_fletcher_avx512 {
 	uint64_t v[8] __attribute__((aligned(64)));
 } zfs_fletcher_avx512_t;
 
+typedef struct zfs_fletcher_aarch64_neon {
+	uint64_t v[2] __attribute__((aligned(16)));
+} zfs_fletcher_aarch64_neon_t;
+
 
 typedef union fletcher_4_ctx {
 	zio_cksum_t scalar;
@@ -90,6 +94,9 @@ typedef union fletcher_4_ctx {
 #if defined(__x86_64) && defined(HAVE_AVX512F)
 	zfs_fletcher_avx512_t avx512[4];
 #endif
+#if defined(__aarch64__)
+	zfs_fletcher_aarch64_neon_t aarch64_neon[4];
+#endif
 } fletcher_4_ctx_t;
 
 /*
@@ -128,6 +135,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
 extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
 #endif
 
+#if defined(__aarch64__)
+extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
+#endif
+
 #ifdef	__cplusplus
 }
 #endif
-- 
cgit v1.2.3