-rw-r--r--  include/Makefile.am                            2
-rw-r--r--  include/sys/sha2.h                           127
-rw-r--r--  include/sys/zfs_impl.h                        69
-rw-r--r--  include/sys/zio_checksum.h                     6
-rw-r--r--  lib/libicp/Makefile.am                        28
-rw-r--r--  lib/libzfs/Makefile.am                         3
-rw-r--r--  lib/libzpool/Makefile.am                       2
-rw-r--r--  module/Kbuild.in                              28
-rw-r--r--  module/Makefile.bsd                           36
-rw-r--r--  module/icp/algs/sha2/sha256_impl.c           299
-rw-r--r--  module/icp/algs/sha2/sha2_generic.c          562
-rw-r--r--  module/icp/algs/sha2/sha512_impl.c           276
-rw-r--r--  module/icp/asm-aarch64/sha2/sha256-armv8.S  1999
-rw-r--r--  module/icp/asm-aarch64/sha2/sha512-armv8.S  1558
-rw-r--r--  module/icp/asm-arm/sha2/sha256-armv7.S      2769
-rw-r--r--  module/icp/asm-arm/sha2/sha512-armv7.S      1822
-rw-r--r--  module/icp/asm-ppc64/sha2/sha256-p8.S       1505
-rw-r--r--  module/icp/asm-ppc64/sha2/sha256-ppc.S      2712
-rw-r--r--  module/icp/asm-ppc64/sha2/sha512-p8.S       1706
-rw-r--r--  module/icp/asm-ppc64/sha2/sha512-ppc.S      2958
-rw-r--r--  module/icp/asm-x86_64/sha2/sha256-x86_64.S  5104
-rw-r--r--  module/icp/asm-x86_64/sha2/sha512-x86_64.S  4011
-rw-r--r--  module/icp/include/generic_impl.c            233
-rw-r--r--  module/icp/include/sha2/sha2_impl.h           27
-rw-r--r--  module/icp/io/sha2_mod.c                       1
-rw-r--r--  module/zfs/sha2_zfs.c (renamed from module/zfs/sha256.c)  14
-rw-r--r--  module/zfs/zfs_chksum.c                      124
-rw-r--r--  module/zfs/zfs_impl.c                         61
-rw-r--r--  module/zfs/zio_checksum.c                      8
-rw-r--r--  tests/zfs-tests/cmd/checksum/sha2_test.c      34
30 files changed, 27987 insertions, 97 deletions
diff --git a/include/Makefile.am b/include/Makefile.am
index 1e5c71150..6897e3c5e 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -75,6 +75,7 @@ COMMON_H = \
sys/rrwlock.h \
sys/sa.h \
sys/sa_impl.h \
+ sys/sha2.h \
sys/skein.h \
sys/spa.h \
sys/spa_checkpoint.h \
@@ -124,6 +125,7 @@ COMMON_H = \
sys/zfs_delay.h \
sys/zfs_file.h \
sys/zfs_fuid.h \
+ sys/zfs_impl.h \
sys/zfs_project.h \
sys/zfs_quota.h \
sys/zfs_racct.h \
diff --git a/include/sys/sha2.h b/include/sys/sha2.h
new file mode 100644
index 000000000..81dfbbb8c
--- /dev/null
+++ b/include/sys/sha2.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#ifndef _SYS_SHA2_H
+#define _SYS_SHA2_H
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHA224_BLOCK_LENGTH 64
+#define SHA256_BLOCK_LENGTH 64
+#define SHA384_BLOCK_LENGTH 128
+#define SHA512_BLOCK_LENGTH 128
+
+#define SHA224_DIGEST_LENGTH 28
+#define SHA256_DIGEST_LENGTH 32
+#define SHA384_DIGEST_LENGTH 48
+#define SHA512_DIGEST_LENGTH 64
+
+#define SHA512_224_DIGEST_LENGTH 28
+#define SHA512_256_DIGEST_LENGTH 32
+
+#define SHA256_HMAC_BLOCK_SIZE 64
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+/* sha256 context */
+typedef struct {
+ uint32_t state[8];
+ uint64_t count[2];
+ uint8_t wbuf[64];
+
+ /* const sha256_ops_t *ops */
+ const void *ops;
+} sha256_ctx;
+
+/* sha512 context */
+typedef struct {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t wbuf[128];
+
+	/* const sha512_ops_t *ops */
+ const void *ops;
+} sha512_ctx;
+
+/* SHA2 context */
+typedef struct {
+ union {
+ sha256_ctx sha256;
+ sha512_ctx sha512;
+ };
+
+ /* algorithm type */
+ int algotype;
+} SHA2_CTX;
+
+/* SHA2 algorithm types */
+typedef enum sha2_mech_type {
+ SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */
+ SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */
+ SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */
+ SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */
+ SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */
+ SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */
+ SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */
+ SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */
+ SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */
+ SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */
+ SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */
+} sha2_mech_type_t;
+
+#define SHA256 0
+#define SHA256_HMAC 1
+#define SHA256_HMAC_GEN 2
+#define SHA384 3
+#define SHA384_HMAC 4
+#define SHA384_HMAC_GEN 5
+#define SHA512 6
+#define SHA512_HMAC 7
+#define SHA512_HMAC_GEN 8
+#define SHA512_224 9
+#define SHA512_256 10
+
+/* SHA2 Init function */
+extern void SHA2Init(int algotype, SHA2_CTX *ctx);
+
+/* SHA2 Update function */
+extern void SHA2Update(SHA2_CTX *ctx, const void *data, size_t len);
+
+/* SHA2 Final function */
+extern void SHA2Final(void *digest, SHA2_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_H */
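
The header above is consumed through a simple one-shot pattern. A minimal usage sketch (illustrative only; example_sha256 and its buffer are not part of this change):

	#include <sys/sha2.h>

	static void
	example_sha256(const void *buf, size_t len,
	    uint8_t digest[SHA256_DIGEST_LENGTH])
	{
		SHA2_CTX ctx;

		SHA2Init(SHA256, &ctx);		/* pick the algorithm variant */
		SHA2Update(&ctx, buf, len);	/* may be called repeatedly */
		SHA2Final(digest, &ctx);	/* writes 32 bytes, zeroes ctx */
	}
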
diff --git a/include/sys/zfs_impl.h b/include/sys/zfs_impl.h
new file mode 100644
index 000000000..df4899f13
--- /dev/null
+++ b/include/sys/zfs_impl.h
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#ifndef _SYS_ZFS_IMPL_H
+#define _SYS_ZFS_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* generic implementation backends */
+typedef struct
+{
+ /* algorithm name */
+ const char *name;
+
+ /* get number of supported implementations */
+ uint32_t (*getcnt)(void);
+
+ /* get id of selected implementation */
+ uint32_t (*getid)(void);
+
+ /* get name of selected implementation */
+ const char *(*getname)(void);
+
+	/* set implementation id as the fastest */
+ void (*set_fastest)(uint32_t id);
+
+ /* set implementation by id */
+ void (*setid)(uint32_t id);
+
+ /* set implementation by name */
+ int (*setname)(const char *val);
+} zfs_impl_t;
+
+/* return the function pointer table for the given algorithm */
+extern const zfs_impl_t *zfs_impl_get_ops(const char *algo);
+
+extern const zfs_impl_t zfs_blake3_ops;
+extern const zfs_impl_t zfs_sha256_ops;
+extern const zfs_impl_t zfs_sha512_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IMPL_H */
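
Each algorithm exports one of these tables, so generic tooling can tune or query an implementation without knowing algorithm internals. A sketch of a consumer (hypothetical function, not part of the patch, assuming the usual 0-on-success convention for setname):

	#include <sys/zfs_impl.h>

	static void
	example_pin_impl(const char *algo, const char *impl)
	{
		const zfs_impl_t *ops = zfs_impl_get_ops(algo);

		if (ops != NULL && ops->setname(impl) == 0) {
			/* ops->getname() now reports the chosen backend */
		}
	}
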
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 5903678df..9fb79ab4a 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -110,9 +110,9 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t
*/
/* SHA2 */
-extern zio_checksum_t abd_checksum_SHA256;
-extern zio_checksum_t abd_checksum_SHA512_native;
-extern zio_checksum_t abd_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_sha256;
+extern zio_checksum_t abd_checksum_sha512_native;
+extern zio_checksum_t abd_checksum_sha512_byteswap;
/* Skein */
extern zio_checksum_t abd_checksum_skein_native;
diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am
index 7c6cf71de..4ba55b215 100644
--- a/lib/libicp/Makefile.am
+++ b/lib/libicp/Makefile.am
@@ -16,7 +16,6 @@ nodist_libicp_la_SOURCES = \
module/icp/algs/blake3/blake3.c \
module/icp/algs/blake3/blake3_generic.c \
module/icp/algs/blake3/blake3_impl.c \
- module/icp/algs/blake3/blake3_x86-64.c \
module/icp/algs/edonr/edonr.c \
module/icp/algs/modes/modes.c \
module/icp/algs/modes/cbc.c \
@@ -26,7 +25,9 @@ nodist_libicp_la_SOURCES = \
module/icp/algs/modes/ctr.c \
module/icp/algs/modes/ccm.c \
module/icp/algs/modes/ecb.c \
- module/icp/algs/sha2/sha2.c \
+ module/icp/algs/sha2/sha2_generic.c \
+ module/icp/algs/sha2/sha256_impl.c \
+ module/icp/algs/sha2/sha512_impl.c \
module/icp/algs/skein/skein.c \
module/icp/algs/skein/skein_block.c \
module/icp/algs/skein/skein_iv.c \
@@ -38,18 +39,31 @@ nodist_libicp_la_SOURCES = \
module/icp/core/kcf_prov_lib.c \
module/icp/core/kcf_callprov.c \
module/icp/core/kcf_mech_tabs.c \
- module/icp/core/kcf_prov_tabs.c
+ module/icp/core/kcf_prov_tabs.c \
+ module/zfs/zfs_impl.c
if TARGET_CPU_AARCH64
nodist_libicp_la_SOURCES += \
module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
- module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+ module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
+ module/icp/asm-aarch64/sha2/sha256-armv8.S \
+ module/icp/asm-aarch64/sha2/sha512-armv8.S
+endif
+
+if TARGET_CPU_ARM
+nodist_libicp_la_SOURCES += \
+ module/icp/asm-arm/sha2/sha256-armv7.S \
+ module/icp/asm-arm/sha2/sha512-armv7.S
endif
if TARGET_CPU_POWERPC
nodist_libicp_la_SOURCES += \
module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \
- module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
+ module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S \
+ module/icp/asm-ppc64/sha2/sha256-ppc.S \
+ module/icp/asm-ppc64/sha2/sha512-ppc.S \
+ module/icp/asm-ppc64/sha2/sha256-p8.S \
+ module/icp/asm-ppc64/sha2/sha512-p8.S
endif
if TARGET_CPU_X86_64
@@ -60,8 +74,8 @@ nodist_libicp_la_SOURCES += \
module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \
module/icp/asm-x86_64/modes/ghash-x86_64.S \
- module/icp/asm-x86_64/sha2/sha256_impl.S \
- module/icp/asm-x86_64/sha2/sha512_impl.S \
+ module/icp/asm-x86_64/sha2/sha256-x86_64.S \
+ module/icp/asm-x86_64/sha2/sha512-x86_64.S \
module/icp/asm-x86_64/blake3/blake3_avx2.S \
module/icp/asm-x86_64/blake3/blake3_avx512.S \
module/icp/asm-x86_64/blake3/blake3_sse2.S \
diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am
index f5eb84679..cffe34122 100644
--- a/lib/libzfs/Makefile.am
+++ b/lib/libzfs/Makefile.am
@@ -34,8 +34,6 @@ dist_libzfs_la_SOURCES += \
endif
nodist_libzfs_la_SOURCES = \
- module/icp/algs/sha2/sha2.c \
- \
module/zcommon/cityhash.c \
module/zcommon/zfeature_common.c \
module/zcommon/zfs_comutil.c \
@@ -52,7 +50,6 @@ nodist_libzfs_la_SOURCES = \
module/zcommon/zpool_prop.c \
module/zcommon/zprop_common.c
-
libzfs_la_LIBADD = \
libshare.la \
libzfs_core.la \
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 0cc1997f7..0748f1240 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -118,7 +118,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/refcount.c \
module/zfs/rrwlock.c \
module/zfs/sa.c \
- module/zfs/sha256.c \
+ module/zfs/sha2_zfs.c \
module/zfs/skein_zfs.c \
module/zfs/spa.c \
module/zfs/spa_checkpoint.c \
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 6b1c9c48b..21606b8ca 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -85,7 +85,6 @@ ICP_OBJS := \
algs/blake3/blake3.o \
algs/blake3/blake3_generic.o \
algs/blake3/blake3_impl.o \
- algs/blake3/blake3_x86-64.o \
algs/edonr/edonr.o \
algs/modes/cbc.o \
algs/modes/ccm.o \
@@ -94,6 +93,9 @@ ICP_OBJS := \
algs/modes/gcm.o \
algs/modes/gcm_generic.o \
algs/modes/modes.o \
+ algs/sha2/sha2_generic.o \
+ algs/sha2/sha256_impl.o \
+ algs/sha2/sha512_impl.o \
algs/skein/skein.o \
algs/skein/skein_block.o \
algs/skein/skein_iv.o \
@@ -119,30 +121,40 @@ ICP_OBJS_X86_64 := \
asm-x86_64/blake3/blake3_avx512.o \
asm-x86_64/blake3/blake3_sse2.o \
asm-x86_64/blake3/blake3_sse41.o \
+ asm-x86_64/sha2/sha256-x86_64.o \
+ asm-x86_64/sha2/sha512-x86_64.o \
asm-x86_64/modes/aesni-gcm-x86_64.o \
asm-x86_64/modes/gcm_pclmulqdq.o \
asm-x86_64/modes/ghash-x86_64.o
-
ICP_OBJS_X86 := \
algs/aes/aes_impl_aesni.o \
algs/aes/aes_impl_x86-64.o \
algs/modes/gcm_pclmulqdq.o
+ICP_OBJS_ARM := \
+ asm-arm/sha2/sha256-armv7.o \
+ asm-arm/sha2/sha512-armv7.o
ICP_OBJS_ARM64 := \
asm-aarch64/blake3/b3_aarch64_sse2.o \
- asm-aarch64/blake3/b3_aarch64_sse41.o
-
+ asm-aarch64/blake3/b3_aarch64_sse41.o \
+ asm-aarch64/sha2/sha256-armv8.o \
+ asm-aarch64/sha2/sha512-armv8.o
ICP_OBJS_PPC_PPC64 := \
asm-ppc64/blake3/b3_ppc64le_sse2.o \
- asm-ppc64/blake3/b3_ppc64le_sse41.o
+ asm-ppc64/blake3/b3_ppc64le_sse41.o \
+ asm-ppc64/sha2/sha256-p8.o \
+ asm-ppc64/sha2/sha512-p8.o \
+ asm-ppc64/sha2/sha256-ppc.o \
+ asm-ppc64/sha2/sha512-ppc.o
zfs-objs += $(addprefix icp/,$(ICP_OBJS))
zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86))
zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86))
zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64))
+zfs-$(CONFIG_ARM) += $(addprefix icp/,$(ICP_OBJS_ARM))
zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64))
zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
@@ -156,6 +168,11 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
# Suppress objtool "return with modified stack frame" warnings.
OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
+# Suppress objtool "unsupported stack pointer realignment" warnings.
+# See #6950 for the reasoning.
+OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y
+OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y
+
LUA_OBJS := \
lapi.o \
lauxlib.o \
@@ -382,6 +399,7 @@ ZFS_OBJS := \
zfs_chksum.o \
zfs_fm.o \
zfs_fuid.o \
+ zfs_impl.o \
zfs_ioctl.o \
zfs_log.o \
zfs_onexit.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 1663dcec6..667678796 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -13,10 +13,15 @@ KMOD= openzfs
${SRCDIR}/lua \
${SRCDIR}/nvpair \
${SRCDIR}/icp/algs/blake3 \
+ ${SRCDIR}/icp/algs/edonr \
+ ${SRCDIR}/icp/algs/sha2 \
${SRCDIR}/icp/asm-aarch64/blake3 \
+ ${SRCDIR}/icp/asm-aarch64/sha2 \
+ ${SRCDIR}/icp/asm-arm/sha2 \
+ ${SRCDIR}/icp/asm-ppc64/sha2 \
${SRCDIR}/icp/asm-ppc64/blake3 \
${SRCDIR}/icp/asm-x86_64/blake3 \
- ${SRCDIR}/icp/algs/edonr \
+ ${SRCDIR}/icp/asm-x86_64/sha2 \
${SRCDIR}/os/freebsd/spl \
${SRCDIR}/os/freebsd/zfs \
${SRCDIR}/unicode \
@@ -27,8 +32,6 @@ KMOD= openzfs
${SRCDIR}/zstd/lib/compress \
${SRCDIR}/zstd/lib/decompress
-
-
CFLAGS+= -I${INCDIR}
CFLAGS+= -I${INCDIR}/os/freebsd
CFLAGS+= -I${INCDIR}/os/freebsd/spl
@@ -88,8 +91,7 @@ SRCS+= edonr.c
#icp/algs/blake3
SRCS+= blake3.c \
blake3_generic.c \
- blake3_impl.c \
- blake3_x86-64.c
+ blake3_impl.c
#icp/asm-aarch64/blake3
SRCS+= b3_aarch64_sse2.S \
@@ -105,6 +107,29 @@ SRCS+= blake3_avx2.S \
blake3_sse2.S \
blake3_sse41.S
+#icp/algs/sha2
+SRCS+= sha2_generic.c \
+ sha256_impl.c \
+ sha512_impl.c
+
+#icp/asm-arm/sha2
+SRCS+= sha256-armv7.S \
+ sha512-armv7.S
+
+#icp/asm-aarch64/sha2
+SRCS+= sha256-armv8.S \
+ sha512-armv8.S
+
+#icp/asm-ppc64/sha2
+SRCS+= sha256-p8.S \
+ sha512-p8.S \
+ sha256-ppc.S \
+ sha512-ppc.S
+
+#icp/asm-x86_64/sha2
+SRCS+= sha256-x86_64.S \
+ sha512-x86_64.S
+
#lua
SRCS+= lapi.c \
lauxlib.c \
@@ -320,6 +345,7 @@ SRCS+= abd.c \
zfs_file_os.c \
zfs_fm.c \
zfs_fuid.c \
+ zfs_impl.c \
zfs_ioctl.c \
zfs_log.c \
zfs_onexit.c \
diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c
new file mode 100644
index 000000000..024cfb1e4
--- /dev/null
+++ b/module/icp/algs/sha2/sha256_impl.c
@@ -0,0 +1,299 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+#include <sys/simd.h>
+
+#include <sha2/sha2_impl.h>
+
+#define TF(E, N) \
+ extern void E(uint32_t s[8], const void *, size_t); \
+ static inline void N(uint32_t s[8], const void *d, size_t b) { \
+ kfpu_begin(); E(s, d, b); kfpu_end(); \
+}
+
+/* the baseline implementations are always supported */
+static inline boolean_t sha2_is_supported(void)
+{
+ return (B_TRUE);
+}
+
+#if defined(__x86_64)
+
+extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_x64_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = zfs_sha256_transform_x64,
+ .name = "x64"
+};
+
+#if defined(HAVE_SSSE3)
+static boolean_t sha2_have_ssse3(void)
+{
+ return (kfpu_allowed() && zfs_ssse3_available());
+}
+
+TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3);
+const sha256_ops_t sha256_ssse3_impl = {
+ .is_supported = sha2_have_ssse3,
+ .transform = tf_sha256_ssse3,
+ .name = "ssse3"
+};
+#endif
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+ return (kfpu_allowed() && zfs_avx_available());
+}
+
+TF(zfs_sha256_transform_avx, tf_sha256_avx);
+const sha256_ops_t sha256_avx_impl = {
+ .is_supported = sha2_have_avx,
+ .transform = tf_sha256_avx,
+ .name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+ return (kfpu_allowed() && zfs_avx2_available());
+}
+
+TF(zfs_sha256_transform_avx2, tf_sha256_avx2);
+const sha256_ops_t sha256_avx2_impl = {
+ .is_supported = sha2_have_avx2,
+ .transform = tf_sha256_avx2,
+ .name = "avx2"
+};
+#endif
+
+#if defined(HAVE_SSE4_1)
+static boolean_t sha2_have_shani(void)
+{
+ return (kfpu_allowed() && zfs_sse4_1_available() && \
+ zfs_shani_available());
+}
+
+TF(zfs_sha256_transform_shani, tf_sha256_shani);
+const sha256_ops_t sha256_shani_impl = {
+ .is_supported = sha2_have_shani,
+ .transform = tf_sha256_shani,
+ .name = "shani"
+};
+#endif
+
+#elif defined(__aarch64__) || defined(__arm__)
+static boolean_t sha256_have_neon(void)
+{
+ return (kfpu_allowed() && zfs_neon_available());
+}
+
+static boolean_t sha256_have_armv8ce(void)
+{
+ return (kfpu_allowed() && zfs_sha256_available());
+}
+
+extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_armv7_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = zfs_sha256_block_armv7,
+ .name = "armv7"
+};
+
+TF(zfs_sha256_block_neon, tf_sha256_neon);
+const sha256_ops_t sha256_neon_impl = {
+ .is_supported = sha256_have_neon,
+ .transform = tf_sha256_neon,
+ .name = "neon"
+};
+
+TF(zfs_sha256_block_armv8, tf_sha256_armv8ce);
+const sha256_ops_t sha256_armv8_impl = {
+ .is_supported = sha256_have_armv8ce,
+ .transform = tf_sha256_armv8ce,
+ .name = "armv8-ce"
+};
+
+#elif defined(__PPC64__)
+static boolean_t sha256_have_vsx(void)
+{
+ return (kfpu_allowed() && zfs_vsx_available());
+}
+
+TF(zfs_sha256_ppc, tf_sha256_ppc);
+const sha256_ops_t sha256_ppc_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = tf_sha256_ppc,
+ .name = "ppc"
+};
+
+TF(zfs_sha256_power8, tf_sha256_power8);
+const sha256_ops_t sha256_power8_impl = {
+ .is_supported = sha256_have_vsx,
+ .transform = tf_sha256_power8,
+ .name = "power8"
+};
+#endif /* __PPC64__ */
+
+/* the generic implementation from sha2_generic.c */
+extern const sha256_ops_t sha256_generic_impl;
+
+/* array with all sha256 implementations */
+static const sha256_ops_t *const sha256_impls[] = {
+ &sha256_generic_impl,
+#if defined(__x86_64)
+ &sha256_x64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSSE3)
+ &sha256_ssse3_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX)
+ &sha256_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+ &sha256_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSE4_1)
+ &sha256_shani_impl,
+#endif
+#if defined(__aarch64__) || defined(__arm__)
+ &sha256_armv7_impl,
+ &sha256_neon_impl,
+ &sha256_armv8_impl,
+#endif
+#if defined(__PPC64__)
+ &sha256_ppc_impl,
+ &sha256_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define IMPL_NAME "sha256"
+#define IMPL_OPS_T sha256_ops_t
+#define IMPL_ARRAY sha256_impls
+#define IMPL_GET_OPS sha256_get_ops
+#define ZFS_IMPL_OPS zfs_sha256_ops
+#include <generic_impl.c>
+
+#ifdef _KERNEL
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+sha256_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+ const uint32_t impl = IMPL_READ(generic_impl_chosen);
+ char *fmt;
+ int cnt = 0;
+
+ /* cycling */
+ fmt = IMPL_FMT(impl, IMPL_CYCLE);
+ cnt += sprintf(buffer + cnt, fmt, "cycle");
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+ /* list all supported implementations */
+ generic_impl_init();
+ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ cnt += sprintf(buffer + cnt, fmt,
+ generic_supp_impls[i]->name);
+ }
+
+ return (cnt);
+}
+
+static int
+sha256_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+ (void) unused;
+ return (generic_impl_setname(val));
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/sbuf.h>
+
+static int
+sha256_param(ZFS_MODULE_PARAM_ARGS)
+{
+ int err;
+
+ generic_impl_init();
+ if (req->newptr == NULL) {
+ const uint32_t impl = IMPL_READ(generic_impl_chosen);
+ const int init_buflen = 64;
+ const char *fmt;
+ struct sbuf *s;
+
+ s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+ /* cycling */
+ fmt = IMPL_FMT(impl, IMPL_CYCLE);
+ (void) sbuf_printf(s, fmt, "cycle");
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ (void) sbuf_printf(s, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
+ }
+
+ err = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (err);
+ }
+
+ char buf[16];
+
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err) {
+ return (err);
+ }
+
+ return (-generic_impl_setname(buf));
+}
+#endif
+
+#undef IMPL_FMT
+
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl,
+ sha256_param_set, sha256_param_get, ZMOD_RW, \
+ "Select SHA256 implementation.");
+#endif
+
+#undef TF
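
The TF() macro at the top of this file is what keeps the SIMD code kernel-safe: each optimized transform runs inside a kfpu_begin()/kfpu_end() pair so the FPU/vector register state is saved and restored around the assembly call. Expanded by hand for the SSSE3 entry, the macro is equivalent to roughly this (a sketch of the expansion):

	extern void zfs_sha256_transform_ssse3(uint32_t s[8],
	    const void *, size_t);

	static inline void
	tf_sha256_ssse3(uint32_t s[8], const void *d, size_t b)
	{
		kfpu_begin();		/* save FPU/SIMD state */
		zfs_sha256_transform_ssse3(s, d, b);
		kfpu_end();		/* restore it */
	}
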
diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c
new file mode 100644
index 000000000..e69dc7771
--- /dev/null
+++ b/module/icp/algs/sha2/sha2_generic.c
@@ -0,0 +1,562 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on public domain code in cppcrypto 0.10.
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+
+#include <sha2/sha2_impl.h>
+
+/*
+ * On i386, gcc raises this warning for sha512_generic():
+ * error: the frame size of 1040 bytes is larger than 1024
+ */
+#if defined(__GNUC__) && defined(_ILP32)
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+/* SHA256 */
+static const uint32_t SHA256_K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((y) & (z)) | (((y) | (z)) & (x)))
+
+#define rotr32(x, n) (((x) >> n) | ((x) << (32 - n)))
+#define sum0(x) (rotr32((x), 2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+#define sum1(x) (rotr32((x), 6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3))
+#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
+
+#define WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) \
+ + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15]))
+
+#define COMPRESS(i, j, K) \
+ T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? WU(j): W[j]); \
+ T2 = sum0(a) + Maj(a, b, c); \
+ h = g, g = f, f = e, e = d + T1; \
+ d = c, c = b, b = a, a = T1 + T2;
+
+static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
+{
+ uint64_t blk;
+
+ for (blk = 0; blk < num_blks; blk++) {
+ uint32_t W[16];
+ uint32_t a, b, c, d, e, f, g, h;
+ uint32_t T1, T2;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ W[i] = BE_32( \
+ (((const uint32_t *)(data))[blk * 16 + i]));
+ }
+
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ f = state[5];
+ g = state[6];
+ h = state[7];
+
+ for (i = 0; i <= 63; i += 16) {
+ COMPRESS(i, 0, SHA256_K);
+ COMPRESS(i, 1, SHA256_K);
+ COMPRESS(i, 2, SHA256_K);
+ COMPRESS(i, 3, SHA256_K);
+ COMPRESS(i, 4, SHA256_K);
+ COMPRESS(i, 5, SHA256_K);
+ COMPRESS(i, 6, SHA256_K);
+ COMPRESS(i, 7, SHA256_K);
+ COMPRESS(i, 8, SHA256_K);
+ COMPRESS(i, 9, SHA256_K);
+ COMPRESS(i, 10, SHA256_K);
+ COMPRESS(i, 11, SHA256_K);
+ COMPRESS(i, 12, SHA256_K);
+ COMPRESS(i, 13, SHA256_K);
+ COMPRESS(i, 14, SHA256_K);
+ COMPRESS(i, 15, SHA256_K);
+ }
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ state[5] += f;
+ state[6] += g;
+ state[7] += h;
+ }
+}
+
+#undef sum0
+#undef sum1
+#undef sigma0
+#undef sigma1
+
+#define rotr64(x, n) (((x) >> n) | ((x) << (64 - n)))
+#define sum0(x) (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
+#define sum1(x) (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
+#define sigma0(x) (rotr64((x), 1) ^ rotr64((x), 8) ^ ((x) >> 7))
+#define sigma1(x) (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6))
+
+/* SHA512 */
+static const uint64_t SHA512_K[80] = {
+ 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
+ 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
+ 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
+ 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+ 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
+ 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
+ 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
+ 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+ 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
+ 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
+ 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
+ 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+ 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
+ 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
+ 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
+ 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+ 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
+ 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
+ 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
+ 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+ 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
+ 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
+ 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
+ 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+ 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
+ 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
+ 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+};
+
+static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
+{
+ uint64_t blk;
+
+ for (blk = 0; blk < num_blks; blk++) {
+ uint64_t W[16];
+ uint64_t a, b, c, d, e, f, g, h;
+ uint64_t T1, T2;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ W[i] = BE_64( \
+ (((const uint64_t *)(data))[blk * 16 + i]));
+ }
+
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ f = state[5];
+ g = state[6];
+ h = state[7];
+
+ for (i = 0; i <= 79; i += 16) {
+ COMPRESS(i, 0, SHA512_K);
+ COMPRESS(i, 1, SHA512_K);
+ COMPRESS(i, 2, SHA512_K);
+ COMPRESS(i, 3, SHA512_K);
+ COMPRESS(i, 4, SHA512_K);
+ COMPRESS(i, 5, SHA512_K);
+ COMPRESS(i, 6, SHA512_K);
+ COMPRESS(i, 7, SHA512_K);
+ COMPRESS(i, 8, SHA512_K);
+ COMPRESS(i, 9, SHA512_K);
+ COMPRESS(i, 10, SHA512_K);
+ COMPRESS(i, 11, SHA512_K);
+ COMPRESS(i, 12, SHA512_K);
+ COMPRESS(i, 13, SHA512_K);
+ COMPRESS(i, 14, SHA512_K);
+ COMPRESS(i, 15, SHA512_K);
+ }
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ state[5] += f;
+ state[6] += g;
+ state[7] += h;
+ }
+}
+
+static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
+{
+ uint64_t pos = ctx->count[0];
+ uint64_t total = ctx->count[1];
+ uint8_t *m = ctx->wbuf;
+ const sha256_ops_t *ops = ctx->ops;
+
+ if (pos && pos + len >= 64) {
+ memcpy(m + pos, data, 64 - pos);
+ ops->transform(ctx->state, m, 1);
+ len -= 64 - pos;
+ total += (64 - pos) * 8;
+ data += 64 - pos;
+ pos = 0;
+ }
+
+ if (len >= 64) {
+ uint32_t blocks = len / 64;
+ uint32_t bytes = blocks * 64;
+ ops->transform(ctx->state, data, blocks);
+ len -= bytes;
+ total += (bytes) * 8;
+ data += bytes;
+ }
+ memcpy(m + pos, data, len);
+
+ pos += len;
+ total += len * 8;
+ ctx->count[0] = pos;
+ ctx->count[1] = total;
+}
+
+static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
+{
+ uint64_t pos = ctx->count[0];
+ uint64_t total = ctx->count[1];
+ uint8_t *m = ctx->wbuf;
+ const sha512_ops_t *ops = ctx->ops;
+
+ if (pos && pos + len >= 128) {
+ memcpy(m + pos, data, 128 - pos);
+ ops->transform(ctx->state, m, 1);
+ len -= 128 - pos;
+ total += (128 - pos) * 8;
+ data += 128 - pos;
+ pos = 0;
+ }
+
+ if (len >= 128) {
+ uint64_t blocks = len / 128;
+ uint64_t bytes = blocks * 128;
+ ops->transform(ctx->state, data, blocks);
+ len -= bytes;
+ total += (bytes) * 8;
+ data += bytes;
+ }
+ memcpy(m + pos, data, len);
+
+ pos += len;
+ total += len * 8;
+ ctx->count[0] = pos;
+ ctx->count[1] = total;
+}
+
+static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
+{
+ uint64_t mlen, pos = ctx->count[0];
+ uint8_t *m = ctx->wbuf;
+ uint32_t *R = (uint32_t *)result;
+ const sha256_ops_t *ops = ctx->ops;
+
+ m[pos++] = 0x80;
+ if (pos > 56) {
+ memset(m + pos, 0, 64 - pos);
+ ops->transform(ctx->state, m, 1);
+ pos = 0;
+ }
+
+ memset(m + pos, 0, 64 - pos);
+ mlen = BE_64(ctx->count[1]);
+ memcpy(m + (64 - 8), &mlen, 64 / 8);
+ ops->transform(ctx->state, m, 1);
+
+ switch (bits) {
+	case 224: /* 28 bytes - currently unused */
+ R[0] = BE_32(ctx->state[0]);
+ R[1] = BE_32(ctx->state[1]);
+ R[2] = BE_32(ctx->state[2]);
+ R[3] = BE_32(ctx->state[3]);
+ R[4] = BE_32(ctx->state[4]);
+ R[5] = BE_32(ctx->state[5]);
+ R[6] = BE_32(ctx->state[6]);
+ break;
+ case 256: /* 32 */
+ R[0] = BE_32(ctx->state[0]);
+ R[1] = BE_32(ctx->state[1]);
+ R[2] = BE_32(ctx->state[2]);
+ R[3] = BE_32(ctx->state[3]);
+ R[4] = BE_32(ctx->state[4]);
+ R[5] = BE_32(ctx->state[5]);
+ R[6] = BE_32(ctx->state[6]);
+ R[7] = BE_32(ctx->state[7]);
+ break;
+ }
+
+ memset(ctx, 0, sizeof (*ctx));
+}
+
+static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
+{
+ uint64_t mlen, pos = ctx->count[0];
+ uint8_t *m = ctx->wbuf, *r;
+ uint64_t *R = (uint64_t *)result;
+ const sha512_ops_t *ops = ctx->ops;
+
+ m[pos++] = 0x80;
+ if (pos > 112) {
+ memset(m + pos, 0, 128 - pos);
+ ops->transform(ctx->state, m, 1);
+ pos = 0;
+ }
+
+ memset(m + pos, 0, 128 - pos);
+ mlen = BE_64(ctx->count[1]);
+ memcpy(m + (128 - 8), &mlen, 64 / 8);
+ ops->transform(ctx->state, m, 1);
+
+ switch (bits) {
+	case 224: /* 28 bytes => 3.5 x 8 */
+ r = result + 24;
+ R[0] = BE_64(ctx->state[0]);
+ R[1] = BE_64(ctx->state[1]);
+ R[2] = BE_64(ctx->state[2]);
+ /* last 4 bytes are special here */
+ *r++ = (uint8_t)(ctx->state[3] >> 56);
+ *r++ = (uint8_t)(ctx->state[3] >> 48);
+ *r++ = (uint8_t)(ctx->state[3] >> 40);
+ *r++ = (uint8_t)(ctx->state[3] >> 32);
+ break;
+ case 256: /* 32 */
+ R[0] = BE_64(ctx->state[0]);
+ R[1] = BE_64(ctx->state[1]);
+ R[2] = BE_64(ctx->state[2]);
+ R[3] = BE_64(ctx->state[3]);
+ break;
+ case 384: /* 48 */
+ R[0] = BE_64(ctx->state[0]);
+ R[1] = BE_64(ctx->state[1]);
+ R[2] = BE_64(ctx->state[2]);
+ R[3] = BE_64(ctx->state[3]);
+ R[4] = BE_64(ctx->state[4]);
+ R[5] = BE_64(ctx->state[5]);
+ break;
+ case 512: /* 64 */
+ R[0] = BE_64(ctx->state[0]);
+ R[1] = BE_64(ctx->state[1]);
+ R[2] = BE_64(ctx->state[2]);
+ R[3] = BE_64(ctx->state[3]);
+ R[4] = BE_64(ctx->state[4]);
+ R[5] = BE_64(ctx->state[5]);
+ R[6] = BE_64(ctx->state[6]);
+ R[7] = BE_64(ctx->state[7]);
+ break;
+ }
+
+ memset(ctx, 0, sizeof (*ctx));
+}
+
+/* SHA2 Init function */
+void
+SHA2Init(int algotype, SHA2_CTX *ctx)
+{
+ sha256_ctx *ctx256 = &ctx->sha256;
+ sha512_ctx *ctx512 = &ctx->sha512;
+
+ ASSERT3U(algotype, >=, SHA256_MECH_INFO_TYPE);
+ ASSERT3U(algotype, <=, SHA512_256_MECH_INFO_TYPE);
+
+ memset(ctx, 0, sizeof (*ctx));
+ ctx->algotype = algotype;
+ switch (ctx->algotype) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ ctx256->state[0] = 0x6a09e667;
+ ctx256->state[1] = 0xbb67ae85;
+ ctx256->state[2] = 0x3c6ef372;
+ ctx256->state[3] = 0xa54ff53a;
+ ctx256->state[4] = 0x510e527f;
+ ctx256->state[5] = 0x9b05688c;
+ ctx256->state[6] = 0x1f83d9ab;
+ ctx256->state[7] = 0x5be0cd19;
+ ctx256->count[0] = 0;
+ ctx256->ops = sha256_get_ops();
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ ctx512->state[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx512->state[1] = 0x629a292a367cd507ULL;
+ ctx512->state[2] = 0x9159015a3070dd17ULL;
+ ctx512->state[3] = 0x152fecd8f70e5939ULL;
+ ctx512->state[4] = 0x67332667ffc00b31ULL;
+ ctx512->state[5] = 0x8eb44a8768581511ULL;
+ ctx512->state[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx512->state[7] = 0x47b5481dbefa4fa4ULL;
+ ctx512->count[0] = 0;
+ ctx512->count[1] = 0;
+ ctx512->ops = sha512_get_ops();
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ ctx512->state[0] = 0x6a09e667f3bcc908ULL;
+ ctx512->state[1] = 0xbb67ae8584caa73bULL;
+ ctx512->state[2] = 0x3c6ef372fe94f82bULL;
+ ctx512->state[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx512->state[4] = 0x510e527fade682d1ULL;
+ ctx512->state[5] = 0x9b05688c2b3e6c1fULL;
+ ctx512->state[6] = 0x1f83d9abfb41bd6bULL;
+ ctx512->state[7] = 0x5be0cd19137e2179ULL;
+ ctx512->count[0] = 0;
+ ctx512->count[1] = 0;
+ ctx512->ops = sha512_get_ops();
+ break;
+ case SHA512_224_MECH_INFO_TYPE:
+ ctx512->state[0] = 0x8c3d37c819544da2ULL;
+ ctx512->state[1] = 0x73e1996689dcd4d6ULL;
+ ctx512->state[2] = 0x1dfab7ae32ff9c82ULL;
+ ctx512->state[3] = 0x679dd514582f9fcfULL;
+ ctx512->state[4] = 0x0f6d2b697bd44da8ULL;
+ ctx512->state[5] = 0x77e36f7304c48942ULL;
+ ctx512->state[6] = 0x3f9d85a86a1d36c8ULL;
+ ctx512->state[7] = 0x1112e6ad91d692a1ULL;
+ ctx512->count[0] = 0;
+ ctx512->count[1] = 0;
+ ctx512->ops = sha512_get_ops();
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ ctx512->state[0] = 0x22312194fc2bf72cULL;
+ ctx512->state[1] = 0x9f555fa3c84c64c2ULL;
+ ctx512->state[2] = 0x2393b86b6f53b151ULL;
+ ctx512->state[3] = 0x963877195940eabdULL;
+ ctx512->state[4] = 0x96283ee2a88effe3ULL;
+ ctx512->state[5] = 0xbe5e1e2553863992ULL;
+ ctx512->state[6] = 0x2b0199fc2c85b8aaULL;
+ ctx512->state[7] = 0x0eb72ddc81c52ca2ULL;
+ ctx512->count[0] = 0;
+ ctx512->count[1] = 0;
+ ctx512->ops = sha512_get_ops();
+ break;
+ }
+}
+
+/* SHA2 Update function */
+void
+SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)
+{
+ /* check for zero input length */
+ if (len == 0)
+ return;
+
+ ASSERT3P(data, !=, NULL);
+
+ switch (ctx->algotype) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha256_update(&ctx->sha256, data, len);
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ sha512_update(&ctx->sha512, data, len);
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha512_update(&ctx->sha512, data, len);
+ break;
+ case SHA512_224_MECH_INFO_TYPE:
+ sha512_update(&ctx->sha512, data, len);
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ sha512_update(&ctx->sha512, data, len);
+ break;
+ }
+}
+
+/* SHA2 Final function */
+void
+SHA2Final(void *digest, SHA2_CTX *ctx)
+{
+ switch (ctx->algotype) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha256_final(&ctx->sha256, digest, 256);
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ sha512_final(&ctx->sha512, digest, 384);
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha512_final(&ctx->sha512, digest, 512);
+ break;
+ case SHA512_224_MECH_INFO_TYPE:
+ sha512_final(&ctx->sha512, digest, 224);
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ sha512_final(&ctx->sha512, digest, 256);
+ break;
+ }
+}
+
+/* the generic implementation is always okay */
+static boolean_t sha2_is_supported(void)
+{
+ return (B_TRUE);
+}
+
+const sha256_ops_t sha256_generic_impl = {
+ .name = "generic",
+ .transform = sha256_generic,
+ .is_supported = sha2_is_supported
+};
+
+const sha512_ops_t sha512_generic_impl = {
+ .name = "generic",
+ .transform = sha512_generic,
+ .is_supported = sha2_is_supported
+};
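
The finalization above is the standard FIPS 180-4 scheme: append a 0x80 byte, zero-fill, and store the total message length in bits as a big-endian word in the last 8 bytes of the block; when the 0x80 lands past offset 56 (112 for SHA-512) there is no room left for the length, so an extra padding-only block is compressed first (the `pos > 56` branch). A worked single-block example for the 3-byte message "abc" (count[1] = 24 bits):

	/*
	 * Final 64-byte block handed to ops->transform():
	 *
	 *   61 62 63 80 00 00 .. 00 | 00 00 00 00 00 00 00 18
	 *   'a''b''c' pad   zeros     BE_64(24) in m[56..63]
	 */
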
diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
new file mode 100644
index 000000000..d21312336
--- /dev/null
+++ b/module/icp/algs/sha2/sha512_impl.c
@@ -0,0 +1,276 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+#include <sys/simd.h>
+
+#include <sha2/sha2_impl.h>
+
+#define TF(E, N) \
+ extern void E(uint64_t s[8], const void *, size_t); \
+ static inline void N(uint64_t s[8], const void *d, size_t b) { \
+ kfpu_begin(); E(s, d, b); kfpu_end(); \
+}
+
+/* the baseline implementations are always supported */
+static inline boolean_t sha2_is_supported(void)
+{
+ return (B_TRUE);
+}
+
+#if defined(__x86_64)
+
+extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_x64_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = zfs_sha512_transform_x64,
+ .name = "x64"
+};
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+ return (kfpu_allowed() && zfs_avx_available());
+}
+
+TF(zfs_sha512_transform_avx, tf_sha512_avx);
+const sha512_ops_t sha512_avx_impl = {
+ .is_supported = sha2_have_avx,
+ .transform = tf_sha512_avx,
+ .name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+ return (kfpu_allowed() && zfs_avx2_available());
+}
+
+TF(zfs_sha512_transform_avx2, tf_sha512_avx2);
+const sha512_ops_t sha512_avx2_impl = {
+ .is_supported = sha2_have_avx2,
+ .transform = tf_sha512_avx2,
+ .name = "avx2"
+};
+#endif
+
+#elif defined(__aarch64__)
+extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_armv7_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = zfs_sha512_block_armv7,
+ .name = "armv7"
+};
+
+static boolean_t sha512_have_armv8ce(void)
+{
+ return (kfpu_allowed() && zfs_sha512_available());
+}
+
+TF(zfs_sha512_block_armv8, tf_sha512_armv8ce);
+const sha512_ops_t sha512_armv8_impl = {
+ .is_supported = sha512_have_armv8ce,
+ .transform = tf_sha512_armv8ce,
+ .name = "armv8-ce"
+};
+
+#elif defined(__arm__)
+extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_armv7_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = zfs_sha512_block_armv7,
+ .name = "armv7"
+};
+
+static boolean_t sha512_have_neon(void)
+{
+ return (kfpu_allowed() && zfs_neon_available());
+}
+
+TF(zfs_sha512_block_neon, tf_sha512_neon);
+const sha512_ops_t sha512_neon_impl = {
+ .is_supported = sha512_have_neon,
+ .transform = tf_sha512_neon,
+ .name = "neon"
+};
+
+#elif defined(__PPC64__)
+TF(zfs_sha512_ppc, tf_sha512_ppc);
+const sha512_ops_t sha512_ppc_impl = {
+ .is_supported = sha2_is_supported,
+ .transform = tf_sha512_ppc,
+ .name = "ppc"
+};
+
+static boolean_t sha512_have_vsx(void)
+{
+ return (kfpu_allowed() && zfs_vsx_available());
+}
+
+TF(zfs_sha512_power8, tf_sha512_power8);
+const sha512_ops_t sha512_power8_impl = {
+ .is_supported = sha512_have_vsx,
+ .transform = tf_sha512_power8,
+ .name = "power8"
+};
+#endif /* __PPC64__ */
+
+/* the generic implementation from sha2_generic.c */
+extern const sha512_ops_t sha512_generic_impl;
+
+/* array with all sha512 implementations */
+static const sha512_ops_t *const sha512_impls[] = {
+ &sha512_generic_impl,
+#if defined(__x86_64)
+ &sha512_x64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX)
+ &sha512_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+ &sha512_avx2_impl,
+#endif
+#if defined(__aarch64__)
+ &sha512_armv7_impl,
+ &sha512_armv8_impl,
+#endif
+#if defined(__arm__)
+ &sha512_armv7_impl,
+ &sha512_neon_impl,
+#endif
+#if defined(__PPC64__)
+ &sha512_ppc_impl,
+ &sha512_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define IMPL_NAME "sha512"
+#define IMPL_OPS_T sha512_ops_t
+#define IMPL_ARRAY sha512_impls
+#define IMPL_GET_OPS sha512_get_ops
+#define ZFS_IMPL_OPS zfs_sha512_ops
+#include <generic_impl.c>
+
+#ifdef _KERNEL
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+sha512_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+ const uint32_t impl = IMPL_READ(generic_impl_chosen);
+ char *fmt;
+ int cnt = 0;
+
+ /* cycling */
+ fmt = IMPL_FMT(impl, IMPL_CYCLE);
+ cnt += sprintf(buffer + cnt, fmt, "cycle");
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+ /* list all supported implementations */
+ generic_impl_init();
+ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ cnt += sprintf(buffer + cnt, fmt,
+ generic_supp_impls[i]->name);
+ }
+
+ return (cnt);
+}
+
+static int
+sha512_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+ (void) unused;
+ return (generic_impl_setname(val));
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/sbuf.h>
+
+static int
+sha512_param(ZFS_MODULE_PARAM_ARGS)
+{
+ int err;
+
+ generic_impl_init();
+ if (req->newptr == NULL) {
+ const uint32_t impl = IMPL_READ(generic_impl_chosen);
+ const int init_buflen = 64;
+ const char *fmt;
+ struct sbuf *s;
+
+ s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+ /* cycling */
+ fmt = IMPL_FMT(impl, IMPL_CYCLE);
+ (void) sbuf_printf(s, fmt, "cycle");
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ (void) sbuf_printf(s, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
+ }
+
+ err = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (err);
+ }
+
+	/* we got a new module parameter value */
+ char buf[16];
+
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err) {
+ return (err);
+ }
+
+ return (-generic_impl_setname(buf));
+}
+#endif
+
+#undef IMPL_FMT
+
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl,
+ sha512_param_set, sha512_param_get, ZMOD_RW, \
+ "Select SHA512 implementation.");
+#endif
+
+#undef TF
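
On Linux the two handlers above back a writable module parameter (zfs_sha512_impl, assuming the usual name generated by ZFS_MODULE_VIRTUAL_PARAM_CALL); on FreeBSD the same logic sits behind a sysctl. Reading the parameter lists every supported implementation with the active choice bracketed, so on an AVX2-capable x86_64 machine sha512_param_get() would emit output along these lines (illustrative):

	cycle [fastest] generic x64 avx avx2

Writing one of those names, or "cycle"/"fastest", routes through generic_impl_setname() to select the implementation.
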
diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S
new file mode 100644
index 000000000..fa50c4e74
--- /dev/null
+++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S
@@ -0,0 +1,1999 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__aarch64__)
+
+.text
+
+.align 6
+.type .LK256,%object
+.LK256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+ .long 0 //terminator
+.size .LK256,.-.LK256
+
+.globl zfs_sha256_block_armv7
+.type zfs_sha256_block_armv7,%function
+.align 6
+zfs_sha256_block_armv7:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adr x30,.LK256
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
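+	// Rounds 16..63: the 16-word message schedule is kept in
+	// registers and four stack slots and extended in place (the
+	// sigma0/sigma1 lines above), until the zero terminator of
+	// the K256 table reaches w19 (see the cbnz below).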
+.Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,.Loop_16_xx
+
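+	// Block done: reload the context and end-of-input pointers,
+	// step x30 back over the 64 round constants plus the 4-byte
+	// terminator (65*4 = 260), add the working variables into the
+	// saved state and loop while input remains.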
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
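+
+/*
+ * For reference, each scalar round above implements the standard
+ * FIPS 180-4 SHA-256 round (a C-like sketch, not part of the
+ * generated code):
+ *
+ *	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i];
+ *	T2 = Sigma0(a) + Maj(a,b,c);
+ *	h = g; g = f; f = e; e = d + T1;
+ *	d = c; c = b; b = a; a = T1 + T2;
+ *
+ * with Ch(e,f,g) = (e & f) ^ (~e & g) and Maj(a,b,c) evaluated
+ * incrementally via the (a^b)&(b^c) identity noted in the inline
+ * comments.
+ */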
+
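+/*
+ * zfs_sha256_block_armv8: SHA-256 compression using the ARMv8
+ * Cryptography Extensions.  The sha256h/sha256h2/sha256su0/sha256su1
+ * instructions are emitted as raw .inst words so the file assembles
+ * even with toolchains that lack the crypto mnemonics; the intended
+ * mnemonic is kept in the comment after each .inst.
+ */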
+.globl zfs_sha256_block_armv8
+.type zfs_sha256_block_armv8,%function
+.align 6
+zfs_sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0]
+ adr x3,.LK256
+
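+	// Each .Loop_hw iteration consumes one 64-byte block entirely
+	// in vector registers: sha256su0/su1 extend the message
+	// schedule, each sha256h/sha256h2 pair performs four rounds,
+	// and v18/v19 hold the incoming state for the feed-forward
+	// addition at the end.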
+.Loop_hw:
+ ld1 {v4.16b-v7.16b},[x1],#64
+ sub x2,x2,#1
+ ld1 {v16.4s},[x3],#16
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+ .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+ .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+ .inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+ .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3]
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+ .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+ .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+ .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.4s,v1.4s},[x0]
+
+ ldr x29,[sp],#16
+ ret
+.size zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
+
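+/*
+ * zfs_sha256_block_neon: plain NEON fallback.  Scalar rounds are
+ * interleaved with vector computation of the message schedule:
+ * ushr/sli pairs implement the sigma0/sigma1 rotates four lanes at
+ * a time, and the K[i]+X[i] sums are staged through the stack for
+ * the scalar half to consume.
+ */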
+.globl zfs_sha256_block_neon
+.type zfs_sha256_block_neon,%function
+.align 4
+zfs_sha256_block_neon:
+.Lneon_entry:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+
+ adr x16,.LK256
+ add x2,x1,x2,lsl#6 // len to point at the end of inp
+
+ ld1 {v0.16b},[x1], #16
+ ld1 {v1.16b},[x1], #16
+ ld1 {v2.16b},[x1], #16
+ ld1 {v3.16b},[x1], #16
+ ld1 {v4.4s},[x16], #16
+ ld1 {v5.4s},[x16], #16
+ ld1 {v6.4s},[x16], #16
+ ld1 {v7.4s},[x16], #16
+ rev32 v0.16b,v0.16b // yes, even on
+ rev32 v1.16b,v1.16b // big-endian
+ rev32 v2.16b,v2.16b
+ rev32 v3.16b,v3.16b
+ mov x17,sp
+ add v4.4s,v4.4s,v0.4s
+ add v5.4s,v5.4s,v1.4s
+ add v6.4s,v6.4s,v2.4s
+ st1 {v4.4s-v5.4s},[x17], #32
+ add v7.4s,v7.4s,v3.4s
+ st1 {v6.4s-v7.4s},[x17]
+ sub x17,x17,#32
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#8]
+ ldp w7,w8,[x0,#16]
+ ldp w9,w10,[x0,#24]
+ ldr w12,[sp,#0]
+ mov w13,wzr
+ eor w14,w4,w5
+ mov w15,wzr
+ b .L_00_48
+
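+	// Rounds 0..47: each pass through .L_00_48 retires 16 scalar
+	// rounds while producing the next 16 schedule words in NEON;
+	// the loop ends when the zero terminator of K256 is seen
+	// (the "cmp w12,#0" at the bottom).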
+.align 4
+.L_00_48:
+ ext v4.16b,v0.16b,v1.16b,#4
+ add w10,w10,w12
+ add w3,w3,w15
+ and w12,w8,w7
+ bic w15,w9,w7
+ ext v7.16b,v2.16b,v3.16b,#4
+ eor w11,w7,w7,ror#5
+ add w3,w3,w13
+ mov d19,v3.d[1]
+ orr w12,w12,w15
+ eor w11,w11,w7,ror#19
+ ushr v6.4s,v4.4s,#7
+ eor w15,w3,w3,ror#11
+ ushr v5.4s,v4.4s,#3
+ add w10,w10,w12
+ add v0.4s,v0.4s,v7.4s
+ ror w11,w11,#6
+ sli v6.4s,v4.4s,#25
+ eor w13,w3,w4
+ eor w15,w15,w3,ror#20
+ ushr v7.4s,v4.4s,#18
+ add w10,w10,w11
+ ldr w12,[sp,#4]
+ and w14,w14,w13
+ eor v5.16b,v5.16b,v6.16b
+ ror w15,w15,#2
+ add w6,w6,w10
+ sli v7.4s,v4.4s,#14
+ eor w14,w14,w4
+ ushr v16.4s,v19.4s,#17
+ add w9,w9,w12
+ add w10,w10,w15
+ and w12,w7,w6
+ eor v5.16b,v5.16b,v7.16b
+ bic w15,w8,w6
+ eor w11,w6,w6,ror#5
+ sli v16.4s,v19.4s,#15
+ add w10,w10,w14
+ orr w12,w12,w15
+ ushr v17.4s,v19.4s,#10
+ eor w11,w11,w6,ror#19
+ eor w15,w10,w10,ror#11
+ ushr v7.4s,v19.4s,#19
+ add w9,w9,w12
+ ror w11,w11,#6
+ add v0.4s,v0.4s,v5.4s
+ eor w14,w10,w3
+ eor w15,w15,w10,ror#20
+ sli v7.4s,v19.4s,#13
+ add w9,w9,w11
+ ldr w12,[sp,#8]
+ and w13,w13,w14
+ eor v17.16b,v17.16b,v16.16b
+ ror w15,w15,#2
+ add w5,w5,w9
+ eor w13,w13,w3
+ eor v17.16b,v17.16b,v7.16b
+ add w8,w8,w12
+ add w9,w9,w15
+ and w12,w6,w5
+ add v0.4s,v0.4s,v17.4s
+ bic w15,w7,w5
+ eor w11,w5,w5,ror#5
+ add w9,w9,w13
+ ushr v18.4s,v0.4s,#17
+ orr w12,w12,w15
+ ushr v19.4s,v0.4s,#10
+ eor w11,w11,w5,ror#19
+ eor w15,w9,w9,ror#11
+ sli v18.4s,v0.4s,#15
+ add w8,w8,w12
+ ushr v17.4s,v0.4s,#19
+ ror w11,w11,#6
+ eor w13,w9,w10
+ eor v19.16b,v19.16b,v18.16b
+ eor w15,w15,w9,ror#20
+ add w8,w8,w11
+ sli v17.4s,v0.4s,#13
+ ldr w12,[sp,#12]
+ and w14,w14,w13
+ ror w15,w15,#2
+ ld1 {v4.4s},[x16], #16
+ add w4,w4,w8
+ eor v19.16b,v19.16b,v17.16b
+ eor w14,w14,w10
+ eor v17.16b,v17.16b,v17.16b
+ add w7,w7,w12
+ add w8,w8,w15
+ and w12,w5,w4
+ mov v17.d[1],v19.d[0]
+ bic w15,w6,w4
+ eor w11,w4,w4,ror#5
+ add w8,w8,w14
+ add v0.4s,v0.4s,v17.4s
+ orr w12,w12,w15
+ eor w11,w11,w4,ror#19
+ eor w15,w8,w8,ror#11
+ add v4.4s,v4.4s,v0.4s
+ add w7,w7,w12
+ ror w11,w11,#6
+ eor w14,w8,w9
+ eor w15,w15,w8,ror#20
+ add w7,w7,w11
+ ldr w12,[sp,#16]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w3,w3,w7
+ eor w13,w13,w9
+ st1 {v4.4s},[x17], #16
+ ext v4.16b,v1.16b,v2.16b,#4
+ add w6,w6,w12
+ add w7,w7,w15
+ and w12,w4,w3
+ bic w15,w5,w3
+ ext v7.16b,v3.16b,v0.16b,#4
+ eor w11,w3,w3,ror#5
+ add w7,w7,w13
+ mov d19,v0.d[1]
+ orr w12,w12,w15
+ eor w11,w11,w3,ror#19
+ ushr v6.4s,v4.4s,#7
+ eor w15,w7,w7,ror#11
+ ushr v5.4s,v4.4s,#3
+ add w6,w6,w12
+ add v1.4s,v1.4s,v7.4s
+ ror w11,w11,#6
+ sli v6.4s,v4.4s,#25
+ eor w13,w7,w8
+ eor w15,w15,w7,ror#20
+ ushr v7.4s,v4.4s,#18
+ add w6,w6,w11
+ ldr w12,[sp,#20]
+ and w14,w14,w13
+ eor v5.16b,v5.16b,v6.16b
+ ror w15,w15,#2
+ add w10,w10,w6
+ sli v7.4s,v4.4s,#14
+ eor w14,w14,w8
+ ushr v16.4s,v19.4s,#17
+ add w5,w5,w12
+ add w6,w6,w15
+ and w12,w3,w10
+ eor v5.16b,v5.16b,v7.16b
+ bic w15,w4,w10
+ eor w11,w10,w10,ror#5
+ sli v16.4s,v19.4s,#15
+ add w6,w6,w14
+ orr w12,w12,w15
+ ushr v17.4s,v19.4s,#10
+ eor w11,w11,w10,ror#19
+ eor w15,w6,w6,ror#11
+ ushr v7.4s,v19.4s,#19
+ add w5,w5,w12
+ ror w11,w11,#6
+ add v1.4s,v1.4s,v5.4s
+ eor w14,w6,w7
+ eor w15,w15,w6,ror#20
+ sli v7.4s,v19.4s,#13
+ add w5,w5,w11
+ ldr w12,[sp,#24]
+ and w13,w13,w14
+ eor v17.16b,v17.16b,v16.16b
+ ror w15,w15,#2
+ add w9,w9,w5
+ eor w13,w13,w7
+ eor v17.16b,v17.16b,v7.16b
+ add w4,w4,w12
+ add w5,w5,w15
+ and w12,w10,w9
+ add v1.4s,v1.4s,v17.4s
+ bic w15,w3,w9
+ eor w11,w9,w9,ror#5
+ add w5,w5,w13
+ ushr v18.4s,v1.4s,#17
+ orr w12,w12,w15
+ ushr v19.4s,v1.4s,#10
+ eor w11,w11,w9,ror#19
+ eor w15,w5,w5,ror#11
+ sli v18.4s,v1.4s,#15
+ add w4,w4,w12
+ ushr v17.4s,v1.4s,#19
+ ror w11,w11,#6
+ eor w13,w5,w6
+ eor v19.16b,v19.16b,v18.16b
+ eor w15,w15,w5,ror#20
+ add w4,w4,w11
+ sli v17.4s,v1.4s,#13
+ ldr w12,[sp,#28]
+ and w14,w14,w13
+ ror w15,w15,#2
+ ld1 {v4.4s},[x16], #16
+ add w8,w8,w4
+ eor v19.16b,v19.16b,v17.16b
+ eor w14,w14,w6
+ eor v17.16b,v17.16b,v17.16b
+ add w3,w3,w12
+ add w4,w4,w15
+ and w12,w9,w8
+ mov v17.d[1],v19.d[0]
+ bic w15,w10,w8
+ eor w11,w8,w8,ror#5
+ add w4,w4,w14
+ add v1.4s,v1.4s,v17.4s
+ orr w12,w12,w15
+ eor w11,w11,w8,ror#19
+ eor w15,w4,w4,ror#11
+ add v4.4s,v4.4s,v1.4s
+ add w3,w3,w12
+ ror w11,w11,#6
+ eor w14,w4,w5
+ eor w15,w15,w4,ror#20
+ add w3,w3,w11
+ ldr w12,[sp,#32]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w7,w7,w3
+ eor w13,w13,w5
+ st1 {v4.4s},[x17], #16
+ ext v4.16b,v2.16b,v3.16b,#4
+ add w10,w10,w12
+ add w3,w3,w15
+ and w12,w8,w7
+ bic w15,w9,w7
+ ext v7.16b,v0.16b,v1.16b,#4
+ eor w11,w7,w7,ror#5
+ add w3,w3,w13
+ mov d19,v1.d[1]
+ orr w12,w12,w15
+ eor w11,w11,w7,ror#19
+ ushr v6.4s,v4.4s,#7
+ eor w15,w3,w3,ror#11
+ ushr v5.4s,v4.4s,#3
+ add w10,w10,w12
+ add v2.4s,v2.4s,v7.4s
+ ror w11,w11,#6
+ sli v6.4s,v4.4s,#25
+ eor w13,w3,w4
+ eor w15,w15,w3,ror#20
+ ushr v7.4s,v4.4s,#18
+ add w10,w10,w11
+ ldr w12,[sp,#36]
+ and w14,w14,w13
+ eor v5.16b,v5.16b,v6.16b
+ ror w15,w15,#2
+ add w6,w6,w10
+ sli v7.4s,v4.4s,#14
+ eor w14,w14,w4
+ ushr v16.4s,v19.4s,#17
+ add w9,w9,w12
+ add w10,w10,w15
+ and w12,w7,w6
+ eor v5.16b,v5.16b,v7.16b
+ bic w15,w8,w6
+ eor w11,w6,w6,ror#5
+ sli v16.4s,v19.4s,#15
+ add w10,w10,w14
+ orr w12,w12,w15
+ ushr v17.4s,v19.4s,#10
+ eor w11,w11,w6,ror#19
+ eor w15,w10,w10,ror#11
+ ushr v7.4s,v19.4s,#19
+ add w9,w9,w12
+ ror w11,w11,#6
+ add v2.4s,v2.4s,v5.4s
+ eor w14,w10,w3
+ eor w15,w15,w10,ror#20
+ sli v7.4s,v19.4s,#13
+ add w9,w9,w11
+ ldr w12,[sp,#40]
+ and w13,w13,w14
+ eor v17.16b,v17.16b,v16.16b
+ ror w15,w15,#2
+ add w5,w5,w9
+ eor w13,w13,w3
+ eor v17.16b,v17.16b,v7.16b
+ add w8,w8,w12
+ add w9,w9,w15
+ and w12,w6,w5
+ add v2.4s,v2.4s,v17.4s
+ bic w15,w7,w5
+ eor w11,w5,w5,ror#5
+ add w9,w9,w13
+ ushr v18.4s,v2.4s,#17
+ orr w12,w12,w15
+ ushr v19.4s,v2.4s,#10
+ eor w11,w11,w5,ror#19
+ eor w15,w9,w9,ror#11
+ sli v18.4s,v2.4s,#15
+ add w8,w8,w12
+ ushr v17.4s,v2.4s,#19
+ ror w11,w11,#6
+ eor w13,w9,w10
+ eor v19.16b,v19.16b,v18.16b
+ eor w15,w15,w9,ror#20
+ add w8,w8,w11
+ sli v17.4s,v2.4s,#13
+ ldr w12,[sp,#44]
+ and w14,w14,w13
+ ror w15,w15,#2
+ ld1 {v4.4s},[x16], #16
+ add w4,w4,w8
+ eor v19.16b,v19.16b,v17.16b
+ eor w14,w14,w10
+ eor v17.16b,v17.16b,v17.16b
+ add w7,w7,w12
+ add w8,w8,w15
+ and w12,w5,w4
+ mov v17.d[1],v19.d[0]
+ bic w15,w6,w4
+ eor w11,w4,w4,ror#5
+ add w8,w8,w14
+ add v2.4s,v2.4s,v17.4s
+ orr w12,w12,w15
+ eor w11,w11,w4,ror#19
+ eor w15,w8,w8,ror#11
+ add v4.4s,v4.4s,v2.4s
+ add w7,w7,w12
+ ror w11,w11,#6
+ eor w14,w8,w9
+ eor w15,w15,w8,ror#20
+ add w7,w7,w11
+ ldr w12,[sp,#48]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w3,w3,w7
+ eor w13,w13,w9
+ st1 {v4.4s},[x17], #16
+ ext v4.16b,v3.16b,v0.16b,#4
+ add w6,w6,w12
+ add w7,w7,w15
+ and w12,w4,w3
+ bic w15,w5,w3
+ ext v7.16b,v1.16b,v2.16b,#4
+ eor w11,w3,w3,ror#5
+ add w7,w7,w13
+ mov d19,v2.d[1]
+ orr w12,w12,w15
+ eor w11,w11,w3,ror#19
+ ushr v6.4s,v4.4s,#7
+ eor w15,w7,w7,ror#11
+ ushr v5.4s,v4.4s,#3
+ add w6,w6,w12
+ add v3.4s,v3.4s,v7.4s
+ ror w11,w11,#6
+ sli v6.4s,v4.4s,#25
+ eor w13,w7,w8
+ eor w15,w15,w7,ror#20
+ ushr v7.4s,v4.4s,#18
+ add w6,w6,w11
+ ldr w12,[sp,#52]
+ and w14,w14,w13
+ eor v5.16b,v5.16b,v6.16b
+ ror w15,w15,#2
+ add w10,w10,w6
+ sli v7.4s,v4.4s,#14
+ eor w14,w14,w8
+ ushr v16.4s,v19.4s,#17
+ add w5,w5,w12
+ add w6,w6,w15
+ and w12,w3,w10
+ eor v5.16b,v5.16b,v7.16b
+ bic w15,w4,w10
+ eor w11,w10,w10,ror#5
+ sli v16.4s,v19.4s,#15
+ add w6,w6,w14
+ orr w12,w12,w15
+ ushr v17.4s,v19.4s,#10
+ eor w11,w11,w10,ror#19
+ eor w15,w6,w6,ror#11
+ ushr v7.4s,v19.4s,#19
+ add w5,w5,w12
+ ror w11,w11,#6
+ add v3.4s,v3.4s,v5.4s
+ eor w14,w6,w7
+ eor w15,w15,w6,ror#20
+ sli v7.4s,v19.4s,#13
+ add w5,w5,w11
+ ldr w12,[sp,#56]
+ and w13,w13,w14
+ eor v17.16b,v17.16b,v16.16b
+ ror w15,w15,#2
+ add w9,w9,w5
+ eor w13,w13,w7
+ eor v17.16b,v17.16b,v7.16b
+ add w4,w4,w12
+ add w5,w5,w15
+ and w12,w10,w9
+ add v3.4s,v3.4s,v17.4s
+ bic w15,w3,w9
+ eor w11,w9,w9,ror#5
+ add w5,w5,w13
+ ushr v18.4s,v3.4s,#17
+ orr w12,w12,w15
+ ushr v19.4s,v3.4s,#10
+ eor w11,w11,w9,ror#19
+ eor w15,w5,w5,ror#11
+ sli v18.4s,v3.4s,#15
+ add w4,w4,w12
+ ushr v17.4s,v3.4s,#19
+ ror w11,w11,#6
+ eor w13,w5,w6
+ eor v19.16b,v19.16b,v18.16b
+ eor w15,w15,w5,ror#20
+ add w4,w4,w11
+ sli v17.4s,v3.4s,#13
+ ldr w12,[sp,#60]
+ and w14,w14,w13
+ ror w15,w15,#2
+ ld1 {v4.4s},[x16], #16
+ add w8,w8,w4
+ eor v19.16b,v19.16b,v17.16b
+ eor w14,w14,w6
+ eor v17.16b,v17.16b,v17.16b
+ add w3,w3,w12
+ add w4,w4,w15
+ and w12,w9,w8
+ mov v17.d[1],v19.d[0]
+ bic w15,w10,w8
+ eor w11,w8,w8,ror#5
+ add w4,w4,w14
+ add v3.4s,v3.4s,v17.4s
+ orr w12,w12,w15
+ eor w11,w11,w8,ror#19
+ eor w15,w4,w4,ror#11
+ add v4.4s,v4.4s,v3.4s
+ add w3,w3,w12
+ ror w11,w11,#6
+ eor w14,w4,w5
+ eor w15,w15,w4,ror#20
+ add w3,w3,w11
+ ldr w12,[x16]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w7,w7,w3
+ eor w13,w13,w5
+ st1 {v4.4s},[x17], #16
+ cmp w12,#0 // check for K256 terminator
+ ldr w12,[sp,#0]
+ sub x17,x17,#64
+ bne .L_00_48
+
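+	// Final 16 rounds: rewind the K256 pointer and, when the input
+	// is exhausted (x1 == x2), back x1 up by one block so the loads
+	// below stay in bounds -- hence the "avoid SEGV" note.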
+ sub x16,x16,#256 // rewind x16
+ cmp x1,x2
+ mov x17, #64
+ csel x17, x17, xzr, eq
+ sub x1,x1,x17 // avoid SEGV
+ mov x17,sp
+ add w10,w10,w12
+ add w3,w3,w15
+ and w12,w8,w7
+ ld1 {v0.16b},[x1],#16
+ bic w15,w9,w7
+ eor w11,w7,w7,ror#5
+ ld1 {v4.4s},[x16],#16
+ add w3,w3,w13
+ orr w12,w12,w15
+ eor w11,w11,w7,ror#19
+ eor w15,w3,w3,ror#11
+ rev32 v0.16b,v0.16b
+ add w10,w10,w12
+ ror w11,w11,#6
+ eor w13,w3,w4
+ eor w15,w15,w3,ror#20
+ add v4.4s,v4.4s,v0.4s
+ add w10,w10,w11
+ ldr w12,[sp,#4]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w6,w6,w10
+ eor w14,w14,w4
+ add w9,w9,w12
+ add w10,w10,w15
+ and w12,w7,w6
+ bic w15,w8,w6
+ eor w11,w6,w6,ror#5
+ add w10,w10,w14
+ orr w12,w12,w15
+ eor w11,w11,w6,ror#19
+ eor w15,w10,w10,ror#11
+ add w9,w9,w12
+ ror w11,w11,#6
+ eor w14,w10,w3
+ eor w15,w15,w10,ror#20
+ add w9,w9,w11
+ ldr w12,[sp,#8]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w5,w5,w9
+ eor w13,w13,w3
+ add w8,w8,w12
+ add w9,w9,w15
+ and w12,w6,w5
+ bic w15,w7,w5
+ eor w11,w5,w5,ror#5
+ add w9,w9,w13
+ orr w12,w12,w15
+ eor w11,w11,w5,ror#19
+ eor w15,w9,w9,ror#11
+ add w8,w8,w12
+ ror w11,w11,#6
+ eor w13,w9,w10
+ eor w15,w15,w9,ror#20
+ add w8,w8,w11
+ ldr w12,[sp,#12]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w4,w4,w8
+ eor w14,w14,w10
+ add w7,w7,w12
+ add w8,w8,w15
+ and w12,w5,w4
+ bic w15,w6,w4
+ eor w11,w4,w4,ror#5
+ add w8,w8,w14
+ orr w12,w12,w15
+ eor w11,w11,w4,ror#19
+ eor w15,w8,w8,ror#11
+ add w7,w7,w12
+ ror w11,w11,#6
+ eor w14,w8,w9
+ eor w15,w15,w8,ror#20
+ add w7,w7,w11
+ ldr w12,[sp,#16]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w3,w3,w7
+ eor w13,w13,w9
+ st1 {v4.4s},[x17], #16
+ add w6,w6,w12
+ add w7,w7,w15
+ and w12,w4,w3
+ ld1 {v1.16b},[x1],#16
+ bic w15,w5,w3
+ eor w11,w3,w3,ror#5
+ ld1 {v4.4s},[x16],#16
+ add w7,w7,w13
+ orr w12,w12,w15
+ eor w11,w11,w3,ror#19
+ eor w15,w7,w7,ror#11
+ rev32 v1.16b,v1.16b
+ add w6,w6,w12
+ ror w11,w11,#6
+ eor w13,w7,w8
+ eor w15,w15,w7,ror#20
+ add v4.4s,v4.4s,v1.4s
+ add w6,w6,w11
+ ldr w12,[sp,#20]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w10,w10,w6
+ eor w14,w14,w8
+ add w5,w5,w12
+ add w6,w6,w15
+ and w12,w3,w10
+ bic w15,w4,w10
+ eor w11,w10,w10,ror#5
+ add w6,w6,w14
+ orr w12,w12,w15
+ eor w11,w11,w10,ror#19
+ eor w15,w6,w6,ror#11
+ add w5,w5,w12
+ ror w11,w11,#6
+ eor w14,w6,w7
+ eor w15,w15,w6,ror#20
+ add w5,w5,w11
+ ldr w12,[sp,#24]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w9,w9,w5
+ eor w13,w13,w7
+ add w4,w4,w12
+ add w5,w5,w15
+ and w12,w10,w9
+ bic w15,w3,w9
+ eor w11,w9,w9,ror#5
+ add w5,w5,w13
+ orr w12,w12,w15
+ eor w11,w11,w9,ror#19
+ eor w15,w5,w5,ror#11
+ add w4,w4,w12
+ ror w11,w11,#6
+ eor w13,w5,w6
+ eor w15,w15,w5,ror#20
+ add w4,w4,w11
+ ldr w12,[sp,#28]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w8,w8,w4
+ eor w14,w14,w6
+ add w3,w3,w12
+ add w4,w4,w15
+ and w12,w9,w8
+ bic w15,w10,w8
+ eor w11,w8,w8,ror#5
+ add w4,w4,w14
+ orr w12,w12,w15
+ eor w11,w11,w8,ror#19
+ eor w15,w4,w4,ror#11
+ add w3,w3,w12
+ ror w11,w11,#6
+ eor w14,w4,w5
+ eor w15,w15,w4,ror#20
+ add w3,w3,w11
+ ldr w12,[sp,#32]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w7,w7,w3
+ eor w13,w13,w5
+ st1 {v4.4s},[x17], #16
+ add w10,w10,w12
+ add w3,w3,w15
+ and w12,w8,w7
+ ld1 {v2.16b},[x1],#16
+ bic w15,w9,w7
+ eor w11,w7,w7,ror#5
+ ld1 {v4.4s},[x16],#16
+ add w3,w3,w13
+ orr w12,w12,w15
+ eor w11,w11,w7,ror#19
+ eor w15,w3,w3,ror#11
+ rev32 v2.16b,v2.16b
+ add w10,w10,w12
+ ror w11,w11,#6
+ eor w13,w3,w4
+ eor w15,w15,w3,ror#20
+ add v4.4s,v4.4s,v2.4s
+ add w10,w10,w11
+ ldr w12,[sp,#36]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w6,w6,w10
+ eor w14,w14,w4
+ add w9,w9,w12
+ add w10,w10,w15
+ and w12,w7,w6
+ bic w15,w8,w6
+ eor w11,w6,w6,ror#5
+ add w10,w10,w14
+ orr w12,w12,w15
+ eor w11,w11,w6,ror#19
+ eor w15,w10,w10,ror#11
+ add w9,w9,w12
+ ror w11,w11,#6
+ eor w14,w10,w3
+ eor w15,w15,w10,ror#20
+ add w9,w9,w11
+ ldr w12,[sp,#40]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w5,w5,w9
+ eor w13,w13,w3
+ add w8,w8,w12
+ add w9,w9,w15
+ and w12,w6,w5
+ bic w15,w7,w5
+ eor w11,w5,w5,ror#5
+ add w9,w9,w13
+ orr w12,w12,w15
+ eor w11,w11,w5,ror#19
+ eor w15,w9,w9,ror#11
+ add w8,w8,w12
+ ror w11,w11,#6
+ eor w13,w9,w10
+ eor w15,w15,w9,ror#20
+ add w8,w8,w11
+ ldr w12,[sp,#44]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w4,w4,w8
+ eor w14,w14,w10
+ add w7,w7,w12
+ add w8,w8,w15
+ and w12,w5,w4
+ bic w15,w6,w4
+ eor w11,w4,w4,ror#5
+ add w8,w8,w14
+ orr w12,w12,w15
+ eor w11,w11,w4,ror#19
+ eor w15,w8,w8,ror#11
+ add w7,w7,w12
+ ror w11,w11,#6
+ eor w14,w8,w9
+ eor w15,w15,w8,ror#20
+ add w7,w7,w11
+ ldr w12,[sp,#48]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w3,w3,w7
+ eor w13,w13,w9
+ st1 {v4.4s},[x17], #16
+ add w6,w6,w12
+ add w7,w7,w15
+ and w12,w4,w3
+ ld1 {v3.16b},[x1],#16
+ bic w15,w5,w3
+ eor w11,w3,w3,ror#5
+ ld1 {v4.4s},[x16],#16
+ add w7,w7,w13
+ orr w12,w12,w15
+ eor w11,w11,w3,ror#19
+ eor w15,w7,w7,ror#11
+ rev32 v3.16b,v3.16b
+ add w6,w6,w12
+ ror w11,w11,#6
+ eor w13,w7,w8
+ eor w15,w15,w7,ror#20
+ add v4.4s,v4.4s,v3.4s
+ add w6,w6,w11
+ ldr w12,[sp,#52]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w10,w10,w6
+ eor w14,w14,w8
+ add w5,w5,w12
+ add w6,w6,w15
+ and w12,w3,w10
+ bic w15,w4,w10
+ eor w11,w10,w10,ror#5
+ add w6,w6,w14
+ orr w12,w12,w15
+ eor w11,w11,w10,ror#19
+ eor w15,w6,w6,ror#11
+ add w5,w5,w12
+ ror w11,w11,#6
+ eor w14,w6,w7
+ eor w15,w15,w6,ror#20
+ add w5,w5,w11
+ ldr w12,[sp,#56]
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w9,w9,w5
+ eor w13,w13,w7
+ add w4,w4,w12
+ add w5,w5,w15
+ and w12,w10,w9
+ bic w15,w3,w9
+ eor w11,w9,w9,ror#5
+ add w5,w5,w13
+ orr w12,w12,w15
+ eor w11,w11,w9,ror#19
+ eor w15,w5,w5,ror#11
+ add w4,w4,w12
+ ror w11,w11,#6
+ eor w13,w5,w6
+ eor w15,w15,w5,ror#20
+ add w4,w4,w11
+ ldr w12,[sp,#60]
+ and w14,w14,w13
+ ror w15,w15,#2
+ add w8,w8,w4
+ eor w14,w14,w6
+ add w3,w3,w12
+ add w4,w4,w15
+ and w12,w9,w8
+ bic w15,w10,w8
+ eor w11,w8,w8,ror#5
+ add w4,w4,w14
+ orr w12,w12,w15
+ eor w11,w11,w8,ror#19
+ eor w15,w4,w4,ror#11
+ add w3,w3,w12
+ ror w11,w11,#6
+ eor w14,w4,w5
+ eor w15,w15,w4,ror#20
+ add w3,w3,w11
+ and w13,w13,w14
+ ror w15,w15,#2
+ add w7,w7,w3
+ eor w13,w13,w5
+ st1 {v4.4s},[x17], #16
+ add w3,w3,w15 // h+=Sigma0(a) from the past
+ ldp w11,w12,[x0,#0]
+ add w3,w3,w13 // h+=Maj(a,b,c) from the past
+ ldp w13,w14,[x0,#8]
+ add w3,w3,w11 // accumulate
+ add w4,w4,w12
+ ldp w11,w12,[x0,#16]
+ add w5,w5,w13
+ add w6,w6,w14
+ ldp w13,w14,[x0,#24]
+ add w7,w7,w11
+ add w8,w8,w12
+ ldr w12,[sp,#0]
+ stp w3,w4,[x0,#0]
+ add w9,w9,w13
+ mov w13,wzr
+ stp w5,w6,[x0,#8]
+ add w10,w10,w14
+ stp w7,w8,[x0,#16]
+ eor w14,w4,w5
+ stp w9,w10,[x0,#24]
+ mov w15,wzr
+ mov x17,sp
+ b.ne .L_00_48
+
+ ldr x29,[x29]
+ add sp,sp,#16*4+16
+ ret
+.size zfs_sha256_block_neon,.-zfs_sha256_block_neon
+
+#endif
diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S
new file mode 100644
index 000000000..1683fc1ca
--- /dev/null
+++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S
@@ -0,0 +1,1558 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__aarch64__)
+
+.text
+
+.align 6
+.type .LK512,%object
+.LK512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+ .quad 0 // terminator
+.size .LK512,.-.LK512
+
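+/*
+ * zfs_sha512_block_armv7: generic AArch64 SHA-512 block function.
+ * Same round structure as the SHA-256 code above, but on 64-bit
+ * registers, with 80 rounds, 128-byte blocks (x2,lsl#7) and the
+ * SHA-512 rotate counts: Sigma0 28/34/39, Sigma1 14/18/41,
+ * sigma0 1/8/7, sigma1 19/61/6.
+ */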
+.globl zfs_sha512_block_armv7
+.type zfs_sha512_block_armv7,%function
+.align 6
+zfs_sha512_block_armv7:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adr x30,.LK512
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
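+	// Rounds 16..79: extend the schedule in place, exactly as in
+	// the SHA-256 .Loop_16_xx above, until the zero terminator of
+	// .LK512 is loaded.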
+.Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,.Loop_16_xx
+
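+	// Epilogue of one 128-byte block: reload the context pointer
+	// (x0), the input-end pointer (x2) and the input pointer (x1)
+	// from the frame, rewind x30 past the 80 K512 constants plus
+	// the zero terminator (648 = 81*8), fold the working variables
+	// back into the hash state, and loop while input remains.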
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ ret
+.size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
+
+
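+// SHA-512 block function using the ARMv8.2-A SHA-512 crypto
+// extension (FEAT_SHA512); the dispatch code is expected to select
+// it only when the CPU advertises that extension.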
+.globl zfs_sha512_block_armv8
+.type zfs_sha512_block_armv8,%function
+.align 6
+zfs_sha512_block_armv8:
+.Lv8_entry:
+	// Armv8.3-A PAuth: x30 is pushed to the stack here, but it is never popped later
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v16.16b-v19.16b},[x1],#64 // load input
+ ld1 {v20.16b-v23.16b},[x1],#64
+
+ ld1 {v0.2d-v3.2d},[x0] // load context
+ adr x3,.LK512
+
+ rev64 v16.16b,v16.16b
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b .Loop_hw
+
+.align 4
+.Loop_hw:
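+	// One iteration consumes a 128-byte block: v0-v3 hold the hash
+	// state (offloaded to v26-v29 for the final accumulate), v16-v23
+	// hold the byte-reversed message, and v24/v25 ping-pong as the
+	// K512[i]+X[i] round inputs while x3 walks the constant table.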
+ ld1 {v24.2d},[x3],#16
+ subs x2,x2,#1
+ sub x4,x1,#128
+ orr v26.16b,v0.16b,v0.16b // offload
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ .inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ .inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ .inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ .inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
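+	// Final 16 rounds: loading and byte-reversing the next 128-byte
+	// block is interleaved with the remaining sha512h/sha512h2 work,
+	// and x3 is rewound past the 80 constants (80*8 bytes) partway
+	// through.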
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+ .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+ .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+ .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+ .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+ .inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+ .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.2d-v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16
+ ret
+.size zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
+#endif
diff --git a/module/icp/asm-arm/sha2/sha256-armv7.S b/module/icp/asm-arm/sha2/sha256-armv7.S
new file mode 100644
index 000000000..0001e4d69
--- /dev/null
+++ b/module/icp/asm-arm/sha2/sha256-armv7.S
@@ -0,0 +1,2769 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__arm__)
+
+#define __ARM_ARCH__ 7
+#define __ARM_MAX_ARCH__ 7
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
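+@ The round code detects the end of the table by checking the low
+@ byte of the last constant loaded (0xf2, from the final entry
+@ 0xc67178f2); see the "done?" tests in the round bodies below.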
+
+.align 5
+.globl zfs_sha256_block_armv7
+.type zfs_sha256_block_armv7,%function
+zfs_sha256_block_armv7:
+.Lzfs_sha256_block_armv7:
+
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ zfs_sha256_block_armv7
+#else
+ adr r3,.Lzfs_sha256_block_armv7
+#endif
+
+	add r2,r1,r2,lsl#6	@ len (in 64-byte blocks) to a pointer at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
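+	@ Frame: {ctx,inp,end,r4-r11,lr} saved above a 16-word X[]
+	@ scratch area; r4-r11 hold a-h, r14 points into K256, and
+	@ round 15 spills the advanced input pointer to [sp,#17*4] so
+	@ r1 can be reused in the scheduled rounds.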
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+	eor r3,r5,r6		@ b^c, primes the Maj(a,b,c) trick
+	eor r12,r12,r12		@ zero: no Maj(a,b,c) from the past yet
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
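+	@ Rounds 16..63, unrolled 16 at a time: each X[i] is recomputed
+	@ in place from sigma0(X[i+1]) = ror7^ror18^lsr3 and
+	@ sigma1(X[i+14]) = ror17^ror19^lsr10 before the usual round;
+	@ the "done?" check on the low byte of the K256 word just
+	@ loaded (0xf2) ends the loop after round 63.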
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#ifdef __thumb2__
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
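+	@ The loop above terminates via the "#if 31==31" check: the low
+	@ byte of the K256 word just consumed is compared with 0xf2, which
+	@ only matches the final constant 0xc67178f2, so the constant
+	@ table doubles as the loop bound and no round counter is needed.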
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
+
+.arch armv7-a
+.fpu neon
+
+.globl zfs_sha256_block_neon
+.type zfs_sha256_block_neon,%function
+.align 5
+.skip 16
+zfs_sha256_block_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on big-endian
+	str	r0,[sp,#64]
+	vrev32.8	q1,q1
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4-r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
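+	@ Each .L_00_48 iteration interleaves two streams: the NEON
+	@ instructions compute sigma0/sigma1 for four schedule words at a
+	@ time and park W[i]+K[i] on the stack (vst1.32), while the scalar
+	@ ALU code woven between them consumes the parked values and keeps
+	@ the a..h working state in r4-r11.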
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8-r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size zfs_sha256_block_neon,.-zfs_sha256_block_neon
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
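+@ INST() emits the ARMv8 SHA-256 instructions as raw bytes so the file
+@ still assembles with toolchains that predate the sha256h, sha256h2,
+@ sha256su0 and sha256su1 mnemonics; for Thumb-2 the two halfwords are
+@ swapped and 0xc is OR-ed in to form the equivalent Thumb-2 opcode.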
+
+.globl zfs_sha256_block_armv8
+.type zfs_sha256_block_armv8,%function
+.align 5
+zfs_sha256_block_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+ sub r3,r3,#256+32
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
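+	@ Hardware path: the eight-word state lives in q0/q1, each
+	@ sha256h/sha256h2 pair retires four rounds, and sha256su0/
+	@ sha256su1 extend the message schedule held in q8-q11.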
+ vld1.8 {q8-q9},[r1]!
+ vld1.8 {q10-q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+	bx	lr				@ return
+.size zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
+
+#endif
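As a reading aid between the two files: each exported routine above takes the
hash state in r0, the input pointer in r1 and a whole-block count in r2 (the
prologue's "add r2,r1,r2,lsl#6" turns that count into an end pointer). A
minimal, hypothetical C driver under that assumption could look like the
sketch below; the prototype is inferred from the register usage, and the
sha256_sketch_* names are illustrative, not part of this patch.

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* Prototype inferred from the AAPCS register usage above. */
	extern void zfs_sha256_block_armv7(uint32_t state[8],
	    const void *data, size_t blocks);

	typedef struct {
		uint32_t state[8];
		uint64_t count;		/* total bytes fed in */
		uint8_t buf[64];	/* pending partial block */
	} sha256_sketch_ctx;

	static void
	sha256_sketch_update(sha256_sketch_ctx *c, const uint8_t *in,
	    size_t len)
	{
		size_t have = c->count % 64;

		c->count += len;
		if (have != 0) {		/* top up a partial block */
			size_t need = 64 - have;
			if (len < need) {
				memcpy(c->buf + have, in, len);
				return;
			}
			memcpy(c->buf + have, in, need);
			zfs_sha256_block_armv7(c->state, c->buf, 1);
			in += need;
			len -= need;
		}
		if (len >= 64) {		/* bulk: one call, many blocks */
			zfs_sha256_block_armv7(c->state, in, len / 64);
			in += len - (len % 64);
			len %= 64;
		}
		if (len != 0)
			memcpy(c->buf, in, len);
	}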
diff --git a/module/icp/asm-arm/sha2/sha512-armv7.S b/module/icp/asm-arm/sha2/sha512-armv7.S
new file mode 100644
index 000000000..a4c804033
--- /dev/null
+++ b/module/icp/asm-arm/sha2/sha512-armv7.S
@@ -0,0 +1,1822 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__arm__)
+
+#define __ARM_ARCH__ 7
+#define __ARM_MAX_ARCH__ 7
+
+#ifndef __KERNEL__
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
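+@ ARMv7 has no 64-bit general registers, so every SHA-512 quantity is
+@ handled as a lo/hi pair of 32-bit words; LO and HI select the byte
+@ offset of each half for the host endianness, and WORD64 lays the K512
+@ constants out in matching order.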
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+# define adrl adr
+#else
+.code 32
+#endif
+
+.text
+
+.type K512,%object
+.align 5
+K512:
+ WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+ WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+ WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+ WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+ WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+ WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+ WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+ WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+ WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+ WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+ WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+ WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+ WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+ WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+ WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+ WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+ WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+ WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+ WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+ WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+ WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+ WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+ WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+ WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+ WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+ WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+ WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+ WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+ WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+ WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+ WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+ WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+ WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+ WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+ WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+ WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+ WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size K512,.-K512
+.word 0 @ terminator
+
+.align 5
+.globl zfs_sha512_block_armv7
+.type zfs_sha512_block_armv7,%function
+zfs_sha512_block_armv7:
+.Lzfs_sha512_block_armv7:
+
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ zfs_sha512_block_armv7
+#else
+ adr r3,.Lzfs_sha512_block_armv7
+#endif
+
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ sub r14,r3,#672 @ K512
+ sub sp,sp,#9*8
+
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+.Loop:
+ str r9, [sp,#48+0]
+ str r10, [sp,#48+4]
+ str r11, [sp,#56+0]
+ str r12, [sp,#56+4]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ str r3,[sp,#8+0]
+ str r4,[sp,#8+4]
+ str r9, [sp,#16+0]
+ str r10, [sp,#16+4]
+ str r11, [sp,#24+0]
+ str r12, [sp,#24+4]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
+ str r3,[sp,#40+0]
+ str r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+ ldrb r3,[r1,#7]
+ ldrb r9, [r1,#6]
+ ldrb r10, [r1,#5]
+ ldrb r11, [r1,#4]
+ ldrb r4,[r1,#3]
+ ldrb r12, [r1,#2]
+ orr r3,r3,r9,lsl#8
+ ldrb r9, [r1,#1]
+ orr r3,r3,r10,lsl#16
+ ldrb r10, [r1],#8
+ orr r3,r3,r11,lsl#24
+ orr r4,r4,r12,lsl#8
+ orr r4,r4,r9,lsl#16
+ orr r4,r4,r10,lsl#24
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
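+	@ For n<32, a 64-bit ROTR by n on a lo/hi pair expands to
+	@ lo' = lo>>n ^ hi<<(32-n) and hi' = hi>>n ^ lo<<(32-n); for
+	@ n>=32 the halves swap roles, which is where the hi>>9/lo<<23
+	@ terms of ROTR((x),41) above come from.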
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#148
+
+ ldr r12,[sp,#16+0] @ c.lo
+#ifdef __thumb2__
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+ tst r14,#1
+ beq .L00_15
+ ldr r9,[sp,#184+0]
+ ldr r10,[sp,#184+4]
+ bic r14,r14,#1
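+	@ .L16_79 extends the schedule with the same lo/hi technique:
+	@ sigma0(W[i-15]) and sigma1(W[i-2]) are built from 32-bit shifts,
+	@ then W[i-7] and W[i-16] are added in. The "teq r9,#N" sentinels
+	@ match the low byte of K[15].lo (0x94) and K[79].lo (0x17), so
+	@ the constant table itself bounds both loops.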
+.L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
+ mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
+ eor r3,r3,r10,lsl#31
+ eor r4,r4,r9,lsl#31
+ eor r3,r3,r9,lsr#8
+ eor r4,r4,r10,lsr#8
+ eor r3,r3,r10,lsl#24
+ eor r4,r4,r9,lsl#24
+ eor r3,r3,r9,lsr#7
+ eor r4,r4,r10,lsr#7
+ eor r3,r3,r10,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov r9,r11,lsr#19
+ mov r10,r12,lsr#19
+ eor r9,r9,r12,lsl#13
+ eor r10,r10,r11,lsl#13
+ eor r9,r9,r12,lsr#29
+ eor r10,r10,r11,lsr#29
+ eor r9,r9,r11,lsl#3
+ eor r10,r10,r12,lsl#3
+ eor r9,r9,r11,lsr#6
+ eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
+ eor r9,r9,r12,lsl#26
+
+ ldr r12,[sp,#120+4]
+ adds r3,r3,r9
+ ldr r9,[sp,#192+0]
+ adc r4,r4,r10
+
+ ldr r10,[sp,#192+4]
+ adds r3,r3,r11
+ adc r4,r4,r12
+ adds r3,r3,r9
+ adc r4,r4,r10
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#23
+
+ ldr r12,[sp,#16+0] @ c.lo
+#ifdef __thumb2__
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+#ifdef __thumb2__
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
+ beq .L16_79
+ bic r14,r14,#1
+
+ ldr r3,[sp,#8+0]
+ ldr r4,[sp,#8+4]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
+ adds r9,r5,r9
+ str r9, [r0,#0+LO]
+ adc r10,r6,r10
+ str r10, [r0,#0+HI]
+ adds r11,r3,r11
+ str r11, [r0,#8+LO]
+ adc r12,r4,r12
+ str r12, [r0,#8+HI]
+
+ ldr r5,[sp,#16+0]
+ ldr r6,[sp,#16+4]
+ ldr r3,[sp,#24+0]
+ ldr r4,[sp,#24+4]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ adds r9,r5,r9
+ str r9, [r0,#16+LO]
+ adc r10,r6,r10
+ str r10, [r0,#16+HI]
+ adds r11,r3,r11
+ str r11, [r0,#24+LO]
+ adc r12,r4,r12
+ str r12, [r0,#24+HI]
+
+ ldr r3,[sp,#40+0]
+ ldr r4,[sp,#40+4]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
+ adds r7,r7,r9
+ str r7,[r0,#32+LO]
+ adc r8,r8,r10
+ str r8,[r0,#32+HI]
+ adds r11,r3,r11
+ str r11, [r0,#40+LO]
+ adc r12,r4,r12
+ str r12, [r0,#40+HI]
+
+ ldr r5,[sp,#48+0]
+ ldr r6,[sp,#48+4]
+ ldr r3,[sp,#56+0]
+ ldr r4,[sp,#56+4]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+ adds r9,r5,r9
+ str r9, [r0,#48+LO]
+ adc r10,r6,r10
+ str r10, [r0,#48+HI]
+ adds r11,r3,r11
+ str r11, [r0,#56+LO]
+ adc r12,r4,r12
+ str r12, [r0,#56+HI]
+
+ add sp,sp,#640
+ sub r14,r14,#640
+
+ teq r1,r2
+ bne .Loop
+
+ add sp,sp,#8*9 @ destroy frame
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
+
+.arch armv7-a
+.fpu neon
+
+.globl zfs_sha512_block_neon
+.type zfs_sha512_block_neon,%function
+.align 4
+zfs_sha512_block_neon:
+.LNEON:
+	dmb	@ erratum #451034 workaround on early Cortex-A8
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ adr r3,K512
+ VFP_ABI_PUSH
+ vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
+.Loop_neon:
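+	@ NEON path: the full a..h state stays in d16-d23 and each round
+	@ handles one 64-bit word natively; the vshr/vsli pairs build the
+	@ 64-bit rotations directly and vbsl computes Ch and Maj as
+	@ bitwise selects.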
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 0>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 1>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 2>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 3>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 4>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 5>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 6>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 7>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 8>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 9>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 10>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 11>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 12>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 13>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 14>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 15>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ mov r12,#4
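+	@ Rounds 16-79: four passes of 16 rounds each (r12 counts passes).
+	@ The NEON message schedule lives in q0-q7; each pass updates the
+	@ schedule two 64-bit words at a time with sigma0/sigma1 before
+	@ folding them into the rounds.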
+.L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
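+	@ The "#if 16<16" block above (and its 17..31 siblings below) is
+	@ never compiled; the unroller emits it with empty vrev64.8 operands,
+	@ so the malformed instruction is harmless dead text.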
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 17>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 19>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 21>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 23>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 25>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 27>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 29>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 31>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ bne .L16_79_neon
+
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne .Loop_neon
+
+ VFP_ABI_POP
+ bx lr @ .word 0xe12fff1e
+.size zfs_sha512_block_neon,.-zfs_sha512_block_neon
+#endif
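
Note on the entry point above: judging from the register usage in the listing (r0 = state words, r1 = input, r2 = input end), zfs_sha512_block_neon follows the usual (state, data, nblocks) block-function shape. Below is a minimal C-side sketch under that assumption; the real prototype lives in the OpenZFS SHA-2 headers, not in this patch.

/*
 * Sketch only: presumed C-side contract for zfs_sha512_block_neon,
 * inferred from the listing rather than copied from OpenZFS headers.
 */
#include <stddef.h>
#include <stdint.h>

#define SHA512_BLOCK_LENGTH	128	/* matches sys/sha2.h */

/* Implemented by the ARMv7 NEON assembly above. */
extern void zfs_sha512_block_neon(uint64_t state[8], const void *data,
    size_t nblocks);

/* Feed whole 128-byte blocks; buffering and final padding are the caller's job. */
static size_t
sha512_feed_blocks(uint64_t state[8], const uint8_t *buf, size_t len)
{
	size_t nblocks = len / SHA512_BLOCK_LENGTH;

	if (nblocks > 0)
		zfs_sha512_block_neon(state, buf, nblocks);
	return (nblocks * SHA512_BLOCK_LENGTH);	/* bytes consumed */
}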
diff --git a/module/icp/asm-ppc64/sha2/sha256-p8.S b/module/icp/asm-ppc64/sha2/sha256-p8.S
new file mode 100644
index 000000000..6bbfe23b6
--- /dev/null
+++ b/module/icp/asm-ppc64/sha2/sha256-p8.S
@@ -0,0 +1,1505 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified the assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha256_power8
+.globl .zfs_sha256_power8
+.type zfs_sha256_power8,@function
+.section ".opd","aw"
+.align 3
+zfs_sha256_power8:
+.quad .zfs_sha256_power8,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha256_power8:
+ stdu 1,-384(1)
+ mflr 8
+ li 10,207
+ li 11,223
+ stvx 24,10,1
+ addi 10,10,32
+ mfspr 12,256
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 11,-4096+255
+ stw 12,332(1)
+ li 10,0x10
+ std 26,336(1)
+ li 26,0x20
+ std 27,344(1)
+ li 27,0x30
+ std 28,352(1)
+ li 28,0x40
+ std 29,360(1)
+ li 29,0x50
+ std 30,368(1)
+ li 30,0x60
+ std 31,376(1)
+ li 31,0x70
+ std 8,400(1)
+ mtspr 256,11
+
+ bl .LPICmeup
+ addi 11,1,79
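+	/*
+	 * The raw .long words throughout this file encode POWER8 vector
+	 * loads/stores and the SHA-2 sigma instructions numerically, so
+	 * the file assembles with older binutils lacking those mnemonics.
+	 */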
+ .long 0x7C001E19
+ .long 0x7C8A1E19
+ vsldoi 1,0,0,4
+ vsldoi 2,0,0,8
+ vsldoi 3,0,0,12
+ vsldoi 5,4,4,4
+ vsldoi 6,4,4,8
+ vsldoi 7,4,4,12
+ li 0,3
+ b .Loop
+.align 5
+.Loop:
+ lvx 28,0,6
+ .long 0x7D002699
+ addi 4,4,16
+ mr 7,6
+ stvx 0,0,11
+ stvx 1,10,11
+ stvx 2,26,11
+ stvx 3,27,11
+ stvx 4,28,11
+ stvx 5,29,11
+ stvx 6,30,11
+ stvx 7,31,11
+ vadduwm 7,7,28
+ lvx 28,10,6
+ vadduwm 7,7,8
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ vsldoi 9,8,8,4
+ vadduwm 6,6,9
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ vsldoi 10,9,9,4
+ vadduwm 5,5,10
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x7D802699
+ addi 4,4,16
+ vsldoi 11,10,10,4
+ vadduwm 4,4,11
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ vadduwm 3,3,12
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ vsldoi 13,12,12,4
+ vadduwm 2,2,13
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ vsldoi 14,13,13,4
+ vadduwm 1,1,14
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x7E002699
+ addi 4,4,16
+ vsldoi 15,14,14,4
+ vadduwm 0,0,15
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ vadduwm 7,7,16
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ vsldoi 17,16,16,4
+ vadduwm 6,6,17
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ vsldoi 18,17,17,4
+ vadduwm 5,5,18
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x7F002699
+ addi 4,4,16
+ vsldoi 19,18,18,4
+ vadduwm 4,4,19
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ vadduwm 3,3,24
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ vsldoi 25,24,24,4
+ vadduwm 2,2,25
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ vsldoi 26,25,25,4
+ vadduwm 1,1,26
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ vsldoi 27,26,26,4
+ .long 0x13C90682
+ vadduwm 8,8,30
+ .long 0x13DA7E82
+ vadduwm 8,8,30
+ vadduwm 8,8,17
+ vadduwm 0,0,27
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ mtctr 0
+ b .L16_xx
+.align 5
+.L16_xx:
+ .long 0x13CA0682
+ vadduwm 9,9,30
+ .long 0x13DB7E82
+ vadduwm 9,9,30
+ vadduwm 9,9,18
+ vadduwm 7,7,8
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ .long 0x13CB0682
+ vadduwm 10,10,30
+ .long 0x13C87E82
+ vadduwm 10,10,30
+ vadduwm 10,10,19
+ vadduwm 6,6,9
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ .long 0x13CC0682
+ vadduwm 11,11,30
+ .long 0x13C97E82
+ vadduwm 11,11,30
+ vadduwm 11,11,24
+ vadduwm 5,5,10
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x13CD0682
+ vadduwm 12,12,30
+ .long 0x13CA7E82
+ vadduwm 12,12,30
+ vadduwm 12,12,25
+ vadduwm 4,4,11
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ .long 0x13CE0682
+ vadduwm 13,13,30
+ .long 0x13CB7E82
+ vadduwm 13,13,30
+ vadduwm 13,13,26
+ vadduwm 3,3,12
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ .long 0x13CF0682
+ vadduwm 14,14,30
+ .long 0x13CC7E82
+ vadduwm 14,14,30
+ vadduwm 14,14,27
+ vadduwm 2,2,13
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13D00682
+ vadduwm 15,15,30
+ .long 0x13CD7E82
+ vadduwm 15,15,30
+ vadduwm 15,15,8
+ vadduwm 1,1,14
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x13D10682
+ vadduwm 16,16,30
+ .long 0x13CE7E82
+ vadduwm 16,16,30
+ vadduwm 16,16,9
+ vadduwm 0,0,15
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ .long 0x13D20682
+ vadduwm 17,17,30
+ .long 0x13CF7E82
+ vadduwm 17,17,30
+ vadduwm 17,17,10
+ vadduwm 7,7,16
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ .long 0x13D30682
+ vadduwm 18,18,30
+ .long 0x13D07E82
+ vadduwm 18,18,30
+ vadduwm 18,18,11
+ vadduwm 6,6,17
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ .long 0x13D80682
+ vadduwm 19,19,30
+ .long 0x13D17E82
+ vadduwm 19,19,30
+ vadduwm 19,19,12
+ vadduwm 5,5,18
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x13D90682
+ vadduwm 24,24,30
+ .long 0x13D27E82
+ vadduwm 24,24,30
+ vadduwm 24,24,13
+ vadduwm 4,4,19
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ .long 0x13DA0682
+ vadduwm 25,25,30
+ .long 0x13D37E82
+ vadduwm 25,25,30
+ vadduwm 25,25,14
+ vadduwm 3,3,24
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ .long 0x13DB0682
+ vadduwm 26,26,30
+ .long 0x13D87E82
+ vadduwm 26,26,30
+ vadduwm 26,26,15
+ vadduwm 2,2,25
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13C80682
+ vadduwm 27,27,30
+ .long 0x13D97E82
+ vadduwm 27,27,30
+ vadduwm 27,27,16
+ vadduwm 1,1,26
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x13C90682
+ vadduwm 8,8,30
+ .long 0x13DA7E82
+ vadduwm 8,8,30
+ vadduwm 8,8,17
+ vadduwm 0,0,27
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ bdnz .L16_xx
+
+ lvx 10,0,11
+ subic. 5,5,1
+ lvx 11,10,11
+ vadduwm 0,0,10
+ lvx 12,26,11
+ vadduwm 1,1,11
+ lvx 13,27,11
+ vadduwm 2,2,12
+ lvx 14,28,11
+ vadduwm 3,3,13
+ lvx 15,29,11
+ vadduwm 4,4,14
+ lvx 16,30,11
+ vadduwm 5,5,15
+ lvx 17,31,11
+ vadduwm 6,6,16
+ vadduwm 7,7,17
+ bne .Loop
+ lvx 8,26,7
+ vperm 0,0,1,28
+ lvx 9,27,7
+ vperm 4,4,5,28
+ vperm 0,0,2,8
+ vperm 4,4,6,8
+ vperm 0,0,3,9
+ vperm 4,4,7,9
+ .long 0x7C001F19
+ .long 0x7C8A1F19
+ addi 11,1,207
+ mtlr 8
+ mtspr 256,12
+ lvx 24,0,11
+ lvx 25,10,11
+ lvx 26,26,11
+ lvx 27,27,11
+ lvx 28,28,11
+ lvx 29,29,11
+ lvx 30,30,11
+ lvx 31,31,11
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,6,3,0
+.long 0
+.size .zfs_sha256_power8,.-.zfs_sha256_power8
+.size zfs_sha256_power8,.-.zfs_sha256_power8
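+/*
+ * .LPICmeup returns the address of the K256 constant table below in r6,
+ * using the bcl/mflr idiom so the code stays position-independent.
+ */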
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 6
+ addi 6,6,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
+.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98
+.long 0x71374491,0x71374491,0x71374491,0x71374491
+.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf
+.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5
+.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b
+.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1
+.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4
+.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5
+.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98
+.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01
+.long 0x243185be,0x243185be,0x243185be,0x243185be
+.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3
+.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74
+.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe
+.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7
+.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174
+.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1
+.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786
+.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6
+.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc
+.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f
+.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa
+.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc
+.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da
+.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152
+.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d
+.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8
+.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7
+.long 0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3
+.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147
+.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351
+.long 0x14292967,0x14292967,0x14292967,0x14292967
+.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85
+.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138
+.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc
+.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13
+.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354
+.long 0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb
+.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e
+.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85
+.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1
+.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b
+.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70
+.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3
+.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819
+.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624
+.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585
+.long 0x106aa070,0x106aa070,0x106aa070,0x106aa070
+.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116
+.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08
+.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c
+.long 0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5
+.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3
+.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a
+.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f
+.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3
+.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee
+.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f
+.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814
+.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208
+.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa
+.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb
+.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7
+.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2
+.long 0,0,0,0
+.long 0x00010203,0x10111213,0x10111213,0x10111213
+.long 0x00010203,0x04050607,0x10111213,0x10111213
+.long 0x00010203,0x04050607,0x08090a0b,0x10111213
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion 2
+.text
+
+.globl zfs_sha256_power8
+.type zfs_sha256_power8,@function
+.align 6
+zfs_sha256_power8:
+.localentry zfs_sha256_power8,0
+
+ stdu 1,-384(1)
+ mflr 8
+ li 10,207
+ li 11,223
+ stvx 24,10,1
+ addi 10,10,32
+ li 12,-1
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 11,-4096+255
+ stw 12,332(1)
+ li 10,0x10
+ std 26,336(1)
+ li 26,0x20
+ std 27,344(1)
+ li 27,0x30
+ std 28,352(1)
+ li 28,0x40
+ std 29,360(1)
+ li 29,0x50
+ std 30,368(1)
+ li 30,0x60
+ std 31,376(1)
+ li 31,0x70
+ std 8,400(1)
+ or 11,11,11
+
+ bl .LPICmeup
+ addi 11,1,79
+ li 7,8
+ lvsl 31,0,7
+ vspltisb 28,0x0f
+ vxor 31,31,28
+ .long 0x7C001E19
+ .long 0x7C8A1E19
+ vsldoi 1,0,0,4
+ vsldoi 2,0,0,8
+ vsldoi 3,0,0,12
+ vsldoi 5,4,4,4
+ vsldoi 6,4,4,8
+ vsldoi 7,4,4,12
+ li 0,3
+ b .Loop
+.align 5
+.Loop:
+ lvx 28,0,6
+ .long 0x7D002699
+ addi 4,4,16
+ mr 7,6
+ stvx 0,0,11
+ stvx 1,10,11
+ stvx 2,26,11
+ stvx 3,27,11
+ stvx 4,28,11
+ stvx 5,29,11
+ stvx 6,30,11
+ stvx 7,31,11
+ vadduwm 7,7,28
+ lvx 28,10,6
+ vperm 8,8,8,31
+ vadduwm 7,7,8
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ vsldoi 9,8,8,4
+ vadduwm 6,6,9
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ vsldoi 10,9,9,4
+ vadduwm 5,5,10
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x7D802699
+ addi 4,4,16
+ vsldoi 11,10,10,4
+ vadduwm 4,4,11
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ vperm 12,12,12,31
+ vadduwm 3,3,12
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ vsldoi 13,12,12,4
+ vadduwm 2,2,13
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ vsldoi 14,13,13,4
+ vadduwm 1,1,14
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x7E002699
+ addi 4,4,16
+ vsldoi 15,14,14,4
+ vadduwm 0,0,15
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ vperm 16,16,16,31
+ vadduwm 7,7,16
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ vsldoi 17,16,16,4
+ vadduwm 6,6,17
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ vsldoi 18,17,17,4
+ vadduwm 5,5,18
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x7F002699
+ addi 4,4,16
+ vsldoi 19,18,18,4
+ vadduwm 4,4,19
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ vperm 24,24,24,31
+ vadduwm 3,3,24
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ vsldoi 25,24,24,4
+ vadduwm 2,2,25
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ vsldoi 26,25,25,4
+ vadduwm 1,1,26
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ vsldoi 27,26,26,4
+ .long 0x13C90682
+ vadduwm 8,8,30
+ .long 0x13DA7E82
+ vadduwm 8,8,30
+ vadduwm 8,8,17
+ vadduwm 0,0,27
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ mtctr 0
+ b .L16_xx
+.align 5
+.L16_xx:
+ .long 0x13CA0682
+ vadduwm 9,9,30
+ .long 0x13DB7E82
+ vadduwm 9,9,30
+ vadduwm 9,9,18
+ vadduwm 7,7,8
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ .long 0x13CB0682
+ vadduwm 10,10,30
+ .long 0x13C87E82
+ vadduwm 10,10,30
+ vadduwm 10,10,19
+ vadduwm 6,6,9
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ .long 0x13CC0682
+ vadduwm 11,11,30
+ .long 0x13C97E82
+ vadduwm 11,11,30
+ vadduwm 11,11,24
+ vadduwm 5,5,10
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x13CD0682
+ vadduwm 12,12,30
+ .long 0x13CA7E82
+ vadduwm 12,12,30
+ vadduwm 12,12,25
+ vadduwm 4,4,11
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ .long 0x13CE0682
+ vadduwm 13,13,30
+ .long 0x13CB7E82
+ vadduwm 13,13,30
+ vadduwm 13,13,26
+ vadduwm 3,3,12
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ .long 0x13CF0682
+ vadduwm 14,14,30
+ .long 0x13CC7E82
+ vadduwm 14,14,30
+ vadduwm 14,14,27
+ vadduwm 2,2,13
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13D00682
+ vadduwm 15,15,30
+ .long 0x13CD7E82
+ vadduwm 15,15,30
+ vadduwm 15,15,8
+ vadduwm 1,1,14
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x13D10682
+ vadduwm 16,16,30
+ .long 0x13CE7E82
+ vadduwm 16,16,30
+ vadduwm 16,16,9
+ vadduwm 0,0,15
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ .long 0x13D20682
+ vadduwm 17,17,30
+ .long 0x13CF7E82
+ vadduwm 17,17,30
+ vadduwm 17,17,10
+ vadduwm 7,7,16
+ vsel 29,6,5,4
+ vadduwm 6,6,28
+ vadduwm 7,7,29
+ .long 0x13C4FE82
+ vadduwm 7,7,30
+ vxor 29,0,1
+ vsel 29,1,2,29
+ vadduwm 3,3,7
+ .long 0x13C08682
+ vadduwm 30,30,29
+ vadduwm 7,7,30
+ lvx 28,26,7
+ .long 0x13D30682
+ vadduwm 18,18,30
+ .long 0x13D07E82
+ vadduwm 18,18,30
+ vadduwm 18,18,11
+ vadduwm 6,6,17
+ vsel 29,5,4,3
+ vadduwm 5,5,28
+ vadduwm 6,6,29
+ .long 0x13C3FE82
+ vadduwm 6,6,30
+ vxor 29,7,0
+ vsel 29,0,1,29
+ vadduwm 2,2,6
+ .long 0x13C78682
+ vadduwm 30,30,29
+ vadduwm 6,6,30
+ lvx 28,27,7
+ .long 0x13D80682
+ vadduwm 19,19,30
+ .long 0x13D17E82
+ vadduwm 19,19,30
+ vadduwm 19,19,12
+ vadduwm 5,5,18
+ vsel 29,4,3,2
+ vadduwm 4,4,28
+ vadduwm 5,5,29
+ .long 0x13C2FE82
+ vadduwm 5,5,30
+ vxor 29,6,7
+ vsel 29,7,0,29
+ vadduwm 1,1,5
+ .long 0x13C68682
+ vadduwm 30,30,29
+ vadduwm 5,5,30
+ lvx 28,28,7
+ .long 0x13D90682
+ vadduwm 24,24,30
+ .long 0x13D27E82
+ vadduwm 24,24,30
+ vadduwm 24,24,13
+ vadduwm 4,4,19
+ vsel 29,3,2,1
+ vadduwm 3,3,28
+ vadduwm 4,4,29
+ .long 0x13C1FE82
+ vadduwm 4,4,30
+ vxor 29,5,6
+ vsel 29,6,7,29
+ vadduwm 0,0,4
+ .long 0x13C58682
+ vadduwm 30,30,29
+ vadduwm 4,4,30
+ lvx 28,29,7
+ .long 0x13DA0682
+ vadduwm 25,25,30
+ .long 0x13D37E82
+ vadduwm 25,25,30
+ vadduwm 25,25,14
+ vadduwm 3,3,24
+ vsel 29,2,1,0
+ vadduwm 2,2,28
+ vadduwm 3,3,29
+ .long 0x13C0FE82
+ vadduwm 3,3,30
+ vxor 29,4,5
+ vsel 29,5,6,29
+ vadduwm 7,7,3
+ .long 0x13C48682
+ vadduwm 30,30,29
+ vadduwm 3,3,30
+ lvx 28,30,7
+ .long 0x13DB0682
+ vadduwm 26,26,30
+ .long 0x13D87E82
+ vadduwm 26,26,30
+ vadduwm 26,26,15
+ vadduwm 2,2,25
+ vsel 29,1,0,7
+ vadduwm 1,1,28
+ vadduwm 2,2,29
+ .long 0x13C7FE82
+ vadduwm 2,2,30
+ vxor 29,3,4
+ vsel 29,4,5,29
+ vadduwm 6,6,2
+ .long 0x13C38682
+ vadduwm 30,30,29
+ vadduwm 2,2,30
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13C80682
+ vadduwm 27,27,30
+ .long 0x13D97E82
+ vadduwm 27,27,30
+ vadduwm 27,27,16
+ vadduwm 1,1,26
+ vsel 29,0,7,6
+ vadduwm 0,0,28
+ vadduwm 1,1,29
+ .long 0x13C6FE82
+ vadduwm 1,1,30
+ vxor 29,2,3
+ vsel 29,3,4,29
+ vadduwm 5,5,1
+ .long 0x13C28682
+ vadduwm 30,30,29
+ vadduwm 1,1,30
+ lvx 28,0,7
+ .long 0x13C90682
+ vadduwm 8,8,30
+ .long 0x13DA7E82
+ vadduwm 8,8,30
+ vadduwm 8,8,17
+ vadduwm 0,0,27
+ vsel 29,7,6,5
+ vadduwm 7,7,28
+ vadduwm 0,0,29
+ .long 0x13C5FE82
+ vadduwm 0,0,30
+ vxor 29,1,2
+ vsel 29,2,3,29
+ vadduwm 4,4,0
+ .long 0x13C18682
+ vadduwm 30,30,29
+ vadduwm 0,0,30
+ lvx 28,10,7
+ bdnz .L16_xx
+
+ lvx 10,0,11
+ subic. 5,5,1
+ lvx 11,10,11
+ vadduwm 0,0,10
+ lvx 12,26,11
+ vadduwm 1,1,11
+ lvx 13,27,11
+ vadduwm 2,2,12
+ lvx 14,28,11
+ vadduwm 3,3,13
+ lvx 15,29,11
+ vadduwm 4,4,14
+ lvx 16,30,11
+ vadduwm 5,5,15
+ lvx 17,31,11
+ vadduwm 6,6,16
+ vadduwm 7,7,17
+ bne .Loop
+ lvx 8,26,7
+ vperm 0,0,1,28
+ lvx 9,27,7
+ vperm 4,4,5,28
+ vperm 0,0,2,8
+ vperm 4,4,6,8
+ vperm 0,0,3,9
+ vperm 4,4,7,9
+ .long 0x7C001F19
+ .long 0x7C8A1F19
+ addi 11,1,207
+ mtlr 8
+ or 12,12,12
+ lvx 24,0,11
+ lvx 25,10,11
+ lvx 26,26,11
+ lvx 27,27,11
+ lvx 28,28,11
+ lvx 29,29,11
+ lvx 30,30,11
+ lvx 31,31,11
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,6,3,0
+.long 0
+.size zfs_sha256_power8,.-zfs_sha256_power8
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 6
+ addi 6,6,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
+.long 0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98
+.long 0x71374491,0x71374491,0x71374491,0x71374491
+.long 0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf
+.long 0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5
+.long 0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b
+.long 0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1
+.long 0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4
+.long 0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5
+.long 0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98
+.long 0x12835b01,0x12835b01,0x12835b01,0x12835b01
+.long 0x243185be,0x243185be,0x243185be,0x243185be
+.long 0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3
+.long 0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74
+.long 0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe
+.long 0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7
+.long 0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174
+.long 0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1
+.long 0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786
+.long 0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6
+.long 0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc
+.long 0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f
+.long 0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa
+.long 0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc
+.long 0x76f988da,0x76f988da,0x76f988da,0x76f988da
+.long 0x983e5152,0x983e5152,0x983e5152,0x983e5152
+.long 0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d
+.long 0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8
+.long 0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7
+.long 0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3
+.long 0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147
+.long 0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351
+.long 0x14292967,0x14292967,0x14292967,0x14292967
+.long 0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85
+.long 0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138
+.long 0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc
+.long 0x53380d13,0x53380d13,0x53380d13,0x53380d13
+.long 0x650a7354,0x650a7354,0x650a7354,0x650a7354
+.long 0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb
+.long 0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e
+.long 0x92722c85,0x92722c85,0x92722c85,0x92722c85
+.long 0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1
+.long 0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b
+.long 0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70
+.long 0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3
+.long 0xd192e819,0xd192e819,0xd192e819,0xd192e819
+.long 0xd6990624,0xd6990624,0xd6990624,0xd6990624
+.long 0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585
+.long 0x106aa070,0x106aa070,0x106aa070,0x106aa070
+.long 0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116
+.long 0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08
+.long 0x2748774c,0x2748774c,0x2748774c,0x2748774c
+.long 0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5
+.long 0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3
+.long 0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a
+.long 0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f
+.long 0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3
+.long 0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee
+.long 0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f
+.long 0x84c87814,0x84c87814,0x84c87814,0x84c87814
+.long 0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208
+.long 0x90befffa,0x90befffa,0x90befffa,0x90befffa
+.long 0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb
+.long 0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7
+.long 0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2
+.long 0,0,0,0
+.long 0x10111213,0x10111213,0x10111213,0x00010203
+.long 0x10111213,0x10111213,0x04050607,0x00010203
+.long 0x10111213,0x08090a0b,0x04050607,0x00010203
+#endif
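
The file above carries the POWER8 in-core crypto path (separate big- and little-endian builds selected by preprocessor); sha256-ppc.S below is the portable integer fallback for other PPC64 cores. A hedged sketch of how a caller might select between the two exported entry points; has_power8_vcrypto is a hypothetical feature probe, not an OpenZFS API.

/*
 * Sketch only: choosing between the two PowerPC entry points in this
 * patch. Function names match the listings; the probe is a placeholder.
 */
#include <stddef.h>
#include <stdint.h>

extern void zfs_sha256_ppc(uint32_t state[8], const void *data,
    size_t nblocks);
extern void zfs_sha256_power8(uint32_t state[8], const void *data,
    size_t nblocks);

typedef void (*sha256_block_f)(uint32_t state[8], const void *data,
    size_t nblocks);

static sha256_block_f
sha256_pick_impl(int has_power8_vcrypto)
{
	/* Prefer the POWER8 in-core SHA-2 instructions when present. */
	return (has_power8_vcrypto ? zfs_sha256_power8 : zfs_sha256_ppc);
}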
diff --git a/module/icp/asm-ppc64/sha2/sha256-ppc.S b/module/icp/asm-ppc64/sha2/sha256-ppc.S
new file mode 100644
index 000000000..2219e313c
--- /dev/null
+++ b/module/icp/asm-ppc64/sha2/sha256-ppc.S
@@ -0,0 +1,2712 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified the assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha256_ppc
+.globl .zfs_sha256_ppc
+.type zfs_sha256_ppc,@function
+.section ".opd","aw"
+.align 3
+zfs_sha256_ppc:
+.quad .zfs_sha256_ppc,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha256_ppc:
+ stdu 1,-320(1)
+ mflr 0
+ sldi 5,5,6
+
+ std 3,144(1)
+
+ std 14,176(1)
+ std 15,184(1)
+ std 16,192(1)
+ std 17,200(1)
+ std 18,208(1)
+ std 19,216(1)
+ std 20,224(1)
+ std 21,232(1)
+ std 22,240(1)
+ std 23,248(1)
+ std 24,256(1)
+ std 25,264(1)
+ std 26,272(1)
+ std 27,280(1)
+ std 28,288(1)
+ std 29,296(1)
+ std 30,304(1)
+ std 31,312(1)
+ std 0,336(1)
+ lwz 8,0(3)
+ mr 31,4
+ lwz 9,4(3)
+ lwz 10,8(3)
+ lwz 11,12(3)
+ lwz 12,16(3)
+ lwz 6,20(3)
+ lwz 14,24(3)
+ lwz 15,28(3)
+ bl .LPICmeup
+.LPICedup:
+ andi. 0,31,3
+ bne .Lunaligned
+.Laligned:
+ add 5,31,5
+ std 5,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+ b .Ldone
+
+.align 4
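+/*
+ * Unaligned input: process as much as possible in place, then copy the
+ * 64-byte block that crosses a page boundary into a stack buffer
+ * (.Lmemcpy below) so word loads never touch a possibly unmapped page.
+ */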
+.Lunaligned:
+ subfic 0,31,4096
+ andi. 0,0,4032
+ beq .Lcross_page
+ cmpld 5,0
+ ble .Laligned
+ subfc 5,0,5
+ add 0,31,0
+ std 5,120(1)
+ std 0,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+
+ ld 5,120(1)
+.Lcross_page:
+ li 0,16
+ mtctr 0
+ addi 20,1,48
+.Lmemcpy:
+ lbz 16,0(31)
+ lbz 17,1(31)
+ lbz 18,2(31)
+ lbz 19,3(31)
+ addi 31,31,4
+ stb 16,0(20)
+ stb 17,1(20)
+ stb 18,2(20)
+ stb 19,3(20)
+ addi 20,20,4
+ bdnz .Lmemcpy
+ std 31,112(1)
+ addi 0,1,112
+ addi 31,1,48
+ std 5,120(1)
+ std 0,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+ ld 31,112(1)
+ ld 5,120(1)
+ addic. 5,5,-64
+ bne .Lunaligned
+
+.Ldone:
+ ld 0,336(1)
+ ld 14,176(1)
+ ld 15,184(1)
+ ld 16,192(1)
+ ld 17,200(1)
+ ld 18,208(1)
+ ld 19,216(1)
+ ld 20,224(1)
+ ld 21,232(1)
+ ld 22,240(1)
+ ld 23,248(1)
+ ld 24,256(1)
+ ld 25,264(1)
+ ld 26,272(1)
+ ld 27,280(1)
+ ld 28,288(1)
+ ld 29,296(1)
+ ld 30,304(1)
+ ld 31,312(1)
+ mtlr 0
+ addi 1,1,320
+ blr
+.long 0
+.byte 0,12,4,1,0x80,18,3,0
+.long 0
+.align 4
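+/*
+ * Core compression routine: the SHA-256 rounds, unrolled, with the
+ * eight state words a..h held in GPRs (r8-r12, r6, r14, r15) and the
+ * K256 table pointer arriving in r7 from .LPICmeup.
+ */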
+.Lsha2_block_private:
+ lwz 0,0(7)
+ lwz 16,0(31)
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ lwz 0,4(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 17,4(31)
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ lwz 0,8(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 18,8(31)
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ lwz 0,12(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 19,12(31)
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ lwz 0,16(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 20,16(31)
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ lwz 0,20(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 21,20(31)
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ lwz 0,24(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 22,24(31)
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ lwz 0,28(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 23,28(31)
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ lwz 0,32(7)
+ add 8,8,3
+ add 8,8,5
+
+ lwz 24,32(31)
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ lwz 0,36(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 25,36(31)
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ lwz 0,40(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 26,40(31)
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ lwz 0,44(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 27,44(31)
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ lwz 0,48(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 28,48(31)
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ lwz 0,52(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 29,52(31)
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ lwz 0,56(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 30,56(31)
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ lwz 0,60(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 31,60(31)
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
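+/*
+ * Rounds 16..63: three passes of sixteen rounds each (ctr = 3), with
+ * the message schedule expanded in place in r16-r31:
+ * W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
+ * sigma0/sigma1 being the rotrwi/srwi combinations below.
+ */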
+ li 5,3
+ mtctr 5
+.align 4
+.Lrounds:
+ addi 7,7,64
+ rotrwi 3,17,7
+ rotrwi 4,17,18
+ rotrwi 5,30,17
+ rotrwi 0,30,19
+ xor 3,3,4
+ srwi 4,17,3
+ xor 5,5,0
+ srwi 0,30,10
+ add 16,16,25
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,0(7)
+ add 16,16,3
+ add 16,16,5
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrwi 3,18,7
+ rotrwi 4,18,18
+ rotrwi 5,31,17
+ rotrwi 0,31,19
+ xor 3,3,4
+ srwi 4,18,3
+ xor 5,5,0
+ srwi 0,31,10
+ add 17,17,26
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,4(7)
+ add 17,17,3
+ add 17,17,5
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrwi 3,19,7
+ rotrwi 4,19,18
+ rotrwi 5,16,17
+ rotrwi 0,16,19
+ xor 3,3,4
+ srwi 4,19,3
+ xor 5,5,0
+ srwi 0,16,10
+ add 18,18,27
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,8(7)
+ add 18,18,3
+ add 18,18,5
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrwi 3,20,7
+ rotrwi 4,20,18
+ rotrwi 5,17,17
+ rotrwi 0,17,19
+ xor 3,3,4
+ srwi 4,20,3
+ xor 5,5,0
+ srwi 0,17,10
+ add 19,19,28
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,12(7)
+ add 19,19,3
+ add 19,19,5
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrwi 3,21,7
+ rotrwi 4,21,18
+ rotrwi 5,18,17
+ rotrwi 0,18,19
+ xor 3,3,4
+ srwi 4,21,3
+ xor 5,5,0
+ srwi 0,18,10
+ add 20,20,29
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,16(7)
+ add 20,20,3
+ add 20,20,5
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrwi 3,22,7
+ rotrwi 4,22,18
+ rotrwi 5,19,17
+ rotrwi 0,19,19
+ xor 3,3,4
+ srwi 4,22,3
+ xor 5,5,0
+ srwi 0,19,10
+ add 21,21,30
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,20(7)
+ add 21,21,3
+ add 21,21,5
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrwi 3,23,7
+ rotrwi 4,23,18
+ rotrwi 5,20,17
+ rotrwi 0,20,19
+ xor 3,3,4
+ srwi 4,23,3
+ xor 5,5,0
+ srwi 0,20,10
+ add 22,22,31
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,24(7)
+ add 22,22,3
+ add 22,22,5
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrwi 3,24,7
+ rotrwi 4,24,18
+ rotrwi 5,21,17
+ rotrwi 0,21,19
+ xor 3,3,4
+ srwi 4,24,3
+ xor 5,5,0
+ srwi 0,21,10
+ add 23,23,16
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,28(7)
+ add 23,23,3
+ add 23,23,5
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ rotrwi 3,25,7
+ rotrwi 4,25,18
+ rotrwi 5,22,17
+ rotrwi 0,22,19
+ xor 3,3,4
+ srwi 4,25,3
+ xor 5,5,0
+ srwi 0,22,10
+ add 24,24,17
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,32(7)
+ add 24,24,3
+ add 24,24,5
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrwi 3,26,7
+ rotrwi 4,26,18
+ rotrwi 5,23,17
+ rotrwi 0,23,19
+ xor 3,3,4
+ srwi 4,26,3
+ xor 5,5,0
+ srwi 0,23,10
+ add 25,25,18
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,36(7)
+ add 25,25,3
+ add 25,25,5
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrwi 3,27,7
+ rotrwi 4,27,18
+ rotrwi 5,24,17
+ rotrwi 0,24,19
+ xor 3,3,4
+ srwi 4,27,3
+ xor 5,5,0
+ srwi 0,24,10
+ add 26,26,19
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,40(7)
+ add 26,26,3
+ add 26,26,5
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrwi 3,28,7
+ rotrwi 4,28,18
+ rotrwi 5,25,17
+ rotrwi 0,25,19
+ xor 3,3,4
+ srwi 4,28,3
+ xor 5,5,0
+ srwi 0,25,10
+ add 27,27,20
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,44(7)
+ add 27,27,3
+ add 27,27,5
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrwi 3,29,7
+ rotrwi 4,29,18
+ rotrwi 5,26,17
+ rotrwi 0,26,19
+ xor 3,3,4
+ srwi 4,29,3
+ xor 5,5,0
+ srwi 0,26,10
+ add 28,28,21
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,48(7)
+ add 28,28,3
+ add 28,28,5
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrwi 3,30,7
+ rotrwi 4,30,18
+ rotrwi 5,27,17
+ rotrwi 0,27,19
+ xor 3,3,4
+ srwi 4,30,3
+ xor 5,5,0
+ srwi 0,27,10
+ add 29,29,22
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,52(7)
+ add 29,29,3
+ add 29,29,5
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrwi 3,31,7
+ rotrwi 4,31,18
+ rotrwi 5,28,17
+ rotrwi 0,28,19
+ xor 3,3,4
+ srwi 4,31,3
+ xor 5,5,0
+ srwi 0,28,10
+ add 30,30,23
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,56(7)
+ add 30,30,3
+ add 30,30,5
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrwi 3,16,7
+ rotrwi 4,16,18
+ rotrwi 5,29,17
+ rotrwi 0,29,19
+ xor 3,3,4
+ srwi 4,16,3
+ xor 5,5,0
+ srwi 0,29,10
+ add 31,31,24
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,60(7)
+ add 31,31,3
+ add 31,31,5
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ bdnz .Lrounds
+
+ ld 3,144(1)
+ ld 31,136(1)
+ ld 5,128(1)
+ subi 7,7,192
+
+ lwz 16,0(3)
+ lwz 17,4(3)
+ lwz 18,8(3)
+ lwz 19,12(3)
+ lwz 20,16(3)
+ lwz 21,20(3)
+ lwz 22,24(3)
+ addi 31,31,64
+ lwz 23,28(3)
+ add 8,8,16
+ add 9,9,17
+ std 31,136(1)
+ add 10,10,18
+ stw 8,0(3)
+ add 11,11,19
+ stw 9,4(3)
+ add 12,12,20
+ stw 10,8(3)
+ add 6,6,21
+ stw 11,12(3)
+ add 14,14,22
+ stw 12,16(3)
+ add 15,15,23
+ stw 6,20(3)
+ stw 14,24(3)
+ cmpld 31,5
+ stw 15,28(3)
+ bne .Lsha2_block_private
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.size .zfs_sha256_ppc,.-.zfs_sha256_ppc
+.size zfs_sha256_ppc,.-.zfs_sha256_ppc
+.align 6
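+/*
+ * .LPICmeup returns the address of the constant table below in r7,
+ * computed PC-relatively ("bcl 20,31,$+4" leaves the address of the
+ * next instruction in the link register), so the code needs no TOC
+ * entry or relocation for its data.
+ */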
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 7
+ addi 7,7,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
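+/*
+ * The 64 SHA-256 round constants K[0..63].  The 28-byte pad above makes
+ * the fixed offset in .LPICmeup ("addi 7,7,56") land exactly here.
+ */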
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion 2
+.text
+
+.globl zfs_sha256_ppc
+.type zfs_sha256_ppc,@function
+.align 6
+zfs_sha256_ppc:
+.localentry zfs_sha256_ppc,0
+
+ stdu 1,-320(1)
+ mflr 0
+ sldi 5,5,6
+
+ std 3,144(1)
+
+ std 14,176(1)
+ std 15,184(1)
+ std 16,192(1)
+ std 17,200(1)
+ std 18,208(1)
+ std 19,216(1)
+ std 20,224(1)
+ std 21,232(1)
+ std 22,240(1)
+ std 23,248(1)
+ std 24,256(1)
+ std 25,264(1)
+ std 26,272(1)
+ std 27,280(1)
+ std 28,288(1)
+ std 29,296(1)
+ std 30,304(1)
+ std 31,312(1)
+ std 0,336(1)
+ lwz 8,0(3)
+ mr 31,4
+ lwz 9,4(3)
+ lwz 10,8(3)
+ lwz 11,12(3)
+ lwz 12,16(3)
+ lwz 6,20(3)
+ lwz 14,24(3)
+ lwz 15,28(3)
+ bl .LPICmeup
+.LPICedup:
+ andi. 0,31,3
+ bne .Lunaligned
+.Laligned:
+ add 5,31,5
+ std 5,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+ b .Ldone
+
+.align 4
+.Lunaligned:
+ subfic 0,31,4096
+ andi. 0,0,4032
+ beq .Lcross_page
+ cmpld 5,0
+ ble .Laligned
+ subfc 5,0,5
+ add 0,31,0
+ std 5,120(1)
+ std 0,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+
+ ld 5,120(1)
+.Lcross_page:
+ li 0,16
+ mtctr 0
+ addi 20,1,48
+.Lmemcpy:
+ lbz 16,0(31)
+ lbz 17,1(31)
+ lbz 18,2(31)
+ lbz 19,3(31)
+ addi 31,31,4
+ stb 16,0(20)
+ stb 17,1(20)
+ stb 18,2(20)
+ stb 19,3(20)
+ addi 20,20,4
+ bdnz .Lmemcpy
+ std 31,112(1)
+ addi 0,1,112
+ addi 31,1,48
+ std 5,120(1)
+ std 0,128(1)
+ std 31,136(1)
+ bl .Lsha2_block_private
+ ld 31,112(1)
+ ld 5,120(1)
+ addic. 5,5,-64
+ bne .Lunaligned
+
+.Ldone:
+ ld 0,336(1)
+ ld 14,176(1)
+ ld 15,184(1)
+ ld 16,192(1)
+ ld 17,200(1)
+ ld 18,208(1)
+ ld 19,216(1)
+ ld 20,224(1)
+ ld 21,232(1)
+ ld 22,240(1)
+ ld 23,248(1)
+ ld 24,256(1)
+ ld 25,264(1)
+ ld 26,272(1)
+ ld 27,280(1)
+ ld 28,288(1)
+ ld 29,296(1)
+ ld 30,304(1)
+ ld 31,312(1)
+ mtlr 0
+ addi 1,1,320
+ blr
+.long 0
+.byte 0,12,4,1,0x80,18,3,0
+.long 0
+.align 4
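+/*
+ * Same round function as the big-endian variant above; the only
+ * difference is that each input word is byte-swapped on load (the
+ * rotlwi/rlwimi triplets), since SHA-256 operates on big-endian words.
+ */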
+.Lsha2_block_private:
+ lwz 0,0(7)
+ lwz 3,0(31)
+ rotlwi 16,3,8
+ rlwimi 16,3,24,0,7
+ rlwimi 16,3,24,16,23
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ lwz 0,4(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 3,4(31)
+ rotlwi 17,3,8
+ rlwimi 17,3,24,0,7
+ rlwimi 17,3,24,16,23
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ lwz 0,8(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 3,8(31)
+ rotlwi 18,3,8
+ rlwimi 18,3,24,0,7
+ rlwimi 18,3,24,16,23
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ lwz 0,12(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 3,12(31)
+ rotlwi 19,3,8
+ rlwimi 19,3,24,0,7
+ rlwimi 19,3,24,16,23
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ lwz 0,16(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 3,16(31)
+ rotlwi 20,3,8
+ rlwimi 20,3,24,0,7
+ rlwimi 20,3,24,16,23
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ lwz 0,20(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 3,20(31)
+ rotlwi 21,3,8
+ rlwimi 21,3,24,0,7
+ rlwimi 21,3,24,16,23
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ lwz 0,24(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 3,24(31)
+ rotlwi 22,3,8
+ rlwimi 22,3,24,0,7
+ rlwimi 22,3,24,16,23
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ lwz 0,28(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 3,28(31)
+ rotlwi 23,3,8
+ rlwimi 23,3,24,0,7
+ rlwimi 23,3,24,16,23
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ lwz 0,32(7)
+ add 8,8,3
+ add 8,8,5
+
+ lwz 3,32(31)
+ rotlwi 24,3,8
+ rlwimi 24,3,24,0,7
+ rlwimi 24,3,24,16,23
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ lwz 0,36(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 3,36(31)
+ rotlwi 25,3,8
+ rlwimi 25,3,24,0,7
+ rlwimi 25,3,24,16,23
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ lwz 0,40(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 3,40(31)
+ rotlwi 26,3,8
+ rlwimi 26,3,24,0,7
+ rlwimi 26,3,24,16,23
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ lwz 0,44(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 3,44(31)
+ rotlwi 27,3,8
+ rlwimi 27,3,24,0,7
+ rlwimi 27,3,24,16,23
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ lwz 0,48(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 3,48(31)
+ rotlwi 28,3,8
+ rlwimi 28,3,24,0,7
+ rlwimi 28,3,24,16,23
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ lwz 0,52(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 3,52(31)
+ rotlwi 29,3,8
+ rlwimi 29,3,24,0,7
+ rlwimi 29,3,24,16,23
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ lwz 0,56(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 3,56(31)
+ rotlwi 30,3,8
+ rlwimi 30,3,24,0,7
+ rlwimi 30,3,24,16,23
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ lwz 0,60(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 3,60(31)
+ rotlwi 31,3,8
+ rlwimi 31,3,24,0,7
+ rlwimi 31,3,24,16,23
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ li 5,3
+ mtctr 5
+.align 4
+.Lrounds:
+ addi 7,7,64
+ rotrwi 3,17,7
+ rotrwi 4,17,18
+ rotrwi 5,30,17
+ rotrwi 0,30,19
+ xor 3,3,4
+ srwi 4,17,3
+ xor 5,5,0
+ srwi 0,30,10
+ add 16,16,25
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,0(7)
+ add 16,16,3
+ add 16,16,5
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrwi 3,18,7
+ rotrwi 4,18,18
+ rotrwi 5,31,17
+ rotrwi 0,31,19
+ xor 3,3,4
+ srwi 4,18,3
+ xor 5,5,0
+ srwi 0,31,10
+ add 17,17,26
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,4(7)
+ add 17,17,3
+ add 17,17,5
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrwi 3,19,7
+ rotrwi 4,19,18
+ rotrwi 5,16,17
+ rotrwi 0,16,19
+ xor 3,3,4
+ srwi 4,19,3
+ xor 5,5,0
+ srwi 0,16,10
+ add 18,18,27
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,8(7)
+ add 18,18,3
+ add 18,18,5
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrwi 3,20,7
+ rotrwi 4,20,18
+ rotrwi 5,17,17
+ rotrwi 0,17,19
+ xor 3,3,4
+ srwi 4,20,3
+ xor 5,5,0
+ srwi 0,17,10
+ add 19,19,28
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,12(7)
+ add 19,19,3
+ add 19,19,5
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrwi 3,21,7
+ rotrwi 4,21,18
+ rotrwi 5,18,17
+ rotrwi 0,18,19
+ xor 3,3,4
+ srwi 4,21,3
+ xor 5,5,0
+ srwi 0,18,10
+ add 20,20,29
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,16(7)
+ add 20,20,3
+ add 20,20,5
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrwi 3,22,7
+ rotrwi 4,22,18
+ rotrwi 5,19,17
+ rotrwi 0,19,19
+ xor 3,3,4
+ srwi 4,22,3
+ xor 5,5,0
+ srwi 0,19,10
+ add 21,21,30
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,20(7)
+ add 21,21,3
+ add 21,21,5
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrwi 3,23,7
+ rotrwi 4,23,18
+ rotrwi 5,20,17
+ rotrwi 0,20,19
+ xor 3,3,4
+ srwi 4,23,3
+ xor 5,5,0
+ srwi 0,20,10
+ add 22,22,31
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,24(7)
+ add 22,22,3
+ add 22,22,5
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrwi 3,24,7
+ rotrwi 4,24,18
+ rotrwi 5,21,17
+ rotrwi 0,21,19
+ xor 3,3,4
+ srwi 4,24,3
+ xor 5,5,0
+ srwi 0,21,10
+ add 23,23,16
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,28(7)
+ add 23,23,3
+ add 23,23,5
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ rotrwi 3,25,7
+ rotrwi 4,25,18
+ rotrwi 5,22,17
+ rotrwi 0,22,19
+ xor 3,3,4
+ srwi 4,25,3
+ xor 5,5,0
+ srwi 0,22,10
+ add 24,24,17
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,32(7)
+ add 24,24,3
+ add 24,24,5
+ rotrwi 3,12,6
+ rotrwi 4,12,11
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrwi 4,4,14
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrwi 3,8,2
+ rotrwi 4,8,13
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrwi 3,26,7
+ rotrwi 4,26,18
+ rotrwi 5,23,17
+ rotrwi 0,23,19
+ xor 3,3,4
+ srwi 4,26,3
+ xor 5,5,0
+ srwi 0,23,10
+ add 25,25,18
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,36(7)
+ add 25,25,3
+ add 25,25,5
+ rotrwi 3,11,6
+ rotrwi 4,11,11
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrwi 4,4,14
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrwi 3,15,2
+ rotrwi 4,15,13
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrwi 3,27,7
+ rotrwi 4,27,18
+ rotrwi 5,24,17
+ rotrwi 0,24,19
+ xor 3,3,4
+ srwi 4,27,3
+ xor 5,5,0
+ srwi 0,24,10
+ add 26,26,19
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,40(7)
+ add 26,26,3
+ add 26,26,5
+ rotrwi 3,10,6
+ rotrwi 4,10,11
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrwi 4,4,14
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrwi 3,14,2
+ rotrwi 4,14,13
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrwi 3,28,7
+ rotrwi 4,28,18
+ rotrwi 5,25,17
+ rotrwi 0,25,19
+ xor 3,3,4
+ srwi 4,28,3
+ xor 5,5,0
+ srwi 0,25,10
+ add 27,27,20
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,44(7)
+ add 27,27,3
+ add 27,27,5
+ rotrwi 3,9,6
+ rotrwi 4,9,11
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrwi 4,4,14
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrwi 3,6,2
+ rotrwi 4,6,13
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrwi 3,29,7
+ rotrwi 4,29,18
+ rotrwi 5,26,17
+ rotrwi 0,26,19
+ xor 3,3,4
+ srwi 4,29,3
+ xor 5,5,0
+ srwi 0,26,10
+ add 28,28,21
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,48(7)
+ add 28,28,3
+ add 28,28,5
+ rotrwi 3,8,6
+ rotrwi 4,8,11
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrwi 4,4,14
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrwi 3,12,2
+ rotrwi 4,12,13
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrwi 3,30,7
+ rotrwi 4,30,18
+ rotrwi 5,27,17
+ rotrwi 0,27,19
+ xor 3,3,4
+ srwi 4,30,3
+ xor 5,5,0
+ srwi 0,27,10
+ add 29,29,22
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,52(7)
+ add 29,29,3
+ add 29,29,5
+ rotrwi 3,15,6
+ rotrwi 4,15,11
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrwi 4,4,14
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrwi 3,11,2
+ rotrwi 4,11,13
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrwi 3,31,7
+ rotrwi 4,31,18
+ rotrwi 5,28,17
+ rotrwi 0,28,19
+ xor 3,3,4
+ srwi 4,31,3
+ xor 5,5,0
+ srwi 0,28,10
+ add 30,30,23
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,56(7)
+ add 30,30,3
+ add 30,30,5
+ rotrwi 3,14,6
+ rotrwi 4,14,11
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrwi 4,4,14
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrwi 3,10,2
+ rotrwi 4,10,13
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrwi 3,16,7
+ rotrwi 4,16,18
+ rotrwi 5,29,17
+ rotrwi 0,29,19
+ xor 3,3,4
+ srwi 4,16,3
+ xor 5,5,0
+ srwi 0,29,10
+ add 31,31,24
+ xor 3,3,4
+ xor 5,5,0
+ lwz 0,60(7)
+ add 31,31,3
+ add 31,31,5
+ rotrwi 3,6,6
+ rotrwi 4,6,11
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrwi 4,4,14
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrwi 3,9,2
+ rotrwi 4,9,13
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrwi 4,4,9
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ bdnz .Lrounds
+
+ ld 3,144(1)
+ ld 31,136(1)
+ ld 5,128(1)
+ subi 7,7,192
+
+ lwz 16,0(3)
+ lwz 17,4(3)
+ lwz 18,8(3)
+ lwz 19,12(3)
+ lwz 20,16(3)
+ lwz 21,20(3)
+ lwz 22,24(3)
+ addi 31,31,64
+ lwz 23,28(3)
+ add 8,8,16
+ add 9,9,17
+ std 31,136(1)
+ add 10,10,18
+ stw 8,0(3)
+ add 11,11,19
+ stw 9,4(3)
+ add 12,12,20
+ stw 10,8(3)
+ add 6,6,21
+ stw 11,12(3)
+ add 14,14,22
+ stw 12,16(3)
+ add 15,15,23
+ stw 6,20(3)
+ stw 14,24(3)
+ cmpld 31,5
+ stw 15,28(3)
+ bne .Lsha2_block_private
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.size zfs_sha256_ppc,.-zfs_sha256_ppc
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 7
+ addi 7,7,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+#endif
diff --git a/module/icp/asm-ppc64/sha2/sha512-p8.S b/module/icp/asm-ppc64/sha2/sha512-p8.S
new file mode 100644
index 000000000..39a90ede3
--- /dev/null
+++ b/module/icp/asm-ppc64/sha2/sha512-p8.S
@@ -0,0 +1,1706 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
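+/*
+ * POWER8 vector implementation of SHA-512.  The bare ".long" opcodes
+ * throughout appear to be hand-encoded VSX/crypto instructions (lxvd2x
+ * and stxvd2x for the 16-byte loads and stores, vaddudm for the 64-bit
+ * lane additions, vshasigmad for the sigma functions), emitted
+ * numerically so the file also assembles with toolchains that predate
+ * these mnemonics.
+ */
+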
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha512_power8
+.globl .zfs_sha512_power8
+.type zfs_sha512_power8,@function
+.section ".opd","aw"
+.align 3
+zfs_sha512_power8:
+.quad .zfs_sha512_power8,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha512_power8:
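+/*
+ * Prologue: carve a 384-byte frame, spill the non-volatile vector
+ * registers v24-v31 with stvx, and save VRSAVE (SPR 256) before
+ * marking the vector registers this routine uses.
+ */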
+ stdu 1,-384(1)
+ mflr 8
+ li 10,207
+ li 11,223
+ stvx 24,10,1
+ addi 10,10,32
+ mfspr 12,256
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 11,-4096+255
+ stw 12,332(1)
+ li 10,0x10
+ std 26,336(1)
+ li 26,0x20
+ std 27,344(1)
+ li 27,0x30
+ std 28,352(1)
+ li 28,0x40
+ std 29,360(1)
+ li 29,0x50
+ std 30,368(1)
+ li 30,0x60
+ std 31,376(1)
+ li 31,0x70
+ std 8,400(1)
+ mtspr 256,11
+
+ bl .LPICmeup
+ addi 11,1,79
+ .long 0x7C001E99
+ .long 0x7C4A1E99
+ .long 0x7C9A1E99
+ vsldoi 1,0,0,8
+ .long 0x7CDB1E99
+ vsldoi 3,2,2,8
+ vsldoi 5,4,4,8
+ vsldoi 7,6,6,8
+ li 0,4
+ b .Loop
+.align 5
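+/*
+ * Each pass of .Loop hashes one 128-byte block: the first 16 of
+ * SHA-512's 80 rounds are unrolled here together with the message
+ * loads, and the remaining 64 are done by four trips through .L16_xx
+ * (the counter is preloaded with 4 via r0).
+ */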
+.Loop:
+ lvx 28,0,6
+ .long 0x7D002699
+ addi 4,4,16
+ mr 7,6
+ stvx 0,0,11
+ stvx 1,10,11
+ stvx 2,26,11
+ stvx 3,27,11
+ stvx 4,28,11
+ stvx 5,29,11
+ stvx 6,30,11
+ stvx 7,31,11
+ .long 0x10E7E0C0
+ lvx 28,10,6
+ .long 0x10E740C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x7D402699
+ addi 4,4,16
+ vsldoi 9,8,8,8
+ .long 0x10C648C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x10A550C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x7D802699
+ addi 4,4,16
+ vsldoi 11,10,10,8
+ .long 0x108458C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x106360C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x7DC02699
+ addi 4,4,16
+ vsldoi 13,12,12,8
+ .long 0x104268C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x102170C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x7E002699
+ addi 4,4,16
+ vsldoi 15,14,14,8
+ .long 0x100078C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ .long 0x10E780C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x7E402699
+ addi 4,4,16
+ vsldoi 17,16,16,8
+ .long 0x10C688C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x10A590C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x7F002699
+ addi 4,4,16
+ vsldoi 19,18,18,8
+ .long 0x108498C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x1063C0C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x7F402699
+ addi 4,4,16
+ vsldoi 25,24,24,8
+ .long 0x1042C8C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x1021D0C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ vsldoi 27,26,26,8
+ .long 0x13C906C2
+ .long 0x1108F0C0
+ .long 0x13DA7EC2
+ .long 0x1108F0C0
+ .long 0x110888C0
+ .long 0x1000D8C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ mtctr 0
+ b .L16_xx
+.align 5
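+/*
+ * Sixteen rounds per trip through .L16_xx, with the message schedule
+ * expanded on the fly in vector registers; four trips cover rounds
+ * 16-79.
+ */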
+.L16_xx:
+ .long 0x13CA06C2
+ .long 0x1129F0C0
+ .long 0x13DB7EC2
+ .long 0x1129F0C0
+ .long 0x112990C0
+ .long 0x10E740C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x13CB06C2
+ .long 0x114AF0C0
+ .long 0x13C87EC2
+ .long 0x114AF0C0
+ .long 0x114A98C0
+ .long 0x10C648C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x13CC06C2
+ .long 0x116BF0C0
+ .long 0x13C97EC2
+ .long 0x116BF0C0
+ .long 0x116BC0C0
+ .long 0x10A550C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x13CD06C2
+ .long 0x118CF0C0
+ .long 0x13CA7EC2
+ .long 0x118CF0C0
+ .long 0x118CC8C0
+ .long 0x108458C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x13CE06C2
+ .long 0x11ADF0C0
+ .long 0x13CB7EC2
+ .long 0x11ADF0C0
+ .long 0x11ADD0C0
+ .long 0x106360C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x13CF06C2
+ .long 0x11CEF0C0
+ .long 0x13CC7EC2
+ .long 0x11CEF0C0
+ .long 0x11CED8C0
+ .long 0x104268C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13D006C2
+ .long 0x11EFF0C0
+ .long 0x13CD7EC2
+ .long 0x11EFF0C0
+ .long 0x11EF40C0
+ .long 0x102170C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x13D106C2
+ .long 0x1210F0C0
+ .long 0x13CE7EC2
+ .long 0x1210F0C0
+ .long 0x121048C0
+ .long 0x100078C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ .long 0x13D206C2
+ .long 0x1231F0C0
+ .long 0x13CF7EC2
+ .long 0x1231F0C0
+ .long 0x123150C0
+ .long 0x10E780C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x13D306C2
+ .long 0x1252F0C0
+ .long 0x13D07EC2
+ .long 0x1252F0C0
+ .long 0x125258C0
+ .long 0x10C688C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x13D806C2
+ .long 0x1273F0C0
+ .long 0x13D17EC2
+ .long 0x1273F0C0
+ .long 0x127360C0
+ .long 0x10A590C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x13D906C2
+ .long 0x1318F0C0
+ .long 0x13D27EC2
+ .long 0x1318F0C0
+ .long 0x131868C0
+ .long 0x108498C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x13DA06C2
+ .long 0x1339F0C0
+ .long 0x13D37EC2
+ .long 0x1339F0C0
+ .long 0x133970C0
+ .long 0x1063C0C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x13DB06C2
+ .long 0x135AF0C0
+ .long 0x13D87EC2
+ .long 0x135AF0C0
+ .long 0x135A78C0
+ .long 0x1042C8C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13C806C2
+ .long 0x137BF0C0
+ .long 0x13D97EC2
+ .long 0x137BF0C0
+ .long 0x137B80C0
+ .long 0x1021D0C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x13C906C2
+ .long 0x1108F0C0
+ .long 0x13DA7EC2
+ .long 0x1108F0C0
+ .long 0x110888C0
+ .long 0x1000D8C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ bdnz .L16_xx
+
+ lvx 10,0,11
+ subic. 5,5,1
+ lvx 11,10,11
+ .long 0x100050C0
+ lvx 12,26,11
+ .long 0x102158C0
+ lvx 13,27,11
+ .long 0x104260C0
+ lvx 14,28,11
+ .long 0x106368C0
+ lvx 15,29,11
+ .long 0x108470C0
+ lvx 16,30,11
+ .long 0x10A578C0
+ lvx 17,31,11
+ .long 0x10C680C0
+ .long 0x10E788C0
+ bne .Loop
+ vperm 0,0,1,28
+ vperm 2,2,3,28
+ vperm 4,4,5,28
+ vperm 6,6,7,28
+ .long 0x7C001F99
+ .long 0x7C4A1F99
+ .long 0x7C9A1F99
+ .long 0x7CDB1F99
+ addi 11,1,207
+ mtlr 8
+ mtspr 256,12
+ lvx 24,0,11
+ lvx 25,10,11
+ lvx 26,26,11
+ lvx 27,27,11
+ lvx 28,28,11
+ lvx 29,29,11
+ lvx 30,30,11
+ lvx 31,31,11
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,6,3,0
+.long 0
+.size .zfs_sha512_power8,.-.zfs_sha512_power8
+.size zfs_sha512_power8,.-.zfs_sha512_power8
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 6
+ addi 6,6,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
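+/*
+ * SHA-512 round constants K[0..79], each 64-bit value stored twice so
+ * one 16-byte vector load fills both doubleword lanes.  The zero pair
+ * and the trailing 0x00010203... entries look like padding plus the
+ * vperm byte-order mask used when packing the final state.
+ */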
+.long 0x428a2f98,0xd728ae22
+.long 0x428a2f98,0xd728ae22
+.long 0x71374491,0x23ef65cd
+.long 0x71374491,0x23ef65cd
+.long 0xb5c0fbcf,0xec4d3b2f
+.long 0xb5c0fbcf,0xec4d3b2f
+.long 0xe9b5dba5,0x8189dbbc
+.long 0xe9b5dba5,0x8189dbbc
+.long 0x3956c25b,0xf348b538
+.long 0x3956c25b,0xf348b538
+.long 0x59f111f1,0xb605d019
+.long 0x59f111f1,0xb605d019
+.long 0x923f82a4,0xaf194f9b
+.long 0x923f82a4,0xaf194f9b
+.long 0xab1c5ed5,0xda6d8118
+.long 0xab1c5ed5,0xda6d8118
+.long 0xd807aa98,0xa3030242
+.long 0xd807aa98,0xa3030242
+.long 0x12835b01,0x45706fbe
+.long 0x12835b01,0x45706fbe
+.long 0x243185be,0x4ee4b28c
+.long 0x243185be,0x4ee4b28c
+.long 0x550c7dc3,0xd5ffb4e2
+.long 0x550c7dc3,0xd5ffb4e2
+.long 0x72be5d74,0xf27b896f
+.long 0x72be5d74,0xf27b896f
+.long 0x80deb1fe,0x3b1696b1
+.long 0x80deb1fe,0x3b1696b1
+.long 0x9bdc06a7,0x25c71235
+.long 0x9bdc06a7,0x25c71235
+.long 0xc19bf174,0xcf692694
+.long 0xc19bf174,0xcf692694
+.long 0xe49b69c1,0x9ef14ad2
+.long 0xe49b69c1,0x9ef14ad2
+.long 0xefbe4786,0x384f25e3
+.long 0xefbe4786,0x384f25e3
+.long 0x0fc19dc6,0x8b8cd5b5
+.long 0x0fc19dc6,0x8b8cd5b5
+.long 0x240ca1cc,0x77ac9c65
+.long 0x240ca1cc,0x77ac9c65
+.long 0x2de92c6f,0x592b0275
+.long 0x2de92c6f,0x592b0275
+.long 0x4a7484aa,0x6ea6e483
+.long 0x4a7484aa,0x6ea6e483
+.long 0x5cb0a9dc,0xbd41fbd4
+.long 0x5cb0a9dc,0xbd41fbd4
+.long 0x76f988da,0x831153b5
+.long 0x76f988da,0x831153b5
+.long 0x983e5152,0xee66dfab
+.long 0x983e5152,0xee66dfab
+.long 0xa831c66d,0x2db43210
+.long 0xa831c66d,0x2db43210
+.long 0xb00327c8,0x98fb213f
+.long 0xb00327c8,0x98fb213f
+.long 0xbf597fc7,0xbeef0ee4
+.long 0xbf597fc7,0xbeef0ee4
+.long 0xc6e00bf3,0x3da88fc2
+.long 0xc6e00bf3,0x3da88fc2
+.long 0xd5a79147,0x930aa725
+.long 0xd5a79147,0x930aa725
+.long 0x06ca6351,0xe003826f
+.long 0x06ca6351,0xe003826f
+.long 0x14292967,0x0a0e6e70
+.long 0x14292967,0x0a0e6e70
+.long 0x27b70a85,0x46d22ffc
+.long 0x27b70a85,0x46d22ffc
+.long 0x2e1b2138,0x5c26c926
+.long 0x2e1b2138,0x5c26c926
+.long 0x4d2c6dfc,0x5ac42aed
+.long 0x4d2c6dfc,0x5ac42aed
+.long 0x53380d13,0x9d95b3df
+.long 0x53380d13,0x9d95b3df
+.long 0x650a7354,0x8baf63de
+.long 0x650a7354,0x8baf63de
+.long 0x766a0abb,0x3c77b2a8
+.long 0x766a0abb,0x3c77b2a8
+.long 0x81c2c92e,0x47edaee6
+.long 0x81c2c92e,0x47edaee6
+.long 0x92722c85,0x1482353b
+.long 0x92722c85,0x1482353b
+.long 0xa2bfe8a1,0x4cf10364
+.long 0xa2bfe8a1,0x4cf10364
+.long 0xa81a664b,0xbc423001
+.long 0xa81a664b,0xbc423001
+.long 0xc24b8b70,0xd0f89791
+.long 0xc24b8b70,0xd0f89791
+.long 0xc76c51a3,0x0654be30
+.long 0xc76c51a3,0x0654be30
+.long 0xd192e819,0xd6ef5218
+.long 0xd192e819,0xd6ef5218
+.long 0xd6990624,0x5565a910
+.long 0xd6990624,0x5565a910
+.long 0xf40e3585,0x5771202a
+.long 0xf40e3585,0x5771202a
+.long 0x106aa070,0x32bbd1b8
+.long 0x106aa070,0x32bbd1b8
+.long 0x19a4c116,0xb8d2d0c8
+.long 0x19a4c116,0xb8d2d0c8
+.long 0x1e376c08,0x5141ab53
+.long 0x1e376c08,0x5141ab53
+.long 0x2748774c,0xdf8eeb99
+.long 0x2748774c,0xdf8eeb99
+.long 0x34b0bcb5,0xe19b48a8
+.long 0x34b0bcb5,0xe19b48a8
+.long 0x391c0cb3,0xc5c95a63
+.long 0x391c0cb3,0xc5c95a63
+.long 0x4ed8aa4a,0xe3418acb
+.long 0x4ed8aa4a,0xe3418acb
+.long 0x5b9cca4f,0x7763e373
+.long 0x5b9cca4f,0x7763e373
+.long 0x682e6ff3,0xd6b2b8a3
+.long 0x682e6ff3,0xd6b2b8a3
+.long 0x748f82ee,0x5defb2fc
+.long 0x748f82ee,0x5defb2fc
+.long 0x78a5636f,0x43172f60
+.long 0x78a5636f,0x43172f60
+.long 0x84c87814,0xa1f0ab72
+.long 0x84c87814,0xa1f0ab72
+.long 0x8cc70208,0x1a6439ec
+.long 0x8cc70208,0x1a6439ec
+.long 0x90befffa,0x23631e28
+.long 0x90befffa,0x23631e28
+.long 0xa4506ceb,0xde82bde9
+.long 0xa4506ceb,0xde82bde9
+.long 0xbef9a3f7,0xb2c67915
+.long 0xbef9a3f7,0xb2c67915
+.long 0xc67178f2,0xe372532b
+.long 0xc67178f2,0xe372532b
+.long 0xca273ece,0xea26619c
+.long 0xca273ece,0xea26619c
+.long 0xd186b8c7,0x21c0c207
+.long 0xd186b8c7,0x21c0c207
+.long 0xeada7dd6,0xcde0eb1e
+.long 0xeada7dd6,0xcde0eb1e
+.long 0xf57d4f7f,0xee6ed178
+.long 0xf57d4f7f,0xee6ed178
+.long 0x06f067aa,0x72176fba
+.long 0x06f067aa,0x72176fba
+.long 0x0a637dc5,0xa2c898a6
+.long 0x0a637dc5,0xa2c898a6
+.long 0x113f9804,0xbef90dae
+.long 0x113f9804,0xbef90dae
+.long 0x1b710b35,0x131c471b
+.long 0x1b710b35,0x131c471b
+.long 0x28db77f5,0x23047d84
+.long 0x28db77f5,0x23047d84
+.long 0x32caab7b,0x40c72493
+.long 0x32caab7b,0x40c72493
+.long 0x3c9ebe0a,0x15c9bebc
+.long 0x3c9ebe0a,0x15c9bebc
+.long 0x431d67c4,0x9c100d4c
+.long 0x431d67c4,0x9c100d4c
+.long 0x4cc5d4be,0xcb3e42b6
+.long 0x4cc5d4be,0xcb3e42b6
+.long 0x597f299c,0xfc657e2a
+.long 0x597f299c,0xfc657e2a
+.long 0x5fcb6fab,0x3ad6faec
+.long 0x5fcb6fab,0x3ad6faec
+.long 0x6c44198c,0x4a475817
+.long 0x6c44198c,0x4a475817
+.long 0,0
+.long 0,0
+.long 0x00010203,0x04050607
+.long 0x10111213,0x14151617
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion 2
+.text
+
+.globl zfs_sha512_power8
+.type zfs_sha512_power8,@function
+.align 6
+zfs_sha512_power8:
+.localentry zfs_sha512_power8,0
+
+ stdu 1,-384(1)
+ mflr 8
+ li 10,207
+ li 11,223
+ stvx 24,10,1
+ addi 10,10,32
+ li 12,-1
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 11,-4096+255
+ stw 12,332(1)
+ li 10,0x10
+ std 26,336(1)
+ li 26,0x20
+ std 27,344(1)
+ li 27,0x30
+ std 28,352(1)
+ li 28,0x40
+ std 29,360(1)
+ li 29,0x50
+ std 30,368(1)
+ li 30,0x60
+ std 31,376(1)
+ li 31,0x70
+ std 8,400(1)
+ or 11,11,11
+
+ bl .LPICmeup
+ addi 11,1,79
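+/*
+ * Little-endian entry: build a permute mask in v31 (lvsl with offset 8,
+ * then vxor with splat 0x0f) that byte-reverses each 64-bit lane; the
+ * vperm instructions below apply it to swap the big-endian input into
+ * the lane order the 64-bit vector arithmetic expects.
+ */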
+ li 7,8
+ lvsl 31,0,7
+ vspltisb 28,0x0f
+ vxor 31,31,28
+ .long 0x7C001E99
+ .long 0x7C4A1E99
+ .long 0x7C9A1E99
+ vsldoi 1,0,0,8
+ .long 0x7CDB1E99
+ vsldoi 3,2,2,8
+ vsldoi 5,4,4,8
+ vsldoi 7,6,6,8
+ li 0,4
+ b .Loop
+.align 5
+.Loop:
+ lvx 28,0,6
+ .long 0x7D002699
+ addi 4,4,16
+ mr 7,6
+ stvx 0,0,11
+ stvx 1,10,11
+ stvx 2,26,11
+ stvx 3,27,11
+ stvx 4,28,11
+ stvx 5,29,11
+ stvx 6,30,11
+ stvx 7,31,11
+ .long 0x10E7E0C0
+ lvx 28,10,6
+ vperm 8,8,8,31
+ .long 0x10E740C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x7D402699
+ addi 4,4,16
+ vsldoi 9,8,8,8
+ .long 0x10C648C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ vperm 10,10,10,31
+ .long 0x10A550C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x7D802699
+ addi 4,4,16
+ vsldoi 11,10,10,8
+ .long 0x108458C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ vperm 12,12,12,31
+ .long 0x106360C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x7DC02699
+ addi 4,4,16
+ vsldoi 13,12,12,8
+ .long 0x104268C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ vperm 14,14,14,31
+ .long 0x102170C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x7E002699
+ addi 4,4,16
+ vsldoi 15,14,14,8
+ .long 0x100078C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ vperm 16,16,16,31
+ .long 0x10E780C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x7E402699
+ addi 4,4,16
+ vsldoi 17,16,16,8
+ .long 0x10C688C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ vperm 18,18,18,31
+ .long 0x10A590C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x7F002699
+ addi 4,4,16
+ vsldoi 19,18,18,8
+ .long 0x108498C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ vperm 24,24,24,31
+ .long 0x1063C0C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x7F402699
+ addi 4,4,16
+ vsldoi 25,24,24,8
+ .long 0x1042C8C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ vperm 26,26,26,31
+ .long 0x1021D0C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ vsldoi 27,26,26,8
+ .long 0x13C906C2
+ .long 0x1108F0C0
+ .long 0x13DA7EC2
+ .long 0x1108F0C0
+ .long 0x110888C0
+ .long 0x1000D8C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ mtctr 0
+ b .L16_xx
+.align 5
+.L16_xx:
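+/*
+ * Message-schedule loop: each pass performs sixteen rounds while
+ * expanding the schedule in place.  The .long directives throughout
+ * this file are raw encodings of POWER8 vector instructions (e.g.
+ * vshasigmad, vaddudm) so that the file still assembles with
+ * binutils that predate the POWER8 ISA.
+ */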
+ .long 0x13CA06C2
+ .long 0x1129F0C0
+ .long 0x13DB7EC2
+ .long 0x1129F0C0
+ .long 0x112990C0
+ .long 0x10E740C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x13CB06C2
+ .long 0x114AF0C0
+ .long 0x13C87EC2
+ .long 0x114AF0C0
+ .long 0x114A98C0
+ .long 0x10C648C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x13CC06C2
+ .long 0x116BF0C0
+ .long 0x13C97EC2
+ .long 0x116BF0C0
+ .long 0x116BC0C0
+ .long 0x10A550C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x13CD06C2
+ .long 0x118CF0C0
+ .long 0x13CA7EC2
+ .long 0x118CF0C0
+ .long 0x118CC8C0
+ .long 0x108458C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x13CE06C2
+ .long 0x11ADF0C0
+ .long 0x13CB7EC2
+ .long 0x11ADF0C0
+ .long 0x11ADD0C0
+ .long 0x106360C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x13CF06C2
+ .long 0x11CEF0C0
+ .long 0x13CC7EC2
+ .long 0x11CEF0C0
+ .long 0x11CED8C0
+ .long 0x104268C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13D006C2
+ .long 0x11EFF0C0
+ .long 0x13CD7EC2
+ .long 0x11EFF0C0
+ .long 0x11EF40C0
+ .long 0x102170C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x13D106C2
+ .long 0x1210F0C0
+ .long 0x13CE7EC2
+ .long 0x1210F0C0
+ .long 0x121048C0
+ .long 0x100078C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ .long 0x13D206C2
+ .long 0x1231F0C0
+ .long 0x13CF7EC2
+ .long 0x1231F0C0
+ .long 0x123150C0
+ .long 0x10E780C0
+ vsel 29,6,5,4
+ .long 0x10C6E0C0
+ .long 0x10E7E8C0
+ .long 0x13C4FEC2
+ .long 0x10E7F0C0
+ vxor 29,0,1
+ vsel 29,1,2,29
+ .long 0x106338C0
+ .long 0x13C086C2
+ .long 0x13DEE8C0
+ .long 0x10E7F0C0
+ lvx 28,26,7
+ .long 0x13D306C2
+ .long 0x1252F0C0
+ .long 0x13D07EC2
+ .long 0x1252F0C0
+ .long 0x125258C0
+ .long 0x10C688C0
+ vsel 29,5,4,3
+ .long 0x10A5E0C0
+ .long 0x10C6E8C0
+ .long 0x13C3FEC2
+ .long 0x10C6F0C0
+ vxor 29,7,0
+ vsel 29,0,1,29
+ .long 0x104230C0
+ .long 0x13C786C2
+ .long 0x13DEE8C0
+ .long 0x10C6F0C0
+ lvx 28,27,7
+ .long 0x13D806C2
+ .long 0x1273F0C0
+ .long 0x13D17EC2
+ .long 0x1273F0C0
+ .long 0x127360C0
+ .long 0x10A590C0
+ vsel 29,4,3,2
+ .long 0x1084E0C0
+ .long 0x10A5E8C0
+ .long 0x13C2FEC2
+ .long 0x10A5F0C0
+ vxor 29,6,7
+ vsel 29,7,0,29
+ .long 0x102128C0
+ .long 0x13C686C2
+ .long 0x13DEE8C0
+ .long 0x10A5F0C0
+ lvx 28,28,7
+ .long 0x13D906C2
+ .long 0x1318F0C0
+ .long 0x13D27EC2
+ .long 0x1318F0C0
+ .long 0x131868C0
+ .long 0x108498C0
+ vsel 29,3,2,1
+ .long 0x1063E0C0
+ .long 0x1084E8C0
+ .long 0x13C1FEC2
+ .long 0x1084F0C0
+ vxor 29,5,6
+ vsel 29,6,7,29
+ .long 0x100020C0
+ .long 0x13C586C2
+ .long 0x13DEE8C0
+ .long 0x1084F0C0
+ lvx 28,29,7
+ .long 0x13DA06C2
+ .long 0x1339F0C0
+ .long 0x13D37EC2
+ .long 0x1339F0C0
+ .long 0x133970C0
+ .long 0x1063C0C0
+ vsel 29,2,1,0
+ .long 0x1042E0C0
+ .long 0x1063E8C0
+ .long 0x13C0FEC2
+ .long 0x1063F0C0
+ vxor 29,4,5
+ vsel 29,5,6,29
+ .long 0x10E718C0
+ .long 0x13C486C2
+ .long 0x13DEE8C0
+ .long 0x1063F0C0
+ lvx 28,30,7
+ .long 0x13DB06C2
+ .long 0x135AF0C0
+ .long 0x13D87EC2
+ .long 0x135AF0C0
+ .long 0x135A78C0
+ .long 0x1042C8C0
+ vsel 29,1,0,7
+ .long 0x1021E0C0
+ .long 0x1042E8C0
+ .long 0x13C7FEC2
+ .long 0x1042F0C0
+ vxor 29,3,4
+ vsel 29,4,5,29
+ .long 0x10C610C0
+ .long 0x13C386C2
+ .long 0x13DEE8C0
+ .long 0x1042F0C0
+ lvx 28,31,7
+ addi 7,7,0x80
+ .long 0x13C806C2
+ .long 0x137BF0C0
+ .long 0x13D97EC2
+ .long 0x137BF0C0
+ .long 0x137B80C0
+ .long 0x1021D0C0
+ vsel 29,0,7,6
+ .long 0x1000E0C0
+ .long 0x1021E8C0
+ .long 0x13C6FEC2
+ .long 0x1021F0C0
+ vxor 29,2,3
+ vsel 29,3,4,29
+ .long 0x10A508C0
+ .long 0x13C286C2
+ .long 0x13DEE8C0
+ .long 0x1021F0C0
+ lvx 28,0,7
+ .long 0x13C906C2
+ .long 0x1108F0C0
+ .long 0x13DA7EC2
+ .long 0x1108F0C0
+ .long 0x110888C0
+ .long 0x1000D8C0
+ vsel 29,7,6,5
+ .long 0x10E7E0C0
+ .long 0x1000E8C0
+ .long 0x13C5FEC2
+ .long 0x1000F0C0
+ vxor 29,1,2
+ vsel 29,2,3,29
+ .long 0x108400C0
+ .long 0x13C186C2
+ .long 0x13DEE8C0
+ .long 0x1000F0C0
+ lvx 28,10,7
+ bdnz .L16_xx
+
+ lvx 10,0,11
+ subic. 5,5,1
+ lvx 11,10,11
+ .long 0x100050C0
+ lvx 12,26,11
+ .long 0x102158C0
+ lvx 13,27,11
+ .long 0x104260C0
+ lvx 14,28,11
+ .long 0x106368C0
+ lvx 15,29,11
+ .long 0x108470C0
+ lvx 16,30,11
+ .long 0x10A578C0
+ lvx 17,31,11
+ .long 0x10C680C0
+ .long 0x10E788C0
+ bne .Loop
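+/* All input consumed: merge the eight state doublewords pairwise,
+   store them back to the context, and restore the saved vector and
+   general-purpose registers before returning. */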
+ vperm 0,0,1,28
+ vperm 2,2,3,28
+ vperm 4,4,5,28
+ vperm 6,6,7,28
+ .long 0x7C001F99
+ .long 0x7C4A1F99
+ .long 0x7C9A1F99
+ .long 0x7CDB1F99
+ addi 11,1,207
+ mtlr 8
+ or 12,12,12
+ lvx 24,0,11
+ lvx 25,10,11
+ lvx 26,26,11
+ lvx 27,27,11
+ lvx 28,28,11
+ lvx 29,29,11
+ lvx 30,30,11
+ lvx 31,31,11
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,6,3,0
+.long 0
+.size zfs_sha512_power8,.-zfs_sha512_power8
+.align 6
+.LPICmeup:
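+/* Materialize the address of the constant table that follows via
+   branch-and-link, keeping the code position independent. */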
+ mflr 0
+ bcl 20,31,$+4
+ mflr 6
+ addi 6,6,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
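+/* SHA-512 round constants K[0..79], each replicated into both
+   doublewords of a vector register, followed by a zero terminator
+   and a byte-order permutation mask. */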
+.long 0xd728ae22,0x428a2f98
+.long 0xd728ae22,0x428a2f98
+.long 0x23ef65cd,0x71374491
+.long 0x23ef65cd,0x71374491
+.long 0xec4d3b2f,0xb5c0fbcf
+.long 0xec4d3b2f,0xb5c0fbcf
+.long 0x8189dbbc,0xe9b5dba5
+.long 0x8189dbbc,0xe9b5dba5
+.long 0xf348b538,0x3956c25b
+.long 0xf348b538,0x3956c25b
+.long 0xb605d019,0x59f111f1
+.long 0xb605d019,0x59f111f1
+.long 0xaf194f9b,0x923f82a4
+.long 0xaf194f9b,0x923f82a4
+.long 0xda6d8118,0xab1c5ed5
+.long 0xda6d8118,0xab1c5ed5
+.long 0xa3030242,0xd807aa98
+.long 0xa3030242,0xd807aa98
+.long 0x45706fbe,0x12835b01
+.long 0x45706fbe,0x12835b01
+.long 0x4ee4b28c,0x243185be
+.long 0x4ee4b28c,0x243185be
+.long 0xd5ffb4e2,0x550c7dc3
+.long 0xd5ffb4e2,0x550c7dc3
+.long 0xf27b896f,0x72be5d74
+.long 0xf27b896f,0x72be5d74
+.long 0x3b1696b1,0x80deb1fe
+.long 0x3b1696b1,0x80deb1fe
+.long 0x25c71235,0x9bdc06a7
+.long 0x25c71235,0x9bdc06a7
+.long 0xcf692694,0xc19bf174
+.long 0xcf692694,0xc19bf174
+.long 0x9ef14ad2,0xe49b69c1
+.long 0x9ef14ad2,0xe49b69c1
+.long 0x384f25e3,0xefbe4786
+.long 0x384f25e3,0xefbe4786
+.long 0x8b8cd5b5,0x0fc19dc6
+.long 0x8b8cd5b5,0x0fc19dc6
+.long 0x77ac9c65,0x240ca1cc
+.long 0x77ac9c65,0x240ca1cc
+.long 0x592b0275,0x2de92c6f
+.long 0x592b0275,0x2de92c6f
+.long 0x6ea6e483,0x4a7484aa
+.long 0x6ea6e483,0x4a7484aa
+.long 0xbd41fbd4,0x5cb0a9dc
+.long 0xbd41fbd4,0x5cb0a9dc
+.long 0x831153b5,0x76f988da
+.long 0x831153b5,0x76f988da
+.long 0xee66dfab,0x983e5152
+.long 0xee66dfab,0x983e5152
+.long 0x2db43210,0xa831c66d
+.long 0x2db43210,0xa831c66d
+.long 0x98fb213f,0xb00327c8
+.long 0x98fb213f,0xb00327c8
+.long 0xbeef0ee4,0xbf597fc7
+.long 0xbeef0ee4,0xbf597fc7
+.long 0x3da88fc2,0xc6e00bf3
+.long 0x3da88fc2,0xc6e00bf3
+.long 0x930aa725,0xd5a79147
+.long 0x930aa725,0xd5a79147
+.long 0xe003826f,0x06ca6351
+.long 0xe003826f,0x06ca6351
+.long 0x0a0e6e70,0x14292967
+.long 0x0a0e6e70,0x14292967
+.long 0x46d22ffc,0x27b70a85
+.long 0x46d22ffc,0x27b70a85
+.long 0x5c26c926,0x2e1b2138
+.long 0x5c26c926,0x2e1b2138
+.long 0x5ac42aed,0x4d2c6dfc
+.long 0x5ac42aed,0x4d2c6dfc
+.long 0x9d95b3df,0x53380d13
+.long 0x9d95b3df,0x53380d13
+.long 0x8baf63de,0x650a7354
+.long 0x8baf63de,0x650a7354
+.long 0x3c77b2a8,0x766a0abb
+.long 0x3c77b2a8,0x766a0abb
+.long 0x47edaee6,0x81c2c92e
+.long 0x47edaee6,0x81c2c92e
+.long 0x1482353b,0x92722c85
+.long 0x1482353b,0x92722c85
+.long 0x4cf10364,0xa2bfe8a1
+.long 0x4cf10364,0xa2bfe8a1
+.long 0xbc423001,0xa81a664b
+.long 0xbc423001,0xa81a664b
+.long 0xd0f89791,0xc24b8b70
+.long 0xd0f89791,0xc24b8b70
+.long 0x0654be30,0xc76c51a3
+.long 0x0654be30,0xc76c51a3
+.long 0xd6ef5218,0xd192e819
+.long 0xd6ef5218,0xd192e819
+.long 0x5565a910,0xd6990624
+.long 0x5565a910,0xd6990624
+.long 0x5771202a,0xf40e3585
+.long 0x5771202a,0xf40e3585
+.long 0x32bbd1b8,0x106aa070
+.long 0x32bbd1b8,0x106aa070
+.long 0xb8d2d0c8,0x19a4c116
+.long 0xb8d2d0c8,0x19a4c116
+.long 0x5141ab53,0x1e376c08
+.long 0x5141ab53,0x1e376c08
+.long 0xdf8eeb99,0x2748774c
+.long 0xdf8eeb99,0x2748774c
+.long 0xe19b48a8,0x34b0bcb5
+.long 0xe19b48a8,0x34b0bcb5
+.long 0xc5c95a63,0x391c0cb3
+.long 0xc5c95a63,0x391c0cb3
+.long 0xe3418acb,0x4ed8aa4a
+.long 0xe3418acb,0x4ed8aa4a
+.long 0x7763e373,0x5b9cca4f
+.long 0x7763e373,0x5b9cca4f
+.long 0xd6b2b8a3,0x682e6ff3
+.long 0xd6b2b8a3,0x682e6ff3
+.long 0x5defb2fc,0x748f82ee
+.long 0x5defb2fc,0x748f82ee
+.long 0x43172f60,0x78a5636f
+.long 0x43172f60,0x78a5636f
+.long 0xa1f0ab72,0x84c87814
+.long 0xa1f0ab72,0x84c87814
+.long 0x1a6439ec,0x8cc70208
+.long 0x1a6439ec,0x8cc70208
+.long 0x23631e28,0x90befffa
+.long 0x23631e28,0x90befffa
+.long 0xde82bde9,0xa4506ceb
+.long 0xde82bde9,0xa4506ceb
+.long 0xb2c67915,0xbef9a3f7
+.long 0xb2c67915,0xbef9a3f7
+.long 0xe372532b,0xc67178f2
+.long 0xe372532b,0xc67178f2
+.long 0xea26619c,0xca273ece
+.long 0xea26619c,0xca273ece
+.long 0x21c0c207,0xd186b8c7
+.long 0x21c0c207,0xd186b8c7
+.long 0xcde0eb1e,0xeada7dd6
+.long 0xcde0eb1e,0xeada7dd6
+.long 0xee6ed178,0xf57d4f7f
+.long 0xee6ed178,0xf57d4f7f
+.long 0x72176fba,0x06f067aa
+.long 0x72176fba,0x06f067aa
+.long 0xa2c898a6,0x0a637dc5
+.long 0xa2c898a6,0x0a637dc5
+.long 0xbef90dae,0x113f9804
+.long 0xbef90dae,0x113f9804
+.long 0x131c471b,0x1b710b35
+.long 0x131c471b,0x1b710b35
+.long 0x23047d84,0x28db77f5
+.long 0x23047d84,0x28db77f5
+.long 0x40c72493,0x32caab7b
+.long 0x40c72493,0x32caab7b
+.long 0x15c9bebc,0x3c9ebe0a
+.long 0x15c9bebc,0x3c9ebe0a
+.long 0x9c100d4c,0x431d67c4
+.long 0x9c100d4c,0x431d67c4
+.long 0xcb3e42b6,0x4cc5d4be
+.long 0xcb3e42b6,0x4cc5d4be
+.long 0xfc657e2a,0x597f299c
+.long 0xfc657e2a,0x597f299c
+.long 0x3ad6faec,0x5fcb6fab
+.long 0x3ad6faec,0x5fcb6fab
+.long 0x4a475817,0x6c44198c
+.long 0x4a475817,0x6c44198c
+.long 0,0
+.long 0,0
+.long 0x14151617,0x10111213
+.long 0x04050607,0x00010203
+
+#endif
diff --git a/module/icp/asm-ppc64/sha2/sha512-ppc.S b/module/icp/asm-ppc64/sha2/sha512-ppc.S
new file mode 100644
index 000000000..37070115c
--- /dev/null
+++ b/module/icp/asm-ppc64/sha2/sha512-ppc.S
@@ -0,0 +1,2958 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha512_ppc
+.globl .zfs_sha512_ppc
+.type zfs_sha512_ppc,@function
+.section ".opd","aw"
+.align 3
+zfs_sha512_ppc:
+.quad .zfs_sha512_ppc,.TOC.@tocbase,0
+.previous
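+/*
+ * SHA-512 block transform, big-endian ELFv1 variant.  On entry:
+ * r3 points at the eight-doubleword hash state, r4 at the input,
+ * and r5 holds the number of 128-byte blocks (scaled to a byte
+ * count by the sldi in the prologue).
+ */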
+.align 6
+.zfs_sha512_ppc:
+ stdu 1,-384(1)
+ mflr 0
+ sldi 5,5,7
+
+ std 3,208(1)
+
+ std 14,240(1)
+ std 15,248(1)
+ std 16,256(1)
+ std 17,264(1)
+ std 18,272(1)
+ std 19,280(1)
+ std 20,288(1)
+ std 21,296(1)
+ std 22,304(1)
+ std 23,312(1)
+ std 24,320(1)
+ std 25,328(1)
+ std 26,336(1)
+ std 27,344(1)
+ std 28,352(1)
+ std 29,360(1)
+ std 30,368(1)
+ std 31,376(1)
+ std 0,400(1)
+ ld 8,0(3)
+ mr 31,4
+ ld 9,8(3)
+ ld 10,16(3)
+ ld 11,24(3)
+ ld 12,32(3)
+ ld 6,40(3)
+ ld 14,48(3)
+ ld 15,56(3)
+ bl .LPICmeup
+.LPICedup:
+ andi. 0,31,3
+ bne .Lunaligned
+.Laligned:
+ add 5,31,5
+ std 5,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+ b .Ldone
+
+
+.align 4
+.Lunaligned:
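+/* Unaligned input: unaligned loads are fine as long as they do not
+   cross a page boundary, so hash in place up to the last full block
+   before the page end, then copy the straddling block to an aligned
+   stack buffer (.Lmemcpy) and hash it from there. */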
+ subfic 0,31,4096
+ andi. 0,0,3968
+ beq .Lcross_page
+ cmpld 5,0
+ ble .Laligned
+ subfc 5,0,5
+ add 0,31,0
+ std 5,184(1)
+ std 0,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+
+ ld 5,184(1)
+.Lcross_page:
+ li 0,32
+ mtctr 0
+ addi 20,1,48
+.Lmemcpy:
+ lbz 16,0(31)
+ lbz 17,1(31)
+ lbz 18,2(31)
+ lbz 19,3(31)
+ addi 31,31,4
+ stb 16,0(20)
+ stb 17,1(20)
+ stb 18,2(20)
+ stb 19,3(20)
+ addi 20,20,4
+ bdnz .Lmemcpy
+ std 31,176(1)
+ addi 0,1,176
+ addi 31,1,48
+ std 5,184(1)
+ std 0,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+ ld 31,176(1)
+ ld 5,184(1)
+ addic. 5,5,-128
+ bne .Lunaligned
+
+.Ldone:
+ ld 0,400(1)
+ ld 14,240(1)
+ ld 15,248(1)
+ ld 16,256(1)
+ ld 17,264(1)
+ ld 18,272(1)
+ ld 19,280(1)
+ ld 20,288(1)
+ ld 21,296(1)
+ ld 22,304(1)
+ ld 23,312(1)
+ ld 24,320(1)
+ ld 25,328(1)
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ mtlr 0
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,18,3,0
+.long 0
+.align 4
+.Lsha2_block_private:
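+/* Compress one 128-byte block.  Each message doubleword is loaded
+   as two 32-bit halves and merged with insrdi; rounds 0-15 are
+   unrolled here and rounds 16-79 run through .Lrounds below. */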
+ ld 0,0(7)
+ lwz 5,0(31)
+ lwz 16,4(31)
+ insrdi 16,5,32,0
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ ld 0,8(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 5,8(31)
+ lwz 17,12(31)
+ insrdi 17,5,32,0
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ ld 0,16(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 5,16(31)
+ lwz 18,20(31)
+ insrdi 18,5,32,0
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ ld 0,24(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 5,24(31)
+ lwz 19,28(31)
+ insrdi 19,5,32,0
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ ld 0,32(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 5,32(31)
+ lwz 20,36(31)
+ insrdi 20,5,32,0
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ ld 0,40(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 5,40(31)
+ lwz 21,44(31)
+ insrdi 21,5,32,0
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ ld 0,48(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 5,48(31)
+ lwz 22,52(31)
+ insrdi 22,5,32,0
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ ld 0,56(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 5,56(31)
+ lwz 23,60(31)
+ insrdi 23,5,32,0
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ ld 0,64(7)
+ add 8,8,3
+ add 8,8,5
+
+ lwz 5,64(31)
+ lwz 24,68(31)
+ insrdi 24,5,32,0
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ ld 0,72(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 5,72(31)
+ lwz 25,76(31)
+ insrdi 25,5,32,0
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ ld 0,80(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 5,80(31)
+ lwz 26,84(31)
+ insrdi 26,5,32,0
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ ld 0,88(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 5,88(31)
+ lwz 27,92(31)
+ insrdi 27,5,32,0
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ ld 0,96(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 5,96(31)
+ lwz 28,100(31)
+ insrdi 28,5,32,0
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ ld 0,104(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 5,104(31)
+ lwz 29,108(31)
+ insrdi 29,5,32,0
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ ld 0,112(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 5,112(31)
+ lwz 30,116(31)
+ insrdi 30,5,32,0
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ ld 0,120(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 5,120(31)
+ lwz 31,124(31)
+ insrdi 31,5,32,0
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ li 5,4
+ mtctr 5
+.align 4
+.Lrounds:
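+/* Rounds 16-79: four trips of sixteen rounds each, expanding the
+   message schedule (sigma0/sigma1) on the fly. */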
+ addi 7,7,128
+ rotrdi 3,17,1
+ rotrdi 4,17,8
+ rotrdi 5,30,19
+ rotrdi 0,30,61
+ xor 3,3,4
+ srdi 4,17,7
+ xor 5,5,0
+ srdi 0,30,6
+ add 16,16,25
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,0(7)
+ add 16,16,3
+ add 16,16,5
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrdi 3,18,1
+ rotrdi 4,18,8
+ rotrdi 5,31,19
+ rotrdi 0,31,61
+ xor 3,3,4
+ srdi 4,18,7
+ xor 5,5,0
+ srdi 0,31,6
+ add 17,17,26
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,8(7)
+ add 17,17,3
+ add 17,17,5
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrdi 3,19,1
+ rotrdi 4,19,8
+ rotrdi 5,16,19
+ rotrdi 0,16,61
+ xor 3,3,4
+ srdi 4,19,7
+ xor 5,5,0
+ srdi 0,16,6
+ add 18,18,27
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,16(7)
+ add 18,18,3
+ add 18,18,5
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrdi 3,20,1
+ rotrdi 4,20,8
+ rotrdi 5,17,19
+ rotrdi 0,17,61
+ xor 3,3,4
+ srdi 4,20,7
+ xor 5,5,0
+ srdi 0,17,6
+ add 19,19,28
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,24(7)
+ add 19,19,3
+ add 19,19,5
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrdi 3,21,1
+ rotrdi 4,21,8
+ rotrdi 5,18,19
+ rotrdi 0,18,61
+ xor 3,3,4
+ srdi 4,21,7
+ xor 5,5,0
+ srdi 0,18,6
+ add 20,20,29
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,32(7)
+ add 20,20,3
+ add 20,20,5
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrdi 3,22,1
+ rotrdi 4,22,8
+ rotrdi 5,19,19
+ rotrdi 0,19,61
+ xor 3,3,4
+ srdi 4,22,7
+ xor 5,5,0
+ srdi 0,19,6
+ add 21,21,30
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,40(7)
+ add 21,21,3
+ add 21,21,5
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrdi 3,23,1
+ rotrdi 4,23,8
+ rotrdi 5,20,19
+ rotrdi 0,20,61
+ xor 3,3,4
+ srdi 4,23,7
+ xor 5,5,0
+ srdi 0,20,6
+ add 22,22,31
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,48(7)
+ add 22,22,3
+ add 22,22,5
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrdi 3,24,1
+ rotrdi 4,24,8
+ rotrdi 5,21,19
+ rotrdi 0,21,61
+ xor 3,3,4
+ srdi 4,24,7
+ xor 5,5,0
+ srdi 0,21,6
+ add 23,23,16
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,56(7)
+ add 23,23,3
+ add 23,23,5
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ rotrdi 3,25,1
+ rotrdi 4,25,8
+ rotrdi 5,22,19
+ rotrdi 0,22,61
+ xor 3,3,4
+ srdi 4,25,7
+ xor 5,5,0
+ srdi 0,22,6
+ add 24,24,17
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,64(7)
+ add 24,24,3
+ add 24,24,5
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrdi 3,26,1
+ rotrdi 4,26,8
+ rotrdi 5,23,19
+ rotrdi 0,23,61
+ xor 3,3,4
+ srdi 4,26,7
+ xor 5,5,0
+ srdi 0,23,6
+ add 25,25,18
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,72(7)
+ add 25,25,3
+ add 25,25,5
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrdi 3,27,1
+ rotrdi 4,27,8
+ rotrdi 5,24,19
+ rotrdi 0,24,61
+ xor 3,3,4
+ srdi 4,27,7
+ xor 5,5,0
+ srdi 0,24,6
+ add 26,26,19
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,80(7)
+ add 26,26,3
+ add 26,26,5
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrdi 3,28,1
+ rotrdi 4,28,8
+ rotrdi 5,25,19
+ rotrdi 0,25,61
+ xor 3,3,4
+ srdi 4,28,7
+ xor 5,5,0
+ srdi 0,25,6
+ add 27,27,20
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,88(7)
+ add 27,27,3
+ add 27,27,5
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrdi 3,29,1
+ rotrdi 4,29,8
+ rotrdi 5,26,19
+ rotrdi 0,26,61
+ xor 3,3,4
+ srdi 4,29,7
+ xor 5,5,0
+ srdi 0,26,6
+ add 28,28,21
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,96(7)
+ add 28,28,3
+ add 28,28,5
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrdi 3,30,1
+ rotrdi 4,30,8
+ rotrdi 5,27,19
+ rotrdi 0,27,61
+ xor 3,3,4
+ srdi 4,30,7
+ xor 5,5,0
+ srdi 0,27,6
+ add 29,29,22
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,104(7)
+ add 29,29,3
+ add 29,29,5
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrdi 3,31,1
+ rotrdi 4,31,8
+ rotrdi 5,28,19
+ rotrdi 0,28,61
+ xor 3,3,4
+ srdi 4,31,7
+ xor 5,5,0
+ srdi 0,28,6
+ add 30,30,23
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,112(7)
+ add 30,30,3
+ add 30,30,5
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrdi 3,16,1
+ rotrdi 4,16,8
+ rotrdi 5,29,19
+ rotrdi 0,29,61
+ xor 3,3,4
+ srdi 4,16,7
+ xor 5,5,0
+ srdi 0,29,6
+ add 31,31,24
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,120(7)
+ add 31,31,3
+ add 31,31,5
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ bdnz .Lrounds
+
+ ld 3,208(1)
+ ld 31,200(1)
+ ld 5,192(1)
+ subi 7,7,512
+
+ ld 16,0(3)
+ ld 17,8(3)
+ ld 18,16(3)
+ ld 19,24(3)
+ ld 20,32(3)
+ ld 21,40(3)
+ ld 22,48(3)
+ addi 31,31,128
+ ld 23,56(3)
+ add 8,8,16
+ add 9,9,17
+ std 31,200(1)
+ add 10,10,18
+ std 8,0(3)
+ add 11,11,19
+ std 9,8(3)
+ add 12,12,20
+ std 10,16(3)
+ add 6,6,21
+ std 11,24(3)
+ add 14,14,22
+ std 12,32(3)
+ add 15,15,23
+ std 6,40(3)
+ std 14,48(3)
+ cmpld 31,5
+ std 15,56(3)
+ bne .Lsha2_block_private
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.size .zfs_sha512_ppc,.-.zfs_sha512_ppc
+.size zfs_sha512_ppc,.-.zfs_sha512_ppc
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 7
+ addi 7,7,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
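+/* SHA-512 round constants K[0..79], one 64-bit constant per line
+   stored as high and low 32-bit words. */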
+.long 0x428a2f98,0xd728ae22
+.long 0x71374491,0x23ef65cd
+.long 0xb5c0fbcf,0xec4d3b2f
+.long 0xe9b5dba5,0x8189dbbc
+.long 0x3956c25b,0xf348b538
+.long 0x59f111f1,0xb605d019
+.long 0x923f82a4,0xaf194f9b
+.long 0xab1c5ed5,0xda6d8118
+.long 0xd807aa98,0xa3030242
+.long 0x12835b01,0x45706fbe
+.long 0x243185be,0x4ee4b28c
+.long 0x550c7dc3,0xd5ffb4e2
+.long 0x72be5d74,0xf27b896f
+.long 0x80deb1fe,0x3b1696b1
+.long 0x9bdc06a7,0x25c71235
+.long 0xc19bf174,0xcf692694
+.long 0xe49b69c1,0x9ef14ad2
+.long 0xefbe4786,0x384f25e3
+.long 0x0fc19dc6,0x8b8cd5b5
+.long 0x240ca1cc,0x77ac9c65
+.long 0x2de92c6f,0x592b0275
+.long 0x4a7484aa,0x6ea6e483
+.long 0x5cb0a9dc,0xbd41fbd4
+.long 0x76f988da,0x831153b5
+.long 0x983e5152,0xee66dfab
+.long 0xa831c66d,0x2db43210
+.long 0xb00327c8,0x98fb213f
+.long 0xbf597fc7,0xbeef0ee4
+.long 0xc6e00bf3,0x3da88fc2
+.long 0xd5a79147,0x930aa725
+.long 0x06ca6351,0xe003826f
+.long 0x14292967,0x0a0e6e70
+.long 0x27b70a85,0x46d22ffc
+.long 0x2e1b2138,0x5c26c926
+.long 0x4d2c6dfc,0x5ac42aed
+.long 0x53380d13,0x9d95b3df
+.long 0x650a7354,0x8baf63de
+.long 0x766a0abb,0x3c77b2a8
+.long 0x81c2c92e,0x47edaee6
+.long 0x92722c85,0x1482353b
+.long 0xa2bfe8a1,0x4cf10364
+.long 0xa81a664b,0xbc423001
+.long 0xc24b8b70,0xd0f89791
+.long 0xc76c51a3,0x0654be30
+.long 0xd192e819,0xd6ef5218
+.long 0xd6990624,0x5565a910
+.long 0xf40e3585,0x5771202a
+.long 0x106aa070,0x32bbd1b8
+.long 0x19a4c116,0xb8d2d0c8
+.long 0x1e376c08,0x5141ab53
+.long 0x2748774c,0xdf8eeb99
+.long 0x34b0bcb5,0xe19b48a8
+.long 0x391c0cb3,0xc5c95a63
+.long 0x4ed8aa4a,0xe3418acb
+.long 0x5b9cca4f,0x7763e373
+.long 0x682e6ff3,0xd6b2b8a3
+.long 0x748f82ee,0x5defb2fc
+.long 0x78a5636f,0x43172f60
+.long 0x84c87814,0xa1f0ab72
+.long 0x8cc70208,0x1a6439ec
+.long 0x90befffa,0x23631e28
+.long 0xa4506ceb,0xde82bde9
+.long 0xbef9a3f7,0xb2c67915
+.long 0xc67178f2,0xe372532b
+.long 0xca273ece,0xea26619c
+.long 0xd186b8c7,0x21c0c207
+.long 0xeada7dd6,0xcde0eb1e
+.long 0xf57d4f7f,0xee6ed178
+.long 0x06f067aa,0x72176fba
+.long 0x0a637dc5,0xa2c898a6
+.long 0x113f9804,0xbef90dae
+.long 0x1b710b35,0x131c471b
+.long 0x28db77f5,0x23047d84
+.long 0x32caab7b,0x40c72493
+.long 0x3c9ebe0a,0x15c9bebc
+.long 0x431d67c4,0x9c100d4c
+.long 0x4cc5d4be,0xcb3e42b6
+.long 0x597f299c,0xfc657e2a
+.long 0x5fcb6fab,0x3ad6faec
+.long 0x6c44198c,0x4a475817
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion 2
+.text
+
+.globl zfs_sha512_ppc
+.type zfs_sha512_ppc,@function
+.align 6
+zfs_sha512_ppc:
+.localentry zfs_sha512_ppc,0
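+/* Little-endian ELFv2 variant; same register contract as the
+   big-endian code above: r3 = state, r4 = input, r5 = number of
+   128-byte blocks. */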
+
+ stdu 1,-384(1)
+ mflr 0
+ sldi 5,5,7
+
+ std 3,208(1)
+
+ std 14,240(1)
+ std 15,248(1)
+ std 16,256(1)
+ std 17,264(1)
+ std 18,272(1)
+ std 19,280(1)
+ std 20,288(1)
+ std 21,296(1)
+ std 22,304(1)
+ std 23,312(1)
+ std 24,320(1)
+ std 25,328(1)
+ std 26,336(1)
+ std 27,344(1)
+ std 28,352(1)
+ std 29,360(1)
+ std 30,368(1)
+ std 31,376(1)
+ std 0,400(1)
+ ld 8,0(3)
+ mr 31,4
+ ld 9,8(3)
+ ld 10,16(3)
+ ld 11,24(3)
+ ld 12,32(3)
+ ld 6,40(3)
+ ld 14,48(3)
+ ld 15,56(3)
+ bl .LPICmeup
+.LPICedup:
+ andi. 0,31,3
+ bne .Lunaligned
+.Laligned:
+ add 5,31,5
+ std 5,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+ b .Ldone
+
+.align 4
+.Lunaligned:
+ subfic 0,31,4096
+ andi. 0,0,3968
+ beq .Lcross_page
+ cmpld 5,0
+ ble .Laligned
+ subfc 5,0,5
+ add 0,31,0
+ std 5,184(1)
+ std 0,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+
+ ld 5,184(1)
+.Lcross_page:
+ li 0,32
+ mtctr 0
+ addi 20,1,48
+.Lmemcpy:
+ lbz 16,0(31)
+ lbz 17,1(31)
+ lbz 18,2(31)
+ lbz 19,3(31)
+ addi 31,31,4
+ stb 16,0(20)
+ stb 17,1(20)
+ stb 18,2(20)
+ stb 19,3(20)
+ addi 20,20,4
+ bdnz .Lmemcpy
+ std 31,176(1)
+ addi 0,1,176
+ addi 31,1,48
+ std 5,184(1)
+ std 0,192(1)
+ std 31,200(1)
+ bl .Lsha2_block_private
+ ld 31,176(1)
+ ld 5,184(1)
+ addic. 5,5,-128
+ bne .Lunaligned
+
+.Ldone:
+ ld 0,400(1)
+ ld 14,240(1)
+ ld 15,248(1)
+ ld 16,256(1)
+ ld 17,264(1)
+ ld 18,272(1)
+ ld 19,280(1)
+ ld 20,288(1)
+ ld 21,296(1)
+ ld 22,304(1)
+ ld 23,312(1)
+ ld 24,320(1)
+ ld 25,328(1)
+ ld 26,336(1)
+ ld 27,344(1)
+ ld 28,352(1)
+ ld 29,360(1)
+ ld 30,368(1)
+ ld 31,376(1)
+ mtlr 0
+ addi 1,1,384
+ blr
+.long 0
+.byte 0,12,4,1,0x80,18,3,0
+.long 0
+.align 4
+.Lsha2_block_private:
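+/* As in the big-endian path, except each 32-bit half is byte
+   reversed with rotlwi/rlwimi before the halves are merged, since
+   SHA-512 message words are big-endian. */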
+ ld 0,0(7)
+ lwz 3,0(31)
+ lwz 4,4(31)
+ rotlwi 5,3,8
+ rotlwi 16,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 16,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 16,4,24,16,23
+ insrdi 16,5,32,0
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ ld 0,8(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 3,8(31)
+ lwz 4,12(31)
+ rotlwi 5,3,8
+ rotlwi 17,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 17,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 17,4,24,16,23
+ insrdi 17,5,32,0
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ ld 0,16(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 3,16(31)
+ lwz 4,20(31)
+ rotlwi 5,3,8
+ rotlwi 18,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 18,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 18,4,24,16,23
+ insrdi 18,5,32,0
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ ld 0,24(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 3,24(31)
+ lwz 4,28(31)
+ rotlwi 5,3,8
+ rotlwi 19,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 19,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 19,4,24,16,23
+ insrdi 19,5,32,0
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ ld 0,32(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 3,32(31)
+ lwz 4,36(31)
+ rotlwi 5,3,8
+ rotlwi 20,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 20,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 20,4,24,16,23
+ insrdi 20,5,32,0
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ ld 0,40(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 3,40(31)
+ lwz 4,44(31)
+ rotlwi 5,3,8
+ rotlwi 21,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 21,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 21,4,24,16,23
+ insrdi 21,5,32,0
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ ld 0,48(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 3,48(31)
+ lwz 4,52(31)
+ rotlwi 5,3,8
+ rotlwi 22,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 22,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 22,4,24,16,23
+ insrdi 22,5,32,0
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ ld 0,56(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 3,56(31)
+ lwz 4,60(31)
+ rotlwi 5,3,8
+ rotlwi 23,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 23,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 23,4,24,16,23
+ insrdi 23,5,32,0
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ ld 0,64(7)
+ add 8,8,3
+ add 8,8,5
+
+ lwz 3,64(31)
+ lwz 4,68(31)
+ rotlwi 5,3,8
+ rotlwi 24,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 24,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 24,4,24,16,23
+ insrdi 24,5,32,0
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ ld 0,72(7)
+ add 15,15,3
+ add 15,15,5
+
+ lwz 3,72(31)
+ lwz 4,76(31)
+ rotlwi 5,3,8
+ rotlwi 25,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 25,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 25,4,24,16,23
+ insrdi 25,5,32,0
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ ld 0,80(7)
+ add 14,14,3
+ add 14,14,5
+
+ lwz 3,80(31)
+ lwz 4,84(31)
+ rotlwi 5,3,8
+ rotlwi 26,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 26,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 26,4,24,16,23
+ insrdi 26,5,32,0
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ ld 0,88(7)
+ add 6,6,3
+ add 6,6,5
+
+ lwz 3,88(31)
+ lwz 4,92(31)
+ rotlwi 5,3,8
+ rotlwi 27,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 27,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 27,4,24,16,23
+ insrdi 27,5,32,0
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ ld 0,96(7)
+ add 12,12,3
+ add 12,12,5
+
+ lwz 3,96(31)
+ lwz 4,100(31)
+ rotlwi 5,3,8
+ rotlwi 28,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 28,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 28,4,24,16,23
+ insrdi 28,5,32,0
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ ld 0,104(7)
+ add 11,11,3
+ add 11,11,5
+
+ lwz 3,104(31)
+ lwz 4,108(31)
+ rotlwi 5,3,8
+ rotlwi 29,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 29,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 29,4,24,16,23
+ insrdi 29,5,32,0
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ ld 0,112(7)
+ add 10,10,3
+ add 10,10,5
+
+ lwz 3,112(31)
+ lwz 4,116(31)
+ rotlwi 5,3,8
+ rotlwi 30,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 30,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 30,4,24,16,23
+ insrdi 30,5,32,0
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ ld 0,120(7)
+ add 9,9,3
+ add 9,9,5
+
+ lwz 3,120(31)
+ lwz 4,124(31)
+ rotlwi 5,3,8
+ rotlwi 31,4,8
+ rlwimi 5,3,24,0,7
+ rlwimi 31,4,24,0,7
+ rlwimi 5,3,24,16,23
+ rlwimi 31,4,24,16,23
+ insrdi 31,5,32,0
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ li 5,4
+ mtctr 5
+.align 4
+.Lrounds:
+ addi 7,7,128
+ rotrdi 3,17,1
+ rotrdi 4,17,8
+ rotrdi 5,30,19
+ rotrdi 0,30,61
+ xor 3,3,4
+ srdi 4,17,7
+ xor 5,5,0
+ srdi 0,30,6
+ add 16,16,25
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,0(7)
+ add 16,16,3
+ add 16,16,5
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,16
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrdi 3,18,1
+ rotrdi 4,18,8
+ rotrdi 5,31,19
+ rotrdi 0,31,61
+ xor 3,3,4
+ srdi 4,18,7
+ xor 5,5,0
+ srdi 0,31,6
+ add 17,17,26
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,8(7)
+ add 17,17,3
+ add 17,17,5
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,17
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrdi 3,19,1
+ rotrdi 4,19,8
+ rotrdi 5,16,19
+ rotrdi 0,16,61
+ xor 3,3,4
+ srdi 4,19,7
+ xor 5,5,0
+ srdi 0,16,6
+ add 18,18,27
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,16(7)
+ add 18,18,3
+ add 18,18,5
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,18
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrdi 3,20,1
+ rotrdi 4,20,8
+ rotrdi 5,17,19
+ rotrdi 0,17,61
+ xor 3,3,4
+ srdi 4,20,7
+ xor 5,5,0
+ srdi 0,17,6
+ add 19,19,28
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,24(7)
+ add 19,19,3
+ add 19,19,5
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,19
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrdi 3,21,1
+ rotrdi 4,21,8
+ rotrdi 5,18,19
+ rotrdi 0,18,61
+ xor 3,3,4
+ srdi 4,21,7
+ xor 5,5,0
+ srdi 0,18,6
+ add 20,20,29
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,32(7)
+ add 20,20,3
+ add 20,20,5
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,20
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrdi 3,22,1
+ rotrdi 4,22,8
+ rotrdi 5,19,19
+ rotrdi 0,19,61
+ xor 3,3,4
+ srdi 4,22,7
+ xor 5,5,0
+ srdi 0,19,6
+ add 21,21,30
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,40(7)
+ add 21,21,3
+ add 21,21,5
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,21
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrdi 3,23,1
+ rotrdi 4,23,8
+ rotrdi 5,20,19
+ rotrdi 0,20,61
+ xor 3,3,4
+ srdi 4,23,7
+ xor 5,5,0
+ srdi 0,20,6
+ add 22,22,31
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,48(7)
+ add 22,22,3
+ add 22,22,5
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,22
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrdi 3,24,1
+ rotrdi 4,24,8
+ rotrdi 5,21,19
+ rotrdi 0,21,61
+ xor 3,3,4
+ srdi 4,24,7
+ xor 5,5,0
+ srdi 0,21,6
+ add 23,23,16
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,56(7)
+ add 23,23,3
+ add 23,23,5
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,23
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ rotrdi 3,25,1
+ rotrdi 4,25,8
+ rotrdi 5,22,19
+ rotrdi 0,22,61
+ xor 3,3,4
+ srdi 4,25,7
+ xor 5,5,0
+ srdi 0,22,6
+ add 24,24,17
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,64(7)
+ add 24,24,3
+ add 24,24,5
+ rotrdi 3,12,14
+ rotrdi 4,12,18
+ and 5,6,12
+ xor 3,3,4
+ add 15,15,0
+ andc 0,14,12
+ rotrdi 4,4,23
+ or 5,5,0
+ add 15,15,24
+ xor 3,3,4
+ add 15,15,5
+ add 15,15,3
+
+ rotrdi 3,8,28
+ rotrdi 4,8,34
+ and 5,8,9
+ and 0,8,10
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,9,10
+ xor 3,3,4
+ add 11,11,15
+ xor 5,5,0
+ add 15,15,3
+ add 15,15,5
+
+ rotrdi 3,26,1
+ rotrdi 4,26,8
+ rotrdi 5,23,19
+ rotrdi 0,23,61
+ xor 3,3,4
+ srdi 4,26,7
+ xor 5,5,0
+ srdi 0,23,6
+ add 25,25,18
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,72(7)
+ add 25,25,3
+ add 25,25,5
+ rotrdi 3,11,14
+ rotrdi 4,11,18
+ and 5,12,11
+ xor 3,3,4
+ add 14,14,0
+ andc 0,6,11
+ rotrdi 4,4,23
+ or 5,5,0
+ add 14,14,25
+ xor 3,3,4
+ add 14,14,5
+ add 14,14,3
+
+ rotrdi 3,15,28
+ rotrdi 4,15,34
+ and 5,15,8
+ and 0,15,9
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,8,9
+ xor 3,3,4
+ add 10,10,14
+ xor 5,5,0
+ add 14,14,3
+ add 14,14,5
+
+ rotrdi 3,27,1
+ rotrdi 4,27,8
+ rotrdi 5,24,19
+ rotrdi 0,24,61
+ xor 3,3,4
+ srdi 4,27,7
+ xor 5,5,0
+ srdi 0,24,6
+ add 26,26,19
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,80(7)
+ add 26,26,3
+ add 26,26,5
+ rotrdi 3,10,14
+ rotrdi 4,10,18
+ and 5,11,10
+ xor 3,3,4
+ add 6,6,0
+ andc 0,12,10
+ rotrdi 4,4,23
+ or 5,5,0
+ add 6,6,26
+ xor 3,3,4
+ add 6,6,5
+ add 6,6,3
+
+ rotrdi 3,14,28
+ rotrdi 4,14,34
+ and 5,14,15
+ and 0,14,8
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,15,8
+ xor 3,3,4
+ add 9,9,6
+ xor 5,5,0
+ add 6,6,3
+ add 6,6,5
+
+ rotrdi 3,28,1
+ rotrdi 4,28,8
+ rotrdi 5,25,19
+ rotrdi 0,25,61
+ xor 3,3,4
+ srdi 4,28,7
+ xor 5,5,0
+ srdi 0,25,6
+ add 27,27,20
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,88(7)
+ add 27,27,3
+ add 27,27,5
+ rotrdi 3,9,14
+ rotrdi 4,9,18
+ and 5,10,9
+ xor 3,3,4
+ add 12,12,0
+ andc 0,11,9
+ rotrdi 4,4,23
+ or 5,5,0
+ add 12,12,27
+ xor 3,3,4
+ add 12,12,5
+ add 12,12,3
+
+ rotrdi 3,6,28
+ rotrdi 4,6,34
+ and 5,6,14
+ and 0,6,15
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,14,15
+ xor 3,3,4
+ add 8,8,12
+ xor 5,5,0
+ add 12,12,3
+ add 12,12,5
+
+ rotrdi 3,29,1
+ rotrdi 4,29,8
+ rotrdi 5,26,19
+ rotrdi 0,26,61
+ xor 3,3,4
+ srdi 4,29,7
+ xor 5,5,0
+ srdi 0,26,6
+ add 28,28,21
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,96(7)
+ add 28,28,3
+ add 28,28,5
+ rotrdi 3,8,14
+ rotrdi 4,8,18
+ and 5,9,8
+ xor 3,3,4
+ add 11,11,0
+ andc 0,10,8
+ rotrdi 4,4,23
+ or 5,5,0
+ add 11,11,28
+ xor 3,3,4
+ add 11,11,5
+ add 11,11,3
+
+ rotrdi 3,12,28
+ rotrdi 4,12,34
+ and 5,12,6
+ and 0,12,14
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,6,14
+ xor 3,3,4
+ add 15,15,11
+ xor 5,5,0
+ add 11,11,3
+ add 11,11,5
+
+ rotrdi 3,30,1
+ rotrdi 4,30,8
+ rotrdi 5,27,19
+ rotrdi 0,27,61
+ xor 3,3,4
+ srdi 4,30,7
+ xor 5,5,0
+ srdi 0,27,6
+ add 29,29,22
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,104(7)
+ add 29,29,3
+ add 29,29,5
+ rotrdi 3,15,14
+ rotrdi 4,15,18
+ and 5,8,15
+ xor 3,3,4
+ add 10,10,0
+ andc 0,9,15
+ rotrdi 4,4,23
+ or 5,5,0
+ add 10,10,29
+ xor 3,3,4
+ add 10,10,5
+ add 10,10,3
+
+ rotrdi 3,11,28
+ rotrdi 4,11,34
+ and 5,11,12
+ and 0,11,6
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,12,6
+ xor 3,3,4
+ add 14,14,10
+ xor 5,5,0
+ add 10,10,3
+ add 10,10,5
+
+ rotrdi 3,31,1
+ rotrdi 4,31,8
+ rotrdi 5,28,19
+ rotrdi 0,28,61
+ xor 3,3,4
+ srdi 4,31,7
+ xor 5,5,0
+ srdi 0,28,6
+ add 30,30,23
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,112(7)
+ add 30,30,3
+ add 30,30,5
+ rotrdi 3,14,14
+ rotrdi 4,14,18
+ and 5,15,14
+ xor 3,3,4
+ add 9,9,0
+ andc 0,8,14
+ rotrdi 4,4,23
+ or 5,5,0
+ add 9,9,30
+ xor 3,3,4
+ add 9,9,5
+ add 9,9,3
+
+ rotrdi 3,10,28
+ rotrdi 4,10,34
+ and 5,10,11
+ and 0,10,12
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,11,12
+ xor 3,3,4
+ add 6,6,9
+ xor 5,5,0
+ add 9,9,3
+ add 9,9,5
+
+ rotrdi 3,16,1
+ rotrdi 4,16,8
+ rotrdi 5,29,19
+ rotrdi 0,29,61
+ xor 3,3,4
+ srdi 4,16,7
+ xor 5,5,0
+ srdi 0,29,6
+ add 31,31,24
+ xor 3,3,4
+ xor 5,5,0
+ ld 0,120(7)
+ add 31,31,3
+ add 31,31,5
+ rotrdi 3,6,14
+ rotrdi 4,6,18
+ and 5,14,6
+ xor 3,3,4
+ add 8,8,0
+ andc 0,15,6
+ rotrdi 4,4,23
+ or 5,5,0
+ add 8,8,31
+ xor 3,3,4
+ add 8,8,5
+ add 8,8,3
+
+ rotrdi 3,9,28
+ rotrdi 4,9,34
+ and 5,9,10
+ and 0,9,11
+ xor 3,3,4
+ rotrdi 4,4,5
+ xor 5,5,0
+ and 0,10,11
+ xor 3,3,4
+ add 12,12,8
+ xor 5,5,0
+ add 8,8,3
+ add 8,8,5
+
+ bdnz .Lrounds
+
+ ld 3,208(1)
+ ld 31,200(1)
+ ld 5,192(1)
+ subi 7,7,512
+
+ ld 16,0(3)
+ ld 17,8(3)
+ ld 18,16(3)
+ ld 19,24(3)
+ ld 20,32(3)
+ ld 21,40(3)
+ ld 22,48(3)
+ addi 31,31,128
+ ld 23,56(3)
+ add 8,8,16
+ add 9,9,17
+ std 31,200(1)
+ add 10,10,18
+ std 8,0(3)
+ add 11,11,19
+ std 9,8(3)
+ add 12,12,20
+ std 10,16(3)
+ add 6,6,21
+ std 11,24(3)
+ add 14,14,22
+ std 12,32(3)
+ add 15,15,23
+ std 6,40(3)
+ std 14,48(3)
+ cmpld 31,5
+ std 15,56(3)
+ bne .Lsha2_block_private
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.size zfs_sha512_ppc,.-zfs_sha512_ppc
+.align 6
+.LPICmeup:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 7
+ addi 7,7,56
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.space 28
+.long 0xd728ae22,0x428a2f98
+.long 0x23ef65cd,0x71374491
+.long 0xec4d3b2f,0xb5c0fbcf
+.long 0x8189dbbc,0xe9b5dba5
+.long 0xf348b538,0x3956c25b
+.long 0xb605d019,0x59f111f1
+.long 0xaf194f9b,0x923f82a4
+.long 0xda6d8118,0xab1c5ed5
+.long 0xa3030242,0xd807aa98
+.long 0x45706fbe,0x12835b01
+.long 0x4ee4b28c,0x243185be
+.long 0xd5ffb4e2,0x550c7dc3
+.long 0xf27b896f,0x72be5d74
+.long 0x3b1696b1,0x80deb1fe
+.long 0x25c71235,0x9bdc06a7
+.long 0xcf692694,0xc19bf174
+.long 0x9ef14ad2,0xe49b69c1
+.long 0x384f25e3,0xefbe4786
+.long 0x8b8cd5b5,0x0fc19dc6
+.long 0x77ac9c65,0x240ca1cc
+.long 0x592b0275,0x2de92c6f
+.long 0x6ea6e483,0x4a7484aa
+.long 0xbd41fbd4,0x5cb0a9dc
+.long 0x831153b5,0x76f988da
+.long 0xee66dfab,0x983e5152
+.long 0x2db43210,0xa831c66d
+.long 0x98fb213f,0xb00327c8
+.long 0xbeef0ee4,0xbf597fc7
+.long 0x3da88fc2,0xc6e00bf3
+.long 0x930aa725,0xd5a79147
+.long 0xe003826f,0x06ca6351
+.long 0x0a0e6e70,0x14292967
+.long 0x46d22ffc,0x27b70a85
+.long 0x5c26c926,0x2e1b2138
+.long 0x5ac42aed,0x4d2c6dfc
+.long 0x9d95b3df,0x53380d13
+.long 0x8baf63de,0x650a7354
+.long 0x3c77b2a8,0x766a0abb
+.long 0x47edaee6,0x81c2c92e
+.long 0x1482353b,0x92722c85
+.long 0x4cf10364,0xa2bfe8a1
+.long 0xbc423001,0xa81a664b
+.long 0xd0f89791,0xc24b8b70
+.long 0x0654be30,0xc76c51a3
+.long 0xd6ef5218,0xd192e819
+.long 0x5565a910,0xd6990624
+.long 0x5771202a,0xf40e3585
+.long 0x32bbd1b8,0x106aa070
+.long 0xb8d2d0c8,0x19a4c116
+.long 0x5141ab53,0x1e376c08
+.long 0xdf8eeb99,0x2748774c
+.long 0xe19b48a8,0x34b0bcb5
+.long 0xc5c95a63,0x391c0cb3
+.long 0xe3418acb,0x4ed8aa4a
+.long 0x7763e373,0x5b9cca4f
+.long 0xd6b2b8a3,0x682e6ff3
+.long 0x5defb2fc,0x748f82ee
+.long 0x43172f60,0x78a5636f
+.long 0xa1f0ab72,0x84c87814
+.long 0x1a6439ec,0x8cc70208
+.long 0x23631e28,0x90befffa
+.long 0xde82bde9,0xa4506ceb
+.long 0xb2c67915,0xbef9a3f7
+.long 0xe372532b,0xc67178f2
+.long 0xea26619c,0xca273ece
+.long 0x21c0c207,0xd186b8c7
+.long 0xcde0eb1e,0xeada7dd6
+.long 0xee6ed178,0xf57d4f7f
+.long 0x72176fba,0x06f067aa
+.long 0xa2c898a6,0x0a637dc5
+.long 0xbef90dae,0x113f9804
+.long 0x131c471b,0x1b710b35
+.long 0x23047d84,0x28db77f5
+.long 0x40c72493,0x32caab7b
+.long 0x15c9bebc,0x3c9ebe0a
+.long 0x9c100d4c,0x431d67c4
+.long 0xcb3e42b6,0x4cc5d4be
+.long 0xfc657e2a,0x597f299c
+.long 0x3ad6faec,0x5fcb6fab
+.long 0x4a475817,0x6c44198c
+
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
new file mode 100644
index 000000000..f78cd5fb1
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
@@ -0,0 +1,5104 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__x86_64)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+.section .rodata
+
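+/* SHA-256 round constants K[0..63]; each 16-byte row is stored
+   twice for the SIMD code paths, and the trailing rows are the
+   byte-shuffle masks those paths use. */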
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+
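+/*
+ * Generic x86_64 variant: a plain 64-round scalar implementation. The
+ * first sixteen rounds consume the byte-swapped input words directly;
+ * .Lrounds_16_xx then expands the message schedule in place on the
+ * 64-byte stack frame. The "cmpb $0,3(%rbp)" test works because every
+ * round constant has a non-zero top byte while the shuffle mask that
+ * follows the table starts with 0x00010203, so the loop stops once
+ * the table walk runs past the constants.
+ */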
+ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+.align 16
+.Lloop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 20(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+ cmpq 64+16(%rsp),%rsi
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_x64)
+
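+/*
+ * SHA-NI variant. The .byte sequences below are hand-assembled
+ * opcodes, apparently retained so the file still assembles without
+ * SHA-extension support in the toolchain: 0f,38,cb (".byte
+ * 15,56,203,...") is sha256rnds2, 0f,38,cc is sha256msg1, 0f,38,cd is
+ * sha256msg2, while 66,0f,38,00 and 66,0f,3a,0f are pshufb and
+ * palignr.
+ */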
+ENTRY_ALIGN(zfs_sha256_transform_shani, 64)
+.cfi_startproc
+ ENDBR
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shani
+
+.align 16
+.Loop_shani:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shani
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_shani)
+
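+/*
+ * SSSE3 variant. In .Lssse3_00_47 the xmm message-schedule arithmetic
+ * is interleaved with the scalar round function, so the vector and
+ * integer units run in parallel: each pass updates one 4-word
+ * schedule register while four rounds retire.
+ */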
+ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_ssse3)
+
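+/*
+ * AVX variant: structurally the same as the SSSE3 routine above, but
+ * the three-operand VEX forms (vpalignr, vpsrld, vpshufb, ...) avoid
+ * most register-to-register moves, and the rotates use shrdl with
+ * identical source and destination instead of rorl.
+ */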
+ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
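+/*
+ * The byte probe above checks a sentinel in the K256 table to decide
+ * whether more schedule passes remain; the final 16 rounds below
+ * consume the precomputed W+K values already on the stack without
+ * further message-schedule work.
+ */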
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
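+/*
+ * Fold the working variables back into the saved state; loop while
+ * the input pointer is still below the end stored at 64+16(%rsp).
+ */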
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_avx)
+
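+/*
+ * AVX2 variant: processes two 64-byte blocks per iteration, one per
+ * 128-bit lane of each %ymm register, and uses BMI2 rorx/andn in the
+ * round function.
+ */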
+ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $544,%rsp
+ shlq $4,%rdx
+ andq $-1024,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ addq $448,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
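+/*
+ * Frame layout: 0(%rsp)..63(%rsp) stage the current two-lane W+K
+ * values, 64(%rsp) = ctx, 72(%rsp) = input, 80(%rsp) = input end,
+ * 88(%rsp) = caller's %rsp.
+ */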
+
+ vzeroupper
+ subq $-64,%rsi
+ movl 0(%rdi),%eax
+ movq %rsi,%r12
+ movl 4(%rdi),%ebx
+ cmpq %rdx,%rsi
+ movl 8(%rdi),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%ymm8
+ vmovdqa K256+512+64(%rip),%ymm9
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
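+/*
+ * %ymm7 is the byte-swap mask stored after the K256 constants.  The
+ * low 128-bit lanes take the block at -64(%rsi); vinserti128 fills
+ * the high lanes with the following block from (%r12), which aliases
+ * scratch space when no second block remains.
+ */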
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi),%xmm0
+ vmovdqu -64+16(%rsi),%xmm1
+ vmovdqu -64+32(%rsi),%xmm2
+ vmovdqu -64+48(%rsi),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+
+ movq 88(%rsp),%rdi
+.cfi_def_cfa %rdi,8
+ leaq -64(%rsp),%rsp
+
+
+
+ movq %rdi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ movl %ebx,%edi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
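+/*
+ * %rsp is rolled down 64 bytes per schedule step so the rounds read
+ * their W+K input at fixed +128 offsets while fresh schedule data is
+ * written to 0(%rsp)/32(%rsp); the push/lea pair re-seats the saved
+ * frame pointer at -8 of the new %rsp for the unwinder.
+ */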
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm0,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm1,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm2,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm3,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rbp
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ cmpq 80(%rbp),%rsi
+ je .Ldone_avx2
+
+ xorl %r14d,%r14d
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ movl %r9d,%r12d
+ jmp .Lower_avx2
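+/*
+ * Replay the second block: the +16 displacements pick up the
+ * high-lane halves of the W+K values saved during the schedule
+ * passes, walking %rbp back down toward %rsp.
+ */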
+.align 16
+.Lower_avx2:
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rsp
+
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ leaq 128(%rsi),%rsi
+ addl 24(%rdi),%r10d
+ movq %rsi,%r12
+ addl 28(%rdi),%r11d
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ cmoveq %rsp,%r12
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+
+.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 88(%rbp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_avx2)
+
+#if defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
new file mode 100644
index 000000000..ce8e108af
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
@@ -0,0 +1,4011 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__x86_64)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+.section .rodata
+
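+/*
+ * Each pair of round constants is stored twice so 256-bit loads see
+ * the same constants in both %ymm lanes; the final repeated quad pair
+ * is the vpshufb mask used to byte-swap the big-endian input words.
+ */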
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+
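+/*
+ * Scalar x86-64 variant: fully unrolled rounds using only
+ * general-purpose registers.  %rdi = state, %rsi = input,
+ * %rdx = number of 128-byte blocks.
+ */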
+ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
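+/*
+ * 0(%rsp)..127(%rsp) hold the 16-entry message-schedule ring;
+ * 128(%rsp) = ctx, 136(%rsp) = input, 144(%rsp) = input end,
+ * 152(%rsp) = caller's %rsp.
+ */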
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+.align 16
+.Lloop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 24(%rbp),%rbp
+ jmp .Lrounds_16_xx
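+/*
+ * Rounds 16..79: each of the 16 blocks below first expands one message
+ * word, W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
+ * then runs one compression round.  The rotate chains compose:
+ * ror 7->1 with shr 7 gives sigma0 = ROTR1^ROTR8^SHR7, ror 42->19 with
+ * shr 6 gives sigma1 = ROTR19^ROTR61^SHR6, ror 23->4->14 on e gives
+ * Sigma1 = ROTR14^ROTR18^ROTR41, and ror 5->6->28 on a gives
+ * Sigma0 = ROTR28^ROTR34^ROTR39.
+ */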
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+ leaq 24(%rbp),%rbp
+ cmpb $0,7(%rbp)
+ jnz .Lrounds_16_xx
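+/*
+ * All 80 rounds done (a zero byte just past the last K512 constant
+ * terminates the loop).  Fold the working registers back into the
+ * state block and, until the end-of-input pointer saved at
+ * 128+16(%rsp) is reached, restart on the next 128-byte block.
+ */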
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+ cmpq 128+16(%rsp),%rsi
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_x64)
+
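+/*
+ * AVX variant.  Judging by the pointer setup below, the arguments
+ * match the scalar routine: %rdi = state, %rsi = input, %rdx = number
+ * of 128-byte blocks.  The compression rounds stay scalar; the message
+ * schedule and the W+K additions are done two words at a time in xmm
+ * registers and parked on the stack.
+ */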
+ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx
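+/*
+ * Per-block setup: load the 128-byte block into %xmm0..%xmm7,
+ * byte-swap via the mask stored after the K512 table, and precompute
+ * W[t]+K[t] for rounds 0..15 into 0..112(%rsp).
+ */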
+.align 16
+.Lloop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
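+/*
+ * Four passes of 16 rounds each (rounds 0..63): scalar round
+ * arithmetic is interleaved with the vector schedule update.  The
+ * vpsrlq/vpsllq/vpxor chains build sigma0 = ROTR1^ROTR8^SHR7 and
+ * sigma1 = ROTR19^ROTR61^SHR6 on two words at once; each expanded
+ * pair gets its round constants added and is stored back to the
+ * stack for the following 16 rounds.
+ */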
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lavx_00_47
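+/*
+ * Rounds 64..79: the schedule is complete, so these 16 rounds only
+ * consume the precomputed W+K values left on the stack.
+ */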
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_avx)
+
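+/*
+ * AVX2 variant: two 128-byte blocks are handled per iteration, one per
+ * 128-bit ymm lane, so every schedule update advances both blocks at
+ * once.  The loop's compression rounds consume the first block's W+K
+ * words; the second block's stay parked on the stack for the replay
+ * that follows.  With a single trailing block, %r12 is redirected at
+ * the stack (cmove below) so the high-lane loads stay in bounds.
+ */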
+ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
+.cfi_startproc
+ ENDBR
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $1312,%rsp
+ shlq $4,%rdx
+ andq $-2048,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ addq $1152,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx2:
+
+ vzeroupper
+ subq $-128,%rsi
+ movq 0(%rdi),%rax
+ movq %rsi,%r12
+ movq 8(%rdi),%rbx
+ cmpq %rdx,%rsi
+ movq 16(%rdi),%rcx
+ cmoveq %rsp,%r12
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqu -128(%rsi),%xmm0
+ vmovdqu -128+16(%rsi),%xmm1
+ vmovdqu -128+32(%rsi),%xmm2
+ leaq K512+128(%rip),%rbp
+ vmovdqu -128+48(%rsi),%xmm3
+ vmovdqu -128+64(%rsi),%xmm4
+ vmovdqu -128+80(%rsi),%xmm5
+ vmovdqu -128+96(%rsi),%xmm6
+ vmovdqu -128+112(%rsi),%xmm7
+
+ vmovdqa 1152(%rbp),%ymm10
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm10,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm10,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+ vpshufb %ymm10,%ymm2,%ymm2
+ vinserti128 $1,64(%r12),%ymm4,%ymm4
+ vpshufb %ymm10,%ymm3,%ymm3
+ vinserti128 $1,80(%r12),%ymm5,%ymm5
+ vpshufb %ymm10,%ymm4,%ymm4
+ vinserti128 $1,96(%r12),%ymm6,%ymm6
+ vpshufb %ymm10,%ymm5,%ymm5
+ vinserti128 $1,112(%r12),%ymm7,%ymm7
+
+ vpaddq -128(%rbp),%ymm0,%ymm8
+ vpshufb %ymm10,%ymm6,%ymm6
+ vpaddq -96(%rbp),%ymm1,%ymm9
+ vpshufb %ymm10,%ymm7,%ymm7
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ vpaddq -32(%rbp),%ymm3,%ymm11
+ vmovdqa %ymm8,0(%rsp)
+ vpaddq 0(%rbp),%ymm4,%ymm8
+ vmovdqa %ymm9,32(%rsp)
+ vpaddq 32(%rbp),%ymm5,%ymm9
+ vmovdqa %ymm10,64(%rsp)
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ vmovdqa %ymm11,96(%rsp)
+
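+/*
+ * Slide the frame down 128 bytes and stash the saved-%rsp copy just
+ * below the new W+K area; the .cfi_escape opcodes keep the unwinder
+ * pointed at it as the frame keeps moving during the round loop.
+ */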
+ movq 152(%rsp),%rdi
+.cfi_def_cfa %rdi,8
+ leaq -128(%rsp),%rsp
+
+
+
+ movq %rdi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpaddq 96(%rbp),%ymm7,%ymm11
+ vmovdqa %ymm8,0(%rsp)
+ xorq %r14,%r14
+ vmovdqa %ymm9,32(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %ymm10,64(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %ymm11,96(%rsp)
+ movq %r9,%r12
+ addq $32*8,%rbp
+ jmp .Lavx2_00_47
+
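+/*
+ * Schedule loop: BMI2 rorx and BMI1 andn provide non-destructive
+ * rotates and andnot for the scalar rounds, interleaved with the ymm
+ * schedule updates.  The frame keeps sliding down in 128-byte steps so
+ * each fresh W+K batch lands at 0..96(%rsp) while the previous batch
+ * stays addressable at +256; the saved frame pointer is re-pushed at
+ * every step to keep the unwind info valid.
+ */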
+.align 16
+.Lavx2_00_47:
+ leaq -128(%rsp),%rsp
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+ pushq 128-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm4,%ymm5,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm0,%ymm0
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm7,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm7,%ymm10
+ vpaddq %ymm8,%ymm0,%ymm0
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm7,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm0,%ymm0
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq -128(%rbp),%ymm0,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm5,%ymm6,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm0,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm0,%ymm10
+ vpaddq %ymm8,%ymm1,%ymm1
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm0,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm1,%ymm1
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq -96(%rbp),%ymm1,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm6,%ymm7,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm1,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm1,%ymm10
+ vpaddq %ymm8,%ymm2,%ymm2
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm1,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm2,%ymm2
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm7,%ymm0,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm2,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm2,%ymm10
+ vpaddq %ymm8,%ymm3,%ymm3
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm2,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm3,%ymm3
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq -32(%rbp),%ymm3,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq -128(%rsp),%rsp
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+ pushq 128-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm0,%ymm1,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm3,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm3,%ymm10
+ vpaddq %ymm8,%ymm4,%ymm4
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm3,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm4,%ymm4
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq 0(%rbp),%ymm4,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm1,%ymm2,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm4,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm4,%ymm10
+ vpaddq %ymm8,%ymm5,%ymm5
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm4,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm5,%ymm5
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq 32(%rbp),%ymm5,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm2,%ymm3,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm5,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm5,%ymm10
+ vpaddq %ymm8,%ymm6,%ymm6
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm5,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm6,%ymm6
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm3,%ymm4,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm6,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm6,%ymm10
+ vpaddq %ymm8,%ymm7,%ymm7
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm6,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm7,%ymm7
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq 96(%rbp),%ymm7,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq 256(%rbp),%rbp
+ cmpb $0,-121(%rbp)
+ jne .Lavx2_00_47
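+/*
+ * Last 16 rounds of the first block: no more expansion, just fold in
+ * the remaining precomputed W+K words, first from the older frame at
+ * +128 and then from the current one.
+ */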
+ addq 0+128(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+128(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+128(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+128(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+128(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+128(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+128(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+128(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ addq 0(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ movq 1280(%rsp),%rdi
+ addq %r14,%rax
+
+ leaq 1152(%rsp),%rbp
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ cmpq 144(%rbp),%rsi
+ je .Ldone_avx2
+
+ xorq %r14,%r14
+ movq %rbx,%rdi
+ xorq %rcx,%rdi
+ movq %r9,%r12
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ addq 0+16(%rbp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+16(%rbp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+16(%rbp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+16(%rbp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+16(%rbp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+16(%rbp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+16(%rbp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+16(%rbp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ leaq -128(%rbp),%rbp
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 1280(%rsp),%rdi
+ addq %r14,%rax
+
+ leaq 1152(%rsp),%rsp
+
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ leaq 256(%rsi),%rsi
+ addq 48(%rdi),%r10
+ movq %rsi,%r12
+ addq 56(%rdi),%r11
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ cmoveq %rsp,%r12
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 152(%rbp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_avx2)
+
+#if defined(__ELF__)
+ .section .note.GNU-stack,"",%progbits
+#endif
+#endif
diff --git a/module/icp/include/generic_impl.c b/module/icp/include/generic_impl.c
new file mode 100644
index 000000000..16f802cf7
--- /dev/null
+++ b/module/icp/include/generic_impl.c
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010 Oracle and/or its affiliates.
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+/*
+ * This file gets included by C files for implementing the full set
+ * of zfs_impl.h defines.
+ *
+ * It's meant to ease maintaining multiple implementations of
+ * algorithms. Look into blake3_impl.c, sha256_impl.c or sha512_impl.c
+ * for reference.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_impl.h>
+
+/* Two default implementations */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+
+#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+/* Implementation that contains the fastest method */
+static IMPL_OPS_T generic_fastest_impl = {
+ .name = "fastest"
+};
+
+/* Hold all supported implementations */
+static const IMPL_OPS_T *generic_supp_impls[ARRAY_SIZE(IMPL_ARRAY)];
+static uint32_t generic_supp_impls_cnt = 0;
+
+/* Currently selected implementation */
+static uint32_t generic_impl_chosen = IMPL_FASTEST;
+
+static struct generic_impl_selector {
+ const char *name;
+ uint32_t sel;
+} generic_impl_selectors[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST }
+};
+
+/* check the supported implementations */
+static void
+generic_impl_init(void)
+{
+ int i, c;
+
+ /* init only once */
+ if (likely(generic_supp_impls_cnt != 0))
+ return;
+
+ /* Move supported implementations into generic_supp_impls */
+ for (i = 0, c = 0; i < ARRAY_SIZE(IMPL_ARRAY); i++) {
+ const IMPL_OPS_T *impl = IMPL_ARRAY[i];
+
+ if (impl->is_supported && impl->is_supported())
+ generic_supp_impls[c++] = impl;
+ }
+ generic_supp_impls_cnt = c;
+
+ /* start with the first supported impl; set_fastest() may change it */
+ memcpy(&generic_fastest_impl, generic_supp_impls[0],
+ sizeof (generic_fastest_impl));
+}
+
+/* get number of supported implementations */
+static uint32_t
+generic_impl_getcnt(void)
+{
+ generic_impl_init();
+ return (generic_supp_impls_cnt);
+}
+
+/* get id of selected implementation */
+static uint32_t
+generic_impl_getid(void)
+{
+ generic_impl_init();
+ return (IMPL_READ(generic_impl_chosen));
+}
+
+/* get name of selected implementation */
+static const char *
+generic_impl_getname(void)
+{
+ uint32_t impl = IMPL_READ(generic_impl_chosen);
+
+ generic_impl_init();
+ switch (impl) {
+ case IMPL_FASTEST:
+ return ("fastest");
+ case IMPL_CYCLE:
+ return ("cycle");
+ default:
+ return (generic_supp_impls[impl]->name);
+ }
+}
+
+/* set implementation by id */
+static void
+generic_impl_setid(uint32_t id)
+{
+ generic_impl_init();
+ switch (id) {
+ case IMPL_FASTEST:
+ atomic_swap_32(&generic_impl_chosen, IMPL_FASTEST);
+ break;
+ case IMPL_CYCLE:
+ atomic_swap_32(&generic_impl_chosen, IMPL_CYCLE);
+ break;
+ default:
+ ASSERT3U(id, <, generic_supp_impls_cnt);
+ atomic_swap_32(&generic_impl_chosen, id);
+ break;
+ }
+}
+
+/* set implementation by name */
+static int
+generic_impl_setname(const char *val)
+{
+ uint32_t impl = IMPL_READ(generic_impl_chosen);
+ size_t val_len;
+ int i, err = -EINVAL;
+
+ generic_impl_init();
+ val_len = strlen(val);
+ while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
+ val_len--;
+
+ /* check mandatory implementations */
+ for (i = 0; i < ARRAY_SIZE(generic_impl_selectors); i++) {
+ const char *name = generic_impl_selectors[i].name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = generic_impl_selectors[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported implementations */
+ if (err != 0) {
+ for (i = 0; i < generic_supp_impls_cnt; i++) {
+ const char *name = generic_supp_impls[i]->name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ atomic_swap_32(&generic_impl_chosen, impl);
+ }
+
+ return (err);
+}
+
+/* set the given id as the fastest implementation */
+static void
+generic_impl_set_fastest(uint32_t id)
+{
+ generic_impl_init();
+ memcpy(&generic_fastest_impl, generic_supp_impls[id],
+ sizeof (generic_fastest_impl));
+}
+
+/* return impl iterating functions */
+const zfs_impl_t ZFS_IMPL_OPS = {
+ .name = IMPL_NAME,
+ .getcnt = generic_impl_getcnt,
+ .getid = generic_impl_getid,
+ .getname = generic_impl_getname,
+ .set_fastest = generic_impl_set_fastest,
+ .setid = generic_impl_setid,
+ .setname = generic_impl_setname
+};
+
+/* get impl ops_t of selected implementation */
+const IMPL_OPS_T *
+IMPL_GET_OPS(void)
+{
+ const IMPL_OPS_T *ops = NULL;
+ uint32_t idx, impl = IMPL_READ(generic_impl_chosen);
+ static uint32_t cycle_count = 0;
+
+ generic_impl_init();
+ switch (impl) {
+ case IMPL_FASTEST:
+ ops = &generic_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ idx = (++cycle_count) % generic_supp_impls_cnt;
+ ops = generic_supp_impls[idx];
+ break;
+ default:
+ ASSERT3U(impl, <, generic_supp_impls_cnt);
+ ops = generic_supp_impls[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+ return (ops);
+}
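
Note on usage: generic_impl.c is an include-template, not a standalone
translation unit. A consumer such as sha256_impl.c (referenced in the header
comment above) defines its implementation array and the IMPL_* macros first,
then includes this file. A minimal sketch of that pattern, using the
sha256_ops_t type added in sha2_impl.h below; the transform symbol and the
array contents here are illustrative assumptions, not the exact tree contents:

    /* sketch of a consumer, loosely modeled on sha256_impl.c */
    #include <sys/zfs_context.h>
    #include <sha2/sha2_impl.h>

    /* assumed generic transform; the real symbol lives in sha2_generic.c */
    extern void sha256_generic(uint32_t state[8], const void *data, size_t blks);

    static boolean_t sha256_is_supported(void)
    {
            return (B_TRUE);        /* plain C runs on every CPU */
    }

    static const sha256_ops_t sha256_generic_impl = {
            .name = "generic",
            .transform = sha256_generic,
            .is_supported = sha256_is_supported,
    };

    /* candidates probed by generic_impl_init(); SIMD variants would follow */
    static const sha256_ops_t *const sha256_impls[] = {
            &sha256_generic_impl,
    };

    /* bind the template names, then pull in the shared selector logic */
    #define IMPL_NAME       "sha256"
    #define IMPL_OPS_T      sha256_ops_t
    #define IMPL_ARRAY      sha256_impls
    #define IMPL_GET_OPS    sha256_get_ops
    #define ZFS_IMPL_OPS    zfs_sha256_ops
    #include <generic_impl.c>

After inclusion the file exports sha256_get_ops() and the zfs_sha256_ops
table that module/zfs/zfs_impl.c registers further below.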
diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h
index 0e89747ee..9a1bd38f1 100644
--- a/module/icp/include/sha2/sha2_impl.h
+++ b/module/icp/include/sha2/sha2_impl.h
@@ -18,9 +18,10 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
*/
#ifndef _SHA2_IMPL_H
@@ -32,6 +33,28 @@
extern "C" {
#endif
+/* transform function definition */
+typedef void (*sha256_f)(uint32_t state[8], const void *data, size_t blks);
+typedef void (*sha512_f)(uint64_t state[8], const void *data, size_t blks);
+
+/* needed for checking valid implementations */
+typedef boolean_t (*sha2_is_supported_f)(void);
+
+typedef struct {
+ const char *name;
+ sha256_f transform;
+ sha2_is_supported_f is_supported;
+} sha256_ops_t;
+
+typedef struct {
+ const char *name;
+ sha512_f transform;
+ sha2_is_supported_f is_supported;
+} sha512_ops_t;
+
+extern const sha256_ops_t *sha256_get_ops(void);
+extern const sha512_ops_t *sha512_get_ops(void);
+
typedef enum {
SHA1_TYPE,
SHA256_TYPE,
diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c
index a58f0982c..f068951b0 100644
--- a/module/icp/io/sha2_mod.c
+++ b/module/icp/io/sha2_mod.c
@@ -28,7 +28,6 @@
#include <sys/crypto/common.h>
#include <sys/crypto/spi.h>
#include <sys/crypto/icp.h>
-#define _SHA2_IMPL
#include <sys/sha2.h>
#include <sha2/sha2_impl.h>
diff --git a/module/zfs/sha256.c b/module/zfs/sha2_zfs.c
index 445d82ed0..872b1e53e 100644
--- a/module/zfs/sha256.c
+++ b/module/zfs/sha2_zfs.c
@@ -18,16 +18,14 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
+
#include <sys/zfs_context.h>
-#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/sha2.h>
#include <sys/abd.h>
@@ -42,7 +40,7 @@ sha_incremental(void *buf, size_t size, void *arg)
}
void
-abd_checksum_SHA256(abd_t *abd, uint64_t size,
+abd_checksum_sha256(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
(void) ctx_template;
@@ -79,7 +77,7 @@ bswap:
}
void
-abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+abd_checksum_sha512_native(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
(void) ctx_template;
@@ -91,12 +89,12 @@ abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
}
void
-abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size,
const void *ctx_template, zio_cksum_t *zcp)
{
zio_cksum_t tmp;
- abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ abd_checksum_sha512_native(abd, size, ctx_template, &tmp);
zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c
index 91247f292..acedeab7a 100644
--- a/module/zfs/zfs_chksum.c
+++ b/module/zfs/zfs_chksum.c
@@ -23,13 +23,13 @@
* Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
*/
-#include <sys/types.h>
-#include <sys/spa.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/zfs_chksum.h>
+#include <sys/zfs_impl.h>
#include <sys/blake3.h>
+#include <sys/sha2.h>
/* limit benchmarking to max 256KiB, when EdonR is slower than this: */
#define LIMIT_PERF_MBS 300
@@ -56,25 +56,26 @@ static int chksum_stat_cnt = 0;
static kstat_t *chksum_kstat = NULL;
/*
- * i3-1005G1 test output:
+ * Sample output on an i3-1005G1 system:
*
- * implementation 1k 4k 16k 64k 256k 1m 4m
- * fletcher-4 5421 15001 26468 32555 34720 32801 18847
- * edonr-generic 1196 1602 1761 1749 1762 1759 1751
- * skein-generic 546 591 608 615 619 612 616
- * sha256-generic 246 270 274 274 277 275 276
- * sha256-avx 262 296 304 307 307 307 306
- * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228
- * sha256-openssl 240 300 316 314 304 285 276
- * sha512-generic 333 374 385 392 391 393 392
- * sha512-openssl 353 441 467 476 472 467 426
- * sha512-avx 362 444 473 475 479 476 478
- * sha512-avx2 394 500 530 538 543 545 542
- * blake3-generic 308 313 313 313 312 313 312
- * blake3-sse2 402 1289 1423 1446 1432 1458 1413
- * blake3-sse41 427 1470 1625 1704 1679 1607 1629
- * blake3-avx2 428 1920 3095 3343 3356 3318 3204
- * blake3-avx512 473 2687 4905 5836 5844 5643 5374
+ * implementation 1k 4k 16k 64k 256k 1m 4m 16m
+ * edonr-generic 1278 1625 1769 1776 1783 1778 1771 1767
+ * skein-generic 548 594 613 623 621 623 621 486
+ * sha256-generic 255 270 281 278 279 281 283 283
+ * sha256-x64 288 310 316 317 318 317 317 316
+ * sha256-ssse3 304 342 351 355 356 357 356 356
+ * sha256-avx 311 348 359 362 362 363 363 362
+ * sha256-avx2 330 378 389 395 395 395 395 395
+ * sha256-shani 908 1127 1212 1230 1233 1234 1223 1230
+ * sha512-generic 359 409 431 427 429 430 428 423
+ * sha512-x64 420 473 490 496 497 497 496 495
+ * sha512-avx 406 522 546 560 560 560 556 560
+ * sha512-avx2 464 568 601 606 609 610 607 608
+ * blake3-generic 330 327 324 323 324 320 323 322
+ * blake3-sse2 424 1366 1449 1468 1458 1453 1395 1408
+ * blake3-sse41 453 1554 1658 1703 1689 1669 1622 1630
+ * blake3-avx2 452 2013 3225 3351 3356 3261 3076 3101
+ * blake3-avx512 498 2869 5269 5926 5872 5643 5014 5005
*/
static int
chksum_kstat_headers(char *buf, size_t size)
@@ -237,25 +238,30 @@ abort:
static void
chksum_benchmark(void)
{
-
#ifndef _KERNEL
/* we need the benchmark only for the kernel module */
return;
#endif
chksum_stat_t *cs;
- int cbid = 0;
- uint64_t max = 0;
- uint32_t id, id_save;
-
- /* space for the benchmark times */
- chksum_stat_cnt = 4;
- chksum_stat_cnt += blake3_impl_getcnt();
+ uint64_t max;
+ uint32_t id, cbid = 0, id_save;
+ const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
+ const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+ const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+
+ /* count implementations */
+ chksum_stat_cnt = 2;
+ chksum_stat_cnt += sha256->getcnt();
+ chksum_stat_cnt += sha512->getcnt();
+ chksum_stat_cnt += blake3->getcnt();
chksum_stat_data = kmem_zalloc(
sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
/* edonr - needs to be the first one here (slow CPU check) */
cs = &chksum_stat_data[cbid++];
+
+ /* edonr */
cs->init = abd_checksum_edonr_tmpl_init;
cs->func = abd_checksum_edonr_native;
cs->free = abd_checksum_edonr_tmpl_free;
@@ -273,42 +279,58 @@ chksum_benchmark(void)
chksum_benchit(cs);
/* sha256 */
- cs = &chksum_stat_data[cbid++];
- cs->init = 0;
- cs->func = abd_checksum_SHA256;
- cs->free = 0;
- cs->name = "sha256";
- cs->impl = "generic";
- chksum_benchit(cs);
+ id_save = sha256->getid();
+ for (max = 0, id = 0; id < sha256->getcnt(); id++) {
+ sha256->setid(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_sha256;
+ cs->free = 0;
+ cs->name = sha256->name;
+ cs->impl = sha256->getname();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ sha256->set_fastest(id);
+ }
+ }
+ sha256->setid(id_save);
/* sha512 */
- cs = &chksum_stat_data[cbid++];
- cs->init = 0;
- cs->func = abd_checksum_SHA512_native;
- cs->free = 0;
- cs->name = "sha512";
- cs->impl = "generic";
- chksum_benchit(cs);
+ id_save = sha512->getid();
+ for (max = 0, id = 0; id < sha512->getcnt(); id++) {
+ sha512->setid(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_sha512_native;
+ cs->free = 0;
+ cs->name = sha512->name;
+ cs->impl = sha512->getname();
+ chksum_benchit(cs);
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ sha512->set_fastest(id);
+ }
+ }
+ sha512->setid(id_save);
/* blake3 */
- id_save = blake3_impl_getid();
- for (id = 0; id < blake3_impl_getcnt(); id++) {
- blake3_impl_setid(id);
+ id_save = blake3->getid();
+ for (max = 0, id = 0; id < blake3->getcnt(); id++) {
+ blake3->setid(id);
cs = &chksum_stat_data[cbid++];
cs->init = abd_checksum_blake3_tmpl_init;
cs->func = abd_checksum_blake3_native;
cs->free = abd_checksum_blake3_tmpl_free;
- cs->name = "blake3";
- cs->impl = blake3_impl_getname();
+ cs->name = blake3->name;
+ cs->impl = blake3->getname();
chksum_benchit(cs);
if (cs->bs256k > max) {
max = cs->bs256k;
- blake3_impl_set_fastest(id);
+ blake3->set_fastest(id);
}
}
-
- /* restore initial value */
- blake3_impl_setid(id_save);
+ blake3->setid(id_save);
}
void
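
Stripped of the kstat bookkeeping, the benchmarking pattern introduced above
is the same for every backend: save the current selection, time each
implementation, mark the fastest, and restore the selection. A condensed
sketch, where benchmark_256k() is a hypothetical stand-in for chksum_benchit()
and the stat-slot handling:

    const zfs_impl_t *algo = zfs_impl_get_ops("sha256");
    uint32_t id, id_save = algo->getid();
    uint64_t mbs, max = 0;

    for (id = 0; id < algo->getcnt(); id++) {
            algo->setid(id);                /* route hashing through backend id */
            mbs = benchmark_256k();         /* assumed: MiB/s on 256 KiB blocks */
            if (mbs > max) {
                    max = mbs;
                    algo->set_fastest(id);  /* "fastest" now maps to this backend */
            }
    }
    algo->setid(id_save);                   /* restore the visible selection */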
diff --git a/module/zfs/zfs_impl.c b/module/zfs/zfs_impl.c
new file mode 100644
index 000000000..20322ff98
--- /dev/null
+++ b/module/zfs/zfs_impl.c
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+
+#include <sys/blake3.h>
+#include <sys/sha2.h>
+
+/*
+ * impl_ops - backend for implementations of algorithms
+ */
+const zfs_impl_t *impl_ops[] = {
+ &zfs_blake3_ops,
+ &zfs_sha256_ops,
+ &zfs_sha512_ops,
+ NULL
+};
+
+/*
+ * zfs_impl_get_ops - Get the API functions for an impl backend
+ */
+const zfs_impl_t *
+zfs_impl_get_ops(const char *algo)
+{
+ const zfs_impl_t **ops = impl_ops;
+
+ if (!algo || !*algo)
+ return (*ops);
+
+ for (; *ops; ops++) {
+ if (strcmp(algo, (*ops)->name) == 0)
+ break;
+ }
+
+ ASSERT3P(*ops, !=, NULL);
+ return (*ops);
+}
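
Lookup is by name: a NULL or empty algo argument falls through to the first
table entry (blake3 here), while an unknown name walks to the NULL sentinel,
which the assertion catches in debug builds. A sketch of a userspace caller,
in the spirit of the sha2_test changes below:

    #include <sys/zfs_impl.h>
    #include <stdio.h>

    static void list_impls(const char *name)
    {
            const zfs_impl_t *ops = zfs_impl_get_ops(name);
            uint32_t id, id_save = ops->getid();

            for (id = 0; id < ops->getcnt(); id++) {
                    ops->setid(id);
                    (void) printf("%s: %s\n", ops->name, ops->getname());
            }
            ops->setid(id_save);    /* put the previous selection back */
    }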
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index 3743eaa53..6090959c5 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
{{NULL, NULL}, NULL, NULL, 0, "on"},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "off"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"label"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
"gang_header"},
{{abd_fletcher_2_native, abd_fletcher_2_byteswap},
@@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
NULL, NULL, 0, "fletcher2"},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
- {{abd_checksum_SHA256, abd_checksum_SHA256},
+ {{abd_checksum_sha256, abd_checksum_sha256},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
{{abd_fletcher_4_native, abd_fletcher_4_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
{{abd_checksum_off, abd_checksum_off},
NULL, NULL, 0, "noparity"},
- {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+ {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap},
NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
{{abd_checksum_skein_native, abd_checksum_skein_byteswap},
diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c
index d99e8757a..efcf812d7 100644
--- a/tests/zfs-tests/cmd/checksum/sha2_test.c
+++ b/tests/zfs-tests/cmd/checksum/sha2_test.c
@@ -33,11 +33,11 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+
#include <sys/time.h>
-#define _SHA2_IMPL
#include <sys/sha2.h>
#include <sys/stdtypes.h>
-
+#include <sys/zfs_impl.h>
/*
* Test messages from:
@@ -174,9 +174,19 @@ main(int argc, char *argv[])
boolean_t failed = B_FALSE;
uint64_t cpu_mhz = 0;
+ const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+ const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+ uint32_t id;
+
if (argc == 2)
cpu_mhz = atoi(argv[1]);
+ if (!sha256)
+ return (1);
+
+ if (!sha512)
+ return (1);
+
#define SHA2_ALGO_TEST(_m, mode, diglen, testdigest) \
do { \
SHA2_CTX ctx; \
@@ -194,7 +204,7 @@ main(int argc, char *argv[])
} \
} while (0)
-#define SHA2_PERF_TEST(mode, diglen) \
+#define SHA2_PERF_TEST(mode, diglen, name) \
do { \
SHA2_CTX ctx; \
uint8_t digest[diglen / 8]; \
@@ -216,8 +226,8 @@ main(int argc, char *argv[])
cpb = (cpu_mhz * 1e6 * ((double)delta / \
1000000)) / (8192 * 128 * 1024); \
} \
- (void) printf("SHA%-9s%llu us (%.02f CPB)\n", #mode, \
- (u_longlong_t)delta, cpb); \
+ (void) printf("sha%s-%-9s%7llu us (%.02f CPB)\n", #mode,\
+ name, (u_longlong_t)delta, cpb); \
} while (0)
(void) printf("Running algorithm correctness tests:\n");
@@ -237,8 +247,18 @@ main(int argc, char *argv[])
(void) printf("Running performance tests (hashing 1024 MiB of "
"data):\n");
- SHA2_PERF_TEST(256, 256);
- SHA2_PERF_TEST(512, 512);
+
+ for (id = 0; id < sha256->getcnt(); id++) {
+ sha256->setid(id);
+ const char *name = sha256->getname();
+ SHA2_PERF_TEST(256, 256, name);
+ }
+
+ for (id = 0; id < sha512->getcnt(); id++) {
+ sha512->setid(id);
+ const char *name = sha512->getname();
+ SHA2_PERF_TEST(512, 512, name);
+ }
return (0);
}